xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (revision 78cd75393ec79565c63927bf200f06f839a1dc05)
1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/CodeGen/GlobalISel/Utils.h"
28 #include "llvm/IR/DiagnosticInfo.h"
29 #include "llvm/IR/IntrinsicsAMDGPU.h"
30 #include "llvm/IR/IntrinsicsR600.h"
31 
32 #define DEBUG_TYPE "amdgpu-legalinfo"
33 
34 using namespace llvm;
35 using namespace LegalizeActions;
36 using namespace LegalizeMutations;
37 using namespace LegalityPredicates;
38 using namespace MIPatternMatch;
39 
40 // Hack until load/store selection patterns support any tuple of legal types.
41 static cl::opt<bool> EnableNewLegality(
42   "amdgpu-global-isel-new-legality",
43   cl::desc("Use GlobalISel desired legality, rather than try to use"
44            "rules compatible with selection patterns"),
45   cl::init(false),
46   cl::ReallyHidden);
47 
// Size limit in bits used by isRegisterSize() and as the upper bound when
// growing vectors in moreElementsToNextExistingRegClass().
static constexpr unsigned MaxRegisterSize = 1024;
49 
50 // Round the number of elements to the next power of two elements
51 static LLT getPow2VectorType(LLT Ty) {
52   unsigned NElts = Ty.getNumElements();
53   unsigned Pow2NElts = 1 <<  Log2_32_Ceil(NElts);
54   return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
55 }
56 
57 // Round the number of bits to the next power of two bits
58 static LLT getPow2ScalarType(LLT Ty) {
59   unsigned Bits = Ty.getSizeInBits();
60   unsigned Pow2Bits = 1 <<  Log2_32_Ceil(Bits);
61   return LLT::scalar(Pow2Bits);
62 }
63 
64 /// \returns true if this is an odd sized vector which should widen by adding an
65 /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
66 /// excludes s1 vectors, which should always be scalarized.
67 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
68   return [=](const LegalityQuery &Query) {
69     const LLT Ty = Query.Types[TypeIdx];
70     if (!Ty.isVector())
71       return false;
72 
73     const LLT EltTy = Ty.getElementType();
74     const unsigned EltSize = EltTy.getSizeInBits();
75     return Ty.getNumElements() % 2 != 0 &&
76            EltSize > 1 && EltSize < 32 &&
77            Ty.getSizeInBits() % 32 != 0;
78   };
79 }
80 
81 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
82   return [=](const LegalityQuery &Query) {
83     const LLT Ty = Query.Types[TypeIdx];
84     return Ty.getSizeInBits() % 32 == 0;
85   };
86 }
87 
88 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
89   return [=](const LegalityQuery &Query) {
90     const LLT Ty = Query.Types[TypeIdx];
91     const LLT EltTy = Ty.getScalarType();
92     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
93   };
94 }
95 
96 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
97   return [=](const LegalityQuery &Query) {
98     const LLT Ty = Query.Types[TypeIdx];
99     const LLT EltTy = Ty.getElementType();
100     return std::pair(TypeIdx,
101                      LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
102   };
103 }
104 
105 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
106   return [=](const LegalityQuery &Query) {
107     const LLT Ty = Query.Types[TypeIdx];
108     const LLT EltTy = Ty.getElementType();
109     unsigned Size = Ty.getSizeInBits();
110     unsigned Pieces = (Size + 63) / 64;
111     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
112     return std::pair(TypeIdx, LLT::scalarOrVector(
113                                   ElementCount::getFixed(NewNumElts), EltTy));
114   };
115 }
116 
117 // Increase the number of vector elements to reach the next multiple of 32-bit
118 // type.
119 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
120   return [=](const LegalityQuery &Query) {
121     const LLT Ty = Query.Types[TypeIdx];
122 
123     const LLT EltTy = Ty.getElementType();
124     const int Size = Ty.getSizeInBits();
125     const int EltSize = EltTy.getSizeInBits();
126     const int NextMul32 = (Size + 31) / 32;
127 
128     assert(EltSize < 32);
129 
130     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
131     return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
132   };
133 }
134 
135 // Increase the number of vector elements to reach the next legal RegClass.
136 static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
137   return [=](const LegalityQuery &Query) {
138     const LLT Ty = Query.Types[TypeIdx];
139     const unsigned NumElts = Ty.getNumElements();
140     const unsigned EltSize = Ty.getElementType().getSizeInBits();
141     const unsigned MaxNumElts = MaxRegisterSize / EltSize;
142 
143     assert(EltSize == 32 || EltSize == 64);
144     assert(Ty.getSizeInBits() < MaxRegisterSize);
145 
146     unsigned NewNumElts;
147     // Find the nearest legal RegClass that is larger than the current type.
148     for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
149       if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize))
150         break;
151     }
152 
153     return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltSize));
154   };
155 }
156 
157 static LLT getBufferRsrcScalarType(const LLT Ty) {
158   if (!Ty.isVector())
159     return LLT::scalar(128);
160   const ElementCount NumElems = Ty.getElementCount();
161   return LLT::vector(NumElems, LLT::scalar(128));
162 }
163 
164 static LLT getBufferRsrcRegisterType(const LLT Ty) {
165   if (!Ty.isVector())
166     return LLT::fixed_vector(4, LLT::scalar(32));
167   const unsigned NumElems = Ty.getElementCount().getFixedValue();
168   return LLT::fixed_vector(NumElems * 4, LLT::scalar(32));
169 }
170 
171 static LLT getBitcastRegisterType(const LLT Ty) {
172   const unsigned Size = Ty.getSizeInBits();
173 
174   if (Size <= 32) {
175     // <2 x s8> -> s16
176     // <4 x s8> -> s32
177     return LLT::scalar(Size);
178   }
179 
180   return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
181 }
182 
183 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
184   return [=](const LegalityQuery &Query) {
185     const LLT Ty = Query.Types[TypeIdx];
186     return std::pair(TypeIdx, getBitcastRegisterType(Ty));
187   };
188 }
189 
190 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
191   return [=](const LegalityQuery &Query) {
192     const LLT Ty = Query.Types[TypeIdx];
193     unsigned Size = Ty.getSizeInBits();
194     assert(Size % 32 == 0);
195     return std::pair(
196         TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
197   };
198 }
199 
200 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
201   return [=](const LegalityQuery &Query) {
202     const LLT QueryTy = Query.Types[TypeIdx];
203     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
204   };
205 }
206 
207 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
208   return [=](const LegalityQuery &Query) {
209     const LLT QueryTy = Query.Types[TypeIdx];
210     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
211   };
212 }
213 
214 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
215   return [=](const LegalityQuery &Query) {
216     const LLT QueryTy = Query.Types[TypeIdx];
217     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
218   };
219 }
220 
221 static bool isRegisterSize(unsigned Size) {
222   return Size % 32 == 0 && Size <= MaxRegisterSize;
223 }
224 
225 static bool isRegisterVectorElementType(LLT EltTy) {
226   const int EltSize = EltTy.getSizeInBits();
227   return EltSize == 16 || EltSize % 32 == 0;
228 }
229 
230 static bool isRegisterVectorType(LLT Ty) {
231   const int EltSize = Ty.getElementType().getSizeInBits();
232   return EltSize == 32 || EltSize == 64 ||
233          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
234          EltSize == 128 || EltSize == 256;
235 }
236 
237 static bool isRegisterType(LLT Ty) {
238   if (!isRegisterSize(Ty.getSizeInBits()))
239     return false;
240 
241   if (Ty.isVector())
242     return isRegisterVectorType(Ty);
243 
244   return true;
245 }
246 
247 // Any combination of 32 or 64-bit elements up the maximum register size, and
248 // multiples of v2s16.
249 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
250   return [=](const LegalityQuery &Query) {
251     return isRegisterType(Query.Types[TypeIdx]);
252   };
253 }
254 
255 // RegisterType that doesn't have a corresponding RegClass.
256 static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx) {
257   return [=](const LegalityQuery &Query) {
258     LLT Ty = Query.Types[TypeIdx];
259     return isRegisterType(Ty) &&
260            !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits());
261   };
262 }
263 
264 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
265   return [=](const LegalityQuery &Query) {
266     const LLT QueryTy = Query.Types[TypeIdx];
267     if (!QueryTy.isVector())
268       return false;
269     const LLT EltTy = QueryTy.getElementType();
270     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
271   };
272 }
273 
274 // If we have a truncating store or an extending load with a data size larger
275 // than 32-bits, we need to reduce to a 32-bit type.
276 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
277   return [=](const LegalityQuery &Query) {
278     const LLT Ty = Query.Types[TypeIdx];
279     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
280            Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
281   };
282 }
283 
284 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
285 // handle some operations by just promoting the register during
286 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
287 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
288                                     bool IsLoad, bool IsAtomic) {
289   switch (AS) {
290   case AMDGPUAS::PRIVATE_ADDRESS:
291     // FIXME: Private element size.
292     return ST.enableFlatScratch() ? 128 : 32;
293   case AMDGPUAS::LOCAL_ADDRESS:
294     return ST.useDS128() ? 128 : 64;
295   case AMDGPUAS::GLOBAL_ADDRESS:
296   case AMDGPUAS::CONSTANT_ADDRESS:
297   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
298   case AMDGPUAS::BUFFER_RESOURCE:
299     // Treat constant and global as identical. SMRD loads are sometimes usable for
300     // global loads (ideally constant address space should be eliminated)
301     // depending on the context. Legality cannot be context dependent, but
302     // RegBankSelect can split the load as necessary depending on the pointer
303     // register bank/uniformity and if the memory is invariant or not written in a
304     // kernel.
305     return IsLoad ? 512 : 128;
306   default:
307     // FIXME: Flat addresses may contextually need to be split to 32-bit parts
308     // if they may alias scratch depending on the subtarget.  This needs to be
309     // moved to custom handling to use addressMayBeAccessedAsPrivate
310     return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
311   }
312 }
313 
314 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
315                                  const LegalityQuery &Query) {
316   const LLT Ty = Query.Types[0];
317 
318   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
319   const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
320 
321   unsigned RegSize = Ty.getSizeInBits();
322   uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
323   uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
324   unsigned AS = Query.Types[1].getAddressSpace();
325 
326   // All of these need to be custom lowered to cast the pointer operand.
327   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
328     return false;
329 
330   // Do not handle extending vector loads.
331   if (Ty.isVector() && MemSize != RegSize)
332     return false;
333 
334   // TODO: We should be able to widen loads if the alignment is high enough, but
335   // we also need to modify the memory access size.
336 #if 0
337   // Accept widening loads based on alignment.
338   if (IsLoad && MemSize < Size)
339     MemSize = std::max(MemSize, Align);
340 #endif
341 
342   // Only 1-byte and 2-byte to 32-bit extloads are valid.
343   if (MemSize != RegSize && RegSize != 32)
344     return false;
345 
346   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
347                                     Query.MMODescrs[0].Ordering !=
348                                         AtomicOrdering::NotAtomic))
349     return false;
350 
351   switch (MemSize) {
352   case 8:
353   case 16:
354   case 32:
355   case 64:
356   case 128:
357     break;
358   case 96:
359     if (!ST.hasDwordx3LoadStores())
360       return false;
361     break;
362   case 256:
363   case 512:
364     // These may contextually need to be broken down.
365     break;
366   default:
367     return false;
368   }
369 
370   assert(RegSize >= MemSize);
371 
372   if (AlignBits < MemSize) {
373     const SITargetLowering *TLI = ST.getTargetLowering();
374     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
375                                                  Align(AlignBits / 8)))
376       return false;
377   }
378 
379   return true;
380 }
381 
382 // The newer buffer intrinsic forms take their resource arguments as
383 // pointers in address space 8, aka s128 values. However, in order to not break
384 // SelectionDAG, the underlying operations have to continue to take v4i32
385 // arguments. Therefore, we convert resource pointers - or vectors of them
386 // to integer values here.
387 static bool hasBufferRsrcWorkaround(const LLT Ty) {
388   if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
389     return true;
390   if (Ty.isVector()) {
391     const LLT ElemTy = Ty.getElementType();
392     return hasBufferRsrcWorkaround(ElemTy);
393   }
394   return false;
395 }
396 
397 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
398 // workaround this. Eventually it should ignore the type for loads and only care
399 // about the size. Return true in cases where we will workaround this for now by
400 // bitcasting.
401 static bool loadStoreBitcastWorkaround(const LLT Ty) {
402   if (EnableNewLegality)
403     return false;
404 
405   const unsigned Size = Ty.getSizeInBits();
406   if (Size <= 64)
407     return false;
408   // Address space 8 pointers get their own workaround.
409   if (hasBufferRsrcWorkaround(Ty))
410     return false;
411   if (!Ty.isVector())
412     return true;
413 
414   LLT EltTy = Ty.getElementType();
415   if (EltTy.isPointer())
416     return true;
417 
418   unsigned EltSize = EltTy.getSizeInBits();
419   return EltSize != 32 && EltSize != 64;
420 }
421 
422 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
423   const LLT Ty = Query.Types[0];
424   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
425          !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
426 }
427 
428 /// Return true if a load or store of the type should be lowered with a bitcast
429 /// to a different type.
430 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
431                                        const LLT MemTy) {
432   const unsigned MemSizeInBits = MemTy.getSizeInBits();
433   const unsigned Size = Ty.getSizeInBits();
434   if (Size != MemSizeInBits)
435     return Size <= 32 && Ty.isVector();
436 
437   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
438     return true;
439 
440   // Don't try to handle bitcasting vector ext loads for now.
441   return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
442          (Size <= 32 || isRegisterSize(Size)) &&
443          !isRegisterVectorElementType(Ty.getElementType());
444 }
445 
446 /// Return true if we should legalize a load by widening an odd sized memory
447 /// access up to the alignment. Note this case when the memory access itself
448 /// changes, not the size of the result register.
449 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
450                             uint64_t AlignInBits, unsigned AddrSpace,
451                             unsigned Opcode) {
452   unsigned SizeInBits = MemoryTy.getSizeInBits();
453   // We don't want to widen cases that are naturally legal.
454   if (isPowerOf2_32(SizeInBits))
455     return false;
456 
457   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
458   // end up widening these for a scalar load during RegBankSelect, since there
459   // aren't 96-bit scalar loads.
460   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
461     return false;
462 
463   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
464     return false;
465 
466   // A load is known dereferenceable up to the alignment, so it's legal to widen
467   // to it.
468   //
469   // TODO: Could check dereferenceable for less aligned cases.
470   unsigned RoundedSize = NextPowerOf2(SizeInBits);
471   if (AlignInBits < RoundedSize)
472     return false;
473 
474   // Do not widen if it would introduce a slow unaligned load.
475   const SITargetLowering *TLI = ST.getTargetLowering();
476   unsigned Fast = 0;
477   return TLI->allowsMisalignedMemoryAccessesImpl(
478              RoundedSize, AddrSpace, Align(AlignInBits / 8),
479              MachineMemOperand::MOLoad, &Fast) &&
480          Fast;
481 }
482 
483 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
484                             unsigned Opcode) {
485   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
486     return false;
487 
488   return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
489                          Query.MMODescrs[0].AlignInBits,
490                          Query.Types[1].getAddressSpace(), Opcode);
491 }
492 
493 /// Mutates IR (typicaly a load instruction) to use a <4 x s32> as the initial
494 /// type of the operand `idx` and then to transform it to a `p8` via bitcasts
495 /// and inttoptr. In addition, handle vectors of p8. Returns the new type.
496 static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
497                                    MachineRegisterInfo &MRI, unsigned Idx) {
498   MachineOperand &MO = MI.getOperand(Idx);
499 
500   const LLT PointerTy = MRI.getType(MO.getReg());
501 
502   // Paranoidly prevent us from doing this multiple times.
503   if (!hasBufferRsrcWorkaround(PointerTy))
504     return PointerTy;
505 
506   const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
507   const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
508   if (!PointerTy.isVector()) {
509     // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
510     const unsigned NumParts = PointerTy.getSizeInBits() / 32;
511     const LLT S32 = LLT::scalar(32);
512 
513     Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
514     std::array<Register, 4> VectorElems;
515     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
516     for (unsigned I = 0; I < NumParts; ++I)
517       VectorElems[I] =
518           B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
519     B.buildMergeValues(MO, VectorElems);
520     MO.setReg(VectorReg);
521     return VectorTy;
522   }
523   Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
524   B.setInsertPt(B.getMBB(), ++B.getInsertPt());
525   auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
526   B.buildIntToPtr(MO, Scalar);
527   MO.setReg(BitcastReg);
528 
529   return VectorTy;
530 }
531 
532 /// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
533 /// the form in which the value must be in order to be passed to the low-level
534 /// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
535 /// needed in order to account for the fact that we can't define a register
536 /// class for s128 without breaking SelectionDAG.
537 static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) {
538   MachineRegisterInfo &MRI = *B.getMRI();
539   const LLT PointerTy = MRI.getType(Pointer);
540   const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
541   const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
542 
543   if (!PointerTy.isVector()) {
544     // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
545     SmallVector<Register, 4> PointerParts;
546     const unsigned NumParts = PointerTy.getSizeInBits() / 32;
547     auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
548     for (unsigned I = 0; I < NumParts; ++I)
549       PointerParts.push_back(Unmerged.getReg(I));
550     return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
551   }
552   Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
553   return B.buildBitcast(VectorTy, Scalar).getReg(0);
554 }
555 
556 static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B,
557                                      unsigned Idx) {
558   MachineOperand &MO = MI.getOperand(Idx);
559 
560   const LLT PointerTy = B.getMRI()->getType(MO.getReg());
561   // Paranoidly prevent us from doing this multiple times.
562   if (!hasBufferRsrcWorkaround(PointerTy))
563     return;
564   MO.setReg(castBufferRsrcToV4I32(MO.getReg(), B));
565 }
566 
567 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
568                                          const GCNTargetMachine &TM)
569   :  ST(ST_) {
570   using namespace TargetOpcode;
571 
572   auto GetAddrSpacePtr = [&TM](unsigned AS) {
573     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
574   };
575 
576   const LLT S1 = LLT::scalar(1);
577   const LLT S8 = LLT::scalar(8);
578   const LLT S16 = LLT::scalar(16);
579   const LLT S32 = LLT::scalar(32);
580   const LLT S64 = LLT::scalar(64);
581   const LLT S128 = LLT::scalar(128);
582   const LLT S256 = LLT::scalar(256);
583   const LLT S512 = LLT::scalar(512);
584   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
585 
586   const LLT V2S8 = LLT::fixed_vector(2, 8);
587   const LLT V2S16 = LLT::fixed_vector(2, 16);
588   const LLT V4S16 = LLT::fixed_vector(4, 16);
589 
590   const LLT V2S32 = LLT::fixed_vector(2, 32);
591   const LLT V3S32 = LLT::fixed_vector(3, 32);
592   const LLT V4S32 = LLT::fixed_vector(4, 32);
593   const LLT V5S32 = LLT::fixed_vector(5, 32);
594   const LLT V6S32 = LLT::fixed_vector(6, 32);
595   const LLT V7S32 = LLT::fixed_vector(7, 32);
596   const LLT V8S32 = LLT::fixed_vector(8, 32);
597   const LLT V9S32 = LLT::fixed_vector(9, 32);
598   const LLT V10S32 = LLT::fixed_vector(10, 32);
599   const LLT V11S32 = LLT::fixed_vector(11, 32);
600   const LLT V12S32 = LLT::fixed_vector(12, 32);
601   const LLT V13S32 = LLT::fixed_vector(13, 32);
602   const LLT V14S32 = LLT::fixed_vector(14, 32);
603   const LLT V15S32 = LLT::fixed_vector(15, 32);
604   const LLT V16S32 = LLT::fixed_vector(16, 32);
605   const LLT V32S32 = LLT::fixed_vector(32, 32);
606 
607   const LLT V2S64 = LLT::fixed_vector(2, 64);
608   const LLT V3S64 = LLT::fixed_vector(3, 64);
609   const LLT V4S64 = LLT::fixed_vector(4, 64);
610   const LLT V5S64 = LLT::fixed_vector(5, 64);
611   const LLT V6S64 = LLT::fixed_vector(6, 64);
612   const LLT V7S64 = LLT::fixed_vector(7, 64);
613   const LLT V8S64 = LLT::fixed_vector(8, 64);
614   const LLT V16S64 = LLT::fixed_vector(16, 64);
615 
616   std::initializer_list<LLT> AllS32Vectors =
617     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
618      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
619   std::initializer_list<LLT> AllS64Vectors =
620     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
621 
622   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
623   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
624   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
625   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
626   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
627   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
628   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
629   const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
630   const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
631 
632   const LLT CodePtr = FlatPtr;
633 
634   const std::initializer_list<LLT> AddrSpaces64 = {
635     GlobalPtr, ConstantPtr, FlatPtr
636   };
637 
638   const std::initializer_list<LLT> AddrSpaces32 = {
639     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
640   };
641 
642   const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
643 
644   const std::initializer_list<LLT> FPTypesBase = {
645     S32, S64
646   };
647 
648   const std::initializer_list<LLT> FPTypes16 = {
649     S32, S64, S16
650   };
651 
652   const std::initializer_list<LLT> FPTypesPK16 = {
653     S32, S64, S16, V2S16
654   };
655 
656   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
657 
658   // s1 for VCC branches, s32 for SCC branches.
659   getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
660 
661   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
662   // elements for v3s16
663   getActionDefinitionsBuilder(G_PHI)
664       .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
665       .legalFor(AllS32Vectors)
666       .legalFor(AllS64Vectors)
667       .legalFor(AddrSpaces64)
668       .legalFor(AddrSpaces32)
669       .legalFor(AddrSpaces128)
670       .legalIf(isPointer(0))
671       .clampScalar(0, S16, S256)
672       .widenScalarToNextPow2(0, 32)
673       .clampMaxNumElements(0, S32, 16)
674       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
675       .scalarize(0);
676 
677   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
678     // Full set of gfx9 features.
679     getActionDefinitionsBuilder({G_ADD, G_SUB})
680       .legalFor({S32, S16, V2S16})
681       .clampMaxNumElementsStrict(0, S16, 2)
682       .scalarize(0)
683       .minScalar(0, S16)
684       .widenScalarToNextMultipleOf(0, 32)
685       .maxScalar(0, S32);
686 
687     getActionDefinitionsBuilder(G_MUL)
688       .legalFor({S32, S16, V2S16})
689       .clampMaxNumElementsStrict(0, S16, 2)
690       .scalarize(0)
691       .minScalar(0, S16)
692       .widenScalarToNextMultipleOf(0, 32)
693       .custom();
694     assert(ST.hasMad64_32());
695 
696     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
697       .legalFor({S32, S16, V2S16}) // Clamp modifier
698       .minScalarOrElt(0, S16)
699       .clampMaxNumElementsStrict(0, S16, 2)
700       .scalarize(0)
701       .widenScalarToNextPow2(0, 32)
702       .lower();
703   } else if (ST.has16BitInsts()) {
704     getActionDefinitionsBuilder({G_ADD, G_SUB})
705       .legalFor({S32, S16})
706       .minScalar(0, S16)
707       .widenScalarToNextMultipleOf(0, 32)
708       .maxScalar(0, S32)
709       .scalarize(0);
710 
711     getActionDefinitionsBuilder(G_MUL)
712       .legalFor({S32, S16})
713       .scalarize(0)
714       .minScalar(0, S16)
715       .widenScalarToNextMultipleOf(0, 32)
716       .custom();
717     assert(ST.hasMad64_32());
718 
719     // Technically the saturating operations require clamp bit support, but this
720     // was introduced at the same time as 16-bit operations.
721     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
722       .legalFor({S32, S16}) // Clamp modifier
723       .minScalar(0, S16)
724       .scalarize(0)
725       .widenScalarToNextPow2(0, 16)
726       .lower();
727 
728     // We're just lowering this, but it helps get a better result to try to
729     // coerce to the desired type first.
730     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
731       .minScalar(0, S16)
732       .scalarize(0)
733       .lower();
734   } else {
735     getActionDefinitionsBuilder({G_ADD, G_SUB})
736       .legalFor({S32})
737       .widenScalarToNextMultipleOf(0, 32)
738       .clampScalar(0, S32, S32)
739       .scalarize(0);
740 
741     auto &Mul = getActionDefinitionsBuilder(G_MUL)
742       .legalFor({S32})
743       .scalarize(0)
744       .minScalar(0, S32)
745       .widenScalarToNextMultipleOf(0, 32);
746 
747     if (ST.hasMad64_32())
748       Mul.custom();
749     else
750       Mul.maxScalar(0, S32);
751 
752     if (ST.hasIntClamp()) {
753       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
754         .legalFor({S32}) // Clamp modifier.
755         .scalarize(0)
756         .minScalarOrElt(0, S32)
757         .lower();
758     } else {
759       // Clamp bit support was added in VI, along with 16-bit operations.
760       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
761         .minScalar(0, S32)
762         .scalarize(0)
763         .lower();
764     }
765 
766     // FIXME: DAG expansion gets better results. The widening uses the smaller
767     // range values and goes for the min/max lowering directly.
768     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
769       .minScalar(0, S32)
770       .scalarize(0)
771       .lower();
772   }
773 
774   getActionDefinitionsBuilder(
775       {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
776       .customFor({S32, S64})
777       .clampScalar(0, S32, S64)
778       .widenScalarToNextPow2(0, 32)
779       .scalarize(0);
780 
781   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
782                    .legalFor({S32})
783                    .maxScalar(0, S32);
784 
785   if (ST.hasVOP3PInsts()) {
786     Mulh
787       .clampMaxNumElements(0, S8, 2)
788       .lowerFor({V2S8});
789   }
790 
791   Mulh
792     .scalarize(0)
793     .lower();
794 
795   // Report legal for any types we can handle anywhere. For the cases only legal
796   // on the SALU, RegBankSelect will be able to re-legalize.
797   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
798     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
799     .clampScalar(0, S32, S64)
800     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
801     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
802     .widenScalarToNextPow2(0)
803     .scalarize(0);
804 
805   getActionDefinitionsBuilder(
806       {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
807       .legalFor({{S32, S1}, {S32, S32}})
808       .clampScalar(0, S32, S32)
809       .scalarize(0);
810 
811   getActionDefinitionsBuilder(G_BITCAST)
812     // Don't worry about the size constraint.
813     .legalIf(all(isRegisterType(0), isRegisterType(1)))
814     .lower();
815 
816 
817   getActionDefinitionsBuilder(G_CONSTANT)
818     .legalFor({S1, S32, S64, S16, GlobalPtr,
819                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
820     .legalIf(isPointer(0))
821     .clampScalar(0, S32, S64)
822     .widenScalarToNextPow2(0);
823 
824   getActionDefinitionsBuilder(G_FCONSTANT)
825     .legalFor({S32, S64, S16})
826     .clampScalar(0, S16, S64);
827 
828   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
829       .legalIf(isRegisterType(0))
830       // s1 and s16 are special cases because they have legal operations on
831       // them, but don't really occupy registers in the normal way.
832       .legalFor({S1, S16})
833       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
834       .clampScalarOrElt(0, S32, MaxScalar)
835       .widenScalarToNextPow2(0, 32)
836       .clampMaxNumElements(0, S32, 16);
837 
838   getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
839 
840   // If the amount is divergent, we have to do a wave reduction to get the
841   // maximum value, so this is expanded during RegBankSelect.
842   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
843     .legalFor({{PrivatePtr, S32}});
844 
845   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
846     .customIf(typeIsNot(0, PrivatePtr));
847 
848   getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
849 
850   auto &FPOpActions = getActionDefinitionsBuilder(
851     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
852       G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
853     .legalFor({S32, S64});
854   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
855     .customFor({S32, S64});
856   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
857     .customFor({S32, S64});
858 
859   if (ST.has16BitInsts()) {
860     if (ST.hasVOP3PInsts())
861       FPOpActions.legalFor({S16, V2S16});
862     else
863       FPOpActions.legalFor({S16});
864 
865     TrigActions.customFor({S16});
866     FDIVActions.customFor({S16});
867   }
868 
869   auto &MinNumMaxNum = getActionDefinitionsBuilder({
870       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
871 
872   if (ST.hasVOP3PInsts()) {
873     MinNumMaxNum.customFor(FPTypesPK16)
874       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
875       .clampMaxNumElements(0, S16, 2)
876       .clampScalar(0, S16, S64)
877       .scalarize(0);
878   } else if (ST.has16BitInsts()) {
879     MinNumMaxNum.customFor(FPTypes16)
880       .clampScalar(0, S16, S64)
881       .scalarize(0);
882   } else {
883     MinNumMaxNum.customFor(FPTypesBase)
884       .clampScalar(0, S32, S64)
885       .scalarize(0);
886   }
887 
888   if (ST.hasVOP3PInsts())
889     FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
890 
891   FPOpActions
892     .scalarize(0)
893     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
894 
895   TrigActions
896     .scalarize(0)
897     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
898 
899   FDIVActions
900     .scalarize(0)
901     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
902 
903   getActionDefinitionsBuilder({G_FNEG, G_FABS})
904     .legalFor(FPTypesPK16)
905     .clampMaxNumElementsStrict(0, S16, 2)
906     .scalarize(0)
907     .clampScalar(0, S16, S64);
908 
909   if (ST.has16BitInsts()) {
910     getActionDefinitionsBuilder(G_FSQRT)
911       .legalFor({S32, S16})
912       .customFor({S64})
913       .scalarize(0)
914       .clampScalar(0, S16, S64);
915     getActionDefinitionsBuilder(G_FFLOOR)
916       .legalFor({S32, S64, S16})
917       .scalarize(0)
918       .clampScalar(0, S16, S64);
919 
920     getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
921       .legalFor({{S32, S32}, {S64, S32}, {S16, S16}})
922       .scalarize(0)
923       .maxScalarIf(typeIs(0, S16), 1, S16)
924       .clampScalar(1, S32, S32)
925       .lower();
926 
927     getActionDefinitionsBuilder(G_FFREXP)
928       .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
929       .scalarize(0)
930       .lower();
931   } else {
932     getActionDefinitionsBuilder(G_FSQRT)
933       .legalFor({S32})
934       .customFor({S64})
935       .scalarize(0)
936       .clampScalar(0, S32, S64);
937 
938     if (ST.hasFractBug()) {
939       getActionDefinitionsBuilder(G_FFLOOR)
940         .customFor({S64})
941         .legalFor({S32, S64})
942         .scalarize(0)
943         .clampScalar(0, S32, S64);
944     } else {
945       getActionDefinitionsBuilder(G_FFLOOR)
946         .legalFor({S32, S64})
947         .scalarize(0)
948         .clampScalar(0, S32, S64);
949     }
950 
951     getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
952       .legalFor({{S32, S32}, {S64, S32}})
953       .scalarize(0)
954       .clampScalar(0, S32, S64)
955       .clampScalar(1, S32, S32)
956       .lower();
957 
958     getActionDefinitionsBuilder(G_FFREXP)
959       .customFor({{S32, S32}, {S64, S32}})
960       .scalarize(0)
961       .minScalar(0, S32)
962       .clampScalar(1, S32, S32)
963       .lower();
964   }
965 
966   getActionDefinitionsBuilder(G_FPTRUNC)
967     .legalFor({{S32, S64}, {S16, S32}})
968     .scalarize(0)
969     .lower();
970 
971   getActionDefinitionsBuilder(G_FPEXT)
972     .legalFor({{S64, S32}, {S32, S16}})
973     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
974     .scalarize(0);
975 
976   auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
977   if (ST.has16BitInsts()) {
978     FSubActions
979       // Use actual fsub instruction
980       .legalFor({S32, S16})
981       // Must use fadd + fneg
982       .lowerFor({S64, V2S16});
983   } else {
984     FSubActions
985       // Use actual fsub instruction
986       .legalFor({S32})
987       // Must use fadd + fneg
988       .lowerFor({S64, S16, V2S16});
989   }
990 
991   FSubActions
992     .scalarize(0)
993     .clampScalar(0, S32, S64);
994 
995   // Whether this is legal depends on the floating point mode for the function.
996   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
997   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
998     FMad.customFor({S32, S16});
999   else if (ST.hasMadMacF32Insts())
1000     FMad.customFor({S32});
1001   else if (ST.hasMadF16())
1002     FMad.customFor({S16});
1003   FMad.scalarize(0)
1004       .lower();
1005 
1006   auto &FRem = getActionDefinitionsBuilder(G_FREM);
1007   if (ST.has16BitInsts()) {
1008     FRem.customFor({S16, S32, S64});
1009   } else {
1010     FRem.minScalar(0, S32)
1011         .customFor({S32, S64});
1012   }
1013   FRem.scalarize(0);
1014 
1015   // TODO: Do we need to clamp maximum bitwidth?
1016   getActionDefinitionsBuilder(G_TRUNC)
1017     .legalIf(isScalar(0))
1018     .legalFor({{V2S16, V2S32}})
1019     .clampMaxNumElements(0, S16, 2)
1020     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
1021     // situations (like an invalid implicit use), we don't want to infinite loop
1022     // in the legalizer.
1023     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
1024     .alwaysLegal();
1025 
1026   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
1027     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
1028                {S32, S1}, {S64, S1}, {S16, S1}})
1029     .scalarize(0)
1030     .clampScalar(0, S32, S64)
1031     .widenScalarToNextPow2(1, 32);
1032 
1033   // TODO: Split s1->s64 during regbankselect for VALU.
1034   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
1035                     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
1036                     .lowerIf(typeIs(1, S1))
1037                     .customFor({{S32, S64}, {S64, S64}});
1038   if (ST.has16BitInsts())
1039     IToFP.legalFor({{S16, S16}});
1040   IToFP.clampScalar(1, S32, S64)
1041        .minScalar(0, S32)
1042        .scalarize(0)
1043        .widenScalarToNextPow2(1);
1044 
1045   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
1046     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
1047     .customFor({{S64, S32}, {S64, S64}})
1048     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
1049   if (ST.has16BitInsts())
1050     FPToI.legalFor({{S16, S16}});
1051   else
1052     FPToI.minScalar(1, S32);
1053 
1054   FPToI.minScalar(0, S32)
1055        .widenScalarToNextPow2(0, 32)
1056        .scalarize(0)
1057        .lower();
1058 
1059   getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
1060       .customFor({S16, S32})
1061       .scalarize(0)
1062       .lower();
1063 
1064   // Lower roundeven into G_FRINT
1065   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
1066     .scalarize(0)
1067     .lower();
1068 
1069   if (ST.has16BitInsts()) {
1070     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
1071       .legalFor({S16, S32, S64})
1072       .clampScalar(0, S16, S64)
1073       .scalarize(0);
1074   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
1075     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
1076       .legalFor({S32, S64})
1077       .clampScalar(0, S32, S64)
1078       .scalarize(0);
1079   } else {
1080     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
1081       .legalFor({S32})
1082       .customFor({S64})
1083       .clampScalar(0, S32, S64)
1084       .scalarize(0);
1085   }
1086 
1087   getActionDefinitionsBuilder(G_PTR_ADD)
1088       .unsupportedFor({BufferFatPtr, RsrcPtr})
1089       .legalIf(all(isPointer(0), sameSize(0, 1)))
1090       .scalarize(0)
1091       .scalarSameSizeAs(1, 0);
1092 
1093   getActionDefinitionsBuilder(G_PTRMASK)
1094     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
1095     .scalarSameSizeAs(1, 0)
1096     .scalarize(0);
1097 
1098   auto &CmpBuilder =
1099     getActionDefinitionsBuilder(G_ICMP)
1100     // The compare output type differs based on the register bank of the output,
1101     // so make both s1 and s32 legal.
1102     //
1103     // Scalar compares producing output in scc will be promoted to s32, as that
1104     // is the allocatable register type that will be needed for the copy from
1105     // scc. This will be promoted during RegBankSelect, and we assume something
1106     // before that won't try to use s32 result types.
1107     //
1108     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
1109     // bank.
1110     .legalForCartesianProduct(
1111       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1112     .legalForCartesianProduct(
1113       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1114   if (ST.has16BitInsts()) {
1115     CmpBuilder.legalFor({{S1, S16}});
1116   }
1117 
1118   CmpBuilder
1119     .widenScalarToNextPow2(1)
1120     .clampScalar(1, S32, S64)
1121     .scalarize(0)
1122     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
1123 
1124   getActionDefinitionsBuilder(G_FCMP)
1125     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
1126     .widenScalarToNextPow2(1)
1127     .clampScalar(1, S32, S64)
1128     .scalarize(0);
1129 
1130   // FIXME: fpow has a selection pattern that should move to custom lowering.
1131   auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
1132   if (ST.has16BitInsts())
1133     ExpOps.customFor({{S32}, {S16}});
1134   else
1135     ExpOps.customFor({S32});
1136   ExpOps.clampScalar(0, MinScalarFPTy, S32)
1137         .scalarize(0);
1138 
1139   getActionDefinitionsBuilder(G_FPOWI)
1140     .clampScalar(0, MinScalarFPTy, S32)
1141     .lower();
1142 
1143   auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2});
1144   Log2Ops.customFor({S32});
1145   if (ST.has16BitInsts())
1146     Log2Ops.legalFor({S16});
1147   else
1148     Log2Ops.customFor({S16});
1149   Log2Ops.scalarize(0)
1150     .lower();
1151 
1152   auto &LogOps = getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP});
1153   LogOps.customFor({S32, S16});
1154   LogOps.clampScalar(0, MinScalarFPTy, S32)
1155         .scalarize(0);
1156 
1157   // The 64-bit versions produce 32-bit results, but only on the SALU.
1158   getActionDefinitionsBuilder(G_CTPOP)
1159     .legalFor({{S32, S32}, {S32, S64}})
1160     .clampScalar(0, S32, S32)
1161     .widenScalarToNextPow2(1, 32)
1162     .clampScalar(1, S32, S64)
1163     .scalarize(0)
1164     .widenScalarToNextPow2(0, 32);
1165 
1166   // If no 16 bit instr is available, lower into different instructions.
1167   if (ST.has16BitInsts())
1168     getActionDefinitionsBuilder(G_IS_FPCLASS)
1169         .legalForCartesianProduct({S1}, FPTypes16)
1170         .widenScalarToNextPow2(1)
1171         .scalarize(0)
1172         .lower();
1173   else
1174     getActionDefinitionsBuilder(G_IS_FPCLASS)
1175         .legalForCartesianProduct({S1}, FPTypesBase)
1176         .lowerFor({S1, S16})
1177         .widenScalarToNextPow2(1)
1178         .scalarize(0)
1179         .lower();
1180 
1181   // The hardware instructions return a different result on 0 than the generic
1182   // instructions expect. The hardware produces -1, but these produce the
1183   // bitwidth.
1184   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
1185     .scalarize(0)
1186     .clampScalar(0, S32, S32)
1187     .clampScalar(1, S32, S64)
1188     .widenScalarToNextPow2(0, 32)
1189     .widenScalarToNextPow2(1, 32)
1190     .custom();
1191 
1192   // The 64-bit versions produce 32-bit results, but only on the SALU.
1193   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
1194     .legalFor({{S32, S32}, {S32, S64}})
1195     .clampScalar(0, S32, S32)
1196     .clampScalar(1, S32, S64)
1197     .scalarize(0)
1198     .widenScalarToNextPow2(0, 32)
1199     .widenScalarToNextPow2(1, 32);
1200 
1201   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1202   // RegBankSelect.
1203   getActionDefinitionsBuilder(G_BITREVERSE)
1204     .legalFor({S32, S64})
1205     .clampScalar(0, S32, S64)
1206     .scalarize(0)
1207     .widenScalarToNextPow2(0);
1208 
1209   if (ST.has16BitInsts()) {
1210     getActionDefinitionsBuilder(G_BSWAP)
1211       .legalFor({S16, S32, V2S16})
1212       .clampMaxNumElementsStrict(0, S16, 2)
1213       // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
1214       // narrowScalar limitation.
1215       .widenScalarToNextPow2(0)
1216       .clampScalar(0, S16, S32)
1217       .scalarize(0);
1218 
1219     if (ST.hasVOP3PInsts()) {
1220       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1221         .legalFor({S32, S16, V2S16})
1222         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1223         .clampMaxNumElements(0, S16, 2)
1224         .minScalar(0, S16)
1225         .widenScalarToNextPow2(0)
1226         .scalarize(0)
1227         .lower();
1228     } else {
1229       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1230         .legalFor({S32, S16})
1231         .widenScalarToNextPow2(0)
1232         .minScalar(0, S16)
1233         .scalarize(0)
1234         .lower();
1235     }
1236   } else {
1237     // TODO: Should have same legality without v_perm_b32
1238     getActionDefinitionsBuilder(G_BSWAP)
1239       .legalFor({S32})
1240       .lowerIf(scalarNarrowerThan(0, 32))
1241       // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
1242       // narrowScalar limitation.
1243       .widenScalarToNextPow2(0)
1244       .maxScalar(0, S32)
1245       .scalarize(0)
1246       .lower();
1247 
1248     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1249       .legalFor({S32})
1250       .minScalar(0, S32)
1251       .widenScalarToNextPow2(0)
1252       .scalarize(0)
1253       .lower();
1254   }
1255 
1256   getActionDefinitionsBuilder(G_INTTOPTR)
1257       // List the common cases
1258       .legalForCartesianProduct(AddrSpaces64, {S64})
1259       .legalForCartesianProduct(AddrSpaces32, {S32})
1260       .scalarize(0)
1261       // Accept any address space as long as the size matches
1262       .legalIf(sameSize(0, 1))
1263       .widenScalarIf(smallerThan(1, 0),
1264                      [](const LegalityQuery &Query) {
1265                        return std::pair(
1266                            1, LLT::scalar(Query.Types[0].getSizeInBits()));
1267                      })
1268       .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) {
1269         return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1270       });
1271 
1272   getActionDefinitionsBuilder(G_PTRTOINT)
1273       // List the common cases
1274       .legalForCartesianProduct(AddrSpaces64, {S64})
1275       .legalForCartesianProduct(AddrSpaces32, {S32})
1276       .scalarize(0)
1277       // Accept any address space as long as the size matches
1278       .legalIf(sameSize(0, 1))
1279       .widenScalarIf(smallerThan(0, 1),
1280                      [](const LegalityQuery &Query) {
1281                        return std::pair(
1282                            0, LLT::scalar(Query.Types[1].getSizeInBits()));
1283                      })
1284       .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
1285         return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1286       });
1287 
1288   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1289     .scalarize(0)
1290     .custom();
1291 
       // Predicate helper shared by the G_LOAD/G_STORE narrowScalarIf and
       // fewerElementsIf rules below. Returns true when the memory access
       // described by \p Query has to be broken into smaller accesses.
       //
       // \p Query supplies the value type (Types[0]), the pointer type
       // (Types[1]) and the memory operand descriptor (MMODescrs[0]).
       // \p IsLoad is forwarded unchanged to maxSizeForAddrSpace.
1292   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1293                                     bool IsLoad) -> bool {
1294     const LLT DstTy = Query.Types[0];
1295 
1296     // Split vector extloads.
         // (A vector register type that is wider than the memory type.)
1297     unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1298 
1299     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1300       return true;
1301 
         // Split when the access exceeds the limit maxSizeForAddrSpace reports
         // for this subtarget, address space and load/store direction. The last
         // argument is true for any ordering other than NotAtomic.
1302     const LLT PtrTy = Query.Types[1];
1303     unsigned AS = PtrTy.getAddressSpace();
1304     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
1305                                       Query.MMODescrs[0].Ordering !=
1306                                           AtomicOrdering::NotAtomic))
1307       return true;
1308 
1309     // Catch weird sized loads that don't evenly divide into the access sizes
1310     // TODO: May be able to widen depending on alignment etc.
         // NumRegs is the memory size in 32-bit units, rounded up.
1311     unsigned NumRegs = (MemSize + 31) / 32;
1312     if (NumRegs == 3) {
           // Three units are only kept whole when the subtarget reports
           // hasDwordx3LoadStores().
1313       if (!ST.hasDwordx3LoadStores())
1314         return true;
1315     } else {
1316       // If the alignment allows, these should have been widened.
           // Any other count that is not a power of two is split.
1317       if (!isPowerOf2_32(NumRegs))
1318         return true;
1319     }
1320 
         // None of the split conditions above applied.
1321     return false;
1322   };
1323 
1324   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1325   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1326   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1327 
1328   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1329   // LDS
1330   // TODO: Unsupported flat for SI.
1331 
1332   for (unsigned Op : {G_LOAD, G_STORE}) {
1333     const bool IsStore = Op == G_STORE;
1334 
1335     auto &Actions = getActionDefinitionsBuilder(Op);
1336     // Explicitly list some common cases.
1337     // TODO: Does this help compile time at all?
1338     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1339                                       {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1340                                       {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1341                                       {S64, GlobalPtr, S64, GlobalAlign32},
1342                                       {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1343                                       {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1344                                       {S32, GlobalPtr, S8, GlobalAlign8},
1345                                       {S32, GlobalPtr, S16, GlobalAlign16},
1346 
1347                                       {S32, LocalPtr, S32, 32},
1348                                       {S64, LocalPtr, S64, 32},
1349                                       {V2S32, LocalPtr, V2S32, 32},
1350                                       {S32, LocalPtr, S8, 8},
1351                                       {S32, LocalPtr, S16, 16},
1352                                       {V2S16, LocalPtr, S32, 32},
1353 
1354                                       {S32, PrivatePtr, S32, 32},
1355                                       {S32, PrivatePtr, S8, 8},
1356                                       {S32, PrivatePtr, S16, 16},
1357                                       {V2S16, PrivatePtr, S32, 32},
1358 
1359                                       {S32, ConstantPtr, S32, GlobalAlign32},
1360                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1361                                       {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1362                                       {S64, ConstantPtr, S64, GlobalAlign32},
1363                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1364     Actions.legalIf(
1365       [=](const LegalityQuery &Query) -> bool {
1366         return isLoadStoreLegal(ST, Query);
1367       });
1368 
1369     // The custom pointers (fat pointers, buffer resources) don't work with load
1370     // and store at this level. Fat pointers should have been lowered to
1371     // intrinsics before the translation to MIR.
1372     Actions.unsupportedIf(typeInSet(1, {BufferFatPtr, RsrcPtr}));
1373 
1374     // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
1375     // ptrtoint. This is needed to account for the fact that we can't have i128
1376     // as a register class for SelectionDAG reasons.
1377     Actions.customIf([=](const LegalityQuery &Query) -> bool {
1378       return hasBufferRsrcWorkaround(Query.Types[0]);
1379     });
1380 
1381     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1382     // 64-bits.
1383     //
1384     // TODO: Should generalize bitcast action into coerce, which will also cover
1385     // inserting addrspacecasts.
1386     Actions.customIf(typeIs(1, Constant32Ptr));
1387 
1388     // Turn any illegal element vectors into something easier to deal
1389     // with. These will ultimately produce 32-bit scalar shifts to extract the
1390     // parts anyway.
1391     //
1392     // For odd 16-bit element vectors, prefer to split those into pieces with
1393     // 16-bit vector parts.
1394     Actions.bitcastIf(
1395       [=](const LegalityQuery &Query) -> bool {
1396         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1397                                           Query.MMODescrs[0].MemoryTy);
1398       }, bitcastToRegisterType(0));
1399 
1400     if (!IsStore) {
1401       // Widen suitably aligned loads by loading extra bytes. The standard
1402       // legalization actions can't properly express widening memory operands.
1403       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1404         return shouldWidenLoad(ST, Query, G_LOAD);
1405       });
1406     }
1407 
1408     // FIXME: load/store narrowing should be moved to lower action
1409     Actions
1410         .narrowScalarIf(
1411             [=](const LegalityQuery &Query) -> bool {
1412               return !Query.Types[0].isVector() &&
1413                      needToSplitMemOp(Query, Op == G_LOAD);
1414             },
1415             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1416               const LLT DstTy = Query.Types[0];
1417               const LLT PtrTy = Query.Types[1];
1418 
1419               const unsigned DstSize = DstTy.getSizeInBits();
1420               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1421 
1422               // Split extloads.
1423               if (DstSize > MemSize)
1424                 return std::pair(0, LLT::scalar(MemSize));
1425 
1426               unsigned MaxSize = maxSizeForAddrSpace(
1427                   ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1428                   Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1429               if (MemSize > MaxSize)
1430                 return std::pair(0, LLT::scalar(MaxSize));
1431 
1432               uint64_t Align = Query.MMODescrs[0].AlignInBits;
1433               return std::pair(0, LLT::scalar(Align));
1434             })
1435         .fewerElementsIf(
1436             [=](const LegalityQuery &Query) -> bool {
1437               return Query.Types[0].isVector() &&
1438                      needToSplitMemOp(Query, Op == G_LOAD);
1439             },
1440             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1441               const LLT DstTy = Query.Types[0];
1442               const LLT PtrTy = Query.Types[1];
1443 
1444               LLT EltTy = DstTy.getElementType();
1445               unsigned MaxSize = maxSizeForAddrSpace(
1446                   ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1447                   Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1448 
1449               // FIXME: Handle widened to power of 2 results better. This ends
1450               // up scalarizing.
1451               // FIXME: 3 element stores scalarized on SI
1452 
1453               // Split if it's too large for the address space.
1454               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1455               if (MemSize > MaxSize) {
1456                 unsigned NumElts = DstTy.getNumElements();
1457                 unsigned EltSize = EltTy.getSizeInBits();
1458 
1459                 if (MaxSize % EltSize == 0) {
1460                   return std::pair(
1461                       0, LLT::scalarOrVector(
1462                              ElementCount::getFixed(MaxSize / EltSize), EltTy));
1463                 }
1464 
1465                 unsigned NumPieces = MemSize / MaxSize;
1466 
1467                 // FIXME: Refine when odd breakdowns handled
1468                 // The scalars will need to be re-legalized.
1469                 if (NumPieces == 1 || NumPieces >= NumElts ||
1470                     NumElts % NumPieces != 0)
1471                   return std::pair(0, EltTy);
1472 
1473                 return std::pair(0,
1474                                  LLT::fixed_vector(NumElts / NumPieces, EltTy));
1475               }
1476 
1477               // FIXME: We could probably handle weird extending loads better.
1478               if (DstTy.getSizeInBits() > MemSize)
1479                 return std::pair(0, EltTy);
1480 
1481               unsigned EltSize = EltTy.getSizeInBits();
1482               unsigned DstSize = DstTy.getSizeInBits();
1483               if (!isPowerOf2_32(DstSize)) {
1484                 // We're probably decomposing an odd sized store. Try to split
1485                 // to the widest type. TODO: Account for alignment. As-is it
1486                 // should be OK, since the new parts will be further legalized.
1487                 unsigned FloorSize = llvm::bit_floor(DstSize);
1488                 return std::pair(
1489                     0, LLT::scalarOrVector(
1490                            ElementCount::getFixed(FloorSize / EltSize), EltTy));
1491               }
1492 
1493               // May need relegalization for the scalars.
1494               return std::pair(0, EltTy);
1495             })
1496     .minScalar(0, S32)
1497     .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1498     .widenScalarToNextPow2(0)
1499     .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1500     .lower();
1501   }
1502 
1503   // FIXME: Unaligned accesses not lowered.
1504   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1505                        .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1506                                                   {S32, GlobalPtr, S16, 2 * 8},
1507                                                   {S32, LocalPtr, S8, 8},
1508                                                   {S32, LocalPtr, S16, 16},
1509                                                   {S32, PrivatePtr, S8, 8},
1510                                                   {S32, PrivatePtr, S16, 16},
1511                                                   {S32, ConstantPtr, S8, 8},
1512                                                   {S32, ConstantPtr, S16, 2 * 8}})
1513                        .legalIf(
1514                          [=](const LegalityQuery &Query) -> bool {
1515                            return isLoadStoreLegal(ST, Query);
1516                          });
1517 
1518   if (ST.hasFlatAddressSpace()) {
1519     ExtLoads.legalForTypesWithMemDesc(
1520         {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1521   }
1522 
1523   // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1524   // 64-bits.
1525   //
1526   // TODO: Should generalize bitcast action into coerce, which will also cover
1527   // inserting addrspacecasts.
1528   ExtLoads.customIf(typeIs(1, Constant32Ptr));
1529 
1530   ExtLoads.clampScalar(0, S32, S32)
1531           .widenScalarToNextPow2(0)
1532           .lower();
1533 
1534   auto &Atomics = getActionDefinitionsBuilder(
1535     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1536      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1537      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1538      G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1539     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1540                {S64, GlobalPtr}, {S64, LocalPtr},
1541                {S32, RegionPtr}, {S64, RegionPtr}});
1542   if (ST.hasFlatAddressSpace()) {
1543     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1544   }
1545 
1546   auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1547   if (ST.hasLDSFPAtomicAdd()) {
1548     Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1549     if (ST.hasGFX90AInsts())
1550       Atomic.legalFor({{S64, LocalPtr}});
1551     if (ST.hasAtomicDsPkAdd16Insts())
1552       Atomic.legalFor({{V2S16, LocalPtr}});
1553   }
1554   if (ST.hasAtomicFaddInsts())
1555     Atomic.legalFor({{S32, GlobalPtr}});
1556   if (ST.hasFlatAtomicFaddF32Inst())
1557     Atomic.legalFor({{S32, FlatPtr}});
1558 
1559   if (ST.hasGFX90AInsts()) {
1560     // These are legal with some caveats, and should have undergone expansion in
1561     // the IR in most situations
1562     // TODO: Move atomic expansion into legalizer
1563     Atomic.legalFor({
1564         {S32, GlobalPtr},
1565         {S64, GlobalPtr},
1566         {S64, FlatPtr}
1567       });
1568   }
1569 
1570   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1571   // demarshalling
1572   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1573     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1574                 {S32, FlatPtr}, {S64, FlatPtr}})
1575     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1576                {S32, RegionPtr}, {S64, RegionPtr}});
1577   // TODO: Pointer types, any 32-bit or 64-bit vector
1578 
1579   // Condition should be s32 for scalar, s1 for vector.
1580   getActionDefinitionsBuilder(G_SELECT)
1581       .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1582                                  LocalPtr, FlatPtr, PrivatePtr,
1583                                  LLT::fixed_vector(2, LocalPtr),
1584                                  LLT::fixed_vector(2, PrivatePtr)},
1585                                 {S1, S32})
1586       .clampScalar(0, S16, S64)
1587       .scalarize(1)
1588       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1589       .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1590       .clampMaxNumElements(0, S32, 2)
1591       .clampMaxNumElements(0, LocalPtr, 2)
1592       .clampMaxNumElements(0, PrivatePtr, 2)
1593       .scalarize(0)
1594       .widenScalarToNextPow2(0)
1595       .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1596 
1597   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1598   // be more flexible with the shift amount type.
1599   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1600     .legalFor({{S32, S32}, {S64, S32}});
1601   if (ST.has16BitInsts()) {
1602     if (ST.hasVOP3PInsts()) {
1603       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1604             .clampMaxNumElements(0, S16, 2);
1605     } else
1606       Shifts.legalFor({{S16, S16}});
1607 
1608     // TODO: Support 16-bit shift amounts for all types
1609     Shifts.widenScalarIf(
1610       [=](const LegalityQuery &Query) {
1611         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1612         // 32-bit amount.
1613         const LLT ValTy = Query.Types[0];
1614         const LLT AmountTy = Query.Types[1];
1615         return ValTy.getSizeInBits() <= 16 &&
1616                AmountTy.getSizeInBits() < 16;
1617       }, changeTo(1, S16));
1618     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1619     Shifts.clampScalar(1, S32, S32);
1620     Shifts.widenScalarToNextPow2(0, 16);
1621     Shifts.clampScalar(0, S16, S64);
1622 
1623     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1624       .minScalar(0, S16)
1625       .scalarize(0)
1626       .lower();
1627   } else {
1628     // Make sure we legalize the shift amount type first, as the general
1629     // expansion for the shifted type will produce much worse code if it hasn't
1630     // been truncated already.
1631     Shifts.clampScalar(1, S32, S32);
1632     Shifts.widenScalarToNextPow2(0, 32);
1633     Shifts.clampScalar(0, S32, S64);
1634 
1635     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1636       .minScalar(0, S32)
1637       .scalarize(0)
1638       .lower();
1639   }
1640   Shifts.scalarize(0);
1641 
1642   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1643     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1644     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1645     unsigned IdxTypeIdx = 2;
1646 
1647     getActionDefinitionsBuilder(Op)
1648       .customIf([=](const LegalityQuery &Query) {
1649           const LLT EltTy = Query.Types[EltTypeIdx];
1650           const LLT VecTy = Query.Types[VecTypeIdx];
1651           const LLT IdxTy = Query.Types[IdxTypeIdx];
1652           const unsigned EltSize = EltTy.getSizeInBits();
1653           const bool isLegalVecType =
1654               !!SIRegisterInfo::getSGPRClassForBitWidth(VecTy.getSizeInBits());
1655           // Address space 8 pointers are 128-bit wide values, but the logic
1656           // below will try to bitcast them to 2N x s64, which will fail.
1657           // Therefore, as an intermediate step, wrap extracts/insertions from a
1658           // ptrtoint-ing the vector and scalar arguments (or inttoptring the
1659           // extraction result) in order to produce a vector operation that can
1660           // be handled by the logic below.
1661           if (EltTy.isPointer() && EltSize > 64)
1662             return true;
1663           return (EltSize == 32 || EltSize == 64) &&
1664                   VecTy.getSizeInBits() % 32 == 0 &&
1665                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1666                   IdxTy.getSizeInBits() == 32 &&
1667                   isLegalVecType;
1668         })
1669       .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1670                  bitcastToVectorElement32(VecTypeIdx))
1671       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1672       .bitcastIf(
1673         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1674         [=](const LegalityQuery &Query) {
1675           // For > 64-bit element types, try to turn this into a 64-bit
1676           // element vector since we may be able to do better indexing
1677           // if this is scalar. If not, fall back to 32.
1678           const LLT EltTy = Query.Types[EltTypeIdx];
1679           const LLT VecTy = Query.Types[VecTypeIdx];
1680           const unsigned DstEltSize = EltTy.getSizeInBits();
1681           const unsigned VecSize = VecTy.getSizeInBits();
1682 
1683           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1684           return std::pair(
1685               VecTypeIdx,
1686               LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1687         })
1688       .clampScalar(EltTypeIdx, S32, S64)
1689       .clampScalar(VecTypeIdx, S32, S64)
1690       .clampScalar(IdxTypeIdx, S32, S32)
1691       .clampMaxNumElements(VecTypeIdx, S32, 32)
1692       // TODO: Clamp elements for 64-bit vectors?
1693       .moreElementsIf(
1694         isIllegalRegisterType(VecTypeIdx),
1695         moreElementsToNextExistingRegClass(VecTypeIdx))
1696       // It should only be necessary with variable indexes.
1697       // As a last resort, lower to the stack
1698       .lower();
1699   }
1700 
1701   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1702     .unsupportedIf([=](const LegalityQuery &Query) {
1703         const LLT &EltTy = Query.Types[1].getElementType();
1704         return Query.Types[0] != EltTy;
1705       });
1706 
1707   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1708     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1709     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1710 
1711     // FIXME: Doesn't handle extract of illegal sizes.
1712     getActionDefinitionsBuilder(Op)
1713       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1714       .lowerIf([=](const LegalityQuery &Query) {
1715           // Sub-vector(or single element) insert and extract.
1716           // TODO: verify immediate offset here since lower only works with
1717           // whole elements.
1718           const LLT BigTy = Query.Types[BigTyIdx];
1719           return BigTy.isVector();
1720         })
1721       // FIXME: Multiples of 16 should not be legal.
1722       .legalIf([=](const LegalityQuery &Query) {
1723           const LLT BigTy = Query.Types[BigTyIdx];
1724           const LLT LitTy = Query.Types[LitTyIdx];
1725           return (BigTy.getSizeInBits() % 32 == 0) &&
1726                  (LitTy.getSizeInBits() % 16 == 0);
1727         })
1728       .widenScalarIf(
1729         [=](const LegalityQuery &Query) {
1730           const LLT BigTy = Query.Types[BigTyIdx];
1731           return (BigTy.getScalarSizeInBits() < 16);
1732         },
1733         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1734       .widenScalarIf(
1735         [=](const LegalityQuery &Query) {
1736           const LLT LitTy = Query.Types[LitTyIdx];
1737           return (LitTy.getScalarSizeInBits() < 16);
1738         },
1739         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1740       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1741       .widenScalarToNextPow2(BigTyIdx, 32);
1742 
1743   }
1744 
1745   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1746     .legalForCartesianProduct(AllS32Vectors, {S32})
1747     .legalForCartesianProduct(AllS64Vectors, {S64})
1748     .clampNumElements(0, V16S32, V32S32)
1749     .clampNumElements(0, V2S64, V16S64)
1750     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16))
1751     .moreElementsIf(
1752       isIllegalRegisterType(0),
1753       moreElementsToNextExistingRegClass(0));
1754 
1755   if (ST.hasScalarPackInsts()) {
1756     BuildVector
1757       // FIXME: Should probably widen s1 vectors straight to s32
1758       .minScalarOrElt(0, S16)
1759       .minScalar(1, S16);
1760 
1761     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1762       .legalFor({V2S16, S32})
1763       .lower();
1764   } else {
1765     BuildVector.customFor({V2S16, S16});
1766     BuildVector.minScalarOrElt(0, S32);
1767 
1768     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1769       .customFor({V2S16, S32})
1770       .lower();
1771   }
1772 
1773   BuildVector.legalIf(isRegisterType(0));
1774 
1775   // FIXME: Clamp maximum size
1776   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1777     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1778     .clampMaxNumElements(0, S32, 32)
1779     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1780     .clampMaxNumElements(0, S16, 64);
1781 
1782   getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1783 
1784   // Merge/Unmerge
1785   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1786     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1787     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1788 
1789     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1790       const LLT Ty = Query.Types[TypeIdx];
1791       if (Ty.isVector()) {
1792         const LLT &EltTy = Ty.getElementType();
1793         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1794           return true;
1795         if (!llvm::has_single_bit<uint32_t>(EltTy.getSizeInBits()))
1796           return true;
1797       }
1798       return false;
1799     };
1800 
1801     auto &Builder = getActionDefinitionsBuilder(Op)
1802       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1803       .lowerFor({{S16, V2S16}})
1804       .lowerIf([=](const LegalityQuery &Query) {
1805           const LLT BigTy = Query.Types[BigTyIdx];
1806           return BigTy.getSizeInBits() == 32;
1807         })
1808       // Try to widen to s16 first for small types.
1809       // TODO: Only do this on targets with legal s16 shifts
1810       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1811       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1812       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1813       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1814                            elementTypeIs(1, S16)),
1815                        changeTo(1, V2S16))
1816       // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1817       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1818       // valid.
1819       .clampScalar(LitTyIdx, S32, S512)
1820       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1821       // Break up vectors with weird elements into scalars
1822       .fewerElementsIf(
1823         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1824         scalarize(0))
1825       .fewerElementsIf(
1826         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1827         scalarize(1))
1828       .clampScalar(BigTyIdx, S32, MaxScalar);
1829 
1830     if (Op == G_MERGE_VALUES) {
1831       Builder.widenScalarIf(
1832         // TODO: Use 16-bit shifts if legal for 8-bit values?
1833         [=](const LegalityQuery &Query) {
1834           const LLT Ty = Query.Types[LitTyIdx];
1835           return Ty.getSizeInBits() < 32;
1836         },
1837         changeTo(LitTyIdx, S32));
1838     }
1839 
1840     Builder.widenScalarIf(
1841       [=](const LegalityQuery &Query) {
1842         const LLT Ty = Query.Types[BigTyIdx];
1843         return Ty.getSizeInBits() % 16 != 0;
1844       },
1845       [=](const LegalityQuery &Query) {
1846         // Pick the next power of 2, or a multiple of 64 over 128.
1847         // Whichever is smaller.
1848         const LLT &Ty = Query.Types[BigTyIdx];
1849         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1850         if (NewSizeInBits >= 256) {
1851           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1852           if (RoundedTo < NewSizeInBits)
1853             NewSizeInBits = RoundedTo;
1854         }
1855         return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1856       })
1857       // Any vectors left are the wrong size. Scalarize them.
1858       .scalarize(0)
1859       .scalarize(1);
1860   }
1861 
1862   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1863   // RegBankSelect.
1864   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1865     .legalFor({{S32}, {S64}});
1866 
1867   if (ST.hasVOP3PInsts()) {
1868     SextInReg.lowerFor({{V2S16}})
1869       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1870       // get more vector shift opportunities, since we'll get those when
1871       // expanded.
1872       .clampMaxNumElementsStrict(0, S16, 2);
1873   } else if (ST.has16BitInsts()) {
1874     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1875   } else {
1876     // Prefer to promote to s32 before lowering if we don't have 16-bit
1877     // shifts. This avoid a lot of intermediate truncate and extend operations.
1878     SextInReg.lowerFor({{S32}, {S64}});
1879   }
1880 
1881   SextInReg
1882     .scalarize(0)
1883     .clampScalar(0, S32, S64)
1884     .lower();
1885 
1886   getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1887     .scalarize(0)
1888     .lower();
1889 
1890   // TODO: Only Try to form v2s16 with legal packed instructions.
1891   getActionDefinitionsBuilder(G_FSHR)
1892     .legalFor({{S32, S32}})
1893     .lowerFor({{V2S16, V2S16}})
1894     .clampMaxNumElementsStrict(0, S16, 2)
1895     .scalarize(0)
1896     .lower();
1897 
1898   if (ST.hasVOP3PInsts()) {
1899     getActionDefinitionsBuilder(G_FSHL)
1900       .lowerFor({{V2S16, V2S16}})
1901       .clampMaxNumElementsStrict(0, S16, 2)
1902       .scalarize(0)
1903       .lower();
1904   } else {
1905     getActionDefinitionsBuilder(G_FSHL)
1906       .scalarize(0)
1907       .lower();
1908   }
1909 
1910   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1911     .legalFor({S64});
1912 
1913   getActionDefinitionsBuilder(G_FENCE)
1914     .alwaysLegal();
1915 
1916   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1917       .scalarize(0)
1918       .minScalar(0, S32)
1919       .lower();
1920 
1921   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1922       .legalFor({{S32, S32}, {S64, S32}})
1923       .clampScalar(1, S32, S32)
1924       .clampScalar(0, S32, S64)
1925       .widenScalarToNextPow2(0)
1926       .scalarize(0);
1927 
1928   getActionDefinitionsBuilder({
1929       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1930       G_FCOPYSIGN,
1931 
1932       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1933       G_ATOMICRMW_NAND,
1934       G_ATOMICRMW_FSUB,
1935       G_READ_REGISTER,
1936       G_WRITE_REGISTER,
1937 
1938       G_SADDO, G_SSUBO,
1939 
1940        // TODO: Implement
1941       G_FMINIMUM, G_FMAXIMUM}).lower();
1942 
1943   getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
1944       .lower();
1945 
1946   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1947         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1948         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1949     .unsupported();
1950 
1951   getLegacyLegalizerInfo().computeTables();
1952   verify(*ST.getInstrInfo());
1953 }
1954 
1955 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1956                                          MachineInstr &MI) const {
1957   MachineIRBuilder &B = Helper.MIRBuilder;
1958   MachineRegisterInfo &MRI = *B.getMRI();
1959 
1960   switch (MI.getOpcode()) {
1961   case TargetOpcode::G_ADDRSPACE_CAST:
1962     return legalizeAddrSpaceCast(MI, MRI, B);
1963   case TargetOpcode::G_FRINT:
1964     return legalizeFrint(MI, MRI, B);
1965   case TargetOpcode::G_FCEIL:
1966     return legalizeFceil(MI, MRI, B);
1967   case TargetOpcode::G_FREM:
1968     return legalizeFrem(MI, MRI, B);
1969   case TargetOpcode::G_INTRINSIC_TRUNC:
1970     return legalizeIntrinsicTrunc(MI, MRI, B);
1971   case TargetOpcode::G_SITOFP:
1972     return legalizeITOFP(MI, MRI, B, true);
1973   case TargetOpcode::G_UITOFP:
1974     return legalizeITOFP(MI, MRI, B, false);
1975   case TargetOpcode::G_FPTOSI:
1976     return legalizeFPTOI(MI, MRI, B, true);
1977   case TargetOpcode::G_FPTOUI:
1978     return legalizeFPTOI(MI, MRI, B, false);
1979   case TargetOpcode::G_FMINNUM:
1980   case TargetOpcode::G_FMAXNUM:
1981   case TargetOpcode::G_FMINNUM_IEEE:
1982   case TargetOpcode::G_FMAXNUM_IEEE:
1983     return legalizeMinNumMaxNum(Helper, MI);
1984   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1985     return legalizeExtractVectorElt(MI, MRI, B);
1986   case TargetOpcode::G_INSERT_VECTOR_ELT:
1987     return legalizeInsertVectorElt(MI, MRI, B);
1988   case TargetOpcode::G_FSIN:
1989   case TargetOpcode::G_FCOS:
1990     return legalizeSinCos(MI, MRI, B);
1991   case TargetOpcode::G_GLOBAL_VALUE:
1992     return legalizeGlobalValue(MI, MRI, B);
1993   case TargetOpcode::G_LOAD:
1994   case TargetOpcode::G_SEXTLOAD:
1995   case TargetOpcode::G_ZEXTLOAD:
1996     return legalizeLoad(Helper, MI);
1997   case TargetOpcode::G_STORE:
1998     return legalizeStore(Helper, MI);
1999   case TargetOpcode::G_FMAD:
2000     return legalizeFMad(MI, MRI, B);
2001   case TargetOpcode::G_FDIV:
2002     return legalizeFDIV(MI, MRI, B);
2003   case TargetOpcode::G_FFREXP:
2004     return legalizeFFREXP(MI, MRI, B);
2005   case TargetOpcode::G_FSQRT:
2006     return legalizeFSQRT(MI, MRI, B);
2007   case TargetOpcode::G_UDIV:
2008   case TargetOpcode::G_UREM:
2009   case TargetOpcode::G_UDIVREM:
2010     return legalizeUnsignedDIV_REM(MI, MRI, B);
2011   case TargetOpcode::G_SDIV:
2012   case TargetOpcode::G_SREM:
2013   case TargetOpcode::G_SDIVREM:
2014     return legalizeSignedDIV_REM(MI, MRI, B);
2015   case TargetOpcode::G_ATOMIC_CMPXCHG:
2016     return legalizeAtomicCmpXChg(MI, MRI, B);
2017   case TargetOpcode::G_FLOG2:
2018     return legalizeFlog2(MI, B);
2019   case TargetOpcode::G_FLOG:
2020   case TargetOpcode::G_FLOG10:
2021     return legalizeFlogCommon(MI, B);
2022   case TargetOpcode::G_FEXP2:
2023     return legalizeFExp2(MI, B);
2024   case TargetOpcode::G_FEXP:
2025     return legalizeFExp(MI, B);
2026   case TargetOpcode::G_FPOW:
2027     return legalizeFPow(MI, B);
2028   case TargetOpcode::G_FFLOOR:
2029     return legalizeFFloor(MI, MRI, B);
2030   case TargetOpcode::G_BUILD_VECTOR:
2031   case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2032     return legalizeBuildVector(MI, MRI, B);
2033   case TargetOpcode::G_MUL:
2034     return legalizeMul(Helper, MI);
2035   case TargetOpcode::G_CTLZ:
2036   case TargetOpcode::G_CTTZ:
2037     return legalizeCTLZ_CTTZ(MI, MRI, B);
2038   case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
2039     return legalizeFPTruncRound(MI, B);
2040   default:
2041     return false;
2042   }
2043 
2044   llvm_unreachable("expected switch to return");
2045 }
2046 
2047 Register AMDGPULegalizerInfo::getSegmentAperture(
2048   unsigned AS,
2049   MachineRegisterInfo &MRI,
2050   MachineIRBuilder &B) const {
2051   MachineFunction &MF = B.getMF();
2052   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2053   const LLT S32 = LLT::scalar(32);
2054   const LLT S64 = LLT::scalar(64);
2055 
2056   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
2057 
2058   if (ST.hasApertureRegs()) {
2059     // Note: this register is somewhat broken. When used as a 32-bit operand,
2060     // it only returns zeroes. The real value is in the upper 32 bits.
2061     // Thus, we must emit extract the high 32 bits.
2062     const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
2063                                        ? AMDGPU::SRC_SHARED_BASE
2064                                        : AMDGPU::SRC_PRIVATE_BASE;
2065     // FIXME: It would be more natural to emit a COPY here, but then copy
2066     // coalescing would kick in and it would think it's okay to use the "HI"
2067     // subregister (instead of extracting the HI 32 bits) which is an artificial
2068     // (unusable) register.
2069     //  Register TableGen definitions would need an overhaul to get rid of the
2070     //  artificial "HI" aperture registers and prevent this kind of issue from
2071     //  happening.
2072     Register Dst = MRI.createGenericVirtualRegister(S64);
2073     MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
2074     B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
2075     return B.buildUnmerge(S32, Dst).getReg(1);
2076   }
2077 
2078   // TODO: can we be smarter about machine pointer info?
2079   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
2080   Register LoadAddr = MRI.createGenericVirtualRegister(
2081     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2082   // For code object version 5, private_base and shared_base are passed through
2083   // implicit kernargs.
2084   if (AMDGPU::getCodeObjectVersion(*MF.getFunction().getParent()) >=
2085       AMDGPU::AMDHSA_COV5) {
2086     AMDGPUTargetLowering::ImplicitParameter Param =
2087         AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
2088                                       : AMDGPUTargetLowering::PRIVATE_BASE;
2089     uint64_t Offset =
2090         ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
2091 
2092     Register KernargPtrReg = MRI.createGenericVirtualRegister(
2093         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2094 
2095     if (!loadInputValue(KernargPtrReg, B,
2096                         AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
2097       return Register();
2098 
2099     MachineMemOperand *MMO = MF.getMachineMemOperand(
2100         PtrInfo,
2101         MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2102             MachineMemOperand::MOInvariant,
2103         LLT::scalar(32), commonAlignment(Align(64), Offset));
2104 
2105     // Pointer address
2106     B.buildPtrAdd(LoadAddr, KernargPtrReg,
2107                   B.buildConstant(LLT::scalar(64), Offset).getReg(0));
2108     // Load address
2109     return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2110   }
2111 
2112   Register QueuePtr = MRI.createGenericVirtualRegister(
2113     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2114 
2115   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
2116     return Register();
2117 
2118   // Offset into amd_queue_t for group_segment_aperture_base_hi /
2119   // private_segment_aperture_base_hi.
2120   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
2121 
2122   MachineMemOperand *MMO = MF.getMachineMemOperand(
2123       PtrInfo,
2124       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2125           MachineMemOperand::MOInvariant,
2126       LLT::scalar(32), commonAlignment(Align(64), StructOffset));
2127 
2128   B.buildPtrAdd(LoadAddr, QueuePtr,
2129                 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
2130   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2131 }
2132 
2133 /// Return true if the value is a known valid address, such that a null check is
2134 /// not necessary.
2135 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
2136                            const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
2137   MachineInstr *Def = MRI.getVRegDef(Val);
2138   switch (Def->getOpcode()) {
2139   case AMDGPU::G_FRAME_INDEX:
2140   case AMDGPU::G_GLOBAL_VALUE:
2141   case AMDGPU::G_BLOCK_ADDR:
2142     return true;
2143   case AMDGPU::G_CONSTANT: {
2144     const ConstantInt *CI = Def->getOperand(1).getCImm();
2145     return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
2146   }
2147   default:
2148     return false;
2149   }
2150 
2151   return false;
2152 }
2153 
2154 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
2155   MachineInstr &MI, MachineRegisterInfo &MRI,
2156   MachineIRBuilder &B) const {
2157   MachineFunction &MF = B.getMF();
2158 
2159   const LLT S32 = LLT::scalar(32);
2160   Register Dst = MI.getOperand(0).getReg();
2161   Register Src = MI.getOperand(1).getReg();
2162 
2163   LLT DstTy = MRI.getType(Dst);
2164   LLT SrcTy = MRI.getType(Src);
2165   unsigned DestAS = DstTy.getAddressSpace();
2166   unsigned SrcAS = SrcTy.getAddressSpace();
2167 
2168   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
2169   // vector element.
2170   assert(!DstTy.isVector());
2171 
2172   const AMDGPUTargetMachine &TM
2173     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
2174 
2175   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
2176     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
2177     return true;
2178   }
2179 
2180   if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
2181       (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
2182        DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2183     if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
2184       // Extract low 32-bits of the pointer.
2185       B.buildExtract(Dst, Src, 0);
2186       MI.eraseFromParent();
2187       return true;
2188     }
2189 
2190     unsigned NullVal = TM.getNullPointerValue(DestAS);
2191 
2192     auto SegmentNull = B.buildConstant(DstTy, NullVal);
2193     auto FlatNull = B.buildConstant(SrcTy, 0);
2194 
2195     // Extract low 32-bits of the pointer.
2196     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
2197 
2198     auto CmpRes =
2199         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
2200     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
2201 
2202     MI.eraseFromParent();
2203     return true;
2204   }
2205 
2206   if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
2207       (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2208        SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2209     Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
2210     if (!ApertureReg.isValid())
2211       return false;
2212 
2213     // Coerce the type of the low half of the result so we can use merge_values.
2214     Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
2215 
2216     // TODO: Should we allow mismatched types but matching sizes in merges to
2217     // avoid the ptrtoint?
2218     auto BuildPtr = B.buildMergeLikeInstr(DstTy, {SrcAsInt, ApertureReg});
2219 
2220     if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
2221       B.buildCopy(Dst, BuildPtr);
2222       MI.eraseFromParent();
2223       return true;
2224     }
2225 
2226     auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
2227     auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
2228 
2229     auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
2230                               SegmentNull.getReg(0));
2231 
2232     B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2233 
2234     MI.eraseFromParent();
2235     return true;
2236   }
2237 
2238   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2239       SrcTy.getSizeInBits() == 64) {
2240     // Truncate.
2241     B.buildExtract(Dst, Src, 0);
2242     MI.eraseFromParent();
2243     return true;
2244   }
2245 
2246   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2247       DstTy.getSizeInBits() == 64) {
2248     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2249     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2250     auto PtrLo = B.buildPtrToInt(S32, Src);
2251     auto HighAddr = B.buildConstant(S32, AddrHiVal);
2252     B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
2253     MI.eraseFromParent();
2254     return true;
2255   }
2256 
2257   DiagnosticInfoUnsupported InvalidAddrSpaceCast(
2258       MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
2259 
2260   LLVMContext &Ctx = MF.getFunction().getContext();
2261   Ctx.diagnose(InvalidAddrSpaceCast);
2262   B.buildUndef(Dst);
2263   MI.eraseFromParent();
2264   return true;
2265 }
2266 
2267 bool AMDGPULegalizerInfo::legalizeFrint(
2268   MachineInstr &MI, MachineRegisterInfo &MRI,
2269   MachineIRBuilder &B) const {
2270   Register Src = MI.getOperand(1).getReg();
2271   LLT Ty = MRI.getType(Src);
2272   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2273 
2274   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2275   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2276 
2277   auto C1 = B.buildFConstant(Ty, C1Val);
2278   auto CopySign = B.buildFCopysign(Ty, C1, Src);
2279 
2280   // TODO: Should this propagate fast-math-flags?
2281   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2282   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2283 
2284   auto C2 = B.buildFConstant(Ty, C2Val);
2285   auto Fabs = B.buildFAbs(Ty, Src);
2286 
2287   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2288   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2289   MI.eraseFromParent();
2290   return true;
2291 }
2292 
2293 bool AMDGPULegalizerInfo::legalizeFceil(
2294   MachineInstr &MI, MachineRegisterInfo &MRI,
2295   MachineIRBuilder &B) const {
2296 
2297   const LLT S1 = LLT::scalar(1);
2298   const LLT S64 = LLT::scalar(64);
2299 
2300   Register Src = MI.getOperand(1).getReg();
2301   assert(MRI.getType(Src) == S64);
2302 
2303   // result = trunc(src)
2304   // if (src > 0.0 && src != result)
2305   //   result += 1.0
2306 
2307   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2308 
2309   const auto Zero = B.buildFConstant(S64, 0.0);
2310   const auto One = B.buildFConstant(S64, 1.0);
2311   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2312   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2313   auto And = B.buildAnd(S1, Lt0, NeTrunc);
2314   auto Add = B.buildSelect(S64, And, One, Zero);
2315 
2316   // TODO: Should this propagate fast-math-flags?
2317   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2318   MI.eraseFromParent();
2319   return true;
2320 }
2321 
2322 bool AMDGPULegalizerInfo::legalizeFrem(
2323   MachineInstr &MI, MachineRegisterInfo &MRI,
2324   MachineIRBuilder &B) const {
2325     Register DstReg = MI.getOperand(0).getReg();
2326     Register Src0Reg = MI.getOperand(1).getReg();
2327     Register Src1Reg = MI.getOperand(2).getReg();
2328     auto Flags = MI.getFlags();
2329     LLT Ty = MRI.getType(DstReg);
2330 
2331     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2332     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2333     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2334     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2335     MI.eraseFromParent();
2336     return true;
2337 }
2338 
2339 static MachineInstrBuilder extractF64Exponent(Register Hi,
2340                                               MachineIRBuilder &B) {
2341   const unsigned FractBits = 52;
2342   const unsigned ExpBits = 11;
2343   LLT S32 = LLT::scalar(32);
2344 
2345   auto Const0 = B.buildConstant(S32, FractBits - 32);
2346   auto Const1 = B.buildConstant(S32, ExpBits);
2347 
2348   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
2349     .addUse(Hi)
2350     .addUse(Const0.getReg(0))
2351     .addUse(Const1.getReg(0));
2352 
2353   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2354 }
2355 
2356 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2357   MachineInstr &MI, MachineRegisterInfo &MRI,
2358   MachineIRBuilder &B) const {
2359   const LLT S1 = LLT::scalar(1);
2360   const LLT S32 = LLT::scalar(32);
2361   const LLT S64 = LLT::scalar(64);
2362 
2363   Register Src = MI.getOperand(1).getReg();
2364   assert(MRI.getType(Src) == S64);
2365 
2366   // TODO: Should this use extract since the low half is unused?
2367   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2368   Register Hi = Unmerge.getReg(1);
2369 
2370   // Extract the upper half, since this is where we will find the sign and
2371   // exponent.
2372   auto Exp = extractF64Exponent(Hi, B);
2373 
2374   const unsigned FractBits = 52;
2375 
2376   // Extract the sign bit.
2377   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2378   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2379 
2380   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2381 
2382   const auto Zero32 = B.buildConstant(S32, 0);
2383 
2384   // Extend back to 64-bits.
2385   auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});
2386 
2387   auto Shr = B.buildAShr(S64, FractMask, Exp);
2388   auto Not = B.buildNot(S64, Shr);
2389   auto Tmp0 = B.buildAnd(S64, Src, Not);
2390   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2391 
2392   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2393   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2394 
2395   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2396   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2397   MI.eraseFromParent();
2398   return true;
2399 }
2400 
2401 bool AMDGPULegalizerInfo::legalizeITOFP(
2402   MachineInstr &MI, MachineRegisterInfo &MRI,
2403   MachineIRBuilder &B, bool Signed) const {
2404 
2405   Register Dst = MI.getOperand(0).getReg();
2406   Register Src = MI.getOperand(1).getReg();
2407 
2408   const LLT S64 = LLT::scalar(64);
2409   const LLT S32 = LLT::scalar(32);
2410 
2411   assert(MRI.getType(Src) == S64);
2412 
2413   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2414   auto ThirtyTwo = B.buildConstant(S32, 32);
2415 
2416   if (MRI.getType(Dst) == S64) {
2417     auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2418                         : B.buildUITOFP(S64, Unmerge.getReg(1));
2419 
2420     auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2421     auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);
2422 
2423     // TODO: Should this propagate fast-math-flags?
2424     B.buildFAdd(Dst, LdExp, CvtLo);
2425     MI.eraseFromParent();
2426     return true;
2427   }
2428 
2429   assert(MRI.getType(Dst) == S32);
2430 
2431   auto One = B.buildConstant(S32, 1);
2432 
2433   MachineInstrBuilder ShAmt;
2434   if (Signed) {
2435     auto ThirtyOne = B.buildConstant(S32, 31);
2436     auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2437     auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2438     auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2439     auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
2440                                /*HasSideEffects=*/false)
2441                   .addUse(Unmerge.getReg(1));
2442     auto LS2 = B.buildSub(S32, LS, One);
2443     ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2444   } else
2445     ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2446   auto Norm = B.buildShl(S64, Src, ShAmt);
2447   auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2448   auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2449   auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2450   auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2451   auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2452   B.buildFLdexp(Dst, FVal, Scale);
2453   MI.eraseFromParent();
2454   return true;
2455 }
2456 
2457 // TODO: Copied from DAG implementation. Verify logic and document how this
2458 // actually works.
2459 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2460                                         MachineRegisterInfo &MRI,
2461                                         MachineIRBuilder &B,
2462                                         bool Signed) const {
2463 
2464   Register Dst = MI.getOperand(0).getReg();
2465   Register Src = MI.getOperand(1).getReg();
2466 
2467   const LLT S64 = LLT::scalar(64);
2468   const LLT S32 = LLT::scalar(32);
2469 
2470   const LLT SrcLT = MRI.getType(Src);
2471   assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2472 
2473   unsigned Flags = MI.getFlags();
2474 
2475   // The basic idea of converting a floating point number into a pair of 32-bit
2476   // integers is illustrated as follows:
2477   //
2478   //     tf := trunc(val);
2479   //    hif := floor(tf * 2^-32);
2480   //    lof := tf - hif * 2^32; // lof is always positive due to floor.
2481   //     hi := fptoi(hif);
2482   //     lo := fptoi(lof);
2483   //
2484   auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2485   MachineInstrBuilder Sign;
2486   if (Signed && SrcLT == S32) {
2487     // However, a 32-bit floating point number has only 23 bits mantissa and
2488     // it's not enough to hold all the significant bits of `lof` if val is
2489     // negative. To avoid the loss of precision, We need to take the absolute
2490     // value after truncating and flip the result back based on the original
2491     // signedness.
2492     Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2493     Trunc = B.buildFAbs(S32, Trunc, Flags);
2494   }
2495   MachineInstrBuilder K0, K1;
2496   if (SrcLT == S64) {
2497     K0 = B.buildFConstant(
2498         S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2499     K1 = B.buildFConstant(
2500         S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2501   } else {
2502     K0 = B.buildFConstant(
2503         S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
2504     K1 = B.buildFConstant(
2505         S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2506   }
2507 
2508   auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2509   auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2510   auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2511 
2512   auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2513                                      : B.buildFPTOUI(S32, FloorMul);
2514   auto Lo = B.buildFPTOUI(S32, Fma);
2515 
2516   if (Signed && SrcLT == S32) {
2517     // Flip the result based on the signedness, which is either all 0s or 1s.
2518     Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
2519     // r := xor({lo, hi}, sign) - sign;
2520     B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
2521                Sign);
2522   } else
2523     B.buildMergeLikeInstr(Dst, {Lo, Hi});
2524   MI.eraseFromParent();
2525 
2526   return true;
2527 }
2528 
2529 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2530                                                MachineInstr &MI) const {
2531   MachineFunction &MF = Helper.MIRBuilder.getMF();
2532   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2533 
2534   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2535                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2536 
2537   // With ieee_mode disabled, the instructions have the correct behavior
2538   // already for G_FMINNUM/G_FMAXNUM
2539   if (!MFI->getMode().IEEE)
2540     return !IsIEEEOp;
2541 
2542   if (IsIEEEOp)
2543     return true;
2544 
2545   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2546 }
2547 
2548 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2549   MachineInstr &MI, MachineRegisterInfo &MRI,
2550   MachineIRBuilder &B) const {
2551   // TODO: Should move some of this into LegalizerHelper.
2552 
2553   // TODO: Promote dynamic indexing of s16 to s32
2554 
2555   Register Dst = MI.getOperand(0).getReg();
2556   Register Vec = MI.getOperand(1).getReg();
2557 
2558   LLT VecTy = MRI.getType(Vec);
2559   LLT EltTy = VecTy.getElementType();
2560   assert(EltTy == MRI.getType(Dst));
2561 
2562   // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2563   // but we can't go directly to that logic becasue you can't bitcast a vector
2564   // of pointers to a vector of integers. Therefore, introduce an intermediate
2565   // vector of integers using ptrtoint (and inttoptr on the output) in order to
2566   // drive the legalization forward.
2567   if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2568     LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2569     LLT IntVecTy = VecTy.changeElementType(IntTy);
2570 
2571     auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
2572     auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
2573     B.buildIntToPtr(Dst, IntElt);
2574 
2575     MI.eraseFromParent();
2576     return true;
2577   }
2578 
2579   // FIXME: Artifact combiner probably should have replaced the truncated
2580   // constant before this, so we shouldn't need
2581   // getIConstantVRegValWithLookThrough.
2582   std::optional<ValueAndVReg> MaybeIdxVal =
2583       getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2584   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2585     return true;
2586   const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2587 
2588   if (IdxVal < VecTy.getNumElements()) {
2589     auto Unmerge = B.buildUnmerge(EltTy, Vec);
2590     B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2591   } else {
2592     B.buildUndef(Dst);
2593   }
2594 
2595   MI.eraseFromParent();
2596   return true;
2597 }
2598 
2599 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2600   MachineInstr &MI, MachineRegisterInfo &MRI,
2601   MachineIRBuilder &B) const {
2602   // TODO: Should move some of this into LegalizerHelper.
2603 
2604   // TODO: Promote dynamic indexing of s16 to s32
2605 
2606   Register Dst = MI.getOperand(0).getReg();
2607   Register Vec = MI.getOperand(1).getReg();
2608   Register Ins = MI.getOperand(2).getReg();
2609 
2610   LLT VecTy = MRI.getType(Vec);
2611   LLT EltTy = VecTy.getElementType();
2612   assert(EltTy == MRI.getType(Ins));
2613 
2614   // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2615   // but we can't go directly to that logic becasue you can't bitcast a vector
2616   // of pointers to a vector of integers. Therefore, make the pointer vector
2617   // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
2618   // new value, and then inttoptr the result vector back. This will then allow
2619   // the rest of legalization to take over.
2620   if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2621     LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2622     LLT IntVecTy = VecTy.changeElementType(IntTy);
2623 
2624     auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
2625     auto IntIns = B.buildPtrToInt(IntTy, Ins);
2626     auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
2627                                                  MI.getOperand(3));
2628     B.buildIntToPtr(Dst, IntVecDest);
2629     MI.eraseFromParent();
2630     return true;
2631   }
2632 
2633   // FIXME: Artifact combiner probably should have replaced the truncated
2634   // constant before this, so we shouldn't need
2635   // getIConstantVRegValWithLookThrough.
2636   std::optional<ValueAndVReg> MaybeIdxVal =
2637       getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2638   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2639     return true;
2640 
2641   const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2642 
2643   unsigned NumElts = VecTy.getNumElements();
2644   if (IdxVal < NumElts) {
2645     SmallVector<Register, 8> SrcRegs;
2646     for (unsigned i = 0; i < NumElts; ++i)
2647       SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2648     B.buildUnmerge(SrcRegs, Vec);
2649 
2650     SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2651     B.buildMergeLikeInstr(Dst, SrcRegs);
2652   } else {
2653     B.buildUndef(Dst);
2654   }
2655 
2656   MI.eraseFromParent();
2657   return true;
2658 }
2659 
2660 bool AMDGPULegalizerInfo::legalizeSinCos(
2661   MachineInstr &MI, MachineRegisterInfo &MRI,
2662   MachineIRBuilder &B) const {
2663 
2664   Register DstReg = MI.getOperand(0).getReg();
2665   Register SrcReg = MI.getOperand(1).getReg();
2666   LLT Ty = MRI.getType(DstReg);
2667   unsigned Flags = MI.getFlags();
2668 
2669   Register TrigVal;
2670   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2671   if (ST.hasTrigReducedRange()) {
2672     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2673     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2674       .addUse(MulVal.getReg(0))
2675       .setMIFlags(Flags).getReg(0);
2676   } else
2677     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2678 
2679   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2680     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2681   B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg), false)
2682       .addUse(TrigVal)
2683       .setMIFlags(Flags);
2684   MI.eraseFromParent();
2685   return true;
2686 }
2687 
2688 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2689                                                   MachineIRBuilder &B,
2690                                                   const GlobalValue *GV,
2691                                                   int64_t Offset,
2692                                                   unsigned GAFlags) const {
2693   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2694   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2695   // to the following code sequence:
2696   //
2697   // For constant address space:
2698   //   s_getpc_b64 s[0:1]
2699   //   s_add_u32 s0, s0, $symbol
2700   //   s_addc_u32 s1, s1, 0
2701   //
2702   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2703   //   a fixup or relocation is emitted to replace $symbol with a literal
2704   //   constant, which is a pc-relative offset from the encoding of the $symbol
2705   //   operand to the global variable.
2706   //
2707   // For global address space:
2708   //   s_getpc_b64 s[0:1]
2709   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2710   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2711   //
2712   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2713   //   fixups or relocations are emitted to replace $symbol@*@lo and
2714   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2715   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2716   //   operand to the global variable.
2717   //
2718   // What we want here is an offset from the value returned by s_getpc
2719   // (which is the address of the s_add_u32 instruction) to the global
2720   // variable, but since the encoding of $symbol starts 4 bytes after the start
2721   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2722   // small. This requires us to add 4 to the global variable offset in order to
2723   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2724   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2725   // instruction.
2726 
2727   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2728 
2729   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2730     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2731 
2732   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2733     .addDef(PCReg);
2734 
2735   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2736   if (GAFlags == SIInstrInfo::MO_NONE)
2737     MIB.addImm(0);
2738   else
2739     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2740 
2741   if (!B.getMRI()->getRegClassOrNull(PCReg))
2742     B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2743 
2744   if (PtrTy.getSizeInBits() == 32)
2745     B.buildExtract(DstReg, PCReg, 0);
2746   return true;
2747  }
2748 
/// Custom legalization of an instruction that materializes the address of a
/// global (operand 0 = destination pointer, operand 1 = the GlobalValue).
///
/// Handling depends on the destination pointer's address space:
///
///   * LOCAL/REGION address space:
///       - used from a non-entry function (other than the global named
///         "llvm.amdgcn.module.lds"): emit a warning diagnostic, a trap and
///         an undef result;
///       - if shouldUseLDSConstAddress(GV) is false: tag the operand with
///         MO_ABS32_LO and leave the instruction in place;
///       - externally visible LOCAL global whose value type has zero alloc
///         size: the address is amdgcn.groupstaticsize converted to a pointer;
///       - otherwise: the address is the constant returned by
///         allocateLDSGlobal.
///   * Other address spaces: a pc-relative address with a fixup, a
///     pc-relative address with a REL32 relocation, or a load of the address
///     from the GOT, in that order of preference.
///
/// Always returns true.
bool AMDGPULegalizerInfo::legalizeGlobalValue(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned AS = Ty.getAddressSpace();

  const GlobalValue *GV = MI.getOperand(1).getGlobal();
  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isModuleEntryFunction() &&
        !GV->getName().equals("llvm.amdgcn.module.lds")) {
      const Function &Fn = MF.getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
        Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
        DS_Warning);
      Fn.getContext().diagnose(BadLDSDecl);

      // We currently don't have a way to correctly allocate LDS objects that
      // aren't directly associated with a kernel. We do force inlining of
      // functions that use local objects. However, if these dead functions are
      // not eliminated, we don't want a compile time error. Just emit a warning
      // and a trap, since there should be no callable path here.
      B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
      B.buildUndef(DstReg);
      MI.eraseFromParent();
      return true;
    }

    // TODO: We could emit code to handle the initialization somewhere.
    // We ignore the initializer for now and legalize it to allow selection.
    // The initializer will anyway get errored out during assembly emission.
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->shouldUseLDSConstAddress(GV)) {
      // Keep the instruction; only mark its global operand with MO_ABS32_LO.
      MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
      return true; // Leave in place;
    }

    if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
      // Note: this local `Ty` (the IR value type of GV) shadows the outer
      // LLT `Ty` within this scope.
      Type *Ty = GV->getValueType();
      // HIP uses an unsized array `extern __shared__ T s[]` or similar
      // zero-sized type in other languages to declare the dynamic shared
      // memory which size is not known at the compile time. They will be
      // allocated by the runtime and placed directly after the static
      // allocated ones. They all share the same offset.
      if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
        // Adjust alignment for that dynamic shared memory array.
        MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV));
        LLT S32 = LLT::scalar(32);
        // The address is the group static size reinterpreted as a pointer.
        auto Sz =
            B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
        B.buildIntToPtr(DstReg, Sz);
        MI.eraseFromParent();
        return true;
      }
    }

    // Ordinary LDS global: replace the instruction with the constant returned
    // by allocateLDSGlobal for this variable.
    B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
                                                   *cast<GlobalVariable>(GV)));
    MI.eraseFromParent();
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  // PC-relative address using the default relocation flags.
  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  // PC-relative address using MO_REL32 relocation flags.
  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  // Otherwise load the address from the GOT: compute the pc-relative address
  // of the GOT entry, then load the pointer stored there.
  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  // For a 32-bit destination the memory operand uses the 64-bit pointer type,
  // matching the 64-bit load emitted below before truncation.
  LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
      MachinePointerInfo::getGOT(MF),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      LoadTy, Align(8));

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
  } else
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  return true;
}
2850 
2851 static LLT widenToNextPowerOf2(LLT Ty) {
2852   if (Ty.isVector())
2853     return Ty.changeElementCount(
2854         ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2855   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2856 }
2857 
2858 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2859                                        MachineInstr &MI) const {
2860   MachineIRBuilder &B = Helper.MIRBuilder;
2861   MachineRegisterInfo &MRI = *B.getMRI();
2862   GISelChangeObserver &Observer = Helper.Observer;
2863 
2864   Register PtrReg = MI.getOperand(1).getReg();
2865   LLT PtrTy = MRI.getType(PtrReg);
2866   unsigned AddrSpace = PtrTy.getAddressSpace();
2867 
2868   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2869     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2870     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2871     Observer.changingInstr(MI);
2872     MI.getOperand(1).setReg(Cast.getReg(0));
2873     Observer.changedInstr(MI);
2874     return true;
2875   }
2876 
2877   if (MI.getOpcode() != AMDGPU::G_LOAD)
2878     return false;
2879 
2880   Register ValReg = MI.getOperand(0).getReg();
2881   LLT ValTy = MRI.getType(ValReg);
2882 
2883   if (hasBufferRsrcWorkaround(ValTy)) {
2884     Observer.changingInstr(MI);
2885     castBufferRsrcFromV4I32(MI, B, MRI, 0);
2886     Observer.changedInstr(MI);
2887     return true;
2888   }
2889 
2890   MachineMemOperand *MMO = *MI.memoperands_begin();
2891   const unsigned ValSize = ValTy.getSizeInBits();
2892   const LLT MemTy = MMO->getMemoryType();
2893   const Align MemAlign = MMO->getAlign();
2894   const unsigned MemSize = MemTy.getSizeInBits();
2895   const uint64_t AlignInBits = 8 * MemAlign.value();
2896 
2897   // Widen non-power-of-2 loads to the alignment if needed
2898   if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
2899     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2900 
2901     // This was already the correct extending load result type, so just adjust
2902     // the memory type.
2903     if (WideMemSize == ValSize) {
2904       MachineFunction &MF = B.getMF();
2905 
2906       MachineMemOperand *WideMMO =
2907           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2908       Observer.changingInstr(MI);
2909       MI.setMemRefs(MF, {WideMMO});
2910       Observer.changedInstr(MI);
2911       return true;
2912     }
2913 
2914     // Don't bother handling edge case that should probably never be produced.
2915     if (ValSize > WideMemSize)
2916       return false;
2917 
2918     LLT WideTy = widenToNextPowerOf2(ValTy);
2919 
2920     Register WideLoad;
2921     if (!WideTy.isVector()) {
2922       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2923       B.buildTrunc(ValReg, WideLoad).getReg(0);
2924     } else {
2925       // Extract the subvector.
2926 
2927       if (isRegisterType(ValTy)) {
2928         // If this a case where G_EXTRACT is legal, use it.
2929         // (e.g. <3 x s32> -> <4 x s32>)
2930         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2931         B.buildExtract(ValReg, WideLoad, 0);
2932       } else {
2933         // For cases where the widened type isn't a nice register value, unmerge
2934         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2935         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2936         B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
2937       }
2938     }
2939 
2940     MI.eraseFromParent();
2941     return true;
2942   }
2943 
2944   return false;
2945 }
2946 
2947 bool AMDGPULegalizerInfo::legalizeStore(LegalizerHelper &Helper,
2948                                         MachineInstr &MI) const {
2949   MachineIRBuilder &B = Helper.MIRBuilder;
2950   MachineRegisterInfo &MRI = *B.getMRI();
2951   GISelChangeObserver &Observer = Helper.Observer;
2952 
2953   Register DataReg = MI.getOperand(0).getReg();
2954   LLT DataTy = MRI.getType(DataReg);
2955 
2956   if (hasBufferRsrcWorkaround(DataTy)) {
2957     Observer.changingInstr(MI);
2958     castBufferRsrcArgToV4I32(MI, B, 0);
2959     Observer.changedInstr(MI);
2960     return true;
2961   }
2962   return false;
2963 }
2964 
2965 bool AMDGPULegalizerInfo::legalizeFMad(
2966   MachineInstr &MI, MachineRegisterInfo &MRI,
2967   MachineIRBuilder &B) const {
2968   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2969   assert(Ty.isScalar());
2970 
2971   MachineFunction &MF = B.getMF();
2972   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2973 
2974   // TODO: Always legal with future ftz flag.
2975   // FIXME: Do we need just output?
2976   if (Ty == LLT::scalar(32) &&
2977       MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign())
2978     return true;
2979   if (Ty == LLT::scalar(16) &&
2980       MFI->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign())
2981     return true;
2982 
2983   MachineIRBuilder HelperBuilder(MI);
2984   GISelObserverWrapper DummyObserver;
2985   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2986   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2987 }
2988 
2989 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2990   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2991   Register DstReg = MI.getOperand(0).getReg();
2992   Register PtrReg = MI.getOperand(1).getReg();
2993   Register CmpVal = MI.getOperand(2).getReg();
2994   Register NewVal = MI.getOperand(3).getReg();
2995 
2996   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2997          "this should not have been custom lowered");
2998 
2999   LLT ValTy = MRI.getType(CmpVal);
3000   LLT VecTy = LLT::fixed_vector(2, ValTy);
3001 
3002   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
3003 
3004   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3005     .addDef(DstReg)
3006     .addUse(PtrReg)
3007     .addUse(PackedVal)
3008     .setMemRefs(MI.memoperands());
3009 
3010   MI.eraseFromParent();
3011   return true;
3012 }
3013 
3014 /// Return true if it's known that \p Src can never be an f32 denormal value.
3015 static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI,
3016                                        Register Src) {
3017   Register ExtSrc;
3018   if (mi_match(Src, MRI, m_GFPExt(m_Reg(ExtSrc))))
3019     return MRI.getType(ExtSrc) == LLT::scalar(16);
3020   return false;
3021 }
3022 
3023 static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
3024   if (Flags & MachineInstr::FmAfn)
3025     return true;
3026   const auto &Options = MF.getTarget().Options;
3027   return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
3028 }
3029 
3030 static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src,
3031                                    unsigned Flags) {
3032   return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src) &&
3033          MF.getDenormalMode(APFloat::IEEEsingle()).Input !=
3034              DenormalMode::PreserveSign;
3035 }
3036 
3037 std::pair<Register, Register>
3038 AMDGPULegalizerInfo::getScaledLogInput(MachineIRBuilder &B, Register Src,
3039                                        unsigned Flags) const {
3040   if (!needsDenormHandlingF32(B.getMF(), Src, Flags))
3041     return {};
3042 
3043   const LLT F32 = LLT::scalar(32);
3044   auto SmallestNormal = B.buildFConstant(
3045       F32, APFloat::getSmallestNormalized(APFloat::IEEEsingle()));
3046   auto IsLtSmallestNormal =
3047       B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal);
3048 
3049   auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
3050   auto One = B.buildFConstant(F32, 1.0);
3051   auto ScaleFactor =
3052       B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
3053   auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);
3054 
3055   return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
3056 }
3057 
3058 bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI,
3059                                         MachineIRBuilder &B) const {
3060   // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
3061   // If we have to handle denormals, scale up the input and adjust the result.
3062 
3063   // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
3064   // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
3065 
3066   Register Dst = MI.getOperand(0).getReg();
3067   Register Src = MI.getOperand(1).getReg();
3068   LLT Ty = B.getMRI()->getType(Dst);
3069   unsigned Flags = MI.getFlags();
3070 
3071   if (Ty == LLT::scalar(16)) {
3072     const LLT F32 = LLT::scalar(32);
3073     // Nothing in half is a denormal when promoted to f32.
3074     auto Ext = B.buildFPExt(F32, Src, Flags);
3075     auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32}, false)
3076       .addUse(Ext.getReg(0))
3077       .setMIFlags(Flags);
3078     B.buildFPTrunc(Dst, Log2, Flags);
3079     MI.eraseFromParent();
3080     return true;
3081   }
3082 
3083   assert(Ty == LLT::scalar(32));
3084 
3085   auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
3086   if (!ScaledInput) {
3087     B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)}, false)
3088         .addUse(Src)
3089         .setMIFlags(Flags);
3090     MI.eraseFromParent();
3091     return true;
3092   }
3093 
3094   auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}, false)
3095                   .addUse(ScaledInput)
3096                   .setMIFlags(Flags);
3097 
3098   auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
3099   auto Zero = B.buildFConstant(Ty, 0.0);
3100   auto ResultOffset =
3101       B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
3102   B.buildFSub(Dst, Log2, ResultOffset, Flags);
3103 
3104   MI.eraseFromParent();
3105   return true;
3106 }
3107 
3108 static Register getMad(MachineIRBuilder &B, LLT Ty, Register X, Register Y,
3109                        Register Z, unsigned Flags) {
3110   auto FMul = B.buildFMul(Ty, X, Y, Flags);
3111   return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);
3112 }
3113 
3114 bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
3115                                              MachineIRBuilder &B) const {
3116   const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
3117   assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);
3118 
3119   MachineRegisterInfo &MRI = *B.getMRI();
3120   Register Dst = MI.getOperand(0).getReg();
3121   Register X = MI.getOperand(1).getReg();
3122   unsigned Flags = MI.getFlags();
3123   const LLT Ty = MRI.getType(X);
3124   MachineFunction &MF = B.getMF();
3125 
3126   const LLT F32 = LLT::scalar(32);
3127   const LLT F16 = LLT::scalar(16);
3128 
3129   const AMDGPUTargetMachine &TM =
3130       static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
3131 
3132   if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn) ||
3133       TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) {
3134     if (Ty == F16 && !ST.has16BitInsts()) {
3135       Register LogVal = MRI.createGenericVirtualRegister(F32);
3136       auto PromoteSrc = B.buildFPExt(F32, X);
3137       legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), IsLog10, Flags);
3138       B.buildFPTrunc(Dst, LogVal);
3139     } else {
3140       legalizeFlogUnsafe(B, Dst, X, IsLog10, Flags);
3141     }
3142 
3143     MI.eraseFromParent();
3144     return true;
3145   }
3146 
3147   auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags);
3148   if (ScaledInput)
3149     X = ScaledInput;
3150 
3151   auto Y = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}, false)
3152     .addUse(X)
3153     .setMIFlags(Flags);
3154 
3155   Register R;
3156   if (ST.hasFastFMAF32()) {
3157     // c+cc are ln(2)/ln(10) to more than 49 bits
3158     const float c_log10 = 0x1.344134p-2f;
3159     const float cc_log10 = 0x1.09f79ep-26f;
3160 
3161     // c + cc is ln(2) to more than 49 bits
3162     const float c_log = 0x1.62e42ep-1f;
3163     const float cc_log = 0x1.efa39ep-25f;
3164 
3165     auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
3166     auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
3167 
3168     R = B.buildFMul(Ty, Y, C, Flags).getReg(0);
3169     auto NegR = B.buildFNeg(Ty, R, Flags);
3170     auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags);
3171     auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags);
3172     R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0);
3173   } else {
3174     // ch+ct is ln(2)/ln(10) to more than 36 bits
3175     const float ch_log10 = 0x1.344000p-2f;
3176     const float ct_log10 = 0x1.3509f6p-18f;
3177 
3178     // ch + ct is ln(2) to more than 36 bits
3179     const float ch_log = 0x1.62e000p-1f;
3180     const float ct_log = 0x1.0bfbe8p-15f;
3181 
3182     auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
3183     auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
3184 
3185     auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3186     auto YH = B.buildAnd(Ty, Y, MaskConst);
3187     auto YT = B.buildFSub(Ty, Y, YH, Flags);
3188     auto YTCT = B.buildFMul(Ty, YT, CT, Flags);
3189 
3190     Register Mad0 =
3191         getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags);
3192     Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, Flags);
3193     R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags);
3194   }
3195 
3196   const bool IsFiniteOnly =
3197       (MI.getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) &&
3198       (MI.getFlag(MachineInstr::FmNoInfs) || TM.Options.NoInfsFPMath);
3199 
3200   if (!IsFiniteOnly) {
3201     // Expand isfinite(x) => fabs(x) < inf
3202     auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3203     auto Fabs = B.buildFAbs(Ty, Y);
3204     auto IsFinite =
3205         B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
3206     R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);
3207   }
3208 
3209   if (ScaledInput) {
3210     auto Zero = B.buildFConstant(Ty, 0.0);
3211     auto ShiftK =
3212         B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3213     auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
3214     B.buildFSub(Dst, R, Shift, Flags);
3215   } else {
3216     B.buildCopy(Dst, R);
3217   }
3218 
3219   MI.eraseFromParent();
3220   return true;
3221 }
3222 
3223 bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst,
3224                                              Register Src, bool IsLog10,
3225                                              unsigned Flags) const {
3226   const double Log2BaseInverted =
3227       IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
3228 
3229   LLT Ty = B.getMRI()->getType(Dst);
3230 
3231   if (Ty == LLT::scalar(32)) {
3232     auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
3233     if (ScaledInput) {
3234       auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}, false)
3235                         .addUse(Src)
3236                         .setMIFlags(Flags);
3237       auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
3238       auto Zero = B.buildFConstant(Ty, 0.0);
3239       auto ResultOffset =
3240           B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
3241       auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);
3242 
3243       if (ST.hasFastFMAF32())
3244         B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
3245       else {
3246         auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
3247         B.buildFAdd(Dst, Mul, ResultOffset, Flags);
3248       }
3249 
3250       return true;
3251     }
3252   }
3253 
3254   auto Log2Operand = Ty == LLT::scalar(16)
3255                          ? B.buildFLog2(Ty, Src, Flags)
3256                          : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}, false)
3257                                .addUse(Src)
3258                                .setMIFlags(Flags);
3259   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
3260   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
3261   return true;
3262 }
3263 
3264 bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI,
3265                                         MachineIRBuilder &B) const {
3266   // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3267   // If we have to handle denormals, scale up the input and adjust the result.
3268 
3269   Register Dst = MI.getOperand(0).getReg();
3270   Register Src = MI.getOperand(1).getReg();
3271   unsigned Flags = MI.getFlags();
3272   LLT Ty = B.getMRI()->getType(Dst);
3273   const LLT F16 = LLT::scalar(16);
3274   const LLT F32 = LLT::scalar(32);
3275 
3276   if (Ty == F16) {
3277     // Nothing in half is a denormal when promoted to f32.
3278     auto Ext = B.buildFPExt(F32, Src, Flags);
3279     auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32}, false)
3280       .addUse(Ext.getReg(0))
3281       .setMIFlags(Flags);
3282     B.buildFPTrunc(Dst, Log2, Flags);
3283     MI.eraseFromParent();
3284     return true;
3285   }
3286 
3287   assert(Ty == F32);
3288 
3289   if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) {
3290     B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst}, false)
3291         .addUse(Src)
3292         .setMIFlags(Flags);
3293     MI.eraseFromParent();
3294     return true;
3295   }
3296 
3297   // bool needs_scaling = x < -0x1.f80000p+6f;
3298   // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
3299 
3300   // -nextafter(128.0, -1)
3301   auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
3302   auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src,
3303                                   RangeCheckConst, Flags);
3304 
3305   auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
3306   auto Zero = B.buildFConstant(Ty, 0.0);
3307   auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
3308   auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);
3309 
3310   auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty}, false)
3311                   .addUse(AddInput.getReg(0))
3312                   .setMIFlags(Flags);
3313 
3314   auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
3315   auto One = B.buildFConstant(Ty, 1.0);
3316   auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
3317   B.buildFMul(Dst, Exp2, ResultScale, Flags);
3318   MI.eraseFromParent();
3319   return true;
3320 }
3321 
3322 bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
3323                                              Register Src,
3324                                              unsigned Flags) const {
3325   LLT Ty = B.getMRI()->getType(Dst);
3326   auto K = B.buildFConstant(Ty, numbers::log2e);
3327   auto Mul = B.buildFMul(Ty, Src, K, Flags);
3328 
3329   if (Ty == LLT::scalar(32)) {
3330     B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst}, false)
3331       .addUse(Mul.getReg(0))
3332       .setMIFlags(Flags);
3333   } else {
3334     B.buildFExp2(Dst, Mul.getReg(0), Flags);
3335   }
3336 
3337   return true;
3338 }
3339 
3340 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
3341                                        MachineIRBuilder &B) const {
3342   Register Dst = MI.getOperand(0).getReg();
3343   Register X = MI.getOperand(1).getReg();
3344   const unsigned Flags = MI.getFlags();
3345   MachineFunction &MF = B.getMF();
3346   MachineRegisterInfo &MRI = *B.getMRI();
3347   LLT Ty = MRI.getType(Dst);
3348   const LLT F16 = LLT::scalar(16);
3349   const LLT F32 = LLT::scalar(32);
3350   const bool IsExp10 = false; // TODO: For some reason exp10 is missing
3351 
3352   if (Ty == F16) {
3353     // v_exp_f16 (fmul x, log2e)
3354     if (allowApproxFunc(MF, Flags)) {
3355       // TODO: Does this really require fast?
3356       legalizeFExpUnsafe(B, Dst, X, Flags);
3357       MI.eraseFromParent();
3358       return true;
3359     }
3360 
3361     // exp(f16 x) ->
3362     //   fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3363 
3364     // Nothing in half is a denormal when promoted to f32.
3365     auto Ext = B.buildFPExt(F32, X, Flags);
3366     Register Lowered = MRI.createGenericVirtualRegister(F32);
3367     legalizeFExpUnsafe(B, Lowered, Ext.getReg(0), Flags);
3368     B.buildFPTrunc(Dst, Lowered, Flags);
3369     MI.eraseFromParent();
3370     return true;
3371   }
3372 
3373   assert(Ty == F32);
3374 
3375   // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3376   // library behavior. Also, is known-not-daz source sufficient?
3377   if (allowApproxFunc(MF, Flags) && !needsDenormHandlingF32(MF, X, Flags)) {
3378     legalizeFExpUnsafe(B, Dst, X, Flags);
3379     MI.eraseFromParent();
3380     return true;
3381   }
3382 
3383   //    Algorithm:
3384   //
3385   //    e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3386   //
3387   //    x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3388   //    n = 64*m + j,   0 <= j < 64
3389   //
3390   //    e^x = 2^((64*m + j + f)/64)
3391   //        = (2^m) * (2^(j/64)) * 2^(f/64)
3392   //        = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3393   //
3394   //    f = x*(64/ln(2)) - n
3395   //    r = f*(ln(2)/64) = x - n*(ln(2)/64)
3396   //
3397   //    e^x = (2^m) * (2^(j/64)) * e^r
3398   //
3399   //    (2^(j/64)) is precomputed
3400   //
3401   //    e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3402   //    e^r = 1 + q
3403   //
3404   //    q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3405   //
3406   //    e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
3407   const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
3408   Register PH, PL;
3409 
3410   if (ST.hasFastFMAF32()) {
3411     const float c_exp = numbers::log2ef;
3412     const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3413     const float c_exp10 = 0x1.a934f0p+1f;
3414     const float cc_exp10 = 0x1.2f346ep-24f;
3415 
3416     auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
3417     PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
3418     auto NegPH = B.buildFNeg(Ty, PH, Flags);
3419     auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);
3420 
3421     auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
3422     PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);
3423   } else {
3424     const float ch_exp = 0x1.714000p+0f;
3425     const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3426 
3427     const float ch_exp10 = 0x1.a92000p+1f;
3428     const float cl_exp10 = 0x1.4f0978p-11f;
3429 
3430     auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3431     auto XH = B.buildAnd(Ty, X, MaskConst);
3432     auto XL = B.buildFSub(Ty, X, XH, Flags);
3433 
3434     auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
3435     PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);
3436 
3437     auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
3438     auto XLCL = B.buildFMul(Ty, XL, CL, Flags);
3439 
3440     Register Mad0 =
3441         getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
3442     PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
3443   }
3444 
3445   auto E = B.buildFRint(Ty, PH, Flags);
3446 
3447   // It is unsafe to contract this fsub into the PH multiply.
3448   auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
3449   auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);
3450   auto IntE = B.buildFPTOSI(LLT::scalar(32), E);
3451 
3452   auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty}, false)
3453                   .addUse(A.getReg(0))
3454                   .setMIFlags(Flags);
3455   auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);
3456 
3457   auto UnderflowCheckConst =
3458       B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
3459   auto Zero = B.buildFConstant(Ty, 0.0);
3460   auto Underflow =
3461       B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst);
3462 
3463   R = B.buildSelect(Ty, Underflow, Zero, R);
3464 
3465   const auto &Options = MF.getTarget().Options;
3466 
3467   if (!(Flags & MachineInstr::FmNoInfs) && !Options.NoInfsFPMath) {
3468     auto OverflowCheckConst =
3469         B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
3470 
3471     auto Overflow =
3472         B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst);
3473     auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3474     R = B.buildSelect(Ty, Overflow, Inf, R, Flags);
3475   }
3476 
3477   B.buildCopy(Dst, R);
3478   MI.eraseFromParent();
3479   return true;
3480 }
3481 
3482 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
3483                                        MachineIRBuilder &B) const {
3484   Register Dst = MI.getOperand(0).getReg();
3485   Register Src0 = MI.getOperand(1).getReg();
3486   Register Src1 = MI.getOperand(2).getReg();
3487   unsigned Flags = MI.getFlags();
3488   LLT Ty = B.getMRI()->getType(Dst);
3489   const LLT S16 = LLT::scalar(16);
3490   const LLT S32 = LLT::scalar(32);
3491 
3492   if (Ty == S32) {
3493     auto Log = B.buildFLog2(S32, Src0, Flags);
3494     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
3495       .addUse(Log.getReg(0))
3496       .addUse(Src1)
3497       .setMIFlags(Flags);
3498     B.buildFExp2(Dst, Mul, Flags);
3499   } else if (Ty == S16) {
3500     // There's no f16 fmul_legacy, so we need to convert for it.
3501     auto Log = B.buildFLog2(S16, Src0, Flags);
3502     auto Ext0 = B.buildFPExt(S32, Log, Flags);
3503     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
3504     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
3505       .addUse(Ext0.getReg(0))
3506       .addUse(Ext1.getReg(0))
3507       .setMIFlags(Flags);
3508 
3509     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
3510   } else
3511     return false;
3512 
3513   MI.eraseFromParent();
3514   return true;
3515 }
3516 
3517 // Find a source register, ignoring any possible source modifiers.
3518 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
3519   Register ModSrc = OrigSrc;
3520   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
3521     ModSrc = SrcFNeg->getOperand(1).getReg();
3522     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3523       ModSrc = SrcFAbs->getOperand(1).getReg();
3524   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3525     ModSrc = SrcFAbs->getOperand(1).getReg();
3526   return ModSrc;
3527 }
3528 
3529 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
3530                                          MachineRegisterInfo &MRI,
3531                                          MachineIRBuilder &B) const {
3532 
3533   const LLT S1 = LLT::scalar(1);
3534   const LLT S64 = LLT::scalar(64);
3535   Register Dst = MI.getOperand(0).getReg();
3536   Register OrigSrc = MI.getOperand(1).getReg();
3537   unsigned Flags = MI.getFlags();
3538   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
3539          "this should not have been custom lowered");
3540 
3541   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
3542   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
3543   // efficient way to implement it is using V_FRACT_F64. The workaround for the
3544   // V_FRACT bug is:
3545   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
3546   //
3547   // Convert floor(x) to (x - fract(x))
3548 
3549   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
3550     .addUse(OrigSrc)
3551     .setMIFlags(Flags);
3552 
3553   // Give source modifier matching some assistance before obscuring a foldable
3554   // pattern.
3555 
3556   // TODO: We can avoid the neg on the fract? The input sign to fract
3557   // shouldn't matter?
3558   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
3559 
3560   auto Const =
3561       B.buildFConstant(S64, llvm::bit_cast<double>(0x3fefffffffffffff));
3562 
3563   Register Min = MRI.createGenericVirtualRegister(S64);
3564 
3565   // We don't need to concern ourselves with the snan handling difference, so
3566   // use the one which will directly select.
3567   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3568   if (MFI->getMode().IEEE)
3569     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
3570   else
3571     B.buildFMinNum(Min, Fract, Const, Flags);
3572 
3573   Register CorrectedFract = Min;
3574   if (!MI.getFlag(MachineInstr::FmNoNans)) {
3575     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
3576     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
3577   }
3578 
3579   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
3580   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
3581 
3582   MI.eraseFromParent();
3583   return true;
3584 }
3585 
3586 // Turn an illegal packed v2s16 build vector into bit operations.
3587 // TODO: This should probably be a bitcast action in LegalizerHelper.
3588 bool AMDGPULegalizerInfo::legalizeBuildVector(
3589   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3590   Register Dst = MI.getOperand(0).getReg();
3591   const LLT S32 = LLT::scalar(32);
3592   const LLT S16 = LLT::scalar(16);
3593   assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
3594 
3595   Register Src0 = MI.getOperand(1).getReg();
3596   Register Src1 = MI.getOperand(2).getReg();
3597 
3598   if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
3599     assert(MRI.getType(Src0) == S32);
3600     Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
3601     Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
3602   }
3603 
3604   auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
3605   B.buildBitcast(Dst, Merge);
3606 
3607   MI.eraseFromParent();
3608   return true;
3609 }
3610 
3611 // Build a big integer multiply or multiply-add using MAD_64_32 instructions.
3612 //
3613 // Source and accumulation registers must all be 32-bits.
3614 //
3615 // TODO: When the multiply is uniform, we should produce a code sequence
3616 // that is better suited to instruction selection on the SALU. Instead of
3617 // the outer loop going over parts of the result, the outer loop should go
3618 // over parts of one of the factors. This should result in instruction
3619 // selection that makes full use of S_ADDC_U32 instructions.
     //
     // Parameters:
     //   Helper  - supplies the MachineIRBuilder and the known-bits analysis.
     //   Accum   - in/out array of 32-bit result parts, updated in place. An
     //             entry may be a null register on entry (the lambdas below
     //             test for that).
     //   Src0/Src1 - 32-bit parts of the two factors; the partial product
     //             Src0[j0] * Src1[j1] contributes to result part j0 + j1.
     //   UsePartialMad64_32 - when false, the most significant result part is
     //             built from plain 32-bit multiplies instead of MAD_64_32.
     //   SeparateOddAlignedProducts - when true, partial products at odd
     //             result indices are computed into temporaries and then
     //             added into Accum with explicit add/carry instructions.
3620 void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
3621                                         MutableArrayRef<Register> Accum,
3622                                         ArrayRef<Register> Src0,
3623                                         ArrayRef<Register> Src1,
3624                                         bool UsePartialMad64_32,
3625                                         bool SeparateOddAlignedProducts) const {
3626   // Use (possibly empty) vectors of S1 registers to represent the set of
3627   // carries from one pair of positions to the next.
3628   using Carry = SmallVector<Register, 2>;
3629 
3630   MachineIRBuilder &B = Helper.MIRBuilder;
3631   GISelKnownBits &KB = *Helper.getKnownBits();
3632 
3633   const LLT S1 = LLT::scalar(1);
3634   const LLT S32 = LLT::scalar(32);
3635   const LLT S64 = LLT::scalar(64);
3636 
       // Zero constants are materialized lazily, at most once each, by the two
       // accessor lambdas below.
3637   Register Zero32;
3638   Register Zero64;
3639 
3640   auto getZero32 = [&]() -> Register {
3641     if (!Zero32)
3642       Zero32 = B.buildConstant(S32, 0).getReg(0);
3643     return Zero32;
3644   };
3645   auto getZero64 = [&]() -> Register {
3646     if (!Zero64)
3647       Zero64 = B.buildConstant(S64, 0).getReg(0);
3648     return Zero64;
3649   };
3650 
       // Record, per source part, whether known-bits analysis proves it to be
       // zero so that partial products involving it can be skipped.
       // NOTE(review): Src1 is indexed with Src0's size here, so this assumes
       // Src1 has at least as many parts as Src0 - confirm against callers.
3651   SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
3652   for (unsigned i = 0; i < Src0.size(); ++i) {
3653     Src0KnownZeros.push_back(KB.getKnownBits(Src0[i]).isZero());
3654     Src1KnownZeros.push_back(KB.getKnownBits(Src1[i]).isZero());
3655   }
3656 
3657   // Merge the given carries into the 32-bit LocalAccum, which is modified
3658   // in-place.
3659   //
3660   // Returns the carry-out, which is a single S1 register or null.
3661   auto mergeCarry =
3662       [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
3663         if (CarryIn.empty())
3664           return Register();
3665 
3666         bool HaveCarryOut = true;
3667         Register CarryAccum;
3668         if (CarryIn.size() == 1) {
                 // A single carry with no accumulator simply becomes the
                 // accumulator (zero-extended); nothing can carry out.
3669           if (!LocalAccum) {
3670             LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
3671             return Register();
3672           }
3673 
3674           CarryAccum = getZero32();
3675         } else {
                 // Sum all carries except the last into CarryAccum; the last
                 // one is consumed as the carry-in of the final add below.
3676           CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
3677           for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
3678             CarryAccum =
3679                 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
3680                     .getReg(0);
3681           }
3682 
                 // With no incoming accumulator, zero stands in for it and no
                 // carry-out is reported.
3683           if (!LocalAccum) {
3684             LocalAccum = getZero32();
3685             HaveCarryOut = false;
3686           }
3687         }
3688 
3689         auto Add =
3690             B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
3691         LocalAccum = Add.getReg(0);
3692         return HaveCarryOut ? Add.getReg(1) : Register();
3693       };
3694 
3695   // Build a multiply-add chain to compute
3696   //
3697   //   LocalAccum + (partial products at DstIndex)
3698   //       + (opportunistic subset of CarryIn)
3699   //
3700   // LocalAccum is an array of one or two 32-bit registers that are updated
3701   // in-place. The incoming registers may be null.
3702   //
3703   // In some edge cases, carry-ins can be consumed "for free". In that case,
3704   // the consumed carry bits are removed from CarryIn in-place.
3705   auto buildMadChain =
3706       [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
3707           -> Carry {
               // LocalAccum has two entries except at the most significant
               // result part, where only one entry remains.
3708         assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
3709                (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
3710 
3711         Carry CarryOut;
               // j0 indexes Src0; the matching Src1 index is DstIndex - j0.
3712         unsigned j0 = 0;
3713 
3714         // Use plain 32-bit multiplication for the most significant part of the
3715         // result by default.
3716         if (LocalAccum.size() == 1 &&
3717             (!UsePartialMad64_32 || !CarryIn.empty())) {
3718           do {
3719             // Skip multiplication if one of the operands is 0
3720             unsigned j1 = DstIndex - j0;
3721             if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
3722               ++j0;
                     // In a do-while, continue jumps to the loop condition.
3723               continue;
3724             }
3725             auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
3726             if (!LocalAccum[0] || KB.getKnownBits(LocalAccum[0]).isZero()) {
3727               LocalAccum[0] = Mul.getReg(0);
3728             } else {
3729               if (CarryIn.empty()) {
3730                 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
3731               } else {
                       // Fold one pending carry into this add and drop it from
                       // CarryIn; the add's own carry-out is not used.
3732                 LocalAccum[0] =
3733                     B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
3734                         .getReg(0);
3735                 CarryIn.pop_back();
3736               }
3737             }
3738             ++j0;
3739           } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
3740         }
3741 
3742         // Build full 64-bit multiplies.
3743         if (j0 <= DstIndex) {
                 // HaveSmallAccum is set whenever Tmp was not formed by merging
                 // two live 32-bit halves. The carry-out of a MAD is recorded
                 // only when it is false (presumably because a 32x32-bit
                 // product plus a 32-bit value cannot overflow 64 bits).
3744           bool HaveSmallAccum = false;
3745           Register Tmp;
3746 
3747           if (LocalAccum[0]) {
3748             if (LocalAccum.size() == 1) {
3749               Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
3750               HaveSmallAccum = true;
3751             } else if (LocalAccum[1]) {
3752               Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
3753               HaveSmallAccum = false;
3754             } else {
3755               Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
3756               HaveSmallAccum = true;
3757             }
3758           } else {
3759             assert(LocalAccum.size() == 1 || !LocalAccum[1]);
3760             Tmp = getZero64();
3761             HaveSmallAccum = true;
3762           }
3763 
3764           do {
3765             unsigned j1 = DstIndex - j0;
3766             if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
3767               ++j0;
3768               continue;
3769             }
                   // Tmp = Src0[j0] * Src1[j1] + Tmp; the second def is the
                   // S1 carry-out.
3770             auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
3771                                     {Src0[j0], Src1[j1], Tmp});
3772             Tmp = Mad.getReg(0);
3773             if (!HaveSmallAccum)
3774               CarryOut.push_back(Mad.getReg(1));
3775             HaveSmallAccum = false;
3776 
3777             ++j0;
3778           } while (j0 <= DstIndex);
3779 
                 // Split the 64-bit sum back into LocalAccum; the high half is
                 // dropped when LocalAccum has a single entry.
3780           auto Unmerge = B.buildUnmerge(S32, Tmp);
3781           LocalAccum[0] = Unmerge.getReg(0);
3782           if (LocalAccum.size() > 1)
3783             LocalAccum[1] = Unmerge.getReg(1);
3784         }
3785 
3786         return CarryOut;
3787       };
3788 
3789   // Outer multiply loop, iterating over destination parts from least
3790   // significant to most significant parts.
3791   //
3792   // The columns of the following diagram correspond to the destination parts
3793   // affected by one iteration of the outer loop (ignoring boundary
3794   // conditions).
3795   //
3796   //   Dest index relative to 2 * i:      1 0 -1
3797   //                                      ------
3798   //   Carries from previous iteration:     e o
3799   //   Even-aligned partial product sum:  E E .
3800   //   Odd-aligned partial product sum:     O O
3801   //
3802   // 'o' is OddCarry, 'e' is EvenCarry.
3803   // EE and OO are computed from partial products via buildMadChain and use
3804   // accumulation where possible and appropriate.
3805   //
3806   Register SeparateOddCarry;
3807   Carry EvenCarry;
3808   Carry OddCarry;
3809 
       // The bound is inclusive so that the final odd-aligned position
       // (2 * i - 1) is still visited when 2 * i is past the end of Accum.
3810   for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
         // Take over the carries produced by the previous iteration and reset
         // the outgoing sets for this one.
3811     Carry OddCarryIn = std::move(OddCarry);
3812     Carry EvenCarryIn = std::move(EvenCarry);
3813     OddCarry.clear();
3814     EvenCarry.clear();
3815 
3816     // Partial products at offset 2 * i.
3817     if (2 * i < Accum.size()) {
3818       auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
3819       EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
3820     }
3821 
3822     // Partial products at offset 2 * i - 1.
3823     if (i > 0) {
3824       if (!SeparateOddAlignedProducts) {
3825         auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
3826         OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
3827       } else {
             // Compute the odd-aligned sum into temporaries (starting from null
             // registers), then add it into Accum with explicit carries.
3828         bool IsHighest = 2 * i >= Accum.size();
3829         Register SeparateOddOut[2];
3830         auto LocalAccum = MutableArrayRef(SeparateOddOut)
3831                               .take_front(IsHighest ? 1 : 2);
3832         OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
3833 
3834         MachineInstr *Lo;
3835 
3836         if (i == 1) {
               // First odd position: there is no SeparateOddCarry yet, so use
               // an add without carry-in (and without carry-out when this is
               // also the highest part).
3837           if (!IsHighest)
3838             Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
3839           else
3840             Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
3841         } else {
3842           Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
3843                             SeparateOddCarry);
3844         }
3845         Accum[2 * i - 1] = Lo->getOperand(0).getReg();
3846 
3847         if (!IsHighest) {
               // Add the high half with the low add's carry and remember the
               // carry-out for the next odd position.
3848           auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
3849                                 Lo->getOperand(1).getReg());
3850           Accum[2 * i] = Hi.getReg(0);
3851           SeparateOddCarry = Hi.getReg(1);
3852         }
3853       }
3854     }
3855 
3856     // Add in the carries from the previous iteration
3857     if (i > 0) {
           // A carry out of position 2 * i - 1 feeds position 2 * i, and a
           // carry out of position 2 * i feeds the next odd position.
3858       if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
3859         EvenCarryIn.push_back(CarryOut);
3860 
3861       if (2 * i < Accum.size()) {
3862         if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
3863           OddCarry.push_back(CarryOut);
3864       }
3865     }
3866   }
3867 }
3868 
3869 // Custom narrowing of wide multiplies using wide multiply-add instructions.
3870 //
3871 // TODO: If the multiply is followed by an addition, we should attempt to
3872 // integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
3873 bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
3874                                       MachineInstr &MI) const {
3875   assert(ST.hasMad64_32());
3876   assert(MI.getOpcode() == TargetOpcode::G_MUL);
3877 
3878   MachineIRBuilder &B = Helper.MIRBuilder;
3879   MachineRegisterInfo &MRI = *B.getMRI();
3880 
3881   Register DstReg = MI.getOperand(0).getReg();
3882   Register Src0 = MI.getOperand(1).getReg();
3883   Register Src1 = MI.getOperand(2).getReg();
3884 
3885   LLT Ty = MRI.getType(DstReg);
3886   assert(Ty.isScalar());
3887 
3888   unsigned Size = Ty.getSizeInBits();
3889   unsigned NumParts = Size / 32;
3890   assert((Size % 32) == 0);
3891   assert(NumParts >= 2);
3892 
3893   // Whether to use MAD_64_32 for partial products whose high half is
3894   // discarded. This avoids some ADD instructions but risks false dependency
3895   // stalls on some subtargets in some cases.
3896   const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
3897 
3898   // Whether to compute odd-aligned partial products separately. This is
3899   // advisable on subtargets where the accumulator of MAD_64_32 must be placed
3900   // in an even-aligned VGPR.
3901   const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
3902 
3903   LLT S32 = LLT::scalar(32);
3904   SmallVector<Register, 2> Src0Parts, Src1Parts;
3905   for (unsigned i = 0; i < NumParts; ++i) {
3906     Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
3907     Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
3908   }
3909   B.buildUnmerge(Src0Parts, Src0);
3910   B.buildUnmerge(Src1Parts, Src1);
3911 
3912   SmallVector<Register, 2> AccumRegs(NumParts);
3913   buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
3914                 SeparateOddAlignedProducts);
3915 
3916   B.buildMergeLikeInstr(DstReg, AccumRegs);
3917   MI.eraseFromParent();
3918   return true;
3919 }
3920 
3921 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
3922 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
3923 // case with a single min instruction instead of a compare+select.
3924 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
3925                                             MachineRegisterInfo &MRI,
3926                                             MachineIRBuilder &B) const {
3927   Register Dst = MI.getOperand(0).getReg();
3928   Register Src = MI.getOperand(1).getReg();
3929   LLT DstTy = MRI.getType(Dst);
3930   LLT SrcTy = MRI.getType(Src);
3931 
3932   unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
3933                         ? AMDGPU::G_AMDGPU_FFBH_U32
3934                         : AMDGPU::G_AMDGPU_FFBL_B32;
3935   auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
3936   B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
3937 
3938   MI.eraseFromParent();
3939   return true;
3940 }
3941 
3942 // Check that this is a G_XOR x, -1
3943 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
3944   if (MI.getOpcode() != TargetOpcode::G_XOR)
3945     return false;
3946   auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
3947   return ConstVal && *ConstVal == -1;
3948 }
3949 
3950 // Return the use branch instruction, otherwise null if the usage is invalid.
3951 static MachineInstr *
3952 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
3953                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
3954   Register CondDef = MI.getOperand(0).getReg();
3955   if (!MRI.hasOneNonDBGUse(CondDef))
3956     return nullptr;
3957 
3958   MachineBasicBlock *Parent = MI.getParent();
3959   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
3960 
3961   if (isNot(MRI, *UseMI)) {
3962     Register NegatedCond = UseMI->getOperand(0).getReg();
3963     if (!MRI.hasOneNonDBGUse(NegatedCond))
3964       return nullptr;
3965 
3966     // We're deleting the def of this value, so we need to remove it.
3967     eraseInstr(*UseMI, MRI);
3968 
3969     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
3970     Negated = true;
3971   }
3972 
3973   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
3974     return nullptr;
3975 
3976   // Make sure the cond br is followed by a G_BR, or is the last instruction.
3977   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
3978   if (Next == Parent->end()) {
3979     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
3980     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
3981       return nullptr;
3982     UncondBrTarget = &*NextMBB;
3983   } else {
3984     if (Next->getOpcode() != AMDGPU::G_BR)
3985       return nullptr;
3986     Br = &*Next;
3987     UncondBrTarget = Br->getOperand(0).getMBB();
3988   }
3989 
3990   return UseMI;
3991 }
3992 
3993 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
3994                                          const ArgDescriptor *Arg,
3995                                          const TargetRegisterClass *ArgRC,
3996                                          LLT ArgTy) const {
3997   MCRegister SrcReg = Arg->getRegister();
3998   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
3999   assert(DstReg.isVirtual() && "Virtual register expected");
4000 
4001   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
4002                                              *ArgRC, B.getDebugLoc(), ArgTy);
4003   if (Arg->isMasked()) {
4004     // TODO: Should we try to emit this once in the entry block?
4005     const LLT S32 = LLT::scalar(32);
4006     const unsigned Mask = Arg->getMask();
4007     const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
4008 
4009     Register AndMaskSrc = LiveIn;
4010 
4011     // TODO: Avoid clearing the high bits if we know workitem id y/z are always
4012     // 0.
4013     if (Shift != 0) {
4014       auto ShiftAmt = B.buildConstant(S32, Shift);
4015       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
4016     }
4017 
4018     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
4019   } else {
4020     B.buildCopy(DstReg, LiveIn);
4021   }
4022 
4023   return true;
4024 }
4025 
4026 bool AMDGPULegalizerInfo::loadInputValue(
4027     Register DstReg, MachineIRBuilder &B,
4028     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4029   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4030   const ArgDescriptor *Arg;
4031   const TargetRegisterClass *ArgRC;
4032   LLT ArgTy;
4033   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4034 
4035   if (!Arg) {
4036     if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
4037       // The intrinsic may appear when we have a 0 sized kernarg segment, in which
4038       // case the pointer argument may be missing and we use null.
4039       B.buildConstant(DstReg, 0);
4040       return true;
4041     }
4042 
4043     // It's undefined behavior if a function marked with the amdgpu-no-*
4044     // attributes uses the corresponding intrinsic.
4045     B.buildUndef(DstReg);
4046     return true;
4047   }
4048 
4049   if (!Arg->isRegister() || !Arg->getRegister().isValid())
4050     return false; // TODO: Handle these
4051   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
4052 }
4053 
4054 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
4055     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4056     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4057   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
4058     return false;
4059 
4060   MI.eraseFromParent();
4061   return true;
4062 }
4063 
4064 static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
4065                                 int64_t C) {
4066   B.buildConstant(MI.getOperand(0).getReg(), C);
4067   MI.eraseFromParent();
4068   return true;
4069 }
4070 
4071 bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
4072     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4073     unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4074   unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
4075   if (MaxID == 0)
4076     return replaceWithConstant(B, MI, 0);
4077 
4078   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4079   const ArgDescriptor *Arg;
4080   const TargetRegisterClass *ArgRC;
4081   LLT ArgTy;
4082   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4083 
4084   Register DstReg = MI.getOperand(0).getReg();
4085   if (!Arg) {
4086     // It's undefined behavior if a function marked with the amdgpu-no-*
4087     // attributes uses the corresponding intrinsic.
4088     B.buildUndef(DstReg);
4089     MI.eraseFromParent();
4090     return true;
4091   }
4092 
4093   if (Arg->isMasked()) {
4094     // Don't bother inserting AssertZext for packed IDs since we're emitting the
4095     // masking operations anyway.
4096     //
4097     // TODO: We could assert the top bit is 0 for the source copy.
4098     if (!loadInputValue(DstReg, B, ArgType))
4099       return false;
4100   } else {
4101     Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
4102     if (!loadInputValue(TmpReg, B, ArgType))
4103       return false;
4104     B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID));
4105   }
4106 
4107   MI.eraseFromParent();
4108   return true;
4109 }
4110 
4111 Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
4112                                                      int64_t Offset) const {
4113   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
4114   Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
4115 
4116   // TODO: If we passed in the base kernel offset we could have a better
4117   // alignment than 4, but we don't really need it.
4118   if (!loadInputValue(KernArgReg, B,
4119                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
4120     llvm_unreachable("failed to find kernarg segment ptr");
4121 
4122   auto COffset = B.buildConstant(LLT::scalar(64), Offset);
4123   // TODO: Should get nuw
4124   return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
4125 }
4126 
4127 /// Legalize a value that's loaded from kernel arguments. This is only used by
4128 /// legacy intrinsics.
4129 bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
4130                                                       MachineIRBuilder &B,
4131                                                       uint64_t Offset,
4132                                                       Align Alignment) const {
4133   Register DstReg = MI.getOperand(0).getReg();
4134 
4135   assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
4136          "unexpected kernarg parameter type");
4137 
4138   Register Ptr = getKernargParameterPtr(B, Offset);
4139   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
4140   B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
4141               MachineMemOperand::MODereferenceable |
4142                   MachineMemOperand::MOInvariant);
4143   MI.eraseFromParent();
4144   return true;
4145 }
4146 
4147 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
4148                                        MachineRegisterInfo &MRI,
4149                                        MachineIRBuilder &B) const {
4150   Register Dst = MI.getOperand(0).getReg();
4151   LLT DstTy = MRI.getType(Dst);
4152   LLT S16 = LLT::scalar(16);
4153   LLT S32 = LLT::scalar(32);
4154   LLT S64 = LLT::scalar(64);
4155 
4156   if (DstTy == S16)
4157     return legalizeFDIV16(MI, MRI, B);
4158   if (DstTy == S32)
4159     return legalizeFDIV32(MI, MRI, B);
4160   if (DstTy == S64)
4161     return legalizeFDIV64(MI, MRI, B);
4162 
4163   return false;
4164 }
4165 
4166 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
4167                                                         Register DstDivReg,
4168                                                         Register DstRemReg,
4169                                                         Register X,
4170                                                         Register Y) const {
4171   const LLT S1 = LLT::scalar(1);
4172   const LLT S32 = LLT::scalar(32);
4173 
4174   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
4175   // algorithm used here.
4176 
4177   // Initial estimate of inv(y).
4178   auto FloatY = B.buildUITOFP(S32, Y);
4179   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
4180   auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
4181   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
4182   auto Z = B.buildFPTOUI(S32, ScaledY);
4183 
4184   // One round of UNR.
4185   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
4186   auto NegYZ = B.buildMul(S32, NegY, Z);
4187   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
4188 
4189   // Quotient/remainder estimate.
4190   auto Q = B.buildUMulH(S32, X, Z);
4191   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
4192 
4193   // First quotient/remainder refinement.
4194   auto One = B.buildConstant(S32, 1);
4195   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4196   if (DstDivReg)
4197     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
4198   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
4199 
4200   // Second quotient/remainder refinement.
4201   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4202   if (DstDivReg)
4203     B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
4204 
4205   if (DstRemReg)
4206     B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
4207 }
4208 
4209 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
4210 //
4211 // Return lo, hi of result
4212 //
4213 // %cvt.lo = G_UITOFP Val.lo
4214 // %cvt.hi = G_UITOFP Val.hi
4215 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
4216 // %rcp = G_AMDGPU_RCP_IFLAG %mad
4217 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
4218 // %mul2 = G_FMUL %mul1, 2**(-32)
4219 // %trunc = G_INTRINSIC_TRUNC %mul2
4220 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
4221 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
4222 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
4223                                                        Register Val) {
4224   const LLT S32 = LLT::scalar(32);
4225   auto Unmerge = B.buildUnmerge(S32, Val);
4226 
4227   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
4228   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
4229 
4230   auto Mad = B.buildFMAD(
4231       S32, CvtHi, // 2**32
4232       B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);
4233 
4234   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
4235   auto Mul1 = B.buildFMul(
4236       S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc)));
4237 
4238   // 2**(-32)
4239   auto Mul2 = B.buildFMul(
4240       S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000)));
4241   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
4242 
4243   // -(2**32)
4244   auto Mad2 = B.buildFMAD(
4245       S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)),
4246       Mul1);
4247 
4248   auto ResultLo = B.buildFPTOUI(S32, Mad2);
4249   auto ResultHi = B.buildFPTOUI(S32, Trunc);
4250 
4251   return {ResultLo.getReg(0), ResultHi.getReg(0)};
4252 }
4253 
// Emit the expansion of an unsigned 64-bit division and/or remainder of Numer
// by Denom. The quotient is written to DstDivReg and the remainder to
// DstRemReg; the final selects for a result are only emitted when the
// corresponding register is non-null.
//
// Outline of the emitted sequence:
//  - Start from the 64-bit reciprocal estimate built by emitReciprocalU64.
//  - Refine it twice; each round adds umulh(Est, -Denom * Est) to the estimate
//    using 32-bit add-with-carry pieces.
//  - MulHi3 = umulh(Numer, refined estimate) is the quotient estimate and
//    Sub1 = Numer - Denom * MulHi3 the matching remainder estimate.
//  - Two corrections (quotient + 1, remainder - Denom) are computed
//    unconditionally and chosen by selects keyed on C3 and C6.
4254 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
4255                                                         Register DstDivReg,
4256                                                         Register DstRemReg,
4257                                                         Register Numer,
4258                                                         Register Denom) const {
4259   const LLT S32 = LLT::scalar(32);
4260   const LLT S64 = LLT::scalar(64);
4261   const LLT S1 = LLT::scalar(1);
4262   Register RcpLo, RcpHi;
4263 
4264   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
4265 
4266   auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});
4267 
4268   auto Zero64 = B.buildConstant(S64, 0);
4269   auto NegDenom = B.buildSub(S64, Zero64, Denom);
4270 
  // First refinement: Add1 = Rcp + umulh(Rcp, -Denom * Rcp).
4271   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
4272   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
4273 
4274   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
4275   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
4276   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
4277 
4278   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
4279   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
4280   auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});
4281 
  // Second refinement: Add2 = Add1 + umulh(Add1, -Denom * Add1).
4282   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
4283   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
4284   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
4285   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
4286   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
4287 
4288   auto Zero32 = B.buildConstant(S32, 0);
4289   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
4290   auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
4291   auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});
4292 
4293   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
4294   Register NumerLo = UnmergeNumer.getReg(0);
4295   Register NumerHi = UnmergeNumer.getReg(1);
4296 
  // Quotient estimate (MulHi3) and remainder estimate
  // Sub1 = Numer - Denom * MulHi3, computed in 32-bit halves with borrow.
4297   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
4298   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
4299   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
4300   Register Mul3_Lo = UnmergeMul3.getReg(0);
4301   Register Mul3_Hi = UnmergeMul3.getReg(1);
4302   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
4303   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
  // High-half difference without the low-half borrow; it feeds Sub2_Mi below,
  // which applies that borrow itself.
4304   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
4305   auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});
4306 
4307   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
4308   Register DenomLo = UnmergeDenom.getReg(0);
4309   Register DenomHi = UnmergeDenom.getReg(1);
4310 
  // C3 is the sign-extended result of the unsigned comparison Sub1 >= Denom,
  // assembled from the halves: if the high halves are equal the low-half
  // comparison decides, otherwise the high-half comparison does.
4311   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
4312   auto C1 = B.buildSExt(S32, CmpHi);
4313 
4314   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
4315   auto C2 = B.buildSExt(S32, CmpLo);
4316 
4317   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
4318   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
4319 
4320   // TODO: Here and below portions of the code can be enclosed into if/endif.
4321   // Currently control flow is unconditional and we have 4 selects after
4322   // potential endif to substitute PHIs.
4323 
4324   // if C3 != 0 ...
  // First correction candidates: Sub2 = Sub1 - Denom and Add3 = MulHi3 + 1.
4325   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
4326   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
4327   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
4328   auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});
4329 
4330   auto One64 = B.buildConstant(S64, 1);
4331   auto Add3 = B.buildAdd(S64, MulHi3, One64);
4332 
  // C6 is built like C3, but compares Sub2 against Denom.
4333   auto C4 =
4334       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
4335   auto C5 =
4336       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
4337   auto C6 = B.buildSelect(
4338       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
4339 
4340   // if (C6 != 0)
  // Second correction candidates: Sub3 = Sub2 - Denom and Add4 = Add3 + 1.
4341   auto Add4 = B.buildAdd(S64, Add3, One64);
4342   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
4343 
4344   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
4345   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
4346   auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});
4347 
4348   // endif C6
4349   // endif C3
4350 
  // Quotient: C3 != 0 ? (C6 != 0 ? Add4 : Add3) : MulHi3.
4351   if (DstDivReg) {
4352     auto Sel1 = B.buildSelect(
4353         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
4354     B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
4355                   Sel1, MulHi3);
4356   }
4357 
  // Remainder: C3 != 0 ? (C6 != 0 ? Sub3 : Sub2) : Sub1.
4358   if (DstRemReg) {
4359     auto Sel2 = B.buildSelect(
4360         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
4361     B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
4362                   Sel2, Sub1);
4363   }
4364 }
4365 
4366 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
4367                                                   MachineRegisterInfo &MRI,
4368                                                   MachineIRBuilder &B) const {
4369   Register DstDivReg, DstRemReg;
4370   switch (MI.getOpcode()) {
4371   default:
4372     llvm_unreachable("Unexpected opcode!");
4373   case AMDGPU::G_UDIV: {
4374     DstDivReg = MI.getOperand(0).getReg();
4375     break;
4376   }
4377   case AMDGPU::G_UREM: {
4378     DstRemReg = MI.getOperand(0).getReg();
4379     break;
4380   }
4381   case AMDGPU::G_UDIVREM: {
4382     DstDivReg = MI.getOperand(0).getReg();
4383     DstRemReg = MI.getOperand(1).getReg();
4384     break;
4385   }
4386   }
4387 
4388   const LLT S64 = LLT::scalar(64);
4389   const LLT S32 = LLT::scalar(32);
4390   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4391   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
4392   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
4393   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
4394 
4395   if (Ty == S32)
4396     legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
4397   else if (Ty == S64)
4398     legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
4399   else
4400     return false;
4401 
4402   MI.eraseFromParent();
4403   return true;
4404 }
4405 
4406 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
4407                                                 MachineRegisterInfo &MRI,
4408                                                 MachineIRBuilder &B) const {
4409   const LLT S64 = LLT::scalar(64);
4410   const LLT S32 = LLT::scalar(32);
4411 
4412   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
4413   if (Ty != S32 && Ty != S64)
4414     return false;
4415 
4416   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4417   Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
4418   Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
4419 
4420   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
4421   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
4422   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
4423 
4424   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
4425   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
4426 
4427   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
4428   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
4429 
4430   Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
4431   switch (MI.getOpcode()) {
4432   default:
4433     llvm_unreachable("Unexpected opcode!");
4434   case AMDGPU::G_SDIV: {
4435     DstDivReg = MI.getOperand(0).getReg();
4436     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4437     break;
4438   }
4439   case AMDGPU::G_SREM: {
4440     DstRemReg = MI.getOperand(0).getReg();
4441     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4442     break;
4443   }
4444   case AMDGPU::G_SDIVREM: {
4445     DstDivReg = MI.getOperand(0).getReg();
4446     DstRemReg = MI.getOperand(1).getReg();
4447     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4448     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4449     break;
4450   }
4451   }
4452 
4453   if (Ty == S32)
4454     legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
4455   else
4456     legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
4457 
4458   if (DstDivReg) {
4459     auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
4460     auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
4461     B.buildSub(DstDivReg, SignXor, Sign);
4462   }
4463 
4464   if (DstRemReg) {
4465     auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
4466     auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
4467     B.buildSub(DstRemReg, SignXor, Sign);
4468   }
4469 
4470   MI.eraseFromParent();
4471   return true;
4472 }
4473 
4474 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
4475                                                  MachineRegisterInfo &MRI,
4476                                                  MachineIRBuilder &B) const {
4477   Register Res = MI.getOperand(0).getReg();
4478   Register LHS = MI.getOperand(1).getReg();
4479   Register RHS = MI.getOperand(2).getReg();
4480   uint16_t Flags = MI.getFlags();
4481   LLT ResTy = MRI.getType(Res);
4482 
4483   const MachineFunction &MF = B.getMF();
4484   bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn) ||
4485                             MF.getTarget().Options.UnsafeFPMath;
4486 
4487   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
4488     if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
4489       return false;
4490 
4491     // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
4492     // the CI documentation has a worst case error of 1 ulp.
4493     // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
4494     // use it as long as we aren't trying to use denormals.
4495     //
4496     // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
4497 
4498     // 1 / x -> RCP(x)
4499     if (CLHS->isExactlyValue(1.0)) {
4500       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
4501         .addUse(RHS)
4502         .setMIFlags(Flags);
4503 
4504       MI.eraseFromParent();
4505       return true;
4506     }
4507 
4508     // TODO: Match rsq
4509 
4510     // -1 / x -> RCP( FNEG(x) )
4511     if (CLHS->isExactlyValue(-1.0)) {
4512       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
4513       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
4514         .addUse(FNeg.getReg(0))
4515         .setMIFlags(Flags);
4516 
4517       MI.eraseFromParent();
4518       return true;
4519     }
4520   }
4521 
4522   // For f16 require arcp only.
4523   // For f32 require afn+arcp.
4524   if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
4525                               !MI.getFlag(MachineInstr::FmArcp)))
4526     return false;
4527 
4528   // x / y -> x * (1.0 / y)
4529   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
4530     .addUse(RHS)
4531     .setMIFlags(Flags);
4532   B.buildFMul(Res, LHS, RCP, Flags);
4533 
4534   MI.eraseFromParent();
4535   return true;
4536 }
4537 
4538 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
4539                                                    MachineRegisterInfo &MRI,
4540                                                    MachineIRBuilder &B) const {
4541   Register Res = MI.getOperand(0).getReg();
4542   Register X = MI.getOperand(1).getReg();
4543   Register Y = MI.getOperand(2).getReg();
4544   uint16_t Flags = MI.getFlags();
4545   LLT ResTy = MRI.getType(Res);
4546 
4547   const MachineFunction &MF = B.getMF();
4548   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
4549                             MI.getFlag(MachineInstr::FmAfn);
4550 
4551   if (!AllowInaccurateRcp)
4552     return false;
4553 
4554   auto NegY = B.buildFNeg(ResTy, Y);
4555   auto One = B.buildFConstant(ResTy, 1.0);
4556 
4557   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
4558     .addUse(Y)
4559     .setMIFlags(Flags);
4560 
4561   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
4562   R = B.buildFMA(ResTy, Tmp0, R, R);
4563 
4564   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
4565   R = B.buildFMA(ResTy, Tmp1, R, R);
4566 
4567   auto Ret = B.buildFMul(ResTy, X, R);
4568   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
4569 
4570   B.buildFMA(Res, Tmp2, R, Ret);
4571   MI.eraseFromParent();
4572   return true;
4573 }
4574 
4575 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
4576                                          MachineRegisterInfo &MRI,
4577                                          MachineIRBuilder &B) const {
4578   if (legalizeFastUnsafeFDIV(MI, MRI, B))
4579     return true;
4580 
4581   Register Res = MI.getOperand(0).getReg();
4582   Register LHS = MI.getOperand(1).getReg();
4583   Register RHS = MI.getOperand(2).getReg();
4584 
4585   uint16_t Flags = MI.getFlags();
4586 
4587   LLT S16 = LLT::scalar(16);
4588   LLT S32 = LLT::scalar(32);
4589 
4590   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
4591   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
4592 
4593   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
4594     .addUse(RHSExt.getReg(0))
4595     .setMIFlags(Flags);
4596 
4597   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
4598   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
4599 
4600   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
4601     .addUse(RDst.getReg(0))
4602     .addUse(RHS)
4603     .addUse(LHS)
4604     .setMIFlags(Flags);
4605 
4606   MI.eraseFromParent();
4607   return true;
4608 }
4609 
4610 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
4611 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
4612 static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B,
4613                                const GCNSubtarget &ST,
4614                                SIModeRegisterDefaults Mode) {
4615   // Set SP denorm mode to this value.
4616   unsigned SPDenormMode =
4617     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
4618 
4619   if (ST.hasDenormModeInst()) {
4620     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
4621     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
4622 
4623     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
4624     B.buildInstr(AMDGPU::S_DENORM_MODE)
4625       .addImm(NewDenormModeValue);
4626 
4627   } else {
4628     // Select FP32 bit field in mode register.
4629     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
4630                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
4631                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
4632 
4633     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
4634       .addImm(SPDenormMode)
4635       .addImm(SPDenormModeBitField);
4636   }
4637 }
4638 
// Legalize a 32-bit G_FDIV.
//
// The fast reciprocal lowering (legalizeFastUnsafeFDIV) is tried first. If it
// declines, the division is expanded into amdgcn_div_scale on both operands,
// an amdgcn_rcp of the scaled denominator, a chain of FMA/FMUL refinement
// steps, and finally amdgcn_div_fmas followed by amdgcn_div_fixup, which
// defines the result. The instruction flags of MI are propagated to every
// emitted operation. Always returns true.
4639 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
4640                                          MachineRegisterInfo &MRI,
4641                                          MachineIRBuilder &B) const {
4642   if (legalizeFastUnsafeFDIV(MI, MRI, B))
4643     return true;
4644 
4645   Register Res = MI.getOperand(0).getReg();
4646   Register LHS = MI.getOperand(1).getReg();
4647   Register RHS = MI.getOperand(2).getReg();
4648   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4649   SIModeRegisterDefaults Mode = MFI->getMode();
4650 
4651   uint16_t Flags = MI.getFlags();
4652 
4653   LLT S32 = LLT::scalar(32);
4654   LLT S1 = LLT::scalar(1);
4655 
4656   auto One = B.buildFConstant(S32, 1.0f);
4657 
  // Both div_scale calls take (LHS, RHS); they differ only in the trailing
  // immediate (0 for the denominator result, 1 for the numerator result). Each
  // defines an S32 value and an S1 flag.
4658   auto DenominatorScaled =
4659     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
4660       .addUse(LHS)
4661       .addUse(RHS)
4662       .addImm(0)
4663       .setMIFlags(Flags);
4664   auto NumeratorScaled =
4665     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
4666       .addUse(LHS)
4667       .addUse(RHS)
4668       .addImm(1)
4669       .setMIFlags(Flags);
4670 
4671   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
4672     .addUse(DenominatorScaled.getReg(0))
4673     .setMIFlags(Flags);
4674   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
4675 
  // If the function's FP32 denormal mode is not IEEE, FP32 denormals are
  // switched on here and the function's default mode is restored after the
  // FMA chain below.
4676   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
4677   // aren't modeled as reading it.
4678   if (Mode.FP32Denormals != DenormalMode::getIEEE())
4679     toggleSPDenormMode(true, B, ST, Mode);
4680 
  // Refinement chain: Fma1 is the refined reciprocal, Mul the first quotient
  // estimate, Fma3 the corrected quotient, and Fma2/Fma4 the residuals
  // NumeratorScaled - DenominatorScaled * quotient for Mul and Fma3.
4681   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
4682   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
4683   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
4684   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
4685   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
4686   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
4687 
4688   // FIXME: This mishandles dynamic denormal mode. We need to query the
4689   // current mode and restore the original.
4690   if (Mode.FP32Denormals != DenormalMode::getIEEE())
4691     toggleSPDenormMode(false, B, ST, Mode);
4692 
  // div_fmas consumes the last residual, the refined reciprocal, the corrected
  // quotient and the S1 flag produced by the numerator div_scale.
4693   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
4694     .addUse(Fma4.getReg(0))
4695     .addUse(Fma1.getReg(0))
4696     .addUse(Fma3.getReg(0))
4697     .addUse(NumeratorScaled.getReg(1))
4698     .setMIFlags(Flags);
4699 
  // div_fixup takes the original operands in (RHS, LHS) order.
4700   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
4701     .addUse(Fmas.getReg(0))
4702     .addUse(RHS)
4703     .addUse(LHS)
4704     .setMIFlags(Flags);
4705 
4706   MI.eraseFromParent();
4707   return true;
4708 }
4709 
// Legalize a 64-bit G_FDIV.
//
// The fast lowering (legalizeFastUnsafeFDIV64) is tried first. If it declines,
// the division is expanded into amdgcn_div_scale on both operands, an
// amdgcn_rcp of the first div_scale result, a chain of FMA/FMUL refinement
// steps, and finally amdgcn_div_fmas followed by amdgcn_div_fixup, which
// defines the result. The instruction flags of MI are propagated to the
// emitted FP operations. Always returns true.
4710 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
4711                                          MachineRegisterInfo &MRI,
4712                                          MachineIRBuilder &B) const {
4713   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
4714     return true;
4715 
4716   Register Res = MI.getOperand(0).getReg();
4717   Register LHS = MI.getOperand(1).getReg();
4718   Register RHS = MI.getOperand(2).getReg();
4719 
4720   uint16_t Flags = MI.getFlags();
4721 
4722   LLT S64 = LLT::scalar(64);
4723   LLT S1 = LLT::scalar(1);
4724 
4725   auto One = B.buildFConstant(S64, 1.0);
4726 
  // div_scale with immediate 0; defines an S64 value and an S1 flag.
4727   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
4728     .addUse(LHS)
4729     .addUse(RHS)
4730     .addImm(0)
4731     .setMIFlags(Flags);
4732 
4733   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
4734 
4735   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
4736     .addUse(DivScale0.getReg(0))
4737     .setMIFlags(Flags);
4738 
  // Two refinement rounds of the reciprocal of DivScale0: Fma1 after the
  // first round, Fma3 (below) after the second.
4739   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
4740   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
4741   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
4742 
  // div_scale with immediate 1; defines an S64 value and an S1 flag.
4743   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
4744     .addUse(LHS)
4745     .addUse(RHS)
4746     .addImm(1)
4747     .setMIFlags(Flags);
4748 
  // Mul is the quotient estimate DivScale1 * Fma3 and Fma4 the residual
  // DivScale1 - DivScale0 * Mul.
4749   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
4750   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
4751   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
4752 
  // Scale is the S1 condition handed to div_fmas.
4753   Register Scale;
4754   if (!ST.hasUsableDivScaleConditionOutput()) {
4755     // Workaround a hardware bug on SI where the condition output from div_scale
4756     // is not usable.
    //
    // The condition is rebuilt by comparing the high 32 bits of each original
    // operand with the high 32 bits of the matching div_scale result
    // (LHS vs DivScale1, RHS vs DivScale0) and xor-ing the two equality
    // results.
4757 
4758     LLT S32 = LLT::scalar(32);
4759 
4760     auto NumUnmerge = B.buildUnmerge(S32, LHS);
4761     auto DenUnmerge = B.buildUnmerge(S32, RHS);
4762     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
4763     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
4764 
4765     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
4766                               Scale1Unmerge.getReg(1));
4767     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
4768                               Scale0Unmerge.getReg(1));
4769     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
4770   } else {
    // Otherwise use the S1 flag defined by the second div_scale directly.
4771     Scale = DivScale1.getReg(1);
4772   }
4773 
4774   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
4775     .addUse(Fma4.getReg(0))
4776     .addUse(Fma3.getReg(0))
4777     .addUse(Mul.getReg(0))
4778     .addUse(Scale)
4779     .setMIFlags(Flags);
4780 
  // div_fixup takes the original operands in (RHS, LHS) order.
4781   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res), false)
4782       .addUse(Fmas.getReg(0))
4783       .addUse(RHS)
4784       .addUse(LHS)
4785       .setMIFlags(Flags);
4786 
4787   MI.eraseFromParent();
4788   return true;
4789 }
4790 
4791 bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI,
4792                                          MachineRegisterInfo &MRI,
4793                                          MachineIRBuilder &B) const {
4794   Register Res0 = MI.getOperand(0).getReg();
4795   Register Res1 = MI.getOperand(1).getReg();
4796   Register Val = MI.getOperand(2).getReg();
4797   uint16_t Flags = MI.getFlags();
4798 
4799   LLT Ty = MRI.getType(Res0);
4800   LLT InstrExpTy = Ty == LLT::scalar(16) ? LLT::scalar(16) : LLT::scalar(32);
4801 
4802   auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty}, false)
4803                   .addUse(Val)
4804                   .setMIFlags(Flags);
4805   auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy}, false)
4806                  .addUse(Val)
4807                  .setMIFlags(Flags);
4808 
4809   if (ST.hasFractBug()) {
4810     auto Fabs = B.buildFAbs(Ty, Val);
4811     auto Inf = B.buildFConstant(Ty, APFloat::getInf(getFltSemanticForLLT(Ty)));
4812     auto IsFinite =
4813         B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
4814     auto Zero = B.buildConstant(InstrExpTy, 0);
4815     Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
4816     Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
4817   }
4818 
4819   B.buildCopy(Res0, Mant);
4820   B.buildSExtOrTrunc(Res1, Exp);
4821 
4822   MI.eraseFromParent();
4823   return true;
4824 }
4825 
4826 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
4827                                                  MachineRegisterInfo &MRI,
4828                                                  MachineIRBuilder &B) const {
4829   Register Res = MI.getOperand(0).getReg();
4830   Register LHS = MI.getOperand(2).getReg();
4831   Register RHS = MI.getOperand(3).getReg();
4832   uint16_t Flags = MI.getFlags();
4833 
4834   LLT S32 = LLT::scalar(32);
4835   LLT S1 = LLT::scalar(1);
4836 
4837   auto Abs = B.buildFAbs(S32, RHS, Flags);
4838   const APFloat C0Val(1.0f);
4839 
4840   auto C0 = B.buildFConstant(S32, 0x1p+96f);
4841   auto C1 = B.buildFConstant(S32, 0x1p-32f);
4842   auto C2 = B.buildFConstant(S32, 1.0f);
4843 
4844   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
4845   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
4846 
4847   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
4848 
4849   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
4850     .addUse(Mul0.getReg(0))
4851     .setMIFlags(Flags);
4852 
4853   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
4854 
4855   B.buildFMul(Res, Sel, Mul1, Flags);
4856 
4857   MI.eraseFromParent();
4858   return true;
4859 }
4860 
4861 bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI,
4862                                         MachineRegisterInfo &MRI,
4863                                         MachineIRBuilder &B) const {
4864   // For double type, the SQRT and RSQ instructions don't have required
4865   // precision, we apply Goldschmidt's algorithm to improve the result:
4866   //
4867   //   y0 = rsq(x)
4868   //   g0 = x * y0
4869   //   h0 = 0.5 * y0
4870   //
4871   //   r0 = 0.5 - h0 * g0
4872   //   g1 = g0 * r0 + g0
4873   //   h1 = h0 * r0 + h0
4874   //
4875   //   r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
4876   //   g2 = g1 * r1 + g1     g2 = d0 * h1 + g1
4877   //   h2 = h1 * r1 + h1
4878   //
4879   //   r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
4880   //   g3 = g2 * r2 + g2     g3 = d1 * h1 + g2
4881   //
4882   //   sqrt(x) = g3
4883 
4884   const LLT S1 = LLT::scalar(1);
4885   const LLT S32 = LLT::scalar(32);
4886   const LLT F64 = LLT::scalar(64);
4887 
4888   Register Dst = MI.getOperand(0).getReg();
4889   assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");
4890 
4891   Register X = MI.getOperand(1).getReg();
4892   unsigned Flags = MI.getFlags();
4893 
4894   auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);
4895 
4896   auto ZeroInt = B.buildConstant(S32, 0);
4897   auto Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant);
4898 
4899   // Scale up input if it is too small.
4900   auto ScaleUpFactor = B.buildConstant(S32, 256);
4901   auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
4902   auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags);
4903 
4904   auto SqrtY = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}, false)
4905                    .addReg(SqrtX.getReg(0));
4906 
4907   auto Half = B.buildFConstant(F64, 0.5);
4908   auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
4909   auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);
4910 
4911   auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
4912   auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);
4913 
4914   auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
4915   auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);
4916 
4917   auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
4918   auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);
4919 
4920   auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);
4921 
4922   auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
4923   auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);
4924 
4925   auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);
4926 
4927   // Scale down the result.
4928   auto ScaleDownFactor = B.buildConstant(S32, -128);
4929   auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
4930   SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags);
4931 
4932   // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
4933   // with finite only or nsz because rsq(+/-0) = +/-inf
4934 
4935   // TODO: Check for DAZ and expand to subnormals
4936   auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
4937 
4938   // If x is +INF, +0, or -0, use its original value
4939   B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);
4940 
4941   MI.eraseFromParent();
4942   return true;
4943 }
4944 
4945 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
4946 // FIXME: Why do we handle this one but not other removed instructions?
4947 //
4948 // Reciprocal square root.  The clamp prevents infinite results, clamping
4949 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
4950 // +-max_float.
4951 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
4952                                                     MachineRegisterInfo &MRI,
4953                                                     MachineIRBuilder &B) const {
4954   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
4955     return true;
4956 
4957   Register Dst = MI.getOperand(0).getReg();
4958   Register Src = MI.getOperand(2).getReg();
4959   auto Flags = MI.getFlags();
4960 
4961   LLT Ty = MRI.getType(Dst);
4962 
4963   const fltSemantics *FltSemantics;
4964   if (Ty == LLT::scalar(32))
4965     FltSemantics = &APFloat::IEEEsingle();
4966   else if (Ty == LLT::scalar(64))
4967     FltSemantics = &APFloat::IEEEdouble();
4968   else
4969     return false;
4970 
4971   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
4972     .addUse(Src)
4973     .setMIFlags(Flags);
4974 
4975   // We don't need to concern ourselves with the snan handling difference, since
4976   // the rsq quieted (or not) so use the one which will directly select.
4977   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4978   const bool UseIEEE = MFI->getMode().IEEE;
4979 
4980   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
4981   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
4982                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
4983 
4984   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
4985 
4986   if (UseIEEE)
4987     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
4988   else
4989     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
4990   MI.eraseFromParent();
4991   return true;
4992 }
4993 
4994 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
4995   switch (IID) {
4996   case Intrinsic::amdgcn_ds_fadd:
4997     return AMDGPU::G_ATOMICRMW_FADD;
4998   case Intrinsic::amdgcn_ds_fmin:
4999     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
5000   case Intrinsic::amdgcn_ds_fmax:
5001     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
5002   default:
5003     llvm_unreachable("not a DS FP intrinsic");
5004   }
5005 }
5006 
5007 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
5008                                                       MachineInstr &MI,
5009                                                       Intrinsic::ID IID) const {
5010   GISelChangeObserver &Observer = Helper.Observer;
5011   Observer.changingInstr(MI);
5012 
5013   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
5014 
5015   // The remaining operands were used to set fields in the MemOperand on
5016   // construction.
5017   for (int I = 6; I > 3; --I)
5018     MI.removeOperand(I);
5019 
5020   MI.removeOperand(1); // Remove the intrinsic ID.
5021   Observer.changedInstr(MI);
5022   return true;
5023 }
5024 
5025 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
5026                                             MachineRegisterInfo &MRI,
5027                                             MachineIRBuilder &B) const {
5028   uint64_t Offset =
5029     ST.getTargetLowering()->getImplicitParameterOffset(
5030       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
5031   LLT DstTy = MRI.getType(DstReg);
5032   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
5033 
5034   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
5035   if (!loadInputValue(KernargPtrReg, B,
5036                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
5037     return false;
5038 
5039   // FIXME: This should be nuw
5040   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
5041   return true;
5042 }
5043 
5044 /// To create a buffer resource from a 64-bit pointer, mask off the upper 32
5045 /// bits of the pointer and replace them with the stride argument, then
5046 /// merge_values everything together. In the common case of a raw buffer (the
5047 /// stride component is 0), we can just AND off the upper half.
5048 bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin(
5049     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5050   Register Result = MI.getOperand(0).getReg();
5051   Register Pointer = MI.getOperand(2).getReg();
5052   Register Stride = MI.getOperand(3).getReg();
5053   Register NumRecords = MI.getOperand(4).getReg();
5054   Register Flags = MI.getOperand(5).getReg();
5055 
5056   LLT S32 = LLT::scalar(32);
5057 
5058   B.setInsertPt(B.getMBB(), ++B.getInsertPt());
5059   auto Unmerge = B.buildUnmerge(S32, Pointer);
5060   Register LowHalf = Unmerge.getReg(0);
5061   Register HighHalf = Unmerge.getReg(1);
5062 
5063   auto AndMask = B.buildConstant(S32, 0x0000ffff);
5064   auto Masked = B.buildAnd(S32, HighHalf, AndMask);
5065 
5066   MachineInstrBuilder NewHighHalf = Masked;
5067   std::optional<ValueAndVReg> StrideConst =
5068       getIConstantVRegValWithLookThrough(Stride, MRI);
5069   if (!StrideConst || !StrideConst->Value.isZero()) {
5070     MachineInstrBuilder ShiftedStride;
5071     if (StrideConst) {
5072       uint32_t StrideVal = StrideConst->Value.getZExtValue();
5073       uint32_t ShiftedStrideVal = StrideVal << 16;
5074       ShiftedStride = B.buildConstant(S32, ShiftedStrideVal);
5075     } else {
5076       auto ExtStride = B.buildAnyExt(S32, Stride);
5077       auto ShiftConst = B.buildConstant(S32, 16);
5078       ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
5079     }
5080     NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
5081   }
5082   Register NewHighHalfReg = NewHighHalf.getReg(0);
5083   B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
5084   MI.eraseFromParent();
5085   return true;
5086 }
5087 
5088 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
5089                                                  MachineRegisterInfo &MRI,
5090                                                  MachineIRBuilder &B) const {
5091   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5092   if (!MFI->isEntryFunction()) {
5093     return legalizePreloadedArgIntrin(MI, MRI, B,
5094                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
5095   }
5096 
5097   Register DstReg = MI.getOperand(0).getReg();
5098   if (!getImplicitArgPtr(DstReg, MRI, B))
5099     return false;
5100 
5101   MI.eraseFromParent();
5102   return true;
5103 }
5104 
5105 bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg,
5106                                          MachineRegisterInfo &MRI,
5107                                          MachineIRBuilder &B) const {
5108   Function &F = B.getMF().getFunction();
5109   std::optional<uint32_t> KnownSize =
5110       AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
5111   if (KnownSize.has_value())
5112     B.buildConstant(DstReg, *KnownSize);
5113   return false;
5114 }
5115 
5116 bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI,
5117                                               MachineRegisterInfo &MRI,
5118                                               MachineIRBuilder &B) const {
5119 
5120   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5121   if (!MFI->isEntryFunction()) {
5122     return legalizePreloadedArgIntrin(MI, MRI, B,
5123                                       AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
5124   }
5125 
5126   Register DstReg = MI.getOperand(0).getReg();
5127   if (!getLDSKernelId(DstReg, MRI, B))
5128     return false;
5129 
5130   MI.eraseFromParent();
5131   return true;
5132 }
5133 
5134 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
5135                                               MachineRegisterInfo &MRI,
5136                                               MachineIRBuilder &B,
5137                                               unsigned AddrSpace) const {
5138   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
5139   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
5140   Register Hi32 = Unmerge.getReg(1);
5141 
5142   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
5143   MI.eraseFromParent();
5144   return true;
5145 }
5146 
5147 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
5148 // offset (the offset that is included in bounds checking and swizzling, to be
5149 // split between the instruction's voffset and immoffset fields) and soffset
5150 // (the offset that is excluded from bounds checking and swizzling, to go in
5151 // the instruction's soffset field).  This function takes the first kind of
5152 // offset and figures out how to split it between voffset and immoffset.
5153 std::pair<Register, unsigned>
5154 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
5155                                         Register OrigOffset) const {
5156   const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset();
5157   Register BaseReg;
5158   unsigned ImmOffset;
5159   const LLT S32 = LLT::scalar(32);
5160   MachineRegisterInfo &MRI = *B.getMRI();
5161 
5162   std::tie(BaseReg, ImmOffset) =
5163       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
5164 
5165   // If BaseReg is a pointer, convert it to int.
5166   if (MRI.getType(BaseReg).isPointer())
5167     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
5168 
5169   // If the immediate value is too big for the immoffset field, put only bits
5170   // that would normally fit in the immoffset field. The remaining value that
5171   // is copied/added for the voffset field is a large power of 2, and it
5172   // stands more chance of being CSEd with the copy/add for another similar
5173   // load/store.
5174   // However, do not do that rounding down if that is a negative
5175   // number, as it appears to be illegal to have a negative offset in the
5176   // vgpr, even if adding the immediate offset makes it positive.
5177   unsigned Overflow = ImmOffset & ~MaxImm;
5178   ImmOffset -= Overflow;
5179   if ((int32_t)Overflow < 0) {
5180     Overflow += ImmOffset;
5181     ImmOffset = 0;
5182   }
5183 
5184   if (Overflow != 0) {
5185     if (!BaseReg) {
5186       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
5187     } else {
5188       auto OverflowVal = B.buildConstant(S32, Overflow);
5189       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
5190     }
5191   }
5192 
5193   if (!BaseReg)
5194     BaseReg = B.buildConstant(S32, 0).getReg(0);
5195 
5196   return std::pair(BaseReg, ImmOffset);
5197 }
5198 
5199 /// Handle register layout difference for f16 images for some subtargets.
5200 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
5201                                              MachineRegisterInfo &MRI,
5202                                              Register Reg,
5203                                              bool ImageStore) const {
5204   const LLT S16 = LLT::scalar(16);
5205   const LLT S32 = LLT::scalar(32);
5206   LLT StoreVT = MRI.getType(Reg);
5207   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
5208 
5209   if (ST.hasUnpackedD16VMem()) {
5210     auto Unmerge = B.buildUnmerge(S16, Reg);
5211 
5212     SmallVector<Register, 4> WideRegs;
5213     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5214       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
5215 
5216     int NumElts = StoreVT.getNumElements();
5217 
5218     return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
5219         .getReg(0);
5220   }
5221 
5222   if (ImageStore && ST.hasImageStoreD16Bug()) {
5223     if (StoreVT.getNumElements() == 2) {
5224       SmallVector<Register, 4> PackedRegs;
5225       Reg = B.buildBitcast(S32, Reg).getReg(0);
5226       PackedRegs.push_back(Reg);
5227       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
5228       return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
5229           .getReg(0);
5230     }
5231 
5232     if (StoreVT.getNumElements() == 3) {
5233       SmallVector<Register, 4> PackedRegs;
5234       auto Unmerge = B.buildUnmerge(S16, Reg);
5235       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5236         PackedRegs.push_back(Unmerge.getReg(I));
5237       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
5238       Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
5239       return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
5240     }
5241 
5242     if (StoreVT.getNumElements() == 4) {
5243       SmallVector<Register, 4> PackedRegs;
5244       Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
5245       auto Unmerge = B.buildUnmerge(S32, Reg);
5246       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5247         PackedRegs.push_back(Unmerge.getReg(I));
5248       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
5249       return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
5250           .getReg(0);
5251     }
5252 
5253     llvm_unreachable("invalid data type");
5254   }
5255 
5256   if (StoreVT == LLT::fixed_vector(3, S16)) {
5257     Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
5258               .getReg(0);
5259   }
5260   return Reg;
5261 }
5262 
5263 Register AMDGPULegalizerInfo::fixStoreSourceType(
5264   MachineIRBuilder &B, Register VData, bool IsFormat) const {
5265   MachineRegisterInfo *MRI = B.getMRI();
5266   LLT Ty = MRI->getType(VData);
5267 
5268   const LLT S16 = LLT::scalar(16);
5269 
5270   // Fixup buffer resources themselves needing to be v4i128.
5271   if (hasBufferRsrcWorkaround(Ty))
5272     return castBufferRsrcToV4I32(VData, B);
5273 
5274   // Fixup illegal register types for i8 stores.
5275   if (Ty == LLT::scalar(8) || Ty == S16) {
5276     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
5277     return AnyExt;
5278   }
5279 
5280   if (Ty.isVector()) {
5281     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
5282       if (IsFormat)
5283         return handleD16VData(B, *MRI, VData);
5284     }
5285   }
5286 
5287   return VData;
5288 }
5289 
5290 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
5291                                               MachineRegisterInfo &MRI,
5292                                               MachineIRBuilder &B,
5293                                               bool IsTyped,
5294                                               bool IsFormat) const {
5295   Register VData = MI.getOperand(1).getReg();
5296   LLT Ty = MRI.getType(VData);
5297   LLT EltTy = Ty.getScalarType();
5298   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
5299   const LLT S32 = LLT::scalar(32);
5300 
5301   VData = fixStoreSourceType(B, VData, IsFormat);
5302   castBufferRsrcArgToV4I32(MI, B, 2);
5303   Register RSrc = MI.getOperand(2).getReg();
5304 
5305   MachineMemOperand *MMO = *MI.memoperands_begin();
5306   const int MemSize = MMO->getSize();
5307 
5308   unsigned ImmOffset;
5309 
5310   // The typed intrinsics add an immediate after the registers.
5311   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
5312 
5313   // The struct intrinsic variants add one additional operand over raw.
5314   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
5315   Register VIndex;
5316   int OpOffset = 0;
5317   if (HasVIndex) {
5318     VIndex = MI.getOperand(3).getReg();
5319     OpOffset = 1;
5320   } else {
5321     VIndex = B.buildConstant(S32, 0).getReg(0);
5322   }
5323 
5324   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
5325   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
5326 
5327   unsigned Format = 0;
5328   if (IsTyped) {
5329     Format = MI.getOperand(5 + OpOffset).getImm();
5330     ++OpOffset;
5331   }
5332 
5333   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
5334 
5335   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
5336 
5337   unsigned Opc;
5338   if (IsTyped) {
5339     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
5340                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
5341   } else if (IsFormat) {
5342     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
5343                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
5344   } else {
5345     switch (MemSize) {
5346     case 1:
5347       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
5348       break;
5349     case 2:
5350       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
5351       break;
5352     default:
5353       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
5354       break;
5355     }
5356   }
5357 
5358   auto MIB = B.buildInstr(Opc)
5359     .addUse(VData)              // vdata
5360     .addUse(RSrc)               // rsrc
5361     .addUse(VIndex)             // vindex
5362     .addUse(VOffset)            // voffset
5363     .addUse(SOffset)            // soffset
5364     .addImm(ImmOffset);         // offset(imm)
5365 
5366   if (IsTyped)
5367     MIB.addImm(Format);
5368 
5369   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
5370      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
5371      .addMemOperand(MMO);
5372 
5373   MI.eraseFromParent();
5374   return true;
5375 }
5376 
5377 static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
5378                             Register VIndex, Register VOffset, Register SOffset,
5379                             unsigned ImmOffset, unsigned Format,
5380                             unsigned AuxiliaryData, MachineMemOperand *MMO,
5381                             bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
5382   auto MIB = B.buildInstr(Opc)
5383                  .addDef(LoadDstReg) // vdata
5384                  .addUse(RSrc)       // rsrc
5385                  .addUse(VIndex)     // vindex
5386                  .addUse(VOffset)    // voffset
5387                  .addUse(SOffset)    // soffset
5388                  .addImm(ImmOffset); // offset(imm)
5389 
5390   if (IsTyped)
5391     MIB.addImm(Format);
5392 
5393   MIB.addImm(AuxiliaryData)       // cachepolicy, swizzled buffer(imm)
5394       .addImm(HasVIndex ? -1 : 0) // idxen(imm)
5395       .addMemOperand(MMO);
5396 }
5397 
5398 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
5399                                              MachineRegisterInfo &MRI,
5400                                              MachineIRBuilder &B,
5401                                              bool IsFormat,
5402                                              bool IsTyped) const {
5403   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
5404   MachineMemOperand *MMO = *MI.memoperands_begin();
5405   const LLT MemTy = MMO->getMemoryType();
5406   const LLT S32 = LLT::scalar(32);
5407 
5408   Register Dst = MI.getOperand(0).getReg();
5409 
5410   Register StatusDst;
5411   int OpOffset = 0;
5412   assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
5413   bool IsTFE = MI.getNumExplicitDefs() == 2;
5414   if (IsTFE) {
5415     StatusDst = MI.getOperand(1).getReg();
5416     ++OpOffset;
5417   }
5418 
5419   castBufferRsrcArgToV4I32(MI, B, 2 + OpOffset);
5420   Register RSrc = MI.getOperand(2 + OpOffset).getReg();
5421 
5422   // The typed intrinsics add an immediate after the registers.
5423   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
5424 
5425   // The struct intrinsic variants add one additional operand over raw.
5426   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
5427   Register VIndex;
5428   if (HasVIndex) {
5429     VIndex = MI.getOperand(3 + OpOffset).getReg();
5430     ++OpOffset;
5431   } else {
5432     VIndex = B.buildConstant(S32, 0).getReg(0);
5433   }
5434 
5435   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
5436   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
5437 
5438   unsigned Format = 0;
5439   if (IsTyped) {
5440     Format = MI.getOperand(5 + OpOffset).getImm();
5441     ++OpOffset;
5442   }
5443 
5444   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
5445   unsigned ImmOffset;
5446 
5447   LLT Ty = MRI.getType(Dst);
5448   // Make addrspace 8 pointers loads into 4xs32 loads here, so the rest of the
5449   // logic doesn't have to handle that case.
5450   if (hasBufferRsrcWorkaround(Ty)) {
5451     Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0);
5452     Dst = MI.getOperand(0).getReg();
5453   }
5454   LLT EltTy = Ty.getScalarType();
5455   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
5456   const bool Unpacked = ST.hasUnpackedD16VMem();
5457 
5458   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
5459 
5460   unsigned Opc;
5461 
5462   // TODO: Support TFE for typed and narrow loads.
5463   if (IsTyped) {
5464     if (IsTFE)
5465       return false;
5466     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
5467                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
5468   } else if (IsFormat) {
5469     if (IsD16) {
5470       if (IsTFE)
5471         return false;
5472       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
5473     } else {
5474       Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
5475                   : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
5476     }
5477   } else {
5478     if (IsTFE)
5479       return false;
5480     switch (MemTy.getSizeInBits()) {
5481     case 8:
5482       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
5483       break;
5484     case 16:
5485       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
5486       break;
5487     default:
5488       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
5489       break;
5490     }
5491   }
5492 
5493   if (IsTFE) {
5494     unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
5495     unsigned NumLoadDWords = NumValueDWords + 1;
5496     LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32);
5497     Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
5498     buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
5499                     Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
5500     if (NumValueDWords == 1) {
5501       B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
5502     } else {
5503       SmallVector<Register, 5> LoadElts;
5504       for (unsigned I = 0; I != NumValueDWords; ++I)
5505         LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
5506       LoadElts.push_back(StatusDst);
5507       B.buildUnmerge(LoadElts, LoadDstReg);
5508       LoadElts.truncate(NumValueDWords);
5509       B.buildMergeLikeInstr(Dst, LoadElts);
5510     }
5511   } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
5512              (IsD16 && !Ty.isVector())) {
5513     Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
5514     buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
5515                     Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
5516     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
5517     B.buildTrunc(Dst, LoadDstReg);
5518   } else if (Unpacked && IsD16 && Ty.isVector()) {
5519     LLT UnpackedTy = Ty.changeElementSize(32);
5520     Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
5521     buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
5522                     Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
5523     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
5524     // FIXME: G_TRUNC should work, but legalization currently fails
5525     auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
5526     SmallVector<Register, 4> Repack;
5527     for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
5528       Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
5529     B.buildMergeLikeInstr(Dst, Repack);
5530   } else {
5531     buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
5532                     AuxiliaryData, MMO, IsTyped, HasVIndex, B);
5533   }
5534 
5535   MI.eraseFromParent();
5536   return true;
5537 }
5538 
5539 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
5540   switch (IntrID) {
5541   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5542   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
5543   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5544   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
5545     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
5546   case Intrinsic::amdgcn_raw_buffer_atomic_add:
5547   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
5548   case Intrinsic::amdgcn_struct_buffer_atomic_add:
5549   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
5550     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
5551   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5552   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
5553   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5554   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
5555     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
5556   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5557   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
5558   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5559   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
5560     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
5561   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5562   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
5563   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5564   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
5565     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
5566   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5567   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
5568   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5569   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
5570     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
5571   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5572   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
5573   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5574   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
5575     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
5576   case Intrinsic::amdgcn_raw_buffer_atomic_and:
5577   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
5578   case Intrinsic::amdgcn_struct_buffer_atomic_and:
5579   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
5580     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
5581   case Intrinsic::amdgcn_raw_buffer_atomic_or:
5582   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
5583   case Intrinsic::amdgcn_struct_buffer_atomic_or:
5584   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
5585     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
5586   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5587   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
5588   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
5589   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
5590     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
5591   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
5592   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
5593   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
5594   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
5595     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
5596   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
5597   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
5598   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
5599   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
5600     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
5601   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
5602   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
5603   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
5604   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
5605     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
5606   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
5607   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
5608   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
5609   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
5610     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
5611   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
5612   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
5613   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
5614   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
5615     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
5616   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
5617   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
5618   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
5619   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
5620     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
5621   default:
5622     llvm_unreachable("unhandled atomic opcode");
5623   }
5624 }
5625 
5626 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
5627                                                MachineIRBuilder &B,
5628                                                Intrinsic::ID IID) const {
5629   const bool IsCmpSwap =
5630       IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
5631       IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
5632       IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
5633       IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
5634   const bool HasReturn = MI.getNumExplicitDefs() != 0;
5635 
5636   Register Dst;
5637 
5638   int OpOffset = 0;
5639   if (HasReturn) {
5640     // A few FP atomics do not support return values.
5641     Dst = MI.getOperand(0).getReg();
5642   } else {
5643     OpOffset = -1;
5644   }
5645 
5646   // Since we don't have 128-bit atomics, we don't need to handle the case of
5647   // p8 argmunents to the atomic itself
5648   Register VData = MI.getOperand(2 + OpOffset).getReg();
5649   Register CmpVal;
5650 
5651   if (IsCmpSwap) {
5652     CmpVal = MI.getOperand(3 + OpOffset).getReg();
5653     ++OpOffset;
5654   }
5655 
5656   castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset);
5657   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
5658   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
5659 
5660   // The struct intrinsic variants add one additional operand over raw.
5661   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
5662   Register VIndex;
5663   if (HasVIndex) {
5664     VIndex = MI.getOperand(4 + OpOffset).getReg();
5665     ++OpOffset;
5666   } else {
5667     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
5668   }
5669 
5670   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
5671   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
5672   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
5673 
5674   MachineMemOperand *MMO = *MI.memoperands_begin();
5675 
5676   unsigned ImmOffset;
5677   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
5678 
5679   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
5680 
5681   if (HasReturn)
5682     MIB.addDef(Dst);
5683 
5684   MIB.addUse(VData); // vdata
5685 
5686   if (IsCmpSwap)
5687     MIB.addReg(CmpVal);
5688 
5689   MIB.addUse(RSrc)               // rsrc
5690      .addUse(VIndex)             // vindex
5691      .addUse(VOffset)            // voffset
5692      .addUse(SOffset)            // soffset
5693      .addImm(ImmOffset)          // offset(imm)
5694      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
5695      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
5696      .addMemOperand(MMO);
5697 
5698   MI.eraseFromParent();
5699   return true;
5700 }
5701 
5702 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
5703 /// vector with s16 typed elements.
5704 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
5705                                       SmallVectorImpl<Register> &PackedAddrs,
5706                                       unsigned ArgOffset,
5707                                       const AMDGPU::ImageDimIntrinsicInfo *Intr,
5708                                       bool IsA16, bool IsG16) {
5709   const LLT S16 = LLT::scalar(16);
5710   const LLT V2S16 = LLT::fixed_vector(2, 16);
5711   auto EndIdx = Intr->VAddrEnd;
5712 
5713   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
5714     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
5715     if (!SrcOp.isReg())
5716       continue; // _L to _LZ may have eliminated this.
5717 
5718     Register AddrReg = SrcOp.getReg();
5719 
5720     if ((I < Intr->GradientStart) ||
5721         (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
5722         (I >= Intr->CoordStart && !IsA16)) {
5723       if ((I < Intr->GradientStart) && IsA16 &&
5724           (B.getMRI()->getType(AddrReg) == S16)) {
5725         assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
5726         // Special handling of bias when A16 is on. Bias is of type half but
5727         // occupies full 32-bit.
5728         PackedAddrs.push_back(
5729             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
5730                 .getReg(0));
5731       } else {
5732         assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
5733                "Bias needs to be converted to 16 bit in A16 mode");
5734         // Handle any gradient or coordinate operands that should not be packed
5735         AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
5736         PackedAddrs.push_back(AddrReg);
5737       }
5738     } else {
5739       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
5740       // derivatives dx/dh and dx/dv are packed with undef.
5741       if (((I + 1) >= EndIdx) ||
5742           ((Intr->NumGradients / 2) % 2 == 1 &&
5743            (I == static_cast<unsigned>(Intr->GradientStart +
5744                                        (Intr->NumGradients / 2) - 1) ||
5745             I == static_cast<unsigned>(Intr->GradientStart +
5746                                        Intr->NumGradients - 1))) ||
5747           // Check for _L to _LZ optimization
5748           !MI.getOperand(ArgOffset + I + 1).isReg()) {
5749         PackedAddrs.push_back(
5750             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
5751                 .getReg(0));
5752       } else {
5753         PackedAddrs.push_back(
5754             B.buildBuildVector(
5755                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
5756                 .getReg(0));
5757         ++I;
5758       }
5759     }
5760   }
5761 }
5762 
5763 /// Convert from separate vaddr components to a single vector address register,
5764 /// and replace the remaining operands with $noreg.
5765 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
5766                                      int DimIdx, int NumVAddrs) {
5767   const LLT S32 = LLT::scalar(32);
5768   (void)S32;
5769   SmallVector<Register, 8> AddrRegs;
5770   for (int I = 0; I != NumVAddrs; ++I) {
5771     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
5772     if (SrcOp.isReg()) {
5773       AddrRegs.push_back(SrcOp.getReg());
5774       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
5775     }
5776   }
5777 
5778   int NumAddrRegs = AddrRegs.size();
5779   if (NumAddrRegs != 1) {
5780     auto VAddr =
5781         B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
5782     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
5783   }
5784 
5785   for (int I = 1; I != NumVAddrs; ++I) {
5786     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
5787     if (SrcOp.isReg())
5788       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
5789   }
5790 }
5791 
5792 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
5793 ///
5794 /// Depending on the subtarget, load/store with 16-bit element data need to be
5795 /// rewritten to use the low half of 32-bit registers, or directly use a packed
5796 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
5797 /// registers.
5798 ///
5799 /// We don't want to directly select image instructions just yet, but also want
5800 /// to expose all register repacking to the legalizer/combiners. We also don't
5801 /// want a selected instruction entering RegBankSelect. In order to avoid
5802 /// defining a multitude of intermediate image instructions, directly hack on
5803 /// the intrinsic's arguments. In cases like a16 addresses, this requires
5804 /// padding now unnecessary arguments with $noreg.
     ///
     /// \returns false at the individual `return false` sites below (operand
     /// types or subtarget features that are not handled), and true once \p MI
     /// has been rewritten in place or, for a no-op load, replaced by an undef.
5805 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
5806     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
5807     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
5808
5809   const MachineFunction &MF = *MI.getMF();
5810   const unsigned NumDefs = MI.getNumExplicitDefs();
       // The operand indices stored in Intr are applied relative to this offset,
       // i.e. past the defs and the single operand that follows them.
5811   const unsigned ArgOffset = NumDefs + 1;
       // Two defs: the data result plus a second (TFE) result; see the TFE
       // handling further down.
5812   bool IsTFE = NumDefs == 2;
5813   // We are only processing the operands of d16 image operations on subtargets
5814   // that use the unpacked register layout, or need to repack the TFE result.
5815
5816   // TODO: Do we need to guard against already legalized intrinsics?
5817   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5818       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
5819
5820   MachineRegisterInfo *MRI = B.getMRI();
5821   const LLT S32 = LLT::scalar(32);
5822   const LLT S16 = LLT::scalar(16);
5823   const LLT V2S16 = LLT::fixed_vector(2, 16);
5824
5825   unsigned DMask = 0;
       // With no defs the data value is operand 1; otherwise use the first def.
5826   Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
5827   LLT Ty = MRI->getType(VData);
5828
5829   // Check for 16 bit addresses and pack if true.
5830   LLT GradTy =
5831       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
5832   LLT AddrTy =
5833       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
       // On subtargets with G16, a 16-bit type at the gradient position only
       // counts if the base opcode really has gradients; without G16 the operand
       // type alone decides.
5834   const bool IsG16 =
5835       ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
5836   const bool IsA16 = AddrTy == S16;
5837   const bool IsD16 = Ty.getScalarType() == S16;
5838
       // Number of data lanes selected by the dmask (always 4 for gather4). It
       // stays 0 for atomics, which do not read a dmask here.
5839   int DMaskLanes = 0;
5840   if (!BaseOpcode->Atomic) {
5841     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
5842     if (BaseOpcode->Gather4) {
5843       DMaskLanes = 4;
5844     } else if (DMask != 0) {
5845       DMaskLanes = llvm::popcount(DMask);
5846     } else if (!IsTFE && !BaseOpcode->Store) {
5847       // If dmask is 0, this is a no-op load. This can be eliminated.
5848       B.buildUndef(MI.getOperand(0));
5849       MI.eraseFromParent();
5850       return true;
5851     }
5852   }
5853
       // From here on MI is modified in place. The scope-exit object reports the
       // change to the observer on every return path below, including the ones
       // that return false.
5854   Observer.changingInstr(MI);
5855   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
5856
       // No defs selects the store pseudo, anything else the load pseudo; the D16
       // variants are used when the data element type is s16.
5857   const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
5858                                      : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
5859   const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
5860                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
5861   unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;
5862
5863   // Track that we legalized this
5864   MI.setDesc(B.getTII().get(NewOpcode));
5865
5866   // Expecting to get an error flag since TFC is on - and dmask is 0. Force
5867   // dmask to be at least 1 otherwise the instruction will fail.
5868   if (IsTFE && DMask == 0) {
5869     DMask = 0x1;
5870     DMaskLanes = 1;
5871     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
5872   }
5873
5874   if (BaseOpcode->Atomic) {
         // Operand 2 is the first data value. Note that this Ty shadows the
         // function-level Ty declared above.
5875     Register VData0 = MI.getOperand(2).getReg();
5876     LLT Ty = MRI->getType(VData0);
5877
5878     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
5879     if (Ty.isVector())
5880       return false;
5881
5882     if (BaseOpcode->AtomicX2) {
5883       Register VData1 = MI.getOperand(3).getReg();
5884       // The two values are packed in one register.
5885       LLT PackedTy = LLT::fixed_vector(2, Ty);
5886       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
5887       MI.getOperand(2).setReg(Concat.getReg(0));
5888       MI.getOperand(3).setReg(AMDGPU::NoRegister);
5889     }
5890   }
5891
5892   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
5893
5894   // Rewrite the addressing register layout before doing anything else.
5895   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
5896     // 16 bit gradients are supported, but are tied to the A16 control
5897     // so both gradients and addresses must be 16 bit
5898     return false;
5899   }
5900
5901   if (IsA16 && !ST.hasA16()) {
5902     // A16 not supported
5903     return false;
5904   }
5905
5906   const unsigned NSAMaxSize = ST.getNSAMaxSize();
5907   const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
5908
5909   if (IsA16 || IsG16) {
5910     if (Intr->NumVAddrs > 1) {
5911       SmallVector<Register, 4> PackedRegs;
5912
5913       packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
5914                                 IsG16);
5915
5916       // See also below in the non-a16 branch
5917       const bool UseNSA = ST.hasNSAEncoding() &&
5918                           PackedRegs.size() >= ST.getNSAThreshold(MF) &&
5919                           (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
5920       const bool UsePartialNSA =
5921           UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;
5922
5923       if (UsePartialNSA) {
5924         // Pack registers that would go over NSAMaxSize into last VAddr register
5925         LLT PackedAddrTy =
5926             LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16);
5927         auto Concat = B.buildConcatVectors(
5928             PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
5929         PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
5930         PackedRegs.resize(NSAMaxSize);
5931       } else if (!UseNSA && PackedRegs.size() > 1) {
             // Without NSA, all packed dwords are concatenated into one register.
5932         LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
5933         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
5934         PackedRegs[0] = Concat.getReg(0);
5935         PackedRegs.resize(1);
5936       }
5937
           // Write the packed registers back into the leading vaddr operands and
           // set the remaining register operands to $noreg.
5938       const unsigned NumPacked = PackedRegs.size();
5939       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
5940         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
5941         if (!SrcOp.isReg()) {
5942           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
5943           continue;
5944         }
5945
5946         assert(SrcOp.getReg() != AMDGPU::NoRegister);
5947
5948         if (I - Intr->VAddrStart < NumPacked)
5949           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
5950         else
5951           SrcOp.setReg(AMDGPU::NoRegister);
5952       }
5953     }
5954   } else {
5955     // If the register allocator cannot place the address registers contiguously
5956     // without introducing moves, then using the non-sequential address encoding
5957     // is always preferable, since it saves VALU instructions and is usually a
5958     // wash in terms of code size or even better.
5959     //
5960     // However, we currently have no way of hinting to the register allocator
5961     // that MIMG addresses should be placed contiguously when it is possible to
5962     // do so, so force non-NSA for the common 2-address case as a heuristic.
5963     //
5964     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
5965     // allocation when possible.
5966     //
5967     // Partial NSA is allowed on GFX11 where the final register is a contiguous
5968     // set of the remaining addresses.
5969     const bool UseNSA = ST.hasNSAEncoding() &&
5970                         CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
5971                         (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
5972     const bool UsePartialNSA =
5973         UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
5974
5975     if (UsePartialNSA) {
           // Only the addresses from slot NSAMaxSize - 1 onwards are merged.
5976       convertImageAddrToPacked(B, MI,
5977                                ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
5978                                Intr->NumVAddrs - NSAMaxSize + 1);
5979     } else if (!UseNSA && Intr->NumVAddrs > 1) {
5980       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
5981                                Intr->NumVAddrs);
5982     }
5983   }
5984
       // Record the address mode in a trailing immediate operand:
       // bit 0 = A16, bit 1 = G16.
5985   int Flags = 0;
5986   if (IsA16)
5987     Flags |= 1;
5988   if (IsG16)
5989     Flags |= 2;
5990   MI.addOperand(MachineOperand::CreateImm(Flags));
5991
5992   if (BaseOpcode->Store) { // No TFE for stores?
5993     // TODO: Handle dmask trim
         // Only vector D16 store data may need repacking.
5994     if (!Ty.isVector() || !IsD16)
5995       return true;
5996
5997     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
5998     if (RepackedReg != VData) {
5999       MI.getOperand(1).setReg(RepackedReg);
6000     }
6001
6002     return true;
6003   }
6004
6005   Register DstReg = MI.getOperand(0).getReg();
6006   const LLT EltTy = Ty.getScalarType();
6007   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
6008
6009   // Confirm that the return type is large enough for the dmask specified
6010   if (NumElts < DMaskLanes)
6011     return false;
6012
       // Results with more than four elements or lanes are rejected.
6013   if (NumElts > 4 || DMaskLanes > 4)
6014     return false;
6015
       // The result type trimmed to the lanes the dmask enables (at least one).
6016   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
6017   const LLT AdjustedTy =
6018       Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
6019
6020   // The raw dword aligned data component of the load. The only legal cases
6021   // where this matters should be when using the packed D16 format, for
6022   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
6023   LLT RoundedTy;
6024
6025   // S32 vector to cover all data, plus TFE result element.
6026   LLT TFETy;
6027
6028   // Register type to use for each loaded component. Will be S32 or V2S16.
6029   LLT RegTy;
6030
6031   if (IsD16 && ST.hasUnpackedD16VMem()) {
         // Unpacked D16: every 16-bit element occupies its own 32-bit register.
6032     RoundedTy =
6033         LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
6034     TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
6035     RegTy = S32;
6036   } else {
         // Otherwise round the data size up to a whole number of dwords while
         // keeping the element size.
6037     unsigned EltSize = EltTy.getSizeInBits();
6038     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
6039     unsigned RoundedSize = 32 * RoundedElts;
6040     RoundedTy = LLT::scalarOrVector(
6041         ElementCount::getFixed(RoundedSize / EltSize), EltSize);
6042     TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
6043     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
6044   }
6045
6046   // The return type does not need adjustment.
6047   // TODO: Should we change s16 case to s32 or <2 x s16>?
6048   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
6049     return true;
6050
6051   Register Dst1Reg;
6052
6053   // Insert after the instruction.
6054   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
6055
6056   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
6057   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
6058   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
6059   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
6060
6061   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
6062
6063   MI.getOperand(0).setReg(NewResultReg);
6064
6065   // In the IR, TFE is supposed to be used with a 2 element struct return
6066   // type. The instruction really returns these two values in one contiguous
6067   // register, with one additional dword beyond the loaded data. Rewrite the
6068   // return type to use a single register result.
6069
6070   if (IsTFE) {
6071     Dst1Reg = MI.getOperand(1).getReg();
6072     if (MRI->getType(Dst1Reg) != S32)
6073       return false;
6074
6075     // TODO: Make sure the TFE operand bit is set.
6076     MI.removeOperand(1);
6077
6078     // Handle the easy case that requires no repack instructions.
6079     if (Ty == S32) {
6080       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
6081       return true;
6082     }
6083   }
6084
6085   // Now figure out how to copy the new result register back into the old
6086   // result.
       // Every slot starts out as Dst1Reg so that, with TFE, the last piece of
       // the unmerge below is defined directly into the original second def; the
       // data slots are replaced with fresh registers before unmerging.
6087   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
6088
6089   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
6090
6091   if (ResultNumRegs == 1) {
6092     assert(!IsTFE);
6093     ResultRegs[0] = NewResultReg;
6094   } else {
6095     // We have to repack into a new vector of some kind.
6096     for (int I = 0; I != NumDataRegs; ++I)
6097       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
6098     B.buildUnmerge(ResultRegs, NewResultReg);
6099
6100     // Drop the final TFE element to get the data part. The TFE result is
6101     // directly written to the right place already.
6102     if (IsTFE)
6103       ResultRegs.resize(NumDataRegs);
6104   }
6105
6106   // For an s16 scalar result, we form an s32 result with a truncate regardless
6107   // of packed vs. unpacked.
6108   if (IsD16 && !Ty.isVector()) {
6109     B.buildTrunc(DstReg, ResultRegs[0]);
6110     return true;
6111   }
6112
6113   // Avoid a build/concat_vector of 1 entry.
6114   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
6115     B.buildBitcast(DstReg, ResultRegs[0]);
6116     return true;
6117   }
6118
6119   assert(Ty.isVector());
6120
6121   if (IsD16) {
6122     // For packed D16 results with TFE enabled, all the data components are
6123     // S32. Cast back to the expected type.
6124     //
6125     // TODO: We don't really need to use load s32 elements. We would only need one
6126     // cast for the TFE result if a multiple of v2s16 was used.
6127     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
6128       for (Register &Reg : ResultRegs)
6129         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
6130     } else if (ST.hasUnpackedD16VMem()) {
           // Unpacked layout: each s32 piece is narrowed to s16.
6131       for (Register &Reg : ResultRegs)
6132         Reg = B.buildTrunc(S16, Reg).getReg(0);
6133     }
6134   }
6135
       // Appends NumElts copies of a single undef value of type Ty to ResultRegs.
       // Both parameters shadow the function-level Ty / NumElts.
6136   auto padWithUndef = [&](LLT Ty, int NumElts) {
6137     if (NumElts == 0)
6138       return;
6139     Register Undef = B.buildUndef(Ty).getReg(0);
6140     for (int I = 0; I != NumElts; ++I)
6141       ResultRegs.push_back(Undef);
6142   };
6143
6144   // Pad out any elements eliminated due to the dmask.
6145   LLT ResTy = MRI->getType(ResultRegs[0]);
6146   if (!ResTy.isVector()) {
6147     padWithUndef(ResTy, NumElts - ResultRegs.size());
6148     B.buildBuildVector(DstReg, ResultRegs);
6149     return true;
6150   }
6151
6152   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
6153   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
6154
6155   // Deal with the one annoying legal case.
6156   const LLT V3S16 = LLT::fixed_vector(3, 16);
6157   if (Ty == V3S16) {
         // Without TFE, NewResultReg is still the register the load itself
         // defines; with TFE it is rebuilt from the data pieces first.
6158     if (IsTFE) {
6159       if (ResultRegs.size() == 1) {
6160         NewResultReg = ResultRegs[0];
6161       } else if (ResultRegs.size() == 2) {
6162         LLT V4S16 = LLT::fixed_vector(4, 16);
6163         NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
6164       } else {
6165         return false;
6166       }
6167     }
6168
         // Trim or pad NewResultReg to the element count of the original result.
6169     if (MRI->getType(DstReg).getNumElements() <
6170         MRI->getType(NewResultReg).getNumElements()) {
6171       B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
6172     } else {
6173       B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
6174     }
6175     return true;
6176   }
6177
6178   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
6179   B.buildConcatVectors(DstReg, ResultRegs);
6180   return true;
6181 }
6182 
6183 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
6184   LegalizerHelper &Helper, MachineInstr &MI) const {
6185   MachineIRBuilder &B = Helper.MIRBuilder;
6186   GISelChangeObserver &Observer = Helper.Observer;
6187 
6188   Register Dst = MI.getOperand(0).getReg();
6189   LLT Ty = B.getMRI()->getType(Dst);
6190   unsigned Size = Ty.getSizeInBits();
6191   MachineFunction &MF = B.getMF();
6192 
6193   Observer.changingInstr(MI);
6194 
6195   // Handle needing to s.buffer.load() a p8 value.
6196   if (hasBufferRsrcWorkaround(Ty)) {
6197     Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0);
6198     Dst = MI.getOperand(0).getReg();
6199     B.setInsertPt(B.getMBB(), MI);
6200   }
6201   if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
6202     Ty = getBitcastRegisterType(Ty);
6203     Helper.bitcastDst(MI, Ty, 0);
6204     Dst = MI.getOperand(0).getReg();
6205     B.setInsertPt(B.getMBB(), MI);
6206   }
6207 
6208   // FIXME: We don't really need this intermediate instruction. The intrinsic
6209   // should be fixed to have a memory operand. Since it's readnone, we're not
6210   // allowed to add one.
6211   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
6212   MI.removeOperand(1); // Remove intrinsic ID
6213 
6214   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
6215   // TODO: Should this use datalayout alignment?
6216   const unsigned MemSize = (Size + 7) / 8;
6217   const Align MemAlign(4);
6218   MachineMemOperand *MMO = MF.getMachineMemOperand(
6219       MachinePointerInfo(),
6220       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
6221           MachineMemOperand::MOInvariant,
6222       MemSize, MemAlign);
6223   MI.addMemOperand(MF, MMO);
6224 
6225   // There are no 96-bit result scalar loads, but widening to 128-bit should
6226   // always be legal. We may need to restore this to a 96-bit result if it turns
6227   // out this needs to be converted to a vector load during RegBankSelect.
6228   if (!isPowerOf2_32(Size)) {
6229     if (Ty.isVector())
6230       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
6231     else
6232       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
6233   }
6234 
6235   Observer.changedInstr(MI);
6236   return true;
6237 }
6238 
6239 // TODO: Move to selection
6240 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
6241                                                 MachineRegisterInfo &MRI,
6242                                                 MachineIRBuilder &B) const {
6243   if (!ST.isTrapHandlerEnabled() ||
6244       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
6245     return legalizeTrapEndpgm(MI, MRI, B);
6246 
6247   const Module *M = B.getMF().getFunction().getParent();
6248   unsigned CodeObjectVersion = AMDGPU::getCodeObjectVersion(*M);
6249   if (CodeObjectVersion <= AMDGPU::AMDHSA_COV3)
6250     return legalizeTrapHsaQueuePtr(MI, MRI, B);
6251 
6252   return ST.supportsGetDoorbellID() ?
6253          legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B);
6254 }
6255 
6256 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
6257     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6258   const DebugLoc &DL = MI.getDebugLoc();
6259   MachineBasicBlock &BB = B.getMBB();
6260   MachineFunction *MF = BB.getParent();
6261 
6262   if (BB.succ_empty() && std::next(MI.getIterator()) == BB.end()) {
6263     BuildMI(BB, BB.end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
6264       .addImm(0);
6265     MI.eraseFromParent();
6266     return true;
6267   }
6268 
6269   // We need a block split to make the real endpgm a terminator. We also don't
6270   // want to break phis in successor blocks, so we can't just delete to the
6271   // end of the block.
6272   BB.splitAt(MI, false /*UpdateLiveIns*/);
6273   MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
6274   MF->push_back(TrapBB);
6275   BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
6276     .addImm(0);
6277   BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
6278     .addMBB(TrapBB);
6279 
6280   BB.addSuccessor(TrapBB);
6281   MI.eraseFromParent();
6282   return true;
6283 }
6284 
6285 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
6286     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6287   MachineFunction &MF = B.getMF();
6288   const LLT S64 = LLT::scalar(64);
6289 
6290   Register SGPR01(AMDGPU::SGPR0_SGPR1);
6291   // For code object version 5, queue_ptr is passed through implicit kernarg.
6292   if (AMDGPU::getCodeObjectVersion(*MF.getFunction().getParent()) >=
6293       AMDGPU::AMDHSA_COV5) {
6294     AMDGPUTargetLowering::ImplicitParameter Param =
6295         AMDGPUTargetLowering::QUEUE_PTR;
6296     uint64_t Offset =
6297         ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
6298 
6299     Register KernargPtrReg = MRI.createGenericVirtualRegister(
6300         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
6301 
6302     if (!loadInputValue(KernargPtrReg, B,
6303                         AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
6304       return false;
6305 
6306     // TODO: can we be smarter about machine pointer info?
6307     MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
6308     MachineMemOperand *MMO = MF.getMachineMemOperand(
6309         PtrInfo,
6310         MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
6311             MachineMemOperand::MOInvariant,
6312         LLT::scalar(64), commonAlignment(Align(64), Offset));
6313 
6314     // Pointer address
6315     Register LoadAddr = MRI.createGenericVirtualRegister(
6316         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
6317     B.buildPtrAdd(LoadAddr, KernargPtrReg,
6318                   B.buildConstant(LLT::scalar(64), Offset).getReg(0));
6319     // Load address
6320     Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
6321     B.buildCopy(SGPR01, Temp);
6322     B.buildInstr(AMDGPU::S_TRAP)
6323         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
6324         .addReg(SGPR01, RegState::Implicit);
6325     MI.eraseFromParent();
6326     return true;
6327   }
6328 
6329   // Pass queue pointer to trap handler as input, and insert trap instruction
6330   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
6331   Register LiveIn =
6332     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
6333   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
6334     return false;
6335 
6336   B.buildCopy(SGPR01, LiveIn);
6337   B.buildInstr(AMDGPU::S_TRAP)
6338       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
6339       .addReg(SGPR01, RegState::Implicit);
6340 
6341   MI.eraseFromParent();
6342   return true;
6343 }
6344 
6345 bool AMDGPULegalizerInfo::legalizeTrapHsa(
6346     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6347   B.buildInstr(AMDGPU::S_TRAP)
6348       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
6349   MI.eraseFromParent();
6350   return true;
6351 }
6352 
6353 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
6354     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6355   // Is non-HSA path or trap-handler disabled? Then, report a warning
6356   // accordingly
6357   if (!ST.isTrapHandlerEnabled() ||
6358       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
6359     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
6360                                      "debugtrap handler not supported",
6361                                      MI.getDebugLoc(), DS_Warning);
6362     LLVMContext &Ctx = B.getMF().getFunction().getContext();
6363     Ctx.diagnose(NoTrap);
6364   } else {
6365     // Insert debug-trap instruction
6366     B.buildInstr(AMDGPU::S_TRAP)
6367         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
6368   }
6369 
6370   MI.eraseFromParent();
6371   return true;
6372 }
6373 
6374 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
6375                                                MachineIRBuilder &B) const {
6376   MachineRegisterInfo &MRI = *B.getMRI();
6377   const LLT S16 = LLT::scalar(16);
6378   const LLT S32 = LLT::scalar(32);
6379   const LLT V2S16 = LLT::fixed_vector(2, 16);
6380   const LLT V3S32 = LLT::fixed_vector(3, 32);
6381 
6382   Register DstReg = MI.getOperand(0).getReg();
6383   Register NodePtr = MI.getOperand(2).getReg();
6384   Register RayExtent = MI.getOperand(3).getReg();
6385   Register RayOrigin = MI.getOperand(4).getReg();
6386   Register RayDir = MI.getOperand(5).getReg();
6387   Register RayInvDir = MI.getOperand(6).getReg();
6388   Register TDescr = MI.getOperand(7).getReg();
6389 
6390   if (!ST.hasGFX10_AEncoding()) {
6391     DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
6392                                         "intrinsic not supported on subtarget",
6393                                         MI.getDebugLoc());
6394     B.getMF().getFunction().getContext().diagnose(BadIntrin);
6395     return false;
6396   }
6397 
6398   const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
6399   const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
6400   const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
6401   const unsigned NumVDataDwords = 4;
6402   const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
6403   const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
6404   const bool UseNSA = ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize();
6405   const unsigned BaseOpcodes[2][2] = {
6406       {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
6407       {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
6408        AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
6409   int Opcode;
6410   if (UseNSA) {
6411     Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
6412                                    IsGFX11Plus ? AMDGPU::MIMGEncGfx11NSA
6413                                                : AMDGPU::MIMGEncGfx10NSA,
6414                                    NumVDataDwords, NumVAddrDwords);
6415   } else {
6416     Opcode = AMDGPU::getMIMGOpcode(
6417         BaseOpcodes[Is64][IsA16],
6418         IsGFX11Plus ? AMDGPU::MIMGEncGfx11Default : AMDGPU::MIMGEncGfx10Default,
6419         NumVDataDwords, NumVAddrDwords);
6420   }
6421   assert(Opcode != -1);
6422 
6423   SmallVector<Register, 12> Ops;
6424   if (UseNSA && IsGFX11Plus) {
6425     auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
6426       auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
6427       auto Merged = B.buildMergeLikeInstr(
6428           V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
6429       Ops.push_back(Merged.getReg(0));
6430     };
6431 
6432     Ops.push_back(NodePtr);
6433     Ops.push_back(RayExtent);
6434     packLanes(RayOrigin);
6435 
6436     if (IsA16) {
6437       auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
6438       auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
6439       auto MergedDir = B.buildMergeLikeInstr(
6440           V3S32,
6441           {B.buildBitcast(
6442                 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
6443                                                    UnmergeRayDir.getReg(0)}))
6444                .getReg(0),
6445            B.buildBitcast(
6446                 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
6447                                                    UnmergeRayDir.getReg(1)}))
6448                .getReg(0),
6449            B.buildBitcast(
6450                 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
6451                                                    UnmergeRayDir.getReg(2)}))
6452                .getReg(0)});
6453       Ops.push_back(MergedDir.getReg(0));
6454     } else {
6455       packLanes(RayDir);
6456       packLanes(RayInvDir);
6457     }
6458   } else {
6459     if (Is64) {
6460       auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
6461       Ops.push_back(Unmerge.getReg(0));
6462       Ops.push_back(Unmerge.getReg(1));
6463     } else {
6464       Ops.push_back(NodePtr);
6465     }
6466     Ops.push_back(RayExtent);
6467 
6468     auto packLanes = [&Ops, &S32, &B](Register Src) {
6469       auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
6470       Ops.push_back(Unmerge.getReg(0));
6471       Ops.push_back(Unmerge.getReg(1));
6472       Ops.push_back(Unmerge.getReg(2));
6473     };
6474 
6475     packLanes(RayOrigin);
6476     if (IsA16) {
6477       auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
6478       auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
6479       Register R1 = MRI.createGenericVirtualRegister(S32);
6480       Register R2 = MRI.createGenericVirtualRegister(S32);
6481       Register R3 = MRI.createGenericVirtualRegister(S32);
6482       B.buildMergeLikeInstr(R1,
6483                             {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
6484       B.buildMergeLikeInstr(
6485           R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
6486       B.buildMergeLikeInstr(
6487           R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
6488       Ops.push_back(R1);
6489       Ops.push_back(R2);
6490       Ops.push_back(R3);
6491     } else {
6492       packLanes(RayDir);
6493       packLanes(RayInvDir);
6494     }
6495   }
6496 
6497   if (!UseNSA) {
6498     // Build a single vector containing all the operands so far prepared.
6499     LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
6500     Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
6501     Ops.clear();
6502     Ops.push_back(MergedOps);
6503   }
6504 
6505   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
6506     .addDef(DstReg)
6507     .addImm(Opcode);
6508 
6509   for (Register R : Ops) {
6510     MIB.addUse(R);
6511   }
6512 
6513   MIB.addUse(TDescr)
6514      .addImm(IsA16 ? 1 : 0)
6515      .cloneMemRefs(MI);
6516 
6517   MI.eraseFromParent();
6518   return true;
6519 }
6520 
6521 bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
6522                                                MachineIRBuilder &B) const {
6523   unsigned Opc;
6524   int RoundMode = MI.getOperand(2).getImm();
6525 
6526   if (RoundMode == (int)RoundingMode::TowardPositive)
6527     Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
6528   else if (RoundMode == (int)RoundingMode::TowardNegative)
6529     Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
6530   else
6531     return false;
6532 
6533   B.buildInstr(Opc)
6534       .addDef(MI.getOperand(0).getReg())
6535       .addUse(MI.getOperand(1).getReg());
6536 
6537   MI.eraseFromParent();
6538 
6539   return true;
6540 }
6541 
6542 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
6543                                             MachineInstr &MI) const {
6544   MachineIRBuilder &B = Helper.MIRBuilder;
6545   MachineRegisterInfo &MRI = *B.getMRI();
6546 
6547   // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
6548   auto IntrID = MI.getIntrinsicID();
6549   switch (IntrID) {
6550   case Intrinsic::amdgcn_if:
6551   case Intrinsic::amdgcn_else: {
6552     MachineInstr *Br = nullptr;
6553     MachineBasicBlock *UncondBrTarget = nullptr;
6554     bool Negated = false;
6555     if (MachineInstr *BrCond =
6556             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
6557       const SIRegisterInfo *TRI
6558         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
6559 
6560       Register Def = MI.getOperand(1).getReg();
6561       Register Use = MI.getOperand(3).getReg();
6562 
6563       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
6564 
6565       if (Negated)
6566         std::swap(CondBrTarget, UncondBrTarget);
6567 
6568       B.setInsertPt(B.getMBB(), BrCond->getIterator());
6569       if (IntrID == Intrinsic::amdgcn_if) {
6570         B.buildInstr(AMDGPU::SI_IF)
6571           .addDef(Def)
6572           .addUse(Use)
6573           .addMBB(UncondBrTarget);
6574       } else {
6575         B.buildInstr(AMDGPU::SI_ELSE)
6576             .addDef(Def)
6577             .addUse(Use)
6578             .addMBB(UncondBrTarget);
6579       }
6580 
6581       if (Br) {
6582         Br->getOperand(0).setMBB(CondBrTarget);
6583       } else {
6584         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
6585         // since we're swapping branch targets it needs to be reinserted.
6586         // FIXME: IRTranslator should probably not do this
6587         B.buildBr(*CondBrTarget);
6588       }
6589 
6590       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
6591       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
6592       MI.eraseFromParent();
6593       BrCond->eraseFromParent();
6594       return true;
6595     }
6596 
6597     return false;
6598   }
6599   case Intrinsic::amdgcn_loop: {
6600     MachineInstr *Br = nullptr;
6601     MachineBasicBlock *UncondBrTarget = nullptr;
6602     bool Negated = false;
6603     if (MachineInstr *BrCond =
6604             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
6605       const SIRegisterInfo *TRI
6606         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
6607 
6608       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
6609       Register Reg = MI.getOperand(2).getReg();
6610 
6611       if (Negated)
6612         std::swap(CondBrTarget, UncondBrTarget);
6613 
6614       B.setInsertPt(B.getMBB(), BrCond->getIterator());
6615       B.buildInstr(AMDGPU::SI_LOOP)
6616         .addUse(Reg)
6617         .addMBB(UncondBrTarget);
6618 
6619       if (Br)
6620         Br->getOperand(0).setMBB(CondBrTarget);
6621       else
6622         B.buildBr(*CondBrTarget);
6623 
6624       MI.eraseFromParent();
6625       BrCond->eraseFromParent();
6626       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
6627       return true;
6628     }
6629 
6630     return false;
6631   }
6632   case Intrinsic::amdgcn_make_buffer_rsrc:
6633     return legalizePointerAsRsrcIntrin(MI, MRI, B);
6634   case Intrinsic::amdgcn_kernarg_segment_ptr:
6635     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
6636       // This only makes sense to call in a kernel, so just lower to null.
6637       B.buildConstant(MI.getOperand(0).getReg(), 0);
6638       MI.eraseFromParent();
6639       return true;
6640     }
6641 
6642     return legalizePreloadedArgIntrin(
6643       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
6644   case Intrinsic::amdgcn_implicitarg_ptr:
6645     return legalizeImplicitArgPtr(MI, MRI, B);
6646   case Intrinsic::amdgcn_workitem_id_x:
6647     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
6648                                        AMDGPUFunctionArgInfo::WORKITEM_ID_X);
6649   case Intrinsic::amdgcn_workitem_id_y:
6650     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
6651                                        AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
6652   case Intrinsic::amdgcn_workitem_id_z:
6653     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
6654                                        AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
6655   case Intrinsic::amdgcn_workgroup_id_x:
6656     return legalizePreloadedArgIntrin(MI, MRI, B,
6657                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
6658   case Intrinsic::amdgcn_workgroup_id_y:
6659     return legalizePreloadedArgIntrin(MI, MRI, B,
6660                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
6661   case Intrinsic::amdgcn_workgroup_id_z:
6662     return legalizePreloadedArgIntrin(MI, MRI, B,
6663                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
6664   case Intrinsic::amdgcn_lds_kernel_id:
6665     return legalizePreloadedArgIntrin(MI, MRI, B,
6666                                       AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
6667   case Intrinsic::amdgcn_dispatch_ptr:
6668     return legalizePreloadedArgIntrin(MI, MRI, B,
6669                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
6670   case Intrinsic::amdgcn_queue_ptr:
6671     return legalizePreloadedArgIntrin(MI, MRI, B,
6672                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
6673   case Intrinsic::amdgcn_implicit_buffer_ptr:
6674     return legalizePreloadedArgIntrin(
6675       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
6676   case Intrinsic::amdgcn_dispatch_id:
6677     return legalizePreloadedArgIntrin(MI, MRI, B,
6678                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
6679   case Intrinsic::r600_read_ngroups_x:
6680     // TODO: Emit error for hsa
6681     return legalizeKernargMemParameter(MI, B,
6682                                        SI::KernelInputOffsets::NGROUPS_X);
6683   case Intrinsic::r600_read_ngroups_y:
6684     return legalizeKernargMemParameter(MI, B,
6685                                        SI::KernelInputOffsets::NGROUPS_Y);
6686   case Intrinsic::r600_read_ngroups_z:
6687     return legalizeKernargMemParameter(MI, B,
6688                                        SI::KernelInputOffsets::NGROUPS_Z);
6689   case Intrinsic::r600_read_local_size_x:
6690     // TODO: Could insert G_ASSERT_ZEXT from s16
6691     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X);
6692   case Intrinsic::r600_read_local_size_y:
6693     // TODO: Could insert G_ASSERT_ZEXT from s16
6694     return legalizeKernargMemParameter(MI, B,  SI::KernelInputOffsets::LOCAL_SIZE_Y);
6695     // TODO: Could insert G_ASSERT_ZEXT from s16
6696   case Intrinsic::r600_read_local_size_z:
6697     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z);
6698   case Intrinsic::r600_read_global_size_x:
6699     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X);
6700   case Intrinsic::r600_read_global_size_y:
6701     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y);
6702   case Intrinsic::r600_read_global_size_z:
6703     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z);
6704   case Intrinsic::amdgcn_fdiv_fast:
6705     return legalizeFDIVFastIntrin(MI, MRI, B);
6706   case Intrinsic::amdgcn_is_shared:
6707     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
6708   case Intrinsic::amdgcn_is_private:
6709     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
6710   case Intrinsic::amdgcn_wavefrontsize: {
6711     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
6712     MI.eraseFromParent();
6713     return true;
6714   }
6715   case Intrinsic::amdgcn_s_buffer_load:
6716     return legalizeSBufferLoad(Helper, MI);
6717   case Intrinsic::amdgcn_raw_buffer_store:
6718   case Intrinsic::amdgcn_raw_ptr_buffer_store:
6719   case Intrinsic::amdgcn_struct_buffer_store:
6720   case Intrinsic::amdgcn_struct_ptr_buffer_store:
6721     return legalizeBufferStore(MI, MRI, B, false, false);
6722   case Intrinsic::amdgcn_raw_buffer_store_format:
6723   case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
6724   case Intrinsic::amdgcn_struct_buffer_store_format:
6725   case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
6726     return legalizeBufferStore(MI, MRI, B, false, true);
6727   case Intrinsic::amdgcn_raw_tbuffer_store:
6728   case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
6729   case Intrinsic::amdgcn_struct_tbuffer_store:
6730   case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
6731     return legalizeBufferStore(MI, MRI, B, true, true);
6732   case Intrinsic::amdgcn_raw_buffer_load:
6733   case Intrinsic::amdgcn_raw_ptr_buffer_load:
6734   case Intrinsic::amdgcn_struct_buffer_load:
6735   case Intrinsic::amdgcn_struct_ptr_buffer_load:
6736     return legalizeBufferLoad(MI, MRI, B, false, false);
6737   case Intrinsic::amdgcn_raw_buffer_load_format:
6738   case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
6739   case Intrinsic::amdgcn_struct_buffer_load_format:
6740   case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
6741     return legalizeBufferLoad(MI, MRI, B, true, false);
6742   case Intrinsic::amdgcn_raw_tbuffer_load:
6743   case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
6744   case Intrinsic::amdgcn_struct_tbuffer_load:
6745   case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
6746     return legalizeBufferLoad(MI, MRI, B, true, true);
6747   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
6748   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
6749   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
6750   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
6751   case Intrinsic::amdgcn_raw_buffer_atomic_add:
6752   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
6753   case Intrinsic::amdgcn_struct_buffer_atomic_add:
6754   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
6755   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
6756   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
6757   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
6758   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
6759   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
6760   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
6761   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
6762   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
6763   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
6764   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
6765   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
6766   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
6767   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
6768   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
6769   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
6770   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
6771   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
6772   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
6773   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
6774   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
6775   case Intrinsic::amdgcn_raw_buffer_atomic_and:
6776   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
6777   case Intrinsic::amdgcn_struct_buffer_atomic_and:
6778   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
6779   case Intrinsic::amdgcn_raw_buffer_atomic_or:
6780   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
6781   case Intrinsic::amdgcn_struct_buffer_atomic_or:
6782   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
6783   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
6784   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
6785   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
6786   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
6787   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
6788   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
6789   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
6790   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
6791   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
6792   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
6793   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
6794   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
6795   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
6796   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
6797   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
6798   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
6799   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
6800   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
6801   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
6802   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
6803   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
6804   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
6805   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
6806   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
6807   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
6808   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
6809   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
6810   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
6811     return legalizeBufferAtomic(MI, B, IntrID);
6812   case Intrinsic::trap:
6813     return legalizeTrapIntrinsic(MI, MRI, B);
6814   case Intrinsic::debugtrap:
6815     return legalizeDebugTrapIntrinsic(MI, MRI, B);
6816   case Intrinsic::amdgcn_rsq_clamp:
6817     return legalizeRsqClampIntrinsic(MI, MRI, B);
6818   case Intrinsic::amdgcn_ds_fadd:
6819   case Intrinsic::amdgcn_ds_fmin:
6820   case Intrinsic::amdgcn_ds_fmax:
6821     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
6822   case Intrinsic::amdgcn_image_bvh_intersect_ray:
6823     return legalizeBVHIntrinsic(MI, B);
6824   case Intrinsic::amdgcn_fmed3: {
6825     GISelChangeObserver &Observer = Helper.Observer;
6826 
6827     // FIXME: This is to workaround the inability of tablegen match combiners to
6828     // match intrinsics in patterns.
6829     Observer.changingInstr(MI);
6830     MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
6831     MI.removeOperand(1);
6832     Observer.changedInstr(MI);
6833     return true;
6834   }
6835   default: {
6836     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
6837             AMDGPU::getImageDimIntrinsicInfo(IntrID))
6838       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
6839     return true;
6840   }
6841   }
6842 
6843   return true;
6844 }
6845