xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (revision b64c5a0ace59af62eff52bfe110a521dc73c937b)
1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the Machinelegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
21 #include "SIInstrInfo.h"
22 #include "SIMachineFunctionInfo.h"
23 #include "SIRegisterInfo.h"
24 #include "Utils/AMDGPUBaseInfo.h"
25 #include "llvm/ADT/ScopeExit.h"
26 #include "llvm/BinaryFormat/ELF.h"
27 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
28 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
29 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
30 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
31 #include "llvm/CodeGen/GlobalISel/Utils.h"
32 #include "llvm/CodeGen/TargetOpcodes.h"
33 #include "llvm/IR/DiagnosticInfo.h"
34 #include "llvm/IR/IntrinsicsAMDGPU.h"
35 #include "llvm/IR/IntrinsicsR600.h"
36 
37 #define DEBUG_TYPE "amdgpu-legalinfo"
38 
39 using namespace llvm;
40 using namespace LegalizeActions;
41 using namespace LegalizeMutations;
42 using namespace LegalityPredicates;
43 using namespace MIPatternMatch;
44 
45 // Hack until load/store selection patterns support any tuple of legal types.
46 static cl::opt<bool> EnableNewLegality(
47   "amdgpu-global-isel-new-legality",
48   cl::desc("Use GlobalISel desired legality, rather than try to use"
49            "rules compatible with selection patterns"),
50   cl::init(false),
51   cl::ReallyHidden);
52 
// Upper bound, in bits, used by the register-size checks in this file (see
// isRegisterSize()) and as the width of the MaxScalar type.
static constexpr unsigned MaxRegisterSize = 1024;
54 
55 // Round the number of elements to the next power of two elements
56 static LLT getPow2VectorType(LLT Ty) {
57   unsigned NElts = Ty.getNumElements();
58   unsigned Pow2NElts = 1 <<  Log2_32_Ceil(NElts);
59   return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
60 }
61 
62 // Round the number of bits to the next power of two bits
63 static LLT getPow2ScalarType(LLT Ty) {
64   unsigned Bits = Ty.getSizeInBits();
65   unsigned Pow2Bits = 1 <<  Log2_32_Ceil(Bits);
66   return LLT::scalar(Pow2Bits);
67 }
68 
69 /// \returns true if this is an odd sized vector which should widen by adding an
70 /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
71 /// excludes s1 vectors, which should always be scalarized.
72 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
73   return [=](const LegalityQuery &Query) {
74     const LLT Ty = Query.Types[TypeIdx];
75     if (!Ty.isVector())
76       return false;
77 
78     const LLT EltTy = Ty.getElementType();
79     const unsigned EltSize = EltTy.getSizeInBits();
80     return Ty.getNumElements() % 2 != 0 &&
81            EltSize > 1 && EltSize < 32 &&
82            Ty.getSizeInBits() % 32 != 0;
83   };
84 }
85 
86 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
87   return [=](const LegalityQuery &Query) {
88     const LLT Ty = Query.Types[TypeIdx];
89     return Ty.getSizeInBits() % 32 == 0;
90   };
91 }
92 
93 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
94   return [=](const LegalityQuery &Query) {
95     const LLT Ty = Query.Types[TypeIdx];
96     const LLT EltTy = Ty.getScalarType();
97     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
98   };
99 }
100 
101 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
102   return [=](const LegalityQuery &Query) {
103     const LLT Ty = Query.Types[TypeIdx];
104     const LLT EltTy = Ty.getElementType();
105     return std::pair(TypeIdx,
106                      LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
107   };
108 }
109 
110 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
111   return [=](const LegalityQuery &Query) {
112     const LLT Ty = Query.Types[TypeIdx];
113     const LLT EltTy = Ty.getElementType();
114     unsigned Size = Ty.getSizeInBits();
115     unsigned Pieces = (Size + 63) / 64;
116     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
117     return std::pair(TypeIdx, LLT::scalarOrVector(
118                                   ElementCount::getFixed(NewNumElts), EltTy));
119   };
120 }
121 
122 // Increase the number of vector elements to reach the next multiple of 32-bit
123 // type.
124 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
125   return [=](const LegalityQuery &Query) {
126     const LLT Ty = Query.Types[TypeIdx];
127 
128     const LLT EltTy = Ty.getElementType();
129     const int Size = Ty.getSizeInBits();
130     const int EltSize = EltTy.getSizeInBits();
131     const int NextMul32 = (Size + 31) / 32;
132 
133     assert(EltSize < 32);
134 
135     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
136     return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
137   };
138 }
139 
140 // Increase the number of vector elements to reach the next legal RegClass.
141 static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
142   return [=](const LegalityQuery &Query) {
143     const LLT Ty = Query.Types[TypeIdx];
144     const unsigned NumElts = Ty.getNumElements();
145     const unsigned EltSize = Ty.getElementType().getSizeInBits();
146     const unsigned MaxNumElts = MaxRegisterSize / EltSize;
147 
148     assert(EltSize == 32 || EltSize == 64);
149     assert(Ty.getSizeInBits() < MaxRegisterSize);
150 
151     unsigned NewNumElts;
152     // Find the nearest legal RegClass that is larger than the current type.
153     for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
154       if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize))
155         break;
156     }
157 
158     return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltSize));
159   };
160 }
161 
162 static LLT getBufferRsrcScalarType(const LLT Ty) {
163   if (!Ty.isVector())
164     return LLT::scalar(128);
165   const ElementCount NumElems = Ty.getElementCount();
166   return LLT::vector(NumElems, LLT::scalar(128));
167 }
168 
169 static LLT getBufferRsrcRegisterType(const LLT Ty) {
170   if (!Ty.isVector())
171     return LLT::fixed_vector(4, LLT::scalar(32));
172   const unsigned NumElems = Ty.getElementCount().getFixedValue();
173   return LLT::fixed_vector(NumElems * 4, LLT::scalar(32));
174 }
175 
176 static LLT getBitcastRegisterType(const LLT Ty) {
177   const unsigned Size = Ty.getSizeInBits();
178 
179   if (Size <= 32) {
180     // <2 x s8> -> s16
181     // <4 x s8> -> s32
182     return LLT::scalar(Size);
183   }
184 
185   return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
186 }
187 
188 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
189   return [=](const LegalityQuery &Query) {
190     const LLT Ty = Query.Types[TypeIdx];
191     return std::pair(TypeIdx, getBitcastRegisterType(Ty));
192   };
193 }
194 
195 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
196   return [=](const LegalityQuery &Query) {
197     const LLT Ty = Query.Types[TypeIdx];
198     unsigned Size = Ty.getSizeInBits();
199     assert(Size % 32 == 0);
200     return std::pair(
201         TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
202   };
203 }
204 
205 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
206   return [=](const LegalityQuery &Query) {
207     const LLT QueryTy = Query.Types[TypeIdx];
208     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
209   };
210 }
211 
212 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
213   return [=](const LegalityQuery &Query) {
214     const LLT QueryTy = Query.Types[TypeIdx];
215     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
216   };
217 }
218 
219 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
220   return [=](const LegalityQuery &Query) {
221     const LLT QueryTy = Query.Types[TypeIdx];
222     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
223   };
224 }
225 
226 static bool isRegisterSize(unsigned Size) {
227   return Size % 32 == 0 && Size <= MaxRegisterSize;
228 }
229 
230 static bool isRegisterVectorElementType(LLT EltTy) {
231   const int EltSize = EltTy.getSizeInBits();
232   return EltSize == 16 || EltSize % 32 == 0;
233 }
234 
235 static bool isRegisterVectorType(LLT Ty) {
236   const int EltSize = Ty.getElementType().getSizeInBits();
237   return EltSize == 32 || EltSize == 64 ||
238          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
239          EltSize == 128 || EltSize == 256;
240 }
241 
242 // TODO: replace all uses of isRegisterType with isRegisterClassType
243 static bool isRegisterType(LLT Ty) {
244   if (!isRegisterSize(Ty.getSizeInBits()))
245     return false;
246 
247   if (Ty.isVector())
248     return isRegisterVectorType(Ty);
249 
250   return true;
251 }
252 
253 // Any combination of 32 or 64-bit elements up the maximum register size, and
254 // multiples of v2s16.
255 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
256   return [=](const LegalityQuery &Query) {
257     return isRegisterType(Query.Types[TypeIdx]);
258   };
259 }
260 
261 // RegisterType that doesn't have a corresponding RegClass.
262 // TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
263 // should be removed.
264 static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx) {
265   return [=](const LegalityQuery &Query) {
266     LLT Ty = Query.Types[TypeIdx];
267     return isRegisterType(Ty) &&
268            !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits());
269   };
270 }
271 
272 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
273   return [=](const LegalityQuery &Query) {
274     const LLT QueryTy = Query.Types[TypeIdx];
275     if (!QueryTy.isVector())
276       return false;
277     const LLT EltTy = QueryTy.getElementType();
278     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
279   };
280 }
281 
// Shorthand LLT constants for the scalar types referenced by the legalization
// rules in this file. SN is an N-bit scalar; F32 and F64 are built with
// LLT::float32() and LLT::float64().
static const LLT S1 = LLT::scalar(1);
static const LLT S8 = LLT::scalar(8);
static const LLT S16 = LLT::scalar(16);
static const LLT S32 = LLT::scalar(32);
static const LLT F32 = LLT::float32();
static const LLT S64 = LLT::scalar(64);
static const LLT F64 = LLT::float64();
static const LLT S96 = LLT::scalar(96);
static const LLT S128 = LLT::scalar(128);
static const LLT S160 = LLT::scalar(160);
static const LLT S224 = LLT::scalar(224);
static const LLT S256 = LLT::scalar(256);
static const LLT S512 = LLT::scalar(512);
// Scalar as wide as MaxRegisterSize.
static const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
296 
// Shorthand LLT constants for fixed-width vectors. VNSM is a vector of N
// elements, each an M-bit scalar.

// Vectors of 8-bit and 16-bit elements.
static const LLT V2S8 = LLT::fixed_vector(2, 8);
static const LLT V2S16 = LLT::fixed_vector(2, 16);
static const LLT V4S16 = LLT::fixed_vector(4, 16);
static const LLT V6S16 = LLT::fixed_vector(6, 16);
static const LLT V8S16 = LLT::fixed_vector(8, 16);
static const LLT V10S16 = LLT::fixed_vector(10, 16);
static const LLT V12S16 = LLT::fixed_vector(12, 16);
static const LLT V16S16 = LLT::fixed_vector(16, 16);

// Two-element half-precision vector; V2BF16 is currently just an alias of it.
static const LLT V2F16 = LLT::fixed_vector(2, LLT::float16());
static const LLT V2BF16 = V2F16; // FIXME

// Vectors of 32-bit elements.
static const LLT V2S32 = LLT::fixed_vector(2, 32);
static const LLT V3S32 = LLT::fixed_vector(3, 32);
static const LLT V4S32 = LLT::fixed_vector(4, 32);
static const LLT V5S32 = LLT::fixed_vector(5, 32);
static const LLT V6S32 = LLT::fixed_vector(6, 32);
static const LLT V7S32 = LLT::fixed_vector(7, 32);
static const LLT V8S32 = LLT::fixed_vector(8, 32);
static const LLT V9S32 = LLT::fixed_vector(9, 32);
static const LLT V10S32 = LLT::fixed_vector(10, 32);
static const LLT V11S32 = LLT::fixed_vector(11, 32);
static const LLT V12S32 = LLT::fixed_vector(12, 32);
static const LLT V16S32 = LLT::fixed_vector(16, 32);
static const LLT V32S32 = LLT::fixed_vector(32, 32);

// Vectors of 64-bit elements.
static const LLT V2S64 = LLT::fixed_vector(2, 64);
static const LLT V3S64 = LLT::fixed_vector(3, 64);
static const LLT V4S64 = LLT::fixed_vector(4, 64);
static const LLT V5S64 = LLT::fixed_vector(5, 64);
static const LLT V6S64 = LLT::fixed_vector(6, 64);
static const LLT V7S64 = LLT::fixed_vector(7, 64);
static const LLT V8S64 = LLT::fixed_vector(8, 64);
static const LLT V16S64 = LLT::fixed_vector(16, 64);

// Vectors of 128-bit elements.
static const LLT V2S128 = LLT::fixed_vector(2, 128);
static const LLT V4S128 = LLT::fixed_vector(4, 128);
334 
// Type lists consulted by isRegisterClassType(); some are also passed to
// legalFor() in the rule definitions.

// Scalar types accepted by isRegisterClassType().
static std::initializer_list<LLT> AllScalarTypes = {S32,  S64,  S96,  S128,
                                                    S160, S224, S256, S512};

// NOTE(review): despite its name this list also contains V2S128 and V4S128 in
// addition to the 16-bit element vectors.
static std::initializer_list<LLT> AllS16Vectors{
    V2S16, V4S16, V6S16, V8S16, V10S16, V12S16, V16S16, V2S128, V4S128};

// Vectors of 32-bit elements.
static std::initializer_list<LLT> AllS32Vectors = {
    V2S32, V3S32,  V4S32,  V5S32,  V6S32,  V7S32, V8S32,
    V9S32, V10S32, V11S32, V12S32, V16S32, V32S32};

// Vectors of 64-bit elements.
static std::initializer_list<LLT> AllS64Vectors = {V2S64, V3S64, V4S64, V5S64,
                                                   V6S64, V7S64, V8S64, V16S64};
347 
348 // Checks whether a type is in the list of legal register types.
349 static bool isRegisterClassType(LLT Ty) {
350   if (Ty.isPointerOrPointerVector())
351     Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits()));
352 
353   return is_contained(AllS32Vectors, Ty) || is_contained(AllS64Vectors, Ty) ||
354          is_contained(AllScalarTypes, Ty) || is_contained(AllS16Vectors, Ty);
355 }
356 
357 static LegalityPredicate isRegisterClassType(unsigned TypeIdx) {
358   return [TypeIdx](const LegalityQuery &Query) {
359     return isRegisterClassType(Query.Types[TypeIdx]);
360   };
361 }
362 
363 // If we have a truncating store or an extending load with a data size larger
364 // than 32-bits, we need to reduce to a 32-bit type.
365 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
366   return [=](const LegalityQuery &Query) {
367     const LLT Ty = Query.Types[TypeIdx];
368     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
369            Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
370   };
371 }
372 
373 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
374 // handle some operations by just promoting the register during
375 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
376 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
377                                     bool IsLoad, bool IsAtomic) {
378   switch (AS) {
379   case AMDGPUAS::PRIVATE_ADDRESS:
380     // FIXME: Private element size.
381     return ST.enableFlatScratch() ? 128 : 32;
382   case AMDGPUAS::LOCAL_ADDRESS:
383     return ST.useDS128() ? 128 : 64;
384   case AMDGPUAS::GLOBAL_ADDRESS:
385   case AMDGPUAS::CONSTANT_ADDRESS:
386   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
387   case AMDGPUAS::BUFFER_RESOURCE:
388     // Treat constant and global as identical. SMRD loads are sometimes usable for
389     // global loads (ideally constant address space should be eliminated)
390     // depending on the context. Legality cannot be context dependent, but
391     // RegBankSelect can split the load as necessary depending on the pointer
392     // register bank/uniformity and if the memory is invariant or not written in a
393     // kernel.
394     return IsLoad ? 512 : 128;
395   default:
396     // FIXME: Flat addresses may contextually need to be split to 32-bit parts
397     // if they may alias scratch depending on the subtarget.  This needs to be
398     // moved to custom handling to use addressMayBeAccessedAsPrivate
399     return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
400   }
401 }
402 
403 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
404                                  const LegalityQuery &Query) {
405   const LLT Ty = Query.Types[0];
406 
407   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
408   const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
409 
410   unsigned RegSize = Ty.getSizeInBits();
411   uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
412   uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
413   unsigned AS = Query.Types[1].getAddressSpace();
414 
415   // All of these need to be custom lowered to cast the pointer operand.
416   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
417     return false;
418 
419   // Do not handle extending vector loads.
420   if (Ty.isVector() && MemSize != RegSize)
421     return false;
422 
423   // TODO: We should be able to widen loads if the alignment is high enough, but
424   // we also need to modify the memory access size.
425 #if 0
426   // Accept widening loads based on alignment.
427   if (IsLoad && MemSize < Size)
428     MemSize = std::max(MemSize, Align);
429 #endif
430 
431   // Only 1-byte and 2-byte to 32-bit extloads are valid.
432   if (MemSize != RegSize && RegSize != 32)
433     return false;
434 
435   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
436                                     Query.MMODescrs[0].Ordering !=
437                                         AtomicOrdering::NotAtomic))
438     return false;
439 
440   switch (MemSize) {
441   case 8:
442   case 16:
443   case 32:
444   case 64:
445   case 128:
446     break;
447   case 96:
448     if (!ST.hasDwordx3LoadStores())
449       return false;
450     break;
451   case 256:
452   case 512:
453     // These may contextually need to be broken down.
454     break;
455   default:
456     return false;
457   }
458 
459   assert(RegSize >= MemSize);
460 
461   if (AlignBits < MemSize) {
462     const SITargetLowering *TLI = ST.getTargetLowering();
463     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
464                                                  Align(AlignBits / 8)))
465       return false;
466   }
467 
468   return true;
469 }
470 
471 // The newer buffer intrinsic forms take their resource arguments as
472 // pointers in address space 8, aka s128 values. However, in order to not break
473 // SelectionDAG, the underlying operations have to continue to take v4i32
474 // arguments. Therefore, we convert resource pointers - or vectors of them
475 // to integer values here.
476 static bool hasBufferRsrcWorkaround(const LLT Ty) {
477   if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
478     return true;
479   if (Ty.isVector()) {
480     const LLT ElemTy = Ty.getElementType();
481     return hasBufferRsrcWorkaround(ElemTy);
482   }
483   return false;
484 }
485 
486 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
487 // workaround this. Eventually it should ignore the type for loads and only care
488 // about the size. Return true in cases where we will workaround this for now by
489 // bitcasting.
490 static bool loadStoreBitcastWorkaround(const LLT Ty) {
491   if (EnableNewLegality)
492     return false;
493 
494   const unsigned Size = Ty.getSizeInBits();
495   if (Size <= 64)
496     return false;
497   // Address space 8 pointers get their own workaround.
498   if (hasBufferRsrcWorkaround(Ty))
499     return false;
500   if (!Ty.isVector())
501     return true;
502 
503   if (Ty.isPointerVector())
504     return true;
505 
506   unsigned EltSize = Ty.getScalarSizeInBits();
507   return EltSize != 32 && EltSize != 64;
508 }
509 
510 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
511   const LLT Ty = Query.Types[0];
512   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
513          !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
514 }
515 
516 /// Return true if a load or store of the type should be lowered with a bitcast
517 /// to a different type.
518 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
519                                        const LLT MemTy) {
520   const unsigned MemSizeInBits = MemTy.getSizeInBits();
521   const unsigned Size = Ty.getSizeInBits();
522   if (Size != MemSizeInBits)
523     return Size <= 32 && Ty.isVector();
524 
525   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
526     return true;
527 
528   // Don't try to handle bitcasting vector ext loads for now.
529   return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
530          (Size <= 32 || isRegisterSize(Size)) &&
531          !isRegisterVectorElementType(Ty.getElementType());
532 }
533 
534 /// Return true if we should legalize a load by widening an odd sized memory
535 /// access up to the alignment. Note this case when the memory access itself
536 /// changes, not the size of the result register.
537 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
538                             uint64_t AlignInBits, unsigned AddrSpace,
539                             unsigned Opcode) {
540   unsigned SizeInBits = MemoryTy.getSizeInBits();
541   // We don't want to widen cases that are naturally legal.
542   if (isPowerOf2_32(SizeInBits))
543     return false;
544 
545   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
546   // end up widening these for a scalar load during RegBankSelect, if we don't
547   // have 96-bit scalar loads.
548   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
549     return false;
550 
551   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
552     return false;
553 
554   // A load is known dereferenceable up to the alignment, so it's legal to widen
555   // to it.
556   //
557   // TODO: Could check dereferenceable for less aligned cases.
558   unsigned RoundedSize = NextPowerOf2(SizeInBits);
559   if (AlignInBits < RoundedSize)
560     return false;
561 
562   // Do not widen if it would introduce a slow unaligned load.
563   const SITargetLowering *TLI = ST.getTargetLowering();
564   unsigned Fast = 0;
565   return TLI->allowsMisalignedMemoryAccessesImpl(
566              RoundedSize, AddrSpace, Align(AlignInBits / 8),
567              MachineMemOperand::MOLoad, &Fast) &&
568          Fast;
569 }
570 
571 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
572                             unsigned Opcode) {
573   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
574     return false;
575 
576   return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
577                          Query.MMODescrs[0].AlignInBits,
578                          Query.Types[1].getAddressSpace(), Opcode);
579 }
580 
/// Mutates IR (typically a load instruction) to use a <4 x s32> as the initial
/// type of the operand `idx` and then to transform it to a `p8` via bitcasts
/// and inttoptr. In addition, handle vectors of p8. Returns the new type.
///
/// \param MI  Instruction whose operand is rewritten.
/// \param B   Builder used to emit the conversion instructions.
/// \param MRI Register info used to query types and create new registers.
/// \param Idx Index of the operand of \p MI to rewrite.
/// \returns the operand's type unchanged if it does not need the buffer
/// resource workaround, otherwise the vector-of-s32 type the operand now has.
static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
                                   MachineRegisterInfo &MRI, unsigned Idx) {
  MachineOperand &MO = MI.getOperand(Idx);

  const LLT PointerTy = MRI.getType(MO.getReg());

  // Paranoidly prevent us from doing this multiple times.
  if (!hasBufferRsrcWorkaround(PointerTy))
    return PointerTy;

  const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
  const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
  if (!PointerTy.isVector()) {
    // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
    const unsigned NumParts = PointerTy.getSizeInBits() / 32;
    const LLT S32 = LLT::scalar(32);

    Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
    std::array<Register, 4> VectorElems;
    // Advance the insertion point by one position before emitting the
    // conversion (presumably so it lands after MI - confirm with callers).
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    for (unsigned I = 0; I < NumParts; ++I)
      VectorElems[I] =
          B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
    // The merge is built with MO as its destination before MO is switched
    // over to VectorReg on the next line.
    B.buildMergeValues(MO, VectorElems);
    MO.setReg(VectorReg);
    return VectorTy;
  }
  // Vector of pointers: bitcast the new vector-of-s32 register to ScalarTy and
  // convert that to the original operand with inttoptr.
  Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
  auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
  B.buildIntToPtr(MO, Scalar);
  MO.setReg(BitcastReg);

  return VectorTy;
}
619 
620 /// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
621 /// the form in which the value must be in order to be passed to the low-level
622 /// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
623 /// needed in order to account for the fact that we can't define a register
624 /// class for s128 without breaking SelectionDAG.
625 static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) {
626   MachineRegisterInfo &MRI = *B.getMRI();
627   const LLT PointerTy = MRI.getType(Pointer);
628   const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
629   const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
630 
631   if (!PointerTy.isVector()) {
632     // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
633     SmallVector<Register, 4> PointerParts;
634     const unsigned NumParts = PointerTy.getSizeInBits() / 32;
635     auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
636     for (unsigned I = 0; I < NumParts; ++I)
637       PointerParts.push_back(Unmerged.getReg(I));
638     return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
639   }
640   Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
641   return B.buildBitcast(VectorTy, Scalar).getReg(0);
642 }
643 
644 static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B,
645                                      unsigned Idx) {
646   MachineOperand &MO = MI.getOperand(Idx);
647 
648   const LLT PointerTy = B.getMRI()->getType(MO.getReg());
649   // Paranoidly prevent us from doing this multiple times.
650   if (!hasBufferRsrcWorkaround(PointerTy))
651     return;
652   MO.setReg(castBufferRsrcToV4I32(MO.getReg(), B));
653 }
654 
655 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
656                                          const GCNTargetMachine &TM)
657   :  ST(ST_) {
658   using namespace TargetOpcode;
659 
660   auto GetAddrSpacePtr = [&TM](unsigned AS) {
661     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
662   };
663 
664   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
665   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
666   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
667   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
668   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
669   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
670   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
671   const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
672   const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
673   const LLT BufferStridedPtr =
674       GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);
675 
676   const LLT CodePtr = FlatPtr;
677 
678   const std::initializer_list<LLT> AddrSpaces64 = {
679     GlobalPtr, ConstantPtr, FlatPtr
680   };
681 
682   const std::initializer_list<LLT> AddrSpaces32 = {
683     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
684   };
685 
686   const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
687 
688   const std::initializer_list<LLT> FPTypesBase = {
689     S32, S64
690   };
691 
692   const std::initializer_list<LLT> FPTypes16 = {
693     S32, S64, S16
694   };
695 
696   const std::initializer_list<LLT> FPTypesPK16 = {
697     S32, S64, S16, V2S16
698   };
699 
700   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
701 
702   // s1 for VCC branches, s32 for SCC branches.
703   getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
704 
705   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
706   // elements for v3s16
707   getActionDefinitionsBuilder(G_PHI)
708       .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
709       .legalFor(AllS32Vectors)
710       .legalFor(AllS64Vectors)
711       .legalFor(AddrSpaces64)
712       .legalFor(AddrSpaces32)
713       .legalFor(AddrSpaces128)
714       .legalIf(isPointer(0))
715       .clampScalar(0, S16, S256)
716       .widenScalarToNextPow2(0, 32)
717       .clampMaxNumElements(0, S32, 16)
718       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
719       .scalarize(0);
720 
721   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
722     // Full set of gfx9 features.
723     if (ST.hasScalarAddSub64()) {
724       getActionDefinitionsBuilder({G_ADD, G_SUB})
725           .legalFor({S64, S32, S16, V2S16})
726           .clampMaxNumElementsStrict(0, S16, 2)
727           .scalarize(0)
728           .minScalar(0, S16)
729           .widenScalarToNextMultipleOf(0, 32)
730           .maxScalar(0, S32);
731     } else {
732       getActionDefinitionsBuilder({G_ADD, G_SUB})
733           .legalFor({S32, S16, V2S16})
734           .clampMaxNumElementsStrict(0, S16, 2)
735           .scalarize(0)
736           .minScalar(0, S16)
737           .widenScalarToNextMultipleOf(0, 32)
738           .maxScalar(0, S32);
739     }
740 
741     if (ST.hasScalarSMulU64()) {
742       getActionDefinitionsBuilder(G_MUL)
743           .legalFor({S64, S32, S16, V2S16})
744           .clampMaxNumElementsStrict(0, S16, 2)
745           .scalarize(0)
746           .minScalar(0, S16)
747           .widenScalarToNextMultipleOf(0, 32)
748           .custom();
749     } else {
750       getActionDefinitionsBuilder(G_MUL)
751           .legalFor({S32, S16, V2S16})
752           .clampMaxNumElementsStrict(0, S16, 2)
753           .scalarize(0)
754           .minScalar(0, S16)
755           .widenScalarToNextMultipleOf(0, 32)
756           .custom();
757     }
758     assert(ST.hasMad64_32());
759 
760     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
761       .legalFor({S32, S16, V2S16}) // Clamp modifier
762       .minScalarOrElt(0, S16)
763       .clampMaxNumElementsStrict(0, S16, 2)
764       .scalarize(0)
765       .widenScalarToNextPow2(0, 32)
766       .lower();
767   } else if (ST.has16BitInsts()) {
768     getActionDefinitionsBuilder({G_ADD, G_SUB})
769       .legalFor({S32, S16})
770       .minScalar(0, S16)
771       .widenScalarToNextMultipleOf(0, 32)
772       .maxScalar(0, S32)
773       .scalarize(0);
774 
775     getActionDefinitionsBuilder(G_MUL)
776       .legalFor({S32, S16})
777       .scalarize(0)
778       .minScalar(0, S16)
779       .widenScalarToNextMultipleOf(0, 32)
780       .custom();
781     assert(ST.hasMad64_32());
782 
783     // Technically the saturating operations require clamp bit support, but this
784     // was introduced at the same time as 16-bit operations.
785     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
786       .legalFor({S32, S16}) // Clamp modifier
787       .minScalar(0, S16)
788       .scalarize(0)
789       .widenScalarToNextPow2(0, 16)
790       .lower();
791 
792     // We're just lowering this, but it helps get a better result to try to
793     // coerce to the desired type first.
794     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
795       .minScalar(0, S16)
796       .scalarize(0)
797       .lower();
798   } else {
799     getActionDefinitionsBuilder({G_ADD, G_SUB})
800       .legalFor({S32})
801       .widenScalarToNextMultipleOf(0, 32)
802       .clampScalar(0, S32, S32)
803       .scalarize(0);
804 
805     auto &Mul = getActionDefinitionsBuilder(G_MUL)
806       .legalFor({S32})
807       .scalarize(0)
808       .minScalar(0, S32)
809       .widenScalarToNextMultipleOf(0, 32);
810 
811     if (ST.hasMad64_32())
812       Mul.custom();
813     else
814       Mul.maxScalar(0, S32);
815 
816     if (ST.hasIntClamp()) {
817       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
818         .legalFor({S32}) // Clamp modifier.
819         .scalarize(0)
820         .minScalarOrElt(0, S32)
821         .lower();
822     } else {
823       // Clamp bit support was added in VI, along with 16-bit operations.
824       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
825         .minScalar(0, S32)
826         .scalarize(0)
827         .lower();
828     }
829 
830     // FIXME: DAG expansion gets better results. The widening uses the smaller
831     // range values and goes for the min/max lowering directly.
832     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
833       .minScalar(0, S32)
834       .scalarize(0)
835       .lower();
836   }
837 
838   getActionDefinitionsBuilder(
839       {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
840       .customFor({S32, S64})
841       .clampScalar(0, S32, S64)
842       .widenScalarToNextPow2(0, 32)
843       .scalarize(0);
844 
845   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
846                    .legalFor({S32})
847                    .maxScalar(0, S32);
848 
849   if (ST.hasVOP3PInsts()) {
850     Mulh
851       .clampMaxNumElements(0, S8, 2)
852       .lowerFor({V2S8});
853   }
854 
855   Mulh
856     .scalarize(0)
857     .lower();
858 
859   // Report legal for any types we can handle anywhere. For the cases only legal
860   // on the SALU, RegBankSelect will be able to re-legalize.
861   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
862     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
863     .clampScalar(0, S32, S64)
864     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
865     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
866     .widenScalarToNextPow2(0)
867     .scalarize(0);
868 
869   getActionDefinitionsBuilder(
870       {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
871       .legalFor({{S32, S1}, {S32, S32}})
872       .clampScalar(0, S32, S32)
873       .scalarize(0);
874 
875   getActionDefinitionsBuilder(G_BITCAST)
876       // Don't worry about the size constraint.
877       .legalIf(all(isRegisterClassType(0), isRegisterClassType(1)))
878       .lower();
879 
880   getActionDefinitionsBuilder(G_CONSTANT)
881     .legalFor({S1, S32, S64, S16, GlobalPtr,
882                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
883     .legalIf(isPointer(0))
884     .clampScalar(0, S32, S64)
885     .widenScalarToNextPow2(0);
886 
887   getActionDefinitionsBuilder(G_FCONSTANT)
888     .legalFor({S32, S64, S16})
889     .clampScalar(0, S16, S64);
890 
891   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
892       .legalIf(isRegisterType(0))
893       // s1 and s16 are special cases because they have legal operations on
894       // them, but don't really occupy registers in the normal way.
895       .legalFor({S1, S16})
896       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
897       .clampScalarOrElt(0, S32, MaxScalar)
898       .widenScalarToNextPow2(0, 32)
899       .clampMaxNumElements(0, S32, 16);
900 
901   getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
902 
903   // If the amount is divergent, we have to do a wave reduction to get the
904   // maximum value, so this is expanded during RegBankSelect.
905   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
906     .legalFor({{PrivatePtr, S32}});
907 
908   getActionDefinitionsBuilder(G_STACKSAVE)
909     .customFor({PrivatePtr});
910   getActionDefinitionsBuilder(G_STACKRESTORE)
911     .legalFor({PrivatePtr});
912 
913   getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV}).customFor({S64});
914 
915   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
916     .customIf(typeIsNot(0, PrivatePtr));
917 
918   getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
919 
920   auto &FPOpActions = getActionDefinitionsBuilder(
921     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
922       G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
923     .legalFor({S32, S64});
924   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
925     .customFor({S32, S64});
926   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
927     .customFor({S32, S64});
928 
929   if (ST.has16BitInsts()) {
930     if (ST.hasVOP3PInsts())
931       FPOpActions.legalFor({S16, V2S16});
932     else
933       FPOpActions.legalFor({S16});
934 
935     TrigActions.customFor({S16});
936     FDIVActions.customFor({S16});
937   }
938 
939   if (ST.hasPackedFP32Ops()) {
940     FPOpActions.legalFor({V2S32});
941     FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
942   }
943 
944   auto &MinNumMaxNum = getActionDefinitionsBuilder({
945       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
946 
947   if (ST.hasVOP3PInsts()) {
948     MinNumMaxNum.customFor(FPTypesPK16)
949       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
950       .clampMaxNumElements(0, S16, 2)
951       .clampScalar(0, S16, S64)
952       .scalarize(0);
953   } else if (ST.has16BitInsts()) {
954     MinNumMaxNum.customFor(FPTypes16)
955       .clampScalar(0, S16, S64)
956       .scalarize(0);
957   } else {
958     MinNumMaxNum.customFor(FPTypesBase)
959       .clampScalar(0, S32, S64)
960       .scalarize(0);
961   }
962 
963   if (ST.hasVOP3PInsts())
964     FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
965 
966   FPOpActions
967     .scalarize(0)
968     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
969 
970   TrigActions
971     .scalarize(0)
972     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
973 
974   FDIVActions
975     .scalarize(0)
976     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
977 
978   getActionDefinitionsBuilder({G_FNEG, G_FABS})
979     .legalFor(FPTypesPK16)
980     .clampMaxNumElementsStrict(0, S16, 2)
981     .scalarize(0)
982     .clampScalar(0, S16, S64);
983 
984   if (ST.has16BitInsts()) {
985     getActionDefinitionsBuilder(G_FSQRT)
986       .legalFor({S16})
987       .customFor({S32, S64})
988       .scalarize(0)
989       .unsupported();
990     getActionDefinitionsBuilder(G_FFLOOR)
991       .legalFor({S32, S64, S16})
992       .scalarize(0)
993       .clampScalar(0, S16, S64);
994 
995     getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
996       .legalFor({{S32, S32}, {S64, S32}, {S16, S16}})
997       .scalarize(0)
998       .maxScalarIf(typeIs(0, S16), 1, S16)
999       .clampScalar(1, S32, S32)
1000       .lower();
1001 
1002     getActionDefinitionsBuilder(G_FFREXP)
1003       .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
1004       .scalarize(0)
1005       .lower();
1006   } else {
1007     getActionDefinitionsBuilder(G_FSQRT)
1008       .customFor({S32, S64, S16})
1009       .scalarize(0)
1010       .unsupported();
1011 
1012 
1013     if (ST.hasFractBug()) {
1014       getActionDefinitionsBuilder(G_FFLOOR)
1015         .customFor({S64})
1016         .legalFor({S32, S64})
1017         .scalarize(0)
1018         .clampScalar(0, S32, S64);
1019     } else {
1020       getActionDefinitionsBuilder(G_FFLOOR)
1021         .legalFor({S32, S64})
1022         .scalarize(0)
1023         .clampScalar(0, S32, S64);
1024     }
1025 
1026     getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
1027       .legalFor({{S32, S32}, {S64, S32}})
1028       .scalarize(0)
1029       .clampScalar(0, S32, S64)
1030       .clampScalar(1, S32, S32)
1031       .lower();
1032 
1033     getActionDefinitionsBuilder(G_FFREXP)
1034       .customFor({{S32, S32}, {S64, S32}})
1035       .scalarize(0)
1036       .minScalar(0, S32)
1037       .clampScalar(1, S32, S32)
1038       .lower();
1039   }
1040 
1041   getActionDefinitionsBuilder(G_FPTRUNC)
1042     .legalFor({{S32, S64}, {S16, S32}})
1043     .scalarize(0)
1044     .lower();
1045 
1046   getActionDefinitionsBuilder(G_FPEXT)
1047     .legalFor({{S64, S32}, {S32, S16}})
1048     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
1049     .scalarize(0);
1050 
1051   auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
1052   if (ST.has16BitInsts()) {
1053     FSubActions
1054       // Use actual fsub instruction
1055       .legalFor({S32, S16})
1056       // Must use fadd + fneg
1057       .lowerFor({S64, V2S16});
1058   } else {
1059     FSubActions
1060       // Use actual fsub instruction
1061       .legalFor({S32})
1062       // Must use fadd + fneg
1063       .lowerFor({S64, S16, V2S16});
1064   }
1065 
1066   FSubActions
1067     .scalarize(0)
1068     .clampScalar(0, S32, S64);
1069 
1070   // Whether this is legal depends on the floating point mode for the function.
1071   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
1072   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
1073     FMad.customFor({S32, S16});
1074   else if (ST.hasMadMacF32Insts())
1075     FMad.customFor({S32});
1076   else if (ST.hasMadF16())
1077     FMad.customFor({S16});
1078   FMad.scalarize(0)
1079       .lower();
1080 
1081   auto &FRem = getActionDefinitionsBuilder(G_FREM);
1082   if (ST.has16BitInsts()) {
1083     FRem.customFor({S16, S32, S64});
1084   } else {
1085     FRem.minScalar(0, S32)
1086         .customFor({S32, S64});
1087   }
1088   FRem.scalarize(0);
1089 
1090   // TODO: Do we need to clamp maximum bitwidth?
1091   getActionDefinitionsBuilder(G_TRUNC)
1092     .legalIf(isScalar(0))
1093     .legalFor({{V2S16, V2S32}})
1094     .clampMaxNumElements(0, S16, 2)
1095     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
1096     // situations (like an invalid implicit use), we don't want to infinite loop
1097     // in the legalizer.
1098     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
1099     .alwaysLegal();
1100 
1101   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
1102     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
1103                {S32, S1}, {S64, S1}, {S16, S1}})
1104     .scalarize(0)
1105     .clampScalar(0, S32, S64)
1106     .widenScalarToNextPow2(1, 32);
1107 
1108   // TODO: Split s1->s64 during regbankselect for VALU.
1109   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
1110                     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
1111                     .lowerIf(typeIs(1, S1))
1112                     .customFor({{S32, S64}, {S64, S64}});
1113   if (ST.has16BitInsts())
1114     IToFP.legalFor({{S16, S16}});
1115   IToFP.clampScalar(1, S32, S64)
1116        .minScalar(0, S32)
1117        .scalarize(0)
1118        .widenScalarToNextPow2(1);
1119 
1120   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
1121     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
1122     .customFor({{S64, S32}, {S64, S64}})
1123     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
1124   if (ST.has16BitInsts())
1125     FPToI.legalFor({{S16, S16}});
1126   else
1127     FPToI.minScalar(1, S32);
1128 
1129   FPToI.minScalar(0, S32)
1130        .widenScalarToNextPow2(0, 32)
1131        .scalarize(0)
1132        .lower();
1133 
1134   getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
1135       .customFor({S16, S32})
1136       .scalarize(0)
1137       .lower();
1138 
1139   // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
1140   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1141       .scalarize(0)
1142       .lower();
1143 
1144   if (ST.has16BitInsts()) {
1145     getActionDefinitionsBuilder(
1146         {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1147         .legalFor({S16, S32, S64})
1148         .clampScalar(0, S16, S64)
1149         .scalarize(0);
1150   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
1151     getActionDefinitionsBuilder(
1152         {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1153         .legalFor({S32, S64})
1154         .clampScalar(0, S32, S64)
1155         .scalarize(0);
1156   } else {
1157     getActionDefinitionsBuilder(
1158         {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1159         .legalFor({S32})
1160         .customFor({S64})
1161         .clampScalar(0, S32, S64)
1162         .scalarize(0);
1163   }
1164 
1165   getActionDefinitionsBuilder(G_PTR_ADD)
1166       .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr})
1167       .legalIf(all(isPointer(0), sameSize(0, 1)))
1168       .scalarize(0)
1169       .scalarSameSizeAs(1, 0);
1170 
1171   getActionDefinitionsBuilder(G_PTRMASK)
1172     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
1173     .scalarSameSizeAs(1, 0)
1174     .scalarize(0);
1175 
1176   auto &CmpBuilder =
1177     getActionDefinitionsBuilder(G_ICMP)
1178     // The compare output type differs based on the register bank of the output,
1179     // so make both s1 and s32 legal.
1180     //
1181     // Scalar compares producing output in scc will be promoted to s32, as that
1182     // is the allocatable register type that will be needed for the copy from
1183     // scc. This will be promoted during RegBankSelect, and we assume something
1184     // before that won't try to use s32 result types.
1185     //
1186     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
1187     // bank.
1188     .legalForCartesianProduct(
1189       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1190     .legalForCartesianProduct(
1191       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1192   if (ST.has16BitInsts()) {
1193     CmpBuilder.legalFor({{S1, S16}});
1194   }
1195 
1196   CmpBuilder
1197     .widenScalarToNextPow2(1)
1198     .clampScalar(1, S32, S64)
1199     .scalarize(0)
1200     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
1201 
1202   auto &FCmpBuilder =
1203       getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct(
1204           {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1205 
1206   if (ST.hasSALUFloatInsts())
1207     FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});
1208 
1209   FCmpBuilder
1210     .widenScalarToNextPow2(1)
1211     .clampScalar(1, S32, S64)
1212     .scalarize(0);
1213 
1214   // FIXME: fpow has a selection pattern that should move to custom lowering.
1215   auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
1216   if (ST.has16BitInsts())
1217     ExpOps.customFor({{S32}, {S16}});
1218   else
1219     ExpOps.customFor({S32});
1220   ExpOps.clampScalar(0, MinScalarFPTy, S32)
1221         .scalarize(0);
1222 
1223   getActionDefinitionsBuilder(G_FPOWI)
1224     .clampScalar(0, MinScalarFPTy, S32)
1225     .lower();
1226 
1227   auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2});
1228   Log2Ops.customFor({S32});
1229   if (ST.has16BitInsts())
1230     Log2Ops.legalFor({S16});
1231   else
1232     Log2Ops.customFor({S16});
1233   Log2Ops.scalarize(0)
1234     .lower();
1235 
1236   auto &LogOps =
1237       getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
1238   LogOps.customFor({S32, S16});
1239   LogOps.clampScalar(0, MinScalarFPTy, S32)
1240         .scalarize(0);
1241 
1242   // The 64-bit versions produce 32-bit results, but only on the SALU.
1243   getActionDefinitionsBuilder(G_CTPOP)
1244     .legalFor({{S32, S32}, {S32, S64}})
1245     .clampScalar(0, S32, S32)
1246     .widenScalarToNextPow2(1, 32)
1247     .clampScalar(1, S32, S64)
1248     .scalarize(0)
1249     .widenScalarToNextPow2(0, 32);
1250 
1251   // If no 16 bit instr is available, lower into different instructions.
1252   if (ST.has16BitInsts())
1253     getActionDefinitionsBuilder(G_IS_FPCLASS)
1254         .legalForCartesianProduct({S1}, FPTypes16)
1255         .widenScalarToNextPow2(1)
1256         .scalarize(0)
1257         .lower();
1258   else
1259     getActionDefinitionsBuilder(G_IS_FPCLASS)
1260         .legalForCartesianProduct({S1}, FPTypesBase)
1261         .lowerFor({S1, S16})
1262         .widenScalarToNextPow2(1)
1263         .scalarize(0)
1264         .lower();
1265 
1266   // The hardware instructions return a different result on 0 than the generic
1267   // instructions expect. The hardware produces -1, but these produce the
1268   // bitwidth.
1269   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
1270     .scalarize(0)
1271     .clampScalar(0, S32, S32)
1272     .clampScalar(1, S32, S64)
1273     .widenScalarToNextPow2(0, 32)
1274     .widenScalarToNextPow2(1, 32)
1275     .custom();
1276 
1277   // The 64-bit versions produce 32-bit results, but only on the SALU.
1278   getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF)
1279       .legalFor({{S32, S32}, {S32, S64}})
1280       .customIf(scalarNarrowerThan(1, 32))
1281       .clampScalar(0, S32, S32)
1282       .clampScalar(1, S32, S64)
1283       .scalarize(0)
1284       .widenScalarToNextPow2(0, 32)
1285       .widenScalarToNextPow2(1, 32);
1286 
1287   getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF)
1288       .legalFor({{S32, S32}, {S32, S64}})
1289       .clampScalar(0, S32, S32)
1290       .clampScalar(1, S32, S64)
1291       .scalarize(0)
1292       .widenScalarToNextPow2(0, 32)
1293       .widenScalarToNextPow2(1, 32);
1294 
1295   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1296   // RegBankSelect.
1297   getActionDefinitionsBuilder(G_BITREVERSE)
1298     .legalFor({S32, S64})
1299     .clampScalar(0, S32, S64)
1300     .scalarize(0)
1301     .widenScalarToNextPow2(0);
1302 
1303   if (ST.has16BitInsts()) {
1304     getActionDefinitionsBuilder(G_BSWAP)
1305       .legalFor({S16, S32, V2S16})
1306       .clampMaxNumElementsStrict(0, S16, 2)
1307       // FIXME: Fixing non-power-of-2 before clamp is workaround for
1308       // narrowScalar limitation.
1309       .widenScalarToNextPow2(0)
1310       .clampScalar(0, S16, S32)
1311       .scalarize(0);
1312 
1313     if (ST.hasVOP3PInsts()) {
1314       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1315         .legalFor({S32, S16, V2S16})
1316         .clampMaxNumElements(0, S16, 2)
1317         .minScalar(0, S16)
1318         .widenScalarToNextPow2(0)
1319         .scalarize(0)
1320         .lower();
1321     } else {
1322       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1323         .legalFor({S32, S16})
1324         .widenScalarToNextPow2(0)
1325         .minScalar(0, S16)
1326         .scalarize(0)
1327         .lower();
1328     }
1329   } else {
1330     // TODO: Should have same legality without v_perm_b32
1331     getActionDefinitionsBuilder(G_BSWAP)
1332       .legalFor({S32})
1333       .lowerIf(scalarNarrowerThan(0, 32))
1334       // FIXME: Fixing non-power-of-2 before clamp is workaround for
1335       // narrowScalar limitation.
1336       .widenScalarToNextPow2(0)
1337       .maxScalar(0, S32)
1338       .scalarize(0)
1339       .lower();
1340 
1341     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1342       .legalFor({S32})
1343       .minScalar(0, S32)
1344       .widenScalarToNextPow2(0)
1345       .scalarize(0)
1346       .lower();
1347   }
1348 
1349   getActionDefinitionsBuilder(G_INTTOPTR)
1350       // List the common cases
1351       .legalForCartesianProduct(AddrSpaces64, {S64})
1352       .legalForCartesianProduct(AddrSpaces32, {S32})
1353       .scalarize(0)
1354       // Accept any address space as long as the size matches
1355       .legalIf(sameSize(0, 1))
1356       .widenScalarIf(smallerThan(1, 0),
1357                      [](const LegalityQuery &Query) {
1358                        return std::pair(
1359                            1, LLT::scalar(Query.Types[0].getSizeInBits()));
1360                      })
1361       .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) {
1362         return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1363       });
1364 
1365   getActionDefinitionsBuilder(G_PTRTOINT)
1366       // List the common cases
1367       .legalForCartesianProduct(AddrSpaces64, {S64})
1368       .legalForCartesianProduct(AddrSpaces32, {S32})
1369       .scalarize(0)
1370       // Accept any address space as long as the size matches
1371       .legalIf(sameSize(0, 1))
1372       .widenScalarIf(smallerThan(0, 1),
1373                      [](const LegalityQuery &Query) {
1374                        return std::pair(
1375                            0, LLT::scalar(Query.Types[1].getSizeInBits()));
1376                      })
1377       .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
1378         return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1379       });
1380 
1381   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1382     .scalarize(0)
1383     .custom();
1384   // Predicate: returns true when the memory access described by Query has to be split into smaller accesses.
1385   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1386                                     bool IsLoad) -> bool { // IsLoad is only forwarded to maxSizeForAddrSpace.
1387     const LLT DstTy = Query.Types[0];
1388 
1389     // Split vector extloads, i.e. a vector register type wider than the memory type being accessed.
1390     unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1391 
1392     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1393       return true;
1394     // Split accesses larger than the limit maxSizeForAddrSpace reports for this address space.
1395     const LLT PtrTy = Query.Types[1];
1396     unsigned AS = PtrTy.getAddressSpace();
1397     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
1398                                       Query.MMODescrs[0].Ordering !=
1399                                           AtomicOrdering::NotAtomic)) // Last argument: true for any atomic ordering.
1400       return true;
1401 
1402     // Catch weird sized loads that don't evenly divide into the access sizes
1403     // TODO: May be able to widen depending on alignment etc.
1404     unsigned NumRegs = (MemSize + 31) / 32; // Number of 32-bit pieces needed to cover MemSize, rounded up.
1405     if (NumRegs == 3) { // Three pieces are acceptable only when ST.hasDwordx3LoadStores() is true.
1406       if (!ST.hasDwordx3LoadStores())
1407         return true;
1408     } else {
1409       // If the alignment allows, these should have been widened.
1410       if (!isPowerOf2_32(NumRegs)) // Any other piece count must be a power of two.
1411         return true;
1412     }
1413     // None of the checks above asked for a split.
1414     return false;
1415   };
1416 
1417   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1418   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1419   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1420 
1421   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1422   // LDS
1423   // TODO: Unsupported flat for SI.
1424 
1425   for (unsigned Op : {G_LOAD, G_STORE}) {
1426     const bool IsStore = Op == G_STORE;
1427 
1428     auto &Actions = getActionDefinitionsBuilder(Op);
1429     // Explicitly list some common cases.
1430     // TODO: Does this help compile time at all?
1431     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1432                                       {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1433                                       {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1434                                       {S64, GlobalPtr, S64, GlobalAlign32},
1435                                       {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1436                                       {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1437                                       {S32, GlobalPtr, S8, GlobalAlign8},
1438                                       {S32, GlobalPtr, S16, GlobalAlign16},
1439 
1440                                       {S32, LocalPtr, S32, 32},
1441                                       {S64, LocalPtr, S64, 32},
1442                                       {V2S32, LocalPtr, V2S32, 32},
1443                                       {S32, LocalPtr, S8, 8},
1444                                       {S32, LocalPtr, S16, 16},
1445                                       {V2S16, LocalPtr, S32, 32},
1446 
1447                                       {S32, PrivatePtr, S32, 32},
1448                                       {S32, PrivatePtr, S8, 8},
1449                                       {S32, PrivatePtr, S16, 16},
1450                                       {V2S16, PrivatePtr, S32, 32},
1451 
1452                                       {S32, ConstantPtr, S32, GlobalAlign32},
1453                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1454                                       {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1455                                       {S64, ConstantPtr, S64, GlobalAlign32},
1456                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1457     Actions.legalIf(
1458       [=](const LegalityQuery &Query) -> bool {
1459         return isLoadStoreLegal(ST, Query);
1460       });
1461 
1462     // The custom pointers (fat pointers, buffer resources) don't work with load
1463     // and store at this level. Fat pointers should have been lowered to
1464     // intrinsics before the translation to MIR.
1465     Actions.unsupportedIf(
1466         typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
1467 
1468     // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
1469     // ptrtoint. This is needed to account for the fact that we can't have i128
1470     // as a register class for SelectionDAG reasons.
1471     Actions.customIf([=](const LegalityQuery &Query) -> bool {
1472       return hasBufferRsrcWorkaround(Query.Types[0]);
1473     });
1474 
1475     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1476     // 64-bits.
1477     //
1478     // TODO: Should generalize bitcast action into coerce, which will also cover
1479     // inserting addrspacecasts.
1480     Actions.customIf(typeIs(1, Constant32Ptr));
1481 
1482     // Turn any illegal element vectors into something easier to deal
1483     // with. These will ultimately produce 32-bit scalar shifts to extract the
1484     // parts anyway.
1485     //
1486     // For odd 16-bit element vectors, prefer to split those into pieces with
1487     // 16-bit vector parts.
1488     Actions.bitcastIf(
1489       [=](const LegalityQuery &Query) -> bool {
1490         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1491                                           Query.MMODescrs[0].MemoryTy);
1492       }, bitcastToRegisterType(0));
1493 
1494     if (!IsStore) {
1495       // Widen suitably aligned loads by loading extra bytes. The standard
1496       // legalization actions can't properly express widening memory operands.
1497       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1498         return shouldWidenLoad(ST, Query, G_LOAD);
1499       });
1500     }
1501 
1502     // FIXME: load/store narrowing should be moved to lower action
1503     Actions
1504         .narrowScalarIf(
1505             [=](const LegalityQuery &Query) -> bool {
1506               return !Query.Types[0].isVector() &&
1507                      needToSplitMemOp(Query, Op == G_LOAD);
1508             },
1509             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1510               const LLT DstTy = Query.Types[0];
1511               const LLT PtrTy = Query.Types[1];
1512 
1513               const unsigned DstSize = DstTy.getSizeInBits();
1514               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1515 
1516               // Split extloads.
1517               if (DstSize > MemSize)
1518                 return std::pair(0, LLT::scalar(MemSize));
1519 
1520               unsigned MaxSize = maxSizeForAddrSpace(
1521                   ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1522                   Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1523               if (MemSize > MaxSize)
1524                 return std::pair(0, LLT::scalar(MaxSize));
1525 
1526               uint64_t Align = Query.MMODescrs[0].AlignInBits;
1527               return std::pair(0, LLT::scalar(Align));
1528             })
1529         .fewerElementsIf(
1530             [=](const LegalityQuery &Query) -> bool {
1531               return Query.Types[0].isVector() &&
1532                      needToSplitMemOp(Query, Op == G_LOAD);
1533             },
1534             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1535               const LLT DstTy = Query.Types[0];
1536               const LLT PtrTy = Query.Types[1];
1537 
1538               LLT EltTy = DstTy.getElementType();
1539               unsigned MaxSize = maxSizeForAddrSpace(
1540                   ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1541                   Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1542 
1543               // FIXME: Handle widened to power of 2 results better. This ends
1544               // up scalarizing.
1545               // FIXME: 3 element stores scalarized on SI
1546 
1547               // Split if it's too large for the address space.
1548               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1549               if (MemSize > MaxSize) {
1550                 unsigned NumElts = DstTy.getNumElements();
1551                 unsigned EltSize = EltTy.getSizeInBits();
1552 
1553                 if (MaxSize % EltSize == 0) {
1554                   return std::pair(
1555                       0, LLT::scalarOrVector(
1556                              ElementCount::getFixed(MaxSize / EltSize), EltTy));
1557                 }
1558 
1559                 unsigned NumPieces = MemSize / MaxSize;
1560 
1561                 // FIXME: Refine when odd breakdowns handled
1562                 // The scalars will need to be re-legalized.
1563                 if (NumPieces == 1 || NumPieces >= NumElts ||
1564                     NumElts % NumPieces != 0)
1565                   return std::pair(0, EltTy);
1566 
1567                 return std::pair(0,
1568                                  LLT::fixed_vector(NumElts / NumPieces, EltTy));
1569               }
1570 
1571               // FIXME: We could probably handle weird extending loads better.
1572               if (DstTy.getSizeInBits() > MemSize)
1573                 return std::pair(0, EltTy);
1574 
1575               unsigned EltSize = EltTy.getSizeInBits();
1576               unsigned DstSize = DstTy.getSizeInBits();
1577               if (!isPowerOf2_32(DstSize)) {
1578                 // We're probably decomposing an odd sized store. Try to split
1579                 // to the widest type. TODO: Account for alignment. As-is it
1580                 // should be OK, since the new parts will be further legalized.
1581                 unsigned FloorSize = llvm::bit_floor(DstSize);
1582                 return std::pair(
1583                     0, LLT::scalarOrVector(
1584                            ElementCount::getFixed(FloorSize / EltSize), EltTy));
1585               }
1586 
1587               // May need relegalization for the scalars.
1588               return std::pair(0, EltTy);
1589             })
1590     .minScalar(0, S32)
1591     .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1592     .widenScalarToNextPow2(0)
1593     .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1594     .lower();
1595   }
1596 
1597   // FIXME: Unaligned accesses not lowered.
1598   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1599                        .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1600                                                   {S32, GlobalPtr, S16, 2 * 8},
1601                                                   {S32, LocalPtr, S8, 8},
1602                                                   {S32, LocalPtr, S16, 16},
1603                                                   {S32, PrivatePtr, S8, 8},
1604                                                   {S32, PrivatePtr, S16, 16},
1605                                                   {S32, ConstantPtr, S8, 8},
1606                                                   {S32, ConstantPtr, S16, 2 * 8}})
1607                        .legalIf(
1608                          [=](const LegalityQuery &Query) -> bool {
1609                            return isLoadStoreLegal(ST, Query);
1610                          });
1611 
1612   if (ST.hasFlatAddressSpace()) {
1613     ExtLoads.legalForTypesWithMemDesc(
1614         {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1615   }
1616 
1617   // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1618   // 64-bits.
1619   //
1620   // TODO: Should generalize bitcast action into coerce, which will also cover
1621   // inserting addrspacecasts.
1622   ExtLoads.customIf(typeIs(1, Constant32Ptr));
1623 
1624   ExtLoads.clampScalar(0, S32, S32)
1625           .widenScalarToNextPow2(0)
1626           .lower();
1627 
1628   auto &Atomics = getActionDefinitionsBuilder(
1629     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1630      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1631      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1632      G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1633     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1634                {S64, GlobalPtr}, {S64, LocalPtr},
1635                {S32, RegionPtr}, {S64, RegionPtr}});
1636   if (ST.hasFlatAddressSpace()) {
1637     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1638   }
1639 
1640   // TODO: v2bf16 operations, and fat buffer pointer support.
1641   auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1642   if (ST.hasLDSFPAtomicAddF32()) {
1643     Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1644     if (ST.hasLdsAtomicAddF64())
1645       Atomic.legalFor({{S64, LocalPtr}});
1646     if (ST.hasAtomicDsPkAdd16Insts())
1647       Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}});
1648   }
1649   if (ST.hasAtomicFaddInsts())
1650     Atomic.legalFor({{S32, GlobalPtr}});
1651   if (ST.hasFlatAtomicFaddF32Inst())
1652     Atomic.legalFor({{S32, FlatPtr}});
1653 
1654   if (ST.hasGFX90AInsts()) {
1655     // These are legal with some caveats, and should have undergone expansion in
1656     // the IR in most situations
1657     // TODO: Move atomic expansion into legalizer
1658     Atomic.legalFor({
1659         {S32, GlobalPtr},
1660         {S64, GlobalPtr},
1661         {S64, FlatPtr}
1662       });
1663   }
1664 
1665   if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
1666       ST.hasAtomicBufferGlobalPkAddF16Insts())
1667     Atomic.legalFor({{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
1668   if (ST.hasAtomicGlobalPkAddBF16Inst())
1669     Atomic.legalFor({{V2BF16, GlobalPtr}});
1670   if (ST.hasAtomicFlatPkAdd16Insts())
1671     Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
1672 
1673 
1674   // Most of the legalization work here is done by AtomicExpand. We could
1675   // probably use a simpler legality rule that just assumes anything is OK.
1676   auto &AtomicFMinFMax =
1677     getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
1678     .legalFor({{F32, LocalPtr}, {F64, LocalPtr}});
1679 
1680   if (ST.hasAtomicFMinFMaxF32GlobalInsts())
1681     AtomicFMinFMax.legalFor({{F32, GlobalPtr},{F32, BufferFatPtr}});
1682   if (ST.hasAtomicFMinFMaxF64GlobalInsts())
1683     AtomicFMinFMax.legalFor({{F64, GlobalPtr}, {F64, BufferFatPtr}});
1684   if (ST.hasAtomicFMinFMaxF32FlatInsts())
1685     AtomicFMinFMax.legalFor({F32, FlatPtr});
1686   if (ST.hasAtomicFMinFMaxF64FlatInsts())
1687     AtomicFMinFMax.legalFor({F64, FlatPtr});
1688 
1689   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1690   // demarshalling
1691   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1692     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1693                 {S32, FlatPtr}, {S64, FlatPtr}})
1694     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1695                {S32, RegionPtr}, {S64, RegionPtr}});
1696   // TODO: Pointer types, any 32-bit or 64-bit vector
1697 
1698   // Condition should be s32 for scalar, s1 for vector.
1699   getActionDefinitionsBuilder(G_SELECT)
1700       .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1701                                  LocalPtr, FlatPtr, PrivatePtr,
1702                                  LLT::fixed_vector(2, LocalPtr),
1703                                  LLT::fixed_vector(2, PrivatePtr)},
1704                                 {S1, S32})
1705       .clampScalar(0, S16, S64)
1706       .scalarize(1)
1707       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1708       .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1709       .clampMaxNumElements(0, S32, 2)
1710       .clampMaxNumElements(0, LocalPtr, 2)
1711       .clampMaxNumElements(0, PrivatePtr, 2)
1712       .scalarize(0)
1713       .widenScalarToNextPow2(0)
1714       .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1715 
1716   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1717   // be more flexible with the shift amount type.
1718   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1719     .legalFor({{S32, S32}, {S64, S32}});
1720   if (ST.has16BitInsts()) {
1721     if (ST.hasVOP3PInsts()) {
1722       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1723             .clampMaxNumElements(0, S16, 2);
1724     } else
1725       Shifts.legalFor({{S16, S16}});
1726 
1727     // TODO: Support 16-bit shift amounts for all types
1728     Shifts.widenScalarIf(
1729       [=](const LegalityQuery &Query) {
1730         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1731         // 32-bit amount.
1732         const LLT ValTy = Query.Types[0];
1733         const LLT AmountTy = Query.Types[1];
1734         return ValTy.getSizeInBits() <= 16 &&
1735                AmountTy.getSizeInBits() < 16;
1736       }, changeTo(1, S16));
1737     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1738     Shifts.clampScalar(1, S32, S32);
1739     Shifts.widenScalarToNextPow2(0, 16);
1740     Shifts.clampScalar(0, S16, S64);
1741 
1742     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1743       .minScalar(0, S16)
1744       .scalarize(0)
1745       .lower();
1746   } else {
1747     // Make sure we legalize the shift amount type first, as the general
1748     // expansion for the shifted type will produce much worse code if it hasn't
1749     // been truncated already.
1750     Shifts.clampScalar(1, S32, S32);
1751     Shifts.widenScalarToNextPow2(0, 32);
1752     Shifts.clampScalar(0, S32, S64);
1753 
1754     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1755       .minScalar(0, S32)
1756       .scalarize(0)
1757       .lower();
1758   }
1759   Shifts.scalarize(0);
1760 
1761   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1762     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1763     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1764     unsigned IdxTypeIdx = 2;
1765 
1766     getActionDefinitionsBuilder(Op)
1767       .customIf([=](const LegalityQuery &Query) {
1768           const LLT EltTy = Query.Types[EltTypeIdx];
1769           const LLT VecTy = Query.Types[VecTypeIdx];
1770           const LLT IdxTy = Query.Types[IdxTypeIdx];
1771           const unsigned EltSize = EltTy.getSizeInBits();
1772           const bool isLegalVecType =
1773               !!SIRegisterInfo::getSGPRClassForBitWidth(VecTy.getSizeInBits());
1774           // Address space 8 pointers are 128-bit wide values, but the logic
1775           // below will try to bitcast them to 2N x s64, which will fail.
1776           // Therefore, as an intermediate step, wrap extracts/insertions from a
1777           // ptrtoint-ing the vector and scalar arguments (or inttoptring the
1778           // extraction result) in order to produce a vector operation that can
1779           // be handled by the logic below.
1780           if (EltTy.isPointer() && EltSize > 64)
1781             return true;
1782           return (EltSize == 32 || EltSize == 64) &&
1783                   VecTy.getSizeInBits() % 32 == 0 &&
1784                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1785                   IdxTy.getSizeInBits() == 32 &&
1786                   isLegalVecType;
1787         })
1788       .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1789                  bitcastToVectorElement32(VecTypeIdx))
1790       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1791       .bitcastIf(
1792         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1793         [=](const LegalityQuery &Query) {
1794           // For > 64-bit element types, try to turn this into a 64-bit
1795           // element vector since we may be able to do better indexing
1796           // if this is scalar. If not, fall back to 32.
1797           const LLT EltTy = Query.Types[EltTypeIdx];
1798           const LLT VecTy = Query.Types[VecTypeIdx];
1799           const unsigned DstEltSize = EltTy.getSizeInBits();
1800           const unsigned VecSize = VecTy.getSizeInBits();
1801 
1802           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1803           return std::pair(
1804               VecTypeIdx,
1805               LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1806         })
1807       .clampScalar(EltTypeIdx, S32, S64)
1808       .clampScalar(VecTypeIdx, S32, S64)
1809       .clampScalar(IdxTypeIdx, S32, S32)
1810       .clampMaxNumElements(VecTypeIdx, S32, 32)
1811       // TODO: Clamp elements for 64-bit vectors?
1812       .moreElementsIf(
1813         isIllegalRegisterType(VecTypeIdx),
1814         moreElementsToNextExistingRegClass(VecTypeIdx))
1815       // It should only be necessary with variable indexes.
1816       // As a last resort, lower to the stack
1817       .lower();
1818   }
1819 
1820   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1821     .unsupportedIf([=](const LegalityQuery &Query) {
1822         const LLT &EltTy = Query.Types[1].getElementType();
1823         return Query.Types[0] != EltTy;
1824       });
1825 
1826   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1827     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1828     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1829 
1830     // FIXME: Doesn't handle extract of illegal sizes.
1831     getActionDefinitionsBuilder(Op)
1832       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1833       .lowerIf([=](const LegalityQuery &Query) {
1834           // Sub-vector(or single element) insert and extract.
1835           // TODO: verify immediate offset here since lower only works with
1836           // whole elements.
1837           const LLT BigTy = Query.Types[BigTyIdx];
1838           return BigTy.isVector();
1839         })
1840       // FIXME: Multiples of 16 should not be legal.
1841       .legalIf([=](const LegalityQuery &Query) {
1842           const LLT BigTy = Query.Types[BigTyIdx];
1843           const LLT LitTy = Query.Types[LitTyIdx];
1844           return (BigTy.getSizeInBits() % 32 == 0) &&
1845                  (LitTy.getSizeInBits() % 16 == 0);
1846         })
1847       .widenScalarIf(
1848         [=](const LegalityQuery &Query) {
1849           const LLT BigTy = Query.Types[BigTyIdx];
1850           return (BigTy.getScalarSizeInBits() < 16);
1851         },
1852         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1853       .widenScalarIf(
1854         [=](const LegalityQuery &Query) {
1855           const LLT LitTy = Query.Types[LitTyIdx];
1856           return (LitTy.getScalarSizeInBits() < 16);
1857         },
1858         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1859       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1860       .widenScalarToNextPow2(BigTyIdx, 32);
1861 
1862   }
1863 
1864   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1865     .legalForCartesianProduct(AllS32Vectors, {S32})
1866     .legalForCartesianProduct(AllS64Vectors, {S64})
1867     .clampNumElements(0, V16S32, V32S32)
1868     .clampNumElements(0, V2S64, V16S64)
1869     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16))
1870     .moreElementsIf(
1871       isIllegalRegisterType(0),
1872       moreElementsToNextExistingRegClass(0));
1873 
1874   if (ST.hasScalarPackInsts()) {
1875     BuildVector
1876       // FIXME: Should probably widen s1 vectors straight to s32
1877       .minScalarOrElt(0, S16)
1878       .minScalar(1, S16);
1879 
1880     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1881       .legalFor({V2S16, S32})
1882       .lower();
1883   } else {
1884     BuildVector.customFor({V2S16, S16});
1885     BuildVector.minScalarOrElt(0, S32);
1886 
1887     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1888       .customFor({V2S16, S32})
1889       .lower();
1890   }
1891 
1892   BuildVector.legalIf(isRegisterType(0));
1893 
1894   // FIXME: Clamp maximum size
1895   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1896     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1897     .clampMaxNumElements(0, S32, 32)
1898     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1899     .clampMaxNumElements(0, S16, 64);
1900 
1901   getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1902 
1903   // Merge/Unmerge
1904   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1905     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1906     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1907 
1908     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1909       const LLT Ty = Query.Types[TypeIdx];
1910       if (Ty.isVector()) {
1911         const LLT &EltTy = Ty.getElementType();
1912         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1913           return true;
1914         if (!llvm::has_single_bit<uint32_t>(EltTy.getSizeInBits()))
1915           return true;
1916       }
1917       return false;
1918     };
1919 
1920     auto &Builder = getActionDefinitionsBuilder(Op)
1921       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1922       .lowerFor({{S16, V2S16}})
1923       .lowerIf([=](const LegalityQuery &Query) {
1924           const LLT BigTy = Query.Types[BigTyIdx];
1925           return BigTy.getSizeInBits() == 32;
1926         })
1927       // Try to widen to s16 first for small types.
1928       // TODO: Only do this on targets with legal s16 shifts
1929       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1930       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1931       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1932       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1933                            elementTypeIs(1, S16)),
1934                        changeTo(1, V2S16))
1935       // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1936       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1937       // valid.
1938       .clampScalar(LitTyIdx, S32, S512)
1939       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1940       // Break up vectors with weird elements into scalars
1941       .fewerElementsIf(
1942         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1943         scalarize(0))
1944       .fewerElementsIf(
1945         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1946         scalarize(1))
1947       .clampScalar(BigTyIdx, S32, MaxScalar);
1948 
1949     if (Op == G_MERGE_VALUES) {
1950       Builder.widenScalarIf(
1951         // TODO: Use 16-bit shifts if legal for 8-bit values?
1952         [=](const LegalityQuery &Query) {
1953           const LLT Ty = Query.Types[LitTyIdx];
1954           return Ty.getSizeInBits() < 32;
1955         },
1956         changeTo(LitTyIdx, S32));
1957     }
1958 
1959     Builder.widenScalarIf(
1960       [=](const LegalityQuery &Query) {
1961         const LLT Ty = Query.Types[BigTyIdx];
1962         return Ty.getSizeInBits() % 16 != 0;
1963       },
1964       [=](const LegalityQuery &Query) {
1965         // Pick the next power of 2, or a multiple of 64 over 128.
1966         // Whichever is smaller.
1967         const LLT &Ty = Query.Types[BigTyIdx];
1968         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1969         if (NewSizeInBits >= 256) {
1970           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1971           if (RoundedTo < NewSizeInBits)
1972             NewSizeInBits = RoundedTo;
1973         }
1974         return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1975       })
1976       // Any vectors left are the wrong size. Scalarize them.
1977       .scalarize(0)
1978       .scalarize(1);
1979   }
1980 
1981   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1982   // RegBankSelect.
1983   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1984     .legalFor({{S32}, {S64}});
1985 
1986   if (ST.hasVOP3PInsts()) {
1987     SextInReg.lowerFor({{V2S16}})
1988       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1989       // get more vector shift opportunities, since we'll get those when
1990       // expanded.
1991       .clampMaxNumElementsStrict(0, S16, 2);
1992   } else if (ST.has16BitInsts()) {
1993     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1994   } else {
1995     // Prefer to promote to s32 before lowering if we don't have 16-bit
1996     // shifts. This avoid a lot of intermediate truncate and extend operations.
1997     SextInReg.lowerFor({{S32}, {S64}});
1998   }
1999 
2000   SextInReg
2001     .scalarize(0)
2002     .clampScalar(0, S32, S64)
2003     .lower();
2004 
2005   getActionDefinitionsBuilder({G_ROTR, G_ROTL})
2006     .scalarize(0)
2007     .lower();
2008 
2009   // TODO: Only Try to form v2s16 with legal packed instructions.
2010   getActionDefinitionsBuilder(G_FSHR)
2011     .legalFor({{S32, S32}})
2012     .lowerFor({{V2S16, V2S16}})
2013     .clampMaxNumElementsStrict(0, S16, 2)
2014     .scalarize(0)
2015     .lower();
2016 
2017   if (ST.hasVOP3PInsts()) {
2018     getActionDefinitionsBuilder(G_FSHL)
2019       .lowerFor({{V2S16, V2S16}})
2020       .clampMaxNumElementsStrict(0, S16, 2)
2021       .scalarize(0)
2022       .lower();
2023   } else {
2024     getActionDefinitionsBuilder(G_FSHL)
2025       .scalarize(0)
2026       .lower();
2027   }
2028 
2029   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
2030     .legalFor({S64});
2031 
2032   getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({S64});
2033 
2034   getActionDefinitionsBuilder(G_FENCE)
2035     .alwaysLegal();
2036 
2037   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
2038       .scalarize(0)
2039       .minScalar(0, S32)
2040       .lower();
2041 
2042   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
2043       .legalFor({{S32, S32}, {S64, S32}})
2044       .clampScalar(1, S32, S32)
2045       .clampScalar(0, S32, S64)
2046       .widenScalarToNextPow2(0)
2047       .scalarize(0);
2048 
2049   getActionDefinitionsBuilder(
2050       {// TODO: Verify V_BFI_B32 is generated from expanded bit ops
2051        G_FCOPYSIGN,
2052 
2053        G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
2054        G_READ_REGISTER, G_WRITE_REGISTER,
2055 
2056        G_SADDO, G_SSUBO})
2057       .lower();
2058 
2059   if (ST.hasIEEEMinMax()) {
2060     getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2061         .legalFor(FPTypesPK16)
2062         .clampMaxNumElements(0, S16, 2)
2063         .scalarize(0);
2064   } else {
2065     // TODO: Implement
2066     getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
2067   }
2068 
2069   getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
2070       .lower();
2071 
2072   getActionDefinitionsBuilder({G_TRAP, G_DEBUGTRAP}).custom();
2073 
2074   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
2075         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
2076         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
2077     .unsupported();
2078 
2079   getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal();
2080 
2081   getLegacyLegalizerInfo().computeTables();
2082   verify(*ST.getInstrInfo());
2083 }
2084 
2085 bool AMDGPULegalizerInfo::legalizeCustom(
2086     LegalizerHelper &Helper, MachineInstr &MI,
2087     LostDebugLocObserver &LocObserver) const {
2088   MachineIRBuilder &B = Helper.MIRBuilder;
2089   MachineRegisterInfo &MRI = *B.getMRI();
2090 
2091   switch (MI.getOpcode()) {
2092   case TargetOpcode::G_ADDRSPACE_CAST:
2093     return legalizeAddrSpaceCast(MI, MRI, B);
2094   case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2095     return legalizeFroundeven(MI, MRI, B);
2096   case TargetOpcode::G_FCEIL:
2097     return legalizeFceil(MI, MRI, B);
2098   case TargetOpcode::G_FREM:
2099     return legalizeFrem(MI, MRI, B);
2100   case TargetOpcode::G_INTRINSIC_TRUNC:
2101     return legalizeIntrinsicTrunc(MI, MRI, B);
2102   case TargetOpcode::G_SITOFP:
2103     return legalizeITOFP(MI, MRI, B, true);
2104   case TargetOpcode::G_UITOFP:
2105     return legalizeITOFP(MI, MRI, B, false);
2106   case TargetOpcode::G_FPTOSI:
2107     return legalizeFPTOI(MI, MRI, B, true);
2108   case TargetOpcode::G_FPTOUI:
2109     return legalizeFPTOI(MI, MRI, B, false);
2110   case TargetOpcode::G_FMINNUM:
2111   case TargetOpcode::G_FMAXNUM:
2112   case TargetOpcode::G_FMINNUM_IEEE:
2113   case TargetOpcode::G_FMAXNUM_IEEE:
2114     return legalizeMinNumMaxNum(Helper, MI);
2115   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2116     return legalizeExtractVectorElt(MI, MRI, B);
2117   case TargetOpcode::G_INSERT_VECTOR_ELT:
2118     return legalizeInsertVectorElt(MI, MRI, B);
2119   case TargetOpcode::G_FSIN:
2120   case TargetOpcode::G_FCOS:
2121     return legalizeSinCos(MI, MRI, B);
2122   case TargetOpcode::G_GLOBAL_VALUE:
2123     return legalizeGlobalValue(MI, MRI, B);
2124   case TargetOpcode::G_LOAD:
2125   case TargetOpcode::G_SEXTLOAD:
2126   case TargetOpcode::G_ZEXTLOAD:
2127     return legalizeLoad(Helper, MI);
2128   case TargetOpcode::G_STORE:
2129     return legalizeStore(Helper, MI);
2130   case TargetOpcode::G_FMAD:
2131     return legalizeFMad(MI, MRI, B);
2132   case TargetOpcode::G_FDIV:
2133     return legalizeFDIV(MI, MRI, B);
2134   case TargetOpcode::G_FFREXP:
2135     return legalizeFFREXP(MI, MRI, B);
2136   case TargetOpcode::G_FSQRT:
2137     return legalizeFSQRT(MI, MRI, B);
2138   case TargetOpcode::G_UDIV:
2139   case TargetOpcode::G_UREM:
2140   case TargetOpcode::G_UDIVREM:
2141     return legalizeUnsignedDIV_REM(MI, MRI, B);
2142   case TargetOpcode::G_SDIV:
2143   case TargetOpcode::G_SREM:
2144   case TargetOpcode::G_SDIVREM:
2145     return legalizeSignedDIV_REM(MI, MRI, B);
2146   case TargetOpcode::G_ATOMIC_CMPXCHG:
2147     return legalizeAtomicCmpXChg(MI, MRI, B);
2148   case TargetOpcode::G_FLOG2:
2149     return legalizeFlog2(MI, B);
2150   case TargetOpcode::G_FLOG:
2151   case TargetOpcode::G_FLOG10:
2152     return legalizeFlogCommon(MI, B);
2153   case TargetOpcode::G_FEXP2:
2154     return legalizeFExp2(MI, B);
2155   case TargetOpcode::G_FEXP:
2156   case TargetOpcode::G_FEXP10:
2157     return legalizeFExp(MI, B);
2158   case TargetOpcode::G_FPOW:
2159     return legalizeFPow(MI, B);
2160   case TargetOpcode::G_FFLOOR:
2161     return legalizeFFloor(MI, MRI, B);
2162   case TargetOpcode::G_BUILD_VECTOR:
2163   case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2164     return legalizeBuildVector(MI, MRI, B);
2165   case TargetOpcode::G_MUL:
2166     return legalizeMul(Helper, MI);
2167   case TargetOpcode::G_CTLZ:
2168   case TargetOpcode::G_CTTZ:
2169     return legalizeCTLZ_CTTZ(MI, MRI, B);
2170   case TargetOpcode::G_CTLZ_ZERO_UNDEF:
2171     return legalizeCTLZ_ZERO_UNDEF(MI, MRI, B);
2172   case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
2173     return legalizeFPTruncRound(MI, B);
2174   case TargetOpcode::G_STACKSAVE:
2175     return legalizeStackSave(MI, B);
2176   case TargetOpcode::G_GET_FPENV:
2177     return legalizeGetFPEnv(MI, MRI, B);
2178   case TargetOpcode::G_SET_FPENV:
2179     return legalizeSetFPEnv(MI, MRI, B);
2180   case TargetOpcode::G_TRAP:
2181     return legalizeTrap(MI, MRI, B);
2182   case TargetOpcode::G_DEBUGTRAP:
2183     return legalizeDebugTrap(MI, MRI, B);
2184   default:
2185     return false;
2186   }
2187 
2188   llvm_unreachable("expected switch to return");
2189 }
2190 
2191 Register AMDGPULegalizerInfo::getSegmentAperture(
2192   unsigned AS,
2193   MachineRegisterInfo &MRI,
2194   MachineIRBuilder &B) const {
2195   MachineFunction &MF = B.getMF();
2196   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2197   const LLT S32 = LLT::scalar(32);
2198   const LLT S64 = LLT::scalar(64);
2199 
2200   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
2201 
2202   if (ST.hasApertureRegs()) {
2203     // Note: this register is somewhat broken. When used as a 32-bit operand,
2204     // it only returns zeroes. The real value is in the upper 32 bits.
2205     // Thus, we must emit extract the high 32 bits.
2206     const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
2207                                        ? AMDGPU::SRC_SHARED_BASE
2208                                        : AMDGPU::SRC_PRIVATE_BASE;
2209     // FIXME: It would be more natural to emit a COPY here, but then copy
2210     // coalescing would kick in and it would think it's okay to use the "HI"
2211     // subregister (instead of extracting the HI 32 bits) which is an artificial
2212     // (unusable) register.
2213     //  Register TableGen definitions would need an overhaul to get rid of the
2214     //  artificial "HI" aperture registers and prevent this kind of issue from
2215     //  happening.
2216     Register Dst = MRI.createGenericVirtualRegister(S64);
2217     MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
2218     B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
2219     return B.buildUnmerge(S32, Dst).getReg(1);
2220   }
2221 
2222   // TODO: can we be smarter about machine pointer info?
2223   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
2224   Register LoadAddr = MRI.createGenericVirtualRegister(
2225     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2226   // For code object version 5, private_base and shared_base are passed through
2227   // implicit kernargs.
2228   if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
2229       AMDGPU::AMDHSA_COV5) {
2230     AMDGPUTargetLowering::ImplicitParameter Param =
2231         AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
2232                                       : AMDGPUTargetLowering::PRIVATE_BASE;
2233     uint64_t Offset =
2234         ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
2235 
2236     Register KernargPtrReg = MRI.createGenericVirtualRegister(
2237         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2238 
2239     if (!loadInputValue(KernargPtrReg, B,
2240                         AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
2241       return Register();
2242 
2243     MachineMemOperand *MMO = MF.getMachineMemOperand(
2244         PtrInfo,
2245         MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2246             MachineMemOperand::MOInvariant,
2247         LLT::scalar(32), commonAlignment(Align(64), Offset));
2248 
2249     // Pointer address
2250     B.buildPtrAdd(LoadAddr, KernargPtrReg,
2251                   B.buildConstant(LLT::scalar(64), Offset).getReg(0));
2252     // Load address
2253     return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2254   }
2255 
2256   Register QueuePtr = MRI.createGenericVirtualRegister(
2257     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2258 
2259   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
2260     return Register();
2261 
2262   // Offset into amd_queue_t for group_segment_aperture_base_hi /
2263   // private_segment_aperture_base_hi.
2264   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
2265 
2266   MachineMemOperand *MMO = MF.getMachineMemOperand(
2267       PtrInfo,
2268       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2269           MachineMemOperand::MOInvariant,
2270       LLT::scalar(32), commonAlignment(Align(64), StructOffset));
2271 
2272   B.buildPtrAdd(LoadAddr, QueuePtr,
2273                 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
2274   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2275 }
2276 
2277 /// Return true if the value is a known valid address, such that a null check is
2278 /// not necessary.
2279 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
2280                            const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
2281   MachineInstr *Def = MRI.getVRegDef(Val);
2282   switch (Def->getOpcode()) {
2283   case AMDGPU::G_FRAME_INDEX:
2284   case AMDGPU::G_GLOBAL_VALUE:
2285   case AMDGPU::G_BLOCK_ADDR:
2286     return true;
2287   case AMDGPU::G_CONSTANT: {
2288     const ConstantInt *CI = Def->getOperand(1).getCImm();
2289     return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
2290   }
2291   default:
2292     return false;
2293   }
2294 
2295   return false;
2296 }
2297 
2298 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
2299   MachineInstr &MI, MachineRegisterInfo &MRI,
2300   MachineIRBuilder &B) const {
2301   MachineFunction &MF = B.getMF();
2302 
2303   // MI can either be a G_ADDRSPACE_CAST or a
2304   // G_INTRINSIC @llvm.amdgcn.addrspacecast.nonnull
2305   assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
2306          (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
2307                                      Intrinsic::amdgcn_addrspacecast_nonnull));
2308 
2309   const LLT S32 = LLT::scalar(32);
2310   Register Dst = MI.getOperand(0).getReg();
2311   Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
2312                                      : MI.getOperand(1).getReg();
2313   LLT DstTy = MRI.getType(Dst);
2314   LLT SrcTy = MRI.getType(Src);
2315   unsigned DestAS = DstTy.getAddressSpace();
2316   unsigned SrcAS = SrcTy.getAddressSpace();
2317 
2318   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
2319   // vector element.
2320   assert(!DstTy.isVector());
2321 
2322   const AMDGPUTargetMachine &TM
2323     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
2324 
2325   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
2326     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
2327     return true;
2328   }
2329 
2330   if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
2331       (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
2332        DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2333     // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2334     // G_ADDRSPACE_CAST we need to guess.
2335     if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2336       // Extract low 32-bits of the pointer.
2337       B.buildExtract(Dst, Src, 0);
2338       MI.eraseFromParent();
2339       return true;
2340     }
2341 
2342     unsigned NullVal = TM.getNullPointerValue(DestAS);
2343 
2344     auto SegmentNull = B.buildConstant(DstTy, NullVal);
2345     auto FlatNull = B.buildConstant(SrcTy, 0);
2346 
2347     // Extract low 32-bits of the pointer.
2348     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
2349 
2350     auto CmpRes =
2351         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
2352     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
2353 
2354     MI.eraseFromParent();
2355     return true;
2356   }
2357 
2358   if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
2359       (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2360        SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2361     Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
2362     if (!ApertureReg.isValid())
2363       return false;
2364 
2365     // Coerce the type of the low half of the result so we can use merge_values.
2366     Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
2367 
2368     // TODO: Should we allow mismatched types but matching sizes in merges to
2369     // avoid the ptrtoint?
2370     auto BuildPtr = B.buildMergeLikeInstr(DstTy, {SrcAsInt, ApertureReg});
2371 
2372     // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2373     // G_ADDRSPACE_CAST we need to guess.
2374     if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2375       B.buildCopy(Dst, BuildPtr);
2376       MI.eraseFromParent();
2377       return true;
2378     }
2379 
2380     auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
2381     auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
2382 
2383     auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
2384                               SegmentNull.getReg(0));
2385 
2386     B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2387 
2388     MI.eraseFromParent();
2389     return true;
2390   }
2391 
2392   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2393       SrcTy.getSizeInBits() == 64) {
2394     // Truncate.
2395     B.buildExtract(Dst, Src, 0);
2396     MI.eraseFromParent();
2397     return true;
2398   }
2399 
2400   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2401       DstTy.getSizeInBits() == 64) {
2402     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2403     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2404     auto PtrLo = B.buildPtrToInt(S32, Src);
2405     auto HighAddr = B.buildConstant(S32, AddrHiVal);
2406     B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
2407     MI.eraseFromParent();
2408     return true;
2409   }
2410 
2411   DiagnosticInfoUnsupported InvalidAddrSpaceCast(
2412       MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
2413 
2414   LLVMContext &Ctx = MF.getFunction().getContext();
2415   Ctx.diagnose(InvalidAddrSpaceCast);
2416   B.buildUndef(Dst);
2417   MI.eraseFromParent();
2418   return true;
2419 }
2420 
2421 bool AMDGPULegalizerInfo::legalizeFroundeven(MachineInstr &MI,
2422                                              MachineRegisterInfo &MRI,
2423                                              MachineIRBuilder &B) const {
2424   Register Src = MI.getOperand(1).getReg();
2425   LLT Ty = MRI.getType(Src);
2426   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2427 
2428   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2429   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2430 
2431   auto C1 = B.buildFConstant(Ty, C1Val);
2432   auto CopySign = B.buildFCopysign(Ty, C1, Src);
2433 
2434   // TODO: Should this propagate fast-math-flags?
2435   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2436   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2437 
2438   auto C2 = B.buildFConstant(Ty, C2Val);
2439   auto Fabs = B.buildFAbs(Ty, Src);
2440 
2441   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2442   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2443   MI.eraseFromParent();
2444   return true;
2445 }
2446 
2447 bool AMDGPULegalizerInfo::legalizeFceil(
2448   MachineInstr &MI, MachineRegisterInfo &MRI,
2449   MachineIRBuilder &B) const {
2450 
2451   const LLT S1 = LLT::scalar(1);
2452   const LLT S64 = LLT::scalar(64);
2453 
2454   Register Src = MI.getOperand(1).getReg();
2455   assert(MRI.getType(Src) == S64);
2456 
2457   // result = trunc(src)
2458   // if (src > 0.0 && src != result)
2459   //   result += 1.0
2460 
2461   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2462 
2463   const auto Zero = B.buildFConstant(S64, 0.0);
2464   const auto One = B.buildFConstant(S64, 1.0);
2465   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2466   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2467   auto And = B.buildAnd(S1, Lt0, NeTrunc);
2468   auto Add = B.buildSelect(S64, And, One, Zero);
2469 
2470   // TODO: Should this propagate fast-math-flags?
2471   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2472   MI.eraseFromParent();
2473   return true;
2474 }
2475 
2476 bool AMDGPULegalizerInfo::legalizeFrem(
2477   MachineInstr &MI, MachineRegisterInfo &MRI,
2478   MachineIRBuilder &B) const {
2479     Register DstReg = MI.getOperand(0).getReg();
2480     Register Src0Reg = MI.getOperand(1).getReg();
2481     Register Src1Reg = MI.getOperand(2).getReg();
2482     auto Flags = MI.getFlags();
2483     LLT Ty = MRI.getType(DstReg);
2484 
2485     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2486     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2487     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2488     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2489     MI.eraseFromParent();
2490     return true;
2491 }
2492 
2493 static MachineInstrBuilder extractF64Exponent(Register Hi,
2494                                               MachineIRBuilder &B) {
2495   const unsigned FractBits = 52;
2496   const unsigned ExpBits = 11;
2497   LLT S32 = LLT::scalar(32);
2498 
2499   auto Const0 = B.buildConstant(S32, FractBits - 32);
2500   auto Const1 = B.buildConstant(S32, ExpBits);
2501 
2502   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
2503                      .addUse(Hi)
2504                      .addUse(Const0.getReg(0))
2505                      .addUse(Const1.getReg(0));
2506 
2507   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2508 }
2509 
2510 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2511   MachineInstr &MI, MachineRegisterInfo &MRI,
2512   MachineIRBuilder &B) const {
2513   const LLT S1 = LLT::scalar(1);
2514   const LLT S32 = LLT::scalar(32);
2515   const LLT S64 = LLT::scalar(64);
2516 
2517   Register Src = MI.getOperand(1).getReg();
2518   assert(MRI.getType(Src) == S64);
2519 
2520   // TODO: Should this use extract since the low half is unused?
2521   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2522   Register Hi = Unmerge.getReg(1);
2523 
2524   // Extract the upper half, since this is where we will find the sign and
2525   // exponent.
2526   auto Exp = extractF64Exponent(Hi, B);
2527 
2528   const unsigned FractBits = 52;
2529 
2530   // Extract the sign bit.
2531   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2532   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2533 
2534   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2535 
2536   const auto Zero32 = B.buildConstant(S32, 0);
2537 
2538   // Extend back to 64-bits.
2539   auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});
2540 
2541   auto Shr = B.buildAShr(S64, FractMask, Exp);
2542   auto Not = B.buildNot(S64, Shr);
2543   auto Tmp0 = B.buildAnd(S64, Src, Not);
2544   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2545 
2546   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2547   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2548 
2549   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2550   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2551   MI.eraseFromParent();
2552   return true;
2553 }
2554 
2555 bool AMDGPULegalizerInfo::legalizeITOFP(
2556   MachineInstr &MI, MachineRegisterInfo &MRI,
2557   MachineIRBuilder &B, bool Signed) const {
2558 
2559   Register Dst = MI.getOperand(0).getReg();
2560   Register Src = MI.getOperand(1).getReg();
2561 
2562   const LLT S64 = LLT::scalar(64);
2563   const LLT S32 = LLT::scalar(32);
2564 
2565   assert(MRI.getType(Src) == S64);
2566 
2567   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2568   auto ThirtyTwo = B.buildConstant(S32, 32);
2569 
2570   if (MRI.getType(Dst) == S64) {
2571     auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2572                         : B.buildUITOFP(S64, Unmerge.getReg(1));
2573 
2574     auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2575     auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);
2576 
2577     // TODO: Should this propagate fast-math-flags?
2578     B.buildFAdd(Dst, LdExp, CvtLo);
2579     MI.eraseFromParent();
2580     return true;
2581   }
2582 
2583   assert(MRI.getType(Dst) == S32);
2584 
2585   auto One = B.buildConstant(S32, 1);
2586 
2587   MachineInstrBuilder ShAmt;
2588   if (Signed) {
2589     auto ThirtyOne = B.buildConstant(S32, 31);
2590     auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2591     auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2592     auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2593     auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
2594                   .addUse(Unmerge.getReg(1));
2595     auto LS2 = B.buildSub(S32, LS, One);
2596     ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2597   } else
2598     ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2599   auto Norm = B.buildShl(S64, Src, ShAmt);
2600   auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2601   auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2602   auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2603   auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2604   auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2605   B.buildFLdexp(Dst, FVal, Scale);
2606   MI.eraseFromParent();
2607   return true;
2608 }
2609 
2610 // TODO: Copied from DAG implementation. Verify logic and document how this
2611 // actually works.
2612 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2613                                         MachineRegisterInfo &MRI,
2614                                         MachineIRBuilder &B,
2615                                         bool Signed) const {
2616 
2617   Register Dst = MI.getOperand(0).getReg();
2618   Register Src = MI.getOperand(1).getReg();
2619 
2620   const LLT S64 = LLT::scalar(64);
2621   const LLT S32 = LLT::scalar(32);
2622 
2623   const LLT SrcLT = MRI.getType(Src);
2624   assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2625 
2626   unsigned Flags = MI.getFlags();
2627 
2628   // The basic idea of converting a floating point number into a pair of 32-bit
2629   // integers is illustrated as follows:
2630   //
2631   //     tf := trunc(val);
2632   //    hif := floor(tf * 2^-32);
2633   //    lof := tf - hif * 2^32; // lof is always positive due to floor.
2634   //     hi := fptoi(hif);
2635   //     lo := fptoi(lof);
2636   //
2637   auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2638   MachineInstrBuilder Sign;
2639   if (Signed && SrcLT == S32) {
2640     // However, a 32-bit floating point number has only 23 bits mantissa and
2641     // it's not enough to hold all the significant bits of `lof` if val is
2642     // negative. To avoid the loss of precision, We need to take the absolute
2643     // value after truncating and flip the result back based on the original
2644     // signedness.
2645     Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2646     Trunc = B.buildFAbs(S32, Trunc, Flags);
2647   }
2648   MachineInstrBuilder K0, K1;
2649   if (SrcLT == S64) {
2650     K0 = B.buildFConstant(
2651         S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2652     K1 = B.buildFConstant(
2653         S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2654   } else {
2655     K0 = B.buildFConstant(
2656         S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
2657     K1 = B.buildFConstant(
2658         S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2659   }
2660 
2661   auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2662   auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2663   auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2664 
2665   auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2666                                      : B.buildFPTOUI(S32, FloorMul);
2667   auto Lo = B.buildFPTOUI(S32, Fma);
2668 
2669   if (Signed && SrcLT == S32) {
2670     // Flip the result based on the signedness, which is either all 0s or 1s.
2671     Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
2672     // r := xor({lo, hi}, sign) - sign;
2673     B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
2674                Sign);
2675   } else
2676     B.buildMergeLikeInstr(Dst, {Lo, Hi});
2677   MI.eraseFromParent();
2678 
2679   return true;
2680 }
2681 
2682 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2683                                                MachineInstr &MI) const {
2684   MachineFunction &MF = Helper.MIRBuilder.getMF();
2685   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2686 
2687   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2688                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2689 
2690   // With ieee_mode disabled, the instructions have the correct behavior
2691   // already for G_FMINNUM/G_FMAXNUM
2692   if (!MFI->getMode().IEEE)
2693     return !IsIEEEOp;
2694 
2695   if (IsIEEEOp)
2696     return true;
2697 
2698   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2699 }
2700 
2701 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2702   MachineInstr &MI, MachineRegisterInfo &MRI,
2703   MachineIRBuilder &B) const {
2704   // TODO: Should move some of this into LegalizerHelper.
2705 
2706   // TODO: Promote dynamic indexing of s16 to s32
2707 
2708   Register Dst = MI.getOperand(0).getReg();
2709   Register Vec = MI.getOperand(1).getReg();
2710 
2711   LLT VecTy = MRI.getType(Vec);
2712   LLT EltTy = VecTy.getElementType();
2713   assert(EltTy == MRI.getType(Dst));
2714 
2715   // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2716   // but we can't go directly to that logic becasue you can't bitcast a vector
2717   // of pointers to a vector of integers. Therefore, introduce an intermediate
2718   // vector of integers using ptrtoint (and inttoptr on the output) in order to
2719   // drive the legalization forward.
2720   if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2721     LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2722     LLT IntVecTy = VecTy.changeElementType(IntTy);
2723 
2724     auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
2725     auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
2726     B.buildIntToPtr(Dst, IntElt);
2727 
2728     MI.eraseFromParent();
2729     return true;
2730   }
2731 
2732   // FIXME: Artifact combiner probably should have replaced the truncated
2733   // constant before this, so we shouldn't need
2734   // getIConstantVRegValWithLookThrough.
2735   std::optional<ValueAndVReg> MaybeIdxVal =
2736       getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2737   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2738     return true;
2739   const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2740 
2741   if (IdxVal < VecTy.getNumElements()) {
2742     auto Unmerge = B.buildUnmerge(EltTy, Vec);
2743     B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2744   } else {
2745     B.buildUndef(Dst);
2746   }
2747 
2748   MI.eraseFromParent();
2749   return true;
2750 }
2751 
2752 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2753   MachineInstr &MI, MachineRegisterInfo &MRI,
2754   MachineIRBuilder &B) const {
2755   // TODO: Should move some of this into LegalizerHelper.
2756 
2757   // TODO: Promote dynamic indexing of s16 to s32
2758 
2759   Register Dst = MI.getOperand(0).getReg();
2760   Register Vec = MI.getOperand(1).getReg();
2761   Register Ins = MI.getOperand(2).getReg();
2762 
2763   LLT VecTy = MRI.getType(Vec);
2764   LLT EltTy = VecTy.getElementType();
2765   assert(EltTy == MRI.getType(Ins));
2766 
2767   // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2768   // but we can't go directly to that logic becasue you can't bitcast a vector
2769   // of pointers to a vector of integers. Therefore, make the pointer vector
2770   // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
2771   // new value, and then inttoptr the result vector back. This will then allow
2772   // the rest of legalization to take over.
2773   if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2774     LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2775     LLT IntVecTy = VecTy.changeElementType(IntTy);
2776 
2777     auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
2778     auto IntIns = B.buildPtrToInt(IntTy, Ins);
2779     auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
2780                                                  MI.getOperand(3));
2781     B.buildIntToPtr(Dst, IntVecDest);
2782     MI.eraseFromParent();
2783     return true;
2784   }
2785 
2786   // FIXME: Artifact combiner probably should have replaced the truncated
2787   // constant before this, so we shouldn't need
2788   // getIConstantVRegValWithLookThrough.
2789   std::optional<ValueAndVReg> MaybeIdxVal =
2790       getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2791   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2792     return true;
2793 
2794   const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2795 
2796   unsigned NumElts = VecTy.getNumElements();
2797   if (IdxVal < NumElts) {
2798     SmallVector<Register, 8> SrcRegs;
2799     for (unsigned i = 0; i < NumElts; ++i)
2800       SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2801     B.buildUnmerge(SrcRegs, Vec);
2802 
2803     SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2804     B.buildMergeLikeInstr(Dst, SrcRegs);
2805   } else {
2806     B.buildUndef(Dst);
2807   }
2808 
2809   MI.eraseFromParent();
2810   return true;
2811 }
2812 
2813 bool AMDGPULegalizerInfo::legalizeSinCos(
2814   MachineInstr &MI, MachineRegisterInfo &MRI,
2815   MachineIRBuilder &B) const {
2816 
2817   Register DstReg = MI.getOperand(0).getReg();
2818   Register SrcReg = MI.getOperand(1).getReg();
2819   LLT Ty = MRI.getType(DstReg);
2820   unsigned Flags = MI.getFlags();
2821 
2822   Register TrigVal;
2823   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2824   if (ST.hasTrigReducedRange()) {
2825     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2826     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
2827                   .addUse(MulVal.getReg(0))
2828                   .setMIFlags(Flags)
2829                   .getReg(0);
2830   } else
2831     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2832 
2833   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2834     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2835   B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg))
2836       .addUse(TrigVal)
2837       .setMIFlags(Flags);
2838   MI.eraseFromParent();
2839   return true;
2840 }
2841 
2842 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2843                                                   MachineIRBuilder &B,
2844                                                   const GlobalValue *GV,
2845                                                   int64_t Offset,
2846                                                   unsigned GAFlags) const {
2847   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2848   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2849   // to the following code sequence:
2850   //
2851   // For constant address space:
2852   //   s_getpc_b64 s[0:1]
2853   //   s_add_u32 s0, s0, $symbol
2854   //   s_addc_u32 s1, s1, 0
2855   //
2856   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2857   //   a fixup or relocation is emitted to replace $symbol with a literal
2858   //   constant, which is a pc-relative offset from the encoding of the $symbol
2859   //   operand to the global variable.
2860   //
2861   // For global address space:
2862   //   s_getpc_b64 s[0:1]
2863   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2864   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2865   //
2866   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2867   //   fixups or relocations are emitted to replace $symbol@*@lo and
2868   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2869   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2870   //   operand to the global variable.
2871 
2872   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2873 
2874   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2875     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2876 
2877   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2878     .addDef(PCReg);
2879 
2880   MIB.addGlobalAddress(GV, Offset, GAFlags);
2881   if (GAFlags == SIInstrInfo::MO_NONE)
2882     MIB.addImm(0);
2883   else
2884     MIB.addGlobalAddress(GV, Offset, GAFlags + 1);
2885 
2886   if (!B.getMRI()->getRegClassOrNull(PCReg))
2887     B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2888 
2889   if (PtrTy.getSizeInBits() == 32)
2890     B.buildExtract(DstReg, PCReg, 0);
2891   return true;
2892 }
2893 
2894 // Emit a ABS32_LO / ABS32_HI relocation stub.
2895 void AMDGPULegalizerInfo::buildAbsGlobalAddress(
2896     Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
2897     MachineRegisterInfo &MRI) const {
2898   bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;
2899 
2900   LLT S32 = LLT::scalar(32);
2901 
2902   // Use the destination directly, if and only if we store the lower address
2903   // part only and we don't have a register class being set.
2904   Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
2905                         ? DstReg
2906                         : MRI.createGenericVirtualRegister(S32);
2907 
2908   if (!MRI.getRegClassOrNull(AddrLo))
2909     MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
2910 
2911   // Write the lower half.
2912   B.buildInstr(AMDGPU::S_MOV_B32)
2913       .addDef(AddrLo)
2914       .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
2915 
2916   // If required, write the upper half as well.
2917   if (RequiresHighHalf) {
2918     assert(PtrTy.getSizeInBits() == 64 &&
2919            "Must provide a 64-bit pointer type!");
2920 
2921     Register AddrHi = MRI.createGenericVirtualRegister(S32);
2922     MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
2923 
2924     B.buildInstr(AMDGPU::S_MOV_B32)
2925         .addDef(AddrHi)
2926         .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_HI);
2927 
2928     // Use the destination directly, if and only if we don't have a register
2929     // class being set.
2930     Register AddrDst = !MRI.getRegClassOrNull(DstReg)
2931                            ? DstReg
2932                            : MRI.createGenericVirtualRegister(LLT::scalar(64));
2933 
2934     if (!MRI.getRegClassOrNull(AddrDst))
2935       MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
2936 
2937     B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
2938 
2939     // If we created a new register for the destination, cast the result into
2940     // the final output.
2941     if (AddrDst != DstReg)
2942       B.buildCast(DstReg, AddrDst);
2943   } else if (AddrLo != DstReg) {
2944     // If we created a new register for the destination, cast the result into
2945     // the final output.
2946     B.buildCast(DstReg, AddrLo);
2947   }
2948 }
2949 
2950 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2951   MachineInstr &MI, MachineRegisterInfo &MRI,
2952   MachineIRBuilder &B) const {
2953   Register DstReg = MI.getOperand(0).getReg();
2954   LLT Ty = MRI.getType(DstReg);
2955   unsigned AS = Ty.getAddressSpace();
2956 
2957   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2958   MachineFunction &MF = B.getMF();
2959   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2960 
2961   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2962     if (!MFI->isModuleEntryFunction() &&
2963         GV->getName() != "llvm.amdgcn.module.lds") {
2964       const Function &Fn = MF.getFunction();
2965       DiagnosticInfoUnsupported BadLDSDecl(
2966         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2967         DS_Warning);
2968       Fn.getContext().diagnose(BadLDSDecl);
2969 
2970       // We currently don't have a way to correctly allocate LDS objects that
2971       // aren't directly associated with a kernel. We do force inlining of
2972       // functions that use local objects. However, if these dead functions are
2973       // not eliminated, we don't want a compile time error. Just emit a warning
2974       // and a trap, since there should be no callable path here.
2975       B.buildTrap();
2976       B.buildUndef(DstReg);
2977       MI.eraseFromParent();
2978       return true;
2979     }
2980 
2981     // TODO: We could emit code to handle the initialization somewhere.
2982     // We ignore the initializer for now and legalize it to allow selection.
2983     // The initializer will anyway get errored out during assembly emission.
2984     const SITargetLowering *TLI = ST.getTargetLowering();
2985     if (!TLI->shouldUseLDSConstAddress(GV)) {
2986       MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2987       return true; // Leave in place;
2988     }
2989 
2990     if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2991       Type *Ty = GV->getValueType();
2992       // HIP uses an unsized array `extern __shared__ T s[]` or similar
2993       // zero-sized type in other languages to declare the dynamic shared
2994       // memory which size is not known at the compile time. They will be
2995       // allocated by the runtime and placed directly after the static
2996       // allocated ones. They all share the same offset.
2997       if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2998         // Adjust alignment for that dynamic shared memory array.
2999         MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV));
3000         LLT S32 = LLT::scalar(32);
3001         auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
3002         B.buildIntToPtr(DstReg, Sz);
3003         MI.eraseFromParent();
3004         return true;
3005       }
3006     }
3007 
3008     B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
3009                                                    *cast<GlobalVariable>(GV)));
3010     MI.eraseFromParent();
3011     return true;
3012   }
3013 
3014   if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
3015     buildAbsGlobalAddress(DstReg, Ty, B, GV, MRI);
3016     MI.eraseFromParent();
3017     return true;
3018   }
3019 
3020   const SITargetLowering *TLI = ST.getTargetLowering();
3021 
3022   if (TLI->shouldEmitFixup(GV)) {
3023     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
3024     MI.eraseFromParent();
3025     return true;
3026   }
3027 
3028   if (TLI->shouldEmitPCReloc(GV)) {
3029     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
3030     MI.eraseFromParent();
3031     return true;
3032   }
3033 
3034   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
3035   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
3036 
3037   LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
3038   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
3039       MachinePointerInfo::getGOT(MF),
3040       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
3041           MachineMemOperand::MOInvariant,
3042       LoadTy, Align(8));
3043 
3044   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
3045 
3046   if (Ty.getSizeInBits() == 32) {
3047     // Truncate if this is a 32-bit constant address.
3048     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
3049     B.buildExtract(DstReg, Load, 0);
3050   } else
3051     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
3052 
3053   MI.eraseFromParent();
3054   return true;
3055 }
3056 
3057 static LLT widenToNextPowerOf2(LLT Ty) {
3058   if (Ty.isVector())
3059     return Ty.changeElementCount(
3060         ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
3061   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
3062 }
3063 
3064 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
3065                                        MachineInstr &MI) const {
3066   MachineIRBuilder &B = Helper.MIRBuilder;
3067   MachineRegisterInfo &MRI = *B.getMRI();
3068   GISelChangeObserver &Observer = Helper.Observer;
3069 
3070   Register PtrReg = MI.getOperand(1).getReg();
3071   LLT PtrTy = MRI.getType(PtrReg);
3072   unsigned AddrSpace = PtrTy.getAddressSpace();
3073 
3074   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
3075     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
3076     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
3077     Observer.changingInstr(MI);
3078     MI.getOperand(1).setReg(Cast.getReg(0));
3079     Observer.changedInstr(MI);
3080     return true;
3081   }
3082 
3083   if (MI.getOpcode() != AMDGPU::G_LOAD)
3084     return false;
3085 
3086   Register ValReg = MI.getOperand(0).getReg();
3087   LLT ValTy = MRI.getType(ValReg);
3088 
3089   if (hasBufferRsrcWorkaround(ValTy)) {
3090     Observer.changingInstr(MI);
3091     castBufferRsrcFromV4I32(MI, B, MRI, 0);
3092     Observer.changedInstr(MI);
3093     return true;
3094   }
3095 
3096   MachineMemOperand *MMO = *MI.memoperands_begin();
3097   const unsigned ValSize = ValTy.getSizeInBits();
3098   const LLT MemTy = MMO->getMemoryType();
3099   const Align MemAlign = MMO->getAlign();
3100   const unsigned MemSize = MemTy.getSizeInBits();
3101   const uint64_t AlignInBits = 8 * MemAlign.value();
3102 
3103   // Widen non-power-of-2 loads to the alignment if needed
3104   if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
3105     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
3106 
3107     // This was already the correct extending load result type, so just adjust
3108     // the memory type.
3109     if (WideMemSize == ValSize) {
3110       MachineFunction &MF = B.getMF();
3111 
3112       MachineMemOperand *WideMMO =
3113           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
3114       Observer.changingInstr(MI);
3115       MI.setMemRefs(MF, {WideMMO});
3116       Observer.changedInstr(MI);
3117       return true;
3118     }
3119 
3120     // Don't bother handling edge case that should probably never be produced.
3121     if (ValSize > WideMemSize)
3122       return false;
3123 
3124     LLT WideTy = widenToNextPowerOf2(ValTy);
3125 
3126     Register WideLoad;
3127     if (!WideTy.isVector()) {
3128       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3129       B.buildTrunc(ValReg, WideLoad).getReg(0);
3130     } else {
3131       // Extract the subvector.
3132 
3133       if (isRegisterType(ValTy)) {
3134         // If this a case where G_EXTRACT is legal, use it.
3135         // (e.g. <3 x s32> -> <4 x s32>)
3136         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3137         B.buildExtract(ValReg, WideLoad, 0);
3138       } else {
3139         // For cases where the widened type isn't a nice register value, unmerge
3140         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
3141         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3142         B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
3143       }
3144     }
3145 
3146     MI.eraseFromParent();
3147     return true;
3148   }
3149 
3150   return false;
3151 }
3152 
3153 bool AMDGPULegalizerInfo::legalizeStore(LegalizerHelper &Helper,
3154                                         MachineInstr &MI) const {
3155   MachineIRBuilder &B = Helper.MIRBuilder;
3156   MachineRegisterInfo &MRI = *B.getMRI();
3157   GISelChangeObserver &Observer = Helper.Observer;
3158 
3159   Register DataReg = MI.getOperand(0).getReg();
3160   LLT DataTy = MRI.getType(DataReg);
3161 
3162   if (hasBufferRsrcWorkaround(DataTy)) {
3163     Observer.changingInstr(MI);
3164     castBufferRsrcArgToV4I32(MI, B, 0);
3165     Observer.changedInstr(MI);
3166     return true;
3167   }
3168   return false;
3169 }
3170 
3171 bool AMDGPULegalizerInfo::legalizeFMad(
3172   MachineInstr &MI, MachineRegisterInfo &MRI,
3173   MachineIRBuilder &B) const {
3174   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3175   assert(Ty.isScalar());
3176 
3177   MachineFunction &MF = B.getMF();
3178   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
3179 
3180   // TODO: Always legal with future ftz flag.
3181   // FIXME: Do we need just output?
3182   if (Ty == LLT::float32() &&
3183       MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign())
3184     return true;
3185   if (Ty == LLT::float16() &&
3186       MFI->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign())
3187     return true;
3188 
3189   MachineIRBuilder HelperBuilder(MI);
3190   GISelObserverWrapper DummyObserver;
3191   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
3192   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
3193 }
3194 
3195 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
3196   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3197   Register DstReg = MI.getOperand(0).getReg();
3198   Register PtrReg = MI.getOperand(1).getReg();
3199   Register CmpVal = MI.getOperand(2).getReg();
3200   Register NewVal = MI.getOperand(3).getReg();
3201 
3202   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
3203          "this should not have been custom lowered");
3204 
3205   LLT ValTy = MRI.getType(CmpVal);
3206   LLT VecTy = LLT::fixed_vector(2, ValTy);
3207 
3208   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
3209 
3210   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3211     .addDef(DstReg)
3212     .addUse(PtrReg)
3213     .addUse(PackedVal)
3214     .setMemRefs(MI.memoperands());
3215 
3216   MI.eraseFromParent();
3217   return true;
3218 }
3219 
3220 /// Return true if it's known that \p Src can never be an f32 denormal value.
3221 static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI,
3222                                        Register Src) {
3223   const MachineInstr *DefMI = MRI.getVRegDef(Src);
3224   switch (DefMI->getOpcode()) {
3225   case TargetOpcode::G_INTRINSIC: {
3226     switch (cast<GIntrinsic>(DefMI)->getIntrinsicID()) {
3227     case Intrinsic::amdgcn_frexp_mant:
3228       return true;
3229     default:
3230       break;
3231     }
3232 
3233     break;
3234   }
3235   case TargetOpcode::G_FFREXP: {
3236     if (DefMI->getOperand(0).getReg() == Src)
3237       return true;
3238     break;
3239   }
3240   case TargetOpcode::G_FPEXT: {
3241     return MRI.getType(DefMI->getOperand(1).getReg()) == LLT::scalar(16);
3242   }
3243   default:
3244     return false;
3245   }
3246 
3247   return false;
3248 }
3249 
3250 static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
3251   if (Flags & MachineInstr::FmAfn)
3252     return true;
3253   const auto &Options = MF.getTarget().Options;
3254   return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
3255 }
3256 
3257 static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src,
3258                                    unsigned Flags) {
3259   return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src) &&
3260          MF.getDenormalMode(APFloat::IEEEsingle()).Input !=
3261              DenormalMode::PreserveSign;
3262 }
3263 
3264 std::pair<Register, Register>
3265 AMDGPULegalizerInfo::getScaledLogInput(MachineIRBuilder &B, Register Src,
3266                                        unsigned Flags) const {
3267   if (!needsDenormHandlingF32(B.getMF(), Src, Flags))
3268     return {};
3269 
3270   const LLT F32 = LLT::scalar(32);
3271   auto SmallestNormal = B.buildFConstant(
3272       F32, APFloat::getSmallestNormalized(APFloat::IEEEsingle()));
3273   auto IsLtSmallestNormal =
3274       B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal);
3275 
3276   auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
3277   auto One = B.buildFConstant(F32, 1.0);
3278   auto ScaleFactor =
3279       B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
3280   auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);
3281 
3282   return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
3283 }
3284 
3285 bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI,
3286                                         MachineIRBuilder &B) const {
3287   // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
3288   // If we have to handle denormals, scale up the input and adjust the result.
3289 
3290   // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
3291   // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
3292 
3293   Register Dst = MI.getOperand(0).getReg();
3294   Register Src = MI.getOperand(1).getReg();
3295   LLT Ty = B.getMRI()->getType(Dst);
3296   unsigned Flags = MI.getFlags();
3297 
3298   if (Ty == LLT::scalar(16)) {
3299     const LLT F32 = LLT::scalar(32);
3300     // Nothing in half is a denormal when promoted to f32.
3301     auto Ext = B.buildFPExt(F32, Src, Flags);
3302     auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
3303                     .addUse(Ext.getReg(0))
3304                     .setMIFlags(Flags);
3305     B.buildFPTrunc(Dst, Log2, Flags);
3306     MI.eraseFromParent();
3307     return true;
3308   }
3309 
3310   assert(Ty == LLT::scalar(32));
3311 
3312   auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
3313   if (!ScaledInput) {
3314     B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
3315         .addUse(Src)
3316         .setMIFlags(Flags);
3317     MI.eraseFromParent();
3318     return true;
3319   }
3320 
3321   auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3322                   .addUse(ScaledInput)
3323                   .setMIFlags(Flags);
3324 
3325   auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
3326   auto Zero = B.buildFConstant(Ty, 0.0);
3327   auto ResultOffset =
3328       B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
3329   B.buildFSub(Dst, Log2, ResultOffset, Flags);
3330 
3331   MI.eraseFromParent();
3332   return true;
3333 }
3334 
3335 static Register getMad(MachineIRBuilder &B, LLT Ty, Register X, Register Y,
3336                        Register Z, unsigned Flags) {
3337   auto FMul = B.buildFMul(Ty, X, Y, Flags);
3338   return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);
3339 }
3340 
3341 bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
3342                                              MachineIRBuilder &B) const {
3343   const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
3344   assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);
3345 
3346   MachineRegisterInfo &MRI = *B.getMRI();
3347   Register Dst = MI.getOperand(0).getReg();
3348   Register X = MI.getOperand(1).getReg();
3349   unsigned Flags = MI.getFlags();
3350   const LLT Ty = MRI.getType(X);
3351   MachineFunction &MF = B.getMF();
3352 
3353   const LLT F32 = LLT::scalar(32);
3354   const LLT F16 = LLT::scalar(16);
3355 
3356   const AMDGPUTargetMachine &TM =
3357       static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
3358 
3359   if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn) ||
3360       TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) {
3361     if (Ty == F16 && !ST.has16BitInsts()) {
3362       Register LogVal = MRI.createGenericVirtualRegister(F32);
3363       auto PromoteSrc = B.buildFPExt(F32, X);
3364       legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), IsLog10, Flags);
3365       B.buildFPTrunc(Dst, LogVal);
3366     } else {
3367       legalizeFlogUnsafe(B, Dst, X, IsLog10, Flags);
3368     }
3369 
3370     MI.eraseFromParent();
3371     return true;
3372   }
3373 
3374   auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags);
3375   if (ScaledInput)
3376     X = ScaledInput;
3377 
3378   auto Y =
3379       B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);
3380 
3381   Register R;
3382   if (ST.hasFastFMAF32()) {
3383     // c+cc are ln(2)/ln(10) to more than 49 bits
3384     const float c_log10 = 0x1.344134p-2f;
3385     const float cc_log10 = 0x1.09f79ep-26f;
3386 
3387     // c + cc is ln(2) to more than 49 bits
3388     const float c_log = 0x1.62e42ep-1f;
3389     const float cc_log = 0x1.efa39ep-25f;
3390 
3391     auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
3392     auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
3393 
3394     R = B.buildFMul(Ty, Y, C, Flags).getReg(0);
3395     auto NegR = B.buildFNeg(Ty, R, Flags);
3396     auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags);
3397     auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags);
3398     R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0);
3399   } else {
3400     // ch+ct is ln(2)/ln(10) to more than 36 bits
3401     const float ch_log10 = 0x1.344000p-2f;
3402     const float ct_log10 = 0x1.3509f6p-18f;
3403 
3404     // ch + ct is ln(2) to more than 36 bits
3405     const float ch_log = 0x1.62e000p-1f;
3406     const float ct_log = 0x1.0bfbe8p-15f;
3407 
3408     auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
3409     auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
3410 
3411     auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3412     auto YH = B.buildAnd(Ty, Y, MaskConst);
3413     auto YT = B.buildFSub(Ty, Y, YH, Flags);
3414     auto YTCT = B.buildFMul(Ty, YT, CT, Flags);
3415 
3416     Register Mad0 =
3417         getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags);
3418     Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, Flags);
3419     R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags);
3420   }
3421 
3422   const bool IsFiniteOnly =
3423       (MI.getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) &&
3424       (MI.getFlag(MachineInstr::FmNoInfs) || TM.Options.NoInfsFPMath);
3425 
3426   if (!IsFiniteOnly) {
3427     // Expand isfinite(x) => fabs(x) < inf
3428     auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3429     auto Fabs = B.buildFAbs(Ty, Y);
3430     auto IsFinite =
3431         B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
3432     R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);
3433   }
3434 
3435   if (ScaledInput) {
3436     auto Zero = B.buildFConstant(Ty, 0.0);
3437     auto ShiftK =
3438         B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3439     auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
3440     B.buildFSub(Dst, R, Shift, Flags);
3441   } else {
3442     B.buildCopy(Dst, R);
3443   }
3444 
3445   MI.eraseFromParent();
3446   return true;
3447 }
3448 
3449 bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst,
3450                                              Register Src, bool IsLog10,
3451                                              unsigned Flags) const {
3452   const double Log2BaseInverted =
3453       IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
3454 
3455   LLT Ty = B.getMRI()->getType(Dst);
3456 
3457   if (Ty == LLT::scalar(32)) {
3458     auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
3459     if (ScaledInput) {
3460       auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3461                         .addUse(Src)
3462                         .setMIFlags(Flags);
3463       auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
3464       auto Zero = B.buildFConstant(Ty, 0.0);
3465       auto ResultOffset =
3466           B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
3467       auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);
3468 
3469       if (ST.hasFastFMAF32())
3470         B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
3471       else {
3472         auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
3473         B.buildFAdd(Dst, Mul, ResultOffset, Flags);
3474       }
3475 
3476       return true;
3477     }
3478   }
3479 
3480   auto Log2Operand = Ty == LLT::scalar(16)
3481                          ? B.buildFLog2(Ty, Src, Flags)
3482                          : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3483                                .addUse(Src)
3484                                .setMIFlags(Flags);
3485   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
3486   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
3487   return true;
3488 }
3489 
3490 bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI,
3491                                         MachineIRBuilder &B) const {
3492   // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3493   // If we have to handle denormals, scale up the input and adjust the result.
3494 
3495   Register Dst = MI.getOperand(0).getReg();
3496   Register Src = MI.getOperand(1).getReg();
3497   unsigned Flags = MI.getFlags();
3498   LLT Ty = B.getMRI()->getType(Dst);
3499   const LLT F16 = LLT::scalar(16);
3500   const LLT F32 = LLT::scalar(32);
3501 
3502   if (Ty == F16) {
3503     // Nothing in half is a denormal when promoted to f32.
3504     auto Ext = B.buildFPExt(F32, Src, Flags);
3505     auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
3506                     .addUse(Ext.getReg(0))
3507                     .setMIFlags(Flags);
3508     B.buildFPTrunc(Dst, Log2, Flags);
3509     MI.eraseFromParent();
3510     return true;
3511   }
3512 
3513   assert(Ty == F32);
3514 
3515   if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) {
3516     B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3517         .addUse(Src)
3518         .setMIFlags(Flags);
3519     MI.eraseFromParent();
3520     return true;
3521   }
3522 
3523   // bool needs_scaling = x < -0x1.f80000p+6f;
3524   // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
3525 
3526   // -nextafter(128.0, -1)
3527   auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
3528   auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src,
3529                                   RangeCheckConst, Flags);
3530 
3531   auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
3532   auto Zero = B.buildFConstant(Ty, 0.0);
3533   auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
3534   auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);
3535 
3536   auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3537                   .addUse(AddInput.getReg(0))
3538                   .setMIFlags(Flags);
3539 
3540   auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
3541   auto One = B.buildFConstant(Ty, 1.0);
3542   auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
3543   B.buildFMul(Dst, Exp2, ResultScale, Flags);
3544   MI.eraseFromParent();
3545   return true;
3546 }
3547 
3548 bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
3549                                              Register X, unsigned Flags) const {
3550   LLT Ty = B.getMRI()->getType(Dst);
3551   LLT F32 = LLT::scalar(32);
3552 
3553   if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
3554     auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3555     auto Mul = B.buildFMul(Ty, X, Log2E, Flags);
3556 
3557     if (Ty == F32) {
3558       B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3559         .addUse(Mul.getReg(0))
3560         .setMIFlags(Flags);
3561     } else {
3562       B.buildFExp2(Dst, Mul.getReg(0), Flags);
3563     }
3564 
3565     return true;
3566   }
3567 
3568   auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
3569   auto NeedsScaling =
3570       B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags);
3571   auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
3572   auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
3573   auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);
3574 
3575   auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3576   auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);
3577 
3578   auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3579     .addUse(ExpInput.getReg(0))
3580     .setMIFlags(Flags);
3581 
3582   auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
3583   auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
3584   B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
3585   return true;
3586 }
3587 
3588 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
3589                                        MachineIRBuilder &B) const {
3590   Register Dst = MI.getOperand(0).getReg();
3591   Register X = MI.getOperand(1).getReg();
3592   const unsigned Flags = MI.getFlags();
3593   MachineFunction &MF = B.getMF();
3594   MachineRegisterInfo &MRI = *B.getMRI();
3595   LLT Ty = MRI.getType(Dst);
3596   const LLT F16 = LLT::scalar(16);
3597   const LLT F32 = LLT::scalar(32);
3598   const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
3599 
3600   if (Ty == F16) {
3601     // v_exp_f16 (fmul x, log2e)
3602     if (allowApproxFunc(MF, Flags)) {
3603       // TODO: Does this really require fast?
3604       legalizeFExpUnsafe(B, Dst, X, Flags);
3605       MI.eraseFromParent();
3606       return true;
3607     }
3608 
3609     // exp(f16 x) ->
3610     //   fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3611 
3612     // Nothing in half is a denormal when promoted to f32.
3613     auto Ext = B.buildFPExt(F32, X, Flags);
3614     Register Lowered = MRI.createGenericVirtualRegister(F32);
3615     legalizeFExpUnsafe(B, Lowered, Ext.getReg(0), Flags);
3616     B.buildFPTrunc(Dst, Lowered, Flags);
3617     MI.eraseFromParent();
3618     return true;
3619   }
3620 
3621   assert(Ty == F32);
3622 
3623   // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3624   // library behavior. Also, is known-not-daz source sufficient?
3625   if (allowApproxFunc(MF, Flags)) {
3626     legalizeFExpUnsafe(B, Dst, X, Flags);
3627     MI.eraseFromParent();
3628     return true;
3629   }
3630 
3631   //    Algorithm:
3632   //
3633   //    e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3634   //
3635   //    x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3636   //    n = 64*m + j,   0 <= j < 64
3637   //
3638   //    e^x = 2^((64*m + j + f)/64)
3639   //        = (2^m) * (2^(j/64)) * 2^(f/64)
3640   //        = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3641   //
3642   //    f = x*(64/ln(2)) - n
3643   //    r = f*(ln(2)/64) = x - n*(ln(2)/64)
3644   //
3645   //    e^x = (2^m) * (2^(j/64)) * e^r
3646   //
3647   //    (2^(j/64)) is precomputed
3648   //
3649   //    e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3650   //    e^r = 1 + q
3651   //
3652   //    q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3653   //
3654   //    e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
3655   const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
3656   Register PH, PL;
3657 
3658   if (ST.hasFastFMAF32()) {
3659     const float c_exp = numbers::log2ef;
3660     const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3661     const float c_exp10 = 0x1.a934f0p+1f;
3662     const float cc_exp10 = 0x1.2f346ep-24f;
3663 
3664     auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
3665     PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
3666     auto NegPH = B.buildFNeg(Ty, PH, Flags);
3667     auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);
3668 
3669     auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
3670     PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);
3671   } else {
3672     const float ch_exp = 0x1.714000p+0f;
3673     const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3674 
3675     const float ch_exp10 = 0x1.a92000p+1f;
3676     const float cl_exp10 = 0x1.4f0978p-11f;
3677 
3678     auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3679     auto XH = B.buildAnd(Ty, X, MaskConst);
3680     auto XL = B.buildFSub(Ty, X, XH, Flags);
3681 
3682     auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
3683     PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);
3684 
3685     auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
3686     auto XLCL = B.buildFMul(Ty, XL, CL, Flags);
3687 
3688     Register Mad0 =
3689         getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
3690     PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
3691   }
3692 
3693   auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);
3694 
3695   // It is unsafe to contract this fsub into the PH multiply.
3696   auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
3697   auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);
3698   auto IntE = B.buildFPTOSI(LLT::scalar(32), E);
3699 
3700   auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3701                   .addUse(A.getReg(0))
3702                   .setMIFlags(Flags);
3703   auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);
3704 
3705   auto UnderflowCheckConst =
3706       B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
3707   auto Zero = B.buildFConstant(Ty, 0.0);
3708   auto Underflow =
3709       B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst);
3710 
3711   R = B.buildSelect(Ty, Underflow, Zero, R);
3712 
3713   const auto &Options = MF.getTarget().Options;
3714 
3715   if (!(Flags & MachineInstr::FmNoInfs) && !Options.NoInfsFPMath) {
3716     auto OverflowCheckConst =
3717         B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
3718 
3719     auto Overflow =
3720         B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst);
3721     auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3722     R = B.buildSelect(Ty, Overflow, Inf, R, Flags);
3723   }
3724 
3725   B.buildCopy(Dst, R);
3726   MI.eraseFromParent();
3727   return true;
3728 }
3729 
3730 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
3731                                        MachineIRBuilder &B) const {
3732   Register Dst = MI.getOperand(0).getReg();
3733   Register Src0 = MI.getOperand(1).getReg();
3734   Register Src1 = MI.getOperand(2).getReg();
3735   unsigned Flags = MI.getFlags();
3736   LLT Ty = B.getMRI()->getType(Dst);
3737   const LLT F16 = LLT::float16();
3738   const LLT F32 = LLT::float32();
3739 
3740   if (Ty == F32) {
3741     auto Log = B.buildFLog2(F32, Src0, Flags);
3742     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3743                    .addUse(Log.getReg(0))
3744                    .addUse(Src1)
3745                    .setMIFlags(Flags);
3746     B.buildFExp2(Dst, Mul, Flags);
3747   } else if (Ty == F16) {
3748     // There's no f16 fmul_legacy, so we need to convert for it.
3749     auto Log = B.buildFLog2(F16, Src0, Flags);
3750     auto Ext0 = B.buildFPExt(F32, Log, Flags);
3751     auto Ext1 = B.buildFPExt(F32, Src1, Flags);
3752     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3753                    .addUse(Ext0.getReg(0))
3754                    .addUse(Ext1.getReg(0))
3755                    .setMIFlags(Flags);
3756     B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);
3757   } else
3758     return false;
3759 
3760   MI.eraseFromParent();
3761   return true;
3762 }
3763 
3764 // Find a source register, ignoring any possible source modifiers.
3765 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
3766   Register ModSrc = OrigSrc;
3767   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
3768     ModSrc = SrcFNeg->getOperand(1).getReg();
3769     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3770       ModSrc = SrcFAbs->getOperand(1).getReg();
3771   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3772     ModSrc = SrcFAbs->getOperand(1).getReg();
3773   return ModSrc;
3774 }
3775 
3776 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
3777                                          MachineRegisterInfo &MRI,
3778                                          MachineIRBuilder &B) const {
3779 
3780   const LLT S1 = LLT::scalar(1);
3781   const LLT F64 = LLT::float64();
3782   Register Dst = MI.getOperand(0).getReg();
3783   Register OrigSrc = MI.getOperand(1).getReg();
3784   unsigned Flags = MI.getFlags();
3785   assert(ST.hasFractBug() && MRI.getType(Dst) == F64 &&
3786          "this should not have been custom lowered");
3787 
3788   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
3789   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
3790   // efficient way to implement it is using V_FRACT_F64. The workaround for the
3791   // V_FRACT bug is:
3792   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
3793   //
3794   // Convert floor(x) to (x - fract(x))
3795 
3796   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})
3797                    .addUse(OrigSrc)
3798                    .setMIFlags(Flags);
3799 
3800   // Give source modifier matching some assistance before obscuring a foldable
3801   // pattern.
3802 
3803   // TODO: We can avoid the neg on the fract? The input sign to fract
3804   // shouldn't matter?
3805   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
3806 
3807   auto Const =
3808       B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff));
3809 
3810   Register Min = MRI.createGenericVirtualRegister(F64);
3811 
3812   // We don't need to concern ourselves with the snan handling difference, so
3813   // use the one which will directly select.
3814   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3815   if (MFI->getMode().IEEE)
3816     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
3817   else
3818     B.buildFMinNum(Min, Fract, Const, Flags);
3819 
3820   Register CorrectedFract = Min;
3821   if (!MI.getFlag(MachineInstr::FmNoNans)) {
3822     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
3823     CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);
3824   }
3825 
3826   auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
3827   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
3828 
3829   MI.eraseFromParent();
3830   return true;
3831 }
3832 
3833 // Turn an illegal packed v2s16 build vector into bit operations.
3834 // TODO: This should probably be a bitcast action in LegalizerHelper.
3835 bool AMDGPULegalizerInfo::legalizeBuildVector(
3836   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3837   Register Dst = MI.getOperand(0).getReg();
3838   const LLT S32 = LLT::scalar(32);
3839   const LLT S16 = LLT::scalar(16);
3840   assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
3841 
3842   Register Src0 = MI.getOperand(1).getReg();
3843   Register Src1 = MI.getOperand(2).getReg();
3844 
3845   if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
3846     assert(MRI.getType(Src0) == S32);
3847     Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
3848     Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
3849   }
3850 
3851   auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
3852   B.buildBitcast(Dst, Merge);
3853 
3854   MI.eraseFromParent();
3855   return true;
3856 }
3857 
3858 // Build a big integer multiply or multiply-add using MAD_64_32 instructions.
3859 //
3860 // Source and accumulation registers must all be 32-bits.
3861 //
3862 // TODO: When the multiply is uniform, we should produce a code sequence
3863 // that is better suited to instruction selection on the SALU. Instead of
3864 // the outer loop going over parts of the result, the outer loop should go
3865 // over parts of one of the factors. This should result in instruction
3866 // selection that makes full use of S_ADDC_U32 instructions.
3867 void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
3868                                         MutableArrayRef<Register> Accum,
3869                                         ArrayRef<Register> Src0,
3870                                         ArrayRef<Register> Src1,
3871                                         bool UsePartialMad64_32,
3872                                         bool SeparateOddAlignedProducts) const {
3873   // Use (possibly empty) vectors of S1 registers to represent the set of
3874   // carries from one pair of positions to the next.
3875   using Carry = SmallVector<Register, 2>;
3876 
3877   MachineIRBuilder &B = Helper.MIRBuilder;
3878   GISelKnownBits &KB = *Helper.getKnownBits();
3879 
3880   const LLT S1 = LLT::scalar(1);
3881   const LLT S32 = LLT::scalar(32);
3882   const LLT S64 = LLT::scalar(64);
3883 
3884   Register Zero32;
3885   Register Zero64;
3886 
3887   auto getZero32 = [&]() -> Register {
3888     if (!Zero32)
3889       Zero32 = B.buildConstant(S32, 0).getReg(0);
3890     return Zero32;
3891   };
3892   auto getZero64 = [&]() -> Register {
3893     if (!Zero64)
3894       Zero64 = B.buildConstant(S64, 0).getReg(0);
3895     return Zero64;
3896   };
3897 
3898   SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
3899   for (unsigned i = 0; i < Src0.size(); ++i) {
3900     Src0KnownZeros.push_back(KB.getKnownBits(Src0[i]).isZero());
3901     Src1KnownZeros.push_back(KB.getKnownBits(Src1[i]).isZero());
3902   }
3903 
3904   // Merge the given carries into the 32-bit LocalAccum, which is modified
3905   // in-place.
3906   //
3907   // Returns the carry-out, which is a single S1 register or null.
3908   auto mergeCarry =
3909       [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
3910         if (CarryIn.empty())
3911           return Register();
3912 
3913         bool HaveCarryOut = true;
3914         Register CarryAccum;
3915         if (CarryIn.size() == 1) {
3916           if (!LocalAccum) {
3917             LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
3918             return Register();
3919           }
3920 
3921           CarryAccum = getZero32();
3922         } else {
3923           CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
3924           for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
3925             CarryAccum =
3926                 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
3927                     .getReg(0);
3928           }
3929 
3930           if (!LocalAccum) {
3931             LocalAccum = getZero32();
3932             HaveCarryOut = false;
3933           }
3934         }
3935 
3936         auto Add =
3937             B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
3938         LocalAccum = Add.getReg(0);
3939         return HaveCarryOut ? Add.getReg(1) : Register();
3940       };
3941 
3942   // Build a multiply-add chain to compute
3943   //
3944   //   LocalAccum + (partial products at DstIndex)
3945   //       + (opportunistic subset of CarryIn)
3946   //
3947   // LocalAccum is an array of one or two 32-bit registers that are updated
3948   // in-place. The incoming registers may be null.
3949   //
3950   // In some edge cases, carry-ins can be consumed "for free". In that case,
3951   // the consumed carry bits are removed from CarryIn in-place.
3952   auto buildMadChain =
3953       [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
3954           -> Carry {
3955         assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
3956                (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
3957 
3958         Carry CarryOut;
3959         unsigned j0 = 0;
3960 
3961         // Use plain 32-bit multiplication for the most significant part of the
3962         // result by default.
3963         if (LocalAccum.size() == 1 &&
3964             (!UsePartialMad64_32 || !CarryIn.empty())) {
3965           do {
3966             // Skip multiplication if one of the operands is 0
3967             unsigned j1 = DstIndex - j0;
3968             if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
3969               ++j0;
3970               continue;
3971             }
3972             auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
3973             if (!LocalAccum[0] || KB.getKnownBits(LocalAccum[0]).isZero()) {
3974               LocalAccum[0] = Mul.getReg(0);
3975             } else {
3976               if (CarryIn.empty()) {
3977                 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
3978               } else {
3979                 LocalAccum[0] =
3980                     B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
3981                         .getReg(0);
3982                 CarryIn.pop_back();
3983               }
3984             }
3985             ++j0;
3986           } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
3987         }
3988 
3989         // Build full 64-bit multiplies.
3990         if (j0 <= DstIndex) {
3991           bool HaveSmallAccum = false;
3992           Register Tmp;
3993 
3994           if (LocalAccum[0]) {
3995             if (LocalAccum.size() == 1) {
3996               Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
3997               HaveSmallAccum = true;
3998             } else if (LocalAccum[1]) {
3999               Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
4000               HaveSmallAccum = false;
4001             } else {
4002               Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
4003               HaveSmallAccum = true;
4004             }
4005           } else {
4006             assert(LocalAccum.size() == 1 || !LocalAccum[1]);
4007             Tmp = getZero64();
4008             HaveSmallAccum = true;
4009           }
4010 
4011           do {
4012             unsigned j1 = DstIndex - j0;
4013             if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4014               ++j0;
4015               continue;
4016             }
4017             auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
4018                                     {Src0[j0], Src1[j1], Tmp});
4019             Tmp = Mad.getReg(0);
4020             if (!HaveSmallAccum)
4021               CarryOut.push_back(Mad.getReg(1));
4022             HaveSmallAccum = false;
4023 
4024             ++j0;
4025           } while (j0 <= DstIndex);
4026 
4027           auto Unmerge = B.buildUnmerge(S32, Tmp);
4028           LocalAccum[0] = Unmerge.getReg(0);
4029           if (LocalAccum.size() > 1)
4030             LocalAccum[1] = Unmerge.getReg(1);
4031         }
4032 
4033         return CarryOut;
4034       };
4035 
4036   // Outer multiply loop, iterating over destination parts from least
4037   // significant to most significant parts.
4038   //
4039   // The columns of the following diagram correspond to the destination parts
4040   // affected by one iteration of the outer loop (ignoring boundary
4041   // conditions).
4042   //
4043   //   Dest index relative to 2 * i:      1 0 -1
4044   //                                      ------
4045   //   Carries from previous iteration:     e o
4046   //   Even-aligned partial product sum:  E E .
4047   //   Odd-aligned partial product sum:     O O
4048   //
4049   // 'o' is OddCarry, 'e' is EvenCarry.
4050   // EE and OO are computed from partial products via buildMadChain and use
4051   // accumulation where possible and appropriate.
4052   //
4053   Register SeparateOddCarry;
4054   Carry EvenCarry;
4055   Carry OddCarry;
4056 
4057   for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
4058     Carry OddCarryIn = std::move(OddCarry);
4059     Carry EvenCarryIn = std::move(EvenCarry);
4060     OddCarry.clear();
4061     EvenCarry.clear();
4062 
4063     // Partial products at offset 2 * i.
4064     if (2 * i < Accum.size()) {
4065       auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
4066       EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
4067     }
4068 
4069     // Partial products at offset 2 * i - 1.
4070     if (i > 0) {
4071       if (!SeparateOddAlignedProducts) {
4072         auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
4073         OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4074       } else {
4075         bool IsHighest = 2 * i >= Accum.size();
4076         Register SeparateOddOut[2];
4077         auto LocalAccum = MutableArrayRef(SeparateOddOut)
4078                               .take_front(IsHighest ? 1 : 2);
4079         OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4080 
4081         MachineInstr *Lo;
4082 
4083         if (i == 1) {
4084           if (!IsHighest)
4085             Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
4086           else
4087             Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
4088         } else {
4089           Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
4090                             SeparateOddCarry);
4091         }
4092         Accum[2 * i - 1] = Lo->getOperand(0).getReg();
4093 
4094         if (!IsHighest) {
4095           auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
4096                                 Lo->getOperand(1).getReg());
4097           Accum[2 * i] = Hi.getReg(0);
4098           SeparateOddCarry = Hi.getReg(1);
4099         }
4100       }
4101     }
4102 
4103     // Add in the carries from the previous iteration
4104     if (i > 0) {
4105       if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
4106         EvenCarryIn.push_back(CarryOut);
4107 
4108       if (2 * i < Accum.size()) {
4109         if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
4110           OddCarry.push_back(CarryOut);
4111       }
4112     }
4113   }
4114 }
4115 
4116 // Custom narrowing of wide multiplies using wide multiply-add instructions.
4117 //
4118 // TODO: If the multiply is followed by an addition, we should attempt to
4119 // integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
4120 bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
4121                                       MachineInstr &MI) const {
4122   assert(ST.hasMad64_32());
4123   assert(MI.getOpcode() == TargetOpcode::G_MUL);
4124 
4125   MachineIRBuilder &B = Helper.MIRBuilder;
4126   MachineRegisterInfo &MRI = *B.getMRI();
4127 
4128   Register DstReg = MI.getOperand(0).getReg();
4129   Register Src0 = MI.getOperand(1).getReg();
4130   Register Src1 = MI.getOperand(2).getReg();
4131 
4132   LLT Ty = MRI.getType(DstReg);
4133   assert(Ty.isScalar());
4134 
4135   unsigned Size = Ty.getSizeInBits();
4136   unsigned NumParts = Size / 32;
4137   assert((Size % 32) == 0);
4138   assert(NumParts >= 2);
4139 
4140   // Whether to use MAD_64_32 for partial products whose high half is
4141   // discarded. This avoids some ADD instructions but risks false dependency
4142   // stalls on some subtargets in some cases.
4143   const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
4144 
4145   // Whether to compute odd-aligned partial products separately. This is
4146   // advisable on subtargets where the accumulator of MAD_64_32 must be placed
4147   // in an even-aligned VGPR.
4148   const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
4149 
4150   LLT S32 = LLT::scalar(32);
4151   SmallVector<Register, 2> Src0Parts, Src1Parts;
4152   for (unsigned i = 0; i < NumParts; ++i) {
4153     Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
4154     Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
4155   }
4156   B.buildUnmerge(Src0Parts, Src0);
4157   B.buildUnmerge(Src1Parts, Src1);
4158 
4159   SmallVector<Register, 2> AccumRegs(NumParts);
4160   buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
4161                 SeparateOddAlignedProducts);
4162 
4163   B.buildMergeLikeInstr(DstReg, AccumRegs);
4164   MI.eraseFromParent();
4165   return true;
4166 }
4167 
4168 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
4169 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
4170 // case with a single min instruction instead of a compare+select.
4171 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
4172                                             MachineRegisterInfo &MRI,
4173                                             MachineIRBuilder &B) const {
4174   Register Dst = MI.getOperand(0).getReg();
4175   Register Src = MI.getOperand(1).getReg();
4176   LLT DstTy = MRI.getType(Dst);
4177   LLT SrcTy = MRI.getType(Src);
4178 
4179   unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
4180                         ? AMDGPU::G_AMDGPU_FFBH_U32
4181                         : AMDGPU::G_AMDGPU_FFBL_B32;
4182   auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
4183   B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
4184 
4185   MI.eraseFromParent();
4186   return true;
4187 }
4188 
4189 bool AMDGPULegalizerInfo::legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI,
4190                                                   MachineRegisterInfo &MRI,
4191                                                   MachineIRBuilder &B) const {
4192   Register Dst = MI.getOperand(0).getReg();
4193   Register Src = MI.getOperand(1).getReg();
4194   LLT SrcTy = MRI.getType(Src);
4195   TypeSize NumBits = SrcTy.getSizeInBits();
4196 
4197   assert(NumBits < 32u);
4198 
4199   auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
4200   auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
4201   auto Shift = B.buildShl(S32, Extend, ShiftAmt);
4202   auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
4203   B.buildTrunc(Dst, Ctlz);
4204   MI.eraseFromParent();
4205   return true;
4206 }
4207 
4208 // Check that this is a G_XOR x, -1
4209 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
4210   if (MI.getOpcode() != TargetOpcode::G_XOR)
4211     return false;
4212   auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
4213   return ConstVal && *ConstVal == -1;
4214 }
4215 
4216 // Return the use branch instruction, otherwise null if the usage is invalid.
4217 static MachineInstr *
4218 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
4219                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
4220   Register CondDef = MI.getOperand(0).getReg();
4221   if (!MRI.hasOneNonDBGUse(CondDef))
4222     return nullptr;
4223 
4224   MachineBasicBlock *Parent = MI.getParent();
4225   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
4226 
4227   if (isNot(MRI, *UseMI)) {
4228     Register NegatedCond = UseMI->getOperand(0).getReg();
4229     if (!MRI.hasOneNonDBGUse(NegatedCond))
4230       return nullptr;
4231 
4232     // We're deleting the def of this value, so we need to remove it.
4233     eraseInstr(*UseMI, MRI);
4234 
4235     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
4236     Negated = true;
4237   }
4238 
4239   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
4240     return nullptr;
4241 
4242   // Make sure the cond br is followed by a G_BR, or is the last instruction.
4243   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
4244   if (Next == Parent->end()) {
4245     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
4246     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
4247       return nullptr;
4248     UncondBrTarget = &*NextMBB;
4249   } else {
4250     if (Next->getOpcode() != AMDGPU::G_BR)
4251       return nullptr;
4252     Br = &*Next;
4253     UncondBrTarget = Br->getOperand(0).getMBB();
4254   }
4255 
4256   return UseMI;
4257 }
4258 
4259 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
4260                                          const ArgDescriptor *Arg,
4261                                          const TargetRegisterClass *ArgRC,
4262                                          LLT ArgTy) const {
4263   MCRegister SrcReg = Arg->getRegister();
4264   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
4265   assert(DstReg.isVirtual() && "Virtual register expected");
4266 
4267   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
4268                                              *ArgRC, B.getDebugLoc(), ArgTy);
4269   if (Arg->isMasked()) {
4270     // TODO: Should we try to emit this once in the entry block?
4271     const LLT S32 = LLT::scalar(32);
4272     const unsigned Mask = Arg->getMask();
4273     const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
4274 
4275     Register AndMaskSrc = LiveIn;
4276 
4277     // TODO: Avoid clearing the high bits if we know workitem id y/z are always
4278     // 0.
4279     if (Shift != 0) {
4280       auto ShiftAmt = B.buildConstant(S32, Shift);
4281       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
4282     }
4283 
4284     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
4285   } else {
4286     B.buildCopy(DstReg, LiveIn);
4287   }
4288 
4289   return true;
4290 }
4291 
4292 bool AMDGPULegalizerInfo::loadInputValue(
4293     Register DstReg, MachineIRBuilder &B,
4294     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4295   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4296   const ArgDescriptor *Arg = nullptr;
4297   const TargetRegisterClass *ArgRC;
4298   LLT ArgTy;
4299 
4300   CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
4301   const ArgDescriptor WorkGroupIDX =
4302       ArgDescriptor::createRegister(AMDGPU::TTMP9);
4303   // If GridZ is not programmed in an entry function then the hardware will set
4304   // it to all zeros, so there is no need to mask the GridY value in the low
4305   // order bits.
4306   const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
4307       AMDGPU::TTMP7,
4308       AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
4309   const ArgDescriptor WorkGroupIDZ =
4310       ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
4311   if (ST.hasArchitectedSGPRs() &&
4312       (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) {
4313     switch (ArgType) {
4314     case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
4315       Arg = &WorkGroupIDX;
4316       ArgRC = &AMDGPU::SReg_32RegClass;
4317       ArgTy = LLT::scalar(32);
4318       break;
4319     case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
4320       Arg = &WorkGroupIDY;
4321       ArgRC = &AMDGPU::SReg_32RegClass;
4322       ArgTy = LLT::scalar(32);
4323       break;
4324     case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
4325       Arg = &WorkGroupIDZ;
4326       ArgRC = &AMDGPU::SReg_32RegClass;
4327       ArgTy = LLT::scalar(32);
4328       break;
4329     default:
4330       break;
4331     }
4332   }
4333 
4334   if (!Arg)
4335     std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4336 
4337   if (!Arg) {
4338     if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
4339       // The intrinsic may appear when we have a 0 sized kernarg segment, in which
4340       // case the pointer argument may be missing and we use null.
4341       B.buildConstant(DstReg, 0);
4342       return true;
4343     }
4344 
4345     // It's undefined behavior if a function marked with the amdgpu-no-*
4346     // attributes uses the corresponding intrinsic.
4347     B.buildUndef(DstReg);
4348     return true;
4349   }
4350 
4351   if (!Arg->isRegister() || !Arg->getRegister().isValid())
4352     return false; // TODO: Handle these
4353   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
4354 }
4355 
4356 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
4357     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4358     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4359   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
4360     return false;
4361 
4362   MI.eraseFromParent();
4363   return true;
4364 }
4365 
4366 static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
4367                                 int64_t C) {
4368   B.buildConstant(MI.getOperand(0).getReg(), C);
4369   MI.eraseFromParent();
4370   return true;
4371 }
4372 
4373 bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
4374     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4375     unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4376   unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
4377   if (MaxID == 0)
4378     return replaceWithConstant(B, MI, 0);
4379 
4380   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4381   const ArgDescriptor *Arg;
4382   const TargetRegisterClass *ArgRC;
4383   LLT ArgTy;
4384   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4385 
4386   Register DstReg = MI.getOperand(0).getReg();
4387   if (!Arg) {
4388     // It's undefined behavior if a function marked with the amdgpu-no-*
4389     // attributes uses the corresponding intrinsic.
4390     B.buildUndef(DstReg);
4391     MI.eraseFromParent();
4392     return true;
4393   }
4394 
4395   if (Arg->isMasked()) {
4396     // Don't bother inserting AssertZext for packed IDs since we're emitting the
4397     // masking operations anyway.
4398     //
4399     // TODO: We could assert the top bit is 0 for the source copy.
4400     if (!loadInputValue(DstReg, B, ArgType))
4401       return false;
4402   } else {
4403     Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
4404     if (!loadInputValue(TmpReg, B, ArgType))
4405       return false;
4406     B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID));
4407   }
4408 
4409   MI.eraseFromParent();
4410   return true;
4411 }
4412 
4413 Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
4414                                                      int64_t Offset) const {
4415   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
4416   Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
4417 
4418   // TODO: If we passed in the base kernel offset we could have a better
4419   // alignment than 4, but we don't really need it.
4420   if (!loadInputValue(KernArgReg, B,
4421                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
4422     llvm_unreachable("failed to find kernarg segment ptr");
4423 
4424   auto COffset = B.buildConstant(LLT::scalar(64), Offset);
4425   // TODO: Should get nuw
4426   return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
4427 }
4428 
4429 /// Legalize a value that's loaded from kernel arguments. This is only used by
4430 /// legacy intrinsics.
4431 bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
4432                                                       MachineIRBuilder &B,
4433                                                       uint64_t Offset,
4434                                                       Align Alignment) const {
4435   Register DstReg = MI.getOperand(0).getReg();
4436 
4437   assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
4438          "unexpected kernarg parameter type");
4439 
4440   Register Ptr = getKernargParameterPtr(B, Offset);
4441   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
4442   B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
4443               MachineMemOperand::MODereferenceable |
4444                   MachineMemOperand::MOInvariant);
4445   MI.eraseFromParent();
4446   return true;
4447 }
4448 
4449 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
4450                                        MachineRegisterInfo &MRI,
4451                                        MachineIRBuilder &B) const {
4452   Register Dst = MI.getOperand(0).getReg();
4453   LLT DstTy = MRI.getType(Dst);
4454   LLT S16 = LLT::scalar(16);
4455   LLT S32 = LLT::scalar(32);
4456   LLT S64 = LLT::scalar(64);
4457 
4458   if (DstTy == S16)
4459     return legalizeFDIV16(MI, MRI, B);
4460   if (DstTy == S32)
4461     return legalizeFDIV32(MI, MRI, B);
4462   if (DstTy == S64)
4463     return legalizeFDIV64(MI, MRI, B);
4464 
4465   return false;
4466 }
4467 
4468 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
4469                                                         Register DstDivReg,
4470                                                         Register DstRemReg,
4471                                                         Register X,
4472                                                         Register Y) const {
4473   const LLT S1 = LLT::scalar(1);
4474   const LLT S32 = LLT::scalar(32);
4475 
4476   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
4477   // algorithm used here.
4478 
4479   // Initial estimate of inv(y).
4480   auto FloatY = B.buildUITOFP(S32, Y);
4481   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
4482   auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
4483   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
4484   auto Z = B.buildFPTOUI(S32, ScaledY);
4485 
4486   // One round of UNR.
4487   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
4488   auto NegYZ = B.buildMul(S32, NegY, Z);
4489   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
4490 
4491   // Quotient/remainder estimate.
4492   auto Q = B.buildUMulH(S32, X, Z);
4493   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
4494 
4495   // First quotient/remainder refinement.
4496   auto One = B.buildConstant(S32, 1);
4497   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4498   if (DstDivReg)
4499     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
4500   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
4501 
4502   // Second quotient/remainder refinement.
4503   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4504   if (DstDivReg)
4505     B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
4506 
4507   if (DstRemReg)
4508     B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
4509 }
4510 
4511 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
4512 //
4513 // Return lo, hi of result
4514 //
4515 // %cvt.lo = G_UITOFP Val.lo
4516 // %cvt.hi = G_UITOFP Val.hi
4517 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
4518 // %rcp = G_AMDGPU_RCP_IFLAG %mad
4519 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
4520 // %mul2 = G_FMUL %mul1, 2**(-32)
4521 // %trunc = G_INTRINSIC_TRUNC %mul2
4522 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
4523 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
4524 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
4525                                                        Register Val) {
4526   const LLT S32 = LLT::scalar(32);
4527   auto Unmerge = B.buildUnmerge(S32, Val);
4528 
4529   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
4530   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
4531 
4532   auto Mad = B.buildFMAD(
4533       S32, CvtHi, // 2**32
4534       B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);
4535 
4536   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
4537   auto Mul1 = B.buildFMul(
4538       S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc)));
4539 
4540   // 2**(-32)
4541   auto Mul2 = B.buildFMul(
4542       S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000)));
4543   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
4544 
4545   // -(2**32)
4546   auto Mad2 = B.buildFMAD(
4547       S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)),
4548       Mul1);
4549 
4550   auto ResultLo = B.buildFPTOUI(S32, Mad2);
4551   auto ResultHi = B.buildFPTOUI(S32, Trunc);
4552 
4553   return {ResultLo.getReg(0), ResultHi.getReg(0)};
4554 }
4555 
4556 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
4557                                                         Register DstDivReg,
4558                                                         Register DstRemReg,
4559                                                         Register Numer,
4560                                                         Register Denom) const {
4561   const LLT S32 = LLT::scalar(32);
4562   const LLT S64 = LLT::scalar(64);
4563   const LLT S1 = LLT::scalar(1);
4564   Register RcpLo, RcpHi;
4565 
4566   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
4567 
4568   auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});
4569 
4570   auto Zero64 = B.buildConstant(S64, 0);
4571   auto NegDenom = B.buildSub(S64, Zero64, Denom);
4572 
4573   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
4574   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
4575 
4576   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
4577   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
4578   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
4579 
4580   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
4581   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
4582   auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});
4583 
4584   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
4585   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
4586   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
4587   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
4588   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
4589 
4590   auto Zero32 = B.buildConstant(S32, 0);
4591   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
4592   auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
4593   auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});
4594 
4595   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
4596   Register NumerLo = UnmergeNumer.getReg(0);
4597   Register NumerHi = UnmergeNumer.getReg(1);
4598 
4599   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
4600   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
4601   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
4602   Register Mul3_Lo = UnmergeMul3.getReg(0);
4603   Register Mul3_Hi = UnmergeMul3.getReg(1);
4604   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
4605   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
4606   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
4607   auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});
4608 
4609   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
4610   Register DenomLo = UnmergeDenom.getReg(0);
4611   Register DenomHi = UnmergeDenom.getReg(1);
4612 
4613   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
4614   auto C1 = B.buildSExt(S32, CmpHi);
4615 
4616   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
4617   auto C2 = B.buildSExt(S32, CmpLo);
4618 
4619   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
4620   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
4621 
4622   // TODO: Here and below portions of the code can be enclosed into if/endif.
4623   // Currently control flow is unconditional and we have 4 selects after
4624   // potential endif to substitute PHIs.
4625 
4626   // if C3 != 0 ...
4627   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
4628   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
4629   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
4630   auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});
4631 
4632   auto One64 = B.buildConstant(S64, 1);
4633   auto Add3 = B.buildAdd(S64, MulHi3, One64);
4634 
4635   auto C4 =
4636       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
4637   auto C5 =
4638       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
4639   auto C6 = B.buildSelect(
4640       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
4641 
4642   // if (C6 != 0)
4643   auto Add4 = B.buildAdd(S64, Add3, One64);
4644   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
4645 
4646   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
4647   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
4648   auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});
4649 
4650   // endif C6
4651   // endif C3
4652 
4653   if (DstDivReg) {
4654     auto Sel1 = B.buildSelect(
4655         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
4656     B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
4657                   Sel1, MulHi3);
4658   }
4659 
4660   if (DstRemReg) {
4661     auto Sel2 = B.buildSelect(
4662         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
4663     B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
4664                   Sel2, Sub1);
4665   }
4666 }
4667 
4668 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
4669                                                   MachineRegisterInfo &MRI,
4670                                                   MachineIRBuilder &B) const {
4671   Register DstDivReg, DstRemReg;
4672   switch (MI.getOpcode()) {
4673   default:
4674     llvm_unreachable("Unexpected opcode!");
4675   case AMDGPU::G_UDIV: {
4676     DstDivReg = MI.getOperand(0).getReg();
4677     break;
4678   }
4679   case AMDGPU::G_UREM: {
4680     DstRemReg = MI.getOperand(0).getReg();
4681     break;
4682   }
4683   case AMDGPU::G_UDIVREM: {
4684     DstDivReg = MI.getOperand(0).getReg();
4685     DstRemReg = MI.getOperand(1).getReg();
4686     break;
4687   }
4688   }
4689 
4690   const LLT S64 = LLT::scalar(64);
4691   const LLT S32 = LLT::scalar(32);
4692   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4693   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
4694   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
4695   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
4696 
4697   if (Ty == S32)
4698     legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
4699   else if (Ty == S64)
4700     legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
4701   else
4702     return false;
4703 
4704   MI.eraseFromParent();
4705   return true;
4706 }
4707 
4708 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
4709                                                 MachineRegisterInfo &MRI,
4710                                                 MachineIRBuilder &B) const {
4711   const LLT S64 = LLT::scalar(64);
4712   const LLT S32 = LLT::scalar(32);
4713 
4714   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
4715   if (Ty != S32 && Ty != S64)
4716     return false;
4717 
4718   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4719   Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
4720   Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
4721 
4722   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
4723   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
4724   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
4725 
4726   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
4727   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
4728 
4729   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
4730   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
4731 
4732   Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
4733   switch (MI.getOpcode()) {
4734   default:
4735     llvm_unreachable("Unexpected opcode!");
4736   case AMDGPU::G_SDIV: {
4737     DstDivReg = MI.getOperand(0).getReg();
4738     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4739     break;
4740   }
4741   case AMDGPU::G_SREM: {
4742     DstRemReg = MI.getOperand(0).getReg();
4743     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4744     break;
4745   }
4746   case AMDGPU::G_SDIVREM: {
4747     DstDivReg = MI.getOperand(0).getReg();
4748     DstRemReg = MI.getOperand(1).getReg();
4749     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4750     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4751     break;
4752   }
4753   }
4754 
4755   if (Ty == S32)
4756     legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
4757   else
4758     legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
4759 
4760   if (DstDivReg) {
4761     auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
4762     auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
4763     B.buildSub(DstDivReg, SignXor, Sign);
4764   }
4765 
4766   if (DstRemReg) {
4767     auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
4768     auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
4769     B.buildSub(DstRemReg, SignXor, Sign);
4770   }
4771 
4772   MI.eraseFromParent();
4773   return true;
4774 }
4775 
4776 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
4777                                                  MachineRegisterInfo &MRI,
4778                                                  MachineIRBuilder &B) const {
4779   Register Res = MI.getOperand(0).getReg();
4780   Register LHS = MI.getOperand(1).getReg();
4781   Register RHS = MI.getOperand(2).getReg();
4782   uint16_t Flags = MI.getFlags();
4783   LLT ResTy = MRI.getType(Res);
4784 
4785   const MachineFunction &MF = B.getMF();
4786   bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn) ||
4787                             MF.getTarget().Options.UnsafeFPMath;
4788 
4789   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
4790     if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
4791       return false;
4792 
4793     // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
4794     // the CI documentation has a worst case error of 1 ulp.
4795     // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
4796     // use it as long as we aren't trying to use denormals.
4797     //
4798     // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
4799 
4800     // 1 / x -> RCP(x)
4801     if (CLHS->isExactlyValue(1.0)) {
4802       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
4803           .addUse(RHS)
4804           .setMIFlags(Flags);
4805 
4806       MI.eraseFromParent();
4807       return true;
4808     }
4809 
4810     // -1 / x -> RCP( FNEG(x) )
4811     if (CLHS->isExactlyValue(-1.0)) {
4812       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
4813       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
4814           .addUse(FNeg.getReg(0))
4815           .setMIFlags(Flags);
4816 
4817       MI.eraseFromParent();
4818       return true;
4819     }
4820   }
4821 
4822   // For f16 require afn or arcp.
4823   // For f32 require afn.
4824   if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
4825                               !MI.getFlag(MachineInstr::FmArcp)))
4826     return false;
4827 
4828   // x / y -> x * (1.0 / y)
4829   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
4830                  .addUse(RHS)
4831                  .setMIFlags(Flags);
4832   B.buildFMul(Res, LHS, RCP, Flags);
4833 
4834   MI.eraseFromParent();
4835   return true;
4836 }
4837 
4838 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
4839                                                    MachineRegisterInfo &MRI,
4840                                                    MachineIRBuilder &B) const {
4841   Register Res = MI.getOperand(0).getReg();
4842   Register X = MI.getOperand(1).getReg();
4843   Register Y = MI.getOperand(2).getReg();
4844   uint16_t Flags = MI.getFlags();
4845   LLT ResTy = MRI.getType(Res);
4846 
4847   const MachineFunction &MF = B.getMF();
4848   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
4849                             MI.getFlag(MachineInstr::FmAfn);
4850 
4851   if (!AllowInaccurateRcp)
4852     return false;
4853 
4854   auto NegY = B.buildFNeg(ResTy, Y);
4855   auto One = B.buildFConstant(ResTy, 1.0);
4856 
4857   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
4858                .addUse(Y)
4859                .setMIFlags(Flags);
4860 
4861   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
4862   R = B.buildFMA(ResTy, Tmp0, R, R);
4863 
4864   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
4865   R = B.buildFMA(ResTy, Tmp1, R, R);
4866 
4867   auto Ret = B.buildFMul(ResTy, X, R);
4868   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
4869 
4870   B.buildFMA(Res, Tmp2, R, Ret);
4871   MI.eraseFromParent();
4872   return true;
4873 }
4874 
4875 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
4876                                          MachineRegisterInfo &MRI,
4877                                          MachineIRBuilder &B) const {
4878   if (legalizeFastUnsafeFDIV(MI, MRI, B))
4879     return true;
4880 
4881   Register Res = MI.getOperand(0).getReg();
4882   Register LHS = MI.getOperand(1).getReg();
4883   Register RHS = MI.getOperand(2).getReg();
4884 
4885   uint16_t Flags = MI.getFlags();
4886 
4887   LLT S16 = LLT::scalar(16);
4888   LLT S32 = LLT::scalar(32);
4889 
4890   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
4891   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
4892 
4893   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
4894                  .addUse(RHSExt.getReg(0))
4895                  .setMIFlags(Flags);
4896 
4897   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
4898   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
4899 
4900   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
4901       .addUse(RDst.getReg(0))
4902       .addUse(RHS)
4903       .addUse(LHS)
4904       .setMIFlags(Flags);
4905 
4906   MI.eraseFromParent();
4907   return true;
4908 }
4909 
4910 static constexpr unsigned SPDenormModeBitField =
4911     AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 4, 2);
4912 
4913 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
4914 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
4915 static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B,
4916                                const GCNSubtarget &ST,
4917                                SIModeRegisterDefaults Mode) {
4918   // Set SP denorm mode to this value.
4919   unsigned SPDenormMode =
4920     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
4921 
4922   if (ST.hasDenormModeInst()) {
4923     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
4924     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
4925 
4926     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
4927     B.buildInstr(AMDGPU::S_DENORM_MODE)
4928       .addImm(NewDenormModeValue);
4929 
4930   } else {
4931     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
4932       .addImm(SPDenormMode)
4933       .addImm(SPDenormModeBitField);
4934   }
4935 }
4936 
4937 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
4938                                          MachineRegisterInfo &MRI,
4939                                          MachineIRBuilder &B) const {
4940   if (legalizeFastUnsafeFDIV(MI, MRI, B))
4941     return true;
4942 
4943   Register Res = MI.getOperand(0).getReg();
4944   Register LHS = MI.getOperand(1).getReg();
4945   Register RHS = MI.getOperand(2).getReg();
4946   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4947   SIModeRegisterDefaults Mode = MFI->getMode();
4948 
4949   uint16_t Flags = MI.getFlags();
4950 
4951   LLT S32 = LLT::scalar(32);
4952   LLT S1 = LLT::scalar(1);
4953 
4954   auto One = B.buildFConstant(S32, 1.0f);
4955 
4956   auto DenominatorScaled =
4957       B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
4958           .addUse(LHS)
4959           .addUse(RHS)
4960           .addImm(0)
4961           .setMIFlags(Flags);
4962   auto NumeratorScaled =
4963       B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
4964           .addUse(LHS)
4965           .addUse(RHS)
4966           .addImm(1)
4967           .setMIFlags(Flags);
4968 
4969   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
4970                        .addUse(DenominatorScaled.getReg(0))
4971                        .setMIFlags(Flags);
4972   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
4973 
4974   const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE();
4975   const bool HasDynamicDenormals =
4976       (Mode.FP32Denormals.Input == DenormalMode::Dynamic) ||
4977       (Mode.FP32Denormals.Output == DenormalMode::Dynamic);
4978 
4979   Register SavedSPDenormMode;
4980   if (!PreservesDenormals) {
4981     if (HasDynamicDenormals) {
4982       SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4983       B.buildInstr(AMDGPU::S_GETREG_B32)
4984           .addDef(SavedSPDenormMode)
4985           .addImm(SPDenormModeBitField);
4986     }
4987     toggleSPDenormMode(true, B, ST, Mode);
4988   }
4989 
4990   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
4991   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
4992   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
4993   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
4994   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
4995   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
4996 
4997   if (!PreservesDenormals) {
4998     if (HasDynamicDenormals) {
4999       assert(SavedSPDenormMode);
5000       B.buildInstr(AMDGPU::S_SETREG_B32)
5001           .addReg(SavedSPDenormMode)
5002           .addImm(SPDenormModeBitField);
5003     } else
5004       toggleSPDenormMode(false, B, ST, Mode);
5005   }
5006 
5007   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
5008                   .addUse(Fma4.getReg(0))
5009                   .addUse(Fma1.getReg(0))
5010                   .addUse(Fma3.getReg(0))
5011                   .addUse(NumeratorScaled.getReg(1))
5012                   .setMIFlags(Flags);
5013 
5014   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5015       .addUse(Fmas.getReg(0))
5016       .addUse(RHS)
5017       .addUse(LHS)
5018       .setMIFlags(Flags);
5019 
5020   MI.eraseFromParent();
5021   return true;
5022 }
5023 
5024 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
5025                                          MachineRegisterInfo &MRI,
5026                                          MachineIRBuilder &B) const {
5027   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
5028     return true;
5029 
5030   Register Res = MI.getOperand(0).getReg();
5031   Register LHS = MI.getOperand(1).getReg();
5032   Register RHS = MI.getOperand(2).getReg();
5033 
5034   uint16_t Flags = MI.getFlags();
5035 
5036   LLT S64 = LLT::scalar(64);
5037   LLT S1 = LLT::scalar(1);
5038 
5039   auto One = B.buildFConstant(S64, 1.0);
5040 
5041   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5042                        .addUse(LHS)
5043                        .addUse(RHS)
5044                        .addImm(0)
5045                        .setMIFlags(Flags);
5046 
5047   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
5048 
5049   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
5050                  .addUse(DivScale0.getReg(0))
5051                  .setMIFlags(Flags);
5052 
5053   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
5054   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
5055   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
5056 
5057   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5058                        .addUse(LHS)
5059                        .addUse(RHS)
5060                        .addImm(1)
5061                        .setMIFlags(Flags);
5062 
5063   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
5064   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
5065   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
5066 
5067   Register Scale;
5068   if (!ST.hasUsableDivScaleConditionOutput()) {
5069     // Workaround a hardware bug on SI where the condition output from div_scale
5070     // is not usable.
5071 
5072     LLT S32 = LLT::scalar(32);
5073 
5074     auto NumUnmerge = B.buildUnmerge(S32, LHS);
5075     auto DenUnmerge = B.buildUnmerge(S32, RHS);
5076     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
5077     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
5078 
5079     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
5080                               Scale1Unmerge.getReg(1));
5081     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
5082                               Scale0Unmerge.getReg(1));
5083     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
5084   } else {
5085     Scale = DivScale1.getReg(1);
5086   }
5087 
5088   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
5089                   .addUse(Fma4.getReg(0))
5090                   .addUse(Fma3.getReg(0))
5091                   .addUse(Mul.getReg(0))
5092                   .addUse(Scale)
5093                   .setMIFlags(Flags);
5094 
5095   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
5096       .addUse(Fmas.getReg(0))
5097       .addUse(RHS)
5098       .addUse(LHS)
5099       .setMIFlags(Flags);
5100 
5101   MI.eraseFromParent();
5102   return true;
5103 }
5104 
5105 bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI,
5106                                          MachineRegisterInfo &MRI,
5107                                          MachineIRBuilder &B) const {
5108   Register Res0 = MI.getOperand(0).getReg();
5109   Register Res1 = MI.getOperand(1).getReg();
5110   Register Val = MI.getOperand(2).getReg();
5111   uint16_t Flags = MI.getFlags();
5112 
5113   LLT Ty = MRI.getType(Res0);
5114   LLT InstrExpTy = Ty == LLT::scalar(16) ? LLT::scalar(16) : LLT::scalar(32);
5115 
5116   auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
5117                   .addUse(Val)
5118                   .setMIFlags(Flags);
5119   auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
5120                  .addUse(Val)
5121                  .setMIFlags(Flags);
5122 
5123   if (ST.hasFractBug()) {
5124     auto Fabs = B.buildFAbs(Ty, Val);
5125     auto Inf = B.buildFConstant(Ty, APFloat::getInf(getFltSemanticForLLT(Ty)));
5126     auto IsFinite =
5127         B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
5128     auto Zero = B.buildConstant(InstrExpTy, 0);
5129     Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
5130     Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
5131   }
5132 
5133   B.buildCopy(Res0, Mant);
5134   B.buildSExtOrTrunc(Res1, Exp);
5135 
5136   MI.eraseFromParent();
5137   return true;
5138 }
5139 
5140 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
5141                                                  MachineRegisterInfo &MRI,
5142                                                  MachineIRBuilder &B) const {
5143   Register Res = MI.getOperand(0).getReg();
5144   Register LHS = MI.getOperand(2).getReg();
5145   Register RHS = MI.getOperand(3).getReg();
5146   uint16_t Flags = MI.getFlags();
5147 
5148   LLT S32 = LLT::scalar(32);
5149   LLT S1 = LLT::scalar(1);
5150 
5151   auto Abs = B.buildFAbs(S32, RHS, Flags);
5152   const APFloat C0Val(1.0f);
5153 
5154   auto C0 = B.buildFConstant(S32, 0x1p+96f);
5155   auto C1 = B.buildFConstant(S32, 0x1p-32f);
5156   auto C2 = B.buildFConstant(S32, 1.0f);
5157 
5158   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
5159   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
5160 
5161   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
5162 
5163   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5164                  .addUse(Mul0.getReg(0))
5165                  .setMIFlags(Flags);
5166 
5167   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
5168 
5169   B.buildFMul(Res, Sel, Mul1, Flags);
5170 
5171   MI.eraseFromParent();
5172   return true;
5173 }
5174 
5175 bool AMDGPULegalizerInfo::legalizeFSQRTF16(MachineInstr &MI,
5176                                            MachineRegisterInfo &MRI,
5177                                            MachineIRBuilder &B) const {
5178   // Bypass the correct expansion a standard promotion through G_FSQRT would
5179   // get. The f32 op is accurate enough for the f16 cas.
5180   unsigned Flags = MI.getFlags();
5181   assert(!ST.has16BitInsts());
5182   const LLT F32 = LLT::scalar(32);
5183   auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
5184   auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
5185     .addUse(Ext.getReg(0))
5186     .setMIFlags(Flags);
5187   B.buildFPTrunc(MI.getOperand(0), Log2, Flags);
5188   MI.eraseFromParent();
5189   return true;
5190 }
5191 
5192 bool AMDGPULegalizerInfo::legalizeFSQRTF32(MachineInstr &MI,
5193                                            MachineRegisterInfo &MRI,
5194                                            MachineIRBuilder &B) const {
5195   MachineFunction &MF = B.getMF();
5196   Register Dst = MI.getOperand(0).getReg();
5197   Register X = MI.getOperand(1).getReg();
5198   const unsigned Flags = MI.getFlags();
5199   const LLT S1 = LLT::scalar(1);
5200   const LLT F32 = LLT::scalar(32);
5201   const LLT I32 = LLT::scalar(32);
5202 
5203   if (allowApproxFunc(MF, Flags)) {
5204     B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({Dst}))
5205       .addUse(X)
5206       .setMIFlags(Flags);
5207     MI.eraseFromParent();
5208     return true;
5209   }
5210 
5211   auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);
5212   auto NeedScale = B.buildFCmp(CmpInst::FCMP_OGT, S1, ScaleThreshold, X, Flags);
5213   auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
5214   auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
5215   auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);
5216 
5217   Register SqrtS = MRI.createGenericVirtualRegister(F32);
5218   if (needsDenormHandlingF32(MF, X, Flags)) {
5219     B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({SqrtS}))
5220       .addUse(SqrtX.getReg(0))
5221       .setMIFlags(Flags);
5222 
5223     auto NegOne = B.buildConstant(I32, -1);
5224     auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);
5225 
5226     auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
5227     auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
5228 
5229     auto PosOne = B.buildConstant(I32, 1);
5230     auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);
5231 
5232     auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
5233     auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
5234 
5235     auto Zero = B.buildFConstant(F32, 0.0f);
5236     auto SqrtVPLE0 = B.buildFCmp(CmpInst::FCMP_OLE, S1, SqrtVP, Zero, Flags);
5237 
5238     SqrtS =
5239         B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);
5240 
5241     auto SqrtVPVSGT0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, SqrtVS, Zero, Flags);
5242     SqrtS =
5243         B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);
5244   } else {
5245     auto SqrtR =
5246         B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
5247     B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);
5248 
5249     auto Half = B.buildFConstant(F32, 0.5f);
5250     auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
5251     auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
5252     auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
5253     SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
5254     SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
5255     auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
5256     auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
5257     SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);
5258   }
5259 
5260   auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);
5261 
5262   auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);
5263 
5264   SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);
5265 
5266   auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
5267   B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);
5268 
5269   MI.eraseFromParent();
5270   return true;
5271 }
5272 
5273 bool AMDGPULegalizerInfo::legalizeFSQRTF64(MachineInstr &MI,
5274                                            MachineRegisterInfo &MRI,
5275                                            MachineIRBuilder &B) const {
5276   // For double type, the SQRT and RSQ instructions don't have required
5277   // precision, we apply Goldschmidt's algorithm to improve the result:
5278   //
5279   //   y0 = rsq(x)
5280   //   g0 = x * y0
5281   //   h0 = 0.5 * y0
5282   //
5283   //   r0 = 0.5 - h0 * g0
5284   //   g1 = g0 * r0 + g0
5285   //   h1 = h0 * r0 + h0
5286   //
5287   //   r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
5288   //   g2 = g1 * r1 + g1     g2 = d0 * h1 + g1
5289   //   h2 = h1 * r1 + h1
5290   //
5291   //   r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
5292   //   g3 = g2 * r2 + g2     g3 = d1 * h1 + g2
5293   //
5294   //   sqrt(x) = g3
5295 
5296   const LLT S1 = LLT::scalar(1);
5297   const LLT S32 = LLT::scalar(32);
5298   const LLT F64 = LLT::scalar(64);
5299 
5300   Register Dst = MI.getOperand(0).getReg();
5301   assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");
5302 
5303   Register X = MI.getOperand(1).getReg();
5304   unsigned Flags = MI.getFlags();
5305 
5306   auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);
5307 
5308   auto ZeroInt = B.buildConstant(S32, 0);
5309   auto Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant);
5310 
5311   // Scale up input if it is too small.
5312   auto ScaleUpFactor = B.buildConstant(S32, 256);
5313   auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
5314   auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags);
5315 
5316   auto SqrtY =
5317       B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX.getReg(0));
5318 
5319   auto Half = B.buildFConstant(F64, 0.5);
5320   auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
5321   auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);
5322 
5323   auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
5324   auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);
5325 
5326   auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
5327   auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);
5328 
5329   auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
5330   auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);
5331 
5332   auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);
5333 
5334   auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
5335   auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);
5336 
5337   auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);
5338 
5339   // Scale down the result.
5340   auto ScaleDownFactor = B.buildConstant(S32, -128);
5341   auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
5342   SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags);
5343 
5344   // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
5345   // with finite only or nsz because rsq(+/-0) = +/-inf
5346 
5347   // TODO: Check for DAZ and expand to subnormals
5348   auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
5349 
5350   // If x is +INF, +0, or -0, use its original value
5351   B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);
5352 
5353   MI.eraseFromParent();
5354   return true;
5355 }
5356 
5357 bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI,
5358                                         MachineRegisterInfo &MRI,
5359                                         MachineIRBuilder &B) const {
5360   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
5361   if (Ty == LLT::scalar(32))
5362     return legalizeFSQRTF32(MI, MRI, B);
5363   if (Ty == LLT::scalar(64))
5364     return legalizeFSQRTF64(MI, MRI, B);
5365   if (Ty == LLT::scalar(16))
5366     return legalizeFSQRTF16(MI, MRI, B);
5367   return false;
5368 }
5369 
5370 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
5371 // FIXME: Why do we handle this one but not other removed instructions?
5372 //
5373 // Reciprocal square root.  The clamp prevents infinite results, clamping
5374 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
5375 // +-max_float.
5376 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
5377                                                     MachineRegisterInfo &MRI,
5378                                                     MachineIRBuilder &B) const {
5379   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
5380     return true;
5381 
5382   Register Dst = MI.getOperand(0).getReg();
5383   Register Src = MI.getOperand(2).getReg();
5384   auto Flags = MI.getFlags();
5385 
5386   LLT Ty = MRI.getType(Dst);
5387 
5388   const fltSemantics *FltSemantics;
5389   if (Ty == LLT::scalar(32))
5390     FltSemantics = &APFloat::IEEEsingle();
5391   else if (Ty == LLT::scalar(64))
5392     FltSemantics = &APFloat::IEEEdouble();
5393   else
5394     return false;
5395 
5396   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
5397                  .addUse(Src)
5398                  .setMIFlags(Flags);
5399 
5400   // We don't need to concern ourselves with the snan handling difference, since
5401   // the rsq quieted (or not) so use the one which will directly select.
5402   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5403   const bool UseIEEE = MFI->getMode().IEEE;
5404 
5405   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
5406   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
5407                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
5408 
5409   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
5410 
5411   if (UseIEEE)
5412     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
5413   else
5414     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
5415   MI.eraseFromParent();
5416   return true;
5417 }
5418 
5419 // TODO: Fix pointer type handling
5420 bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
5421                                          MachineInstr &MI,
5422                                          Intrinsic::ID IID) const {
5423 
5424   MachineIRBuilder &B = Helper.MIRBuilder;
5425   MachineRegisterInfo &MRI = *B.getMRI();
5426 
5427   bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
5428                       IID == Intrinsic::amdgcn_permlanex16;
5429 
5430   auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
5431                                       Register Src2, LLT VT) -> Register {
5432     auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
5433     switch (IID) {
5434     case Intrinsic::amdgcn_readfirstlane:
5435     case Intrinsic::amdgcn_permlane64:
5436       return LaneOp.getReg(0);
5437     case Intrinsic::amdgcn_readlane:
5438       return LaneOp.addUse(Src1).getReg(0);
5439     case Intrinsic::amdgcn_writelane:
5440       return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
5441     case Intrinsic::amdgcn_permlane16:
5442     case Intrinsic::amdgcn_permlanex16: {
5443       Register Src3 = MI.getOperand(5).getReg();
5444       Register Src4 = MI.getOperand(6).getImm();
5445       Register Src5 = MI.getOperand(7).getImm();
5446       return LaneOp.addUse(Src1)
5447           .addUse(Src2)
5448           .addUse(Src3)
5449           .addImm(Src4)
5450           .addImm(Src5)
5451           .getReg(0);
5452     }
5453     default:
5454       llvm_unreachable("unhandled lane op");
5455     }
5456   };
5457 
5458   Register DstReg = MI.getOperand(0).getReg();
5459   Register Src0 = MI.getOperand(2).getReg();
5460   Register Src1, Src2;
5461   if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
5462       IsPermLane16) {
5463     Src1 = MI.getOperand(3).getReg();
5464     if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
5465       Src2 = MI.getOperand(4).getReg();
5466     }
5467   }
5468 
5469   LLT Ty = MRI.getType(DstReg);
5470   unsigned Size = Ty.getSizeInBits();
5471 
5472   if (Size == 32) {
5473     // Already legal
5474     return true;
5475   }
5476 
5477   if (Size < 32) {
5478     Src0 = B.buildAnyExt(S32, Src0).getReg(0);
5479 
5480     if (IsPermLane16)
5481       Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0);
5482 
5483     if (IID == Intrinsic::amdgcn_writelane)
5484       Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0);
5485 
5486     Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
5487     B.buildTrunc(DstReg, LaneOpDst);
5488     MI.eraseFromParent();
5489     return true;
5490   }
5491 
5492   if (Size % 32 != 0)
5493     return false;
5494 
5495   LLT PartialResTy = S32;
5496   if (Ty.isVector()) {
5497     LLT EltTy = Ty.getElementType();
5498     switch (EltTy.getSizeInBits()) {
5499     case 16:
5500       PartialResTy = Ty.changeElementCount(ElementCount::getFixed(2));
5501       break;
5502     case 32:
5503       PartialResTy = EltTy;
5504       break;
5505     default:
5506       // Handle all other cases via S32 pieces;
5507       break;
5508     }
5509   }
5510 
5511   SmallVector<Register, 2> PartialRes;
5512   unsigned NumParts = Size / 32;
5513   MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
5514   MachineInstrBuilder Src1Parts, Src2Parts;
5515 
5516   if (IsPermLane16)
5517     Src1Parts = B.buildUnmerge(PartialResTy, Src1);
5518 
5519   if (IID == Intrinsic::amdgcn_writelane)
5520     Src2Parts = B.buildUnmerge(PartialResTy, Src2);
5521 
5522   for (unsigned i = 0; i < NumParts; ++i) {
5523     Src0 = Src0Parts.getReg(i);
5524 
5525     if (IsPermLane16)
5526       Src1 = Src1Parts.getReg(i);
5527 
5528     if (IID == Intrinsic::amdgcn_writelane)
5529       Src2 = Src2Parts.getReg(i);
5530 
5531     PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
5532   }
5533 
5534   B.buildMergeLikeInstr(DstReg, PartialRes);
5535   MI.eraseFromParent();
5536   return true;
5537 }
5538 
5539 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
5540                                             MachineRegisterInfo &MRI,
5541                                             MachineIRBuilder &B) const {
5542   uint64_t Offset =
5543     ST.getTargetLowering()->getImplicitParameterOffset(
5544       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
5545   LLT DstTy = MRI.getType(DstReg);
5546   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
5547 
5548   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
5549   if (!loadInputValue(KernargPtrReg, B,
5550                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
5551     return false;
5552 
5553   // FIXME: This should be nuw
5554   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
5555   return true;
5556 }
5557 
5558 /// To create a buffer resource from a 64-bit pointer, mask off the upper 32
5559 /// bits of the pointer and replace them with the stride argument, then
5560 /// merge_values everything together. In the common case of a raw buffer (the
5561 /// stride component is 0), we can just AND off the upper half.
5562 bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin(
5563     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5564   Register Result = MI.getOperand(0).getReg();
5565   Register Pointer = MI.getOperand(2).getReg();
5566   Register Stride = MI.getOperand(3).getReg();
5567   Register NumRecords = MI.getOperand(4).getReg();
5568   Register Flags = MI.getOperand(5).getReg();
5569 
5570   LLT S32 = LLT::scalar(32);
5571 
5572   B.setInsertPt(B.getMBB(), ++B.getInsertPt());
5573   auto Unmerge = B.buildUnmerge(S32, Pointer);
5574   Register LowHalf = Unmerge.getReg(0);
5575   Register HighHalf = Unmerge.getReg(1);
5576 
5577   auto AndMask = B.buildConstant(S32, 0x0000ffff);
5578   auto Masked = B.buildAnd(S32, HighHalf, AndMask);
5579 
5580   MachineInstrBuilder NewHighHalf = Masked;
5581   std::optional<ValueAndVReg> StrideConst =
5582       getIConstantVRegValWithLookThrough(Stride, MRI);
5583   if (!StrideConst || !StrideConst->Value.isZero()) {
5584     MachineInstrBuilder ShiftedStride;
5585     if (StrideConst) {
5586       uint32_t StrideVal = StrideConst->Value.getZExtValue();
5587       uint32_t ShiftedStrideVal = StrideVal << 16;
5588       ShiftedStride = B.buildConstant(S32, ShiftedStrideVal);
5589     } else {
5590       auto ExtStride = B.buildAnyExt(S32, Stride);
5591       auto ShiftConst = B.buildConstant(S32, 16);
5592       ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
5593     }
5594     NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
5595   }
5596   Register NewHighHalfReg = NewHighHalf.getReg(0);
5597   B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
5598   MI.eraseFromParent();
5599   return true;
5600 }
5601 
5602 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
5603                                                  MachineRegisterInfo &MRI,
5604                                                  MachineIRBuilder &B) const {
5605   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5606   if (!MFI->isEntryFunction()) {
5607     return legalizePreloadedArgIntrin(MI, MRI, B,
5608                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
5609   }
5610 
5611   Register DstReg = MI.getOperand(0).getReg();
5612   if (!getImplicitArgPtr(DstReg, MRI, B))
5613     return false;
5614 
5615   MI.eraseFromParent();
5616   return true;
5617 }
5618 
5619 bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg,
5620                                          MachineRegisterInfo &MRI,
5621                                          MachineIRBuilder &B) const {
5622   Function &F = B.getMF().getFunction();
5623   std::optional<uint32_t> KnownSize =
5624       AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
5625   if (KnownSize.has_value())
5626     B.buildConstant(DstReg, *KnownSize);
5627   return false;
5628 }
5629 
5630 bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI,
5631                                               MachineRegisterInfo &MRI,
5632                                               MachineIRBuilder &B) const {
5633 
5634   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5635   if (!MFI->isEntryFunction()) {
5636     return legalizePreloadedArgIntrin(MI, MRI, B,
5637                                       AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
5638   }
5639 
5640   Register DstReg = MI.getOperand(0).getReg();
5641   if (!getLDSKernelId(DstReg, MRI, B))
5642     return false;
5643 
5644   MI.eraseFromParent();
5645   return true;
5646 }
5647 
5648 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
5649                                               MachineRegisterInfo &MRI,
5650                                               MachineIRBuilder &B,
5651                                               unsigned AddrSpace) const {
5652   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
5653   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
5654   Register Hi32 = Unmerge.getReg(1);
5655 
5656   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
5657   MI.eraseFromParent();
5658   return true;
5659 }
5660 
5661 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
5662 // offset (the offset that is included in bounds checking and swizzling, to be
5663 // split between the instruction's voffset and immoffset fields) and soffset
5664 // (the offset that is excluded from bounds checking and swizzling, to go in
5665 // the instruction's soffset field).  This function takes the first kind of
5666 // offset and figures out how to split it between voffset and immoffset.
5667 std::pair<Register, unsigned>
5668 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
5669                                         Register OrigOffset) const {
5670   const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
5671   Register BaseReg;
5672   unsigned ImmOffset;
5673   const LLT S32 = LLT::scalar(32);
5674   MachineRegisterInfo &MRI = *B.getMRI();
5675 
5676   std::tie(BaseReg, ImmOffset) =
5677       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
5678 
5679   // If BaseReg is a pointer, convert it to int.
5680   if (MRI.getType(BaseReg).isPointer())
5681     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
5682 
5683   // If the immediate value is too big for the immoffset field, put only bits
5684   // that would normally fit in the immoffset field. The remaining value that
5685   // is copied/added for the voffset field is a large power of 2, and it
5686   // stands more chance of being CSEd with the copy/add for another similar
5687   // load/store.
5688   // However, do not do that rounding down if that is a negative
5689   // number, as it appears to be illegal to have a negative offset in the
5690   // vgpr, even if adding the immediate offset makes it positive.
5691   unsigned Overflow = ImmOffset & ~MaxImm;
5692   ImmOffset -= Overflow;
5693   if ((int32_t)Overflow < 0) {
5694     Overflow += ImmOffset;
5695     ImmOffset = 0;
5696   }
5697 
5698   if (Overflow != 0) {
5699     if (!BaseReg) {
5700       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
5701     } else {
5702       auto OverflowVal = B.buildConstant(S32, Overflow);
5703       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
5704     }
5705   }
5706 
5707   if (!BaseReg)
5708     BaseReg = B.buildConstant(S32, 0).getReg(0);
5709 
5710   return std::pair(BaseReg, ImmOffset);
5711 }
5712 
5713 /// Handle register layout difference for f16 images for some subtargets.
5714 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
5715                                              MachineRegisterInfo &MRI,
5716                                              Register Reg,
5717                                              bool ImageStore) const {
5718   const LLT S16 = LLT::scalar(16);
5719   const LLT S32 = LLT::scalar(32);
5720   LLT StoreVT = MRI.getType(Reg);
5721   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
5722 
5723   if (ST.hasUnpackedD16VMem()) {
5724     auto Unmerge = B.buildUnmerge(S16, Reg);
5725 
5726     SmallVector<Register, 4> WideRegs;
5727     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5728       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
5729 
5730     int NumElts = StoreVT.getNumElements();
5731 
5732     return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
5733         .getReg(0);
5734   }
5735 
5736   if (ImageStore && ST.hasImageStoreD16Bug()) {
5737     if (StoreVT.getNumElements() == 2) {
5738       SmallVector<Register, 4> PackedRegs;
5739       Reg = B.buildBitcast(S32, Reg).getReg(0);
5740       PackedRegs.push_back(Reg);
5741       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
5742       return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
5743           .getReg(0);
5744     }
5745 
5746     if (StoreVT.getNumElements() == 3) {
5747       SmallVector<Register, 4> PackedRegs;
5748       auto Unmerge = B.buildUnmerge(S16, Reg);
5749       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5750         PackedRegs.push_back(Unmerge.getReg(I));
5751       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
5752       Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
5753       return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
5754     }
5755 
5756     if (StoreVT.getNumElements() == 4) {
5757       SmallVector<Register, 4> PackedRegs;
5758       Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
5759       auto Unmerge = B.buildUnmerge(S32, Reg);
5760       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5761         PackedRegs.push_back(Unmerge.getReg(I));
5762       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
5763       return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
5764           .getReg(0);
5765     }
5766 
5767     llvm_unreachable("invalid data type");
5768   }
5769 
5770   if (StoreVT == LLT::fixed_vector(3, S16)) {
5771     Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
5772               .getReg(0);
5773   }
5774   return Reg;
5775 }
5776 
5777 Register AMDGPULegalizerInfo::fixStoreSourceType(
5778   MachineIRBuilder &B, Register VData, bool IsFormat) const {
5779   MachineRegisterInfo *MRI = B.getMRI();
5780   LLT Ty = MRI->getType(VData);
5781 
5782   const LLT S16 = LLT::scalar(16);
5783 
5784   // Fixup buffer resources themselves needing to be v4i128.
5785   if (hasBufferRsrcWorkaround(Ty))
5786     return castBufferRsrcToV4I32(VData, B);
5787 
5788   // Fixup illegal register types for i8 stores.
5789   if (Ty == LLT::scalar(8) || Ty == S16) {
5790     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
5791     return AnyExt;
5792   }
5793 
5794   if (Ty.isVector()) {
5795     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
5796       if (IsFormat)
5797         return handleD16VData(B, *MRI, VData);
5798     }
5799   }
5800 
5801   return VData;
5802 }
5803 
5804 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
5805                                               MachineRegisterInfo &MRI,
5806                                               MachineIRBuilder &B,
5807                                               bool IsTyped,
5808                                               bool IsFormat) const {
5809   Register VData = MI.getOperand(1).getReg();
5810   LLT Ty = MRI.getType(VData);
5811   LLT EltTy = Ty.getScalarType();
5812   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
5813   const LLT S32 = LLT::scalar(32);
5814 
5815   VData = fixStoreSourceType(B, VData, IsFormat);
5816   castBufferRsrcArgToV4I32(MI, B, 2);
5817   Register RSrc = MI.getOperand(2).getReg();
5818 
5819   MachineMemOperand *MMO = *MI.memoperands_begin();
5820   const int MemSize = MMO->getSize().getValue();
5821 
5822   unsigned ImmOffset;
5823 
5824   // The typed intrinsics add an immediate after the registers.
5825   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
5826 
5827   // The struct intrinsic variants add one additional operand over raw.
5828   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
5829   Register VIndex;
5830   int OpOffset = 0;
5831   if (HasVIndex) {
5832     VIndex = MI.getOperand(3).getReg();
5833     OpOffset = 1;
5834   } else {
5835     VIndex = B.buildConstant(S32, 0).getReg(0);
5836   }
5837 
5838   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
5839   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
5840 
5841   unsigned Format = 0;
5842   if (IsTyped) {
5843     Format = MI.getOperand(5 + OpOffset).getImm();
5844     ++OpOffset;
5845   }
5846 
5847   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
5848 
5849   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
5850 
5851   unsigned Opc;
5852   if (IsTyped) {
5853     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
5854                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
5855   } else if (IsFormat) {
5856     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
5857                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
5858   } else {
5859     switch (MemSize) {
5860     case 1:
5861       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
5862       break;
5863     case 2:
5864       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
5865       break;
5866     default:
5867       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
5868       break;
5869     }
5870   }
5871 
5872   auto MIB = B.buildInstr(Opc)
5873     .addUse(VData)              // vdata
5874     .addUse(RSrc)               // rsrc
5875     .addUse(VIndex)             // vindex
5876     .addUse(VOffset)            // voffset
5877     .addUse(SOffset)            // soffset
5878     .addImm(ImmOffset);         // offset(imm)
5879 
5880   if (IsTyped)
5881     MIB.addImm(Format);
5882 
5883   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
5884      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
5885      .addMemOperand(MMO);
5886 
5887   MI.eraseFromParent();
5888   return true;
5889 }
5890 
5891 static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
5892                             Register VIndex, Register VOffset, Register SOffset,
5893                             unsigned ImmOffset, unsigned Format,
5894                             unsigned AuxiliaryData, MachineMemOperand *MMO,
5895                             bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
5896   auto MIB = B.buildInstr(Opc)
5897                  .addDef(LoadDstReg) // vdata
5898                  .addUse(RSrc)       // rsrc
5899                  .addUse(VIndex)     // vindex
5900                  .addUse(VOffset)    // voffset
5901                  .addUse(SOffset)    // soffset
5902                  .addImm(ImmOffset); // offset(imm)
5903 
5904   if (IsTyped)
5905     MIB.addImm(Format);
5906 
5907   MIB.addImm(AuxiliaryData)       // cachepolicy, swizzled buffer(imm)
5908       .addImm(HasVIndex ? -1 : 0) // idxen(imm)
5909       .addMemOperand(MMO);
5910 }
5911 
5912 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
5913                                              MachineRegisterInfo &MRI,
5914                                              MachineIRBuilder &B,
5915                                              bool IsFormat,
5916                                              bool IsTyped) const {
5917   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
5918   MachineMemOperand *MMO = *MI.memoperands_begin();
5919   const LLT MemTy = MMO->getMemoryType();
5920   const LLT S32 = LLT::scalar(32);
5921 
5922   Register Dst = MI.getOperand(0).getReg();
5923 
5924   Register StatusDst;
5925   int OpOffset = 0;
5926   assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
5927   bool IsTFE = MI.getNumExplicitDefs() == 2;
5928   if (IsTFE) {
5929     StatusDst = MI.getOperand(1).getReg();
5930     ++OpOffset;
5931   }
5932 
5933   castBufferRsrcArgToV4I32(MI, B, 2 + OpOffset);
5934   Register RSrc = MI.getOperand(2 + OpOffset).getReg();
5935 
5936   // The typed intrinsics add an immediate after the registers.
5937   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
5938 
5939   // The struct intrinsic variants add one additional operand over raw.
5940   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
5941   Register VIndex;
5942   if (HasVIndex) {
5943     VIndex = MI.getOperand(3 + OpOffset).getReg();
5944     ++OpOffset;
5945   } else {
5946     VIndex = B.buildConstant(S32, 0).getReg(0);
5947   }
5948 
5949   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
5950   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
5951 
5952   unsigned Format = 0;
5953   if (IsTyped) {
5954     Format = MI.getOperand(5 + OpOffset).getImm();
5955     ++OpOffset;
5956   }
5957 
5958   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
5959   unsigned ImmOffset;
5960 
5961   LLT Ty = MRI.getType(Dst);
5962   // Make addrspace 8 pointers loads into 4xs32 loads here, so the rest of the
5963   // logic doesn't have to handle that case.
5964   if (hasBufferRsrcWorkaround(Ty)) {
5965     Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0);
5966     Dst = MI.getOperand(0).getReg();
5967   }
5968   LLT EltTy = Ty.getScalarType();
5969   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
5970   const bool Unpacked = ST.hasUnpackedD16VMem();
5971 
5972   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
5973 
5974   unsigned Opc;
5975 
5976   // TODO: Support TFE for typed and narrow loads.
5977   if (IsTyped) {
5978     if (IsTFE)
5979       return false;
5980     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
5981                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
5982   } else if (IsFormat) {
5983     if (IsD16) {
5984       if (IsTFE)
5985         return false;
5986       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
5987     } else {
5988       Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
5989                   : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
5990     }
5991   } else {
5992     switch (MemTy.getSizeInBits()) {
5993     case 8:
5994       Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
5995                   : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
5996       break;
5997     case 16:
5998       Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
5999                   : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
6000       break;
6001     default:
6002       Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
6003                   : AMDGPU::G_AMDGPU_BUFFER_LOAD;
6004       break;
6005     }
6006   }
6007 
6008   if (IsTFE) {
6009     unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
6010     unsigned NumLoadDWords = NumValueDWords + 1;
6011     LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32);
6012     Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
6013     buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6014                     Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6015     if (MemTy.getSizeInBits() < 32) {
6016       Register ExtDst = B.getMRI()->createGenericVirtualRegister(S32);
6017       B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
6018       B.buildTrunc(Dst, ExtDst);
6019     } else if (NumValueDWords == 1) {
6020       B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
6021     } else {
6022       SmallVector<Register, 5> LoadElts;
6023       for (unsigned I = 0; I != NumValueDWords; ++I)
6024         LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
6025       LoadElts.push_back(StatusDst);
6026       B.buildUnmerge(LoadElts, LoadDstReg);
6027       LoadElts.truncate(NumValueDWords);
6028       B.buildMergeLikeInstr(Dst, LoadElts);
6029     }
6030   } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
6031              (IsD16 && !Ty.isVector())) {
6032     Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
6033     buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6034                     Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6035     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6036     B.buildTrunc(Dst, LoadDstReg);
6037   } else if (Unpacked && IsD16 && Ty.isVector()) {
6038     LLT UnpackedTy = Ty.changeElementSize(32);
6039     Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
6040     buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6041                     Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6042     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6043     // FIXME: G_TRUNC should work, but legalization currently fails
6044     auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
6045     SmallVector<Register, 4> Repack;
6046     for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
6047       Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
6048     B.buildMergeLikeInstr(Dst, Repack);
6049   } else {
6050     buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
6051                     AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6052   }
6053 
6054   MI.eraseFromParent();
6055   return true;
6056 }
6057 
6058 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
6059   switch (IntrID) {
6060   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
6061   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
6062   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
6063   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
6064     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
6065   case Intrinsic::amdgcn_raw_buffer_atomic_add:
6066   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
6067   case Intrinsic::amdgcn_struct_buffer_atomic_add:
6068   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
6069     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
6070   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
6071   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
6072   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
6073   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
6074     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
6075   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
6076   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
6077   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
6078   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
6079     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
6080   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
6081   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
6082   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
6083   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
6084     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
6085   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
6086   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
6087   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
6088   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
6089     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
6090   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
6091   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
6092   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
6093   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
6094     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
6095   case Intrinsic::amdgcn_raw_buffer_atomic_and:
6096   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
6097   case Intrinsic::amdgcn_struct_buffer_atomic_and:
6098   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
6099     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
6100   case Intrinsic::amdgcn_raw_buffer_atomic_or:
6101   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
6102   case Intrinsic::amdgcn_struct_buffer_atomic_or:
6103   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
6104     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
6105   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
6106   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
6107   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
6108   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
6109     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
6110   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
6111   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
6112   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
6113   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
6114     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
6115   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
6116   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
6117   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
6118   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
6119     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
6120   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
6121   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
6122   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
6123   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
6124     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
6125   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
6126   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
6127   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
6128   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
6129     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
6130   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
6131   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
6132   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
6133   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
6134     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
6135   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
6136   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
6137   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
6138   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
6139     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
6140   case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
6141   case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
6142     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
6143   default:
6144     llvm_unreachable("unhandled atomic opcode");
6145   }
6146 }
6147 
6148 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
6149                                                MachineIRBuilder &B,
6150                                                Intrinsic::ID IID) const {
6151   const bool IsCmpSwap =
6152       IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
6153       IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
6154       IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
6155       IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
6156 
6157   Register Dst = MI.getOperand(0).getReg();
6158   // Since we don't have 128-bit atomics, we don't need to handle the case of
6159   // p8 argmunents to the atomic itself
6160   Register VData = MI.getOperand(2).getReg();
6161 
6162   Register CmpVal;
6163   int OpOffset = 0;
6164 
6165   if (IsCmpSwap) {
6166     CmpVal = MI.getOperand(3).getReg();
6167     ++OpOffset;
6168   }
6169 
6170   castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset);
6171   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
6172   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
6173 
6174   // The struct intrinsic variants add one additional operand over raw.
6175   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
6176   Register VIndex;
6177   if (HasVIndex) {
6178     VIndex = MI.getOperand(4 + OpOffset).getReg();
6179     ++OpOffset;
6180   } else {
6181     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
6182   }
6183 
6184   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
6185   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
6186   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
6187 
6188   MachineMemOperand *MMO = *MI.memoperands_begin();
6189 
6190   unsigned ImmOffset;
6191   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6192 
6193   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
6194       .addDef(Dst)
6195       .addUse(VData); // vdata
6196 
6197   if (IsCmpSwap)
6198     MIB.addReg(CmpVal);
6199 
6200   MIB.addUse(RSrc)               // rsrc
6201      .addUse(VIndex)             // vindex
6202      .addUse(VOffset)            // voffset
6203      .addUse(SOffset)            // soffset
6204      .addImm(ImmOffset)          // offset(imm)
6205      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
6206      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6207      .addMemOperand(MMO);
6208 
6209   MI.eraseFromParent();
6210   return true;
6211 }
6212 
6213 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
6214 /// vector with s16 typed elements.
6215 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
6216                                       SmallVectorImpl<Register> &PackedAddrs,
6217                                       unsigned ArgOffset,
6218                                       const AMDGPU::ImageDimIntrinsicInfo *Intr,
6219                                       bool IsA16, bool IsG16) {
6220   const LLT S16 = LLT::scalar(16);
6221   const LLT V2S16 = LLT::fixed_vector(2, 16);
6222   auto EndIdx = Intr->VAddrEnd;
6223 
6224   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
6225     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
6226     if (!SrcOp.isReg())
6227       continue; // _L to _LZ may have eliminated this.
6228 
6229     Register AddrReg = SrcOp.getReg();
6230 
6231     if ((I < Intr->GradientStart) ||
6232         (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
6233         (I >= Intr->CoordStart && !IsA16)) {
6234       if ((I < Intr->GradientStart) && IsA16 &&
6235           (B.getMRI()->getType(AddrReg) == S16)) {
6236         assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
6237         // Special handling of bias when A16 is on. Bias is of type half but
6238         // occupies full 32-bit.
6239         PackedAddrs.push_back(
6240             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6241                 .getReg(0));
6242       } else {
6243         assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
6244                "Bias needs to be converted to 16 bit in A16 mode");
6245         // Handle any gradient or coordinate operands that should not be packed
6246         AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
6247         PackedAddrs.push_back(AddrReg);
6248       }
6249     } else {
6250       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
6251       // derivatives dx/dh and dx/dv are packed with undef.
6252       if (((I + 1) >= EndIdx) ||
6253           ((Intr->NumGradients / 2) % 2 == 1 &&
6254            (I == static_cast<unsigned>(Intr->GradientStart +
6255                                        (Intr->NumGradients / 2) - 1) ||
6256             I == static_cast<unsigned>(Intr->GradientStart +
6257                                        Intr->NumGradients - 1))) ||
6258           // Check for _L to _LZ optimization
6259           !MI.getOperand(ArgOffset + I + 1).isReg()) {
6260         PackedAddrs.push_back(
6261             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6262                 .getReg(0));
6263       } else {
6264         PackedAddrs.push_back(
6265             B.buildBuildVector(
6266                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
6267                 .getReg(0));
6268         ++I;
6269       }
6270     }
6271   }
6272 }
6273 
6274 /// Convert from separate vaddr components to a single vector address register,
6275 /// and replace the remaining operands with $noreg.
6276 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
6277                                      int DimIdx, int NumVAddrs) {
6278   const LLT S32 = LLT::scalar(32);
6279   (void)S32;
6280   SmallVector<Register, 8> AddrRegs;
6281   for (int I = 0; I != NumVAddrs; ++I) {
6282     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
6283     if (SrcOp.isReg()) {
6284       AddrRegs.push_back(SrcOp.getReg());
6285       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
6286     }
6287   }
6288 
6289   int NumAddrRegs = AddrRegs.size();
6290   if (NumAddrRegs != 1) {
6291     auto VAddr =
6292         B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
6293     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
6294   }
6295 
6296   for (int I = 1; I != NumVAddrs; ++I) {
6297     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
6298     if (SrcOp.isReg())
6299       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
6300   }
6301 }
6302 
6303 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
6304 ///
6305 /// Depending on the subtarget, load/store with 16-bit element data need to be
6306 /// rewritten to use the low half of 32-bit registers, or directly use a packed
6307 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
6308 /// registers.
6309 ///
6310 /// We don't want to directly select image instructions just yet, but also want
6311 /// to expose all register repacking to the legalizer/combiners. We also don't
6312 /// want a selected instruction entering RegBankSelect. In order to avoid
6313 /// defining a multitude of intermediate image instructions, directly hack on
6314 /// the intrinsic's arguments. In cases like a16 addresses, this requires
6315 /// padding now unnecessary arguments with $noreg.
     ///
     /// \p MI is mutated in place: its opcode is replaced by one of the
     /// G_AMDGPU_INTRIN_IMAGE_* opcodes, its address operands are repacked, and a
     /// trailing immediate carrying the A16/G16 flags is appended. Any code needed
     /// to repack a load result is inserted after \p MI.
     ///
     /// \returns true if \p MI was rewritten (or erased, for a no-op load with a
     /// zero dmask), false if the operand/result types are not supported.
6316 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
6317     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
6318     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
6319 
6320   const MachineFunction &MF = *MI.getMF();
6321   const unsigned NumDefs = MI.getNumExplicitDefs();
       // All operand indices taken from Intr are used relative to ArgOffset;
       // presumably this skips the defs and the intrinsic ID operand - TODO
       // confirm.
6322   const unsigned ArgOffset = NumDefs + 1;
       // A second def is the TFE status value (see the TFE handling below).
6323   bool IsTFE = NumDefs == 2;
6324   // We are only processing the operands of d16 image operations on subtargets
6325   // that use the unpacked register layout, or need to repack the TFE result.
6326 
6327   // TODO: Do we need to guard against already legalized intrinsics?
6328   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
6329       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
6330 
6331   MachineRegisterInfo *MRI = B.getMRI();
6332   const LLT S32 = LLT::scalar(32);
6333   const LLT S16 = LLT::scalar(16);
6334   const LLT V2S16 = LLT::fixed_vector(2, 16);
6335 
6336   unsigned DMask = 0;
6337   Register VData;
6338   LLT Ty;
6339 
       // VData is operand 1 when the instruction has no defs and operand 0
       // otherwise. For no-return, non-store operations VData and Ty keep their
       // default-constructed values.
6340   if (!BaseOpcode->NoReturn || BaseOpcode->Store) {
6341     VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
6342     Ty = MRI->getType(VData);
6343   }
6344 
6345   const bool IsAtomicPacked16Bit =
6346       (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
6347        BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
6348 
6349   // Check for 16 bit addresses and pack if true.
6350   LLT GradTy =
6351       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
6352   LLT AddrTy =
6353       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
       // With G16 support only opcodes that take gradients can be G16; without
       // it the type of the operand at GradientStart alone decides.
6354   const bool IsG16 =
6355       ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
6356   const bool IsA16 = AddrTy == S16;
       // The packed 16-bit atomics are deliberately not treated as D16.
6357   const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;
6358 
       // Number of data components selected by the dmask; stays 0 for atomics.
6359   int DMaskLanes = 0;
6360   if (!BaseOpcode->Atomic) {
6361     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
6362     if (BaseOpcode->Gather4) {
6363       DMaskLanes = 4;
6364     } else if (DMask != 0) {
6365       DMaskLanes = llvm::popcount(DMask);
6366     } else if (!IsTFE && !BaseOpcode->Store) {
6367       // If dmask is 0, this is a no-op load. This can be eliminated.
6368       B.buildUndef(MI.getOperand(0));
6369       MI.eraseFromParent();
6370       return true;
6371     }
6372   }
6373 
       // From here on MI is modified in place; the scope-exit object reports the
       // change to the observer on every return path below.
6374   Observer.changingInstr(MI);
6375   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
6376 
6377   const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
6378                                      : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
6379   const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
6380                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
6381   unsigned NewOpcode = LoadOpcode;
6382   if (BaseOpcode->Store)
6383     NewOpcode = StoreOpcode;
6384   else if (BaseOpcode->NoReturn)
6385     NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;
6386 
6387   // Track that we legalized this
6388   MI.setDesc(B.getTII().get(NewOpcode));
6389 
6390   // Expecting to get an error flag since TFC is on and dmask is 0. Force
6391   // dmask to be at least 1, otherwise the instruction will fail.
6392   if (IsTFE && DMask == 0) {
6393     DMask = 0x1;
6394     DMaskLanes = 1;
6395     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
6396   }
6397 
6398   if (BaseOpcode->Atomic) {
6399     Register VData0 = MI.getOperand(2).getReg();
         // NOTE(review): this local Ty shadows the function-level Ty declared
         // above; inside this block Ty refers to the type of VData0.
6400     LLT Ty = MRI->getType(VData0);
6401 
6402     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
6403     if (Ty.isVector() && !IsAtomicPacked16Bit)
6404       return false;
6405 
6406     if (BaseOpcode->AtomicX2) {
6407       Register VData1 = MI.getOperand(3).getReg();
6408       // The two values are packed in one register.
6409       LLT PackedTy = LLT::fixed_vector(2, Ty);
6410       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
6411       MI.getOperand(2).setReg(Concat.getReg(0));
6412       MI.getOperand(3).setReg(AMDGPU::NoRegister);
6413     }
6414   }
6415 
6416   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
6417 
6418   // Rewrite the addressing register layout before doing anything else.
6419   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
6420     // 16 bit gradients are supported, but are tied to the A16 control
6421     // so both gradients and addresses must be 16 bit
6422     return false;
6423   }
6424 
6425   if (IsA16 && !ST.hasA16()) {
6426     // A16 not supported
6427     return false;
6428   }
6429 
6430   const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->Sampler);
       // NOTE(review): declared unsigned but only ever used as a boolean.
6431   const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
6432 
6433   if (IsA16 || IsG16) {
6434     // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the
6435     // instructions expect VGPR_32
6436     SmallVector<Register, 4> PackedRegs;
6437 
6438     packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, IsG16);
6439 
6440     // See also below in the non-a16 branch
6441     const bool UseNSA = ST.hasNSAEncoding() &&
6442                         PackedRegs.size() >= ST.getNSAThreshold(MF) &&
6443                         (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
6444     const bool UsePartialNSA =
6445         UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;
6446 
6447     if (UsePartialNSA) {
6448       // Pack registers that would go over NSAMaxSize into last VAddr register
6449       LLT PackedAddrTy =
6450           LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16);
6451       auto Concat = B.buildConcatVectors(
6452           PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
6453       PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
6454       PackedRegs.resize(NSAMaxSize);
6455     } else if (!UseNSA && PackedRegs.size() > 1) {
           // Without NSA all packed dwords are concatenated into one register.
6456       LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
6457       auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
6458       PackedRegs[0] = Concat.getReg(0);
6459       PackedRegs.resize(1);
6460     }
6461 
         // Write the packed registers into the leading register address operands
         // and replace the remaining register address operands with $noreg.
6462     const unsigned NumPacked = PackedRegs.size();
6463     for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
6464       MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
6465       if (!SrcOp.isReg()) {
6466         assert(SrcOp.isImm() && SrcOp.getImm() == 0);
6467         continue;
6468       }
6469 
6470       assert(SrcOp.getReg() != AMDGPU::NoRegister);
6471 
6472       if (I - Intr->VAddrStart < NumPacked)
6473         SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
6474       else
6475         SrcOp.setReg(AMDGPU::NoRegister);
6476     }
6477   } else {
6478     // If the register allocator cannot place the address registers contiguously
6479     // without introducing moves, then using the non-sequential address encoding
6480     // is always preferable, since it saves VALU instructions and is usually a
6481     // wash in terms of code size or even better.
6482     //
6483     // However, we currently have no way of hinting to the register allocator
6484     // that MIMG addresses should be placed contiguously when it is possible to
6485     // do so, so force non-NSA for the common 2-address case as a heuristic.
6486     //
6487     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
6488     // allocation when possible.
6489     //
6490     // Partial NSA is allowed on GFX11+ where the final register is a contiguous
6491     // set of the remaining addresses.
6492     const bool UseNSA = ST.hasNSAEncoding() &&
6493                         CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
6494                         (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
6495     const bool UsePartialNSA =
6496         UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
6497 
6498     if (UsePartialNSA) {
6499       convertImageAddrToPacked(B, MI,
6500                                ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
6501                                Intr->NumVAddrs - NSAMaxSize + 1);
6502     } else if (!UseNSA && Intr->NumVAddrs > 1) {
6503       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
6504                                Intr->NumVAddrs);
6505     }
6506   }
6507 
       // Append an immediate operand recording the address mode: bit 0 is set
       // for A16 and bit 1 for G16.
6508   int Flags = 0;
6509   if (IsA16)
6510     Flags |= 1;
6511   if (IsG16)
6512     Flags |= 2;
6513   MI.addOperand(MachineOperand::CreateImm(Flags));
6514 
6515   if (BaseOpcode->NoReturn) { // No TFE for stores?
6516     // TODO: Handle dmask trim
6517     if (!Ty.isVector() || !IsD16)
6518       return true;
6519 
         // Only vector D16 data reaches this point; let handleD16VData pick the
         // register layout and swap the operand if it produced a new register.
6520     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
6521     if (RepackedReg != VData) {
6522       MI.getOperand(1).setReg(RepackedReg);
6523     }
6524 
6525     return true;
6526   }
6527 
       // Everything below deals with the result of operations that return data.
6528   Register DstReg = MI.getOperand(0).getReg();
6529   const LLT EltTy = Ty.getScalarType();
6530   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
6531 
6532   // Confirm that the return type is large enough for the dmask specified
6533   if (NumElts < DMaskLanes)
6534     return false;
6535 
6536   if (NumElts > 4 || DMaskLanes > 4)
6537     return false;
6538 
6539   // Image atomic instructions are using DMask to specify how many bits
6540   // input/output data will have. 32-bits (s32, v2s16) or 64-bits (s64, v4s16).
6541   // DMaskLanes for image atomic has default value '0'.
6542   // We must be sure that atomic variants (especially packed) will not be
6543   // truncated from v2s16 or v4s16 to s16 type.
6544   //
6545   // ChangeElementCount will be needed for image load where Ty is always scalar.
6546   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
6547   const LLT AdjustedTy =
6548       DMaskLanes == 0
6549           ? Ty
6550           : Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
6551 
6552   // The raw dword aligned data component of the load. The only legal cases
6553   // where this matters should be when using the packed D16 format, for
6554   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
6555   LLT RoundedTy;
6556 
6557   // S32 vector to cover all data, plus TFE result element.
6558   LLT TFETy;
6559 
6560   // Register type to use for each loaded component. Will be S32 or V2S16.
6561   LLT RegTy;
6562 
6563   if (IsD16 && ST.hasUnpackedD16VMem()) {
         // Unpacked D16: every 16-bit component occupies its own 32-bit register.
6564     RoundedTy =
6565         LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
6566     TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
6567     RegTy = S32;
6568   } else {
         // Otherwise round the data size up to a whole number of dwords.
6569     unsigned EltSize = EltTy.getSizeInBits();
6570     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
6571     unsigned RoundedSize = 32 * RoundedElts;
6572     RoundedTy = LLT::scalarOrVector(
6573         ElementCount::getFixed(RoundedSize / EltSize), EltSize);
6574     TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
6575     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
6576   }
6577 
6578   // The return type does not need adjustment.
6579   // TODO: Should we change s16 case to s32 or <2 x s16>?
6580   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
6581     return true;
6582 
6583   Register Dst1Reg;
6584 
6585   // Insert after the instruction.
6586   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
6587 
6588   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
6589   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
6590   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
6591   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
6592 
6593   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
6594 
6595   MI.getOperand(0).setReg(NewResultReg);
6596 
6597   // In the IR, TFE is supposed to be used with a 2 element struct return
6598   // type. The instruction really returns these two values in one contiguous
6599   // register, with one additional dword beyond the loaded data. Rewrite the
6600   // return type to use a single register result.
6601 
6602   if (IsTFE) {
6603     Dst1Reg = MI.getOperand(1).getReg();
6604     if (MRI->getType(Dst1Reg) != S32)
6605       return false;
6606 
6607     // TODO: Make sure the TFE operand bit is set.
6608     MI.removeOperand(1);
6609 
6610     // Handle the easy case that requires no repack instructions.
6611     if (Ty == S32) {
6612       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
6613       return true;
6614     }
6615   }
6616 
6617   // Now figure out how to copy the new result register back into the old
6618   // result.
       // Every slot starts out as Dst1Reg so that, with TFE, the final unmerge
       // result lands directly in the TFE destination; the data slots are
       // replaced with fresh registers (or NewResultReg) below.
6619   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
6620 
6621   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
6622 
6623   if (ResultNumRegs == 1) {
6624     assert(!IsTFE);
6625     ResultRegs[0] = NewResultReg;
6626   } else {
6627     // We have to repack into a new vector of some kind.
6628     for (int I = 0; I != NumDataRegs; ++I)
6629       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
6630     B.buildUnmerge(ResultRegs, NewResultReg);
6631 
6632     // Drop the final TFE element to get the data part. The TFE result is
6633     // directly written to the right place already.
6634     if (IsTFE)
6635       ResultRegs.resize(NumDataRegs);
6636   }
6637 
6638   // For an s16 scalar result, we form an s32 result with a truncate regardless
6639   // of packed vs. unpacked.
6640   if (IsD16 && !Ty.isVector()) {
6641     B.buildTrunc(DstReg, ResultRegs[0]);
6642     return true;
6643   }
6644 
6645   // Avoid a build/concat_vector of 1 entry.
6646   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
6647     B.buildBitcast(DstReg, ResultRegs[0]);
6648     return true;
6649   }
6650 
6651   assert(Ty.isVector());
6652 
6653   if (IsD16) {
6654     // For packed D16 results with TFE enabled, all the data components are
6655     // S32. Cast back to the expected type.
6656     //
6657     // TODO: We don't really need to use load s32 elements. We would only need one
6658     // cast for the TFE result if a multiple of v2s16 was used.
6659     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
6660       for (Register &Reg : ResultRegs)
6661         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
6662     } else if (ST.hasUnpackedD16VMem()) {
           // Unpacked layout: each 32-bit register is truncated to one s16.
6663       for (Register &Reg : ResultRegs)
6664         Reg = B.buildTrunc(S16, Reg).getReg(0);
6665     }
6666   }
6667 
       // Appends NumElts copies of a single undef register of type Ty to
       // ResultRegs. Only a count of exactly 0 is a no-op.
6668   auto padWithUndef = [&](LLT Ty, int NumElts) {
6669     if (NumElts == 0)
6670       return;
6671     Register Undef = B.buildUndef(Ty).getReg(0);
6672     for (int I = 0; I != NumElts; ++I)
6673       ResultRegs.push_back(Undef);
6674   };
6675 
6676   // Pad out any elements eliminated due to the dmask.
6677   LLT ResTy = MRI->getType(ResultRegs[0]);
6678   if (!ResTy.isVector()) {
6679     padWithUndef(ResTy, NumElts - ResultRegs.size());
6680     B.buildBuildVector(DstReg, ResultRegs);
6681     return true;
6682   }
6683 
6684   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
6685   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
6686 
6687   // Deal with the one annoying legal case.
6688   const LLT V3S16 = LLT::fixed_vector(3, 16);
6689   if (Ty == V3S16) {
6690     if (IsTFE) {
6691       if (ResultRegs.size() == 1) {
6692         NewResultReg = ResultRegs[0];
6693       } else if (ResultRegs.size() == 2) {
6694         LLT V4S16 = LLT::fixed_vector(4, 16);
6695         NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
6696       } else {
6697         return false;
6698       }
6699     }
6700 
         // Shrink or grow NewResultReg to the element count of DstReg.
6701     if (MRI->getType(DstReg).getNumElements() <
6702         MRI->getType(NewResultReg).getNumElements()) {
6703       B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
6704     } else {
6705       B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
6706     }
6707     return true;
6708   }
6709 
6710   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
6711   B.buildConcatVectors(DstReg, ResultRegs);
6712   return true;
6713 }
6714 
6715 bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
6716                                               MachineInstr &MI) const {
6717   MachineIRBuilder &B = Helper.MIRBuilder;
6718   GISelChangeObserver &Observer = Helper.Observer;
6719 
6720   Register OrigDst = MI.getOperand(0).getReg();
6721   Register Dst;
6722   LLT Ty = B.getMRI()->getType(OrigDst);
6723   unsigned Size = Ty.getSizeInBits();
6724   MachineFunction &MF = B.getMF();
6725   unsigned Opc = 0;
6726   if (Size < 32 && ST.hasScalarSubwordLoads()) {
6727     assert(Size == 8 || Size == 16);
6728     Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
6729                     : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
6730     // The 8-bit and 16-bit scalar buffer load instructions have 32-bit
6731     // destination register.
6732     Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
6733   } else {
6734     Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
6735     Dst = OrigDst;
6736   }
6737 
6738   Observer.changingInstr(MI);
6739 
6740   // Handle needing to s.buffer.load() a p8 value.
6741   if (hasBufferRsrcWorkaround(Ty)) {
6742     Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0);
6743     B.setInsertPt(B.getMBB(), MI);
6744   }
6745   if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
6746     Ty = getBitcastRegisterType(Ty);
6747     Helper.bitcastDst(MI, Ty, 0);
6748     B.setInsertPt(B.getMBB(), MI);
6749   }
6750 
6751   // FIXME: We don't really need this intermediate instruction. The intrinsic
6752   // should be fixed to have a memory operand. Since it's readnone, we're not
6753   // allowed to add one.
6754   MI.setDesc(B.getTII().get(Opc));
6755   MI.removeOperand(1); // Remove intrinsic ID
6756 
6757   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
6758   const unsigned MemSize = (Size + 7) / 8;
6759   const Align MemAlign = B.getDataLayout().getABITypeAlign(
6760       getTypeForLLT(Ty, MF.getFunction().getContext()));
6761   MachineMemOperand *MMO = MF.getMachineMemOperand(
6762       MachinePointerInfo(),
6763       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
6764           MachineMemOperand::MOInvariant,
6765       MemSize, MemAlign);
6766   MI.addMemOperand(MF, MMO);
6767   if (Dst != OrigDst) {
6768     MI.getOperand(0).setReg(Dst);
6769     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6770     B.buildTrunc(OrigDst, Dst);
6771   }
6772 
6773   // If we don't have 96-bit result scalar loads, widening to 128-bit should
6774   // always be legal. We may need to restore this to a 96-bit result if it turns
6775   // out this needs to be converted to a vector load during RegBankSelect.
6776   if (!isPowerOf2_32(Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) {
6777     if (Ty.isVector())
6778       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
6779     else
6780       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
6781   }
6782 
6783   Observer.changedInstr(MI);
6784   return true;
6785 }
6786 
6787 // TODO: Move to selection
6788 bool AMDGPULegalizerInfo::legalizeTrap(MachineInstr &MI,
6789                                        MachineRegisterInfo &MRI,
6790                                        MachineIRBuilder &B) const {
6791   if (!ST.isTrapHandlerEnabled() ||
6792       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
6793     return legalizeTrapEndpgm(MI, MRI, B);
6794 
6795   return ST.supportsGetDoorbellID() ?
6796          legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B);
6797 }
6798 
6799 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
6800     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6801   const DebugLoc &DL = MI.getDebugLoc();
6802   MachineBasicBlock &BB = B.getMBB();
6803   MachineFunction *MF = BB.getParent();
6804 
6805   if (BB.succ_empty() && std::next(MI.getIterator()) == BB.end()) {
6806     BuildMI(BB, BB.end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
6807       .addImm(0);
6808     MI.eraseFromParent();
6809     return true;
6810   }
6811 
6812   // We need a block split to make the real endpgm a terminator. We also don't
6813   // want to break phis in successor blocks, so we can't just delete to the
6814   // end of the block.
6815   BB.splitAt(MI, false /*UpdateLiveIns*/);
6816   MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
6817   MF->push_back(TrapBB);
6818   BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
6819     .addImm(0);
6820   BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
6821     .addMBB(TrapBB);
6822 
6823   BB.addSuccessor(TrapBB);
6824   MI.eraseFromParent();
6825   return true;
6826 }
6827 
6828 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
6829     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6830   MachineFunction &MF = B.getMF();
6831   const LLT S64 = LLT::scalar(64);
6832 
6833   Register SGPR01(AMDGPU::SGPR0_SGPR1);
6834   // For code object version 5, queue_ptr is passed through implicit kernarg.
6835   if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
6836       AMDGPU::AMDHSA_COV5) {
6837     AMDGPUTargetLowering::ImplicitParameter Param =
6838         AMDGPUTargetLowering::QUEUE_PTR;
6839     uint64_t Offset =
6840         ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
6841 
6842     Register KernargPtrReg = MRI.createGenericVirtualRegister(
6843         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
6844 
6845     if (!loadInputValue(KernargPtrReg, B,
6846                         AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
6847       return false;
6848 
6849     // TODO: can we be smarter about machine pointer info?
6850     MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
6851     MachineMemOperand *MMO = MF.getMachineMemOperand(
6852         PtrInfo,
6853         MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
6854             MachineMemOperand::MOInvariant,
6855         LLT::scalar(64), commonAlignment(Align(64), Offset));
6856 
6857     // Pointer address
6858     Register LoadAddr = MRI.createGenericVirtualRegister(
6859         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
6860     B.buildPtrAdd(LoadAddr, KernargPtrReg,
6861                   B.buildConstant(LLT::scalar(64), Offset).getReg(0));
6862     // Load address
6863     Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
6864     B.buildCopy(SGPR01, Temp);
6865     B.buildInstr(AMDGPU::S_TRAP)
6866         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
6867         .addReg(SGPR01, RegState::Implicit);
6868     MI.eraseFromParent();
6869     return true;
6870   }
6871 
6872   // Pass queue pointer to trap handler as input, and insert trap instruction
6873   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
6874   Register LiveIn =
6875     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
6876   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
6877     return false;
6878 
6879   B.buildCopy(SGPR01, LiveIn);
6880   B.buildInstr(AMDGPU::S_TRAP)
6881       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
6882       .addReg(SGPR01, RegState::Implicit);
6883 
6884   MI.eraseFromParent();
6885   return true;
6886 }
6887 
6888 bool AMDGPULegalizerInfo::legalizeTrapHsa(MachineInstr &MI,
6889                                           MachineRegisterInfo &MRI,
6890                                           MachineIRBuilder &B) const {
6891   // We need to simulate the 's_trap 2' instruction on targets that run in
6892   // PRIV=1 (where it is treated as a nop).
6893   if (ST.hasPrivEnabledTrap2NopBug()) {
6894     ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI,
6895                                            MI.getDebugLoc());
6896     MI.eraseFromParent();
6897     return true;
6898   }
6899 
6900   B.buildInstr(AMDGPU::S_TRAP)
6901       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
6902   MI.eraseFromParent();
6903   return true;
6904 }
6905 
6906 bool AMDGPULegalizerInfo::legalizeDebugTrap(MachineInstr &MI,
6907                                             MachineRegisterInfo &MRI,
6908                                             MachineIRBuilder &B) const {
6909   // Is non-HSA path or trap-handler disabled? Then, report a warning
6910   // accordingly
6911   if (!ST.isTrapHandlerEnabled() ||
6912       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
6913     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
6914                                      "debugtrap handler not supported",
6915                                      MI.getDebugLoc(), DS_Warning);
6916     LLVMContext &Ctx = B.getMF().getFunction().getContext();
6917     Ctx.diagnose(NoTrap);
6918   } else {
6919     // Insert debug-trap instruction
6920     B.buildInstr(AMDGPU::S_TRAP)
6921         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
6922   }
6923 
6924   MI.eraseFromParent();
6925   return true;
6926 }
6927 
     /// Replace a BVH intersect-ray intrinsic with
     /// G_AMDGPU_INTRIN_BVH_INTERSECT_RAY.
     ///
     /// The node pointer, ray extent, origin, direction and inverse direction
     /// operands are repacked into the address layout chosen below (separate NSA
     /// operands or one merged vector), and the selected MIMG opcode is attached
     /// as an immediate. \p MI is erased on success.
     ///
     /// \returns false, after emitting an "unsupported" diagnostic, if the
     /// subtarget lacks the GFX10_A encoding; true otherwise.
6928 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
6929                                                MachineIRBuilder &B) const {
6930   MachineRegisterInfo &MRI = *B.getMRI();
6931   const LLT S16 = LLT::scalar(16);
6932   const LLT S32 = LLT::scalar(32);
6933   const LLT V2S16 = LLT::fixed_vector(2, 16);
6934   const LLT V3S32 = LLT::fixed_vector(3, 32);
6935 
       // Operand 0 is the result; operands 2-7 are the intrinsic arguments.
6936   Register DstReg = MI.getOperand(0).getReg();
6937   Register NodePtr = MI.getOperand(2).getReg();
6938   Register RayExtent = MI.getOperand(3).getReg();
6939   Register RayOrigin = MI.getOperand(4).getReg();
6940   Register RayDir = MI.getOperand(5).getReg();
6941   Register RayInvDir = MI.getOperand(6).getReg();
6942   Register TDescr = MI.getOperand(7).getReg();
6943 
6944   if (!ST.hasGFX10_AEncoding()) {
6945     DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
6946                                         "intrinsic not supported on subtarget",
6947                                         MI.getDebugLoc());
6948     B.getMF().getFunction().getContext().diagnose(BadIntrin);
6949     return false;
6950   }
6951 
6952   const bool IsGFX11 = AMDGPU::isGFX11(ST);
6953   const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
6954   const bool IsGFX12Plus = AMDGPU::isGFX12Plus(ST);
       // A16 is inferred from a 16-bit ray direction element type, and the 64-bit
       // variant from a 64-bit node pointer.
6955   const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
6956   const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
6957   const unsigned NumVDataDwords = 4;
       // Address dword count: 8 for A16 and 11 otherwise, plus one more when the
       // node pointer is 64 bits wide.
6958   const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
       // On GFX11+ the address is counted as 4 (A16) or 5 operands; elsewhere it
       // is counted in dwords.
6959   const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
6960   const bool UseNSA =
6961       IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
6962 
       // Indexed as BaseOpcodes[Is64][IsA16].
6963   const unsigned BaseOpcodes[2][2] = {
6964       {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
6965       {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
6966        AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
6967   int Opcode;
6968   if (UseNSA) {
6969     Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
6970                                    IsGFX12Plus ? AMDGPU::MIMGEncGfx12
6971                                    : IsGFX11   ? AMDGPU::MIMGEncGfx11NSA
6972                                                : AMDGPU::MIMGEncGfx10NSA,
6973                                    NumVDataDwords, NumVAddrDwords);
6974   } else {
6975     assert(!IsGFX12Plus);
6976     Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
6977                                    IsGFX11 ? AMDGPU::MIMGEncGfx11Default
6978                                            : AMDGPU::MIMGEncGfx10Default,
6979                                    NumVDataDwords, NumVAddrDwords);
6980   }
6981   assert(Opcode != -1);
6982 
       // Address operands of the new instruction, in emission order.
6983   SmallVector<Register, 12> Ops;
6984   if (UseNSA && IsGFX11Plus) {
         // GFX11+ NSA layout: node pointer, ray extent, then 3 x s32 vectors.
         // This helper rebuilds Src as a <3 x s32> vector from its three
         // unmerged lanes and appends it to Ops.
6985     auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
6986       auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
6987       auto Merged = B.buildMergeLikeInstr(
6988           V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
6989       Ops.push_back(Merged.getReg(0));
6990     };
6991 
6992     Ops.push_back(NodePtr);
6993     Ops.push_back(RayExtent);
6994     packLanes(RayOrigin);
6995 
6996     if (IsA16) {
           // Direction and inverse direction share one <3 x s32> operand: dword
           // i is a <2 x s16> with the inverse direction component as element 0
           // and the direction component as element 1, bitcast to s32.
6997       auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
6998       auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
6999       auto MergedDir = B.buildMergeLikeInstr(
7000           V3S32,
7001           {B.buildBitcast(
7002                 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
7003                                                    UnmergeRayDir.getReg(0)}))
7004                .getReg(0),
7005            B.buildBitcast(
7006                 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
7007                                                    UnmergeRayDir.getReg(1)}))
7008                .getReg(0),
7009            B.buildBitcast(
7010                 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
7011                                                    UnmergeRayDir.getReg(2)}))
7012                .getReg(0)});
7013       Ops.push_back(MergedDir.getReg(0));
7014     } else {
7015       packLanes(RayDir);
7016       packLanes(RayInvDir);
7017     }
7018   } else {
         // All other cases: every address dword becomes its own s32 entry.
7019     if (Is64) {
7020       auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
7021       Ops.push_back(Unmerge.getReg(0));
7022       Ops.push_back(Unmerge.getReg(1));
7023     } else {
7024       Ops.push_back(NodePtr);
7025     }
7026     Ops.push_back(RayExtent);
7027 
         // Splits Src into three s32 lanes and appends each one to Ops.
7028     auto packLanes = [&Ops, &S32, &B](Register Src) {
7029       auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
7030       Ops.push_back(Unmerge.getReg(0));
7031       Ops.push_back(Unmerge.getReg(1));
7032       Ops.push_back(Unmerge.getReg(2));
7033     };
7034 
7035     packLanes(RayOrigin);
7036     if (IsA16) {
           // The six 16-bit components are packed pairwise, in order, into three
           // dwords: {dir[0], dir[1]}, {dir[2], invdir[0]}, {invdir[1], invdir[2]}.
7037       auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
7038       auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
7039       Register R1 = MRI.createGenericVirtualRegister(S32);
7040       Register R2 = MRI.createGenericVirtualRegister(S32);
7041       Register R3 = MRI.createGenericVirtualRegister(S32);
7042       B.buildMergeLikeInstr(R1,
7043                             {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
7044       B.buildMergeLikeInstr(
7045           R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
7046       B.buildMergeLikeInstr(
7047           R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
7048       Ops.push_back(R1);
7049       Ops.push_back(R2);
7050       Ops.push_back(R3);
7051     } else {
7052       packLanes(RayDir);
7053       packLanes(RayInvDir);
7054     }
7055   }
7056 
7057   if (!UseNSA) {
7058     // Build a single vector containing all the operands so far prepared.
7059     LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
7060     Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
7061     Ops.clear();
7062     Ops.push_back(MergedOps);
7063   }
7064 
       // New instruction layout: def, MIMG opcode immediate, address operands,
       // texture descriptor, A16 flag immediate. Memory operands are cloned from
       // MI.
7065   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
7066     .addDef(DstReg)
7067     .addImm(Opcode);
7068 
7069   for (Register R : Ops) {
7070     MIB.addUse(R);
7071   }
7072 
7073   MIB.addUse(TDescr)
7074      .addImm(IsA16 ? 1 : 0)
7075      .cloneMemRefs(MI);
7076 
7077   MI.eraseFromParent();
7078   return true;
7079 }
7080 
7081 bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
7082                                                MachineIRBuilder &B) const {
7083   unsigned Opc;
7084   int RoundMode = MI.getOperand(2).getImm();
7085 
7086   if (RoundMode == (int)RoundingMode::TowardPositive)
7087     Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
7088   else if (RoundMode == (int)RoundingMode::TowardNegative)
7089     Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
7090   else
7091     return false;
7092 
7093   B.buildInstr(Opc)
7094       .addDef(MI.getOperand(0).getReg())
7095       .addUse(MI.getOperand(1).getReg());
7096 
7097   MI.eraseFromParent();
7098 
7099   return true;
7100 }
7101 
7102 bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI,
7103                                             MachineIRBuilder &B) const {
7104   const SITargetLowering *TLI = ST.getTargetLowering();
7105   Register StackPtr = TLI->getStackPointerRegisterToSaveRestore();
7106   Register DstReg = MI.getOperand(0).getReg();
7107   B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
7108   MI.eraseFromParent();
7109   return true;
7110 }
7111 
7112 bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI,
7113                                          MachineIRBuilder &B) const {
7114   // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
7115   if (!ST.hasArchitectedSGPRs())
7116     return false;
7117   LLT S32 = LLT::scalar(32);
7118   Register DstReg = MI.getOperand(0).getReg();
7119   auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8));
7120   auto LSB = B.buildConstant(S32, 25);
7121   auto Width = B.buildConstant(S32, 5);
7122   B.buildUbfx(DstReg, TTMP8, LSB, Width);
7123   MI.eraseFromParent();
7124   return true;
7125 }
7126 
// Encoded hardware-register field selector for the MODE register. It is
// passed as the immediate operand of the s_getreg / s_setreg intrinsics when
// reading or writing the first 32-bit half of the 64-bit FP environment.
// NOTE(review): the trailing (0, 23) arguments are presumably the field's bit
// offset and width -- confirm against HwregEncoding::encode.
static constexpr unsigned FPEnvModeBitField =
    AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 23);

// Encoded hardware-register field selector for the TRAPSTS register. It is
// passed as the immediate operand of the s_getreg / s_setreg intrinsics when
// reading or writing the second 32-bit half of the 64-bit FP environment.
// NOTE(review): the trailing (0, 5) arguments are presumably the field's bit
// offset and width -- confirm against HwregEncoding::encode.
static constexpr unsigned FPEnvTrapBitField =
    AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_TRAPSTS, 0, 5);
7132 
7133 bool AMDGPULegalizerInfo::legalizeGetFPEnv(MachineInstr &MI,
7134                                            MachineRegisterInfo &MRI,
7135                                            MachineIRBuilder &B) const {
7136   Register Src = MI.getOperand(0).getReg();
7137   if (MRI.getType(Src) != S64)
7138     return false;
7139 
7140   auto ModeReg =
7141       B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
7142                        /*HasSideEffects=*/true, /*isConvergent=*/false)
7143           .addImm(FPEnvModeBitField);
7144   auto TrapReg =
7145       B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
7146                        /*HasSideEffects=*/true, /*isConvergent=*/false)
7147           .addImm(FPEnvTrapBitField);
7148   B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
7149   MI.eraseFromParent();
7150   return true;
7151 }
7152 
7153 bool AMDGPULegalizerInfo::legalizeSetFPEnv(MachineInstr &MI,
7154                                            MachineRegisterInfo &MRI,
7155                                            MachineIRBuilder &B) const {
7156   Register Src = MI.getOperand(0).getReg();
7157   if (MRI.getType(Src) != S64)
7158     return false;
7159 
7160   auto Unmerge = B.buildUnmerge({S32, S32}, MI.getOperand(0));
7161   B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
7162                    /*HasSideEffects=*/true, /*isConvergent=*/false)
7163       .addImm(static_cast<int16_t>(FPEnvModeBitField))
7164       .addReg(Unmerge.getReg(0));
7165   B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
7166                    /*HasSideEffects=*/true, /*isConvergent=*/false)
7167       .addImm(static_cast<int16_t>(FPEnvTrapBitField))
7168       .addReg(Unmerge.getReg(1));
7169   MI.eraseFromParent();
7170   return true;
7171 }
7172 
7173 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
7174                                             MachineInstr &MI) const {
7175   MachineIRBuilder &B = Helper.MIRBuilder;
7176   MachineRegisterInfo &MRI = *B.getMRI();
7177 
7178   // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
7179   auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
7180   switch (IntrID) {
7181   case Intrinsic::amdgcn_if:
7182   case Intrinsic::amdgcn_else: {
7183     MachineInstr *Br = nullptr;
7184     MachineBasicBlock *UncondBrTarget = nullptr;
7185     bool Negated = false;
7186     if (MachineInstr *BrCond =
7187             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
7188       const SIRegisterInfo *TRI
7189         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
7190 
7191       Register Def = MI.getOperand(1).getReg();
7192       Register Use = MI.getOperand(3).getReg();
7193 
7194       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
7195 
7196       if (Negated)
7197         std::swap(CondBrTarget, UncondBrTarget);
7198 
7199       B.setInsertPt(B.getMBB(), BrCond->getIterator());
7200       if (IntrID == Intrinsic::amdgcn_if) {
7201         B.buildInstr(AMDGPU::SI_IF)
7202           .addDef(Def)
7203           .addUse(Use)
7204           .addMBB(UncondBrTarget);
7205       } else {
7206         B.buildInstr(AMDGPU::SI_ELSE)
7207             .addDef(Def)
7208             .addUse(Use)
7209             .addMBB(UncondBrTarget);
7210       }
7211 
7212       if (Br) {
7213         Br->getOperand(0).setMBB(CondBrTarget);
7214       } else {
7215         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
7216         // since we're swapping branch targets it needs to be reinserted.
7217         // FIXME: IRTranslator should probably not do this
7218         B.buildBr(*CondBrTarget);
7219       }
7220 
7221       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
7222       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
7223       MI.eraseFromParent();
7224       BrCond->eraseFromParent();
7225       return true;
7226     }
7227 
7228     return false;
7229   }
7230   case Intrinsic::amdgcn_loop: {
7231     MachineInstr *Br = nullptr;
7232     MachineBasicBlock *UncondBrTarget = nullptr;
7233     bool Negated = false;
7234     if (MachineInstr *BrCond =
7235             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
7236       const SIRegisterInfo *TRI
7237         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
7238 
7239       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
7240       Register Reg = MI.getOperand(2).getReg();
7241 
7242       if (Negated)
7243         std::swap(CondBrTarget, UncondBrTarget);
7244 
7245       B.setInsertPt(B.getMBB(), BrCond->getIterator());
7246       B.buildInstr(AMDGPU::SI_LOOP)
7247         .addUse(Reg)
7248         .addMBB(UncondBrTarget);
7249 
7250       if (Br)
7251         Br->getOperand(0).setMBB(CondBrTarget);
7252       else
7253         B.buildBr(*CondBrTarget);
7254 
7255       MI.eraseFromParent();
7256       BrCond->eraseFromParent();
7257       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
7258       return true;
7259     }
7260 
7261     return false;
7262   }
7263   case Intrinsic::amdgcn_addrspacecast_nonnull:
7264     return legalizeAddrSpaceCast(MI, MRI, B);
7265   case Intrinsic::amdgcn_make_buffer_rsrc:
7266     return legalizePointerAsRsrcIntrin(MI, MRI, B);
7267   case Intrinsic::amdgcn_kernarg_segment_ptr:
7268     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
7269       // This only makes sense to call in a kernel, so just lower to null.
7270       B.buildConstant(MI.getOperand(0).getReg(), 0);
7271       MI.eraseFromParent();
7272       return true;
7273     }
7274 
7275     return legalizePreloadedArgIntrin(
7276       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
7277   case Intrinsic::amdgcn_implicitarg_ptr:
7278     return legalizeImplicitArgPtr(MI, MRI, B);
7279   case Intrinsic::amdgcn_workitem_id_x:
7280     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
7281                                        AMDGPUFunctionArgInfo::WORKITEM_ID_X);
7282   case Intrinsic::amdgcn_workitem_id_y:
7283     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
7284                                        AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
7285   case Intrinsic::amdgcn_workitem_id_z:
7286     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
7287                                        AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
7288   case Intrinsic::amdgcn_workgroup_id_x:
7289     return legalizePreloadedArgIntrin(MI, MRI, B,
7290                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
7291   case Intrinsic::amdgcn_workgroup_id_y:
7292     return legalizePreloadedArgIntrin(MI, MRI, B,
7293                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
7294   case Intrinsic::amdgcn_workgroup_id_z:
7295     return legalizePreloadedArgIntrin(MI, MRI, B,
7296                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
7297   case Intrinsic::amdgcn_wave_id:
7298     return legalizeWaveID(MI, B);
7299   case Intrinsic::amdgcn_lds_kernel_id:
7300     return legalizePreloadedArgIntrin(MI, MRI, B,
7301                                       AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
7302   case Intrinsic::amdgcn_dispatch_ptr:
7303     return legalizePreloadedArgIntrin(MI, MRI, B,
7304                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
7305   case Intrinsic::amdgcn_queue_ptr:
7306     return legalizePreloadedArgIntrin(MI, MRI, B,
7307                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
7308   case Intrinsic::amdgcn_implicit_buffer_ptr:
7309     return legalizePreloadedArgIntrin(
7310       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
7311   case Intrinsic::amdgcn_dispatch_id:
7312     return legalizePreloadedArgIntrin(MI, MRI, B,
7313                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
7314   case Intrinsic::r600_read_ngroups_x:
7315     // TODO: Emit error for hsa
7316     return legalizeKernargMemParameter(MI, B,
7317                                        SI::KernelInputOffsets::NGROUPS_X);
7318   case Intrinsic::r600_read_ngroups_y:
7319     return legalizeKernargMemParameter(MI, B,
7320                                        SI::KernelInputOffsets::NGROUPS_Y);
7321   case Intrinsic::r600_read_ngroups_z:
7322     return legalizeKernargMemParameter(MI, B,
7323                                        SI::KernelInputOffsets::NGROUPS_Z);
7324   case Intrinsic::r600_read_local_size_x:
7325     // TODO: Could insert G_ASSERT_ZEXT from s16
7326     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X);
7327   case Intrinsic::r600_read_local_size_y:
7328     // TODO: Could insert G_ASSERT_ZEXT from s16
7329     return legalizeKernargMemParameter(MI, B,  SI::KernelInputOffsets::LOCAL_SIZE_Y);
7330     // TODO: Could insert G_ASSERT_ZEXT from s16
7331   case Intrinsic::r600_read_local_size_z:
7332     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z);
7333   case Intrinsic::r600_read_global_size_x:
7334     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X);
7335   case Intrinsic::r600_read_global_size_y:
7336     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y);
7337   case Intrinsic::r600_read_global_size_z:
7338     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z);
7339   case Intrinsic::amdgcn_fdiv_fast:
7340     return legalizeFDIVFastIntrin(MI, MRI, B);
7341   case Intrinsic::amdgcn_is_shared:
7342     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
7343   case Intrinsic::amdgcn_is_private:
7344     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
7345   case Intrinsic::amdgcn_wavefrontsize: {
7346     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
7347     MI.eraseFromParent();
7348     return true;
7349   }
7350   case Intrinsic::amdgcn_s_buffer_load:
7351     return legalizeSBufferLoad(Helper, MI);
7352   case Intrinsic::amdgcn_raw_buffer_store:
7353   case Intrinsic::amdgcn_raw_ptr_buffer_store:
7354   case Intrinsic::amdgcn_struct_buffer_store:
7355   case Intrinsic::amdgcn_struct_ptr_buffer_store:
7356     return legalizeBufferStore(MI, MRI, B, false, false);
7357   case Intrinsic::amdgcn_raw_buffer_store_format:
7358   case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
7359   case Intrinsic::amdgcn_struct_buffer_store_format:
7360   case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
7361     return legalizeBufferStore(MI, MRI, B, false, true);
7362   case Intrinsic::amdgcn_raw_tbuffer_store:
7363   case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
7364   case Intrinsic::amdgcn_struct_tbuffer_store:
7365   case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
7366     return legalizeBufferStore(MI, MRI, B, true, true);
7367   case Intrinsic::amdgcn_raw_buffer_load:
7368   case Intrinsic::amdgcn_raw_ptr_buffer_load:
7369   case Intrinsic::amdgcn_raw_atomic_buffer_load:
7370   case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
7371   case Intrinsic::amdgcn_struct_buffer_load:
7372   case Intrinsic::amdgcn_struct_ptr_buffer_load:
7373     return legalizeBufferLoad(MI, MRI, B, false, false);
7374   case Intrinsic::amdgcn_raw_buffer_load_format:
7375   case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
7376   case Intrinsic::amdgcn_struct_buffer_load_format:
7377   case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
7378     return legalizeBufferLoad(MI, MRI, B, true, false);
7379   case Intrinsic::amdgcn_raw_tbuffer_load:
7380   case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
7381   case Intrinsic::amdgcn_struct_tbuffer_load:
7382   case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
7383     return legalizeBufferLoad(MI, MRI, B, true, true);
7384   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
7385   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
7386   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
7387   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
7388   case Intrinsic::amdgcn_raw_buffer_atomic_add:
7389   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
7390   case Intrinsic::amdgcn_struct_buffer_atomic_add:
7391   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
7392   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
7393   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
7394   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
7395   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
7396   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
7397   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
7398   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
7399   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
7400   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
7401   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
7402   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
7403   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
7404   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
7405   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
7406   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
7407   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
7408   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
7409   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
7410   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
7411   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
7412   case Intrinsic::amdgcn_raw_buffer_atomic_and:
7413   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
7414   case Intrinsic::amdgcn_struct_buffer_atomic_and:
7415   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
7416   case Intrinsic::amdgcn_raw_buffer_atomic_or:
7417   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
7418   case Intrinsic::amdgcn_struct_buffer_atomic_or:
7419   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
7420   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
7421   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
7422   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
7423   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
7424   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
7425   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
7426   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
7427   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
7428   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
7429   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
7430   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
7431   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
7432   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
7433   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
7434   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
7435   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
7436   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
7437   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
7438   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
7439   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
7440   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
7441   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
7442   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
7443   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
7444   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
7445   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
7446   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
7447   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
7448     return legalizeBufferAtomic(MI, B, IntrID);
7449   case Intrinsic::amdgcn_rsq_clamp:
7450     return legalizeRsqClampIntrinsic(MI, MRI, B);
7451   case Intrinsic::amdgcn_image_bvh_intersect_ray:
7452     return legalizeBVHIntrinsic(MI, B);
7453   case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
7454   case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
7455   case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
7456   case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
7457   case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
7458   case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
7459   case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
7460   case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
7461     Register Index = MI.getOperand(5).getReg();
7462     LLT S32 = LLT::scalar(32);
7463     if (MRI.getType(Index) != S32)
7464       MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
7465     return true;
7466   }
7467   case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
7468   case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
7469   case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
7470     Register Index = MI.getOperand(7).getReg();
7471     LLT S32 = LLT::scalar(32);
7472     if (MRI.getType(Index) != S32)
7473       MI.getOperand(7).setReg(B.buildAnyExt(S32, Index).getReg(0));
7474     return true;
7475   }
7476   case Intrinsic::amdgcn_fmed3: {
7477     GISelChangeObserver &Observer = Helper.Observer;
7478 
7479     // FIXME: This is to workaround the inability of tablegen match combiners to
7480     // match intrinsics in patterns.
7481     Observer.changingInstr(MI);
7482     MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
7483     MI.removeOperand(1);
7484     Observer.changedInstr(MI);
7485     return true;
7486   }
7487   case Intrinsic::amdgcn_readlane:
7488   case Intrinsic::amdgcn_writelane:
7489   case Intrinsic::amdgcn_readfirstlane:
7490   case Intrinsic::amdgcn_permlane16:
7491   case Intrinsic::amdgcn_permlanex16:
7492   case Intrinsic::amdgcn_permlane64:
7493     return legalizeLaneOp(Helper, MI, IntrID);
7494   default: {
7495     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
7496             AMDGPU::getImageDimIntrinsicInfo(IntrID))
7497       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
7498     return true;
7499   }
7500   }
7501 
7502   return true;
7503 }
7504