//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUMemoryUtils.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);
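
// Being a plain cl::opt, the flag can be flipped on the tool command line; an
// illustrative (hypothetical) invocation:
//   llc -global-isel -amdgpu-global-isel-new-legality input.ll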

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements up to the next power of two.
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
}
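// Worked example: <3 x s16> has three elements; Log2_32_Ceil(3) == 2, so the
// result is <4 x s16>. Likewise <5 x s32> rounds up to <8 x s32>.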

// Round the number of bits up to the next power of two.
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}
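// Worked example: s48 becomes s64 (Log2_32_Ceil(48) == 6), and s96 becomes
// s128.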

/// \returns true if this is an odd-sized vector which should be widened by
/// adding an additional element. This is mostly to handle <3 x s16> ->
/// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (!Ty.isVector())
      return false;

    const LLT EltTy = Ty.getElementType();
    const unsigned EltSize = EltTy.getSizeInBits();
    return Ty.getNumElements() % 2 != 0 &&
           EltSize > 1 && EltSize < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}
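// For instance, <3 x s16> matches (odd count, 16-bit elements, 48 bits
// total), while <6 x s16> fails the odd-count check and <3 x s32> fails the
// EltSize < 32 check.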

static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::pair(TypeIdx,
                     LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::pair(TypeIdx, LLT::scalarOrVector(
                                  ElementCount::getFixed(NewNumElts), EltTy));
  };
}
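// Worked example: <6 x s32> is 192 bits, so Pieces = 3 and
// NewNumElts = (6 + 1) / 3 = 2, producing 64-bit <2 x s32> pieces; <3 x s32>
// (96 bits) likewise maps to <2 x s32>.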

// Increase the number of vector elements so the total size reaches the next
// multiple of 32 bits.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
  };
}
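// Worked example: <3 x s16> is 48 bits, so NextMul32 = 2 dwords and
// NewNumElts = (64 + 15) / 16 = 4, giving <4 x s16>; <5 x s8> (40 bits)
// becomes <8 x s8>.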

// Increase the number of vector elements to reach the next legal RegClass.
static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const unsigned NumElts = Ty.getNumElements();
    const unsigned EltSize = Ty.getElementType().getSizeInBits();
    const unsigned MaxNumElts = MaxRegisterSize / EltSize;

    assert(EltSize == 32 || EltSize == 64);
    assert(Ty.getSizeInBits() < MaxRegisterSize);

    unsigned NewNumElts;
    // Find the nearest legal RegClass that is larger than the current type.
    for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
      if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize))
        break;
    }
    return std::pair(TypeIdx,
                     LLT::fixed_vector(NewNumElts, Ty.getElementType()));
  };
}
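// E.g. <13 x s32> is 416 bits; assuming no SGPR class exists for 416, 448 or
// 480 bits, the loop stops at the 512-bit class, yielding <16 x s32>.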

static LLT getBufferRsrcScalarType(const LLT Ty) {
  if (!Ty.isVector())
    return LLT::scalar(128);
  const ElementCount NumElems = Ty.getElementCount();
  return LLT::vector(NumElems, LLT::scalar(128));
}

static LLT getBufferRsrcRegisterType(const LLT Ty) {
  if (!Ty.isVector())
    return LLT::fixed_vector(4, LLT::scalar(32));
  const unsigned NumElems = Ty.getElementCount().getFixedValue();
  return LLT::fixed_vector(NumElems * 4, LLT::scalar(32));
}
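// So a single p8 resource is modeled as s128 in scalar form and <4 x s32> in
// register form, while e.g. <2 x p8> maps to <2 x s128> and <8 x s32>.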

static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  if (Size <= 32) {
    // <2 x s8> -> s16
    // <4 x s8> -> s32
    return LLT::scalar(Size);
  }

  return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
}
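// Above 32 bits the result is a vector of dwords: e.g. <4 x s16> (64 bits)
// and <8 x s8> (64 bits) both map to <2 x s32>.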

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return std::pair(TypeIdx, getBitcastRegisterType(Ty));
  };
}

static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    assert(Size % 32 == 0);
    return std::pair(
        TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(const GCNSubtarget &ST, unsigned Size) {
  return ((ST.useRealTrue16Insts() && Size == 16) || Size % 32 == 0) &&
         Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

// TODO: replace all uses of isRegisterType with isRegisterClassType
static bool isRegisterType(const GCNSubtarget &ST, LLT Ty) {
  if (!isRegisterSize(ST, Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(const GCNSubtarget &ST,
                                        unsigned TypeIdx) {
  return [=, &ST](const LegalityQuery &Query) {
    return isRegisterType(ST, Query.Types[TypeIdx]);
  };
}

// RegisterType that doesn't have a corresponding RegClass.
// TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
// should be removed.
static LegalityPredicate isIllegalRegisterType(const GCNSubtarget &ST,
                                               unsigned TypeIdx) {
  return [=, &ST](const LegalityQuery &Query) {
    LLT Ty = Query.Types[TypeIdx];
    return isRegisterType(ST, Ty) &&
           !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits());
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

constexpr LLT S1 = LLT::scalar(1);
constexpr LLT S8 = LLT::scalar(8);
constexpr LLT S16 = LLT::scalar(16);
constexpr LLT S32 = LLT::scalar(32);
constexpr LLT F32 = LLT::float32();
constexpr LLT S64 = LLT::scalar(64);
constexpr LLT F64 = LLT::float64();
constexpr LLT S96 = LLT::scalar(96);
constexpr LLT S128 = LLT::scalar(128);
constexpr LLT S160 = LLT::scalar(160);
constexpr LLT S192 = LLT::scalar(192);
constexpr LLT S224 = LLT::scalar(224);
constexpr LLT S256 = LLT::scalar(256);
constexpr LLT S512 = LLT::scalar(512);
constexpr LLT S1024 = LLT::scalar(1024);
constexpr LLT MaxScalar = LLT::scalar(MaxRegisterSize);

constexpr LLT V2S8 = LLT::fixed_vector(2, 8);
constexpr LLT V2S16 = LLT::fixed_vector(2, 16);
constexpr LLT V4S16 = LLT::fixed_vector(4, 16);
constexpr LLT V6S16 = LLT::fixed_vector(6, 16);
constexpr LLT V8S16 = LLT::fixed_vector(8, 16);
constexpr LLT V10S16 = LLT::fixed_vector(10, 16);
constexpr LLT V12S16 = LLT::fixed_vector(12, 16);
constexpr LLT V16S16 = LLT::fixed_vector(16, 16);

constexpr LLT V2F16 = LLT::fixed_vector(2, LLT::float16());
constexpr LLT V2BF16 = V2F16; // FIXME

constexpr LLT V2S32 = LLT::fixed_vector(2, 32);
constexpr LLT V3S32 = LLT::fixed_vector(3, 32);
constexpr LLT V4S32 = LLT::fixed_vector(4, 32);
constexpr LLT V5S32 = LLT::fixed_vector(5, 32);
constexpr LLT V6S32 = LLT::fixed_vector(6, 32);
constexpr LLT V7S32 = LLT::fixed_vector(7, 32);
constexpr LLT V8S32 = LLT::fixed_vector(8, 32);
constexpr LLT V9S32 = LLT::fixed_vector(9, 32);
constexpr LLT V10S32 = LLT::fixed_vector(10, 32);
constexpr LLT V11S32 = LLT::fixed_vector(11, 32);
constexpr LLT V12S32 = LLT::fixed_vector(12, 32);
constexpr LLT V16S32 = LLT::fixed_vector(16, 32);
constexpr LLT V32S32 = LLT::fixed_vector(32, 32);

constexpr LLT V2S64 = LLT::fixed_vector(2, 64);
constexpr LLT V3S64 = LLT::fixed_vector(3, 64);
constexpr LLT V4S64 = LLT::fixed_vector(4, 64);
constexpr LLT V5S64 = LLT::fixed_vector(5, 64);
constexpr LLT V6S64 = LLT::fixed_vector(6, 64);
constexpr LLT V7S64 = LLT::fixed_vector(7, 64);
constexpr LLT V8S64 = LLT::fixed_vector(8, 64);
constexpr LLT V16S64 = LLT::fixed_vector(16, 64);

constexpr LLT V2S128 = LLT::fixed_vector(2, 128);
constexpr LLT V4S128 = LLT::fixed_vector(4, 128);

constexpr std::initializer_list<LLT> AllScalarTypes = {
    S32, S64, S96, S128, S160, S192, S224, S256, S512, S1024};

constexpr std::initializer_list<LLT> AllS16Vectors{
    V2S16, V4S16, V6S16, V8S16, V10S16, V12S16, V16S16, V2S128, V4S128};

constexpr std::initializer_list<LLT> AllS32Vectors = {
    V2S32, V3S32,  V4S32,  V5S32,  V6S32,  V7S32, V8S32,
    V9S32, V10S32, V11S32, V12S32, V16S32, V32S32};

constexpr std::initializer_list<LLT> AllS64Vectors = {
    V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

constexpr std::initializer_list<LLT> AllVectors{
    V2S16,  V4S16,  V6S16,  V8S16,  V10S16, V12S16, V16S16, V2S128,
    V4S128, V2S32,  V3S32,  V4S32,  V5S32,  V6S32,  V7S32,  V8S32,
    V9S32,  V10S32, V11S32, V12S32, V16S32, V32S32, V2S64,  V3S64,
    V4S64,  V5S64,  V6S64,  V7S64,  V8S64,  V16S64};

// Checks whether a type is in the list of legal register types.
static bool isRegisterClassType(const GCNSubtarget &ST, LLT Ty) {
  if (Ty.isPointerOrPointerVector())
    Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits()));

  return is_contained(AllS32Vectors, Ty) || is_contained(AllS64Vectors, Ty) ||
         is_contained(AllScalarTypes, Ty) ||
         (ST.useRealTrue16Insts() && Ty == S16) ||
         is_contained(AllS16Vectors, Ty);
}
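// Pointers are checked by size: e.g. p1 (a 64-bit global pointer) is treated
// as s64 and found in AllScalarTypes, while s16 is only accepted on
// subtargets with real true16 instructions.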

static LegalityPredicate isRegisterClassType(const GCNSubtarget &ST,
                                             unsigned TypeIdx) {
  return [&ST, TypeIdx](const LegalityQuery &Query) {
    return isRegisterClassType(ST, Query.Types[TypeIdx]);
  };
}

// If we have a truncating store or an extending load with a data size larger
// than 32-bits, we need to reduce to a 32-bit type.
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad, bool IsAtomic) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return ST.enableFlatScratch() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
  case AMDGPUAS::BUFFER_RESOURCE:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // FIXME: Flat addresses may contextually need to be split to 32-bit parts
    // if they may alias scratch depending on the subtarget.  This needs to be
    // moved to custom handling to use addressMayBeAccessedAsPrivate
    return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
  }
}
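// For example, a global load may be up to 512 bits wide (RegBankSelect can
// still split it later), while a global store is capped at 128 bits and an
// LDS access at 64 bits unless 128-bit DS operations are enabled.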

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
  uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // Do not handle extending vector loads.
  if (Ty.isVector() && MemSize != RegSize)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough, but
  // we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
                                    Query.MMODescrs[0].Ordering !=
                                        AtomicOrdering::NotAtomic))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}
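// E.g. an extending load of s8 from global memory into s32 passes every check
// above, while a <2 x s16> result backed by only 16 bits of memory is
// rejected as an extending vector load.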

// The newer buffer intrinsic forms take their resource arguments as
// pointers in address space 8, aka s128 values. However, in order to not break
// SelectionDAG, the underlying operations have to continue to take v4i32
// arguments. Therefore, we convert resource pointers (or vectors of them)
// to integer values here.
static bool hasBufferRsrcWorkaround(const LLT Ty) {
  if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
    return true;
  if (Ty.isVector()) {
    const LLT ElemTy = Ty.getElementType();
    return hasBufferRsrcWorkaround(ElemTy);
  }
  return false;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Ty.isPointerVector())
    return true;
  if (Size <= 64)
    return false;
  // Address space 8 pointers get their own workaround.
  if (hasBufferRsrcWorkaround(Ty))
    return false;
  if (!Ty.isVector())
    return true;

  unsigned EltSize = Ty.getScalarSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST,
                             const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(ST, Ty) && isLoadStoreSizeLegal(ST, Query) &&
         !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
}

/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const LLT MemTy) {
  const unsigned MemSizeInBits = MemTy.getSizeInBits();
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(ST, Ty))
    return true;

  // Don't try to handle bitcasting vector ext loads for now.
  return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
         (Size <= 32 || isRegisterSize(ST, Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}

/// Return true if we should legalize a load by widening an odd-sized memory
/// access up to the alignment. Note this is the case where the memory access
/// itself changes, not the size of the result register.
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
                            uint64_t AlignInBits, unsigned AddrSpace,
                            unsigned Opcode) {
  unsigned SizeInBits = MemoryTy.getSizeInBits();
  // We don't want to widen cases that are naturally legal.
  if (isPowerOf2_32(SizeInBits))
    return false;

  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
  // end up widening these for a scalar load during RegBankSelect, if we don't
  // have 96-bit scalar loads.
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
    return false;

  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
    return false;

  // A load is known dereferenceable up to the alignment, so it's legal to widen
  // to it.
  //
  // TODO: Could check dereferenceable for less aligned cases.
  unsigned RoundedSize = NextPowerOf2(SizeInBits);
  if (AlignInBits < RoundedSize)
    return false;

  // Do not widen if it would introduce a slow unaligned load.
  const SITargetLowering *TLI = ST.getTargetLowering();
  unsigned Fast = 0;
  return TLI->allowsMisalignedMemoryAccessesImpl(
             RoundedSize, AddrSpace, Align(AlignInBits / 8),
             MachineMemOperand::MOLoad, &Fast) &&
         Fast;
}
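// E.g. a 96-bit load that is 128-bit aligned, on a subtarget without dwordx3
// load/stores, is widened to a 128-bit load: the extra bytes are known
// dereferenceable because of the alignment.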

static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
                            unsigned Opcode) {
  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
    return false;

  return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
                         Query.MMODescrs[0].AlignInBits,
                         Query.Types[1].getAddressSpace(), Opcode);
}

/// Mutates IR (typically a load instruction) to use a <4 x s32> as the initial
/// type of the operand `Idx` and then to transform it to a `p8` via bitcasts
/// and inttoptr. In addition, handle vectors of p8. Returns the new type.
static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
                                   MachineRegisterInfo &MRI, unsigned Idx) {
  MachineOperand &MO = MI.getOperand(Idx);

  const LLT PointerTy = MRI.getType(MO.getReg());

  // Paranoidly prevent us from doing this multiple times.
  if (!hasBufferRsrcWorkaround(PointerTy))
    return PointerTy;

  const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
  const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
  if (!PointerTy.isVector()) {
    // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
    const unsigned NumParts = PointerTy.getSizeInBits() / 32;
    const LLT S32 = LLT::scalar(32);

    Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
    std::array<Register, 4> VectorElems;
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    for (unsigned I = 0; I < NumParts; ++I)
      VectorElems[I] =
          B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
    B.buildMergeValues(MO, VectorElems);
    MO.setReg(VectorReg);
    return VectorTy;
  }
  Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
  auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
  B.buildIntToPtr(MO, Scalar);
  MO.setReg(BitcastReg);

  return VectorTy;
}

/// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
/// the form in which the value must be in order to be passed to the low-level
/// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
/// needed in order to account for the fact that we can't define a register
/// class for s128 without breaking SelectionDAG.
static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) {
  MachineRegisterInfo &MRI = *B.getMRI();
  const LLT PointerTy = MRI.getType(Pointer);
  const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
  const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);

  if (!PointerTy.isVector()) {
    // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
    SmallVector<Register, 4> PointerParts;
    const unsigned NumParts = PointerTy.getSizeInBits() / 32;
    auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
    for (unsigned I = 0; I < NumParts; ++I)
      PointerParts.push_back(Unmerged.getReg(I));
    return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
  }
  Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
  return B.buildBitcast(VectorTy, Scalar).getReg(0);
}
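// Illustrative MIR for the scalar case (virtual register names are
// hypothetical):
//   %e0:_(s32), %e1:_(s32), %e2:_(s32), %e3:_(s32) = G_UNMERGE_VALUES %rsrc:_(p8)
//   %vec:_(<4 x s32>) = G_BUILD_VECTOR %e0:_(s32), %e1:_(s32), %e2:_(s32), %e3:_(s32)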

static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B,
                                     unsigned Idx) {
  MachineOperand &MO = MI.getOperand(Idx);

  const LLT PointerTy = B.getMRI()->getType(MO.getReg());
  // Paranoidly prevent us from doing this multiple times.
  if (!hasBufferRsrcWorkaround(PointerTy))
    return;
  MO.setReg(castBufferRsrcToV4I32(MO.getReg(), B));
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
  const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
  const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
  const LLT BufferStridedPtr =
      GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  // s1 for VCC branches, s32 for SCC branches.
  getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
      .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
      .legalFor(AllS32Vectors)
      .legalFor(AllS64Vectors)
      .legalFor(AddrSpaces64)
      .legalFor(AddrSpaces32)
      .legalFor(AddrSpaces128)
      .legalIf(isPointer(0))
      .clampScalar(0, S16, S256)
      .widenScalarToNextPow2(0, 32)
      .clampMaxNumElements(0, S32, 16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .scalarize(0);
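
  // Worked example of the chain above: a G_PHI of <3 x s16> is in none of the
  // legalFor lists, matches isSmallOddVector, and is widened by oneMoreElement
  // to the legal <4 x s16>.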

  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
    // Full set of gfx9 features.
    if (ST.hasScalarAddSub64()) {
      getActionDefinitionsBuilder({G_ADD, G_SUB})
          .legalFor({S64, S32, S16, V2S16})
          .clampMaxNumElementsStrict(0, S16, 2)
          .scalarize(0)
          .minScalar(0, S16)
          .widenScalarToNextMultipleOf(0, 32)
          .maxScalar(0, S32);
    } else {
      getActionDefinitionsBuilder({G_ADD, G_SUB})
          .legalFor({S32, S16, V2S16})
          .clampMaxNumElementsStrict(0, S16, 2)
          .scalarize(0)
          .minScalar(0, S16)
          .widenScalarToNextMultipleOf(0, 32)
          .maxScalar(0, S32);
    }

    if (ST.hasScalarSMulU64()) {
      getActionDefinitionsBuilder(G_MUL)
          .legalFor({S64, S32, S16, V2S16})
          .clampMaxNumElementsStrict(0, S16, 2)
          .scalarize(0)
          .minScalar(0, S16)
          .widenScalarToNextMultipleOf(0, 32)
          .custom();
    } else {
      getActionDefinitionsBuilder(G_MUL)
          .legalFor({S32, S16, V2S16})
          .clampMaxNumElementsStrict(0, S16, 2)
          .scalarize(0)
          .minScalar(0, S16)
          .widenScalarToNextMultipleOf(0, 32)
          .custom();
    }
    assert(ST.hasMad64_32());

    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
      .legalFor({S32, S16, V2S16}) // Clamp modifier
      .minScalarOrElt(0, S16)
      .clampMaxNumElementsStrict(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32)
      .lower();
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB})
      .legalFor({S32, S16})
      .minScalar(0, S16)
      .widenScalarToNextMultipleOf(0, 32)
      .maxScalar(0, S32)
      .scalarize(0);

    getActionDefinitionsBuilder(G_MUL)
      .legalFor({S32, S16})
      .scalarize(0)
      .minScalar(0, S16)
      .widenScalarToNextMultipleOf(0, 32)
      .custom();
    assert(ST.hasMad64_32());

    // Technically the saturating operations require clamp bit support, but this
    // was introduced at the same time as 16-bit operations.
    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
      .legalFor({S32, S16}) // Clamp modifier
      .minScalar(0, S16)
      .scalarize(0)
      .widenScalarToNextPow2(0, 16)
      .lower();

    // We're just lowering this, but it helps get a better result to try to
    // coerce to the desired type first.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB})
      .legalFor({S32})
      .widenScalarToNextMultipleOf(0, 32)
      .clampScalar(0, S32, S32)
      .scalarize(0);

    auto &Mul = getActionDefinitionsBuilder(G_MUL)
      .legalFor({S32})
      .scalarize(0)
      .minScalar(0, S32)
      .widenScalarToNextMultipleOf(0, 32);

    if (ST.hasMad64_32())
      Mul.custom();
    else
      Mul.maxScalar(0, S32);

    if (ST.hasIntClamp()) {
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .legalFor({S32}) // Clamp modifier.
        .scalarize(0)
        .minScalarOrElt(0, S32)
        .lower();
    } else {
      // Clamp bit support was added in VI, along with 16-bit operations.
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .minScalar(0, S32)
        .scalarize(0)
        .lower();
    }

    // FIXME: DAG expansion gets better results. The widening uses the smaller
    // range values and goes for the min/max lowering directly.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(
      {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
      .customFor({S32, S64})
      .clampScalar(0, S32, S64)
      .widenScalarToNextPow2(0, 32)
      .scalarize(0);

  auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
                   .legalFor({S32})
                   .maxScalar(0, S32);

  if (ST.hasVOP3PInsts()) {
    Mulh
      .clampMaxNumElements(0, S8, 2)
      .lowerFor({V2S8});
  }

  Mulh
    .scalarize(0)
    .lower();

  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
      .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
      .clampScalar(0, S32, S64)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .fewerElementsIf(
          all(vectorWiderThan(0, 64), scalarOrEltNarrowerThan(0, 64)),
          fewerEltsToSize64Vector(0))
      .widenScalarToNextPow2(0)
      .scalarize(0);

  getActionDefinitionsBuilder(
      {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
      .legalFor({{S32, S1}, {S32, S32}})
      .clampScalar(0, S32, S32)
      .scalarize(0);

  getActionDefinitionsBuilder(G_BITCAST)
      // Don't worry about the size constraint.
      .legalIf(all(isRegisterClassType(ST, 0), isRegisterClassType(ST, 1)))
      .lower();

  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .legalIf(isPointer(0))
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
      .legalIf(isRegisterClassType(ST, 0))
      // s1 and s16 are special cases because they have legal operations on
      // them, but don't really occupy registers in the normal way.
      .legalFor({S1, S16})
      .clampNumElements(0, V16S32, V32S32)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampScalarOrElt(0, S32, MaxScalar)
      .widenScalarToNextPow2(0, 32)
      .clampMaxNumElements(0, S32, 16);

  getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});

  // If the amount is divergent, we have to do a wave reduction to get the
  // maximum value, so this is expanded during RegBankSelect.
  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
    .legalFor({{PrivatePtr, S32}});

  getActionDefinitionsBuilder(G_STACKSAVE)
    .customFor({PrivatePtr});
  getActionDefinitionsBuilder(G_STACKRESTORE)
    .legalFor({PrivatePtr});

  getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV}).customFor({S64});

  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customIf(typeIsNot(0, PrivatePtr));

  getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
      G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  if (ST.hasPackedFP32Ops()) {
    FPOpActions.legalFor({V2S32});
    FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder(
      {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM, G_FMINNUM_IEEE,
       G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElementsStrict(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElementsStrict(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S16})
      .customFor({S32, S64})
      .scalarize(0)
      .unsupported();
    getActionDefinitionsBuilder(G_FFLOOR)
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);

    getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
      .legalFor({{S32, S32}, {S64, S32}, {S16, S16}})
      .scalarize(0)
      .maxScalarIf(typeIs(0, S16), 1, S16)
      .clampScalar(1, S32, S32)
      .lower();

    getActionDefinitionsBuilder(G_FFREXP)
      .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .customFor({S32, S64, S16})
      .scalarize(0)
      .unsupported();

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }

    getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
      .legalFor({{S32, S32}, {S64, S32}})
      .scalarize(0)
      .clampScalar(0, S32, S64)
      .clampScalar(1, S32, S32)
      .lower();

    getActionDefinitionsBuilder(G_FFREXP)
      .customFor({{S32, S32}, {S64, S32}})
      .scalarize(0)
      .minScalar(0, S32)
      .clampScalar(1, S32, S32)
      .lower();
  }

  auto &FPTruncActions = getActionDefinitionsBuilder(G_FPTRUNC);
  if (ST.hasCvtPkF16F32Inst()) {
    FPTruncActions.legalFor({{S32, S64}, {S16, S32}, {V2S16, V2S32}})
        .clampMaxNumElements(0, S16, 2);
  } else {
    FPTruncActions.legalFor({{S32, S64}, {S16, S32}});
  }
  FPTruncActions.scalarize(0).lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
    .scalarize(0);

  auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
  if (ST.has16BitInsts()) {
    FSubActions
      // Use actual fsub instruction
      .legalFor({S32, S16})
      // Must use fadd + fneg
      .lowerFor({S64, V2S16});
  } else {
    FSubActions
      // Use actual fsub instruction
      .legalFor({S32})
      // Must use fadd + fneg
      .lowerFor({S64, S16, V2S16});
  }

  FSubActions
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});
  FMad.scalarize(0)
      .lower();

  auto &FRem = getActionDefinitionsBuilder(G_FREM);
  if (ST.has16BitInsts()) {
    FRem.customFor({S16, S32, S64});
  } else {
    FRem.minScalar(0, S32)
        .customFor({S32, S64});
  }
  FRem.scalarize(0);

  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to infinite loop
    // in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
                    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
                    .lowerIf(typeIs(1, S1))
                    .customFor({{S32, S64}, {S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .minScalar(0, S32)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S32}, {S64, S64}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .widenScalarToNextPow2(0, 32)
       .scalarize(0)
       .lower();

  getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
      .clampScalar(0, S16, S64)
      .scalarize(0)
      .lower();

  getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
      .legalFor({S16, S32})
      .scalarize(0)
      .lower();

  // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
      .scalarize(0)
      .lower();

  getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
      .clampScalar(0, S16, S64)
      .scalarize(0)
      .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(
        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
        .legalFor({S16, S32, S64})
        .clampScalar(0, S16, S64)
        .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder(
        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
        .legalFor({S32, S64})
        .clampScalar(0, S32, S64)
        .scalarize(0);
  } else {
    getActionDefinitionsBuilder(
        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
        .legalFor({S32})
        .customFor({S64})
        .clampScalar(0, S32, S64)
        .scalarize(0);
  }

  getActionDefinitionsBuilder(G_PTR_ADD)
      .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr})
      .legalIf(all(isPointer(0), sameSize(0, 1)))
      .scalarize(0)
      .scalarSameSizeAs(1, 0);

  getActionDefinitionsBuilder(G_PTRMASK)
    .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
    .scalarSameSizeAs(1, 0)
    .scalarize(0);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the
    // output, so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  auto &FCmpBuilder =
      getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct(
          {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);

  if (ST.hasSALUFloatInsts())
    FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});

  FCmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  getActionDefinitionsBuilder(G_FPOWI)
    .clampScalar(0, MinScalarFPTy, S32)
    .lower();

  auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2});
  Log2Ops.customFor({S32});
  if (ST.has16BitInsts())
    Log2Ops.legalFor({S16});
  else
    Log2Ops.customFor({S16});
  Log2Ops.scalarize(0)
    .lower();

  auto &LogOps =
      getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
  LogOps.customFor({S32, S16});
  LogOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .widenScalarToNextPow2(1, 32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32);

  // If no 16-bit instruction is available, lower into different instructions.
  if (ST.has16BitInsts())
    getActionDefinitionsBuilder(G_IS_FPCLASS)
        .legalForCartesianProduct({S1}, FPTypes16)
        .widenScalarToNextPow2(1)
        .scalarize(0)
        .lower();
  else
    getActionDefinitionsBuilder(G_IS_FPCLASS)
        .legalForCartesianProduct({S1}, FPTypesBase)
        .lowerFor({S1, S16})
        .widenScalarToNextPow2(1)
        .scalarize(0)
        .lower();

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .custom();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF)
      .legalFor({{S32, S32}, {S32, S64}})
      .customIf(scalarNarrowerThan(1, 32))
      .clampScalar(0, S32, S32)
      .clampScalar(1, S32, S64)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32)
      .widenScalarToNextPow2(1, 32);

  getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF)
      .legalFor({{S32, S32}, {S32, S64}})
      .clampScalar(0, S32, S32)
      .clampScalar(1, S32, S64)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32)
      .widenScalarToNextPow2(1, 32);

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32, S64})
    .clampScalar(0, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElementsStrict(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16, V2S16})
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
      // List the common cases
      .legalForCartesianProduct(AddrSpaces64, {S64})
      .legalForCartesianProduct(AddrSpaces32, {S32})
      .scalarize(0)
      // Accept any address space as long as the size matches
      .legalIf(sameSize(0, 1))
      .widenScalarIf(smallerThan(1, 0),
                     [](const LegalityQuery &Query) {
                       return std::pair(
                           1, LLT::scalar(Query.Types[0].getSizeInBits()));
                     })
      .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) {
        return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
      // List the common cases
      .legalForCartesianProduct(AddrSpaces64, {S64})
      .legalForCartesianProduct(AddrSpaces32, {S32})
      .scalarize(0)
      // Accept any address space as long as the size matches
      .legalIf(sameSize(0, 1))
      .widenScalarIf(smallerThan(0, 1),
                     [](const LegalityQuery &Query) {
                       return std::pair(
                           0, LLT::scalar(Query.Types[1].getSizeInBits()));
                     })
      .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
        return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
                                      Query.MMODescrs[0].Ordering !=
                                          AtomicOrdering::NotAtomic))
      return true;

    // Catch weird-sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    return false;
  };
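  // E.g. a 96-bit (3-dword) global load must split on subtargets without
  // dwordx3 load/stores, and a 512-bit global store must split because stores
  // in that address space are capped at 128 bits by maxSizeForAddrSpace.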
1447 
1448   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1449   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1450   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1451 
1452   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1453   // LDS
1454   // TODO: Unsupported flat for SI.
1455 
1456   for (unsigned Op : {G_LOAD, G_STORE}) {
1457     const bool IsStore = Op == G_STORE;
1458 
1459     auto &Actions = getActionDefinitionsBuilder(Op);
1460     // Explicitly list some common cases.
1461     // TODO: Does this help compile time at all?
1462     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1463                                       {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1464                                       {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1465                                       {S64, GlobalPtr, S64, GlobalAlign32},
1466                                       {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1467                                       {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1468                                       {S32, GlobalPtr, S8, GlobalAlign8},
1469                                       {S32, GlobalPtr, S16, GlobalAlign16},
1470 
1471                                       {S32, LocalPtr, S32, 32},
1472                                       {S64, LocalPtr, S64, 32},
1473                                       {V2S32, LocalPtr, V2S32, 32},
1474                                       {S32, LocalPtr, S8, 8},
1475                                       {S32, LocalPtr, S16, 16},
1476                                       {V2S16, LocalPtr, S32, 32},
1477 
1478                                       {S32, PrivatePtr, S32, 32},
1479                                       {S32, PrivatePtr, S8, 8},
1480                                       {S32, PrivatePtr, S16, 16},
1481                                       {V2S16, PrivatePtr, S32, 32},
1482 
1483                                       {S32, ConstantPtr, S32, GlobalAlign32},
1484                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1485                                       {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1486                                       {S64, ConstantPtr, S64, GlobalAlign32}});
1488     Actions.legalIf(
1489       [=](const LegalityQuery &Query) -> bool {
1490         return isLoadStoreLegal(ST, Query);
1491       });
1492 
1493     // The custom pointers (fat pointers, buffer resources) don't work with load
1494     // and store at this level. Fat pointers should have been lowered to
1495     // intrinsics before the translation to MIR.
1496     Actions.unsupportedIf(
1497         typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
1498 
1499     // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
1500     // ptrtoint. This is needed to account for the fact that we can't have i128
1501     // as a register class for SelectionDAG reasons.
1502     Actions.customIf([=](const LegalityQuery &Query) -> bool {
1503       return hasBufferRsrcWorkaround(Query.Types[0]);
1504     });
1505 
1506     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1507     // 64-bits.
1508     //
1509     // TODO: Should generalize bitcast action into coerce, which will also cover
1510     // inserting addrspacecasts.
1511     Actions.customIf(typeIs(1, Constant32Ptr));
1512 
1513     // Turn vectors with illegal element types into something easier to deal
1514     // with. These will ultimately produce 32-bit scalar shifts to extract the
1515     // parts anyway.
1516     //
1517     // For odd 16-bit element vectors, prefer to split those into pieces with
1518     // 16-bit vector parts.
1519     Actions.bitcastIf(
1520       [=](const LegalityQuery &Query) -> bool {
1521         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1522                                           Query.MMODescrs[0].MemoryTy);
1523       }, bitcastToRegisterType(0));
1524 
1525     if (!IsStore) {
1526       // Widen suitably aligned loads by loading extra bytes. The standard
1527       // legalization actions can't properly express widening memory operands.
1528       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1529         return shouldWidenLoad(ST, Query, G_LOAD);
1530       });
1531     }
1532 
1533     // FIXME: load/store narrowing should be moved to lower action
1534     Actions
1535         .narrowScalarIf(
1536             [=](const LegalityQuery &Query) -> bool {
1537               return !Query.Types[0].isVector() &&
1538                      needToSplitMemOp(Query, Op == G_LOAD);
1539             },
1540             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1541               const LLT DstTy = Query.Types[0];
1542               const LLT PtrTy = Query.Types[1];
1543 
1544               const unsigned DstSize = DstTy.getSizeInBits();
1545               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1546 
1547               // Split extloads.
1548               if (DstSize > MemSize)
1549                 return std::pair(0, LLT::scalar(MemSize));
1550 
1551               unsigned MaxSize = maxSizeForAddrSpace(
1552                   ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1553                   Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1554               if (MemSize > MaxSize)
1555                 return std::pair(0, LLT::scalar(MaxSize));
1556 
1557               uint64_t Align = Query.MMODescrs[0].AlignInBits;
1558               return std::pair(0, LLT::scalar(Align));
1559             })
1560         .fewerElementsIf(
1561             [=](const LegalityQuery &Query) -> bool {
1562               return Query.Types[0].isVector() &&
1563                      needToSplitMemOp(Query, Op == G_LOAD);
1564             },
1565             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1566               const LLT DstTy = Query.Types[0];
1567               const LLT PtrTy = Query.Types[1];
1568 
1569               LLT EltTy = DstTy.getElementType();
1570               unsigned MaxSize = maxSizeForAddrSpace(
1571                   ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1572                   Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1573 
1574               // FIXME: Handle widened to power of 2 results better. This ends
1575               // up scalarizing.
1576               // FIXME: 3 element stores scalarized on SI
1577 
1578               // Split if it's too large for the address space.
1579               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1580               if (MemSize > MaxSize) {
1581                 unsigned NumElts = DstTy.getNumElements();
1582                 unsigned EltSize = EltTy.getSizeInBits();
1583 
1584                 if (MaxSize % EltSize == 0) {
1585                   return std::pair(
1586                       0, LLT::scalarOrVector(
1587                              ElementCount::getFixed(MaxSize / EltSize), EltTy));
1588                 }
1589 
1590                 unsigned NumPieces = MemSize / MaxSize;
1591 
1592                 // FIXME: Refine when odd breakdowns handled
1593                 // The scalars will need to be re-legalized.
1594                 if (NumPieces == 1 || NumPieces >= NumElts ||
1595                     NumElts % NumPieces != 0)
1596                   return std::pair(0, EltTy);
1597 
1598                 return std::pair(0,
1599                                  LLT::fixed_vector(NumElts / NumPieces, EltTy));
1600               }
1601 
1602               // FIXME: We could probably handle weird extending loads better.
1603               if (DstTy.getSizeInBits() > MemSize)
1604                 return std::pair(0, EltTy);
1605 
1606               unsigned EltSize = EltTy.getSizeInBits();
1607               unsigned DstSize = DstTy.getSizeInBits();
1608               if (!isPowerOf2_32(DstSize)) {
1609                 // We're probably decomposing an odd sized store. Try to split
1610                 // to the widest type. TODO: Account for alignment. As-is it
1611                 // should be OK, since the new parts will be further legalized.
1612                 unsigned FloorSize = llvm::bit_floor(DstSize);
1613                 return std::pair(
1614                     0, LLT::scalarOrVector(
1615                            ElementCount::getFixed(FloorSize / EltSize), EltTy));
1616               }
1617 
1618               // May need relegalization for the scalars.
1619               return std::pair(0, EltTy);
1620             })
1621     .minScalar(0, S32)
1622     .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1623     .widenScalarToNextPow2(0)
1624     .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1625     .lower();
1626   }
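  // As an illustrative sketch of the fewerElementsIf rule above, a wide
  // global store such as
  //   G_STORE %v:_(<8 x s32>), %p:_(p1) :: (store (<8 x s32>))
  // is larger than the address space's maximum store size and is split into
  // two <4 x s32> pieces at increasing offsets.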
1627 
1628   // FIXME: Unaligned accesses not lowered.
1629   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1630                        .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1631                                                   {S32, GlobalPtr, S16, 2 * 8},
1632                                                   {S32, LocalPtr, S8, 8},
1633                                                   {S32, LocalPtr, S16, 16},
1634                                                   {S32, PrivatePtr, S8, 8},
1635                                                   {S32, PrivatePtr, S16, 16},
1636                                                   {S32, ConstantPtr, S8, 8},
1637                                                   {S32, ConstantPtr, S16, 2 * 8}})
1638                        .legalIf(
1639                          [=](const LegalityQuery &Query) -> bool {
1640                            return isLoadStoreLegal(ST, Query);
1641                          });
1642 
1643   if (ST.hasFlatAddressSpace()) {
1644     ExtLoads.legalForTypesWithMemDesc(
1645         {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1646   }
1647 
1648   // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1649   // 64-bits.
1650   //
1651   // TODO: Should generalize bitcast action into coerce, which will also cover
1652   // inserting addrspacecasts.
1653   ExtLoads.customIf(typeIs(1, Constant32Ptr));
1654 
1655   ExtLoads.clampScalar(0, S32, S32)
1656           .widenScalarToNextPow2(0)
1657           .lower();
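  // For example, %a:_(s32) = G_SEXTLOAD %p:_(p1) :: (load (s8)) matches the
  // list above and stays legal, while an s64 result is first clamped to s32
  // and then re-extended (illustrative of clampScalar + lower).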
1658 
1659   auto &Atomics = getActionDefinitionsBuilder(
1660     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1661      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1662      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1663      G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1664     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1665                {S64, GlobalPtr}, {S64, LocalPtr},
1666                {S32, RegionPtr}, {S64, RegionPtr}});
1667   if (ST.hasFlatAddressSpace()) {
1668     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1669   }
1670 
1671   // TODO: v2bf16 operations, and fat buffer pointer support.
1672   auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1673   if (ST.hasLDSFPAtomicAddF32()) {
1674     Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1675     if (ST.hasLdsAtomicAddF64())
1676       Atomic.legalFor({{S64, LocalPtr}});
1677     if (ST.hasAtomicDsPkAdd16Insts())
1678       Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}});
1679   }
1680   if (ST.hasAtomicFaddInsts())
1681     Atomic.legalFor({{S32, GlobalPtr}});
1682   if (ST.hasFlatAtomicFaddF32Inst())
1683     Atomic.legalFor({{S32, FlatPtr}});
1684 
1685   if (ST.hasGFX90AInsts()) {
1686     // These are legal with some caveats, and should have undergone expansion in
1687     // the IR in most situations.
1688     // TODO: Move atomic expansion into legalizer
1689     Atomic.legalFor({
1690         {S32, GlobalPtr},
1691         {S64, GlobalPtr},
1692         {S64, FlatPtr}
1693       });
1694   }
1695 
1696   if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
1697       ST.hasAtomicBufferGlobalPkAddF16Insts())
1698     Atomic.legalFor({{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
1699   if (ST.hasAtomicGlobalPkAddBF16Inst())
1700     Atomic.legalFor({{V2BF16, GlobalPtr}});
1701   if (ST.hasAtomicFlatPkAdd16Insts())
1702     Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
1703 
1705   // Most of the legalization work here is done by AtomicExpand. We could
1706   // probably use a simpler legality rule that just assumes anything is OK.
1707   auto &AtomicFMinFMax =
1708     getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
1709     .legalFor({{F32, LocalPtr}, {F64, LocalPtr}});
1710 
1711   if (ST.hasAtomicFMinFMaxF32GlobalInsts())
1712     AtomicFMinFMax.legalFor({{F32, GlobalPtr}, {F32, BufferFatPtr}});
1713   if (ST.hasAtomicFMinFMaxF64GlobalInsts())
1714     AtomicFMinFMax.legalFor({{F64, GlobalPtr}, {F64, BufferFatPtr}});
1715   if (ST.hasAtomicFMinFMaxF32FlatInsts())
1716     AtomicFMinFMax.legalFor({{F32, FlatPtr}});
1717   if (ST.hasAtomicFMinFMaxF64FlatInsts())
1718     AtomicFMinFMax.legalFor({{F64, FlatPtr}});
1719 
1720   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
1721   // demarshalling.
1722   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1723     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1724                 {S32, FlatPtr}, {S64, FlatPtr}})
1725     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1726                {S32, RegionPtr}, {S64, RegionPtr}});
1727   // TODO: Pointer types, any 32-bit or 64-bit vector
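  // The marshalling roughly packs the new value and compare value into one
  // vector register pair (illustrative MIR, see legalizeAtomicCmpXChg):
  //   %pair:_(<2 x s32>) = G_BUILD_VECTOR %newval:_(s32), %cmp:_(s32)
  //   %old:_(s32) = G_AMDGPU_ATOMIC_CMPXCHG %ptr:_(p1), %pair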
1728 
1729   // Condition should be s32 for scalar, s1 for vector.
1730   getActionDefinitionsBuilder(G_SELECT)
1731       .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1732                                  LocalPtr, FlatPtr, PrivatePtr,
1733                                  LLT::fixed_vector(2, LocalPtr),
1734                                  LLT::fixed_vector(2, PrivatePtr)},
1735                                 {S1, S32})
1736       .clampScalar(0, S16, S64)
1737       .scalarize(1)
1738       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1739       .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1740       .clampMaxNumElements(0, S32, 2)
1741       .clampMaxNumElements(0, LocalPtr, 2)
1742       .clampMaxNumElements(0, PrivatePtr, 2)
1743       .scalarize(0)
1744       .widenScalarToNextPow2(0)
1745       .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
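  // For example (illustrative), %d:_(s8) = G_SELECT %c:_(s1), %a, %b is
  // widened to an s16 select by the clampScalar above, and a vector
  // condition is broken apart by scalarize(1) before anything else.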
1746 
1747   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1748   // be more flexible with the shift amount type.
1749   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1750     .legalFor({{S32, S32}, {S64, S32}});
1751   if (ST.has16BitInsts()) {
1752     if (ST.hasVOP3PInsts()) {
1753       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1754             .clampMaxNumElements(0, S16, 2);
1755     } else
1756       Shifts.legalFor({{S16, S16}});
1757 
1758     // TODO: Support 16-bit shift amounts for all types
1759     Shifts.widenScalarIf(
1760       [=](const LegalityQuery &Query) {
1761         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1762         // 32-bit amount.
1763         const LLT ValTy = Query.Types[0];
1764         const LLT AmountTy = Query.Types[1];
1765         return ValTy.isScalar() && ValTy.getSizeInBits() <= 16 &&
1766                AmountTy.getSizeInBits() < 16;
1767       }, changeTo(1, S16));
1768     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1769     Shifts.clampScalar(1, S32, S32);
1770     Shifts.widenScalarToNextPow2(0, 16);
1771     Shifts.clampScalar(0, S16, S64);
1772 
1773     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1774       .minScalar(0, S16)
1775       .scalarize(0)
1776       .lower();
1777   } else {
1778     // Make sure we legalize the shift amount type first, as the general
1779     // expansion for the shifted type will produce much worse code if it hasn't
1780     // been truncated already.
1781     Shifts.clampScalar(1, S32, S32);
1782     Shifts.widenScalarToNextPow2(0, 32);
1783     Shifts.clampScalar(0, S32, S64);
1784 
1785     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1786       .minScalar(0, S32)
1787       .scalarize(0)
1788       .lower();
1789   }
1790   Shifts.scalarize(0);
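  // As an illustrative sketch on a subtarget with 16-bit instructions:
  //   %r:_(s16) = G_SHL %a:_(s16), %b:_(s32)
  // has its shift amount narrowed to match, roughly:
  //   %b16:_(s16) = G_TRUNC %b
  //   %r:_(s16) = G_SHL %a, %b16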
1791 
1792   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1793     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1794     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1795     unsigned IdxTypeIdx = 2;
1796 
1797     getActionDefinitionsBuilder(Op)
1798         .customIf([=](const LegalityQuery &Query) {
1799           const LLT EltTy = Query.Types[EltTypeIdx];
1800           const LLT VecTy = Query.Types[VecTypeIdx];
1801           const LLT IdxTy = Query.Types[IdxTypeIdx];
1802           const unsigned EltSize = EltTy.getSizeInBits();
1803           const bool isLegalVecType =
1804               !!SIRegisterInfo::getSGPRClassForBitWidth(VecTy.getSizeInBits());
1805           // Address space 8 pointers are 128-bit wide values, but the logic
1806           // below will try to bitcast them to 2N x s64, which will fail.
1807           // Therefore, as an intermediate step, wrap extracts/insertions by
1808           // ptrtoint-ing the vector and scalar arguments (or inttoptr-ing the
1809           // extraction result) in order to produce a vector operation that can
1810           // be handled by the logic below.
1811           if (EltTy.isPointer() && EltSize > 64)
1812             return true;
1813           return (EltSize == 32 || EltSize == 64) &&
1814                   VecTy.getSizeInBits() % 32 == 0 &&
1815                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1816                   IdxTy.getSizeInBits() == 32 &&
1817                   isLegalVecType;
1818         })
1819         .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
1820                        scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1821                    bitcastToVectorElement32(VecTypeIdx))
1822         //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1823         .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
1824                        scalarOrEltWiderThan(VecTypeIdx, 64)),
1825                    [=](const LegalityQuery &Query) {
1826                      // For > 64-bit element types, try to turn this into a
1827                      // 64-bit element vector since we may be able to do better
1828                      // indexing if this is scalar. If not, fall back to 32.
1829                      const LLT EltTy = Query.Types[EltTypeIdx];
1830                      const LLT VecTy = Query.Types[VecTypeIdx];
1831                      const unsigned DstEltSize = EltTy.getSizeInBits();
1832                      const unsigned VecSize = VecTy.getSizeInBits();
1833 
1834                      const unsigned TargetEltSize =
1835                          DstEltSize % 64 == 0 ? 64 : 32;
1836                      return std::pair(VecTypeIdx,
1837                                       LLT::fixed_vector(VecSize / TargetEltSize,
1838                                                         TargetEltSize));
1839                    })
1840         .clampScalar(EltTypeIdx, S32, S64)
1841         .clampScalar(VecTypeIdx, S32, S64)
1842         .clampScalar(IdxTypeIdx, S32, S32)
1843         .clampMaxNumElements(VecTypeIdx, S32, 32)
1844         // TODO: Clamp elements for 64-bit vectors?
1845         .moreElementsIf(isIllegalRegisterType(ST, VecTypeIdx),
1846                         moreElementsToNextExistingRegClass(VecTypeIdx))
1847         // It should only be necessary with variable indexes.
1848         // As a last resort, lower to the stack
1849         .lower();
1850   }
1851 
1852   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1853     .unsupportedIf([=](const LegalityQuery &Query) {
1854         const LLT &EltTy = Query.Types[1].getElementType();
1855         return Query.Types[0] != EltTy;
1856       });
1857 
1858   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1859     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1860     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1861 
1862     // FIXME: Doesn't handle extract of illegal sizes.
1863     getActionDefinitionsBuilder(Op)
1864       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1865       .lowerIf([=](const LegalityQuery &Query) {
1866           // Sub-vector (or single element) insert and extract.
1867           // TODO: verify immediate offset here since lower only works with
1868           // whole elements.
1869           const LLT BigTy = Query.Types[BigTyIdx];
1870           return BigTy.isVector();
1871         })
1872       // FIXME: Multiples of 16 should not be legal.
1873       .legalIf([=](const LegalityQuery &Query) {
1874           const LLT BigTy = Query.Types[BigTyIdx];
1875           const LLT LitTy = Query.Types[LitTyIdx];
1876           return (BigTy.getSizeInBits() % 32 == 0) &&
1877                  (LitTy.getSizeInBits() % 16 == 0);
1878         })
1879       .widenScalarIf(
1880         [=](const LegalityQuery &Query) {
1881           const LLT BigTy = Query.Types[BigTyIdx];
1882           return (BigTy.getScalarSizeInBits() < 16);
1883         },
1884         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1885       .widenScalarIf(
1886         [=](const LegalityQuery &Query) {
1887           const LLT LitTy = Query.Types[LitTyIdx];
1888           return (LitTy.getScalarSizeInBits() < 16);
1889         },
1890         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1891       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1892       .widenScalarToNextPow2(BigTyIdx, 32);
1893 
1894   }
1895 
1896   auto &BuildVector =
1897       getActionDefinitionsBuilder(G_BUILD_VECTOR)
1898           .legalForCartesianProduct(AllS32Vectors, {S32})
1899           .legalForCartesianProduct(AllS64Vectors, {S64})
1900           .clampNumElements(0, V16S32, V32S32)
1901           .clampNumElements(0, V2S64, V16S64)
1902           .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16))
1903           .moreElementsIf(isIllegalRegisterType(ST, 0),
1904                           moreElementsToNextExistingRegClass(0));
1905 
1906   if (ST.hasScalarPackInsts()) {
1907     BuildVector
1908       // FIXME: Should probably widen s1 vectors straight to s32
1909       .minScalarOrElt(0, S16)
1910       .minScalar(1, S16);
1911 
1912     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1913       .legalFor({V2S16, S32})
1914       .lower();
1915   } else {
1916     BuildVector.customFor({V2S16, S16});
1917     BuildVector.minScalarOrElt(0, S32);
1918 
1919     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1920       .customFor({V2S16, S32})
1921       .lower();
1922   }
1923 
1924   BuildVector.legalIf(isRegisterType(ST, 0));
1925 
1926   // FIXME: Clamp maximum size
1927   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1928       .legalIf(all(isRegisterType(ST, 0), isRegisterType(ST, 1)))
1929       .clampMaxNumElements(0, S32, 32)
1930       .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1931       .clampMaxNumElements(0, S16, 64);
1932 
1933   getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1934 
1935   // Merge/Unmerge
1936   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1937     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1938     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1939 
1940     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1941       const LLT Ty = Query.Types[TypeIdx];
1942       if (Ty.isVector()) {
1943         const LLT &EltTy = Ty.getElementType();
1944         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1945           return true;
1946         if (!llvm::has_single_bit<uint32_t>(EltTy.getSizeInBits()))
1947           return true;
1948       }
1949       return false;
1950     };
1951 
1952     auto &Builder =
1953         getActionDefinitionsBuilder(Op)
1954             .legalIf(all(isRegisterType(ST, 0), isRegisterType(ST, 1)))
1955             .lowerFor({{S16, V2S16}})
1956             .lowerIf([=](const LegalityQuery &Query) {
1957               const LLT BigTy = Query.Types[BigTyIdx];
1958               return BigTy.getSizeInBits() == 32;
1959             })
1960             // Try to widen to s16 first for small types.
1961             // TODO: Only do this on targets with legal s16 shifts
1962             .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1963             .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1964             .moreElementsIf(isSmallOddVector(BigTyIdx),
1965                             oneMoreElement(BigTyIdx))
1966             .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1967                                  elementTypeIs(1, S16)),
1968                              changeTo(1, V2S16))
1969             // Clamp the little scalar to s32-s512 and make it a power of 2. It's
1970             // not worth considering the multiples of 64 since 2*192 and 2*384
1971             // are not valid.
1972             .clampScalar(LitTyIdx, S32, S512)
1973             .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1974             // Break up vectors with weird elements into scalars
1975             .fewerElementsIf(
1976                 [=](const LegalityQuery &Query) {
1977                   return notValidElt(Query, LitTyIdx);
1978                 },
1979                 scalarize(0))
1980             .fewerElementsIf(
1981                 [=](const LegalityQuery &Query) {
1982                   return notValidElt(Query, BigTyIdx);
1983                 },
1984                 scalarize(1))
1985             .clampScalar(BigTyIdx, S32, MaxScalar);
1986 
1987     if (Op == G_MERGE_VALUES) {
1988       Builder.widenScalarIf(
1989         // TODO: Use 16-bit shifts if legal for 8-bit values?
1990         [=](const LegalityQuery &Query) {
1991           const LLT Ty = Query.Types[LitTyIdx];
1992           return Ty.getSizeInBits() < 32;
1993         },
1994         changeTo(LitTyIdx, S32));
1995     }
1996 
1997     Builder.widenScalarIf(
1998       [=](const LegalityQuery &Query) {
1999         const LLT Ty = Query.Types[BigTyIdx];
2000         return Ty.getSizeInBits() % 16 != 0;
2001       },
2002       [=](const LegalityQuery &Query) {
2003         // Pick the next power of 2, or a multiple of 64 over 128,
2004         // whichever is smaller.
2005         const LLT &Ty = Query.Types[BigTyIdx];
2006         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
2007         if (NewSizeInBits >= 256) {
2008           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
2009           if (RoundedTo < NewSizeInBits)
2010             NewSizeInBits = RoundedTo;
2011         }
2012         return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
2013       })
2014       // Any vectors left are the wrong size. Scalarize them.
2015       .scalarize(0)
2016       .scalarize(1);
2017   }
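  // As an illustrative sketch of the 32-bit lowerIf above:
  //   %lo:_(s16), %hi:_(s16) = G_UNMERGE_VALUES %x:_(s32)
  // becomes, roughly:
  //   %lo:_(s16) = G_TRUNC %x
  //   %s:_(s32) = G_LSHR %x, 16
  //   %hi:_(s16) = G_TRUNC %s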
2018 
2019   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
2020   // RegBankSelect.
2021   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
2022                         .legalFor({{S32}, {S64}})
2023                         .clampScalar(0, S32, S64);
2024 
2025   if (ST.hasVOP3PInsts()) {
2026     SextInReg.lowerFor({{V2S16}})
2027       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
2028       // get more vector shift opportunities, since we'll get those when
2029       // expanded.
2030       .clampMaxNumElementsStrict(0, S16, 2);
2031   } else if (ST.has16BitInsts()) {
2032     SextInReg.lowerFor({{S32}, {S64}, {S16}});
2033   } else {
2034     // Prefer to promote to s32 before lowering if we don't have 16-bit
2035     // shifts. This avoid a lot of intermediate truncate and extend operations.
2036     SextInReg.lowerFor({{S32}, {S64}});
2037   }
2038 
2039   SextInReg
2040     .scalarize(0)
2041     .clampScalar(0, S32, S64)
2042     .lower();
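  // For example (illustrative), G_SEXT_INREG %x:_(s32), 8 is legal here and
  // typically selects to a bitfield extract, while an s16 input is first
  // widened to s32 by the clampScalar above.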
2043 
2044   getActionDefinitionsBuilder({G_ROTR, G_ROTL})
2045     .scalarize(0)
2046     .lower();
2047 
2048   // TODO: Only try to form v2s16 with legal packed instructions.
2049   getActionDefinitionsBuilder(G_FSHR)
2050     .legalFor({{S32, S32}})
2051     .lowerFor({{V2S16, V2S16}})
2052     .clampMaxNumElementsStrict(0, S16, 2)
2053     .scalarize(0)
2054     .lower();
2055 
2056   if (ST.hasVOP3PInsts()) {
2057     getActionDefinitionsBuilder(G_FSHL)
2058       .lowerFor({{V2S16, V2S16}})
2059       .clampMaxNumElementsStrict(0, S16, 2)
2060       .scalarize(0)
2061       .lower();
2062   } else {
2063     getActionDefinitionsBuilder(G_FSHL)
2064       .scalarize(0)
2065       .lower();
2066   }
2067 
2068   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
2069     .legalFor({S64});
2070 
2071   getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({S64});
2072 
2073   getActionDefinitionsBuilder(G_FENCE)
2074     .alwaysLegal();
2075 
2076   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
2077       .scalarize(0)
2078       .minScalar(0, S32)
2079       .lower();
2080 
2081   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
2082       .legalFor({{S32, S32}, {S64, S32}})
2083       .clampScalar(1, S32, S32)
2084       .clampScalar(0, S32, S64)
2085       .widenScalarToNextPow2(0)
2086       .scalarize(0);
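  // Illustrative semantics: for G_UBFX, dst = (src >> offset) &
  // ((1 << width) - 1); G_SBFX additionally sign-extends from the top bit of
  // the extracted field. Both offset and width live in 32-bit operands here.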
2087 
2088   getActionDefinitionsBuilder(
2089       {// TODO: Verify V_BFI_B32 is generated from expanded bit ops
2090        G_FCOPYSIGN,
2091 
2092        G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
2093        G_READ_REGISTER, G_WRITE_REGISTER,
2094 
2095        G_SADDO, G_SSUBO})
2096       .lower();
2097 
2098   if (ST.hasIEEEMinimumMaximumInsts()) {
2099     getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2100         .legalFor(FPTypesPK16)
2101         .clampMaxNumElements(0, S16, 2)
2102         .scalarize(0);
2103   } else {
2104     // TODO: Implement
2105     getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
2106   }
2107 
2108   getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
2109       .lower();
2110 
2111   getActionDefinitionsBuilder({G_TRAP, G_DEBUGTRAP}).custom();
2112 
2113   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
2114         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
2115         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
2116     .unsupported();
2117 
2118   getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal();
2119 
2120   getActionDefinitionsBuilder(
2121       {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX,
2122        G_VECREDUCE_ADD, G_VECREDUCE_MUL, G_VECREDUCE_FMUL, G_VECREDUCE_FMIN,
2123        G_VECREDUCE_FMAX, G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM,
2124        G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})
2125       .legalFor(AllVectors)
2126       .scalarize(1)
2127       .lower();
2128 
2129   getLegacyLegalizerInfo().computeTables();
2130   verify(*ST.getInstrInfo());
2131 }
2132 
2133 bool AMDGPULegalizerInfo::legalizeCustom(
2134     LegalizerHelper &Helper, MachineInstr &MI,
2135     LostDebugLocObserver &LocObserver) const {
2136   MachineIRBuilder &B = Helper.MIRBuilder;
2137   MachineRegisterInfo &MRI = *B.getMRI();
2138 
2139   switch (MI.getOpcode()) {
2140   case TargetOpcode::G_ADDRSPACE_CAST:
2141     return legalizeAddrSpaceCast(MI, MRI, B);
2142   case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2143     return legalizeFroundeven(MI, MRI, B);
2144   case TargetOpcode::G_FCEIL:
2145     return legalizeFceil(MI, MRI, B);
2146   case TargetOpcode::G_FREM:
2147     return legalizeFrem(MI, MRI, B);
2148   case TargetOpcode::G_INTRINSIC_TRUNC:
2149     return legalizeIntrinsicTrunc(MI, MRI, B);
2150   case TargetOpcode::G_SITOFP:
2151     return legalizeITOFP(MI, MRI, B, true);
2152   case TargetOpcode::G_UITOFP:
2153     return legalizeITOFP(MI, MRI, B, false);
2154   case TargetOpcode::G_FPTOSI:
2155     return legalizeFPTOI(MI, MRI, B, true);
2156   case TargetOpcode::G_FPTOUI:
2157     return legalizeFPTOI(MI, MRI, B, false);
2158   case TargetOpcode::G_FMINNUM:
2159   case TargetOpcode::G_FMAXNUM:
2160   case TargetOpcode::G_FMINIMUMNUM:
2161   case TargetOpcode::G_FMAXIMUMNUM:
2162   case TargetOpcode::G_FMINNUM_IEEE:
2163   case TargetOpcode::G_FMAXNUM_IEEE:
2164     return legalizeMinNumMaxNum(Helper, MI);
2165   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2166     return legalizeExtractVectorElt(MI, MRI, B);
2167   case TargetOpcode::G_INSERT_VECTOR_ELT:
2168     return legalizeInsertVectorElt(MI, MRI, B);
2169   case TargetOpcode::G_FSIN:
2170   case TargetOpcode::G_FCOS:
2171     return legalizeSinCos(MI, MRI, B);
2172   case TargetOpcode::G_GLOBAL_VALUE:
2173     return legalizeGlobalValue(MI, MRI, B);
2174   case TargetOpcode::G_LOAD:
2175   case TargetOpcode::G_SEXTLOAD:
2176   case TargetOpcode::G_ZEXTLOAD:
2177     return legalizeLoad(Helper, MI);
2178   case TargetOpcode::G_STORE:
2179     return legalizeStore(Helper, MI);
2180   case TargetOpcode::G_FMAD:
2181     return legalizeFMad(MI, MRI, B);
2182   case TargetOpcode::G_FDIV:
2183     return legalizeFDIV(MI, MRI, B);
2184   case TargetOpcode::G_FFREXP:
2185     return legalizeFFREXP(MI, MRI, B);
2186   case TargetOpcode::G_FSQRT:
2187     return legalizeFSQRT(MI, MRI, B);
2188   case TargetOpcode::G_UDIV:
2189   case TargetOpcode::G_UREM:
2190   case TargetOpcode::G_UDIVREM:
2191     return legalizeUnsignedDIV_REM(MI, MRI, B);
2192   case TargetOpcode::G_SDIV:
2193   case TargetOpcode::G_SREM:
2194   case TargetOpcode::G_SDIVREM:
2195     return legalizeSignedDIV_REM(MI, MRI, B);
2196   case TargetOpcode::G_ATOMIC_CMPXCHG:
2197     return legalizeAtomicCmpXChg(MI, MRI, B);
2198   case TargetOpcode::G_FLOG2:
2199     return legalizeFlog2(MI, B);
2200   case TargetOpcode::G_FLOG:
2201   case TargetOpcode::G_FLOG10:
2202     return legalizeFlogCommon(MI, B);
2203   case TargetOpcode::G_FEXP2:
2204     return legalizeFExp2(MI, B);
2205   case TargetOpcode::G_FEXP:
2206   case TargetOpcode::G_FEXP10:
2207     return legalizeFExp(MI, B);
2208   case TargetOpcode::G_FPOW:
2209     return legalizeFPow(MI, B);
2210   case TargetOpcode::G_FFLOOR:
2211     return legalizeFFloor(MI, MRI, B);
2212   case TargetOpcode::G_BUILD_VECTOR:
2213   case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2214     return legalizeBuildVector(MI, MRI, B);
2215   case TargetOpcode::G_MUL:
2216     return legalizeMul(Helper, MI);
2217   case TargetOpcode::G_CTLZ:
2218   case TargetOpcode::G_CTTZ:
2219     return legalizeCTLZ_CTTZ(MI, MRI, B);
2220   case TargetOpcode::G_CTLZ_ZERO_UNDEF:
2221     return legalizeCTLZ_ZERO_UNDEF(MI, MRI, B);
2222   case TargetOpcode::G_STACKSAVE:
2223     return legalizeStackSave(MI, B);
2224   case TargetOpcode::G_GET_FPENV:
2225     return legalizeGetFPEnv(MI, MRI, B);
2226   case TargetOpcode::G_SET_FPENV:
2227     return legalizeSetFPEnv(MI, MRI, B);
2228   case TargetOpcode::G_TRAP:
2229     return legalizeTrap(MI, MRI, B);
2230   case TargetOpcode::G_DEBUGTRAP:
2231     return legalizeDebugTrap(MI, MRI, B);
2232   default:
2233     return false;
2234   }
2235 
2236   llvm_unreachable("expected switch to return");
2237 }
2238 
2239 Register AMDGPULegalizerInfo::getSegmentAperture(
2240   unsigned AS,
2241   MachineRegisterInfo &MRI,
2242   MachineIRBuilder &B) const {
2243   MachineFunction &MF = B.getMF();
2244   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2245   const LLT S32 = LLT::scalar(32);
2246   const LLT S64 = LLT::scalar(64);
2247 
2248   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
2249 
2250   if (ST.hasApertureRegs()) {
2251     // Note: this register is somewhat broken. When used as a 32-bit operand,
2252     // it only returns zeroes. The real value is in the upper 32 bits.
2253     // Thus, we must emit an extract of the high 32 bits.
2254     const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
2255                                        ? AMDGPU::SRC_SHARED_BASE
2256                                        : AMDGPU::SRC_PRIVATE_BASE;
2257     // FIXME: It would be more natural to emit a COPY here, but then copy
2258     // coalescing would kick in and it would think it's okay to use the "HI"
2259     // subregister (instead of extracting the HI 32 bits) which is an artificial
2260     // (unusable) register.
2261     //  Register TableGen definitions would need an overhaul to get rid of the
2262     //  artificial "HI" aperture registers and prevent this kind of issue from
2263     //  happening.
2264     Register Dst = MRI.createGenericVirtualRegister(S64);
2265     MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
2266     B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
2267     return B.buildUnmerge(S32, Dst).getReg(1);
2268   }
2269 
2270   // TODO: can we be smarter about machine pointer info?
2271   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
2272   Register LoadAddr = MRI.createGenericVirtualRegister(
2273     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2274   // For code object version 5, private_base and shared_base are passed through
2275   // implicit kernargs.
2276   if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
2277       AMDGPU::AMDHSA_COV5) {
2278     AMDGPUTargetLowering::ImplicitParameter Param =
2279         AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
2280                                       : AMDGPUTargetLowering::PRIVATE_BASE;
2281     uint64_t Offset =
2282         ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
2283 
2284     Register KernargPtrReg = MRI.createGenericVirtualRegister(
2285         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2286 
2287     if (!loadInputValue(KernargPtrReg, B,
2288                         AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
2289       return Register();
2290 
2291     MachineMemOperand *MMO = MF.getMachineMemOperand(
2292         PtrInfo,
2293         MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2294             MachineMemOperand::MOInvariant,
2295         LLT::scalar(32), commonAlignment(Align(64), Offset));
2296 
2297     // Pointer address
2298     B.buildPtrAdd(LoadAddr, KernargPtrReg,
2299                   B.buildConstant(LLT::scalar(64), Offset).getReg(0));
2300     // Load address
2301     return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2302   }
2303 
2304   Register QueuePtr = MRI.createGenericVirtualRegister(
2305     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2306 
2307   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
2308     return Register();
2309 
2310   // Offset into amd_queue_t for group_segment_aperture_base_hi /
2311   // private_segment_aperture_base_hi.
2312   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
2313 
2314   MachineMemOperand *MMO = MF.getMachineMemOperand(
2315       PtrInfo,
2316       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2317           MachineMemOperand::MOInvariant,
2318       LLT::scalar(32), commonAlignment(Align(64), StructOffset));
2319 
2320   B.buildPtrAdd(LoadAddr, QueuePtr,
2321                 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
2322   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2323 }
2324 
2325 /// Return true if the value is a known valid address, such that a null check is
2326 /// not necessary.
2327 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
2328                            const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
2329   MachineInstr *Def = MRI.getVRegDef(Val);
2330   switch (Def->getOpcode()) {
2331   case AMDGPU::G_FRAME_INDEX:
2332   case AMDGPU::G_GLOBAL_VALUE:
2333   case AMDGPU::G_BLOCK_ADDR:
2334     return true;
2335   case AMDGPU::G_CONSTANT: {
2336     const ConstantInt *CI = Def->getOperand(1).getCImm();
2337     return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
2338   }
2339   default:
2340     return false;
2341   }
2344 }
2345 
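// As an illustrative sketch, a private-to-flat cast that cannot be proven
// non-null becomes a compare-and-select against the segment's null value:
//   %lo:_(s32) = G_PTRTOINT %src:_(p5)
//   %full:_(p0) = G_MERGE_VALUES %lo, %aperture
//   %ok:_(s1) = G_ICMP ne %src, <private null>
//   %dst:_(p0) = G_SELECT %ok, %full, <flat null>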
2346 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
2347   MachineInstr &MI, MachineRegisterInfo &MRI,
2348   MachineIRBuilder &B) const {
2349   MachineFunction &MF = B.getMF();
2350 
2351   // MI can either be a G_ADDRSPACE_CAST or a
2352   // G_INTRINSIC @llvm.amdgcn.addrspacecast.nonnull
2353   assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
2354          (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
2355                                      Intrinsic::amdgcn_addrspacecast_nonnull));
2356 
2357   const LLT S32 = LLT::scalar(32);
2358   Register Dst = MI.getOperand(0).getReg();
2359   Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
2360                                      : MI.getOperand(1).getReg();
2361   LLT DstTy = MRI.getType(Dst);
2362   LLT SrcTy = MRI.getType(Src);
2363   unsigned DestAS = DstTy.getAddressSpace();
2364   unsigned SrcAS = SrcTy.getAddressSpace();
2365 
2366   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
2367   // vector element.
2368   assert(!DstTy.isVector());
2369 
2370   const AMDGPUTargetMachine &TM
2371     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
2372 
2373   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
2374     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
2375     return true;
2376   }
2377 
2378   if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
2379       (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
2380        DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2381     // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null; for
2382     // G_ADDRSPACE_CAST we need to guess.
2383     if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2384       // Extract low 32-bits of the pointer.
2385       B.buildExtract(Dst, Src, 0);
2386       MI.eraseFromParent();
2387       return true;
2388     }
2389 
2390     unsigned NullVal = TM.getNullPointerValue(DestAS);
2391 
2392     auto SegmentNull = B.buildConstant(DstTy, NullVal);
2393     auto FlatNull = B.buildConstant(SrcTy, 0);
2394 
2395     // Extract low 32-bits of the pointer.
2396     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
2397 
2398     auto CmpRes =
2399         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
2400     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
2401 
2402     MI.eraseFromParent();
2403     return true;
2404   }
2405 
2406   if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
2407       (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2408        SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2409     auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
2410       Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
2411       if (!ApertureReg.isValid())
2412         return false;
2413 
2414       // Coerce the type of the low half of the result so we can use
2415       // merge_values.
2416       Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
2417 
2418       // TODO: Should we allow mismatched types but matching sizes in merges to
2419       // avoid the ptrtoint?
2420       return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).getReg(0);
2421     };
2422 
2423     // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null; for
2424     // G_ADDRSPACE_CAST we need to guess.
2425     if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2426       castLocalOrPrivateToFlat(Dst);
2427       MI.eraseFromParent();
2428       return true;
2429     }
2430 
2431     Register BuildPtr = castLocalOrPrivateToFlat(DstTy);
2432 
2433     auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
2434     auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
2435 
2436     auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
2437                               SegmentNull.getReg(0));
2438 
2439     B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2440 
2441     MI.eraseFromParent();
2442     return true;
2443   }
2444 
2445   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2446       SrcTy.getSizeInBits() == 64) {
2447     // Truncate.
2448     B.buildExtract(Dst, Src, 0);
2449     MI.eraseFromParent();
2450     return true;
2451   }
2452 
2453   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2454       DstTy.getSizeInBits() == 64) {
2455     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2456     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2457     auto PtrLo = B.buildPtrToInt(S32, Src);
2458     auto HighAddr = B.buildConstant(S32, AddrHiVal);
2459     B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
2460     MI.eraseFromParent();
2461     return true;
2462   }
2463 
2464   // Invalid casts are poison.
2465   // TODO: Should return poison
2466   B.buildUndef(Dst);
2467   MI.eraseFromParent();
2468   return true;
2469 }
2470 
2471 bool AMDGPULegalizerInfo::legalizeFroundeven(MachineInstr &MI,
2472                                              MachineRegisterInfo &MRI,
2473                                              MachineIRBuilder &B) const {
2474   Register Src = MI.getOperand(1).getReg();
2475   LLT Ty = MRI.getType(Src);
2476   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2477 
2478   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
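  // The trick: every double with magnitude >= 2^52 is already an integer, so
  // adding and then subtracting copysign(2^52, src) forces the FP adder to
  // round src to an integer in the current rounding mode (round-to-nearest-
  // even by default). The final select passes through inputs whose magnitude
  // is already at or above 2^52.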
2479   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2480 
2481   auto C1 = B.buildFConstant(Ty, C1Val);
2482   auto CopySign = B.buildFCopysign(Ty, C1, Src);
2483 
2484   // TODO: Should this propagate fast-math-flags?
2485   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2486   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2487 
2488   auto C2 = B.buildFConstant(Ty, C2Val);
2489   auto Fabs = B.buildFAbs(Ty, Src);
2490 
2491   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2492   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2493   MI.eraseFromParent();
2494   return true;
2495 }
2496 
2497 bool AMDGPULegalizerInfo::legalizeFceil(
2498   MachineInstr &MI, MachineRegisterInfo &MRI,
2499   MachineIRBuilder &B) const {
2500 
2501   const LLT S1 = LLT::scalar(1);
2502   const LLT S64 = LLT::scalar(64);
2503 
2504   Register Src = MI.getOperand(1).getReg();
2505   assert(MRI.getType(Src) == S64);
2506 
2507   // result = trunc(src)
2508   // if (src > 0.0 && src != result)
2509   //   result += 1.0
2510 
2511   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2512 
2513   const auto Zero = B.buildFConstant(S64, 0.0);
2514   const auto One = B.buildFConstant(S64, 1.0);
2515   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2516   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2517   auto And = B.buildAnd(S1, Lt0, NeTrunc);
2518   auto Add = B.buildSelect(S64, And, One, Zero);
2519 
2520   // TODO: Should this propagate fast-math-flags?
2521   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2522   MI.eraseFromParent();
2523   return true;
2524 }
2525 
2526 bool AMDGPULegalizerInfo::legalizeFrem(
2527   MachineInstr &MI, MachineRegisterInfo &MRI,
2528   MachineIRBuilder &B) const {
2529     Register DstReg = MI.getOperand(0).getReg();
2530     Register Src0Reg = MI.getOperand(1).getReg();
2531     Register Src1Reg = MI.getOperand(2).getReg();
2532     auto Flags = MI.getFlags();
2533     LLT Ty = MRI.getType(DstReg);
2534 
2535     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2536     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2537     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2538     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2539     MI.eraseFromParent();
2540     return true;
2541 }
2542 
2543 static MachineInstrBuilder extractF64Exponent(Register Hi,
2544                                               MachineIRBuilder &B) {
2545   const unsigned FractBits = 52;
2546   const unsigned ExpBits = 11;
2547   LLT S32 = LLT::scalar(32);
2548 
2549   auto Const0 = B.buildConstant(S32, FractBits - 32);
2550   auto Const1 = B.buildConstant(S32, ExpBits);
2551 
2552   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
2553                      .addUse(Hi)
2554                      .addUse(Const0.getReg(0))
2555                      .addUse(Const1.getReg(0));
2556 
2557   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2558 }
2559 
2560 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2561   MachineInstr &MI, MachineRegisterInfo &MRI,
2562   MachineIRBuilder &B) const {
2563   const LLT S1 = LLT::scalar(1);
2564   const LLT S32 = LLT::scalar(32);
2565   const LLT S64 = LLT::scalar(64);
2566 
2567   Register Src = MI.getOperand(1).getReg();
2568   assert(MRI.getType(Src) == S64);
2569 
2570   // TODO: Should this use extract since the low half is unused?
2571   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2572   Register Hi = Unmerge.getReg(1);
2573 
2574   // Extract the upper half, since this is where we will find the sign and
2575   // exponent.
2576   auto Exp = extractF64Exponent(Hi, B);
2577 
2578   const unsigned FractBits = 52;
2579 
2580   // Extract the sign bit.
2581   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2582   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2583 
2584   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2585 
2586   const auto Zero32 = B.buildConstant(S32, 0);
2587 
2588   // Extend back to 64-bits.
2589   auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});
2590 
2591   auto Shr = B.buildAShr(S64, FractMask, Exp);
2592   auto Not = B.buildNot(S64, Shr);
2593   auto Tmp0 = B.buildAnd(S64, Src, Not);
2594   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2595 
2596   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2597   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2598 
2599   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2600   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2601   MI.eraseFromParent();
2602   return true;
2603 }
2604 
2605 bool AMDGPULegalizerInfo::legalizeITOFP(
2606   MachineInstr &MI, MachineRegisterInfo &MRI,
2607   MachineIRBuilder &B, bool Signed) const {
2608 
2609   Register Dst = MI.getOperand(0).getReg();
2610   Register Src = MI.getOperand(1).getReg();
2611 
2612   const LLT S64 = LLT::scalar(64);
2613   const LLT S32 = LLT::scalar(32);
2614 
2615   assert(MRI.getType(Src) == S64);
2616 
2617   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2618   auto ThirtyTwo = B.buildConstant(S32, 32);
2619 
2620   if (MRI.getType(Dst) == S64) {
2621     auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2622                         : B.buildUITOFP(S64, Unmerge.getReg(1));
2623 
2624     auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2625     auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);
2626 
2627     // TODO: Should this propagate fast-math-flags?
2628     B.buildFAdd(Dst, LdExp, CvtLo);
2629     MI.eraseFromParent();
2630     return true;
2631   }
2632 
2633   assert(MRI.getType(Dst) == S32);
2634 
2635   auto One = B.buildConstant(S32, 1);
2636 
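  // Roughly: shift the s64 value left so its leading significant bit lands
  // at bit 63, fold any nonzero discarded low bits into a sticky bit so
  // rounding stays correct, convert the high 32 bits to float, then rescale
  // with ldexp by (32 - shift amount).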
2637   MachineInstrBuilder ShAmt;
2638   if (Signed) {
2639     auto ThirtyOne = B.buildConstant(S32, 31);
2640     auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2641     auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2642     auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2643     auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
2644                   .addUse(Unmerge.getReg(1));
2645     auto LS2 = B.buildSub(S32, LS, One);
2646     ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2647   } else
2648     ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2649   auto Norm = B.buildShl(S64, Src, ShAmt);
2650   auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2651   auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2652   auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2653   auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2654   auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2655   B.buildFLdexp(Dst, FVal, Scale);
2656   MI.eraseFromParent();
2657   return true;
2658 }
2659 
2660 // TODO: Copied from DAG implementation. Verify logic and document how this
2661 // actually works.
2662 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2663                                         MachineRegisterInfo &MRI,
2664                                         MachineIRBuilder &B,
2665                                         bool Signed) const {
2666 
2667   Register Dst = MI.getOperand(0).getReg();
2668   Register Src = MI.getOperand(1).getReg();
2669 
2670   const LLT S64 = LLT::scalar(64);
2671   const LLT S32 = LLT::scalar(32);
2672 
2673   const LLT SrcLT = MRI.getType(Src);
2674   assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2675 
2676   unsigned Flags = MI.getFlags();
2677 
2678   // The basic idea of converting a floating point number into a pair of 32-bit
2679   // integers is illustrated as follows:
2680   //
2681   //     tf := trunc(val);
2682   //    hif := floor(tf * 2^-32);
2683   //    lof := tf - hif * 2^32; // lof is always positive due to floor.
2684   //     hi := fptoi(hif);
2685   //     lo := fptoi(lof);
2686   //
2687   auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2688   MachineInstrBuilder Sign;
2689   if (Signed && SrcLT == S32) {
2690     // However, a 32-bit floating point number has only a 23-bit mantissa, which
2691     // is not enough to hold all the significant bits of `lof` if val is
2692     // negative. To avoid the loss of precision, we need to take the absolute
2693     // value after truncating and flip the result back based on the original
2694     // signedness.
2695     Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2696     Trunc = B.buildFAbs(S32, Trunc, Flags);
2697   }
2698   MachineInstrBuilder K0, K1;
2699   if (SrcLT == S64) {
2700     K0 = B.buildFConstant(
2701         S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2702     K1 = B.buildFConstant(
2703         S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2704   } else {
2705     K0 = B.buildFConstant(
2706         S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
2707     K1 = B.buildFConstant(
2708         S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2709   }
2710 
2711   auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2712   auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2713   auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2714 
2715   auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2716                                      : B.buildFPTOUI(S32, FloorMul);
2717   auto Lo = B.buildFPTOUI(S32, Fma);
2718 
2719   if (Signed && SrcLT == S32) {
2720     // Flip the result based on the signedness, which is either all 0s or 1s.
2721     Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
2722     // r := xor({lo, hi}, sign) - sign;
2723     B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
2724                Sign);
2725   } else
2726     B.buildMergeLikeInstr(Dst, {Lo, Hi});
2727   MI.eraseFromParent();
2728 
2729   return true;
2730 }
2731 
2732 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2733                                                MachineInstr &MI) const {
2734   MachineFunction &MF = Helper.MIRBuilder.getMF();
2735   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2736 
2737   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2738                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2739 
2740   // With ieee_mode disabled, the instructions have the correct behavior
2741   // already for G_FMINIMUMNUM/G_FMAXIMUMNUM.
2742   //
2743   // FIXME: G_FMINNUM/G_FMAXNUM should match the behavior with ieee_mode
2744   // enabled.
2745   if (!MFI->getMode().IEEE) {
2746     if (MI.getOpcode() == AMDGPU::G_FMINIMUMNUM ||
2747         MI.getOpcode() == AMDGPU::G_FMAXIMUMNUM)
2748       return true;
2749 
2750     return !IsIEEEOp;
2751   }
2752 
2753   if (IsIEEEOp)
2754     return true;
2755 
2756   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2757 }
2758 
2759 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2760   MachineInstr &MI, MachineRegisterInfo &MRI,
2761   MachineIRBuilder &B) const {
2762   // TODO: Should move some of this into LegalizerHelper.
2763 
2764   // TODO: Promote dynamic indexing of s16 to s32
2765 
2766   Register Dst = MI.getOperand(0).getReg();
2767   Register Vec = MI.getOperand(1).getReg();
2768 
2769   LLT VecTy = MRI.getType(Vec);
2770   LLT EltTy = VecTy.getElementType();
2771   assert(EltTy == MRI.getType(Dst));
2772 
2773   // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2774   // but we can't go directly to that logic because you can't bitcast a vector
2775   // of pointers to a vector of integers. Therefore, introduce an intermediate
2776   // vector of integers using ptrtoint (and inttoptr on the output) in order to
2777   // drive the legalization forward.
2778   if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2779     LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2780     LLT IntVecTy = VecTy.changeElementType(IntTy);
2781 
2782     auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
2783     auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
2784     B.buildIntToPtr(Dst, IntElt);
2785 
2786     MI.eraseFromParent();
2787     return true;
2788   }
2789 
2790   // FIXME: Artifact combiner probably should have replaced the truncated
2791   // constant before this, so we shouldn't need
2792   // getIConstantVRegValWithLookThrough.
2793   std::optional<ValueAndVReg> MaybeIdxVal =
2794       getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2795   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2796     return true;
2797   const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2798 
2799   if (IdxVal < VecTy.getNumElements()) {
2800     auto Unmerge = B.buildUnmerge(EltTy, Vec);
2801     B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2802   } else {
2803     B.buildUndef(Dst);
2804   }
2805 
2806   MI.eraseFromParent();
2807   return true;
2808 }
2809 
2810 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2811   MachineInstr &MI, MachineRegisterInfo &MRI,
2812   MachineIRBuilder &B) const {
2813   // TODO: Should move some of this into LegalizerHelper.
2814 
2815   // TODO: Promote dynamic indexing of s16 to s32
2816 
2817   Register Dst = MI.getOperand(0).getReg();
2818   Register Vec = MI.getOperand(1).getReg();
2819   Register Ins = MI.getOperand(2).getReg();
2820 
2821   LLT VecTy = MRI.getType(Vec);
2822   LLT EltTy = VecTy.getElementType();
2823   assert(EltTy == MRI.getType(Ins));
2824 
2825   // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2826   // but we can't go directly to that logic because you can't bitcast a vector
2827   // of pointers to a vector of integers. Therefore, make the pointer vector
2828   // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
2829   // new value, and then inttoptr the result vector back. This will then allow
2830   // the rest of legalization to take over.
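  //
  // This mirrors the extract case above, with an extra ptrtoint on the
  // inserted value (sketch only):
  //   %ivec = G_PTRTOINT %vec;  %iins = G_PTRTOINT %ins
  //   %ires = G_INSERT_VECTOR_ELT %ivec, %iins, %idx
  //   %dst  = G_INTTOPTR %ires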
2831   if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2832     LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2833     LLT IntVecTy = VecTy.changeElementType(IntTy);
2834 
2835     auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
2836     auto IntIns = B.buildPtrToInt(IntTy, Ins);
2837     auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
2838                                                  MI.getOperand(3));
2839     B.buildIntToPtr(Dst, IntVecDest);
2840     MI.eraseFromParent();
2841     return true;
2842   }
2843 
2844   // FIXME: Artifact combiner probably should have replaced the truncated
2845   // constant before this, so we shouldn't need
2846   // getIConstantVRegValWithLookThrough.
2847   std::optional<ValueAndVReg> MaybeIdxVal =
2848       getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2849   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2850     return true;
2851 
2852   const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
2853 
2854   unsigned NumElts = VecTy.getNumElements();
2855   if (IdxVal < NumElts) {
2856     SmallVector<Register, 8> SrcRegs;
2857     for (unsigned i = 0; i < NumElts; ++i)
2858       SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2859     B.buildUnmerge(SrcRegs, Vec);
2860 
2861     SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2862     B.buildMergeLikeInstr(Dst, SrcRegs);
2863   } else {
2864     B.buildUndef(Dst);
2865   }
2866 
2867   MI.eraseFromParent();
2868   return true;
2869 }
2870 
2871 bool AMDGPULegalizerInfo::legalizeSinCos(
2872   MachineInstr &MI, MachineRegisterInfo &MRI,
2873   MachineIRBuilder &B) const {
2874 
2875   Register DstReg = MI.getOperand(0).getReg();
2876   Register SrcReg = MI.getOperand(1).getReg();
2877   LLT Ty = MRI.getType(DstReg);
2878   unsigned Flags = MI.getFlags();
2879 
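  // Lowering sketch: the hardware sin/cos expect an input normalized from
  // radians into revolutions, so the source is multiplied by 1/(2*pi); on
  // subtargets with a reduced trig input range it is additionally wrapped
  // into [0, 1) with amdgcn.fract before feeding amdgcn.sin/amdgcn.cos.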
2880   Register TrigVal;
2881   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2882   if (ST.hasTrigReducedRange()) {
2883     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2884     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
2885                   .addUse(MulVal.getReg(0))
2886                   .setMIFlags(Flags)
2887                   .getReg(0);
2888   } else
2889     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2890 
2891   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2892     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2893   B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg))
2894       .addUse(TrigVal)
2895       .setMIFlags(Flags);
2896   MI.eraseFromParent();
2897   return true;
2898 }
2899 
2900 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2901                                                   MachineIRBuilder &B,
2902                                                   const GlobalValue *GV,
2903                                                   int64_t Offset,
2904                                                   unsigned GAFlags) const {
2905   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2906   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2907   // to the following code sequence:
2908   //
2909   // For constant address space:
2910   //   s_getpc_b64 s[0:1]
2911   //   s_add_u32 s0, s0, $symbol
2912   //   s_addc_u32 s1, s1, 0
2913   //
2914   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2915   //   a fixup or relocation is emitted to replace $symbol with a literal
2916   //   constant, which is a pc-relative offset from the encoding of the $symbol
2917   //   operand to the global variable.
2918   //
2919   // For global address space:
2920   //   s_getpc_b64 s[0:1]
2921   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2922   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2923   //
2924   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2925   //   fixups or relocations are emitted to replace $symbol@*@lo and
2926   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2927   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2928   //   operand to the global variable.
2929 
2930   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2931 
2932   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2933     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2934 
2935   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2936     .addDef(PCReg);
2937 
2938   MIB.addGlobalAddress(GV, Offset, GAFlags);
2939   if (GAFlags == SIInstrInfo::MO_NONE)
2940     MIB.addImm(0);
2941   else
2942     MIB.addGlobalAddress(GV, Offset, GAFlags + 1);
2943 
2944   if (!B.getMRI()->getRegClassOrNull(PCReg))
2945     B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2946 
2947   if (PtrTy.getSizeInBits() == 32)
2948     B.buildExtract(DstReg, PCReg, 0);
2949   return true;
2950 }
2951 
2952 // Emit an ABS32_LO / ABS32_HI relocation stub.
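//
// For a 64-bit pointer this produces, schematically (SGPR names are only
// illustrative; virtual registers are used below):
//   s_mov_b32 s_lo, sym@abs32@lo
//   s_mov_b32 s_hi, sym@abs32@hi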
2953 void AMDGPULegalizerInfo::buildAbsGlobalAddress(
2954     Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
2955     MachineRegisterInfo &MRI) const {
2956   bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;
2957 
2958   LLT S32 = LLT::scalar(32);
2959 
2960   // Use the destination directly if and only if we only store the lower
2961   // address part and no register class has been set.
2962   Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
2963                         ? DstReg
2964                         : MRI.createGenericVirtualRegister(S32);
2965 
2966   if (!MRI.getRegClassOrNull(AddrLo))
2967     MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
2968 
2969   // Write the lower half.
2970   B.buildInstr(AMDGPU::S_MOV_B32)
2971       .addDef(AddrLo)
2972       .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
2973 
2974   // If required, write the upper half as well.
2975   if (RequiresHighHalf) {
2976     assert(PtrTy.getSizeInBits() == 64 &&
2977            "Must provide a 64-bit pointer type!");
2978 
2979     Register AddrHi = MRI.createGenericVirtualRegister(S32);
2980     MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
2981 
2982     B.buildInstr(AMDGPU::S_MOV_B32)
2983         .addDef(AddrHi)
2984         .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_HI);
2985 
2986     // Use the destination directly, if and only if we don't have a register
2987     // class being set.
2988     Register AddrDst = !MRI.getRegClassOrNull(DstReg)
2989                            ? DstReg
2990                            : MRI.createGenericVirtualRegister(LLT::scalar(64));
2991 
2992     if (!MRI.getRegClassOrNull(AddrDst))
2993       MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
2994 
2995     B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
2996 
2997     // If we created a new register for the destination, cast the result into
2998     // the final output.
2999     if (AddrDst != DstReg)
3000       B.buildCast(DstReg, AddrDst);
3001   } else if (AddrLo != DstReg) {
3002     // If we created a new register for the destination, cast the result into
3003     // the final output.
3004     B.buildCast(DstReg, AddrLo);
3005   }
3006 }
3007 
3008 bool AMDGPULegalizerInfo::legalizeGlobalValue(
3009   MachineInstr &MI, MachineRegisterInfo &MRI,
3010   MachineIRBuilder &B) const {
3011   Register DstReg = MI.getOperand(0).getReg();
3012   LLT Ty = MRI.getType(DstReg);
3013   unsigned AS = Ty.getAddressSpace();
3014 
3015   const GlobalValue *GV = MI.getOperand(1).getGlobal();
3016   MachineFunction &MF = B.getMF();
3017   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
3018 
3019   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
3020     if (!MFI->isModuleEntryFunction() &&
3021         GV->getName() != "llvm.amdgcn.module.lds" &&
3022         !AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV))) {
3023       const Function &Fn = MF.getFunction();
3024       Fn.getContext().diagnose(DiagnosticInfoUnsupported(
3025           Fn, "local memory global used by non-kernel function",
3026           MI.getDebugLoc(), DS_Warning));
3027 
3028       // We currently don't have a way to correctly allocate LDS objects that
3029       // aren't directly associated with a kernel. We do force inlining of
3030       // functions that use local objects. However, if these dead functions are
3031       // not eliminated, we don't want a compile time error. Just emit a warning
3032       // and a trap, since there should be no callable path here.
3033       B.buildTrap();
3034       B.buildUndef(DstReg);
3035       MI.eraseFromParent();
3036       return true;
3037     }
3038 
3039     // TODO: We could emit code to handle the initialization somewhere.
3040     // We ignore the initializer for now and legalize it to allow selection.
3041     // The initializer is diagnosed during assembly emission anyway.
3042     const SITargetLowering *TLI = ST.getTargetLowering();
3043     if (!TLI->shouldUseLDSConstAddress(GV)) {
3044       MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
3045       return true; // Leave in place.
3046     }
3047 
3048     if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
3049       Type *Ty = GV->getValueType();
3050       // HIP uses an unsized array `extern __shared__ T s[]`, or a similar
3051       // zero-sized type in other languages, to declare dynamic shared
3052       // memory whose size is not known at compile time. These arrays are
3053       // allocated by the runtime and placed directly after the statically
3054       // allocated ones, so they all share the same offset.
3055       if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
3056         // Adjust alignment for that dynamic shared memory array.
3057         MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV));
3058         LLT S32 = LLT::scalar(32);
3059         auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
3060         B.buildIntToPtr(DstReg, Sz);
3061         MI.eraseFromParent();
3062         return true;
3063       }
3064     }
3065 
3066     B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
3067                                                    *cast<GlobalVariable>(GV)));
3068     MI.eraseFromParent();
3069     return true;
3070   }
3071 
3072   if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
3073     buildAbsGlobalAddress(DstReg, Ty, B, GV, MRI);
3074     MI.eraseFromParent();
3075     return true;
3076   }
3077 
3078   const SITargetLowering *TLI = ST.getTargetLowering();
3079 
3080   if (TLI->shouldEmitFixup(GV)) {
3081     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
3082     MI.eraseFromParent();
3083     return true;
3084   }
3085 
3086   if (TLI->shouldEmitPCReloc(GV)) {
3087     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
3088     MI.eraseFromParent();
3089     return true;
3090   }
3091 
3092   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
3093   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
3094 
3095   LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
3096   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
3097       MachinePointerInfo::getGOT(MF),
3098       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
3099           MachineMemOperand::MOInvariant,
3100       LoadTy, Align(8));
3101 
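  // Otherwise go through the GOT: materialize the pc-relative address of
  // the GOT slot, then load the global's real address from that slot as an
  // invariant, dereferenceable load from the constant address space.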
3102   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
3103 
3104   if (Ty.getSizeInBits() == 32) {
3105     // Truncate if this is a 32-bit constant address.
3106     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
3107     B.buildExtract(DstReg, Load, 0);
3108   } else
3109     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
3110 
3111   MI.eraseFromParent();
3112   return true;
3113 }
3114 
3115 static LLT widenToNextPowerOf2(LLT Ty) {
3116   if (Ty.isVector())
3117     return Ty.changeElementCount(
3118         ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
3119   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
3120 }
3121 
3122 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
3123                                        MachineInstr &MI) const {
3124   MachineIRBuilder &B = Helper.MIRBuilder;
3125   MachineRegisterInfo &MRI = *B.getMRI();
3126   GISelChangeObserver &Observer = Helper.Observer;
3127 
3128   Register PtrReg = MI.getOperand(1).getReg();
3129   LLT PtrTy = MRI.getType(PtrReg);
3130   unsigned AddrSpace = PtrTy.getAddressSpace();
3131 
3132   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
3133     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
3134     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
3135     Observer.changingInstr(MI);
3136     MI.getOperand(1).setReg(Cast.getReg(0));
3137     Observer.changedInstr(MI);
3138     return true;
3139   }
3140 
3141   if (MI.getOpcode() != AMDGPU::G_LOAD)
3142     return false;
3143 
3144   Register ValReg = MI.getOperand(0).getReg();
3145   LLT ValTy = MRI.getType(ValReg);
3146 
3147   if (hasBufferRsrcWorkaround(ValTy)) {
3148     Observer.changingInstr(MI);
3149     castBufferRsrcFromV4I32(MI, B, MRI, 0);
3150     Observer.changedInstr(MI);
3151     return true;
3152   }
3153 
3154   MachineMemOperand *MMO = *MI.memoperands_begin();
3155   const unsigned ValSize = ValTy.getSizeInBits();
3156   const LLT MemTy = MMO->getMemoryType();
3157   const Align MemAlign = MMO->getAlign();
3158   const unsigned MemSize = MemTy.getSizeInBits();
3159   const uint64_t AlignInBits = 8 * MemAlign.value();
3160 
3161   // Widen non-power-of-2 loads to the alignment if needed
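  // (e.g., assuming sufficient alignment, an s96 load can be widened to an
  // s128 load; the branches below then drop the extra 32 bits again)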
3162   if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
3163     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
3164 
3165     // This was already the correct extending load result type, so just adjust
3166     // the memory type.
3167     if (WideMemSize == ValSize) {
3168       MachineFunction &MF = B.getMF();
3169 
3170       MachineMemOperand *WideMMO =
3171           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
3172       Observer.changingInstr(MI);
3173       MI.setMemRefs(MF, {WideMMO});
3174       Observer.changedInstr(MI);
3175       return true;
3176     }
3177 
3178     // Don't bother with an edge case that should probably never be produced.
3179     if (ValSize > WideMemSize)
3180       return false;
3181 
3182     LLT WideTy = widenToNextPowerOf2(ValTy);
3183 
3184     Register WideLoad;
3185     if (!WideTy.isVector()) {
3186       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3187       B.buildTrunc(ValReg, WideLoad).getReg(0);
3188     } else {
3189       // Extract the subvector.
3190 
3191       if (isRegisterType(ST, ValTy)) {
3192         // If this is a case where G_EXTRACT is legal, use it.
3193         // (e.g. <3 x s32> -> <4 x s32>)
3194         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3195         B.buildExtract(ValReg, WideLoad, 0);
3196       } else {
3197         // For cases where the widened type isn't a nice register value, unmerge
3198         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
3199         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3200         B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
3201       }
3202     }
3203 
3204     MI.eraseFromParent();
3205     return true;
3206   }
3207 
3208   return false;
3209 }
3210 
3211 bool AMDGPULegalizerInfo::legalizeStore(LegalizerHelper &Helper,
3212                                         MachineInstr &MI) const {
3213   MachineIRBuilder &B = Helper.MIRBuilder;
3214   MachineRegisterInfo &MRI = *B.getMRI();
3215   GISelChangeObserver &Observer = Helper.Observer;
3216 
3217   Register DataReg = MI.getOperand(0).getReg();
3218   LLT DataTy = MRI.getType(DataReg);
3219 
3220   if (hasBufferRsrcWorkaround(DataTy)) {
3221     Observer.changingInstr(MI);
3222     castBufferRsrcArgToV4I32(MI, B, 0);
3223     Observer.changedInstr(MI);
3224     return true;
3225   }
3226   return false;
3227 }
3228 
3229 bool AMDGPULegalizerInfo::legalizeFMad(
3230   MachineInstr &MI, MachineRegisterInfo &MRI,
3231   MachineIRBuilder &B) const {
3232   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3233   assert(Ty.isScalar());
3234 
3235   MachineFunction &MF = B.getMF();
3236   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
3237 
3238   // TODO: Always legal with future ftz flag.
3239   // FIXME: Do we need just output?
3240   if (Ty == LLT::float32() &&
3241       MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign())
3242     return true;
3243   if (Ty == LLT::float16() &&
3244       MFI->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign())
3245     return true;
3246 
3247   MachineIRBuilder HelperBuilder(MI);
3248   GISelObserverWrapper DummyObserver;
3249   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
3250   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
3251 }
3252 
3253 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
3254   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3255   Register DstReg = MI.getOperand(0).getReg();
3256   Register PtrReg = MI.getOperand(1).getReg();
3257   Register CmpVal = MI.getOperand(2).getReg();
3258   Register NewVal = MI.getOperand(3).getReg();
3259 
3260   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
3261          "this should not have been custom lowered");
3262 
3263   LLT ValTy = MRI.getType(CmpVal);
3264   LLT VecTy = LLT::fixed_vector(2, ValTy);
3265 
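  // The target cmpxchg consumes the new and compare values as one packed
  // vector, e.g. for s32 operands (sketch):
  //   %packed:<2 x s32> = G_BUILD_VECTOR %newval, %cmpval
  //   %dst:s32 = G_AMDGPU_ATOMIC_CMPXCHG %ptr, %packed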
3266   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
3267 
3268   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3269     .addDef(DstReg)
3270     .addUse(PtrReg)
3271     .addUse(PackedVal)
3272     .setMemRefs(MI.memoperands());
3273 
3274   MI.eraseFromParent();
3275   return true;
3276 }
3277 
3278 /// Return true if it's known that \p Src can never be an f32 denormal value.
3279 static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI,
3280                                        Register Src) {
3281   const MachineInstr *DefMI = MRI.getVRegDef(Src);
3282   switch (DefMI->getOpcode()) {
3283   case TargetOpcode::G_INTRINSIC: {
3284     switch (cast<GIntrinsic>(DefMI)->getIntrinsicID()) {
3285     case Intrinsic::amdgcn_frexp_mant:
3286       return true;
3287     default:
3288       break;
3289     }
3290 
3291     break;
3292   }
3293   case TargetOpcode::G_FFREXP: {
3294     if (DefMI->getOperand(0).getReg() == Src)
3295       return true;
3296     break;
3297   }
3298   case TargetOpcode::G_FPEXT: {
3299     return MRI.getType(DefMI->getOperand(1).getReg()) == LLT::scalar(16);
3300   }
3301   default:
3302     return false;
3303   }
3304 
3305   return false;
3306 }
3307 
3308 static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
3309   if (Flags & MachineInstr::FmAfn)
3310     return true;
3311   const auto &Options = MF.getTarget().Options;
3312   return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
3313 }
3314 
3315 static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src,
3316                                    unsigned Flags) {
3317   return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src) &&
3318          MF.getDenormalMode(APFloat::IEEEsingle()).Input !=
3319              DenormalMode::PreserveSign;
3320 }
3321 
3322 std::pair<Register, Register>
3323 AMDGPULegalizerInfo::getScaledLogInput(MachineIRBuilder &B, Register Src,
3324                                        unsigned Flags) const {
3325   if (!needsDenormHandlingF32(B.getMF(), Src, Flags))
3326     return {};
3327 
3328   const LLT F32 = LLT::scalar(32);
3329   auto SmallestNormal = B.buildFConstant(
3330       F32, APFloat::getSmallestNormalized(APFloat::IEEEsingle()));
3331   auto IsLtSmallestNormal =
3332       B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal);
3333 
3334   auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
3335   auto One = B.buildFConstant(F32, 1.0);
3336   auto ScaleFactor =
3337       B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
3338   auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);
3339 
3340   return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
3341 }
3342 
3343 bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI,
3344                                         MachineIRBuilder &B) const {
3345   // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
3346   // If we have to handle denormals, scale up the input and adjust the result.
3347 
3348   // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
3349   // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
3350 
3351   Register Dst = MI.getOperand(0).getReg();
3352   Register Src = MI.getOperand(1).getReg();
3353   LLT Ty = B.getMRI()->getType(Dst);
3354   unsigned Flags = MI.getFlags();
3355 
3356   if (Ty == LLT::scalar(16)) {
3357     const LLT F32 = LLT::scalar(32);
3358     // Nothing in half is a denormal when promoted to f32.
3359     auto Ext = B.buildFPExt(F32, Src, Flags);
3360     auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
3361                     .addUse(Ext.getReg(0))
3362                     .setMIFlags(Flags);
3363     B.buildFPTrunc(Dst, Log2, Flags);
3364     MI.eraseFromParent();
3365     return true;
3366   }
3367 
3368   assert(Ty == LLT::scalar(32));
3369 
3370   auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
3371   if (!ScaledInput) {
3372     B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
3373         .addUse(Src)
3374         .setMIFlags(Flags);
3375     MI.eraseFromParent();
3376     return true;
3377   }
3378 
3379   auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3380                   .addUse(ScaledInput)
3381                   .setMIFlags(Flags);
3382 
3383   auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
3384   auto Zero = B.buildFConstant(Ty, 0.0);
3385   auto ResultOffset =
3386       B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
3387   B.buildFSub(Dst, Log2, ResultOffset, Flags);
3388 
3389   MI.eraseFromParent();
3390   return true;
3391 }
3392 
3393 static Register getMad(MachineIRBuilder &B, LLT Ty, Register X, Register Y,
3394                        Register Z, unsigned Flags) {
3395   auto FMul = B.buildFMul(Ty, X, Y, Flags);
3396   return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);
3397 }
3398 
3399 bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
3400                                              MachineIRBuilder &B) const {
3401   const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
3402   assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);
3403 
3404   MachineRegisterInfo &MRI = *B.getMRI();
3405   Register Dst = MI.getOperand(0).getReg();
3406   Register X = MI.getOperand(1).getReg();
3407   unsigned Flags = MI.getFlags();
3408   const LLT Ty = MRI.getType(X);
3409   MachineFunction &MF = B.getMF();
3410 
3411   const LLT F32 = LLT::scalar(32);
3412   const LLT F16 = LLT::scalar(16);
3413 
3414   const AMDGPUTargetMachine &TM =
3415       static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
3416 
3417   if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn) ||
3418       TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) {
3419     if (Ty == F16 && !ST.has16BitInsts()) {
3420       Register LogVal = MRI.createGenericVirtualRegister(F32);
3421       auto PromoteSrc = B.buildFPExt(F32, X);
3422       legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), IsLog10, Flags);
3423       B.buildFPTrunc(Dst, LogVal);
3424     } else {
3425       legalizeFlogUnsafe(B, Dst, X, IsLog10, Flags);
3426     }
3427 
3428     MI.eraseFromParent();
3429     return true;
3430   }
3431 
3432   auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags);
3433   if (ScaledInput)
3434     X = ScaledInput;
3435 
3436   auto Y =
3437       B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);
3438 
3439   Register R;
3440   if (ST.hasFastFMAF32()) {
3441     // c + cc is ln(2)/ln(10) to more than 49 bits
3442     const float c_log10 = 0x1.344134p-2f;
3443     const float cc_log10 = 0x1.09f79ep-26f;
3444 
3445     // c + cc is ln(2) to more than 49 bits
3446     const float c_log = 0x1.62e42ep-1f;
3447     const float cc_log = 0x1.efa39ep-25f;
3448 
3449     auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
3450     auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
3451 
3452     R = B.buildFMul(Ty, Y, C, Flags).getReg(0);
3453     auto NegR = B.buildFNeg(Ty, R, Flags);
3454     auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags);
3455     auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags);
3456     R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0);
3457   } else {
3458     // ch+ct is ln(2)/ln(10) to more than 36 bits
3459     const float ch_log10 = 0x1.344000p-2f;
3460     const float ct_log10 = 0x1.3509f6p-18f;
3461 
3462     // ch + ct is ln(2) to more than 36 bits
3463     const float ch_log = 0x1.62e000p-1f;
3464     const float ct_log = 0x1.0bfbe8p-15f;
3465 
3466     auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
3467     auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
3468 
3469     auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3470     auto YH = B.buildAnd(Ty, Y, MaskConst);
3471     auto YT = B.buildFSub(Ty, Y, YH, Flags);
3472     auto YTCT = B.buildFMul(Ty, YT, CT, Flags);
3473 
3474     Register Mad0 =
3475         getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags);
3476     Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, Flags);
3477     R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags);
3478   }
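  // Both paths above compute R ~= Y * (c + cc), i.e. log2(x) rescaled to
  // ln or log10, splitting the constant into high and low parts to recover
  // precision a single f32 multiply would lose.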
3479 
3480   const bool IsFiniteOnly =
3481       (MI.getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) &&
3482       (MI.getFlag(MachineInstr::FmNoInfs) || TM.Options.NoInfsFPMath);
3483 
3484   if (!IsFiniteOnly) {
3485     // Expand isfinite(x) => fabs(x) < inf
3486     auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3487     auto Fabs = B.buildFAbs(Ty, Y);
3488     auto IsFinite =
3489         B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
3490     R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);
3491   }
3492 
3493   if (ScaledInput) {
3494     auto Zero = B.buildFConstant(Ty, 0.0);
3495     auto ShiftK =
3496         B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3497     auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
3498     B.buildFSub(Dst, R, Shift, Flags);
3499   } else {
3500     B.buildCopy(Dst, R);
3501   }
3502 
3503   MI.eraseFromParent();
3504   return true;
3505 }
3506 
3507 bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst,
3508                                              Register Src, bool IsLog10,
3509                                              unsigned Flags) const {
3510   const double Log2BaseInverted =
3511       IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
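  // Base-change identity (sketch of the math behind Log2BaseInverted):
  //   log(x)   = log2(x) * ln(2)
  //   log10(x) = log2(x) * (ln(2) / ln(10))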
3512 
3513   LLT Ty = B.getMRI()->getType(Dst);
3514 
3515   if (Ty == LLT::scalar(32)) {
3516     auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
3517     if (ScaledInput) {
3518       auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3519                         .addUse(ScaledInput)
3520                         .setMIFlags(Flags);
3521       auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
3522       auto Zero = B.buildFConstant(Ty, 0.0);
3523       auto ResultOffset =
3524           B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
3525       auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);
3526 
3527       if (ST.hasFastFMAF32())
3528         B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
3529       else {
3530         auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
3531         B.buildFAdd(Dst, Mul, ResultOffset, Flags);
3532       }
3533 
3534       return true;
3535     }
3536   }
3537 
3538   auto Log2Operand = Ty == LLT::scalar(16)
3539                          ? B.buildFLog2(Ty, Src, Flags)
3540                          : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3541                                .addUse(Src)
3542                                .setMIFlags(Flags);
3543   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
3544   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
3545   return true;
3546 }
3547 
3548 bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI,
3549                                         MachineIRBuilder &B) const {
3550   // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3551   // If we have to handle denormals, scale up the input and adjust the result.
3552 
3553   Register Dst = MI.getOperand(0).getReg();
3554   Register Src = MI.getOperand(1).getReg();
3555   unsigned Flags = MI.getFlags();
3556   LLT Ty = B.getMRI()->getType(Dst);
3557   const LLT F16 = LLT::scalar(16);
3558   const LLT F32 = LLT::scalar(32);
3559 
3560   if (Ty == F16) {
3561     // Nothing in half is a denormal when promoted to f32.
3562     auto Ext = B.buildFPExt(F32, Src, Flags);
3563     auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
3564                     .addUse(Ext.getReg(0))
3565                     .setMIFlags(Flags);
3566     B.buildFPTrunc(Dst, Exp2, Flags);
3567     MI.eraseFromParent();
3568     return true;
3569   }
3570 
3571   assert(Ty == F32);
3572 
3573   if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) {
3574     B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3575         .addUse(Src)
3576         .setMIFlags(Flags);
3577     MI.eraseFromParent();
3578     return true;
3579   }
3580 
3581   // bool needs_scaling = x < -0x1.f80000p+6f;
3582   // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
3583 
3584   // -126.0: for x below this, exp2(x) would be an f32 denormal.
3585   auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
3586   auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src,
3587                                   RangeCheckConst, Flags);
3588 
3589   auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
3590   auto Zero = B.buildFConstant(Ty, 0.0);
3591   auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
3592   auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);
3593 
3594   auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3595                   .addUse(AddInput.getReg(0))
3596                   .setMIFlags(Flags);
3597 
3598   auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
3599   auto One = B.buildFConstant(Ty, 1.0);
3600   auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
3601   B.buildFMul(Dst, Exp2, ResultScale, Flags);
3602   MI.eraseFromParent();
3603   return true;
3604 }
3605 
3606 bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
3607                                              Register X, unsigned Flags) const {
3608   LLT Ty = B.getMRI()->getType(Dst);
3609   LLT F32 = LLT::scalar(32);
3610 
3611   if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
3612     auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3613     auto Mul = B.buildFMul(Ty, X, Log2E, Flags);
3614 
3615     if (Ty == F32) {
3616       B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3617         .addUse(Mul.getReg(0))
3618         .setMIFlags(Flags);
3619     } else {
3620       B.buildFExp2(Dst, Mul.getReg(0), Flags);
3621     }
3622 
3623     return true;
3624   }
3625 
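  // Scaling sketch for inputs whose result would be an f32 denormal:
  //   e^x = exp2((x + 64) * log2(e)) * e^-64
  // where the threshold below is roughly ln(2^-126) and e^-64 is the
  // 0x1.969d48p-93f factor applied at the end.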
3626   auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
3627   auto NeedsScaling =
3628       B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags);
3629   auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
3630   auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
3631   auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);
3632 
3633   auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3634   auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);
3635 
3636   auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3637     .addUse(ExpInput.getReg(0))
3638     .setMIFlags(Flags);
3639 
3640   auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
3641   auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
3642   B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
3643   return true;
3644 }
3645 
3646 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
3647                                        MachineIRBuilder &B) const {
3648   Register Dst = MI.getOperand(0).getReg();
3649   Register X = MI.getOperand(1).getReg();
3650   const unsigned Flags = MI.getFlags();
3651   MachineFunction &MF = B.getMF();
3652   MachineRegisterInfo &MRI = *B.getMRI();
3653   LLT Ty = MRI.getType(Dst);
3654   const LLT F16 = LLT::scalar(16);
3655   const LLT F32 = LLT::scalar(32);
3656   const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
3657 
3658   if (Ty == F16) {
3659     // v_exp_f16 (fmul x, log2e)
3660     if (allowApproxFunc(MF, Flags)) {
3661       // TODO: Does this really require fast?
3662       legalizeFExpUnsafe(B, Dst, X, Flags);
3663       MI.eraseFromParent();
3664       return true;
3665     }
3666 
3667     // exp(f16 x) ->
3668     //   fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3669 
3670     // Nothing in half is a denormal when promoted to f32.
3671     auto Ext = B.buildFPExt(F32, X, Flags);
3672     Register Lowered = MRI.createGenericVirtualRegister(F32);
3673     legalizeFExpUnsafe(B, Lowered, Ext.getReg(0), Flags);
3674     B.buildFPTrunc(Dst, Lowered, Flags);
3675     MI.eraseFromParent();
3676     return true;
3677   }
3678 
3679   assert(Ty == F32);
3680 
3681   // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3682   // library behavior. Also, is known-not-daz source sufficient?
3683   if (allowApproxFunc(MF, Flags)) {
3684     legalizeFExpUnsafe(B, Dst, X, Flags);
3685     MI.eraseFromParent();
3686     return true;
3687   }
3688 
3689   //    Algorithm:
3690   //
3691   //    e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3692   //
3693   //    x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3694   //    n = 64*m + j,   0 <= j < 64
3695   //
3696   //    e^x = 2^((64*m + j + f)/64)
3697   //        = (2^m) * (2^(j/64)) * 2^(f/64)
3698   //        = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3699   //
3700   //    f = x*(64/ln(2)) - n
3701   //    r = f*(ln(2)/64) = x - n*(ln(2)/64)
3702   //
3703   //    e^x = (2^m) * (2^(j/64)) * e^r
3704   //
3705   //    (2^(j/64)) is precomputed
3706   //
3707   //    e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3708   //    e^r = 1 + q
3709   //
3710   //    q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3711   //
3712   //    e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
3713   const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
3714   Register PH, PL;
3715 
3716   if (ST.hasFastFMAF32()) {
3717     const float c_exp = numbers::log2ef;
3718     const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3719     const float c_exp10 = 0x1.a934f0p+1f;
3720     const float cc_exp10 = 0x1.2f346ep-24f;
3721 
3722     auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
3723     PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
3724     auto NegPH = B.buildFNeg(Ty, PH, Flags);
3725     auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);
3726 
3727     auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
3728     PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);
3729   } else {
3730     const float ch_exp = 0x1.714000p+0f;
3731     const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3732 
3733     const float ch_exp10 = 0x1.a92000p+1f;
3734     const float cl_exp10 = 0x1.4f0978p-11f;
3735 
3736     auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3737     auto XH = B.buildAnd(Ty, X, MaskConst);
3738     auto XL = B.buildFSub(Ty, X, XH, Flags);
3739 
3740     auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
3741     PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);
3742 
3743     auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
3744     auto XLCL = B.buildFMul(Ty, XL, CL, Flags);
3745 
3746     Register Mad0 =
3747         getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
3748     PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
3749   }
3750 
3751   auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);
3752 
3753   // It is unsafe to contract this fsub into the PH multiply.
3754   auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
3755   auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);
3756   auto IntE = B.buildFPTOSI(LLT::scalar(32), E);
3757 
3758   auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3759                   .addUse(A.getReg(0))
3760                   .setMIFlags(Flags);
3761   auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);
3762 
3763   auto UnderflowCheckConst =
3764       B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
3765   auto Zero = B.buildFConstant(Ty, 0.0);
3766   auto Underflow =
3767       B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst);
3768 
3769   R = B.buildSelect(Ty, Underflow, Zero, R);
3770 
3771   const auto &Options = MF.getTarget().Options;
3772 
3773   if (!(Flags & MachineInstr::FmNoInfs) && !Options.NoInfsFPMath) {
3774     auto OverflowCheckConst =
3775         B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
3776 
3777     auto Overflow =
3778         B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst);
3779     auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3780     R = B.buildSelect(Ty, Overflow, Inf, R, Flags);
3781   }
3782 
3783   B.buildCopy(Dst, R);
3784   MI.eraseFromParent();
3785   return true;
3786 }
3787 
3788 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
3789                                        MachineIRBuilder &B) const {
3790   Register Dst = MI.getOperand(0).getReg();
3791   Register Src0 = MI.getOperand(1).getReg();
3792   Register Src1 = MI.getOperand(2).getReg();
3793   unsigned Flags = MI.getFlags();
3794   LLT Ty = B.getMRI()->getType(Dst);
3795   const LLT F16 = LLT::float16();
3796   const LLT F32 = LLT::float32();
3797 
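  // pow(x, y) is expanded as exp2(y * log2(x)). The multiply uses
  // amdgcn.fmul_legacy, whose legacy 0 * anything == 0 semantics keep the
  // product well-defined when y == 0 and log2(x) is an infinity or NaN.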
3798   if (Ty == F32) {
3799     auto Log = B.buildFLog2(F32, Src0, Flags);
3800     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3801                    .addUse(Log.getReg(0))
3802                    .addUse(Src1)
3803                    .setMIFlags(Flags);
3804     B.buildFExp2(Dst, Mul, Flags);
3805   } else if (Ty == F16) {
3806     // There's no f16 fmul_legacy, so we need to convert for it.
3807     auto Log = B.buildFLog2(F16, Src0, Flags);
3808     auto Ext0 = B.buildFPExt(F32, Log, Flags);
3809     auto Ext1 = B.buildFPExt(F32, Src1, Flags);
3810     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
3811                    .addUse(Ext0.getReg(0))
3812                    .addUse(Ext1.getReg(0))
3813                    .setMIFlags(Flags);
3814     B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);
3815   } else
3816     return false;
3817 
3818   MI.eraseFromParent();
3819   return true;
3820 }
3821 
3822 // Find a source register, ignoring any possible source modifiers.
3823 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
3824   Register ModSrc = OrigSrc;
3825   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
3826     ModSrc = SrcFNeg->getOperand(1).getReg();
3827     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3828       ModSrc = SrcFAbs->getOperand(1).getReg();
3829   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
3830     ModSrc = SrcFAbs->getOperand(1).getReg();
3831   return ModSrc;
3832 }
3833 
3834 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
3835                                          MachineRegisterInfo &MRI,
3836                                          MachineIRBuilder &B) const {
3837 
3838   const LLT S1 = LLT::scalar(1);
3839   const LLT F64 = LLT::float64();
3840   Register Dst = MI.getOperand(0).getReg();
3841   Register OrigSrc = MI.getOperand(1).getReg();
3842   unsigned Flags = MI.getFlags();
3843   assert(ST.hasFractBug() && MRI.getType(Dst) == F64 &&
3844          "this should not have been custom lowered");
3845 
3846   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
3847   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
3848   // efficient way to implement it is using V_FRACT_F64. The workaround for the
3849   // V_FRACT bug is:
3850   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
3851   //
3852   // Convert floor(x) to (x - fract(x))
3853 
3854   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})
3855                    .addUse(OrigSrc)
3856                    .setMIFlags(Flags);
3857 
3858   // Give source modifier matching some assistance before obscuring a foldable
3859   // pattern.
3860 
3861   // TODO: We can avoid the neg on the fract? The input sign to fract
3862   // shouldn't matter?
3863   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
3864 
3865   auto Const =
3866       B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff));
3867 
3868   Register Min = MRI.createGenericVirtualRegister(F64);
3869 
3870   // We don't need to concern ourselves with the snan handling difference, so
3871   // use the one which will directly select.
3872   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3873   if (MFI->getMode().IEEE)
3874     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
3875   else
3876     B.buildFMinNum(Min, Fract, Const, Flags);
3877 
3878   Register CorrectedFract = Min;
3879   if (!MI.getFlag(MachineInstr::FmNoNans)) {
3880     auto IsNan = B.buildFCmp(CmpInst::FCMP_UNO, S1, ModSrc, ModSrc, Flags);
3881     CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);
3882   }
3883 
3884   auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
3885   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
3886 
3887   MI.eraseFromParent();
3888   return true;
3889 }
3890 
3891 // Turn an illegal packed v2s16 build vector into bit operations.
3892 // TODO: This should probably be a bitcast action in LegalizerHelper.
3893 bool AMDGPULegalizerInfo::legalizeBuildVector(
3894   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3895   Register Dst = MI.getOperand(0).getReg();
3896   const LLT S32 = LLT::scalar(32);
3897   const LLT S16 = LLT::scalar(16);
3898   assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
3899 
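  // Sketch of the transform (vreg names illustrative):
  //   %dst:<2 x s16> = G_BUILD_VECTOR %a:s16, %b:s16
  // becomes
  //   %m:s32 = G_MERGE_VALUES %a:s16, %b:s16
  //   %dst   = G_BITCAST %m
  // (G_BUILD_VECTOR_TRUNC first truncates its s32 sources to s16).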
3900   Register Src0 = MI.getOperand(1).getReg();
3901   Register Src1 = MI.getOperand(2).getReg();
3902 
3903   if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
3904     assert(MRI.getType(Src0) == S32);
3905     Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
3906     Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
3907   }
3908 
3909   auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
3910   B.buildBitcast(Dst, Merge);
3911 
3912   MI.eraseFromParent();
3913   return true;
3914 }
3915 
3916 // Build a big integer multiply or multiply-add using MAD_64_32 instructions.
3917 //
3918 // Source and accumulation registers must all be 32-bits.
3919 //
3920 // TODO: When the multiply is uniform, we should produce a code sequence
3921 // that is better suited to instruction selection on the SALU. Instead of
3922 // the outer loop going over parts of the result, the outer loop should go
3923 // over parts of one of the factors. This should result in instruction
3924 // selection that makes full use of S_ADDC_U32 instructions.
3925 void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
3926                                         MutableArrayRef<Register> Accum,
3927                                         ArrayRef<Register> Src0,
3928                                         ArrayRef<Register> Src1,
3929                                         bool UsePartialMad64_32,
3930                                         bool SeparateOddAlignedProducts) const {
3931   // Use (possibly empty) vectors of S1 registers to represent the set of
3932   // carries from one pair of positions to the next.
3933   using Carry = SmallVector<Register, 2>;
3934 
3935   MachineIRBuilder &B = Helper.MIRBuilder;
3936   GISelValueTracking &VT = *Helper.getValueTracking();
3937 
3938   const LLT S1 = LLT::scalar(1);
3939   const LLT S32 = LLT::scalar(32);
3940   const LLT S64 = LLT::scalar(64);
3941 
3942   Register Zero32;
3943   Register Zero64;
3944 
3945   auto getZero32 = [&]() -> Register {
3946     if (!Zero32)
3947       Zero32 = B.buildConstant(S32, 0).getReg(0);
3948     return Zero32;
3949   };
3950   auto getZero64 = [&]() -> Register {
3951     if (!Zero64)
3952       Zero64 = B.buildConstant(S64, 0).getReg(0);
3953     return Zero64;
3954   };
3955 
3956   SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
3957   for (unsigned i = 0; i < Src0.size(); ++i) {
3958     Src0KnownZeros.push_back(VT.getKnownBits(Src0[i]).isZero());
3959     Src1KnownZeros.push_back(VT.getKnownBits(Src1[i]).isZero());
3960   }
3961 
3962   // Merge the given carries into the 32-bit LocalAccum, which is modified
3963   // in-place.
3964   //
3965   // Returns the carry-out, which is a single S1 register or null.
3966   auto mergeCarry =
3967       [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
3968         if (CarryIn.empty())
3969           return Register();
3970 
3971         bool HaveCarryOut = true;
3972         Register CarryAccum;
3973         if (CarryIn.size() == 1) {
3974           if (!LocalAccum) {
3975             LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
3976             return Register();
3977           }
3978 
3979           CarryAccum = getZero32();
3980         } else {
3981           CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
3982           for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
3983             CarryAccum =
3984                 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
3985                     .getReg(0);
3986           }
3987 
3988           if (!LocalAccum) {
3989             LocalAccum = getZero32();
3990             HaveCarryOut = false;
3991           }
3992         }
3993 
3994         auto Add =
3995             B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
3996         LocalAccum = Add.getReg(0);
3997         return HaveCarryOut ? Add.getReg(1) : Register();
3998       };
3999 
4000   // Build a multiply-add chain to compute
4001   //
4002   //   LocalAccum + (partial products at DstIndex)
4003   //       + (opportunistic subset of CarryIn)
4004   //
4005   // LocalAccum is an array of one or two 32-bit registers that are updated
4006   // in-place. The incoming registers may be null.
4007   //
4008   // In some edge cases, carry-ins can be consumed "for free". In that case,
4009   // the consumed carry bits are removed from CarryIn in-place.
4010   auto buildMadChain =
4011       [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
4012           -> Carry {
4013         assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
4014                (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
4015 
4016         Carry CarryOut;
4017         unsigned j0 = 0;
4018 
4019         // Use plain 32-bit multiplication for the most significant part of the
4020         // result by default.
4021         if (LocalAccum.size() == 1 &&
4022             (!UsePartialMad64_32 || !CarryIn.empty())) {
4023           do {
4024             // Skip multiplication if one of the operands is 0
4025             unsigned j1 = DstIndex - j0;
4026             if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4027               ++j0;
4028               continue;
4029             }
4030             auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
4031             if (!LocalAccum[0] || VT.getKnownBits(LocalAccum[0]).isZero()) {
4032               LocalAccum[0] = Mul.getReg(0);
4033             } else {
4034               if (CarryIn.empty()) {
4035                 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
4036               } else {
4037                 LocalAccum[0] =
4038                     B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
4039                         .getReg(0);
4040                 CarryIn.pop_back();
4041               }
4042             }
4043             ++j0;
4044           } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
4045         }
4046 
4047         // Build full 64-bit multiplies.
4048         if (j0 <= DstIndex) {
4049           bool HaveSmallAccum = false;
4050           Register Tmp;
4051 
4052           if (LocalAccum[0]) {
4053             if (LocalAccum.size() == 1) {
4054               Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
4055               HaveSmallAccum = true;
4056             } else if (LocalAccum[1]) {
4057               Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
4058               HaveSmallAccum = false;
4059             } else {
4060               Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
4061               HaveSmallAccum = true;
4062             }
4063           } else {
4064             assert(LocalAccum.size() == 1 || !LocalAccum[1]);
4065             Tmp = getZero64();
4066             HaveSmallAccum = true;
4067           }
4068 
4069           do {
4070             unsigned j1 = DstIndex - j0;
4071             if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4072               ++j0;
4073               continue;
4074             }
4075             auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
4076                                     {Src0[j0], Src1[j1], Tmp});
4077             Tmp = Mad.getReg(0);
4078             if (!HaveSmallAccum)
4079               CarryOut.push_back(Mad.getReg(1));
4080             HaveSmallAccum = false;
4081 
4082             ++j0;
4083           } while (j0 <= DstIndex);
4084 
4085           auto Unmerge = B.buildUnmerge(S32, Tmp);
4086           LocalAccum[0] = Unmerge.getReg(0);
4087           if (LocalAccum.size() > 1)
4088             LocalAccum[1] = Unmerge.getReg(1);
4089         }
4090 
4091         return CarryOut;
4092       };
4093 
4094   // Outer multiply loop, iterating over destination parts from least
4095   // significant to most significant parts.
4096   //
4097   // The columns of the following diagram correspond to the destination parts
4098   // affected by one iteration of the outer loop (ignoring boundary
4099   // conditions).
4100   //
4101   //   Dest index relative to 2 * i:      1 0 -1
4102   //                                      ------
4103   //   Carries from previous iteration:     e o
4104   //   Even-aligned partial product sum:  E E .
4105   //   Odd-aligned partial product sum:     O O
4106   //
4107   // 'o' is OddCarry, 'e' is EvenCarry.
4108   // EE and OO are computed from partial products via buildMadChain and use
4109   // accumulation where possible and appropriate.
4110   //
4111   Register SeparateOddCarry;
4112   Carry EvenCarry;
4113   Carry OddCarry;
4114 
4115   for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
4116     Carry OddCarryIn = std::move(OddCarry);
4117     Carry EvenCarryIn = std::move(EvenCarry);
4118     OddCarry.clear();
4119     EvenCarry.clear();
4120 
4121     // Partial products at offset 2 * i.
4122     if (2 * i < Accum.size()) {
4123       auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
4124       EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
4125     }
4126 
4127     // Partial products at offset 2 * i - 1.
4128     if (i > 0) {
4129       if (!SeparateOddAlignedProducts) {
4130         auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
4131         OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4132       } else {
4133         bool IsHighest = 2 * i >= Accum.size();
4134         Register SeparateOddOut[2];
4135         auto LocalAccum = MutableArrayRef(SeparateOddOut)
4136                               .take_front(IsHighest ? 1 : 2);
4137         OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4138 
4139         MachineInstr *Lo;
4140 
4141         if (i == 1) {
4142           if (!IsHighest)
4143             Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
4144           else
4145             Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
4146         } else {
4147           Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
4148                             SeparateOddCarry);
4149         }
4150         Accum[2 * i - 1] = Lo->getOperand(0).getReg();
4151 
4152         if (!IsHighest) {
4153           auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
4154                                 Lo->getOperand(1).getReg());
4155           Accum[2 * i] = Hi.getReg(0);
4156           SeparateOddCarry = Hi.getReg(1);
4157         }
4158       }
4159     }
4160 
4161     // Add in the carries from the previous iteration
4162     if (i > 0) {
4163       if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
4164         EvenCarryIn.push_back(CarryOut);
4165 
4166       if (2 * i < Accum.size()) {
4167         if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
4168           OddCarry.push_back(CarryOut);
4169       }
4170     }
4171   }
4172 }
4173 
4174 // Custom narrowing of wide multiplies using wide multiply-add instructions.
4175 //
4176 // TODO: If the multiply is followed by an addition, we should attempt to
4177 // integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
4178 bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
4179                                       MachineInstr &MI) const {
4180   assert(ST.hasMad64_32());
4181   assert(MI.getOpcode() == TargetOpcode::G_MUL);
4182 
4183   MachineIRBuilder &B = Helper.MIRBuilder;
4184   MachineRegisterInfo &MRI = *B.getMRI();
4185 
4186   Register DstReg = MI.getOperand(0).getReg();
4187   Register Src0 = MI.getOperand(1).getReg();
4188   Register Src1 = MI.getOperand(2).getReg();
4189 
4190   LLT Ty = MRI.getType(DstReg);
4191   assert(Ty.isScalar());
4192 
4193   unsigned Size = Ty.getSizeInBits();
4194   unsigned NumParts = Size / 32;
4195   assert((Size % 32) == 0);
4196   assert(NumParts >= 2);
4197 
4198   // Whether to use MAD_64_32 for partial products whose high half is
4199   // discarded. This avoids some ADD instructions but risks false dependency
4200   // stalls on some subtargets in some cases.
4201   const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
4202 
4203   // Whether to compute odd-aligned partial products separately. This is
4204   // advisable on subtargets where the accumulator of MAD_64_32 must be placed
4205   // in an even-aligned VGPR.
4206   const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
4207 
4208   LLT S32 = LLT::scalar(32);
4209   SmallVector<Register, 2> Src0Parts, Src1Parts;
4210   for (unsigned i = 0; i < NumParts; ++i) {
4211     Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
4212     Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
4213   }
4214   B.buildUnmerge(Src0Parts, Src0);
4215   B.buildUnmerge(Src1Parts, Src1);
4216 
4217   SmallVector<Register, 2> AccumRegs(NumParts);
4218   buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
4219                 SeparateOddAlignedProducts);
4220 
4221   B.buildMergeLikeInstr(DstReg, AccumRegs);
4222   MI.eraseFromParent();
4223   return true;
4224 }
4225 
4226 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
4227 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
4228 // case with a single min instruction instead of a compare+select.
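     //
     // For example, a 32-bit G_CTLZ becomes (a sketch):
     //   %ffbh = G_AMDGPU_FFBH_U32 %src   ; returns all ones when %src == 0
     //   %dst = G_UMIN %ffbh, 32          ; which the min then clamps to 32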
4229 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
4230                                             MachineRegisterInfo &MRI,
4231                                             MachineIRBuilder &B) const {
4232   Register Dst = MI.getOperand(0).getReg();
4233   Register Src = MI.getOperand(1).getReg();
4234   LLT DstTy = MRI.getType(Dst);
4235   LLT SrcTy = MRI.getType(Src);
4236 
4237   unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
4238                         ? AMDGPU::G_AMDGPU_FFBH_U32
4239                         : AMDGPU::G_AMDGPU_FFBL_B32;
4240   auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
4241   B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
4242 
4243   MI.eraseFromParent();
4244   return true;
4245 }
4246 
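     // Lower a sub-32-bit G_CTLZ_ZERO_UNDEF by shifting the source into the
     // high bits of a 32-bit ffbh, e.g. for an s16 source (a sketch):
     //   %ext = G_ANYEXT %src(s16)   ; s16 -> s32
     //   %shl = G_SHL %ext, 16       ; leading zeros now match the s32 count
     //   %dst = G_TRUNC (G_AMDGPU_FFBH_U32 %shl)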
4247 bool AMDGPULegalizerInfo::legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI,
4248                                                   MachineRegisterInfo &MRI,
4249                                                   MachineIRBuilder &B) const {
4250   Register Dst = MI.getOperand(0).getReg();
4251   Register Src = MI.getOperand(1).getReg();
4252   LLT SrcTy = MRI.getType(Src);
4253   TypeSize NumBits = SrcTy.getSizeInBits();
4254 
4255   assert(NumBits < 32u);
4256 
4257   auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
4258   auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
4259   auto Shift = B.buildShl(S32, Extend, ShiftAmt);
4260   auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
4261   B.buildTrunc(Dst, Ctlz);
4262   MI.eraseFromParent();
4263   return true;
4264 }
4265 
4266 // Check that this is a G_XOR x, -1
4267 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
4268   if (MI.getOpcode() != TargetOpcode::G_XOR)
4269     return false;
4270   auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
4271   return ConstVal == -1;
4272 }
4273 
4274 // Return the use branch instruction, otherwise null if the usage is invalid.
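     // The expected pattern is, within a single block (a sketch):
     //   %cond:_(s1) = G_INTRINSIC ...      ; the control-flow intrinsic
     //   [%ncond:_(s1) = G_XOR %cond, -1]   ; optional negation
     //   G_BRCOND %cond, %bb.target
     //   G_BR %bb.fallthrough               ; or the block simply falls through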
4275 static MachineInstr *
4276 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
4277                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
4278   Register CondDef = MI.getOperand(0).getReg();
4279   if (!MRI.hasOneNonDBGUse(CondDef))
4280     return nullptr;
4281 
4282   MachineBasicBlock *Parent = MI.getParent();
4283   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
4284 
4285   if (isNot(MRI, *UseMI)) {
4286     Register NegatedCond = UseMI->getOperand(0).getReg();
4287     if (!MRI.hasOneNonDBGUse(NegatedCond))
4288       return nullptr;
4289 
4290     // We're deleting the def of this value, so we need to remove it.
4291     eraseInstr(*UseMI, MRI);
4292 
4293     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
4294     Negated = true;
4295   }
4296 
4297   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
4298     return nullptr;
4299 
4300   // Make sure the cond br is followed by a G_BR, or is the last instruction.
4301   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
4302   if (Next == Parent->end()) {
4303     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
4304     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
4305       return nullptr;
4306     UncondBrTarget = &*NextMBB;
4307   } else {
4308     if (Next->getOpcode() != AMDGPU::G_BR)
4309       return nullptr;
4310     Br = &*Next;
4311     UncondBrTarget = Br->getOperand(0).getMBB();
4312   }
4313 
4314   return UseMI;
4315 }
4316 
4317 void AMDGPULegalizerInfo::buildLoadInputValue(Register DstReg,
4318                                               MachineIRBuilder &B,
4319                                               const ArgDescriptor *Arg,
4320                                               const TargetRegisterClass *ArgRC,
4321                                               LLT ArgTy) const {
4322   MCRegister SrcReg = Arg->getRegister();
4323   assert(SrcReg.isPhysical() && "Physical register expected");
4324   assert(DstReg.isVirtual() && "Virtual register expected");
4325 
4326   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
4327                                              *ArgRC, B.getDebugLoc(), ArgTy);
4328   if (Arg->isMasked()) {
4329     // TODO: Should we try to emit this once in the entry block?
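         // For example, a workitem ID packed at bits [19:10] would have
         // (hypothetically) Mask = 0x3FF << 10, so Shift = 10 and we emit:
         //   %s = G_LSHR %livein, 10
         //   %id = G_AND %s, 0x3FF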
4330     const LLT S32 = LLT::scalar(32);
4331     const unsigned Mask = Arg->getMask();
4332     const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
4333 
4334     Register AndMaskSrc = LiveIn;
4335 
4336     // TODO: Avoid clearing the high bits if we know workitem id y/z are always
4337     // 0.
4338     if (Shift != 0) {
4339       auto ShiftAmt = B.buildConstant(S32, Shift);
4340       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
4341     }
4342 
4343     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
4344   } else {
4345     B.buildCopy(DstReg, LiveIn);
4346   }
4347 }
4348 
4349 bool AMDGPULegalizerInfo::loadInputValue(
4350     Register DstReg, MachineIRBuilder &B,
4351     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4352   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4353   const ArgDescriptor *Arg = nullptr;
4354   const TargetRegisterClass *ArgRC;
4355   LLT ArgTy;
4356 
4357   CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
4358   const ArgDescriptor WorkGroupIDX =
4359       ArgDescriptor::createRegister(AMDGPU::TTMP9);
4360   // If GridZ is not programmed in an entry function then the hardware will set
4361   // it to all zeros, so there is no need to mask the GridY value in the low
4362   // order bits.
4363   const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
4364       AMDGPU::TTMP7,
4365       AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
4366   const ArgDescriptor WorkGroupIDZ =
4367       ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
4368   if (ST.hasArchitectedSGPRs() &&
4369       (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) {
4370     switch (ArgType) {
4371     case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
4372       Arg = &WorkGroupIDX;
4373       ArgRC = &AMDGPU::SReg_32RegClass;
4374       ArgTy = LLT::scalar(32);
4375       break;
4376     case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
4377       Arg = &WorkGroupIDY;
4378       ArgRC = &AMDGPU::SReg_32RegClass;
4379       ArgTy = LLT::scalar(32);
4380       break;
4381     case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
4382       Arg = &WorkGroupIDZ;
4383       ArgRC = &AMDGPU::SReg_32RegClass;
4384       ArgTy = LLT::scalar(32);
4385       break;
4386     default:
4387       break;
4388     }
4389   }
4390 
4391   if (!Arg)
4392     std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4393 
4394   if (!Arg) {
4395     if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
4396       // The intrinsic may appear when we have a 0-sized kernarg segment, in
4397       // which case the pointer argument may be missing and we use null.
4398       B.buildConstant(DstReg, 0);
4399       return true;
4400     }
4401 
4402     // It's undefined behavior if a function marked with the amdgpu-no-*
4403     // attributes uses the corresponding intrinsic.
4404     B.buildUndef(DstReg);
4405     return true;
4406   }
4407 
4408   if (!Arg->isRegister() || !Arg->getRegister().isValid())
4409     return false; // TODO: Handle these
4410   buildLoadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
4411   return true;
4412 }
4413 
4414 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
4415     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4416     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4417   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
4418     return false;
4419 
4420   MI.eraseFromParent();
4421   return true;
4422 }
4423 
4424 static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
4425                                 int64_t C) {
4426   B.buildConstant(MI.getOperand(0).getReg(), C);
4427   MI.eraseFromParent();
4428   return true;
4429 }
4430 
4431 bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
4432     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
4433     unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4434   unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
4435   if (MaxID == 0)
4436     return replaceWithConstant(B, MI, 0);
4437 
4438   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4439   const ArgDescriptor *Arg;
4440   const TargetRegisterClass *ArgRC;
4441   LLT ArgTy;
4442   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4443 
4444   Register DstReg = MI.getOperand(0).getReg();
4445   if (!Arg) {
4446     // It's undefined behavior if a function marked with the amdgpu-no-*
4447     // attributes uses the corresponding intrinsic.
4448     B.buildUndef(DstReg);
4449     MI.eraseFromParent();
4450     return true;
4451   }
4452 
4453   if (Arg->isMasked()) {
4454     // Don't bother inserting AssertZext for packed IDs since we're emitting the
4455     // masking operations anyway.
4456     //
4457     // TODO: We could assert the top bit is 0 for the source copy.
4458     if (!loadInputValue(DstReg, B, ArgType))
4459       return false;
4460   } else {
4461     Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
4462     if (!loadInputValue(TmpReg, B, ArgType))
4463       return false;
4464     B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID));
4465   }
4466 
4467   MI.eraseFromParent();
4468   return true;
4469 }
4470 
4471 Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
4472                                                      int64_t Offset) const {
4473   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
4474   Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
4475 
4476   // TODO: If we passed in the base kernel offset we could have a better
4477   // alignment than 4, but we don't really need it.
4478   if (!loadInputValue(KernArgReg, B,
4479                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
4480     llvm_unreachable("failed to find kernarg segment ptr");
4481 
4482   auto COffset = B.buildConstant(LLT::scalar(64), Offset);
4483   // TODO: Should get nuw
4484   return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
4485 }
4486 
4487 /// Legalize a value that's loaded from kernel arguments. This is only used by
4488 /// legacy intrinsics.
4489 bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
4490                                                       MachineIRBuilder &B,
4491                                                       uint64_t Offset,
4492                                                       Align Alignment) const {
4493   Register DstReg = MI.getOperand(0).getReg();
4494 
4495   assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
4496          "unexpected kernarg parameter type");
4497 
4498   Register Ptr = getKernargParameterPtr(B, Offset);
4499   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
4500   B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
4501               MachineMemOperand::MODereferenceable |
4502                   MachineMemOperand::MOInvariant);
4503   MI.eraseFromParent();
4504   return true;
4505 }
4506 
4507 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
4508                                        MachineRegisterInfo &MRI,
4509                                        MachineIRBuilder &B) const {
4510   Register Dst = MI.getOperand(0).getReg();
4511   LLT DstTy = MRI.getType(Dst);
4512   LLT S16 = LLT::scalar(16);
4513   LLT S32 = LLT::scalar(32);
4514   LLT S64 = LLT::scalar(64);
4515 
4516   if (DstTy == S16)
4517     return legalizeFDIV16(MI, MRI, B);
4518   if (DstTy == S32)
4519     return legalizeFDIV32(MI, MRI, B);
4520   if (DstTy == S64)
4521     return legalizeFDIV64(MI, MRI, B);
4522 
4523   return false;
4524 }
4525 
4526 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
4527                                                         Register DstDivReg,
4528                                                         Register DstRemReg,
4529                                                         Register X,
4530                                                         Register Y) const {
4531   const LLT S1 = LLT::scalar(1);
4532   const LLT S32 = LLT::scalar(32);
4533 
4534   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
4535   // algorithm used here.
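       //
       // In outline (a sketch of the sequence built below):
       //   z  = fptoui(rcp(uitofp(y)) * 0x1.fffffcp+31f)  ; ~(2^32 - 1) / y
       //   z += umulh(z, z * -y)              ; one Newton-Raphson step
       //   q  = umulh(x, z); r = x - q * y    ; initial estimates
       //   if (r >= y) { q += 1; r -= y; }    ; done twice, via selects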
4536 
4537   // Initial estimate of inv(y).
4538   auto FloatY = B.buildUITOFP(S32, Y);
4539   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
4540   auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
4541   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
4542   auto Z = B.buildFPTOUI(S32, ScaledY);
4543 
4544   // One round of UNR.
4545   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
4546   auto NegYZ = B.buildMul(S32, NegY, Z);
4547   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
4548 
4549   // Quotient/remainder estimate.
4550   auto Q = B.buildUMulH(S32, X, Z);
4551   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
4552 
4553   // First quotient/remainder refinement.
4554   auto One = B.buildConstant(S32, 1);
4555   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4556   if (DstDivReg)
4557     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
4558   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
4559 
4560   // Second quotient/remainder refinement.
4561   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4562   if (DstDivReg)
4563     B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
4564 
4565   if (DstRemReg)
4566     B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
4567 }
4568 
4569 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
4570 //
4571 // Return lo, hi of result
4572 //
4573 // %cvt.lo = G_UITOFP Val.lo
4574 // %cvt.hi = G_UITOFP Val.hi
4575 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
4576 // %rcp = G_AMDGPU_RCP_IFLAG %mad
4577 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
4578 // %mul2 = G_FMUL %mul1, 2**(-32)
4579 // %trunc = G_INTRINSIC_TRUNC %mul2
4580 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
4581 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
4582 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
4583                                                        Register Val) {
4584   const LLT S32 = LLT::scalar(32);
4585   auto Unmerge = B.buildUnmerge(S32, Val);
4586 
4587   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
4588   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
4589 
4590   auto Mad = B.buildFMAD(
4591       S32, CvtHi, // 2**32
4592       B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);
4593 
4594   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
4595   auto Mul1 = B.buildFMul(
4596       S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc)));
4597 
4598   // 2**(-32)
4599   auto Mul2 = B.buildFMul(
4600       S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000)));
4601   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
4602 
4603   // -(2**32)
4604   auto Mad2 = B.buildFMAD(
4605       S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)),
4606       Mul1);
4607 
4608   auto ResultLo = B.buildFPTOUI(S32, Mad2);
4609   auto ResultHi = B.buildFPTOUI(S32, Trunc);
4610 
4611   return {ResultLo.getReg(0), ResultHi.getReg(0)};
4612 }
4613 
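     // 64-bit unsigned division via a 2.32 fixed-point reciprocal (a sketch of
     // the flow below): refine the emitReciprocalU64 estimate with two
     // multiply-high Newton-Raphson steps, form q = umulh(numer, rcp) and
     // r = numer - q * denom, then apply up to two conditional corrections
     // (q + 1, r - denom), chosen with selects instead of control flow.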
4614 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
4615                                                         Register DstDivReg,
4616                                                         Register DstRemReg,
4617                                                         Register Numer,
4618                                                         Register Denom) const {
4619   const LLT S32 = LLT::scalar(32);
4620   const LLT S64 = LLT::scalar(64);
4621   const LLT S1 = LLT::scalar(1);
4622   Register RcpLo, RcpHi;
4623 
4624   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
4625 
4626   auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});
4627 
4628   auto Zero64 = B.buildConstant(S64, 0);
4629   auto NegDenom = B.buildSub(S64, Zero64, Denom);
4630 
4631   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
4632   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
4633 
4634   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
4635   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
4636   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
4637 
4638   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
4639   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
4640   auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});
4641 
4642   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
4643   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
4644   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
4645   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
4646   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
4647 
4648   auto Zero32 = B.buildConstant(S32, 0);
4649   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
4650   auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
4651   auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});
4652 
4653   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
4654   Register NumerLo = UnmergeNumer.getReg(0);
4655   Register NumerHi = UnmergeNumer.getReg(1);
4656 
4657   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
4658   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
4659   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
4660   Register Mul3_Lo = UnmergeMul3.getReg(0);
4661   Register Mul3_Hi = UnmergeMul3.getReg(1);
4662   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
4663   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
4664   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
4665   auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});
4666 
4667   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
4668   Register DenomLo = UnmergeDenom.getReg(0);
4669   Register DenomHi = UnmergeDenom.getReg(1);
4670 
4671   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
4672   auto C1 = B.buildSExt(S32, CmpHi);
4673 
4674   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
4675   auto C2 = B.buildSExt(S32, CmpLo);
4676 
4677   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
4678   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
4679 
4680   // TODO: Here and below, portions of the code could be enclosed in if/endif
4681   // blocks. Currently control flow is unconditional and we have 4 selects
4682   // after the potential endif to substitute for PHIs.
4683 
4684   // if C3 != 0 ...
4685   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
4686   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
4687   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
4688   auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});
4689 
4690   auto One64 = B.buildConstant(S64, 1);
4691   auto Add3 = B.buildAdd(S64, MulHi3, One64);
4692 
4693   auto C4 =
4694       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
4695   auto C5 =
4696       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
4697   auto C6 = B.buildSelect(
4698       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
4699 
4700   // if (C6 != 0)
4701   auto Add4 = B.buildAdd(S64, Add3, One64);
4702   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
4703 
4704   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
4705   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
4706   auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});
4707 
4708   // endif C6
4709   // endif C3
4710 
4711   if (DstDivReg) {
4712     auto Sel1 = B.buildSelect(
4713         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
4714     B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
4715                   Sel1, MulHi3);
4716   }
4717 
4718   if (DstRemReg) {
4719     auto Sel2 = B.buildSelect(
4720         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
4721     B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
4722                   Sel2, Sub1);
4723   }
4724 }
4725 
4726 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
4727                                                   MachineRegisterInfo &MRI,
4728                                                   MachineIRBuilder &B) const {
4729   Register DstDivReg, DstRemReg;
4730   switch (MI.getOpcode()) {
4731   default:
4732     llvm_unreachable("Unexpected opcode!");
4733   case AMDGPU::G_UDIV: {
4734     DstDivReg = MI.getOperand(0).getReg();
4735     break;
4736   }
4737   case AMDGPU::G_UREM: {
4738     DstRemReg = MI.getOperand(0).getReg();
4739     break;
4740   }
4741   case AMDGPU::G_UDIVREM: {
4742     DstDivReg = MI.getOperand(0).getReg();
4743     DstRemReg = MI.getOperand(1).getReg();
4744     break;
4745   }
4746   }
4747 
4748   const LLT S64 = LLT::scalar(64);
4749   const LLT S32 = LLT::scalar(32);
4750   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4751   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
4752   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
4753   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
4754 
4755   if (Ty == S32)
4756     legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
4757   else if (Ty == S64)
4758     legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
4759   else
4760     return false;
4761 
4762   MI.eraseFromParent();
4763   return true;
4764 }
4765 
4766 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
4767                                                 MachineRegisterInfo &MRI,
4768                                                 MachineIRBuilder &B) const {
4769   const LLT S64 = LLT::scalar(64);
4770   const LLT S32 = LLT::scalar(32);
4771 
4772   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
4773   if (Ty != S32 && Ty != S64)
4774     return false;
4775 
4776   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4777   Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
4778   Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
4779 
4780   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
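       // Compute |LHS| and |RHS| branchlessly: each Sign is 0 or -1 (all ones),
       // and (v + Sign) ^ Sign negates v exactly when Sign == -1. For example,
       // with v = -5: (-5 + -1) ^ -1 == -6 ^ -1 == 5.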
4781   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
4782   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
4783 
4784   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
4785   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
4786 
4787   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
4788   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
4789 
4790   Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
4791   switch (MI.getOpcode()) {
4792   default:
4793     llvm_unreachable("Unexpected opcode!");
4794   case AMDGPU::G_SDIV: {
4795     DstDivReg = MI.getOperand(0).getReg();
4796     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4797     break;
4798   }
4799   case AMDGPU::G_SREM: {
4800     DstRemReg = MI.getOperand(0).getReg();
4801     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4802     break;
4803   }
4804   case AMDGPU::G_SDIVREM: {
4805     DstDivReg = MI.getOperand(0).getReg();
4806     DstRemReg = MI.getOperand(1).getReg();
4807     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4808     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4809     break;
4810   }
4811   }
4812 
4813   if (Ty == S32)
4814     legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
4815   else
4816     legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
4817 
4818   if (DstDivReg) {
4819     auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
4820     auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
4821     B.buildSub(DstDivReg, SignXor, Sign);
4822   }
4823 
4824   if (DstRemReg) {
4825     auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
4826     auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
4827     B.buildSub(DstRemReg, SignXor, Sign);
4828   }
4829 
4830   MI.eraseFromParent();
4831   return true;
4832 }
4833 
4834 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
4835                                                  MachineRegisterInfo &MRI,
4836                                                  MachineIRBuilder &B) const {
4837   Register Res = MI.getOperand(0).getReg();
4838   Register LHS = MI.getOperand(1).getReg();
4839   Register RHS = MI.getOperand(2).getReg();
4840   uint16_t Flags = MI.getFlags();
4841   LLT ResTy = MRI.getType(Res);
4842 
4843   const MachineFunction &MF = B.getMF();
4844   bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn) ||
4845                             MF.getTarget().Options.UnsafeFPMath;
4846 
4847   if (const auto *CLHS = getConstantFPVRegVal(LHS, MRI)) {
4848     if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
4849       return false;
4850 
4851     // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
4852     // the CI documentation have a worst-case error of 1 ulp.
4853     // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
4854     // use them as long as we aren't trying to use denormals.
4855     //
4856     // v_rcp_f16 and v_rsq_f16 DO support denormals, with a 0.51 ulp error.
4857 
4858     // 1 / x -> RCP(x)
4859     if (CLHS->isExactlyValue(1.0)) {
4860       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
4861           .addUse(RHS)
4862           .setMIFlags(Flags);
4863 
4864       MI.eraseFromParent();
4865       return true;
4866     }
4867 
4868     // -1 / x -> RCP( FNEG(x) )
4869     if (CLHS->isExactlyValue(-1.0)) {
4870       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
4871       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
4872           .addUse(FNeg.getReg(0))
4873           .setMIFlags(Flags);
4874 
4875       MI.eraseFromParent();
4876       return true;
4877     }
4878   }
4879 
4880   // For f16 require afn or arcp.
4881   // For f32 require afn.
4882   if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
4883                               !MI.getFlag(MachineInstr::FmArcp)))
4884     return false;
4885 
4886   // x / y -> x * (1.0 / y)
4887   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
4888                  .addUse(RHS)
4889                  .setMIFlags(Flags);
4890   B.buildFMul(Res, LHS, RCP, Flags);
4891 
4892   MI.eraseFromParent();
4893   return true;
4894 }
4895 
4896 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
4897                                                    MachineRegisterInfo &MRI,
4898                                                    MachineIRBuilder &B) const {
4899   Register Res = MI.getOperand(0).getReg();
4900   Register X = MI.getOperand(1).getReg();
4901   Register Y = MI.getOperand(2).getReg();
4902   uint16_t Flags = MI.getFlags();
4903   LLT ResTy = MRI.getType(Res);
4904 
4905   const MachineFunction &MF = B.getMF();
4906   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
4907                             MI.getFlag(MachineInstr::FmAfn);
4908 
4909   if (!AllowInaccurateRcp)
4910     return false;
4911 
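       // Refine r ~= 1/y with two Newton-Raphson steps, each computed as
       // r = fma(fma(-y, r, 1.0), r, r), then form ret = x * r and apply a
       // final residual correction: fma(fma(-y, ret, x), r, ret).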
4912   auto NegY = B.buildFNeg(ResTy, Y);
4913   auto One = B.buildFConstant(ResTy, 1.0);
4914 
4915   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
4916                .addUse(Y)
4917                .setMIFlags(Flags);
4918 
4919   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
4920   R = B.buildFMA(ResTy, Tmp0, R, R);
4921 
4922   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
4923   R = B.buildFMA(ResTy, Tmp1, R, R);
4924 
4925   auto Ret = B.buildFMul(ResTy, X, R);
4926   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
4927 
4928   B.buildFMA(Res, Tmp2, R, Ret);
4929   MI.eraseFromParent();
4930   return true;
4931 }
4932 
4933 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
4934                                          MachineRegisterInfo &MRI,
4935                                          MachineIRBuilder &B) const {
4936   if (legalizeFastUnsafeFDIV(MI, MRI, B))
4937     return true;
4938 
4939   Register Res = MI.getOperand(0).getReg();
4940   Register LHS = MI.getOperand(1).getReg();
4941   Register RHS = MI.getOperand(2).getReg();
4942 
4943   uint16_t Flags = MI.getFlags();
4944 
4945   LLT S16 = LLT::scalar(16);
4946   LLT S32 = LLT::scalar(32);
4947 
4948   // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
4949   // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
4950   // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
4951   // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
4952   // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
4953   // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = q + err * rcp
4954   // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
4955   // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
4956   // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
4957   // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
4958   // q16.u = opx(V_CVT_F16_F32, q32.u);
4959   // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
4960 
4961   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
4962   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
4963   auto NegRHSExt = B.buildFNeg(S32, RHSExt);
4964   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
4965                  .addUse(RHSExt.getReg(0))
4966                  .setMIFlags(Flags);
4967   auto Quot = B.buildFMul(S32, LHSExt, Rcp, Flags);
4968   MachineInstrBuilder Err;
4969   if (ST.hasMadMacF32Insts()) {
4970     Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
4971     Quot = B.buildFMAD(S32, Err, Rcp, Quot, Flags);
4972     Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
4973   } else {
4974     Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
4975     Quot = B.buildFMA(S32, Err, Rcp, Quot, Flags);
4976     Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
4977   }
4978   auto Tmp = B.buildFMul(S32, Err, Rcp, Flags);
4979   Tmp = B.buildAnd(S32, Tmp, B.buildConstant(S32, 0xff800000));
4980   Quot = B.buildFAdd(S32, Tmp, Quot, Flags);
4981   auto RDst = B.buildFPTrunc(S16, Quot, Flags);
4982   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
4983       .addUse(RDst.getReg(0))
4984       .addUse(RHS)
4985       .addUse(LHS)
4986       .setMIFlags(Flags);
4987 
4988   MI.eraseFromParent();
4989   return true;
4990 }
4991 
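     // The FP32 denorm controls occupy a 2-bit field at offset 4 of the MODE
     // hardware register; encode that field for the S_SETREG/S_GETREG uses
     // below.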
4992 static constexpr unsigned SPDenormModeBitField =
4993     AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 4, 2);
4994 
4995 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
4996 // to enable denorm mode; otherwise, restore the function's default mode.
4997 static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B,
4998                                const GCNSubtarget &ST,
4999                                SIModeRegisterDefaults Mode) {
5000   // Set SP denorm mode to this value.
5001   unsigned SPDenormMode =
5002     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
5003 
5004   if (ST.hasDenormModeInst()) {
5005     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
5006     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
5007 
5008     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
5009     B.buildInstr(AMDGPU::S_DENORM_MODE)
5010       .addImm(NewDenormModeValue);
5011 
5012   } else {
5013     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
5014       .addImm(SPDenormMode)
5015       .addImm(SPDenormModeBitField);
5016   }
5017 }
5018 
5019 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
5020                                          MachineRegisterInfo &MRI,
5021                                          MachineIRBuilder &B) const {
5022   if (legalizeFastUnsafeFDIV(MI, MRI, B))
5023     return true;
5024 
5025   Register Res = MI.getOperand(0).getReg();
5026   Register LHS = MI.getOperand(1).getReg();
5027   Register RHS = MI.getOperand(2).getReg();
5028   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5029   SIModeRegisterDefaults Mode = MFI->getMode();
5030 
5031   uint16_t Flags = MI.getFlags();
5032 
5033   LLT S32 = LLT::scalar(32);
5034   LLT S1 = LLT::scalar(1);
5035 
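       // Full-precision expansion: scale the operands with div_scale, refine
       // rcp(den) and the quotient with a chain of FMAs (FP32 denorms are
       // temporarily enabled so the intermediates aren't flushed), then
       // div_fmas and div_fixup undo the scaling and patch up special cases.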
5036   auto One = B.buildFConstant(S32, 1.0f);
5037 
5038   auto DenominatorScaled =
5039       B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
5040           .addUse(LHS)
5041           .addUse(RHS)
5042           .addImm(0)
5043           .setMIFlags(Flags);
5044   auto NumeratorScaled =
5045       B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
5046           .addUse(LHS)
5047           .addUse(RHS)
5048           .addImm(1)
5049           .setMIFlags(Flags);
5050 
5051   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5052                        .addUse(DenominatorScaled.getReg(0))
5053                        .setMIFlags(Flags);
5054   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
5055 
5056   const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE();
5057   const bool HasDynamicDenormals =
5058       (Mode.FP32Denormals.Input == DenormalMode::Dynamic) ||
5059       (Mode.FP32Denormals.Output == DenormalMode::Dynamic);
5060 
5061   Register SavedSPDenormMode;
5062   if (!PreservesDenormals) {
5063     if (HasDynamicDenormals) {
5064       SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5065       B.buildInstr(AMDGPU::S_GETREG_B32)
5066           .addDef(SavedSPDenormMode)
5067           .addImm(SPDenormModeBitField);
5068     }
5069     toggleSPDenormMode(true, B, ST, Mode);
5070   }
5071 
5072   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
5073   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
5074   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
5075   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
5076   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
5077   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
5078 
5079   if (!PreservesDenormals) {
5080     if (HasDynamicDenormals) {
5081       assert(SavedSPDenormMode);
5082       B.buildInstr(AMDGPU::S_SETREG_B32)
5083           .addReg(SavedSPDenormMode)
5084           .addImm(SPDenormModeBitField);
5085     } else
5086       toggleSPDenormMode(false, B, ST, Mode);
5087   }
5088 
5089   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
5090                   .addUse(Fma4.getReg(0))
5091                   .addUse(Fma1.getReg(0))
5092                   .addUse(Fma3.getReg(0))
5093                   .addUse(NumeratorScaled.getReg(1))
5094                   .setMIFlags(Flags);
5095 
5096   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5097       .addUse(Fmas.getReg(0))
5098       .addUse(RHS)
5099       .addUse(LHS)
5100       .setMIFlags(Flags);
5101 
5102   MI.eraseFromParent();
5103   return true;
5104 }
5105 
5106 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
5107                                          MachineRegisterInfo &MRI,
5108                                          MachineIRBuilder &B) const {
5109   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
5110     return true;
5111 
5112   Register Res = MI.getOperand(0).getReg();
5113   Register LHS = MI.getOperand(1).getReg();
5114   Register RHS = MI.getOperand(2).getReg();
5115 
5116   uint16_t Flags = MI.getFlags();
5117 
5118   LLT S64 = LLT::scalar(64);
5119   LLT S1 = LLT::scalar(1);
5120 
5121   auto One = B.buildFConstant(S64, 1.0);
5122 
5123   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5124                        .addUse(LHS)
5125                        .addUse(RHS)
5126                        .addImm(0)
5127                        .setMIFlags(Flags);
5128 
5129   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
5130 
5131   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
5132                  .addUse(DivScale0.getReg(0))
5133                  .setMIFlags(Flags);
5134 
5135   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
5136   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
5137   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
5138 
5139   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5140                        .addUse(LHS)
5141                        .addUse(RHS)
5142                        .addImm(1)
5143                        .setMIFlags(Flags);
5144 
5145   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
5146   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
5147   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
5148 
5149   Register Scale;
5150   if (!ST.hasUsableDivScaleConditionOutput()) {
5151     // Workaround a hardware bug on SI where the condition output from div_scale
5152     // is not usable.
5153 
5154     LLT S32 = LLT::scalar(32);
5155 
5156     auto NumUnmerge = B.buildUnmerge(S32, LHS);
5157     auto DenUnmerge = B.buildUnmerge(S32, RHS);
5158     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
5159     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
5160 
5161     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
5162                               Scale1Unmerge.getReg(1));
5163     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
5164                               Scale0Unmerge.getReg(1));
5165     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
5166   } else {
5167     Scale = DivScale1.getReg(1);
5168   }
5169 
5170   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
5171                   .addUse(Fma4.getReg(0))
5172                   .addUse(Fma3.getReg(0))
5173                   .addUse(Mul.getReg(0))
5174                   .addUse(Scale)
5175                   .setMIFlags(Flags);
5176 
5177   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
5178       .addUse(Fmas.getReg(0))
5179       .addUse(RHS)
5180       .addUse(LHS)
5181       .setMIFlags(Flags);
5182 
5183   MI.eraseFromParent();
5184   return true;
5185 }
5186 
5187 bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI,
5188                                          MachineRegisterInfo &MRI,
5189                                          MachineIRBuilder &B) const {
5190   Register Res0 = MI.getOperand(0).getReg();
5191   Register Res1 = MI.getOperand(1).getReg();
5192   Register Val = MI.getOperand(2).getReg();
5193   uint16_t Flags = MI.getFlags();
5194 
5195   LLT Ty = MRI.getType(Res0);
5196   LLT InstrExpTy = Ty == LLT::scalar(16) ? LLT::scalar(16) : LLT::scalar(32);
5197 
5198   auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
5199                   .addUse(Val)
5200                   .setMIFlags(Flags);
5201   auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
5202                  .addUse(Val)
5203                  .setMIFlags(Flags);
5204 
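       // On subtargets with the fract bug, the frexp instructions don't handle
       // infinities and nans; the selects below instead return
       // (mant, exp) = (val, 0) for any non-finite input.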
5205   if (ST.hasFractBug()) {
5206     auto Fabs = B.buildFAbs(Ty, Val);
5207     auto Inf = B.buildFConstant(Ty, APFloat::getInf(getFltSemanticForLLT(Ty)));
5208     auto IsFinite =
5209         B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
5210     auto Zero = B.buildConstant(InstrExpTy, 0);
5211     Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
5212     Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
5213   }
5214 
5215   B.buildCopy(Res0, Mant);
5216   B.buildSExtOrTrunc(Res1, Exp);
5217 
5218   MI.eraseFromParent();
5219   return true;
5220 }
5221 
5222 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
5223                                                  MachineRegisterInfo &MRI,
5224                                                  MachineIRBuilder &B) const {
5225   Register Res = MI.getOperand(0).getReg();
5226   Register LHS = MI.getOperand(2).getReg();
5227   Register RHS = MI.getOperand(3).getReg();
5228   uint16_t Flags = MI.getFlags();
5229 
5230   LLT S32 = LLT::scalar(32);
5231   LLT S1 = LLT::scalar(1);
5232 
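       // Guard the reciprocal against overflow: when |rhs| > 2^96, pre-scale
       // the denominator by 2^-32 and fold the same factor back into the
       // result, since (lhs * rcp(rhs * s)) * s == lhs / rhs for a finite
       // nonzero s.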
5233   auto Abs = B.buildFAbs(S32, RHS, Flags);
5235 
5236   auto C0 = B.buildFConstant(S32, 0x1p+96f);
5237   auto C1 = B.buildFConstant(S32, 0x1p-32f);
5238   auto C2 = B.buildFConstant(S32, 1.0f);
5239 
5240   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
5241   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
5242 
5243   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
5244 
5245   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5246                  .addUse(Mul0.getReg(0))
5247                  .setMIFlags(Flags);
5248 
5249   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
5250 
5251   B.buildFMul(Res, Sel, Mul1, Flags);
5252 
5253   MI.eraseFromParent();
5254   return true;
5255 }
5256 
5257 bool AMDGPULegalizerInfo::legalizeFSQRTF16(MachineInstr &MI,
5258                                            MachineRegisterInfo &MRI,
5259                                            MachineIRBuilder &B) const {
5260   // Bypass the correct expansion that a standard promotion through G_FSQRT
5261   // would get. The f32 op is accurate enough for the f16 case.
5262   unsigned Flags = MI.getFlags();
5263   assert(!ST.has16BitInsts());
5264   const LLT F32 = LLT::scalar(32);
5265   auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
5266   auto Sqrt = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
5267     .addUse(Ext.getReg(0))
5268     .setMIFlags(Flags);
5269   B.buildFPTrunc(MI.getOperand(0), Sqrt, Flags);
5270   MI.eraseFromParent();
5271   return true;
5272 }
5273 
5274 bool AMDGPULegalizerInfo::legalizeFSQRTF32(MachineInstr &MI,
5275                                            MachineRegisterInfo &MRI,
5276                                            MachineIRBuilder &B) const {
5277   MachineFunction &MF = B.getMF();
5278   Register Dst = MI.getOperand(0).getReg();
5279   Register X = MI.getOperand(1).getReg();
5280   const unsigned Flags = MI.getFlags();
5281   const LLT S1 = LLT::scalar(1);
5282   const LLT F32 = LLT::scalar(32);
5283   const LLT I32 = LLT::scalar(32);
5284 
5285   if (allowApproxFunc(MF, Flags)) {
5286     B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({Dst}))
5287       .addUse(X)
5288       .setMIFlags(Flags);
5289     MI.eraseFromParent();
5290     return true;
5291   }
5292 
5293   auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);
5294   auto NeedScale = B.buildFCmp(CmpInst::FCMP_OGT, S1, ScaleThreshold, X, Flags);
5295   auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
5296   auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
5297   auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);
5298 
5299   Register SqrtS = MRI.createGenericVirtualRegister(F32);
5300   if (needsDenormHandlingF32(MF, X, Flags)) {
5301     B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({SqrtS}))
5302       .addUse(SqrtX.getReg(0))
5303       .setMIFlags(Flags);
5304 
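         // Step the f32 estimate one ulp in each direction: adding +/-1 to the
         // bit pattern of a positive, finite float gives the next value
         // down/up.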
5305     auto NegOne = B.buildConstant(I32, -1);
5306     auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);
5307 
5308     auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
5309     auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
5310 
5311     auto PosOne = B.buildConstant(I32, 1);
5312     auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);
5313 
5314     auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
5315     auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
5316 
5317     auto Zero = B.buildFConstant(F32, 0.0f);
5318     auto SqrtVPLE0 = B.buildFCmp(CmpInst::FCMP_OLE, S1, SqrtVP, Zero, Flags);
5319 
5320     SqrtS =
5321         B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);
5322 
5323     auto SqrtVPVSGT0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, SqrtVS, Zero, Flags);
5324     SqrtS =
5325         B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);
5326   } else {
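         // Denormal inputs are flushed here anyway: start from v_rsq_f32 and
         // run one Goldschmidt iteration (s = x * r, h = 0.5 * r; refine s and
         // h, then apply the residual d = fma(-s, s, x) as s = fma(d, h, s)).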
5327     auto SqrtR =
5328         B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
5329     B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);
5330 
5331     auto Half = B.buildFConstant(F32, 0.5f);
5332     auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
5333     auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
5334     auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
5335     SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
5336     SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
5337     auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
5338     auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
5339     SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);
5340   }
5341 
5342   auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);
5343 
5344   auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);
5345 
5346   SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);
5347 
5348   auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
5349   B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);
5350 
5351   MI.eraseFromParent();
5352   return true;
5353 }
5354 
5355 bool AMDGPULegalizerInfo::legalizeFSQRTF64(MachineInstr &MI,
5356                                            MachineRegisterInfo &MRI,
5357                                            MachineIRBuilder &B) const {
5358   // For the double type, the SQRT and RSQ instructions don't have the required
5359   // precision, so we apply Goldschmidt's algorithm to improve the result:
5360   //
5361   //   y0 = rsq(x)
5362   //   g0 = x * y0
5363   //   h0 = 0.5 * y0
5364   //
5365   //   r0 = 0.5 - h0 * g0
5366   //   g1 = g0 * r0 + g0
5367   //   h1 = h0 * r0 + h0
5368   //
5369   //   r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
5370   //   g2 = g1 * r1 + g1     g2 = d0 * h1 + g1
5371   //   h2 = h1 * r1 + h1
5372   //
5373   //   r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
5374   //   g3 = g2 * r2 + g2     g3 = d1 * h1 + g2
5375   //
5376   //   sqrt(x) = g3
5377 
5378   const LLT S1 = LLT::scalar(1);
5379   const LLT S32 = LLT::scalar(32);
5380   const LLT F64 = LLT::scalar(64);
5381 
5382   Register Dst = MI.getOperand(0).getReg();
5383   assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");
5384 
5385   Register X = MI.getOperand(1).getReg();
5386   unsigned Flags = MI.getFlags();
5387 
5388   auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);
5389 
5390   auto ZeroInt = B.buildConstant(S32, 0);
5391   auto Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant);
5392 
5393   // Scale up input if it is too small.
5394   auto ScaleUpFactor = B.buildConstant(S32, 256);
5395   auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
5396   auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags);
5397 
5398   auto SqrtY =
5399       B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX.getReg(0));
5400 
5401   auto Half = B.buildFConstant(F64, 0.5);
5402   auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
5403   auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);
5404 
5405   auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
5406   auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);
5407 
5408   auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
5409   auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);
5410 
5411   auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
5412   auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);
5413 
5414   auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);
5415 
5416   auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
5417   auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);
5418 
5419   auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);
5420 
5421   // Scale down the result: sqrt(x * 2^256) == sqrt(x) * 2^128, so ldexp by -128.
5422   auto ScaleDownFactor = B.buildConstant(S32, -128);
5423   auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
5424   SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags);
5425 
5426   // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
5427   // with finite only or nsz because rsq(+/-0) = +/-inf
5428 
5429   // TODO: Check for DAZ and expand to subnormals
5430   auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
5431 
5432   // If x is +INF, +0, or -0, use its original value
5433   B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);
5434 
5435   MI.eraseFromParent();
5436   return true;
5437 }
5438 
5439 bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI,
5440                                         MachineRegisterInfo &MRI,
5441                                         MachineIRBuilder &B) const {
5442   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
5443   if (Ty == LLT::scalar(32))
5444     return legalizeFSQRTF32(MI, MRI, B);
5445   if (Ty == LLT::scalar(64))
5446     return legalizeFSQRTF64(MI, MRI, B);
5447   if (Ty == LLT::scalar(16))
5448     return legalizeFSQRTF16(MI, MRI, B);
5449   return false;
5450 }
5451 
5452 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
5453 // FIXME: Why do we handle this one but not other removed instructions?
5454 //
5455 // Reciprocal square root.  The clamp prevents infinite results, clamping
5456 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
5457 // +-max_float.
5458 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
5459                                                     MachineRegisterInfo &MRI,
5460                                                     MachineIRBuilder &B) const {
5461   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
5462     return true;
5463 
5464   Register Dst = MI.getOperand(0).getReg();
5465   Register Src = MI.getOperand(2).getReg();
5466   auto Flags = MI.getFlags();
5467 
5468   LLT Ty = MRI.getType(Dst);
5469 
5470   const fltSemantics *FltSemantics;
5471   if (Ty == LLT::scalar(32))
5472     FltSemantics = &APFloat::IEEEsingle();
5473   else if (Ty == LLT::scalar(64))
5474     FltSemantics = &APFloat::IEEEdouble();
5475   else
5476     return false;
5477 
5478   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
5479                  .addUse(Src)
5480                  .setMIFlags(Flags);
5481 
5482   // We don't need to concern ourselves with the snan handling difference, since
5483   // the rsq already quieted (or not); use the form which will directly select.
5484   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5485   const bool UseIEEE = MFI->getMode().IEEE;
5486 
5487   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
5488   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
5489                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
5490 
5491   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
5492 
5493   if (UseIEEE)
5494     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
5495   else
5496     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
5497   MI.eraseFromParent();
5498   return true;
5499 }
5500 
5501 // TODO: Fix pointer type handling
5502 bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
5503                                          MachineInstr &MI,
5504                                          Intrinsic::ID IID) const {
5505 
5506   MachineIRBuilder &B = Helper.MIRBuilder;
5507   MachineRegisterInfo &MRI = *B.getMRI();
5508 
5509   bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
5510                       IID == Intrinsic::amdgcn_permlanex16;
5511   bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
5512                        IID == Intrinsic::amdgcn_set_inactive_chain_arg;
5513 
5514   auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
5515                                       Register Src2, LLT VT) -> Register {
5516     auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
5517     switch (IID) {
5518     case Intrinsic::amdgcn_readfirstlane:
5519     case Intrinsic::amdgcn_permlane64:
5520       return LaneOp.getReg(0);
5521     case Intrinsic::amdgcn_readlane:
5522     case Intrinsic::amdgcn_set_inactive:
5523     case Intrinsic::amdgcn_set_inactive_chain_arg:
5524       return LaneOp.addUse(Src1).getReg(0);
5525     case Intrinsic::amdgcn_writelane:
5526       return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
5527     case Intrinsic::amdgcn_permlane16:
5528     case Intrinsic::amdgcn_permlanex16: {
5529       Register Src3 = MI.getOperand(5).getReg();
5530       int64_t Src4 = MI.getOperand(6).getImm();
5531       int64_t Src5 = MI.getOperand(7).getImm();
5532       return LaneOp.addUse(Src1)
5533           .addUse(Src2)
5534           .addUse(Src3)
5535           .addImm(Src4)
5536           .addImm(Src5)
5537           .getReg(0);
5538     }
5539     case Intrinsic::amdgcn_mov_dpp8:
5540       return LaneOp.addImm(MI.getOperand(3).getImm()).getReg(0);
5541     case Intrinsic::amdgcn_update_dpp:
5542       return LaneOp.addUse(Src1)
5543           .addImm(MI.getOperand(4).getImm())
5544           .addImm(MI.getOperand(5).getImm())
5545           .addImm(MI.getOperand(6).getImm())
5546           .addImm(MI.getOperand(7).getImm())
5547           .getReg(0);
5548     default:
5549       llvm_unreachable("unhandled lane op");
5550     }
5551   };
5552 
5553   Register DstReg = MI.getOperand(0).getReg();
5554   Register Src0 = MI.getOperand(2).getReg();
5555   Register Src1, Src2;
5556   if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
5557       IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
5558     Src1 = MI.getOperand(3).getReg();
5559     if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
5560       Src2 = MI.getOperand(4).getReg();
5561     }
5562   }
5563 
5564   LLT Ty = MRI.getType(DstReg);
5565   unsigned Size = Ty.getSizeInBits();
5566 
5567   unsigned SplitSize = 32;
5568   if (IID == Intrinsic::amdgcn_update_dpp && (Size % 64 == 0) &&
5569       ST.hasDPALU_DPP() &&
5570       AMDGPU::isLegalDPALU_DPPControl(MI.getOperand(4).getImm()))
5571     SplitSize = 64;
5572 
5573   if (Size == SplitSize) {
5574     // Already legal
5575     return true;
5576   }
5577 
5578   if (Size < 32) {
5579     Src0 = B.buildAnyExt(S32, Src0).getReg(0);
5580 
5581     if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
5582       Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0);
5583 
5584     if (IID == Intrinsic::amdgcn_writelane)
5585       Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0);
5586 
5587     Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
5588     B.buildTrunc(DstReg, LaneOpDst);
5589     MI.eraseFromParent();
5590     return true;
5591   }
5592 
5593   if (Size % SplitSize != 0)
5594     return false;
5595 
5596   LLT PartialResTy = LLT::scalar(SplitSize);
5597   bool NeedsBitcast = false;
5598   if (Ty.isVector()) {
5599     LLT EltTy = Ty.getElementType();
5600     unsigned EltSize = EltTy.getSizeInBits();
5601     if (EltSize == SplitSize) {
5602       PartialResTy = EltTy;
5603     } else if (EltSize == 16 || EltSize == 32) {
5604       unsigned NElem = SplitSize / EltSize;
5605       PartialResTy = Ty.changeElementCount(ElementCount::getFixed(NElem));
5606     } else {
5607       // Handle all other cases via S32/S64 pieces
5608       NeedsBitcast = true;
5609     }
5610   }
5611 
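  // For a wide type with SplitSize == 32, e.g. an s64 readlane, the value is
  // unmerged into s32 pieces, one lane op is emitted per piece, and the
  // results are re-merged, roughly:
  //   %lo:_(s32), %hi:_(s32) = G_UNMERGE_VALUES %src:_(s64)
  //   %rlo:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane), %lo, %idx
  //   %rhi:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.readlane), %hi, %idx
  //   %dst:_(s64) = G_MERGE_VALUES %rlo, %rhi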
5612   SmallVector<Register, 4> PartialRes;
5613   unsigned NumParts = Size / SplitSize;
5614   MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
5615   MachineInstrBuilder Src1Parts, Src2Parts;
5616 
5617   if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
5618     Src1Parts = B.buildUnmerge(PartialResTy, Src1);
5619 
5620   if (IID == Intrinsic::amdgcn_writelane)
5621     Src2Parts = B.buildUnmerge(PartialResTy, Src2);
5622 
5623   for (unsigned i = 0; i < NumParts; ++i) {
5624     Src0 = Src0Parts.getReg(i);
5625 
5626     if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
5627       Src1 = Src1Parts.getReg(i);
5628 
5629     if (IID == Intrinsic::amdgcn_writelane)
5630       Src2 = Src2Parts.getReg(i);
5631 
5632     PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
5633   }
5634 
5635   if (NeedsBitcast)
5636     B.buildBitcast(DstReg, B.buildMergeLikeInstr(
5637                                LLT::scalar(Ty.getSizeInBits()), PartialRes));
5638   else
5639     B.buildMergeLikeInstr(DstReg, PartialRes);
5640 
5641   MI.eraseFromParent();
5642   return true;
5643 }
5644 
5645 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
5646                                             MachineRegisterInfo &MRI,
5647                                             MachineIRBuilder &B) const {
5648   uint64_t Offset =
5649     ST.getTargetLowering()->getImplicitParameterOffset(
5650       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
5651   LLT DstTy = MRI.getType(DstReg);
5652   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
5653 
5654   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
5655   if (!loadInputValue(KernargPtrReg, B,
5656                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
5657     return false;
5658 
5659   // FIXME: This should be nuw
5660   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
5661   return true;
5662 }
5663 
5664 /// To create a buffer resource from a 64-bit pointer, mask off the top 16
5665 /// bits of the pointer's high dword and replace them with the stride
5666 /// argument, then merge_values everything together. In the common case of a
5667 /// raw buffer (the stride component is 0), we can just AND off the upper half.
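///
/// Illustrative layout of the resulting v4i32 descriptor (for a 48-bit
/// virtual address):
///   word0 = ptr[31:0]
///   word1 = (ptr[47:32] & 0xffff) | (stride << 16)
///   word2 = numRecords
///   word3 = flags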
5668 bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin(
5669     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5670   Register Result = MI.getOperand(0).getReg();
5671   Register Pointer = MI.getOperand(2).getReg();
5672   Register Stride = MI.getOperand(3).getReg();
5673   Register NumRecords = MI.getOperand(4).getReg();
5674   Register Flags = MI.getOperand(5).getReg();
5675 
5676   LLT S32 = LLT::scalar(32);
5677 
5678   B.setInsertPt(B.getMBB(), ++B.getInsertPt());
5679   auto Unmerge = B.buildUnmerge(S32, Pointer);
5680   Register LowHalf = Unmerge.getReg(0);
5681   Register HighHalf = Unmerge.getReg(1);
5682 
5683   auto AndMask = B.buildConstant(S32, 0x0000ffff);
5684   auto Masked = B.buildAnd(S32, HighHalf, AndMask);
5685 
5686   MachineInstrBuilder NewHighHalf = Masked;
5687   std::optional<ValueAndVReg> StrideConst =
5688       getIConstantVRegValWithLookThrough(Stride, MRI);
5689   if (!StrideConst || !StrideConst->Value.isZero()) {
5690     MachineInstrBuilder ShiftedStride;
5691     if (StrideConst) {
5692       uint32_t StrideVal = StrideConst->Value.getZExtValue();
5693       uint32_t ShiftedStrideVal = StrideVal << 16;
5694       ShiftedStride = B.buildConstant(S32, ShiftedStrideVal);
5695     } else {
5696       auto ExtStride = B.buildAnyExt(S32, Stride);
5697       auto ShiftConst = B.buildConstant(S32, 16);
5698       ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
5699     }
5700     NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
5701   }
5702   Register NewHighHalfReg = NewHighHalf.getReg(0);
5703   B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
5704   MI.eraseFromParent();
5705   return true;
5706 }
5707 
5708 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
5709                                                  MachineRegisterInfo &MRI,
5710                                                  MachineIRBuilder &B) const {
5711   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5712   if (!MFI->isEntryFunction()) {
5713     return legalizePreloadedArgIntrin(MI, MRI, B,
5714                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
5715   }
5716 
5717   Register DstReg = MI.getOperand(0).getReg();
5718   if (!getImplicitArgPtr(DstReg, MRI, B))
5719     return false;
5720 
5721   MI.eraseFromParent();
5722   return true;
5723 }
5724 
5725 bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg,
5726                                          MachineRegisterInfo &MRI,
5727                                          MachineIRBuilder &B) const {
5728   Function &F = B.getMF().getFunction();
5729   std::optional<uint32_t> KnownSize =
5730       AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
5731   if (KnownSize.has_value())
5732     B.buildConstant(DstReg, *KnownSize);
5733   return KnownSize.has_value();
5734 }
5735 
5736 bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI,
5737                                               MachineRegisterInfo &MRI,
5738                                               MachineIRBuilder &B) const {
5740   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5741   if (!MFI->isEntryFunction()) {
5742     return legalizePreloadedArgIntrin(MI, MRI, B,
5743                                       AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
5744   }
5745 
5746   Register DstReg = MI.getOperand(0).getReg();
5747   if (!getLDSKernelId(DstReg, MRI, B))
5748     return false;
5749 
5750   MI.eraseFromParent();
5751   return true;
5752 }
5753 
5754 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
5755                                               MachineRegisterInfo &MRI,
5756                                               MachineIRBuilder &B,
5757                                               unsigned AddrSpace) const {
5758   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
5759   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
5760   Register Hi32 = Unmerge.getReg(1);
5761 
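  // e.g. is.shared(%ptr) lowers to a compare of the high dword of the flat
  // address against the aperture base, roughly:
  //   %lo:_(s32), %hi:_(s32) = G_UNMERGE_VALUES %ptr
  //   %res:_(s1) = G_ICMP intpred(eq), %hi, %aperture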
5762   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
5763   MI.eraseFromParent();
5764   return true;
5765 }
5766 
5767 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
5768 // offset (the offset that is included in bounds checking and swizzling, to be
5769 // split between the instruction's voffset and immoffset fields) and soffset
5770 // (the offset that is excluded from bounds checking and swizzling, to go in
5771 // the instruction's soffset field).  This function takes the first kind of
5772 // offset and figures out how to split it between voffset and immoffset.
5773 std::pair<Register, unsigned>
5774 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
5775                                         Register OrigOffset) const {
5776   const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
5777   Register BaseReg;
5778   unsigned ImmOffset;
5779   const LLT S32 = LLT::scalar(32);
5780   MachineRegisterInfo &MRI = *B.getMRI();
5781 
5782   std::tie(BaseReg, ImmOffset) =
5783       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
5784 
5785   // If BaseReg is a pointer, convert it to int.
5786   if (MRI.getType(BaseReg).isPointer())
5787     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
5788 
5789   // If the immediate value is too big for the immoffset field, put only the
5790   // bits that would normally fit in the immoffset field. The remaining value
5791   // that is copied/added for the voffset field is a large power of 2, and it
5792   // stands a better chance of being CSEd with the copy/add for another
5793   // similar load/store.
5794   // However, do not do that rounding down if the result is a negative
5795   // number, as it appears to be illegal to have a negative offset in the
5796   // vgpr, even if adding the immediate offset makes it positive.
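  // Worked example, assuming MaxImm == 4095 (the typical MUBUF limit): an
  // incoming offset of 4100 becomes ImmOffset = 4 with 4096 added into the
  // voffset register, while an incoming offset of -8 becomes ImmOffset = 0
  // with the full -8 left in the voffset register.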
5797   unsigned Overflow = ImmOffset & ~MaxImm;
5798   ImmOffset -= Overflow;
5799   if ((int32_t)Overflow < 0) {
5800     Overflow += ImmOffset;
5801     ImmOffset = 0;
5802   }
5803 
5804   if (Overflow != 0) {
5805     if (!BaseReg) {
5806       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
5807     } else {
5808       auto OverflowVal = B.buildConstant(S32, Overflow);
5809       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
5810     }
5811   }
5812 
5813   if (!BaseReg)
5814     BaseReg = B.buildConstant(S32, 0).getReg(0);
5815 
5816   return std::pair(BaseReg, ImmOffset);
5817 }
5818 
5819 /// Handle register layout difference for f16 images for some subtargets.
5820 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
5821                                              MachineRegisterInfo &MRI,
5822                                              Register Reg,
5823                                              bool ImageStore) const {
5824   const LLT S16 = LLT::scalar(16);
5825   const LLT S32 = LLT::scalar(32);
5826   LLT StoreVT = MRI.getType(Reg);
5827   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
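  // For example, on subtargets with unpacked D16 VMem, a <2 x s16> value is
  // widened to <2 x s32> with one G_ANYEXT per element; packed subtargets
  // keep the <2 x s16> layout (padding <3 x s16> to <4 x s16>).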
5828 
5829   if (ST.hasUnpackedD16VMem()) {
5830     auto Unmerge = B.buildUnmerge(S16, Reg);
5831 
5832     SmallVector<Register, 4> WideRegs;
5833     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5834       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
5835 
5836     int NumElts = StoreVT.getNumElements();
5837 
5838     return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
5839         .getReg(0);
5840   }
5841 
5842   if (ImageStore && ST.hasImageStoreD16Bug()) {
5843     if (StoreVT.getNumElements() == 2) {
5844       SmallVector<Register, 4> PackedRegs;
5845       Reg = B.buildBitcast(S32, Reg).getReg(0);
5846       PackedRegs.push_back(Reg);
5847       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
5848       return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
5849           .getReg(0);
5850     }
5851 
5852     if (StoreVT.getNumElements() == 3) {
5853       SmallVector<Register, 4> PackedRegs;
5854       auto Unmerge = B.buildUnmerge(S16, Reg);
5855       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5856         PackedRegs.push_back(Unmerge.getReg(I));
5857       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
5858       Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
5859       return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
5860     }
5861 
5862     if (StoreVT.getNumElements() == 4) {
5863       SmallVector<Register, 4> PackedRegs;
5864       Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
5865       auto Unmerge = B.buildUnmerge(S32, Reg);
5866       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5867         PackedRegs.push_back(Unmerge.getReg(I));
5868       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
5869       return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
5870           .getReg(0);
5871     }
5872 
5873     llvm_unreachable("invalid data type");
5874   }
5875 
5876   if (StoreVT == LLT::fixed_vector(3, S16)) {
5877     Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
5878               .getReg(0);
5879   }
5880   return Reg;
5881 }
5882 
5883 Register AMDGPULegalizerInfo::fixStoreSourceType(MachineIRBuilder &B,
5884                                                  Register VData, LLT MemTy,
5885                                                  bool IsFormat) const {
5886   MachineRegisterInfo *MRI = B.getMRI();
5887   LLT Ty = MRI->getType(VData);
5888 
5889   const LLT S16 = LLT::scalar(16);
5890 
5891   // Fix up stores of buffer resources themselves (p8 values), which need to be cast to v4i32.
5892   if (hasBufferRsrcWorkaround(Ty))
5893     return castBufferRsrcToV4I32(VData, B);
5894 
5895   if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
5896     Ty = getBitcastRegisterType(Ty);
5897     VData = B.buildBitcast(Ty, VData).getReg(0);
5898   }
5899   // Fixup illegal register types for i8 stores.
5900   if (Ty == LLT::scalar(8) || Ty == S16) {
5901     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
5902     return AnyExt;
5903   }
5904 
5905   if (Ty.isVector()) {
5906     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
5907       if (IsFormat)
5908         return handleD16VData(B, *MRI, VData);
5909     }
5910   }
5911 
5912   return VData;
5913 }
5914 
5915 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
5916                                               LegalizerHelper &Helper,
5917                                               bool IsTyped,
5918                                               bool IsFormat) const {
5919   MachineIRBuilder &B = Helper.MIRBuilder;
5920   MachineRegisterInfo &MRI = *B.getMRI();
5921 
5922   Register VData = MI.getOperand(1).getReg();
5923   LLT Ty = MRI.getType(VData);
5924   LLT EltTy = Ty.getScalarType();
5925   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
5926   const LLT S32 = LLT::scalar(32);
5927 
5928   MachineMemOperand *MMO = *MI.memoperands_begin();
5929   const int MemSize = MMO->getSize().getValue();
5930   LLT MemTy = MMO->getMemoryType();
5931 
5932   VData = fixStoreSourceType(B, VData, MemTy, IsFormat);
5933 
5934   castBufferRsrcArgToV4I32(MI, B, 2);
5935   Register RSrc = MI.getOperand(2).getReg();
5936 
5937   unsigned ImmOffset;
5938 
5939   // The typed intrinsics add an immediate after the registers.
5940   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
5941 
5942   // The struct intrinsic variants add one additional operand over raw.
5943   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
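  // e.g. an untyped struct.buffer.store intrinsic has the operand layout
  //   (intrinsic-id, vdata, rsrc, vindex, voffset, soffset, aux), 7 in total,
  // while the raw variant omits vindex and has 6 operands.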
5944   Register VIndex;
5945   int OpOffset = 0;
5946   if (HasVIndex) {
5947     VIndex = MI.getOperand(3).getReg();
5948     OpOffset = 1;
5949   } else {
5950     VIndex = B.buildConstant(S32, 0).getReg(0);
5951   }
5952 
5953   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
5954   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
5955 
5956   unsigned Format = 0;
5957   if (IsTyped) {
5958     Format = MI.getOperand(5 + OpOffset).getImm();
5959     ++OpOffset;
5960   }
5961 
5962   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
5963 
5964   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
5965 
5966   unsigned Opc;
5967   if (IsTyped) {
5968     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
5969                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
5970   } else if (IsFormat) {
5971     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
5972                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
5973   } else {
5974     switch (MemSize) {
5975     case 1:
5976       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
5977       break;
5978     case 2:
5979       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
5980       break;
5981     default:
5982       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
5983       break;
5984     }
5985   }
5986 
5987   auto MIB = B.buildInstr(Opc)
5988     .addUse(VData)              // vdata
5989     .addUse(RSrc)               // rsrc
5990     .addUse(VIndex)             // vindex
5991     .addUse(VOffset)            // voffset
5992     .addUse(SOffset)            // soffset
5993     .addImm(ImmOffset);         // offset(imm)
5994 
5995   if (IsTyped)
5996     MIB.addImm(Format);
5997 
5998   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
5999      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6000      .addMemOperand(MMO);
6001 
6002   MI.eraseFromParent();
6003   return true;
6004 }
6005 
6006 static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
6007                             Register VIndex, Register VOffset, Register SOffset,
6008                             unsigned ImmOffset, unsigned Format,
6009                             unsigned AuxiliaryData, MachineMemOperand *MMO,
6010                             bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
6011   auto MIB = B.buildInstr(Opc)
6012                  .addDef(LoadDstReg) // vdata
6013                  .addUse(RSrc)       // rsrc
6014                  .addUse(VIndex)     // vindex
6015                  .addUse(VOffset)    // voffset
6016                  .addUse(SOffset)    // soffset
6017                  .addImm(ImmOffset); // offset(imm)
6018 
6019   if (IsTyped)
6020     MIB.addImm(Format);
6021 
6022   MIB.addImm(AuxiliaryData)       // cachepolicy, swizzled buffer(imm)
6023       .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6024       .addMemOperand(MMO);
6025 }
6026 
6027 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
6028                                              LegalizerHelper &Helper,
6029                                              bool IsFormat,
6030                                              bool IsTyped) const {
6031   MachineIRBuilder &B = Helper.MIRBuilder;
6032   MachineRegisterInfo &MRI = *B.getMRI();
6033   GISelChangeObserver &Observer = Helper.Observer;
6034 
6035   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
6036   MachineMemOperand *MMO = *MI.memoperands_begin();
6037   const LLT MemTy = MMO->getMemoryType();
6038   const LLT S32 = LLT::scalar(32);
6039 
6040   Register Dst = MI.getOperand(0).getReg();
6041 
6042   Register StatusDst;
6043   int OpOffset = 0;
6044   assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
6045   bool IsTFE = MI.getNumExplicitDefs() == 2;
6046   if (IsTFE) {
6047     StatusDst = MI.getOperand(1).getReg();
6048     ++OpOffset;
6049   }
6050 
6051   castBufferRsrcArgToV4I32(MI, B, 2 + OpOffset);
6052   Register RSrc = MI.getOperand(2 + OpOffset).getReg();
6053 
6054   // The typed intrinsics add an immediate after the registers.
6055   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6056 
6057   // The struct intrinsic variants add one additional operand over raw.
6058   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
6059   Register VIndex;
6060   if (HasVIndex) {
6061     VIndex = MI.getOperand(3 + OpOffset).getReg();
6062     ++OpOffset;
6063   } else {
6064     VIndex = B.buildConstant(S32, 0).getReg(0);
6065   }
6066 
6067   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
6068   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
6069 
6070   unsigned Format = 0;
6071   if (IsTyped) {
6072     Format = MI.getOperand(5 + OpOffset).getImm();
6073     ++OpOffset;
6074   }
6075 
6076   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
6077   unsigned ImmOffset;
6078 
6079   LLT Ty = MRI.getType(Dst);
6080   // Make addrspace 8 pointer loads into 4xs32 loads here, so the rest of the
6081   // logic doesn't have to handle that case.
6082   if (hasBufferRsrcWorkaround(Ty)) {
6083     Observer.changingInstr(MI);
6084     Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0);
6085     Observer.changedInstr(MI);
6086     Dst = MI.getOperand(0).getReg();
6087     B.setInsertPt(B.getMBB(), MI);
6088   }
6089   if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
6090     Ty = getBitcastRegisterType(Ty);
6091     Observer.changingInstr(MI);
6092     Helper.bitcastDst(MI, Ty, 0);
6093     Observer.changedInstr(MI);
6094     Dst = MI.getOperand(0).getReg();
6095     B.setInsertPt(B.getMBB(), MI);
6096   }
6097 
6098   LLT EltTy = Ty.getScalarType();
6099   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
6100   const bool Unpacked = ST.hasUnpackedD16VMem();
6101 
6102   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6103 
6104   unsigned Opc;
6105 
6106   // TODO: Support TFE for typed and narrow loads.
6107   if (IsTyped) {
6108     if (IsTFE)
6109       return false;
6110     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
6111                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
6112   } else if (IsFormat) {
6113     if (IsD16) {
6114       if (IsTFE)
6115         return false;
6116       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
6117     } else {
6118       Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
6119                   : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
6120     }
6121   } else {
6122     switch (MemTy.getSizeInBits()) {
6123     case 8:
6124       Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
6125                   : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
6126       break;
6127     case 16:
6128       Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
6129                   : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
6130       break;
6131     default:
6132       Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
6133                   : AMDGPU::G_AMDGPU_BUFFER_LOAD;
6134       break;
6135     }
6136   }
6137 
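  // With TFE the hardware writes an extra status dword after the data, so the
  // load is built with a widened result and split afterwards; e.g. for an s32
  // load, roughly:
  //   %wide:_(<2 x s32>) = G_AMDGPU_BUFFER_LOAD_TFE ...
  //   %data:_(s32), %status:_(s32) = G_UNMERGE_VALUES %wide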
6138   if (IsTFE) {
6139     unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
6140     unsigned NumLoadDWords = NumValueDWords + 1;
6141     LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32);
6142     Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
6143     buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6144                     Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6145     if (MemTy.getSizeInBits() < 32) {
6146       Register ExtDst = B.getMRI()->createGenericVirtualRegister(S32);
6147       B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
6148       B.buildTrunc(Dst, ExtDst);
6149     } else if (NumValueDWords == 1) {
6150       B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
6151     } else {
6152       SmallVector<Register, 5> LoadElts;
6153       for (unsigned I = 0; I != NumValueDWords; ++I)
6154         LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
6155       LoadElts.push_back(StatusDst);
6156       B.buildUnmerge(LoadElts, LoadDstReg);
6157       LoadElts.truncate(NumValueDWords);
6158       B.buildMergeLikeInstr(Dst, LoadElts);
6159     }
6160   } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
6161              (IsD16 && !Ty.isVector())) {
6162     Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
6163     buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6164                     Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6165     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6166     B.buildTrunc(Dst, LoadDstReg);
6167   } else if (Unpacked && IsD16 && Ty.isVector()) {
6168     LLT UnpackedTy = Ty.changeElementSize(32);
6169     Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
6170     buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6171                     Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6172     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6173     // FIXME: G_TRUNC should work, but legalization currently fails
6174     auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
6175     SmallVector<Register, 4> Repack;
6176     for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
6177       Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
6178     B.buildMergeLikeInstr(Dst, Repack);
6179   } else {
6180     buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
6181                     AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6182   }
6183 
6184   MI.eraseFromParent();
6185   return true;
6186 }
6187 
6188 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
6189   switch (IntrID) {
6190   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
6191   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
6192   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
6193   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
6194     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
6195   case Intrinsic::amdgcn_raw_buffer_atomic_add:
6196   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
6197   case Intrinsic::amdgcn_struct_buffer_atomic_add:
6198   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
6199     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
6200   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
6201   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
6202   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
6203   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
6204     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
6205   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
6206   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
6207   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
6208   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
6209     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
6210   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
6211   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
6212   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
6213   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
6214     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
6215   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
6216   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
6217   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
6218   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
6219     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
6220   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
6221   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
6222   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
6223   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
6224     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
6225   case Intrinsic::amdgcn_raw_buffer_atomic_and:
6226   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
6227   case Intrinsic::amdgcn_struct_buffer_atomic_and:
6228   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
6229     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
6230   case Intrinsic::amdgcn_raw_buffer_atomic_or:
6231   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
6232   case Intrinsic::amdgcn_struct_buffer_atomic_or:
6233   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
6234     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
6235   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
6236   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
6237   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
6238   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
6239     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
6240   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
6241   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
6242   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
6243   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
6244     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
6245   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
6246   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
6247   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
6248   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
6249     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
6250   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
6251   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
6252   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
6253   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
6254     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
6255   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
6256   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
6257   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
6258   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
6259     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
6260   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
6261   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
6262   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
6263   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
6264     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
6265   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
6266   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
6267   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
6268   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
6269     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
6270   case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
6271   case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
6272     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
6273   default:
6274     llvm_unreachable("unhandled atomic opcode");
6275   }
6276 }
6277 
6278 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
6279                                                MachineIRBuilder &B,
6280                                                Intrinsic::ID IID) const {
6281   const bool IsCmpSwap =
6282       IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
6283       IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
6284       IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
6285       IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
6286 
6287   Register Dst = MI.getOperand(0).getReg();
6288   // Since we don't have 128-bit atomics, we don't need to handle the case of
6289   // p8 arguments to the atomic itself.
6290   Register VData = MI.getOperand(2).getReg();
6291 
6292   Register CmpVal;
6293   int OpOffset = 0;
6294 
6295   if (IsCmpSwap) {
6296     CmpVal = MI.getOperand(3).getReg();
6297     ++OpOffset;
6298   }
6299 
6300   castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset);
6301   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
6302   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
6303 
6304   // The struct intrinsic variants add one additional operand over raw.
6305   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
6306   Register VIndex;
6307   if (HasVIndex) {
6308     VIndex = MI.getOperand(4 + OpOffset).getReg();
6309     ++OpOffset;
6310   } else {
6311     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
6312   }
6313 
6314   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
6315   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
6316   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
6317 
6318   MachineMemOperand *MMO = *MI.memoperands_begin();
6319 
6320   unsigned ImmOffset;
6321   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6322 
6323   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
6324       .addDef(Dst)
6325       .addUse(VData); // vdata
6326 
6327   if (IsCmpSwap)
6328     MIB.addReg(CmpVal);
6329 
6330   MIB.addUse(RSrc)               // rsrc
6331      .addUse(VIndex)             // vindex
6332      .addUse(VOffset)            // voffset
6333      .addUse(SOffset)            // soffset
6334      .addImm(ImmOffset)          // offset(imm)
6335      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
6336      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6337      .addMemOperand(MMO);
6338 
6339   MI.eraseFromParent();
6340   return true;
6341 }
6342 
6343 /// Pack the s16 typed address registers of \p MI into dword sized
6344 /// <2 x s16> vectors, appending the results to \p PackedAddrs.
6345 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
6346                                       SmallVectorImpl<Register> &PackedAddrs,
6347                                       unsigned ArgOffset,
6348                                       const AMDGPU::ImageDimIntrinsicInfo *Intr,
6349                                       bool IsA16, bool IsG16) {
6350   const LLT S16 = LLT::scalar(16);
6351   const LLT V2S16 = LLT::fixed_vector(2, 16);
6352   auto EndIdx = Intr->VAddrEnd;
6353 
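  // e.g. for a 2D A16 sample the s16 coordinates (u, v) are packed as
  //   %addr:_(<2 x s16>) = G_BUILD_VECTOR %u:_(s16), %v:_(s16)
  // and a trailing odd coordinate is instead paired with an undef element.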
6354   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
6355     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
6356     if (!SrcOp.isReg())
6357       continue; // _L to _LZ may have eliminated this.
6358 
6359     Register AddrReg = SrcOp.getReg();
6360 
6361     if ((I < Intr->GradientStart) ||
6362         (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
6363         (I >= Intr->CoordStart && !IsA16)) {
6364       if ((I < Intr->GradientStart) && IsA16 &&
6365           (B.getMRI()->getType(AddrReg) == S16)) {
6366         assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
6367         // Special handling of bias when A16 is on. Bias is of type half but
6368         // occupies a full 32-bit slot.
6369         PackedAddrs.push_back(
6370             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6371                 .getReg(0));
6372       } else {
6373         assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
6374                "Bias needs to be converted to 16 bit in A16 mode");
6375         // Handle any gradient or coordinate operands that should not be packed
6376         AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
6377         PackedAddrs.push_back(AddrReg);
6378       }
6379     } else {
6380       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
6381       // derivatives dx/dh and dx/dv are packed with undef.
6382       if (((I + 1) >= EndIdx) ||
6383           ((Intr->NumGradients / 2) % 2 == 1 &&
6384            (I == static_cast<unsigned>(Intr->GradientStart +
6385                                        (Intr->NumGradients / 2) - 1) ||
6386             I == static_cast<unsigned>(Intr->GradientStart +
6387                                        Intr->NumGradients - 1))) ||
6388           // Check for _L to _LZ optimization
6389           !MI.getOperand(ArgOffset + I + 1).isReg()) {
6390         PackedAddrs.push_back(
6391             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6392                 .getReg(0));
6393       } else {
6394         PackedAddrs.push_back(
6395             B.buildBuildVector(
6396                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
6397                 .getReg(0));
6398         ++I;
6399       }
6400     }
6401   }
6402 }
6403 
6404 /// Convert from separate vaddr components to a single vector address register,
6405 /// and replace the remaining operands with $noreg.
6406 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
6407                                      int DimIdx, int NumVAddrs) {
6408   const LLT S32 = LLT::scalar(32);
6409   (void)S32;
6410   SmallVector<Register, 8> AddrRegs;
6411   for (int I = 0; I != NumVAddrs; ++I) {
6412     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
6413     if (SrcOp.isReg()) {
6414       AddrRegs.push_back(SrcOp.getReg());
6415       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
6416     }
6417   }
6418 
6419   int NumAddrRegs = AddrRegs.size();
6420   if (NumAddrRegs != 1) {
6421     auto VAddr =
6422         B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
6423     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
6424   }
6425 
6426   for (int I = 1; I != NumVAddrs; ++I) {
6427     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
6428     if (SrcOp.isReg())
6429       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
6430   }
6431 }
6432 
6433 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
6434 ///
6435 /// Depending on the subtarget, load/store with 16-bit element data need to be
6436 /// rewritten to use the low half of 32-bit registers, or directly use a packed
6437 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
6438 /// registers.
6439 ///
6440 /// We don't want to directly select image instructions just yet, but also want
6441 /// to expose all register repacking to the legalizer/combiners. We also don't
6442 /// want a selected instruction entering RegBankSelect. In order to avoid
6443 /// defining a multitude of intermediate image instructions, directly hack on
6444 /// the intrinsic's arguments. In cases like a16 addresses, this requires
6445 /// padding now unnecessary arguments with $noreg.
6446 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
6447     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
6448     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
6450   const MachineFunction &MF = *MI.getMF();
6451   const unsigned NumDefs = MI.getNumExplicitDefs();
6452   const unsigned ArgOffset = NumDefs + 1;
6453   bool IsTFE = NumDefs == 2;
6454   // We are only processing the operands of d16 image operations on subtargets
6455   // that use the unpacked register layout, or need to repack the TFE result.
6456 
6457   // TODO: Do we need to guard against already legalized intrinsics?
6458   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
6459       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
6460 
6461   MachineRegisterInfo *MRI = B.getMRI();
6462   const LLT S32 = LLT::scalar(32);
6463   const LLT S16 = LLT::scalar(16);
6464   const LLT V2S16 = LLT::fixed_vector(2, 16);
6465 
6466   unsigned DMask = 0;
6467   Register VData;
6468   LLT Ty;
6469 
6470   if (!BaseOpcode->NoReturn || BaseOpcode->Store) {
6471     VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
6472     Ty = MRI->getType(VData);
6473   }
6474 
6475   const bool IsAtomicPacked16Bit =
6476       (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
6477        BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
6478 
6479   // Check for 16 bit addresses and pack if true.
6480   LLT GradTy =
6481       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
6482   LLT AddrTy =
6483       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
6484   const bool IsG16 =
6485       ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
6486   const bool IsA16 = AddrTy == S16;
6487   const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;
6488 
6489   int DMaskLanes = 0;
6490   if (!BaseOpcode->Atomic) {
6491     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
6492     if (BaseOpcode->Gather4) {
6493       DMaskLanes = 4;
6494     } else if (DMask != 0) {
6495       DMaskLanes = llvm::popcount(DMask);
6496     } else if (!IsTFE && !BaseOpcode->Store) {
6497       // If dmask is 0, this is a no-op load. This can be eliminated.
6498       B.buildUndef(MI.getOperand(0));
6499       MI.eraseFromParent();
6500       return true;
6501     }
6502   }
6503 
6504   Observer.changingInstr(MI);
6505   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
6506 
6507   const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
6508                                      : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
6509   const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
6510                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
6511   unsigned NewOpcode = LoadOpcode;
6512   if (BaseOpcode->Store)
6513     NewOpcode = StoreOpcode;
6514   else if (BaseOpcode->NoReturn)
6515     NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;
6516 
6517   // Track that we legalized this
6518   MI.setDesc(B.getTII().get(NewOpcode));
6519 
6520   // We expect to get an error flag since TFC is on and dmask is 0. Force
6521   // dmask to be at least 1, otherwise the instruction will fail.
6522   if (IsTFE && DMask == 0) {
6523     DMask = 0x1;
6524     DMaskLanes = 1;
6525     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
6526   }
6527 
6528   if (BaseOpcode->Atomic) {
6529     Register VData0 = MI.getOperand(2).getReg();
6530     LLT Ty = MRI->getType(VData0);
6531 
6532     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
6533     if (Ty.isVector() && !IsAtomicPacked16Bit)
6534       return false;
6535 
6536     if (BaseOpcode->AtomicX2) {
6537       Register VData1 = MI.getOperand(3).getReg();
6538       // The two values are packed in one register.
6539       LLT PackedTy = LLT::fixed_vector(2, Ty);
6540       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
6541       MI.getOperand(2).setReg(Concat.getReg(0));
6542       MI.getOperand(3).setReg(AMDGPU::NoRegister);
6543     }
6544   }
6545 
6546   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
6547 
6548   // Rewrite the addressing register layout before doing anything else.
6549   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
6550     // 16 bit gradients are supported, but are tied to the A16 control
6551     // so both gradients and addresses must be 16 bit
6552     return false;
6553   }
6554 
6555   if (IsA16 && !ST.hasA16()) {
6556     // A16 not supported
6557     return false;
6558   }
6559 
6560   const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->Sampler);
6561   const bool HasPartialNSA = ST.hasPartialNSAEncoding();
6562 
6563   if (IsA16 || IsG16) {
6564     // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the
6565     // instructions expect VGPR_32
6566     SmallVector<Register, 4> PackedRegs;
6567 
6568     packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, IsG16);
6569 
6570     // See also below in the non-a16 branch
6571     const bool UseNSA = ST.hasNSAEncoding() &&
6572                         PackedRegs.size() >= ST.getNSAThreshold(MF) &&
6573                         (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
6574     const bool UsePartialNSA =
6575         UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;
6576 
6577     if (UsePartialNSA) {
6578       // Pack registers that would go over NSAMaxSize into last VAddr register
6579       LLT PackedAddrTy =
6580           LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16);
6581       auto Concat = B.buildConcatVectors(
6582           PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
6583       PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
6584       PackedRegs.resize(NSAMaxSize);
6585     } else if (!UseNSA && PackedRegs.size() > 1) {
6586       LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
6587       auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
6588       PackedRegs[0] = Concat.getReg(0);
6589       PackedRegs.resize(1);
6590     }
6591 
6592     const unsigned NumPacked = PackedRegs.size();
6593     for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
6594       MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
6595       if (!SrcOp.isReg()) {
6596         assert(SrcOp.isImm() && SrcOp.getImm() == 0);
6597         continue;
6598       }
6599 
6600       assert(SrcOp.getReg() != AMDGPU::NoRegister);
6601 
6602       if (I - Intr->VAddrStart < NumPacked)
6603         SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
6604       else
6605         SrcOp.setReg(AMDGPU::NoRegister);
6606     }
6607   } else {
6608     // If the register allocator cannot place the address registers contiguously
6609     // without introducing moves, then using the non-sequential address encoding
6610     // is always preferable, since it saves VALU instructions and is usually a
6611     // wash or better in terms of code size.
6612     //
6613     // However, we currently have no way of hinting to the register allocator
6614     // that MIMG addresses should be placed contiguously when it is possible to
6615     // do so, so force non-NSA for the common 2-address case as a heuristic.
6616     //
6617     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
6618     // allocation when possible.
6619     //
6620     // Partial NSA is allowed on GFX11+ where the final register is a contiguous
6621     // set of the remaining addresses.
6622     const bool UseNSA = ST.hasNSAEncoding() &&
6623                         CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
6624                         (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
6625     const bool UsePartialNSA =
6626         UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
6627 
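    // e.g. with NSAMaxSize == 5 and 7 address registers, partial NSA keeps
    // the first 4 addresses separate and packs the remaining 3 into one
    // contiguous vector register, for 5 vaddr operands in total.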
6628     if (UsePartialNSA) {
6629       convertImageAddrToPacked(B, MI,
6630                                ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
6631                                Intr->NumVAddrs - NSAMaxSize + 1);
6632     } else if (!UseNSA && Intr->NumVAddrs > 1) {
6633       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
6634                                Intr->NumVAddrs);
6635     }
6636   }
6637 
6638   int Flags = 0;
6639   if (IsA16)
6640     Flags |= 1;
6641   if (IsG16)
6642     Flags |= 2;
6643   MI.addOperand(MachineOperand::CreateImm(Flags));
6644 
6645   if (BaseOpcode->NoReturn) { // No TFE for stores?
6646     // TODO: Handle dmask trim
6647     if (!Ty.isVector() || !IsD16)
6648       return true;
6649 
6650     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
6651     if (RepackedReg != VData) {
6652       MI.getOperand(1).setReg(RepackedReg);
6653     }
6654 
6655     return true;
6656   }
6657 
6658   Register DstReg = MI.getOperand(0).getReg();
6659   const LLT EltTy = Ty.getScalarType();
6660   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
6661 
6662   // Confirm that the return type is large enough for the dmask specified
6663   if (NumElts < DMaskLanes)
6664     return false;
6665 
6666   if (NumElts > 4 || DMaskLanes > 4)
6667     return false;
6668 
6669   // Image atomic instructions use DMask to specify how many bits the
6670   // input/output data will have: 32 bits (s32, v2s16) or 64 bits (s64, v4s16).
6671   // DMaskLanes for an image atomic defaults to '0'.
6672   // We must be sure that atomic variants (especially packed ones) are not
6673   // truncated from v2s16 or v4s16 to s16.
6674   //
6675   // ChangeElementCount will be needed for image load where Ty is always scalar.
6676   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
6677   const LLT AdjustedTy =
6678       DMaskLanes == 0
6679           ? Ty
6680           : Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
6681 
6682   // The raw dword aligned data component of the load. The only legal cases
6683   // where this matters should be when using the packed D16 format, for
6684   // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
6685   LLT RoundedTy;
6686 
6687   // S32 vector to cover all data, plus TFE result element.
6688   LLT TFETy;
6689 
6690   // Register type to use for each loaded component. Will be S32 or V2S16.
6691   LLT RegTy;
6692 
6693   if (IsD16 && ST.hasUnpackedD16VMem()) {
6694     RoundedTy =
6695         LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
6696     TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
6697     RegTy = S32;
6698   } else {
6699     unsigned EltSize = EltTy.getSizeInBits();
6700     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
6701     unsigned RoundedSize = 32 * RoundedElts;
6702     RoundedTy = LLT::scalarOrVector(
6703         ElementCount::getFixed(RoundedSize / EltSize), EltSize);
6704     TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
6705     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
6706   }
6707 
6708   // The return type does not need adjustment.
6709   // TODO: Should we change s16 case to s32 or <2 x s16>?
6710   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
6711     return true;
6712 
6713   Register Dst1Reg;
6714 
6715   // Insert after the instruction.
6716   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
6717 
6718   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
6719   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
6720   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
6721   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
6722 
6723   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
6724 
6725   MI.getOperand(0).setReg(NewResultReg);
6726 
6727   // In the IR, TFE is supposed to be used with a 2 element struct return
6728   // type. The instruction really returns these two values in one contiguous
6729   // register, with one additional dword beyond the loaded data. Rewrite the
6730   // return type to use a single register result.
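  // e.g. a TFE load of <4 x s32> is rewritten to define a single <5 x s32>
  // register: the first four dwords are the data and the fifth is the status
  // dword, which is split back out below.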
6731 
6732   if (IsTFE) {
6733     Dst1Reg = MI.getOperand(1).getReg();
6734     if (MRI->getType(Dst1Reg) != S32)
6735       return false;
6736 
6737     // TODO: Make sure the TFE operand bit is set.
6738     MI.removeOperand(1);
6739 
6740     // Handle the easy case that requires no repack instructions.
6741     if (Ty == S32) {
6742       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
6743       return true;
6744     }
6745   }
6746 
6747   // Now figure out how to copy the new result register back into the old
6748   // result.
6749   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
6750 
6751   const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
6752 
6753   if (ResultNumRegs == 1) {
6754     assert(!IsTFE);
6755     ResultRegs[0] = NewResultReg;
6756   } else {
6757     // We have to repack into a new vector of some kind.
6758     for (int I = 0; I != NumDataRegs; ++I)
6759       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
6760     B.buildUnmerge(ResultRegs, NewResultReg);
6761 
6762     // Drop the final TFE element to get the data part. The TFE result is
6763     // directly written to the right place already.
6764     if (IsTFE)
6765       ResultRegs.resize(NumDataRegs);
6766   }
6767 
6768   // For an s16 scalar result, we form an s32 result with a truncate regardless
6769   // of packed vs. unpacked.
6770   if (IsD16 && !Ty.isVector()) {
6771     B.buildTrunc(DstReg, ResultRegs[0]);
6772     return true;
6773   }
6774 
6775   // Avoid a build/concat_vector of 1 entry.
6776   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
6777     B.buildBitcast(DstReg, ResultRegs[0]);
6778     return true;
6779   }
6780 
6781   assert(Ty.isVector());
6782 
6783   if (IsD16) {
6784     // For packed D16 results with TFE enabled, all the data components are
6785     // S32. Cast back to the expected type.
6786     //
6787     // TODO: We don't really need to load s32 elements. We would only need one
6788     // cast for the TFE result if a multiple of v2s16 was used.
6789     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
6790       for (Register &Reg : ResultRegs)
6791         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
6792     } else if (ST.hasUnpackedD16VMem()) {
6793       for (Register &Reg : ResultRegs)
6794         Reg = B.buildTrunc(S16, Reg).getReg(0);
6795     }
6796   }
6797 
6798   auto padWithUndef = [&](LLT Ty, int NumElts) {
6799     if (NumElts == 0)
6800       return;
6801     Register Undef = B.buildUndef(Ty).getReg(0);
6802     for (int I = 0; I != NumElts; ++I)
6803       ResultRegs.push_back(Undef);
6804   };
6805 
6806   // Pad out any elements eliminated due to the dmask.
6807   LLT ResTy = MRI->getType(ResultRegs[0]);
6808   if (!ResTy.isVector()) {
6809     padWithUndef(ResTy, NumElts - ResultRegs.size());
6810     B.buildBuildVector(DstReg, ResultRegs);
6811     return true;
6812   }
6813 
6814   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
6815   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
6816 
6817   // Deal with the one annoying legal case.
6818   const LLT V3S16 = LLT::fixed_vector(3, 16);
6819   if (Ty == V3S16) {
6820     if (IsTFE) {
6821       if (ResultRegs.size() == 1) {
6822         NewResultReg = ResultRegs[0];
6823       } else if (ResultRegs.size() == 2) {
6824         LLT V4S16 = LLT::fixed_vector(4, 16);
6825         NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
6826       } else {
6827         return false;
6828       }
6829     }
6830 
6831     if (MRI->getType(DstReg).getNumElements() <
6832         MRI->getType(NewResultReg).getNumElements()) {
6833       B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
6834     } else {
6835       B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
6836     }
6837     return true;
6838   }
6839 
6840   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
6841   B.buildConcatVectors(DstReg, ResultRegs);
6842   return true;
6843 }
6844 
6845 bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
6846                                               MachineInstr &MI) const {
6847   MachineIRBuilder &B = Helper.MIRBuilder;
6848   GISelChangeObserver &Observer = Helper.Observer;
6849 
6850   Register OrigDst = MI.getOperand(0).getReg();
6851   Register Dst;
6852   LLT Ty = B.getMRI()->getType(OrigDst);
6853   unsigned Size = Ty.getSizeInBits();
6854   MachineFunction &MF = B.getMF();
6855   unsigned Opc = 0;
6856   if (Size < 32 && ST.hasScalarSubwordLoads()) {
6857     assert(Size == 8 || Size == 16);
6858     Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
6859                     : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
6860     // The 8-bit and 16-bit scalar buffer load instructions have 32-bit
6861     // destination register.
6862     Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
6863   } else {
6864     Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
6865     Dst = OrigDst;
6866   }
6867 
6868   Observer.changingInstr(MI);
6869 
6870   // Handle needing to s.buffer.load() a p8 value.
6871   if (hasBufferRsrcWorkaround(Ty)) {
6872     Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0);
6873     B.setInsertPt(B.getMBB(), MI);
6874   }
6875   if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
6876     Ty = getBitcastRegisterType(Ty);
6877     Helper.bitcastDst(MI, Ty, 0);
6878     B.setInsertPt(B.getMBB(), MI);
6879   }
6880 
6881   // FIXME: We don't really need this intermediate instruction. The intrinsic
6882   // should be fixed to have a memory operand. Since it's readnone, we're not
6883   // allowed to add one.
6884   MI.setDesc(B.getTII().get(Opc));
6885   MI.removeOperand(1); // Remove intrinsic ID
6886 
6887   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
6888   const unsigned MemSize = (Size + 7) / 8;
6889   const Align MemAlign = B.getDataLayout().getABITypeAlign(
6890       getTypeForLLT(Ty, MF.getFunction().getContext()));
6891   MachineMemOperand *MMO = MF.getMachineMemOperand(
6892       MachinePointerInfo(),
6893       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
6894           MachineMemOperand::MOInvariant,
6895       MemSize, MemAlign);
6896   MI.addMemOperand(MF, MMO);
6897   if (Dst != OrigDst) {
6898     MI.getOperand(0).setReg(Dst);
6899     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6900     B.buildTrunc(OrigDst, Dst);
6901   }
6902 
6903   // If we don't have 96-bit result scalar loads, widening to 128-bit should
6904   // always be legal. We may need to restore this to a 96-bit result if it turns
6905   // out this needs to be converted to a vector load during RegBankSelect.
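  // e.g. an s96 result would be widened to s128 (and a <3 x s32> result to
  // <4 x s32>) on subtargets without 96-bit scalar loads.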
6906   if (!isPowerOf2_32(Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) {
6907     if (Ty.isVector())
6908       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
6909     else
6910       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
6911   }
6912 
6913   Observer.changedInstr(MI);
6914   return true;
6915 }
6916 
6917 bool AMDGPULegalizerInfo::legalizeSBufferPrefetch(LegalizerHelper &Helper,
6918                                                   MachineInstr &MI) const {
6919   MachineIRBuilder &B = Helper.MIRBuilder;
6920   GISelChangeObserver &Observer = Helper.Observer;
6921   Observer.changingInstr(MI);
6922   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH));
6923   MI.removeOperand(0); // Remove intrinsic ID
6924   castBufferRsrcArgToV4I32(MI, B, 0);
6925   Observer.changedInstr(MI);
6926   return true;
6927 }
6928 
6929 // TODO: Move to selection
6930 bool AMDGPULegalizerInfo::legalizeTrap(MachineInstr &MI,
6931                                        MachineRegisterInfo &MRI,
6932                                        MachineIRBuilder &B) const {
6933   if (!ST.isTrapHandlerEnabled() ||
6934       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
6935     return legalizeTrapEndpgm(MI, MRI, B);
6936 
6937   return ST.supportsGetDoorbellID() ?
6938          legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B);
6939 }
6940 
6941 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
6942     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6943   const DebugLoc &DL = MI.getDebugLoc();
6944   MachineBasicBlock &BB = B.getMBB();
6945   MachineFunction *MF = BB.getParent();
6946 
6947   if (BB.succ_empty() && std::next(MI.getIterator()) == BB.end()) {
6948     BuildMI(BB, BB.end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
6949       .addImm(0);
6950     MI.eraseFromParent();
6951     return true;
6952   }
6953 
6954   // We need a block split to make the real endpgm a terminator. We also don't
6955   // want to break phis in successor blocks, so we can't just delete to the
6956   // end of the block.
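  // Resulting CFG (sketch):
  //   BB:      ...; S_CBRANCH_EXECNZ %TrapBB (falls through to the split block)
  //   TrapBB:  S_ENDPGM 0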
6957   BB.splitAt(MI, false /*UpdateLiveIns*/);
6958   MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
6959   MF->push_back(TrapBB);
6960   BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
6961     .addImm(0);
6962   BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
6963     .addMBB(TrapBB);
6964 
6965   BB.addSuccessor(TrapBB);
6966   MI.eraseFromParent();
6967   return true;
6968 }
6969 
6970 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
6971     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6972   MachineFunction &MF = B.getMF();
6973   const LLT S64 = LLT::scalar(64);
6974 
6975   Register SGPR01(AMDGPU::SGPR0_SGPR1);
6976   // For code object version 5, queue_ptr is passed through implicit kernarg.
6977   if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
6978       AMDGPU::AMDHSA_COV5) {
6979     AMDGPUTargetLowering::ImplicitParameter Param =
6980         AMDGPUTargetLowering::QUEUE_PTR;
6981     uint64_t Offset =
6982         ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
6983 
6984     Register KernargPtrReg = MRI.createGenericVirtualRegister(
6985         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
6986 
6987     if (!loadInputValue(KernargPtrReg, B,
6988                         AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
6989       return false;
6990 
6991     // TODO: can we be smarter about machine pointer info?
6992     MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
6993     MachineMemOperand *MMO = MF.getMachineMemOperand(
6994         PtrInfo,
6995         MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
6996             MachineMemOperand::MOInvariant,
6997         LLT::scalar(64), commonAlignment(Align(64), Offset));
6998 
6999     // Compute the address of queue_ptr in the implicit kernarg segment.
7000     Register LoadAddr = MRI.createGenericVirtualRegister(
7001         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
7002     B.buildPtrAdd(LoadAddr, KernargPtrReg,
7003                   B.buildConstant(LLT::scalar(64), Offset).getReg(0));
7004     // Load the queue pointer from that address.
7005     Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
7006     B.buildCopy(SGPR01, Temp);
7007     B.buildInstr(AMDGPU::S_TRAP)
7008         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
7009         .addReg(SGPR01, RegState::Implicit);
7010     MI.eraseFromParent();
7011     return true;
7012   }
7013 
7014   // Pass the queue pointer to the trap handler as input, and insert the trap.
7015   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
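  // Emitted sequence (sketch):
  //   s[0:1] = COPY %queue_ptr
  //   s_trap 2   ; TrapID::LLVMAMDHSATrap, reads s[0:1] implicitly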
7016   Register LiveIn =
7017     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
7018   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
7019     return false;
7020 
7021   B.buildCopy(SGPR01, LiveIn);
7022   B.buildInstr(AMDGPU::S_TRAP)
7023       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
7024       .addReg(SGPR01, RegState::Implicit);
7025 
7026   MI.eraseFromParent();
7027   return true;
7028 }
7029 
7030 bool AMDGPULegalizerInfo::legalizeTrapHsa(MachineInstr &MI,
7031                                           MachineRegisterInfo &MRI,
7032                                           MachineIRBuilder &B) const {
7033   // We need to simulate the 's_trap 2' instruction on targets that run in
7034   // PRIV=1 (where it is treated as a nop).
7035   if (ST.hasPrivEnabledTrap2NopBug()) {
7036     ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI,
7037                                            MI.getDebugLoc());
7038     MI.eraseFromParent();
7039     return true;
7040   }
7041 
7042   B.buildInstr(AMDGPU::S_TRAP)
7043       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
7044   MI.eraseFromParent();
7045   return true;
7046 }
7047 
7048 bool AMDGPULegalizerInfo::legalizeDebugTrap(MachineInstr &MI,
7049                                             MachineRegisterInfo &MRI,
7050                                             MachineIRBuilder &B) const {
7051   // If this is a non-HSA target or the trap handler is disabled, report a
7052   // warning instead of emitting a trap.
7053   if (!ST.isTrapHandlerEnabled() ||
7054       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
7055     Function &Fn = B.getMF().getFunction();
7056     Fn.getContext().diagnose(DiagnosticInfoUnsupported(
7057         Fn, "debugtrap handler not supported", MI.getDebugLoc(), DS_Warning));
7058   } else {
7059     // Insert debug-trap instruction
7060     B.buildInstr(AMDGPU::S_TRAP)
7061         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
7062   }
7063 
7064   MI.eraseFromParent();
7065   return true;
7066 }
7067 
7068 bool AMDGPULegalizerInfo::legalizeBVHIntersectRayIntrinsic(
7069     MachineInstr &MI, MachineIRBuilder &B) const {
7070   MachineRegisterInfo &MRI = *B.getMRI();
7071   const LLT S16 = LLT::scalar(16);
7072   const LLT S32 = LLT::scalar(32);
7073   const LLT V2S16 = LLT::fixed_vector(2, 16);
7074   const LLT V3S32 = LLT::fixed_vector(3, 32);
7075 
7076   Register DstReg = MI.getOperand(0).getReg();
7077   Register NodePtr = MI.getOperand(2).getReg();
7078   Register RayExtent = MI.getOperand(3).getReg();
7079   Register RayOrigin = MI.getOperand(4).getReg();
7080   Register RayDir = MI.getOperand(5).getReg();
7081   Register RayInvDir = MI.getOperand(6).getReg();
7082   Register TDescr = MI.getOperand(7).getReg();
7083 
7084   if (!ST.hasGFX10_AEncoding()) {
7085     Function &Fn = B.getMF().getFunction();
7086     Fn.getContext().diagnose(DiagnosticInfoUnsupported(
7087         Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));
7088     return false;
7089   }
7090 
7091   const bool IsGFX11 = AMDGPU::isGFX11(ST);
7092   const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
7093   const bool IsGFX12Plus = AMDGPU::isGFX12Plus(ST);
7094   const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
7095   const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
7096   const unsigned NumVDataDwords = 4;
7097   const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
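  // VAddr dword accounting: node_ptr (1 or 2) + ray_extent (1) + ray_origin
  // (3) + ray_dir and ray_inv_dir (3 + 3 dwords, or 3 total when packed as
  // A16 halves) gives the 8/9/11/12 figures above.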
7098   const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
7099   const bool UseNSA =
7100       IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
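  // Without NSA, all address components must live in one contiguous register
  // tuple, so they are merged into a single vector operand further below.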
7101 
7102   const unsigned BaseOpcodes[2][2] = {
7103       {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
7104       {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
7105        AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
7106   int Opcode;
7107   if (UseNSA) {
7108     Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
7109                                    IsGFX12Plus ? AMDGPU::MIMGEncGfx12
7110                                    : IsGFX11   ? AMDGPU::MIMGEncGfx11NSA
7111                                                : AMDGPU::MIMGEncGfx10NSA,
7112                                    NumVDataDwords, NumVAddrDwords);
7113   } else {
7114     assert(!IsGFX12Plus);
7115     Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
7116                                    IsGFX11 ? AMDGPU::MIMGEncGfx11Default
7117                                            : AMDGPU::MIMGEncGfx10Default,
7118                                    NumVDataDwords, NumVAddrDwords);
7119   }
7120   assert(Opcode != -1);
7121 
7122   SmallVector<Register, 12> Ops;
7123   if (UseNSA && IsGFX11Plus) {
7124     auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
7125       auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
7126       auto Merged = B.buildMergeLikeInstr(
7127           V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
7128       Ops.push_back(Merged.getReg(0));
7129     };
7130 
7131     Ops.push_back(NodePtr);
7132     Ops.push_back(RayExtent);
7133     packLanes(RayOrigin);
7134 
7135     if (IsA16) {
7136       auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
7137       auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
7138       auto MergedDir = B.buildMergeLikeInstr(
7139           V3S32,
7140           {B.buildBitcast(
7141                 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
7142                                                    UnmergeRayDir.getReg(0)}))
7143                .getReg(0),
7144            B.buildBitcast(
7145                 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
7146                                                    UnmergeRayDir.getReg(1)}))
7147                .getReg(0),
7148            B.buildBitcast(
7149                 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
7150                                                    UnmergeRayDir.getReg(2)}))
7151                .getReg(0)});
7152       Ops.push_back(MergedDir.getReg(0));
7153     } else {
7154       packLanes(RayDir);
7155       packLanes(RayInvDir);
7156     }
7157   } else {
7158     if (Is64) {
7159       auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
7160       Ops.push_back(Unmerge.getReg(0));
7161       Ops.push_back(Unmerge.getReg(1));
7162     } else {
7163       Ops.push_back(NodePtr);
7164     }
7165     Ops.push_back(RayExtent);
7166 
7167     auto packLanes = [&Ops, &S32, &B](Register Src) {
7168       auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
7169       Ops.push_back(Unmerge.getReg(0));
7170       Ops.push_back(Unmerge.getReg(1));
7171       Ops.push_back(Unmerge.getReg(2));
7172     };
7173 
7174     packLanes(RayOrigin);
7175     if (IsA16) {
7176       auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
7177       auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
7178       Register R1 = MRI.createGenericVirtualRegister(S32);
7179       Register R2 = MRI.createGenericVirtualRegister(S32);
7180       Register R3 = MRI.createGenericVirtualRegister(S32);
7181       B.buildMergeLikeInstr(R1,
7182                             {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
7183       B.buildMergeLikeInstr(
7184           R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
7185       B.buildMergeLikeInstr(
7186           R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
7187       Ops.push_back(R1);
7188       Ops.push_back(R2);
7189       Ops.push_back(R3);
7190     } else {
7191       packLanes(RayDir);
7192       packLanes(RayInvDir);
7193     }
7194   }
7195 
7196   if (!UseNSA) {
7197     // Build a single vector containing all of the operands prepared so far.
7198     LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
7199     Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
7200     Ops.clear();
7201     Ops.push_back(MergedOps);
7202   }
7203 
7204   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY)
7205                  .addDef(DstReg)
7206                  .addImm(Opcode);
7207 
7208   for (Register R : Ops) {
7209     MIB.addUse(R);
7210   }
7211 
7212   MIB.addUse(TDescr)
7213      .addImm(IsA16 ? 1 : 0)
7214      .cloneMemRefs(MI);
7215 
7216   MI.eraseFromParent();
7217   return true;
7218 }
7219 
7220 bool AMDGPULegalizerInfo::legalizeBVHDualOrBVH8IntersectRayIntrinsic(
7221     MachineInstr &MI, MachineIRBuilder &B) const {
7222   const LLT S32 = LLT::scalar(32);
7223   const LLT V2S32 = LLT::fixed_vector(2, 32);
7224 
7225   Register DstReg = MI.getOperand(0).getReg();
7226   Register DstOrigin = MI.getOperand(1).getReg();
7227   Register DstDir = MI.getOperand(2).getReg();
7228   Register NodePtr = MI.getOperand(4).getReg();
7229   Register RayExtent = MI.getOperand(5).getReg();
7230   Register InstanceMask = MI.getOperand(6).getReg();
7231   Register RayOrigin = MI.getOperand(7).getReg();
7232   Register RayDir = MI.getOperand(8).getReg();
7233   Register Offsets = MI.getOperand(9).getReg();
7234   Register TDescr = MI.getOperand(10).getReg();
7235 
7236   if (!ST.hasBVHDualAndBVH8Insts()) {
7237     Function &Fn = B.getMF().getFunction();
7238     Fn.getContext().diagnose(DiagnosticInfoUnsupported(
7239         Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));
7240     return false;
7241   }
7242 
7243   bool IsBVH8 = cast<GIntrinsic>(MI).getIntrinsicID() ==
7244                 Intrinsic::amdgcn_image_bvh8_intersect_ray;
7245   const unsigned NumVDataDwords = 10;
7246   const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
7247   int Opcode = AMDGPU::getMIMGOpcode(
7248       IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
7249              : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
7250       AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
7251   assert(Opcode != -1);
7252 
7253   auto RayExtentInstanceMaskVec = B.buildMergeLikeInstr(
7254       V2S32, {RayExtent, B.buildAnyExt(S32, InstanceMask)});
7255 
7256   B.buildInstr(IsBVH8 ? AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY
7257                       : AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY)
7258       .addDef(DstReg)
7259       .addDef(DstOrigin)
7260       .addDef(DstDir)
7261       .addImm(Opcode)
7262       .addUse(NodePtr)
7263       .addUse(RayExtentInstanceMaskVec.getReg(0))
7264       .addUse(RayOrigin)
7265       .addUse(RayDir)
7266       .addUse(Offsets)
7267       .addUse(TDescr)
7268       .cloneMemRefs(MI);
7269 
7270   MI.eraseFromParent();
7271   return true;
7272 }
7273 
7274 bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI,
7275                                             MachineIRBuilder &B) const {
7276   const SITargetLowering *TLI = ST.getTargetLowering();
7277   Register StackPtr = TLI->getStackPointerRegisterToSaveRestore();
7278   Register DstReg = MI.getOperand(0).getReg();
7279   B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
7280   MI.eraseFromParent();
7281   return true;
7282 }
7283 
7284 bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI,
7285                                          MachineIRBuilder &B) const {
7286   // With architected SGPRs, the wave ID within the workgroup is in TTMP8[29:25].
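  // Equivalently: wave_id = (ttmp8 >> 25) & 0x1f, built here as
  // G_UBFX(ttmp8, 25, 5).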
7287   if (!ST.hasArchitectedSGPRs())
7288     return false;
7289   LLT S32 = LLT::scalar(32);
7290   Register DstReg = MI.getOperand(0).getReg();
7291   auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8));
7292   auto LSB = B.buildConstant(S32, 25);
7293   auto Width = B.buildConstant(S32, 5);
7294   B.buildUbfx(DstReg, TTMP8, LSB, Width);
7295   MI.eraseFromParent();
7296   return true;
7297 }
7298 
7299 static constexpr unsigned FPEnvModeBitField =
7300     AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 23);
7301 
7302 static constexpr unsigned FPEnvTrapBitField =
7303     AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_TRAPSTS, 0, 5);
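
// These encode the s_getreg/s_setreg hwreg(id, offset, width) immediates
// selecting MODE bits [22:0] and TRAPSTS bits [4:0]. The legalizers below
// treat the FP environment as one s64 value with MODE in the low half and
// TRAPSTS in the high half, e.g. (sketch):
//   %env:_(s64) = G_MERGE_VALUES %mode:_(s32), %trapsts:_(s32)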
7304 
7305 bool AMDGPULegalizerInfo::legalizeGetFPEnv(MachineInstr &MI,
7306                                            MachineRegisterInfo &MRI,
7307                                            MachineIRBuilder &B) const {
7308   Register Dst = MI.getOperand(0).getReg();
7309   if (MRI.getType(Dst) != S64)
7310     return false;
7311 
7312   auto ModeReg =
7313       B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
7314                        /*HasSideEffects=*/true, /*isConvergent=*/false)
7315           .addImm(FPEnvModeBitField);
7316   auto TrapReg =
7317       B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
7318                        /*HasSideEffects=*/true, /*isConvergent=*/false)
7319           .addImm(FPEnvTrapBitField);
7320   B.buildMergeLikeInstr(Dst, {ModeReg, TrapReg});
7321   MI.eraseFromParent();
7322   return true;
7323 }
7324 
7325 bool AMDGPULegalizerInfo::legalizeSetFPEnv(MachineInstr &MI,
7326                                            MachineRegisterInfo &MRI,
7327                                            MachineIRBuilder &B) const {
7328   Register Src = MI.getOperand(0).getReg();
7329   if (MRI.getType(Src) != S64)
7330     return false;
7331 
7332   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
7333   B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
7334                    /*HasSideEffects=*/true, /*isConvergent=*/false)
7335       .addImm(static_cast<int16_t>(FPEnvModeBitField))
7336       .addReg(Unmerge.getReg(0));
7337   B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
7338                    /*HasSideEffects=*/true, /*isConvergent=*/false)
7339       .addImm(static_cast<int16_t>(FPEnvTrapBitField))
7340       .addReg(Unmerge.getReg(1));
7341   MI.eraseFromParent();
7342   return true;
7343 }
7344 
7345 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
7346                                             MachineInstr &MI) const {
7347   MachineIRBuilder &B = Helper.MIRBuilder;
7348   MachineRegisterInfo &MRI = *B.getMRI();
7349 
7350   // Replace the G_BRCOND user with the exec-manipulating branch pseudos.
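  // e.g. for amdgcn.if (sketch; names match the variables below):
  //   %Def = SI_IF %Use, %UncondBrTarget
  //   G_BR %CondBrTarget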
7351   auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
7352   switch (IntrID) {
7353   case Intrinsic::amdgcn_if:
7354   case Intrinsic::amdgcn_else: {
7355     MachineInstr *Br = nullptr;
7356     MachineBasicBlock *UncondBrTarget = nullptr;
7357     bool Negated = false;
7358     if (MachineInstr *BrCond =
7359             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
7360       const SIRegisterInfo *TRI
7361         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
7362 
7363       Register Def = MI.getOperand(1).getReg();
7364       Register Use = MI.getOperand(3).getReg();
7365 
7366       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
7367 
7368       if (Negated)
7369         std::swap(CondBrTarget, UncondBrTarget);
7370 
7371       B.setInsertPt(B.getMBB(), BrCond->getIterator());
7372       if (IntrID == Intrinsic::amdgcn_if) {
7373         B.buildInstr(AMDGPU::SI_IF)
7374             .addDef(Def)
7375             .addUse(Use)
7376             .addMBB(UncondBrTarget);
7377       } else {
7378         B.buildInstr(AMDGPU::SI_ELSE)
7379             .addDef(Def)
7380             .addUse(Use)
7381             .addMBB(UncondBrTarget);
7382       }
7383 
7384       if (Br) {
7385         Br->getOperand(0).setMBB(CondBrTarget);
7386       } else {
7387         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
7388         // since we're swapping branch targets it needs to be reinserted.
7389         // FIXME: IRTranslator should probably not do this
7390         B.buildBr(*CondBrTarget);
7391       }
7392 
7393       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
7394       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
7395       MI.eraseFromParent();
7396       BrCond->eraseFromParent();
7397       return true;
7398     }
7399 
7400     return false;
7401   }
7402   case Intrinsic::amdgcn_loop: {
7403     MachineInstr *Br = nullptr;
7404     MachineBasicBlock *UncondBrTarget = nullptr;
7405     bool Negated = false;
7406     if (MachineInstr *BrCond =
7407             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
7408       const SIRegisterInfo *TRI
7409         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
7410 
7411       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
7412       Register Reg = MI.getOperand(2).getReg();
7413 
7414       if (Negated)
7415         std::swap(CondBrTarget, UncondBrTarget);
7416 
7417       B.setInsertPt(B.getMBB(), BrCond->getIterator());
7418       B.buildInstr(AMDGPU::SI_LOOP)
7419         .addUse(Reg)
7420         .addMBB(UncondBrTarget);
7421 
7422       if (Br)
7423         Br->getOperand(0).setMBB(CondBrTarget);
7424       else
7425         B.buildBr(*CondBrTarget);
7426 
7427       MI.eraseFromParent();
7428       BrCond->eraseFromParent();
7429       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
7430       return true;
7431     }
7432 
7433     return false;
7434   }
7435   case Intrinsic::amdgcn_addrspacecast_nonnull:
7436     return legalizeAddrSpaceCast(MI, MRI, B);
7437   case Intrinsic::amdgcn_make_buffer_rsrc:
7438     return legalizePointerAsRsrcIntrin(MI, MRI, B);
7439   case Intrinsic::amdgcn_kernarg_segment_ptr:
7440     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
7441       // This only makes sense to call in a kernel, so just lower to null.
7442       B.buildConstant(MI.getOperand(0).getReg(), 0);
7443       MI.eraseFromParent();
7444       return true;
7445     }
7446 
7447     return legalizePreloadedArgIntrin(
7448       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
7449   case Intrinsic::amdgcn_implicitarg_ptr:
7450     return legalizeImplicitArgPtr(MI, MRI, B);
7451   case Intrinsic::amdgcn_workitem_id_x:
7452     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
7453                                        AMDGPUFunctionArgInfo::WORKITEM_ID_X);
7454   case Intrinsic::amdgcn_workitem_id_y:
7455     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
7456                                        AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
7457   case Intrinsic::amdgcn_workitem_id_z:
7458     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
7459                                        AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
7460   case Intrinsic::amdgcn_workgroup_id_x:
7461     return legalizePreloadedArgIntrin(MI, MRI, B,
7462                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
7463   case Intrinsic::amdgcn_workgroup_id_y:
7464     return legalizePreloadedArgIntrin(MI, MRI, B,
7465                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
7466   case Intrinsic::amdgcn_workgroup_id_z:
7467     return legalizePreloadedArgIntrin(MI, MRI, B,
7468                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
7469   case Intrinsic::amdgcn_wave_id:
7470     return legalizeWaveID(MI, B);
7471   case Intrinsic::amdgcn_lds_kernel_id:
7472     return legalizePreloadedArgIntrin(MI, MRI, B,
7473                                       AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
7474   case Intrinsic::amdgcn_dispatch_ptr:
7475     return legalizePreloadedArgIntrin(MI, MRI, B,
7476                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
7477   case Intrinsic::amdgcn_queue_ptr:
7478     return legalizePreloadedArgIntrin(MI, MRI, B,
7479                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
7480   case Intrinsic::amdgcn_implicit_buffer_ptr:
7481     return legalizePreloadedArgIntrin(
7482       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
7483   case Intrinsic::amdgcn_dispatch_id:
7484     return legalizePreloadedArgIntrin(MI, MRI, B,
7485                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
7486   case Intrinsic::r600_read_ngroups_x:
7487     // TODO: Emit error for hsa
7488     return legalizeKernargMemParameter(MI, B,
7489                                        SI::KernelInputOffsets::NGROUPS_X);
7490   case Intrinsic::r600_read_ngroups_y:
7491     return legalizeKernargMemParameter(MI, B,
7492                                        SI::KernelInputOffsets::NGROUPS_Y);
7493   case Intrinsic::r600_read_ngroups_z:
7494     return legalizeKernargMemParameter(MI, B,
7495                                        SI::KernelInputOffsets::NGROUPS_Z);
7496   case Intrinsic::r600_read_local_size_x:
7497     // TODO: Could insert G_ASSERT_ZEXT from s16
7498     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X);
7499   case Intrinsic::r600_read_local_size_y:
7500     // TODO: Could insert G_ASSERT_ZEXT from s16
7501     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Y);
7502   case Intrinsic::r600_read_local_size_z:
7503     // TODO: Could insert G_ASSERT_ZEXT from s16
7504     return legalizeKernargMemParameter(MI, B,
7505                                        SI::KernelInputOffsets::LOCAL_SIZE_Z);
7506   case Intrinsic::amdgcn_fdiv_fast:
7507     return legalizeFDIVFastIntrin(MI, MRI, B);
7508   case Intrinsic::amdgcn_is_shared:
7509     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
7510   case Intrinsic::amdgcn_is_private:
7511     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
7512   case Intrinsic::amdgcn_wavefrontsize: {
7513     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
7514     MI.eraseFromParent();
7515     return true;
7516   }
7517   case Intrinsic::amdgcn_s_buffer_load:
7518     return legalizeSBufferLoad(Helper, MI);
7519   case Intrinsic::amdgcn_raw_buffer_store:
7520   case Intrinsic::amdgcn_raw_ptr_buffer_store:
7521   case Intrinsic::amdgcn_struct_buffer_store:
7522   case Intrinsic::amdgcn_struct_ptr_buffer_store:
7523     return legalizeBufferStore(MI, Helper, false, false);
7524   case Intrinsic::amdgcn_raw_buffer_store_format:
7525   case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
7526   case Intrinsic::amdgcn_struct_buffer_store_format:
7527   case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
7528     return legalizeBufferStore(MI, Helper, false, true);
7529   case Intrinsic::amdgcn_raw_tbuffer_store:
7530   case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
7531   case Intrinsic::amdgcn_struct_tbuffer_store:
7532   case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
7533     return legalizeBufferStore(MI, Helper, true, true);
7534   case Intrinsic::amdgcn_raw_buffer_load:
7535   case Intrinsic::amdgcn_raw_ptr_buffer_load:
7536   case Intrinsic::amdgcn_raw_atomic_buffer_load:
7537   case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
7538   case Intrinsic::amdgcn_struct_buffer_load:
7539   case Intrinsic::amdgcn_struct_ptr_buffer_load:
7540   case Intrinsic::amdgcn_struct_atomic_buffer_load:
7541   case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:
7542     return legalizeBufferLoad(MI, Helper, false, false);
7543   case Intrinsic::amdgcn_raw_buffer_load_format:
7544   case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
7545   case Intrinsic::amdgcn_struct_buffer_load_format:
7546   case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
7547     return legalizeBufferLoad(MI, Helper, true, false);
7548   case Intrinsic::amdgcn_raw_tbuffer_load:
7549   case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
7550   case Intrinsic::amdgcn_struct_tbuffer_load:
7551   case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
7552     return legalizeBufferLoad(MI, Helper, true, true);
7553   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
7554   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
7555   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
7556   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
7557   case Intrinsic::amdgcn_raw_buffer_atomic_add:
7558   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
7559   case Intrinsic::amdgcn_struct_buffer_atomic_add:
7560   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
7561   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
7562   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
7563   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
7564   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
7565   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
7566   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
7567   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
7568   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
7569   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
7570   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
7571   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
7572   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
7573   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
7574   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
7575   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
7576   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
7577   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
7578   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
7579   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
7580   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
7581   case Intrinsic::amdgcn_raw_buffer_atomic_and:
7582   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
7583   case Intrinsic::amdgcn_struct_buffer_atomic_and:
7584   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
7585   case Intrinsic::amdgcn_raw_buffer_atomic_or:
7586   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
7587   case Intrinsic::amdgcn_struct_buffer_atomic_or:
7588   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
7589   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
7590   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
7591   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
7592   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
7593   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
7594   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
7595   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
7596   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
7597   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
7598   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
7599   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
7600   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
7601   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
7602   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
7603   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
7604   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
7605   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
7606   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
7607   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
7608   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
7609   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
7610   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
7611   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
7612   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
7613   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
7614   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
7615   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
7616   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
7617     return legalizeBufferAtomic(MI, B, IntrID);
7618   case Intrinsic::amdgcn_rsq_clamp:
7619     return legalizeRsqClampIntrinsic(MI, MRI, B);
7620   case Intrinsic::amdgcn_image_bvh_intersect_ray:
7621     return legalizeBVHIntersectRayIntrinsic(MI, B);
7622   case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
7623   case Intrinsic::amdgcn_image_bvh8_intersect_ray:
7624     return legalizeBVHDualOrBVH8IntersectRayIntrinsic(MI, B);
7625   case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
7626   case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
7627   case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
7628   case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
7629   case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
7630   case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
7631   case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
7632   case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
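    // Any-extend a sub-32-bit index operand to s32 in place (also done for
    // the iu4/iu8 variants below).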
7633     Register Index = MI.getOperand(5).getReg();
7634     LLT S32 = LLT::scalar(32);
7635     if (MRI.getType(Index) != S32)
7636       MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
7637     return true;
7638   }
7639   case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
7640   case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
7641   case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
7642     Register Index = MI.getOperand(7).getReg();
7643     LLT S32 = LLT::scalar(32);
7644     if (MRI.getType(Index) != S32)
7645       MI.getOperand(7).setReg(B.buildAnyExt(S32, Index).getReg(0));
7646     return true;
7647   }
7648   case Intrinsic::amdgcn_fmed3: {
7649     GISelChangeObserver &Observer = Helper.Observer;
7650 
7651     // FIXME: This is to work around the inability of tablegen match
7652     // combiners to match intrinsics in patterns.
7653     Observer.changingInstr(MI);
7654     MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
7655     MI.removeOperand(1);
7656     Observer.changedInstr(MI);
7657     return true;
7658   }
7659   case Intrinsic::amdgcn_readlane:
7660   case Intrinsic::amdgcn_writelane:
7661   case Intrinsic::amdgcn_readfirstlane:
7662   case Intrinsic::amdgcn_permlane16:
7663   case Intrinsic::amdgcn_permlanex16:
7664   case Intrinsic::amdgcn_permlane64:
7665   case Intrinsic::amdgcn_set_inactive:
7666   case Intrinsic::amdgcn_set_inactive_chain_arg:
7667   case Intrinsic::amdgcn_mov_dpp8:
7668   case Intrinsic::amdgcn_update_dpp:
7669     return legalizeLaneOp(Helper, MI, IntrID);
7670   case Intrinsic::amdgcn_s_buffer_prefetch_data:
7671     return legalizeSBufferPrefetch(Helper, MI);
7672   case Intrinsic::amdgcn_dead: {
7673     // TODO: Use poison instead of undef
7674     for (const MachineOperand &Def : MI.defs())
7675       B.buildUndef(Def);
7676     MI.eraseFromParent();
7677     return true;
7678   }
7679   default: {
7680     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
7681             AMDGPU::getImageDimIntrinsicInfo(IntrID))
7682       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
7683     return true;
7684   }
7685   }
7686 
7687   return true;
7688 }
7689