xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (revision af23369a6deaaeb612ab266eb88b8bb8d560c322)
1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 #include "llvm/IR/IntrinsicsR600.h"
30 
31 #define DEBUG_TYPE "amdgpu-legalinfo"
32 
33 using namespace llvm;
34 using namespace LegalizeActions;
35 using namespace LegalizeMutations;
36 using namespace LegalityPredicates;
37 using namespace MIPatternMatch;
38 
39 // Hack until load/store selection patterns support any tuple of legal types.
40 static cl::opt<bool> EnableNewLegality(
41   "amdgpu-global-isel-new-legality",
42   cl::desc("Use GlobalISel desired legality, rather than try to use "
43            "rules compatible with selection patterns"),
44   cl::init(false),
45   cl::ReallyHidden);
46 
47 static constexpr unsigned MaxRegisterSize = 1024;
48 
49 // Round the number of elements up to the next power of two.
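// e.g. <3 x s16> -> <4 x s16>, <5 x s32> -> <8 x s32>.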
50 static LLT getPow2VectorType(LLT Ty) {
51   unsigned NElts = Ty.getNumElements();
52   unsigned Pow2NElts = 1 <<  Log2_32_Ceil(NElts);
53   return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
54 }
55 
56 // Round the number of bits up to the next power of two.
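// e.g. s17 -> s32, s48 -> s64.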
57 static LLT getPow2ScalarType(LLT Ty) {
58   unsigned Bits = Ty.getSizeInBits();
59   unsigned Pow2Bits = 1 <<  Log2_32_Ceil(Bits);
60   return LLT::scalar(Pow2Bits);
61 }
62 
63 /// \returns true if this is an odd-sized vector which should be widened by
64 /// adding an additional element. This is mostly to handle <3 x s16> -> <4 x s16>.
65 /// This excludes s1 vectors, which should always be scalarized.
66 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
67   return [=](const LegalityQuery &Query) {
68     const LLT Ty = Query.Types[TypeIdx];
69     if (!Ty.isVector())
70       return false;
71 
72     const LLT EltTy = Ty.getElementType();
73     const unsigned EltSize = EltTy.getSizeInBits();
74     return Ty.getNumElements() % 2 != 0 &&
75            EltSize > 1 && EltSize < 32 &&
76            Ty.getSizeInBits() % 32 != 0;
77   };
78 }
79 
80 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
81   return [=](const LegalityQuery &Query) {
82     const LLT Ty = Query.Types[TypeIdx];
83     return Ty.getSizeInBits() % 32 == 0;
84   };
85 }
86 
87 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
88   return [=](const LegalityQuery &Query) {
89     const LLT Ty = Query.Types[TypeIdx];
90     const LLT EltTy = Ty.getScalarType();
91     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
92   };
93 }
94 
95 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
96   return [=](const LegalityQuery &Query) {
97     const LLT Ty = Query.Types[TypeIdx];
98     const LLT EltTy = Ty.getElementType();
99     return std::make_pair(TypeIdx,
100                           LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
101   };
102 }
103 
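// Reduce the element count so each resulting piece is at most 64 bits wide,
// e.g. <3 x s32> -> <2 x s32> and <6 x s32> -> <2 x s32>. Used below with
// fewerElementsIf on vectors wider than 64 bits.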
104 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
105   return [=](const LegalityQuery &Query) {
106     const LLT Ty = Query.Types[TypeIdx];
107     const LLT EltTy = Ty.getElementType();
108     unsigned Size = Ty.getSizeInBits();
109     unsigned Pieces = (Size + 63) / 64;
110     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
111     return std::make_pair(
112         TypeIdx,
113         LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
114   };
115 }
116 
117 // Increase the number of vector elements so that the total size reaches the
118 // next multiple of 32 bits.
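// e.g. <3 x s8> (24 bits) -> <4 x s8> (32 bits),
//      <5 x s16> (80 bits) -> <6 x s16> (96 bits).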
119 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
120   return [=](const LegalityQuery &Query) {
121     const LLT Ty = Query.Types[TypeIdx];
122 
123     const LLT EltTy = Ty.getElementType();
124     const int Size = Ty.getSizeInBits();
125     const int EltSize = EltTy.getSizeInBits();
126     const int NextMul32 = (Size + 31) / 32;
127 
128     assert(EltSize < 32);
129 
130     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
131     return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
132   };
133 }
134 
135 static LLT getBitcastRegisterType(const LLT Ty) {
136   const unsigned Size = Ty.getSizeInBits();
137 
138   if (Size <= 32) {
139     // <2 x s8> -> s16
140     // <4 x s8> -> s32
141     return LLT::scalar(Size);
142   }
143 
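  // <4 x s16> -> <2 x s32>
  // <6 x s16> -> <3 x s32>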
144   return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
145 }
146 
147 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
148   return [=](const LegalityQuery &Query) {
149     const LLT Ty = Query.Types[TypeIdx];
150     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
151   };
152 }
153 
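// Bitcast to a vector of 32-bit elements, e.g. <4 x s16> -> <2 x s32>,
// s96 -> <3 x s32>. The total size must already be a multiple of 32 bits.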
154 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
155   return [=](const LegalityQuery &Query) {
156     const LLT Ty = Query.Types[TypeIdx];
157     unsigned Size = Ty.getSizeInBits();
158     assert(Size % 32 == 0);
159     return std::make_pair(
160         TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
161   };
162 }
163 
164 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
165   return [=](const LegalityQuery &Query) {
166     const LLT QueryTy = Query.Types[TypeIdx];
167     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
168   };
169 }
170 
171 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
172   return [=](const LegalityQuery &Query) {
173     const LLT QueryTy = Query.Types[TypeIdx];
174     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
175   };
176 }
177 
178 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
179   return [=](const LegalityQuery &Query) {
180     const LLT QueryTy = Query.Types[TypeIdx];
181     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
182   };
183 }
184 
185 static bool isRegisterSize(unsigned Size) {
186   return Size % 32 == 0 && Size <= MaxRegisterSize;
187 }
188 
189 static bool isRegisterVectorElementType(LLT EltTy) {
190   const int EltSize = EltTy.getSizeInBits();
191   return EltSize == 16 || EltSize % 32 == 0;
192 }
193 
194 static bool isRegisterVectorType(LLT Ty) {
195   const int EltSize = Ty.getElementType().getSizeInBits();
196   return EltSize == 32 || EltSize == 64 ||
197          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
198          EltSize == 128 || EltSize == 256;
199 }
200 
201 static bool isRegisterType(LLT Ty) {
202   if (!isRegisterSize(Ty.getSizeInBits()))
203     return false;
204 
205   if (Ty.isVector())
206     return isRegisterVectorType(Ty);
207 
208   return true;
209 }
210 
211 // Any combination of 32 or 64-bit elements up to the maximum register size,
212 // and multiples of v2s16.
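// e.g. s32, s96, v2s16, v6s16, v2s32 and v2s64 qualify; v3s16 (48 bits) and
// v5s16 (80 bits) do not.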
213 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
214   return [=](const LegalityQuery &Query) {
215     return isRegisterType(Query.Types[TypeIdx]);
216   };
217 }
218 
219 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
220   return [=](const LegalityQuery &Query) {
221     const LLT QueryTy = Query.Types[TypeIdx];
222     if (!QueryTy.isVector())
223       return false;
224     const LLT EltTy = QueryTy.getElementType();
225     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
226   };
227 }
228 
229 // If we have a truncating store or an extending load with a data size larger
230 // than 32 bits, we need to reduce to a 32-bit type.
231 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
232   return [=](const LegalityQuery &Query) {
233     const LLT Ty = Query.Types[TypeIdx];
234     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
235            Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
236   };
237 }
238 
239 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
240 // handle some operations by just promoting the register during
241 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
242 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
243                                     bool IsLoad) {
244   switch (AS) {
245   case AMDGPUAS::PRIVATE_ADDRESS:
246     // FIXME: Private element size.
247     return ST.enableFlatScratch() ? 128 : 32;
248   case AMDGPUAS::LOCAL_ADDRESS:
249     return ST.useDS128() ? 128 : 64;
250   case AMDGPUAS::GLOBAL_ADDRESS:
251   case AMDGPUAS::CONSTANT_ADDRESS:
252   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
253     // Treat constant and global as identical. SMRD loads are sometimes usable for
254     // global loads (ideally constant address space should be eliminated)
255     // depending on the context. Legality cannot be context dependent, but
256     // RegBankSelect can split the load as necessary depending on the pointer
257     // register bank/uniformity and if the memory is invariant or not written in a
258     // kernel.
259     return IsLoad ? 512 : 128;
260   default:
261     // Flat addresses may contextually need to be split to 32-bit parts if they
262     // may alias scratch depending on the subtarget.
263     return 128;
264   }
265 }
266 
267 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
268                                  const LegalityQuery &Query) {
269   const LLT Ty = Query.Types[0];
270 
271   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
272   const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
273 
274   unsigned RegSize = Ty.getSizeInBits();
275   uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
276   uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
277   unsigned AS = Query.Types[1].getAddressSpace();
278 
279   // All of these need to be custom lowered to cast the pointer operand.
280   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
281     return false;
282 
283   // Do not handle extending vector loads.
284   if (Ty.isVector() && MemSize != RegSize)
285     return false;
286 
287   // TODO: We should be able to widen loads if the alignment is high enough, but
288   // we also need to modify the memory access size.
289 #if 0
290   // Accept widening loads based on alignment.
291   if (IsLoad && MemSize < Size)
292     MemSize = std::max(MemSize, Align);
293 #endif
294 
295   // Only 1-byte and 2-byte to 32-bit extloads are valid.
296   if (MemSize != RegSize && RegSize != 32)
297     return false;
298 
299   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
300     return false;
301 
302   switch (MemSize) {
303   case 8:
304   case 16:
305   case 32:
306   case 64:
307   case 128:
308     break;
309   case 96:
310     if (!ST.hasDwordx3LoadStores())
311       return false;
312     break;
313   case 256:
314   case 512:
315     // These may contextually need to be broken down.
316     break;
317   default:
318     return false;
319   }
320 
321   assert(RegSize >= MemSize);
322 
323   if (AlignBits < MemSize) {
324     const SITargetLowering *TLI = ST.getTargetLowering();
325     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
326                                                  Align(AlignBits / 8)))
327       return false;
328   }
329 
330   return true;
331 }
332 
333 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128, etc., so
334 // work around this. Eventually it should ignore the type for loads and only
335 // care about the size. Return true in cases where we will work around this for
336 // now by bitcasting.
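// e.g. this returns true for <6 x s16>, <8 x s16>, s96 and s128, which the
// load/store rules below then bitcast to <3 x s32> / <4 x s32>.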
337 static bool loadStoreBitcastWorkaround(const LLT Ty) {
338   if (EnableNewLegality)
339     return false;
340 
341   const unsigned Size = Ty.getSizeInBits();
342   if (Size <= 64)
343     return false;
344   if (!Ty.isVector())
345     return true;
346 
347   LLT EltTy = Ty.getElementType();
348   if (EltTy.isPointer())
349     return true;
350 
351   unsigned EltSize = EltTy.getSizeInBits();
352   return EltSize != 32 && EltSize != 64;
353 }
354 
355 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
356   const LLT Ty = Query.Types[0];
357   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
358          !loadStoreBitcastWorkaround(Ty);
359 }
360 
361 /// Return true if a load or store of the type should be lowered with a bitcast
362 /// to a different type.
363 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
364                                        const LLT MemTy) {
365   const unsigned MemSizeInBits = MemTy.getSizeInBits();
366   const unsigned Size = Ty.getSizeInBits();
367   if (Size != MemSizeInBits)
368     return Size <= 32 && Ty.isVector();
369 
370   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
371     return true;
372 
373   // Don't try to handle bitcasting vector ext loads for now.
374   return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
375          (Size <= 32 || isRegisterSize(Size)) &&
376          !isRegisterVectorElementType(Ty.getElementType());
377 }
378 
379 /// Return true if we should legalize a load by widening an odd-sized memory
380 /// access up to the alignment. Note that in this case the memory access itself
381 /// changes, not the size of the result register.
382 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
383                             uint64_t AlignInBits, unsigned AddrSpace,
384                             unsigned Opcode) {
385   unsigned SizeInBits = MemoryTy.getSizeInBits();
386   // We don't want to widen cases that are naturally legal.
387   if (isPowerOf2_32(SizeInBits))
388     return false;
389 
390   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
391   // end up widening these for a scalar load during RegBankSelect, since there
392   // aren't 96-bit scalar loads.
393   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
394     return false;
395 
396   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
397     return false;
398 
399   // A load is known dereferenceable up to the alignment, so it's legal to widen
400   // to it.
401   //
402   // TODO: Could check dereferenceable for less aligned cases.
403   unsigned RoundedSize = NextPowerOf2(SizeInBits);
404   if (AlignInBits < RoundedSize)
405     return false;
406 
407   // Do not widen if it would introduce a slow unaligned load.
408   const SITargetLowering *TLI = ST.getTargetLowering();
409   bool Fast = false;
410   return TLI->allowsMisalignedMemoryAccessesImpl(
411              RoundedSize, AddrSpace, Align(AlignInBits / 8),
412              MachineMemOperand::MOLoad, &Fast) &&
413          Fast;
414 }
415 
416 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
417                             unsigned Opcode) {
418   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
419     return false;
420 
421   return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
422                          Query.MMODescrs[0].AlignInBits,
423                          Query.Types[1].getAddressSpace(), Opcode);
424 }
425 
426 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
427                                          const GCNTargetMachine &TM)
428   :  ST(ST_) {
429   using namespace TargetOpcode;
430 
431   auto GetAddrSpacePtr = [&TM](unsigned AS) {
432     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
433   };
434 
435   const LLT S1 = LLT::scalar(1);
436   const LLT S8 = LLT::scalar(8);
437   const LLT S16 = LLT::scalar(16);
438   const LLT S32 = LLT::scalar(32);
439   const LLT S64 = LLT::scalar(64);
440   const LLT S128 = LLT::scalar(128);
441   const LLT S256 = LLT::scalar(256);
442   const LLT S512 = LLT::scalar(512);
443   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
444 
445   const LLT V2S8 = LLT::fixed_vector(2, 8);
446   const LLT V2S16 = LLT::fixed_vector(2, 16);
447   const LLT V4S16 = LLT::fixed_vector(4, 16);
448 
449   const LLT V2S32 = LLT::fixed_vector(2, 32);
450   const LLT V3S32 = LLT::fixed_vector(3, 32);
451   const LLT V4S32 = LLT::fixed_vector(4, 32);
452   const LLT V5S32 = LLT::fixed_vector(5, 32);
453   const LLT V6S32 = LLT::fixed_vector(6, 32);
454   const LLT V7S32 = LLT::fixed_vector(7, 32);
455   const LLT V8S32 = LLT::fixed_vector(8, 32);
456   const LLT V9S32 = LLT::fixed_vector(9, 32);
457   const LLT V10S32 = LLT::fixed_vector(10, 32);
458   const LLT V11S32 = LLT::fixed_vector(11, 32);
459   const LLT V12S32 = LLT::fixed_vector(12, 32);
460   const LLT V13S32 = LLT::fixed_vector(13, 32);
461   const LLT V14S32 = LLT::fixed_vector(14, 32);
462   const LLT V15S32 = LLT::fixed_vector(15, 32);
463   const LLT V16S32 = LLT::fixed_vector(16, 32);
464   const LLT V32S32 = LLT::fixed_vector(32, 32);
465 
466   const LLT V2S64 = LLT::fixed_vector(2, 64);
467   const LLT V3S64 = LLT::fixed_vector(3, 64);
468   const LLT V4S64 = LLT::fixed_vector(4, 64);
469   const LLT V5S64 = LLT::fixed_vector(5, 64);
470   const LLT V6S64 = LLT::fixed_vector(6, 64);
471   const LLT V7S64 = LLT::fixed_vector(7, 64);
472   const LLT V8S64 = LLT::fixed_vector(8, 64);
473   const LLT V16S64 = LLT::fixed_vector(16, 64);
474 
475   std::initializer_list<LLT> AllS32Vectors =
476     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
477      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
478   std::initializer_list<LLT> AllS64Vectors =
479     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
480 
481   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
482   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
483   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
484   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
485   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
486   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
487   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
488 
489   const LLT CodePtr = FlatPtr;
490 
491   const std::initializer_list<LLT> AddrSpaces64 = {
492     GlobalPtr, ConstantPtr, FlatPtr
493   };
494 
495   const std::initializer_list<LLT> AddrSpaces32 = {
496     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
497   };
498 
499   const std::initializer_list<LLT> FPTypesBase = {
500     S32, S64
501   };
502 
503   const std::initializer_list<LLT> FPTypes16 = {
504     S32, S64, S16
505   };
506 
507   const std::initializer_list<LLT> FPTypesPK16 = {
508     S32, S64, S16, V2S16
509   };
510 
511   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
512 
513   // s1 for VCC branches, s32 for SCC branches.
514   getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
515 
516   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
517   // elements for v3s16
518   getActionDefinitionsBuilder(G_PHI)
519     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
520     .legalFor(AllS32Vectors)
521     .legalFor(AllS64Vectors)
522     .legalFor(AddrSpaces64)
523     .legalFor(AddrSpaces32)
524     .legalIf(isPointer(0))
525     .clampScalar(0, S16, S256)
526     .widenScalarToNextPow2(0, 32)
527     .clampMaxNumElements(0, S32, 16)
528     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
529     .scalarize(0);
530 
531   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
532     // Full set of gfx9 features.
533     getActionDefinitionsBuilder({G_ADD, G_SUB})
534       .legalFor({S32, S16, V2S16})
535       .clampMaxNumElementsStrict(0, S16, 2)
536       .scalarize(0)
537       .minScalar(0, S16)
538       .widenScalarToNextMultipleOf(0, 32)
539       .maxScalar(0, S32);
540 
541     getActionDefinitionsBuilder(G_MUL)
542       .legalFor({S32, S16, V2S16})
543       .clampMaxNumElementsStrict(0, S16, 2)
544       .scalarize(0)
545       .minScalar(0, S16)
546       .widenScalarToNextMultipleOf(0, 32)
547       .custom();
548     assert(ST.hasMad64_32());
549 
550     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
551       .legalFor({S32, S16, V2S16}) // Clamp modifier
552       .minScalarOrElt(0, S16)
553       .clampMaxNumElementsStrict(0, S16, 2)
554       .scalarize(0)
555       .widenScalarToNextPow2(0, 32)
556       .lower();
557   } else if (ST.has16BitInsts()) {
558     getActionDefinitionsBuilder({G_ADD, G_SUB})
559       .legalFor({S32, S16})
560       .minScalar(0, S16)
561       .widenScalarToNextMultipleOf(0, 32)
562       .maxScalar(0, S32)
563       .scalarize(0);
564 
565     getActionDefinitionsBuilder(G_MUL)
566       .legalFor({S32, S16})
567       .scalarize(0)
568       .minScalar(0, S16)
569       .widenScalarToNextMultipleOf(0, 32)
570       .custom();
571     assert(ST.hasMad64_32());
572 
573     // Technically the saturating operations require clamp bit support, but this
574     // was introduced at the same time as 16-bit operations.
575     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
576       .legalFor({S32, S16}) // Clamp modifier
577       .minScalar(0, S16)
578       .scalarize(0)
579       .widenScalarToNextPow2(0, 16)
580       .lower();
581 
582     // We're just lowering this, but it helps get a better result to try to
583     // coerce to the desired type first.
584     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
585       .minScalar(0, S16)
586       .scalarize(0)
587       .lower();
588   } else {
589     getActionDefinitionsBuilder({G_ADD, G_SUB})
590       .legalFor({S32})
591       .widenScalarToNextMultipleOf(0, 32)
592       .clampScalar(0, S32, S32)
593       .scalarize(0);
594 
595     auto &Mul = getActionDefinitionsBuilder(G_MUL)
596       .legalFor({S32})
597       .scalarize(0)
598       .minScalar(0, S32)
599       .widenScalarToNextMultipleOf(0, 32);
600 
601     if (ST.hasMad64_32())
602       Mul.custom();
603     else
604       Mul.maxScalar(0, S32);
605 
606     if (ST.hasIntClamp()) {
607       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
608         .legalFor({S32}) // Clamp modifier.
609         .scalarize(0)
610         .minScalarOrElt(0, S32)
611         .lower();
612     } else {
613       // Clamp bit support was added in VI, along with 16-bit operations.
614       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
615         .minScalar(0, S32)
616         .scalarize(0)
617         .lower();
618     }
619 
620     // FIXME: DAG expansion gets better results. The widening uses the smaller
621     // range values and goes for the min/max lowering directly.
622     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
623       .minScalar(0, S32)
624       .scalarize(0)
625       .lower();
626   }
627 
628   getActionDefinitionsBuilder(
629       {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
630       .customFor({S32, S64})
631       .clampScalar(0, S32, S64)
632       .widenScalarToNextPow2(0, 32)
633       .scalarize(0);
634 
635   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
636                    .legalFor({S32})
637                    .maxScalar(0, S32);
638 
639   if (ST.hasVOP3PInsts()) {
640     Mulh
641       .clampMaxNumElements(0, S8, 2)
642       .lowerFor({V2S8});
643   }
644 
645   Mulh
646     .scalarize(0)
647     .lower();
648 
649   // Report legal for any types we can handle anywhere. For the cases only legal
650   // on the SALU, RegBankSelect will be able to re-legalize.
651   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
652     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
653     .clampScalar(0, S32, S64)
654     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
655     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
656     .widenScalarToNextPow2(0)
657     .scalarize(0);
658 
659   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
660                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
661     .legalFor({{S32, S1}, {S32, S32}})
662     .minScalar(0, S32)
663     .scalarize(0)
664     .lower();
665 
666   getActionDefinitionsBuilder(G_BITCAST)
667     // Don't worry about the size constraint.
668     .legalIf(all(isRegisterType(0), isRegisterType(1)))
669     .lower();
670 
671 
672   getActionDefinitionsBuilder(G_CONSTANT)
673     .legalFor({S1, S32, S64, S16, GlobalPtr,
674                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
675     .legalIf(isPointer(0))
676     .clampScalar(0, S32, S64)
677     .widenScalarToNextPow2(0);
678 
679   getActionDefinitionsBuilder(G_FCONSTANT)
680     .legalFor({S32, S64, S16})
681     .clampScalar(0, S16, S64);
682 
683   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
684       .legalIf(isRegisterType(0))
685       // s1 and s16 are special cases because they have legal operations on
686       // them, but don't really occupy registers in the normal way.
687       .legalFor({S1, S16})
688       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
689       .clampScalarOrElt(0, S32, MaxScalar)
690       .widenScalarToNextPow2(0, 32)
691       .clampMaxNumElements(0, S32, 16);
692 
693   getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
694 
695   // If the amount is divergent, we have to do a wave reduction to get the
696   // maximum value, so this is expanded during RegBankSelect.
697   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
698     .legalFor({{PrivatePtr, S32}});
699 
700   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
701     .customIf(typeIsNot(0, PrivatePtr));
702 
703   getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
704 
705   auto &FPOpActions = getActionDefinitionsBuilder(
706     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
707     .legalFor({S32, S64});
708   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
709     .customFor({S32, S64});
710   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
711     .customFor({S32, S64});
712 
713   if (ST.has16BitInsts()) {
714     if (ST.hasVOP3PInsts())
715       FPOpActions.legalFor({S16, V2S16});
716     else
717       FPOpActions.legalFor({S16});
718 
719     TrigActions.customFor({S16});
720     FDIVActions.customFor({S16});
721   }
722 
723   auto &MinNumMaxNum = getActionDefinitionsBuilder({
724       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
725 
726   if (ST.hasVOP3PInsts()) {
727     MinNumMaxNum.customFor(FPTypesPK16)
728       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
729       .clampMaxNumElements(0, S16, 2)
730       .clampScalar(0, S16, S64)
731       .scalarize(0);
732   } else if (ST.has16BitInsts()) {
733     MinNumMaxNum.customFor(FPTypes16)
734       .clampScalar(0, S16, S64)
735       .scalarize(0);
736   } else {
737     MinNumMaxNum.customFor(FPTypesBase)
738       .clampScalar(0, S32, S64)
739       .scalarize(0);
740   }
741 
742   if (ST.hasVOP3PInsts())
743     FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
744 
745   FPOpActions
746     .scalarize(0)
747     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
748 
749   TrigActions
750     .scalarize(0)
751     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
752 
753   FDIVActions
754     .scalarize(0)
755     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
756 
757   getActionDefinitionsBuilder({G_FNEG, G_FABS})
758     .legalFor(FPTypesPK16)
759     .clampMaxNumElementsStrict(0, S16, 2)
760     .scalarize(0)
761     .clampScalar(0, S16, S64);
762 
763   if (ST.has16BitInsts()) {
764     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
765       .legalFor({S32, S64, S16})
766       .scalarize(0)
767       .clampScalar(0, S16, S64);
768   } else {
769     getActionDefinitionsBuilder(G_FSQRT)
770       .legalFor({S32, S64})
771       .scalarize(0)
772       .clampScalar(0, S32, S64);
773 
774     if (ST.hasFractBug()) {
775       getActionDefinitionsBuilder(G_FFLOOR)
776         .customFor({S64})
777         .legalFor({S32, S64})
778         .scalarize(0)
779         .clampScalar(0, S32, S64);
780     } else {
781       getActionDefinitionsBuilder(G_FFLOOR)
782         .legalFor({S32, S64})
783         .scalarize(0)
784         .clampScalar(0, S32, S64);
785     }
786   }
787 
788   getActionDefinitionsBuilder(G_FPTRUNC)
789     .legalFor({{S32, S64}, {S16, S32}})
790     .scalarize(0)
791     .lower();
792 
793   getActionDefinitionsBuilder(G_FPEXT)
794     .legalFor({{S64, S32}, {S32, S16}})
795     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
796     .scalarize(0);
797 
798   auto &FSubActions = getActionDefinitionsBuilder(G_FSUB);
799   if (ST.has16BitInsts()) {
800     FSubActions
801       // Use actual fsub instruction
802       .legalFor({S32, S16})
803       // Must use fadd + fneg
804       .lowerFor({S64, V2S16});
805   } else {
806     FSubActions
807       // Use actual fsub instruction
808       .legalFor({S32})
809       // Must use fadd + fneg
810       .lowerFor({S64, S16, V2S16});
811   }
812 
813   FSubActions
814     .scalarize(0)
815     .clampScalar(0, S32, S64);
816 
817   // Whether this is legal depends on the floating point mode for the function.
818   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
819   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
820     FMad.customFor({S32, S16});
821   else if (ST.hasMadMacF32Insts())
822     FMad.customFor({S32});
823   else if (ST.hasMadF16())
824     FMad.customFor({S16});
825   FMad.scalarize(0)
826       .lower();
827 
828   auto &FRem = getActionDefinitionsBuilder(G_FREM);
829   if (ST.has16BitInsts()) {
830     FRem.customFor({S16, S32, S64});
831   } else {
832     FRem.minScalar(0, S32)
833         .customFor({S32, S64});
834   }
835   FRem.scalarize(0);
836 
837   // TODO: Do we need to clamp maximum bitwidth?
838   getActionDefinitionsBuilder(G_TRUNC)
839     .legalIf(isScalar(0))
840     .legalFor({{V2S16, V2S32}})
841     .clampMaxNumElements(0, S16, 2)
842     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
843     // situations (like an invalid implicit use), we don't want to loop
844     // infinitely in the legalizer.
845     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
846     .alwaysLegal();
847 
848   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
849     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
850                {S32, S1}, {S64, S1}, {S16, S1}})
851     .scalarize(0)
852     .clampScalar(0, S32, S64)
853     .widenScalarToNextPow2(1, 32);
854 
855   // TODO: Split s1->s64 during regbankselect for VALU.
856   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
857                     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
858                     .lowerIf(typeIs(1, S1))
859                     .customFor({{S32, S64}, {S64, S64}});
860   if (ST.has16BitInsts())
861     IToFP.legalFor({{S16, S16}});
862   IToFP.clampScalar(1, S32, S64)
863        .minScalar(0, S32)
864        .scalarize(0)
865        .widenScalarToNextPow2(1);
866 
867   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
868     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
869     .customFor({{S64, S32}, {S64, S64}})
870     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
871   if (ST.has16BitInsts())
872     FPToI.legalFor({{S16, S16}});
873   else
874     FPToI.minScalar(1, S32);
875 
876   FPToI.minScalar(0, S32)
877        .widenScalarToNextPow2(0, 32)
878        .scalarize(0)
879        .lower();
880 
881   getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
882       .customFor({S16, S32})
883       .scalarize(0)
884       .lower();
885 
886   // Lower roundeven into G_FRINT
887   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
888     .scalarize(0)
889     .lower();
890 
891   if (ST.has16BitInsts()) {
892     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
893       .legalFor({S16, S32, S64})
894       .clampScalar(0, S16, S64)
895       .scalarize(0);
896   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
897     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
898       .legalFor({S32, S64})
899       .clampScalar(0, S32, S64)
900       .scalarize(0);
901   } else {
902     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
903       .legalFor({S32})
904       .customFor({S64})
905       .clampScalar(0, S32, S64)
906       .scalarize(0);
907   }
908 
909   getActionDefinitionsBuilder(G_PTR_ADD)
910     .legalIf(all(isPointer(0), sameSize(0, 1)))
911     .scalarize(0)
912     .scalarSameSizeAs(1, 0);
913 
914   getActionDefinitionsBuilder(G_PTRMASK)
915     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
916     .scalarSameSizeAs(1, 0)
917     .scalarize(0);
918 
919   auto &CmpBuilder =
920     getActionDefinitionsBuilder(G_ICMP)
921     // The compare output type differs based on the register bank of the output,
922     // so make both s1 and s32 legal.
923     //
924     // Scalar compares producing output in scc will be promoted to s32, as that
925     // is the allocatable register type that will be needed for the copy from
926     // scc. This will be promoted during RegBankSelect, and we assume something
927     // before that won't try to use s32 result types.
928     //
929     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
930     // bank.
931     .legalForCartesianProduct(
932       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
933     .legalForCartesianProduct(
934       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
935   if (ST.has16BitInsts()) {
936     CmpBuilder.legalFor({{S1, S16}});
937   }
938 
939   CmpBuilder
940     .widenScalarToNextPow2(1)
941     .clampScalar(1, S32, S64)
942     .scalarize(0)
943     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
944 
945   getActionDefinitionsBuilder(G_FCMP)
946     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
947     .widenScalarToNextPow2(1)
948     .clampScalar(1, S32, S64)
949     .scalarize(0);
950 
951   // FIXME: fpow has a selection pattern that should move to custom lowering.
952   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
953   if (ST.has16BitInsts())
954     Exp2Ops.legalFor({S32, S16});
955   else
956     Exp2Ops.legalFor({S32});
957   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
958   Exp2Ops.scalarize(0);
959 
960   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
961   if (ST.has16BitInsts())
962     ExpOps.customFor({{S32}, {S16}});
963   else
964     ExpOps.customFor({S32});
965   ExpOps.clampScalar(0, MinScalarFPTy, S32)
966         .scalarize(0);
967 
968   getActionDefinitionsBuilder(G_FPOWI)
969     .clampScalar(0, MinScalarFPTy, S32)
970     .lower();
971 
972   // The 64-bit versions produce 32-bit results, but only on the SALU.
973   getActionDefinitionsBuilder(G_CTPOP)
974     .legalFor({{S32, S32}, {S32, S64}})
975     .clampScalar(0, S32, S32)
976     .widenScalarToNextPow2(1, 32)
977     .clampScalar(1, S32, S64)
978     .scalarize(0)
979     .widenScalarToNextPow2(0, 32);
980 
981 
982   // The hardware instructions return a different result on 0 than the generic
983   // instructions expect. The hardware produces -1, but these produce the
984   // bitwidth.
985   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
986     .scalarize(0)
987     .clampScalar(0, S32, S32)
988     .clampScalar(1, S32, S64)
989     .widenScalarToNextPow2(0, 32)
990     .widenScalarToNextPow2(1, 32)
991     .custom();
992 
993   // The 64-bit versions produce 32-bit results, but only on the SALU.
994   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
995     .legalFor({{S32, S32}, {S32, S64}})
996     .clampScalar(0, S32, S32)
997     .clampScalar(1, S32, S64)
998     .scalarize(0)
999     .widenScalarToNextPow2(0, 32)
1000     .widenScalarToNextPow2(1, 32);
1001 
1002   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1003   // RegBankSelect.
1004   getActionDefinitionsBuilder(G_BITREVERSE)
1005     .legalFor({S32, S64})
1006     .clampScalar(0, S32, S64)
1007     .scalarize(0)
1008     .widenScalarToNextPow2(0);
1009 
1010   if (ST.has16BitInsts()) {
1011     getActionDefinitionsBuilder(G_BSWAP)
1012       .legalFor({S16, S32, V2S16})
1013       .clampMaxNumElementsStrict(0, S16, 2)
1014       // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
1015       // narrowScalar limitation.
1016       .widenScalarToNextPow2(0)
1017       .clampScalar(0, S16, S32)
1018       .scalarize(0);
1019 
1020     if (ST.hasVOP3PInsts()) {
1021       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1022         .legalFor({S32, S16, V2S16})
1023         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1024         .clampMaxNumElements(0, S16, 2)
1025         .minScalar(0, S16)
1026         .widenScalarToNextPow2(0)
1027         .scalarize(0)
1028         .lower();
1029     } else {
1030       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1031         .legalFor({S32, S16})
1032         .widenScalarToNextPow2(0)
1033         .minScalar(0, S16)
1034         .scalarize(0)
1035         .lower();
1036     }
1037   } else {
1038     // TODO: Should have same legality without v_perm_b32
1039     getActionDefinitionsBuilder(G_BSWAP)
1040       .legalFor({S32})
1041       .lowerIf(scalarNarrowerThan(0, 32))
1042       // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
1043       // narrowScalar limitation.
1044       .widenScalarToNextPow2(0)
1045       .maxScalar(0, S32)
1046       .scalarize(0)
1047       .lower();
1048 
1049     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1050       .legalFor({S32})
1051       .minScalar(0, S32)
1052       .widenScalarToNextPow2(0)
1053       .scalarize(0)
1054       .lower();
1055   }
1056 
1057   getActionDefinitionsBuilder(G_INTTOPTR)
1058     // List the common cases
1059     .legalForCartesianProduct(AddrSpaces64, {S64})
1060     .legalForCartesianProduct(AddrSpaces32, {S32})
1061     .scalarize(0)
1062     // Accept any address space as long as the size matches
1063     .legalIf(sameSize(0, 1))
1064     .widenScalarIf(smallerThan(1, 0),
1065       [](const LegalityQuery &Query) {
1066         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1067       })
1068     .narrowScalarIf(largerThan(1, 0),
1069       [](const LegalityQuery &Query) {
1070         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1071       });
1072 
1073   getActionDefinitionsBuilder(G_PTRTOINT)
1074     // List the common cases
1075     .legalForCartesianProduct(AddrSpaces64, {S64})
1076     .legalForCartesianProduct(AddrSpaces32, {S32})
1077     .scalarize(0)
1078     // Accept any address space as long as the size matches
1079     .legalIf(sameSize(0, 1))
1080     .widenScalarIf(smallerThan(0, 1),
1081       [](const LegalityQuery &Query) {
1082         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1083       })
1084     .narrowScalarIf(
1085       largerThan(0, 1),
1086       [](const LegalityQuery &Query) {
1087         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1088       });
1089 
1090   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1091     .scalarize(0)
1092     .custom();
1093 
1094   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1095                                     bool IsLoad) -> bool {
1096     const LLT DstTy = Query.Types[0];
1097 
1098     // Split vector extloads.
1099     unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1100 
1101     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1102       return true;
1103 
1104     const LLT PtrTy = Query.Types[1];
1105     unsigned AS = PtrTy.getAddressSpace();
1106     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1107       return true;
1108 
1109     // Catch weird sized loads that don't evenly divide into the access sizes
1110     // TODO: May be able to widen depending on alignment etc.
1111     unsigned NumRegs = (MemSize + 31) / 32;
1112     if (NumRegs == 3) {
1113       if (!ST.hasDwordx3LoadStores())
1114         return true;
1115     } else {
1116       // If the alignment allows, these should have been widened.
1117       if (!isPowerOf2_32(NumRegs))
1118         return true;
1119     }
1120 
1121     return false;
1122   };
1123 
1124   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1125   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1126   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
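  // The last field of each memory descriptor below is the minimum alignment in
  // bits required for that case to be directly legal; a value of 0 accepts any
  // alignment (used when the subtarget allows unaligned buffer access).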
1127 
1128   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1129   // LDS
1130   // TODO: Unsupported flat for SI.
1131 
1132   for (unsigned Op : {G_LOAD, G_STORE}) {
1133     const bool IsStore = Op == G_STORE;
1134 
1135     auto &Actions = getActionDefinitionsBuilder(Op);
1136     // Explicitly list some common cases.
1137     // TODO: Does this help compile time at all?
1138     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1139                                       {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1140                                       {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1141                                       {S64, GlobalPtr, S64, GlobalAlign32},
1142                                       {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1143                                       {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1144                                       {S32, GlobalPtr, S8, GlobalAlign8},
1145                                       {S32, GlobalPtr, S16, GlobalAlign16},
1146 
1147                                       {S32, LocalPtr, S32, 32},
1148                                       {S64, LocalPtr, S64, 32},
1149                                       {V2S32, LocalPtr, V2S32, 32},
1150                                       {S32, LocalPtr, S8, 8},
1151                                       {S32, LocalPtr, S16, 16},
1152                                       {V2S16, LocalPtr, S32, 32},
1153 
1154                                       {S32, PrivatePtr, S32, 32},
1155                                       {S32, PrivatePtr, S8, 8},
1156                                       {S32, PrivatePtr, S16, 16},
1157                                       {V2S16, PrivatePtr, S32, 32},
1158 
1159                                       {S32, ConstantPtr, S32, GlobalAlign32},
1160                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1161                                       {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1162                                       {S64, ConstantPtr, S64, GlobalAlign32},
1163                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1164     Actions.legalIf(
1165       [=](const LegalityQuery &Query) -> bool {
1166         return isLoadStoreLegal(ST, Query);
1167       });
1168 
1169     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1170     // 64-bits.
1171     //
1172     // TODO: Should generalize bitcast action into coerce, which will also cover
1173     // inserting addrspacecasts.
1174     Actions.customIf(typeIs(1, Constant32Ptr));
1175 
1176     // Turn any illegal element vectors into something easier to deal
1177     // with. These will ultimately produce 32-bit scalar shifts to extract the
1178     // parts anyway.
1179     //
1180     // For odd 16-bit element vectors, prefer to split those into pieces with
1181     // 16-bit vector parts.
1182     Actions.bitcastIf(
1183       [=](const LegalityQuery &Query) -> bool {
1184         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1185                                           Query.MMODescrs[0].MemoryTy);
1186       }, bitcastToRegisterType(0));
1187 
1188     if (!IsStore) {
1189       // Widen suitably aligned loads by loading extra bytes. The standard
1190       // legalization actions can't properly express widening memory operands.
1191       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1192         return shouldWidenLoad(ST, Query, G_LOAD);
1193       });
1194     }
1195 
1196     // FIXME: load/store narrowing should be moved to lower action
1197     Actions
1198         .narrowScalarIf(
1199             [=](const LegalityQuery &Query) -> bool {
1200               return !Query.Types[0].isVector() &&
1201                      needToSplitMemOp(Query, Op == G_LOAD);
1202             },
1203             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1204               const LLT DstTy = Query.Types[0];
1205               const LLT PtrTy = Query.Types[1];
1206 
1207               const unsigned DstSize = DstTy.getSizeInBits();
1208               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1209 
1210               // Split extloads.
1211               if (DstSize > MemSize)
1212                 return std::make_pair(0, LLT::scalar(MemSize));
1213 
1214               unsigned MaxSize = maxSizeForAddrSpace(ST,
1215                                                      PtrTy.getAddressSpace(),
1216                                                      Op == G_LOAD);
1217               if (MemSize > MaxSize)
1218                 return std::make_pair(0, LLT::scalar(MaxSize));
1219 
1220               uint64_t Align = Query.MMODescrs[0].AlignInBits;
1221               return std::make_pair(0, LLT::scalar(Align));
1222             })
1223         .fewerElementsIf(
1224             [=](const LegalityQuery &Query) -> bool {
1225               return Query.Types[0].isVector() &&
1226                      needToSplitMemOp(Query, Op == G_LOAD);
1227             },
1228             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1229               const LLT DstTy = Query.Types[0];
1230               const LLT PtrTy = Query.Types[1];
1231 
1232               LLT EltTy = DstTy.getElementType();
1233               unsigned MaxSize = maxSizeForAddrSpace(ST,
1234                                                      PtrTy.getAddressSpace(),
1235                                                      Op == G_LOAD);
1236 
1237               // FIXME: Handle widened to power of 2 results better. This ends
1238               // up scalarizing.
1239               // FIXME: 3 element stores scalarized on SI
1240 
1241               // Split if it's too large for the address space.
1242               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1243               if (MemSize > MaxSize) {
1244                 unsigned NumElts = DstTy.getNumElements();
1245                 unsigned EltSize = EltTy.getSizeInBits();
1246 
1247                 if (MaxSize % EltSize == 0) {
1248                   return std::make_pair(
1249                       0, LLT::scalarOrVector(
1250                              ElementCount::getFixed(MaxSize / EltSize), EltTy));
1251                 }
1252 
1253                 unsigned NumPieces = MemSize / MaxSize;
1254 
1255                 // FIXME: Refine when odd breakdowns handled
1256                 // The scalars will need to be re-legalized.
1257                 if (NumPieces == 1 || NumPieces >= NumElts ||
1258                     NumElts % NumPieces != 0)
1259                   return std::make_pair(0, EltTy);
1260 
1261                 return std::make_pair(
1262                     0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
1263               }
1264 
1265               // FIXME: We could probably handle weird extending loads better.
1266               if (DstTy.getSizeInBits() > MemSize)
1267                 return std::make_pair(0, EltTy);
1268 
1269               unsigned EltSize = EltTy.getSizeInBits();
1270               unsigned DstSize = DstTy.getSizeInBits();
1271               if (!isPowerOf2_32(DstSize)) {
1272                 // We're probably decomposing an odd sized store. Try to split
1273                 // to the widest type. TODO: Account for alignment. As-is it
1274                 // should be OK, since the new parts will be further legalized.
1275                 unsigned FloorSize = PowerOf2Floor(DstSize);
1276                 return std::make_pair(
1277                     0, LLT::scalarOrVector(
1278                            ElementCount::getFixed(FloorSize / EltSize), EltTy));
1279               }
1280 
1281               // May need relegalization for the scalars.
1282               return std::make_pair(0, EltTy);
1283             })
1284     .minScalar(0, S32)
1285     .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1286     .widenScalarToNextPow2(0)
1287     .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1288     .lower();
1289   }
1290 
1291   // FIXME: Unaligned accesses not lowered.
1292   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1293                        .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1294                                                   {S32, GlobalPtr, S16, 2 * 8},
1295                                                   {S32, LocalPtr, S8, 8},
1296                                                   {S32, LocalPtr, S16, 16},
1297                                                   {S32, PrivatePtr, S8, 8},
1298                                                   {S32, PrivatePtr, S16, 16},
1299                                                   {S32, ConstantPtr, S8, 8},
1300                                                   {S32, ConstantPtr, S16, 2 * 8}})
1301                        .legalIf(
1302                          [=](const LegalityQuery &Query) -> bool {
1303                            return isLoadStoreLegal(ST, Query);
1304                          });
1305 
1306   if (ST.hasFlatAddressSpace()) {
1307     ExtLoads.legalForTypesWithMemDesc(
1308         {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1309   }
1310 
1311   // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1312   // 64-bits.
1313   //
1314   // TODO: Should generalize bitcast action into coerce, which will also cover
1315   // inserting addrspacecasts.
1316   ExtLoads.customIf(typeIs(1, Constant32Ptr));
1317 
1318   ExtLoads.clampScalar(0, S32, S32)
1319           .widenScalarToNextPow2(0)
1320           .lower();
1321 
1322   auto &Atomics = getActionDefinitionsBuilder(
1323     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1324      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1325      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1326      G_ATOMICRMW_UMIN})
1327     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1328                {S64, GlobalPtr}, {S64, LocalPtr},
1329                {S32, RegionPtr}, {S64, RegionPtr}});
1330   if (ST.hasFlatAddressSpace()) {
1331     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1332   }
1333 
1334   auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1335   if (ST.hasLDSFPAtomicAdd()) {
1336     Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1337     if (ST.hasGFX90AInsts())
1338       Atomic.legalFor({{S64, LocalPtr}});
1339     if (ST.hasGFX940Insts())
1340       Atomic.legalFor({{V2S16, LocalPtr}});
1341   }
1342   if (ST.hasAtomicFaddInsts())
1343     Atomic.legalFor({{S32, GlobalPtr}});
1344 
1345   if (ST.hasGFX90AInsts()) {
1346     // These are legal with some caveats, and should have undergone expansion in
1347     // the IR in most situations
1348     // TODO: Move atomic expansion into legalizer
1349     // TODO: Also supports <2 x f16>
1350     Atomic.legalFor({
1351         {S32, GlobalPtr},
1352         {S64, GlobalPtr},
1353         {S64, FlatPtr}
1354       });
1355   }
1356 
1357   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
1358   // demarshalling.
1359   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1360     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1361                 {S32, FlatPtr}, {S64, FlatPtr}})
1362     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1363                {S32, RegionPtr}, {S64, RegionPtr}});
1364   // TODO: Pointer types, any 32-bit or 64-bit vector
1365 
1366   // Condition should be s32 for scalar, s1 for vector.
1367   getActionDefinitionsBuilder(G_SELECT)
1368       .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1369                                  LocalPtr, FlatPtr, PrivatePtr,
1370                                  LLT::fixed_vector(2, LocalPtr),
1371                                  LLT::fixed_vector(2, PrivatePtr)},
1372                                 {S1, S32})
1373       .clampScalar(0, S16, S64)
1374       .scalarize(1)
1375       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1376       .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1377       .clampMaxNumElements(0, S32, 2)
1378       .clampMaxNumElements(0, LocalPtr, 2)
1379       .clampMaxNumElements(0, PrivatePtr, 2)
1380       .scalarize(0)
1381       .widenScalarToNextPow2(0)
1382       .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1383 
1384   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1385   // be more flexible with the shift amount type.
1386   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1387     .legalFor({{S32, S32}, {S64, S32}});
1388   if (ST.has16BitInsts()) {
1389     if (ST.hasVOP3PInsts()) {
1390       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1391             .clampMaxNumElements(0, S16, 2);
1392     } else
1393       Shifts.legalFor({{S16, S16}});
1394 
1395     // TODO: Support 16-bit shift amounts for all types
1396     Shifts.widenScalarIf(
1397       [=](const LegalityQuery &Query) {
1398         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1399         // 32-bit amount.
1400         const LLT ValTy = Query.Types[0];
1401         const LLT AmountTy = Query.Types[1];
1402         return ValTy.getSizeInBits() <= 16 &&
1403                AmountTy.getSizeInBits() < 16;
1404       }, changeTo(1, S16));
1405     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1406     Shifts.clampScalar(1, S32, S32);
1407     Shifts.widenScalarToNextPow2(0, 16);
1408     Shifts.clampScalar(0, S16, S64);
1409 
1410     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1411       .minScalar(0, S16)
1412       .scalarize(0)
1413       .lower();
1414   } else {
1415     // Make sure we legalize the shift amount type first, as the general
1416     // expansion for the shifted type will produce much worse code if it hasn't
1417     // been truncated already.
1418     Shifts.clampScalar(1, S32, S32);
1419     Shifts.widenScalarToNextPow2(0, 32);
1420     Shifts.clampScalar(0, S32, S64);
1421 
1422     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1423       .minScalar(0, S32)
1424       .scalarize(0)
1425       .lower();
1426   }
1427   Shifts.scalarize(0);
1428 
1429   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1430     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1431     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1432     unsigned IdxTypeIdx = 2;
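    // Note the operand layout: G_EXTRACT_VECTOR_ELT produces the element
    // (type index 0) from the vector (type index 1), while G_INSERT_VECTOR_ELT
    // produces the vector (type index 0) with the element (type index 1)
    // inserted. The index operand is type index 2 for both.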
1433 
1434     getActionDefinitionsBuilder(Op)
1435       .customIf([=](const LegalityQuery &Query) {
1436           const LLT EltTy = Query.Types[EltTypeIdx];
1437           const LLT VecTy = Query.Types[VecTypeIdx];
1438           const LLT IdxTy = Query.Types[IdxTypeIdx];
1439           const unsigned EltSize = EltTy.getSizeInBits();
1440           return (EltSize == 32 || EltSize == 64) &&
1441                   VecTy.getSizeInBits() % 32 == 0 &&
1442                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1443                   IdxTy.getSizeInBits() == 32;
1444         })
1445       .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1446                  bitcastToVectorElement32(VecTypeIdx))
1447       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1448       .bitcastIf(
1449         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1450         [=](const LegalityQuery &Query) {
1451           // For > 64-bit element types, try to turn this into a 64-bit
1452           // element vector since we may be able to do better indexing
1453           // if this is scalar. If not, fall back to 32.
1454           const LLT EltTy = Query.Types[EltTypeIdx];
1455           const LLT VecTy = Query.Types[VecTypeIdx];
1456           const unsigned DstEltSize = EltTy.getSizeInBits();
1457           const unsigned VecSize = VecTy.getSizeInBits();
1458 
1459           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1460           return std::make_pair(
1461               VecTypeIdx,
1462               LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1463         })
1464       .clampScalar(EltTypeIdx, S32, S64)
1465       .clampScalar(VecTypeIdx, S32, S64)
1466       .clampScalar(IdxTypeIdx, S32, S32)
1467       .clampMaxNumElements(VecTypeIdx, S32, 32)
1468       // TODO: Clamp elements for 64-bit vectors?
1469       // It should only be necessary with variable indexes.
1470       // As a last resort, lower to the stack
1471       .lower();
1472   }
1473 
1474   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1475     .unsupportedIf([=](const LegalityQuery &Query) {
1476         const LLT &EltTy = Query.Types[1].getElementType();
1477         return Query.Types[0] != EltTy;
1478       });
1479 
1480   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1481     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1482     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1483 
1484     // FIXME: Doesn't handle extract of illegal sizes.
1485     getActionDefinitionsBuilder(Op)
1486       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1487       .lowerIf([=](const LegalityQuery &Query) {
1488           // Sub-vector (or single element) insert and extract.
1489           // TODO: verify immediate offset here since lower only works with
1490           // whole elements.
1491           const LLT BigTy = Query.Types[BigTyIdx];
1492           return BigTy.isVector();
1493         })
1494       // FIXME: Multiples of 16 should not be legal.
1495       .legalIf([=](const LegalityQuery &Query) {
1496           const LLT BigTy = Query.Types[BigTyIdx];
1497           const LLT LitTy = Query.Types[LitTyIdx];
1498           return (BigTy.getSizeInBits() % 32 == 0) &&
1499                  (LitTy.getSizeInBits() % 16 == 0);
1500         })
1501       .widenScalarIf(
1502         [=](const LegalityQuery &Query) {
1503           const LLT BigTy = Query.Types[BigTyIdx];
1504           return (BigTy.getScalarSizeInBits() < 16);
1505         },
1506         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1507       .widenScalarIf(
1508         [=](const LegalityQuery &Query) {
1509           const LLT LitTy = Query.Types[LitTyIdx];
1510           return (LitTy.getScalarSizeInBits() < 16);
1511         },
1512         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1513       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1514       .widenScalarToNextPow2(BigTyIdx, 32);
1515 
1516   }
1517 
1518   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1519     .legalForCartesianProduct(AllS32Vectors, {S32})
1520     .legalForCartesianProduct(AllS64Vectors, {S64})
1521     .clampNumElements(0, V16S32, V32S32)
1522     .clampNumElements(0, V2S64, V16S64)
1523     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1524 
1525   if (ST.hasScalarPackInsts()) {
1526     BuildVector
1527       // FIXME: Should probably widen s1 vectors straight to s32
1528       .minScalarOrElt(0, S16)
1529       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1530       .minScalar(1, S32);
1531 
1532     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1533       .legalFor({V2S16, S32})
1534       .lower();
1535     BuildVector.minScalarOrElt(0, S32);
1536   } else {
1537     BuildVector.customFor({V2S16, S16});
1538     BuildVector.minScalarOrElt(0, S32);
1539 
1540     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1541       .customFor({V2S16, S32})
1542       .lower();
1543   }
1544 
1545   BuildVector.legalIf(isRegisterType(0));
1546 
1547   // FIXME: Clamp maximum size
1548   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1549     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1550     .clampMaxNumElements(0, S32, 32)
1551     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1552     .clampMaxNumElements(0, S16, 64);
1553 
1554   // TODO: Don't fully scalarize v2s16 pieces? Or combine those out
1555   // pre-legalization.
1556   if (ST.hasVOP3PInsts()) {
1557     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1558       .customFor({V2S16, V2S16})
1559       .lower();
1560   } else
1561     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1562 
1563   // Merge/Unmerge
1564   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1565     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1566     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1567 
1568     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1569       const LLT Ty = Query.Types[TypeIdx];
1570       if (Ty.isVector()) {
1571         const LLT &EltTy = Ty.getElementType();
1572         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1573           return true;
1574         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1575           return true;
1576       }
1577       return false;
1578     };
1579 
1580     auto &Builder = getActionDefinitionsBuilder(Op)
1581       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1582       .lowerFor({{S16, V2S16}})
1583       .lowerIf([=](const LegalityQuery &Query) {
1584           const LLT BigTy = Query.Types[BigTyIdx];
1585           return BigTy.getSizeInBits() == 32;
1586         })
1587       // Try to widen to s16 first for small types.
1588       // TODO: Only do this on targets with legal s16 shifts
1589       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1590       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1591       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1592       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1593                            elementTypeIs(1, S16)),
1594                        changeTo(1, V2S16))
1595       // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1596       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1597       // valid.
1598       .clampScalar(LitTyIdx, S32, S512)
1599       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1600       // Break up vectors with weird elements into scalars
1601       .fewerElementsIf(
1602         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1603         scalarize(0))
1604       .fewerElementsIf(
1605         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1606         scalarize(1))
1607       .clampScalar(BigTyIdx, S32, MaxScalar);
1608 
1609     if (Op == G_MERGE_VALUES) {
1610       Builder.widenScalarIf(
1611         // TODO: Use 16-bit shifts if legal for 8-bit values?
1612         [=](const LegalityQuery &Query) {
1613           const LLT Ty = Query.Types[LitTyIdx];
1614           return Ty.getSizeInBits() < 32;
1615         },
1616         changeTo(LitTyIdx, S32));
1617     }
1618 
1619     Builder.widenScalarIf(
1620       [=](const LegalityQuery &Query) {
1621         const LLT Ty = Query.Types[BigTyIdx];
1622         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1623           Ty.getSizeInBits() % 16 != 0;
1624       },
1625       [=](const LegalityQuery &Query) {
1626         // Pick the next power of 2, or round up to a multiple of 64 once the
1627         // size exceeds 128, whichever is smaller.
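             // For example, s65 widens to s128 (the next power of 2), while s260
             // widens to s320 (alignTo<64>(261)) rather than to s512.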
1628         const LLT &Ty = Query.Types[BigTyIdx];
1629         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1630         if (NewSizeInBits >= 256) {
1631           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1632           if (RoundedTo < NewSizeInBits)
1633             NewSizeInBits = RoundedTo;
1634         }
1635         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1636       })
1637       // Any vectors left are the wrong size. Scalarize them.
1638       .scalarize(0)
1639       .scalarize(1);
1640   }
1641 
1642   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1643   // RegBankSelect.
1644   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1645     .legalFor({{S32}, {S64}});
1646 
1647   if (ST.hasVOP3PInsts()) {
1648     SextInReg.lowerFor({{V2S16}})
1649       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1650       // get more vector shift opportunities, since we'll get those when
1651       // expanded.
1652       .clampMaxNumElementsStrict(0, S16, 2);
1653   } else if (ST.has16BitInsts()) {
1654     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1655   } else {
1656     // Prefer to promote to s32 before lowering if we don't have 16-bit
1657     // shifts. This avoids a lot of intermediate truncate and extend operations.
1658     SextInReg.lowerFor({{S32}, {S64}});
1659   }
1660 
1661   SextInReg
1662     .scalarize(0)
1663     .clampScalar(0, S32, S64)
1664     .lower();
1665 
1666   getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1667     .scalarize(0)
1668     .lower();
1669 
1670   // TODO: Only try to form v2s16 with legal packed instructions.
1671   getActionDefinitionsBuilder(G_FSHR)
1672     .legalFor({{S32, S32}})
1673     .lowerFor({{V2S16, V2S16}})
1674     .clampMaxNumElementsStrict(0, S16, 2)
1675     .scalarize(0)
1676     .lower();
1677 
1678   if (ST.hasVOP3PInsts()) {
1679     getActionDefinitionsBuilder(G_FSHL)
1680       .lowerFor({{V2S16, V2S16}})
1681       .clampMaxNumElementsStrict(0, S16, 2)
1682       .scalarize(0)
1683       .lower();
1684   } else {
1685     getActionDefinitionsBuilder(G_FSHL)
1686       .scalarize(0)
1687       .lower();
1688   }
1689 
1690   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1691     .legalFor({S64});
1692 
1693   getActionDefinitionsBuilder(G_FENCE)
1694     .alwaysLegal();
1695 
1696   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1697       .scalarize(0)
1698       .minScalar(0, S32)
1699       .lower();
1700 
1701   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1702       .legalFor({{S32, S32}, {S64, S32}})
1703       .clampScalar(1, S32, S32)
1704       .clampScalar(0, S32, S64)
1705       .widenScalarToNextPow2(0)
1706       .scalarize(0);
1707 
1708   getActionDefinitionsBuilder({
1709       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1710       G_FCOPYSIGN,
1711 
1712       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1713       G_ATOMICRMW_NAND,
1714       G_ATOMICRMW_FSUB,
1715       G_READ_REGISTER,
1716       G_WRITE_REGISTER,
1717 
1718       G_SADDO, G_SSUBO,
1719 
1720       // TODO: Implement
1721       G_FMINIMUM, G_FMAXIMUM}).lower();
1722 
1723   getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
1724       .lower();
1725 
1726   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1727         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1728         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1729     .unsupported();
1730 
1731   getLegacyLegalizerInfo().computeTables();
1732   verify(*ST.getInstrInfo());
1733 }
1734 
1735 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1736                                          MachineInstr &MI) const {
1737   MachineIRBuilder &B = Helper.MIRBuilder;
1738   MachineRegisterInfo &MRI = *B.getMRI();
1739 
1740   switch (MI.getOpcode()) {
1741   case TargetOpcode::G_ADDRSPACE_CAST:
1742     return legalizeAddrSpaceCast(MI, MRI, B);
1743   case TargetOpcode::G_FRINT:
1744     return legalizeFrint(MI, MRI, B);
1745   case TargetOpcode::G_FCEIL:
1746     return legalizeFceil(MI, MRI, B);
1747   case TargetOpcode::G_FREM:
1748     return legalizeFrem(MI, MRI, B);
1749   case TargetOpcode::G_INTRINSIC_TRUNC:
1750     return legalizeIntrinsicTrunc(MI, MRI, B);
1751   case TargetOpcode::G_SITOFP:
1752     return legalizeITOFP(MI, MRI, B, true);
1753   case TargetOpcode::G_UITOFP:
1754     return legalizeITOFP(MI, MRI, B, false);
1755   case TargetOpcode::G_FPTOSI:
1756     return legalizeFPTOI(MI, MRI, B, true);
1757   case TargetOpcode::G_FPTOUI:
1758     return legalizeFPTOI(MI, MRI, B, false);
1759   case TargetOpcode::G_FMINNUM:
1760   case TargetOpcode::G_FMAXNUM:
1761   case TargetOpcode::G_FMINNUM_IEEE:
1762   case TargetOpcode::G_FMAXNUM_IEEE:
1763     return legalizeMinNumMaxNum(Helper, MI);
1764   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1765     return legalizeExtractVectorElt(MI, MRI, B);
1766   case TargetOpcode::G_INSERT_VECTOR_ELT:
1767     return legalizeInsertVectorElt(MI, MRI, B);
1768   case TargetOpcode::G_SHUFFLE_VECTOR:
1769     return legalizeShuffleVector(MI, MRI, B);
1770   case TargetOpcode::G_FSIN:
1771   case TargetOpcode::G_FCOS:
1772     return legalizeSinCos(MI, MRI, B);
1773   case TargetOpcode::G_GLOBAL_VALUE:
1774     return legalizeGlobalValue(MI, MRI, B);
1775   case TargetOpcode::G_LOAD:
1776   case TargetOpcode::G_SEXTLOAD:
1777   case TargetOpcode::G_ZEXTLOAD:
1778     return legalizeLoad(Helper, MI);
1779   case TargetOpcode::G_FMAD:
1780     return legalizeFMad(MI, MRI, B);
1781   case TargetOpcode::G_FDIV:
1782     return legalizeFDIV(MI, MRI, B);
1783   case TargetOpcode::G_UDIV:
1784   case TargetOpcode::G_UREM:
1785   case TargetOpcode::G_UDIVREM:
1786     return legalizeUnsignedDIV_REM(MI, MRI, B);
1787   case TargetOpcode::G_SDIV:
1788   case TargetOpcode::G_SREM:
1789   case TargetOpcode::G_SDIVREM:
1790     return legalizeSignedDIV_REM(MI, MRI, B);
1791   case TargetOpcode::G_ATOMIC_CMPXCHG:
1792     return legalizeAtomicCmpXChg(MI, MRI, B);
1793   case TargetOpcode::G_FLOG:
1794     return legalizeFlog(MI, B, numbers::ln2f);
1795   case TargetOpcode::G_FLOG10:
1796     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1797   case TargetOpcode::G_FEXP:
1798     return legalizeFExp(MI, B);
1799   case TargetOpcode::G_FPOW:
1800     return legalizeFPow(MI, B);
1801   case TargetOpcode::G_FFLOOR:
1802     return legalizeFFloor(MI, MRI, B);
1803   case TargetOpcode::G_BUILD_VECTOR:
1804     return legalizeBuildVector(MI, MRI, B);
1805   case TargetOpcode::G_MUL:
1806     return legalizeMul(Helper, MI);
1807   case TargetOpcode::G_CTLZ:
1808   case TargetOpcode::G_CTTZ:
1809     return legalizeCTLZ_CTTZ(MI, MRI, B);
1810   case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
1811     return legalizeFPTruncRound(MI, B);
1812   default:
1813     return false;
1814   }
1815 
1816   llvm_unreachable("expected switch to return");
1817 }
1818 
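     /// Return a 32-bit register holding the aperture base (the high half of a
     /// flat address) for the given LDS or private address space, or an invalid
     /// register on failure.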
1819 Register AMDGPULegalizerInfo::getSegmentAperture(
1820   unsigned AS,
1821   MachineRegisterInfo &MRI,
1822   MachineIRBuilder &B) const {
1823   MachineFunction &MF = B.getMF();
1824   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1825   const LLT S32 = LLT::scalar(32);
1826 
1827   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1828 
1829   if (ST.hasApertureRegs()) {
1830     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1831     // getreg.
1832     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1833         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1834         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1835     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1836         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1837         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1838     unsigned Encoding =
1839         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1840         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1841         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1842 
1843     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1844 
1845     B.buildInstr(AMDGPU::S_GETREG_B32)
1846       .addDef(GetReg)
1847       .addImm(Encoding);
1848     MRI.setType(GetReg, S32);
1849 
1850     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1851     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1852   }
1853 
1854   // TODO: can we be smarter about machine pointer info?
1855   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1856   Register LoadAddr = MRI.createGenericVirtualRegister(
1857     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1858   // For code object version 5, private_base and shared_base are passed through
1859   // implicit kernargs.
1860   if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) {
1861     AMDGPUTargetLowering::ImplicitParameter Param =
1862         AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
1863                                       : AMDGPUTargetLowering::PRIVATE_BASE;
1864     uint64_t Offset =
1865         ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
1866 
1867     Register KernargPtrReg = MRI.createGenericVirtualRegister(
1868         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1869 
1870     if (!loadInputValue(KernargPtrReg, B,
1871                         AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
1872       return Register();
1873 
1874     MachineMemOperand *MMO = MF.getMachineMemOperand(
1875         PtrInfo,
1876         MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1877             MachineMemOperand::MOInvariant,
1878         LLT::scalar(32), commonAlignment(Align(64), Offset));
1879 
1880     // Pointer address
1881     B.buildPtrAdd(LoadAddr, KernargPtrReg,
1882                   B.buildConstant(LLT::scalar(64), Offset).getReg(0));
1883     // Load address
1884     return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1885   }
1886 
1887   Register QueuePtr = MRI.createGenericVirtualRegister(
1888     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1889 
1890   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1891     return Register();
1892 
1893   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1894   // private_segment_aperture_base_hi.
1895   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1896 
1897   MachineMemOperand *MMO = MF.getMachineMemOperand(
1898       PtrInfo,
1899       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1900           MachineMemOperand::MOInvariant,
1901       LLT::scalar(32), commonAlignment(Align(64), StructOffset));
1902 
1903   B.buildPtrAdd(LoadAddr, QueuePtr,
1904                 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
1905   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1906 }
1907 
1908 /// Return true if the value is a known valid address, such that a null check is
1909 /// not necessary.
1910 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
1911                            const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
1912   MachineInstr *Def = MRI.getVRegDef(Val);
1913   switch (Def->getOpcode()) {
1914   case AMDGPU::G_FRAME_INDEX:
1915   case AMDGPU::G_GLOBAL_VALUE:
1916   case AMDGPU::G_BLOCK_ADDR:
1917     return true;
1918   case AMDGPU::G_CONSTANT: {
1919     const ConstantInt *CI = Def->getOperand(1).getCImm();
1920     return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
1921   }
1922   default:
1923     return false;
1924   }
1925 
1926   return false;
1927 }
1928 
1929 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1930   MachineInstr &MI, MachineRegisterInfo &MRI,
1931   MachineIRBuilder &B) const {
1932   MachineFunction &MF = B.getMF();
1933 
1934   const LLT S32 = LLT::scalar(32);
1935   Register Dst = MI.getOperand(0).getReg();
1936   Register Src = MI.getOperand(1).getReg();
1937 
1938   LLT DstTy = MRI.getType(Dst);
1939   LLT SrcTy = MRI.getType(Src);
1940   unsigned DestAS = DstTy.getAddressSpace();
1941   unsigned SrcAS = SrcTy.getAddressSpace();
1942 
1943   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1944   // vector element.
1945   assert(!DstTy.isVector());
1946 
1947   const AMDGPUTargetMachine &TM
1948     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1949 
1950   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1951     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1952     return true;
1953   }
1954 
1955   if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
1956       (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1957        DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
1958     if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
1959       // Extract low 32-bits of the pointer.
1960       B.buildExtract(Dst, Src, 0);
1961       MI.eraseFromParent();
1962       return true;
1963     }
1964 
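         // Otherwise the flat null pointer must be mapped to the segment null
         // value, which is not 0 for the local and private address spaces, so
         // compare against flat null and select.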
1965     unsigned NullVal = TM.getNullPointerValue(DestAS);
1966 
1967     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1968     auto FlatNull = B.buildConstant(SrcTy, 0);
1969 
1970     // Extract low 32-bits of the pointer.
1971     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1972 
1973     auto CmpRes =
1974         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1975     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1976 
1977     MI.eraseFromParent();
1978     return true;
1979   }
1980 
1981   if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
1982       (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
1983        SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
1984     if (!ST.hasFlatAddressSpace())
1985       return false;
1986 
1987     Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1988     if (!ApertureReg.isValid())
1989       return false;
1990 
1991     // Coerce the type of the low half of the result so we can use merge_values.
1992     Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1993 
1994     // TODO: Should we allow mismatched types but matching sizes in merges to
1995     // avoid the ptrtoint?
1996     auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1997 
1998     if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
1999       B.buildCopy(Dst, BuildPtr);
2000       MI.eraseFromParent();
2001       return true;
2002     }
2003 
2004     auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
2005     auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
2006 
2007     auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
2008                               SegmentNull.getReg(0));
2009 
2010     B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2011 
2012     MI.eraseFromParent();
2013     return true;
2014   }
2015 
2016   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2017       SrcTy.getSizeInBits() == 64) {
2018     // Truncate.
2019     B.buildExtract(Dst, Src, 0);
2020     MI.eraseFromParent();
2021     return true;
2022   }
2023 
2024   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2025       DstTy.getSizeInBits() == 64) {
2026     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2027     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2028 
2029     // FIXME: This is a bit ugly due to merging 2 pointers into a pointer of a
2030     // different type. Merge operands are required to be the same type, but
2031     // creating an extra ptrtoint would be kind of pointless.
2032     auto HighAddr = B.buildConstant(
2033         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
2034     B.buildMerge(Dst, {Src, HighAddr});
2035     MI.eraseFromParent();
2036     return true;
2037   }
2038 
2039   DiagnosticInfoUnsupported InvalidAddrSpaceCast(
2040       MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
2041 
2042   LLVMContext &Ctx = MF.getFunction().getContext();
2043   Ctx.diagnose(InvalidAddrSpaceCast);
2044   B.buildUndef(Dst);
2045   MI.eraseFromParent();
2046   return true;
2047 }
2048 
2049 bool AMDGPULegalizerInfo::legalizeFrint(
2050   MachineInstr &MI, MachineRegisterInfo &MRI,
2051   MachineIRBuilder &B) const {
2052   Register Src = MI.getOperand(1).getReg();
2053   LLT Ty = MRI.getType(Src);
2054   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2055 
2056   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2057   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
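       // Adding copysign(2^52, Src) and then subtracting it again rounds Src to
       // the nearest integer, since the ulp of a double is exactly 1 at magnitude
       // 2^52 (assuming the default round-to-nearest mode). Inputs whose magnitude
       // exceeds 0x1.fffffffffffffp+51 are already integral, so the final select
       // returns them unchanged.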
2058 
2059   auto C1 = B.buildFConstant(Ty, C1Val);
2060   auto CopySign = B.buildFCopysign(Ty, C1, Src);
2061 
2062   // TODO: Should this propagate fast-math-flags?
2063   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2064   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2065 
2066   auto C2 = B.buildFConstant(Ty, C2Val);
2067   auto Fabs = B.buildFAbs(Ty, Src);
2068 
2069   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2070   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2071   MI.eraseFromParent();
2072   return true;
2073 }
2074 
2075 bool AMDGPULegalizerInfo::legalizeFceil(
2076   MachineInstr &MI, MachineRegisterInfo &MRI,
2077   MachineIRBuilder &B) const {
2078 
2079   const LLT S1 = LLT::scalar(1);
2080   const LLT S64 = LLT::scalar(64);
2081 
2082   Register Src = MI.getOperand(1).getReg();
2083   assert(MRI.getType(Src) == S64);
2084 
2085   // result = trunc(src)
2086   // if (src > 0.0 && src != result)
2087   //   result += 1.0
2088 
2089   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2090 
2091   const auto Zero = B.buildFConstant(S64, 0.0);
2092   const auto One = B.buildFConstant(S64, 1.0);
2093   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2094   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2095   auto And = B.buildAnd(S1, Lt0, NeTrunc);
2096   auto Add = B.buildSelect(S64, And, One, Zero);
2097 
2098   // TODO: Should this propagate fast-math-flags?
2099   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2100   MI.eraseFromParent();
2101   return true;
2102 }
2103 
2104 bool AMDGPULegalizerInfo::legalizeFrem(
2105   MachineInstr &MI, MachineRegisterInfo &MRI,
2106   MachineIRBuilder &B) const {
2107     Register DstReg = MI.getOperand(0).getReg();
2108     Register Src0Reg = MI.getOperand(1).getReg();
2109     Register Src1Reg = MI.getOperand(2).getReg();
2110     auto Flags = MI.getFlags();
2111     LLT Ty = MRI.getType(DstReg);
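         // frem(x, y) == x - trunc(x / y) * y, computed below as
         // fma(-trunc(x / y), y, x).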
2112 
2113     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2114     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2115     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2116     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2117     MI.eraseFromParent();
2118     return true;
2119 }
2120 
2121 static MachineInstrBuilder extractF64Exponent(Register Hi,
2122                                               MachineIRBuilder &B) {
2123   const unsigned FractBits = 52;
2124   const unsigned ExpBits = 11;
2125   LLT S32 = LLT::scalar(32);
2126 
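       // The exponent of an IEEE double occupies bits [62:52] of the value, i.e.
       // bits [30:20] of the high 32-bit word: a field at offset FractBits - 32
       // of width ExpBits. Extract it with ubfe and subtract the bias (1023) to
       // get the unbiased exponent.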
2127   auto Const0 = B.buildConstant(S32, FractBits - 32);
2128   auto Const1 = B.buildConstant(S32, ExpBits);
2129 
2130   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
2131     .addUse(Hi)
2132     .addUse(Const0.getReg(0))
2133     .addUse(Const1.getReg(0));
2134 
2135   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2136 }
2137 
2138 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2139   MachineInstr &MI, MachineRegisterInfo &MRI,
2140   MachineIRBuilder &B) const {
2141   const LLT S1 = LLT::scalar(1);
2142   const LLT S32 = LLT::scalar(32);
2143   const LLT S64 = LLT::scalar(64);
2144 
2145   Register Src = MI.getOperand(1).getReg();
2146   assert(MRI.getType(Src) == S64);
2147 
2148   // TODO: Should this use extract since the low half is unused?
2149   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2150   Register Hi = Unmerge.getReg(1);
2151 
2152   // Extract the upper half, since this is where we will find the sign and
2153   // exponent.
2154   auto Exp = extractF64Exponent(Hi, B);
2155 
2156   const unsigned FractBits = 52;
2157 
2158   // Extract the sign bit.
2159   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2160   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2161 
2162   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2163 
2164   const auto Zero32 = B.buildConstant(S32, 0);
2165 
2166   // Extend back to 64-bits.
2167   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2168 
2169   auto Shr = B.buildAShr(S64, FractMask, Exp);
2170   auto Not = B.buildNot(S64, Shr);
2171   auto Tmp0 = B.buildAnd(S64, Src, Not);
2172   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2173 
2174   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2175   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2176 
2177   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2178   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2179   MI.eraseFromParent();
2180   return true;
2181 }
2182 
2183 bool AMDGPULegalizerInfo::legalizeITOFP(
2184   MachineInstr &MI, MachineRegisterInfo &MRI,
2185   MachineIRBuilder &B, bool Signed) const {
2186 
2187   Register Dst = MI.getOperand(0).getReg();
2188   Register Src = MI.getOperand(1).getReg();
2189 
2190   const LLT S64 = LLT::scalar(64);
2191   const LLT S32 = LLT::scalar(32);
2192 
2193   assert(MRI.getType(Src) == S64);
2194 
2195   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2196   auto ThirtyTwo = B.buildConstant(S32, 32);
2197 
2198   if (MRI.getType(Dst) == S64) {
2199     auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2200                         : B.buildUITOFP(S64, Unmerge.getReg(1));
2201 
2202     auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2203     auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2204                      .addUse(CvtHi.getReg(0))
2205                      .addUse(ThirtyTwo.getReg(0));
2206 
2207     // TODO: Should this propagate fast-math-flags?
2208     B.buildFAdd(Dst, LdExp, CvtLo);
2209     MI.eraseFromParent();
2210     return true;
2211   }
2212 
2213   assert(MRI.getType(Dst) == S32);
2214 
2215   auto One = B.buildConstant(S32, 1);
2216 
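       // For a 32-bit result, first normalize the 64-bit source: shift it left so
       // the significant bits land in the high word (for signed inputs the shift
       // is capped so the sign bit is preserved), fold any remaining low bits into
       // bit 0 of the high word as a sticky bit for correct rounding, convert that
       // 32-bit value, and rescale the result with ldexp by (32 - shift).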
2217   MachineInstrBuilder ShAmt;
2218   if (Signed) {
2219     auto ThirtyOne = B.buildConstant(S32, 31);
2220     auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2221     auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2222     auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2223     auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
2224                                /*HasSideEffects=*/false)
2225                   .addUse(Unmerge.getReg(1));
2226     auto LS2 = B.buildSub(S32, LS, One);
2227     ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2228   } else
2229     ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2230   auto Norm = B.buildShl(S64, Src, ShAmt);
2231   auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2232   auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2233   auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2234   auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2235   auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2236   B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
2237                    /*HasSideEffects=*/false)
2238       .addUse(FVal.getReg(0))
2239       .addUse(Scale.getReg(0));
2240   MI.eraseFromParent();
2241   return true;
2242 }
2243 
2244 // TODO: Copied from DAG implementation. Verify logic and document how this
2245 // actually works.
2246 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2247                                         MachineRegisterInfo &MRI,
2248                                         MachineIRBuilder &B,
2249                                         bool Signed) const {
2250 
2251   Register Dst = MI.getOperand(0).getReg();
2252   Register Src = MI.getOperand(1).getReg();
2253 
2254   const LLT S64 = LLT::scalar(64);
2255   const LLT S32 = LLT::scalar(32);
2256 
2257   const LLT SrcLT = MRI.getType(Src);
2258   assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2259 
2260   unsigned Flags = MI.getFlags();
2261 
2262   // The basic idea of converting a floating point number into a pair of 32-bit
2263   // integers is illustrated as follows:
2264   //
2265   //     tf := trunc(val);
2266   //    hif := floor(tf * 2^-32);
2267   //    lof := tf - hif * 2^32; // lof is always positive due to floor.
2268   //     hi := fptoi(hif);
2269   //     lo := fptoi(lof);
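       //
       // For example, with val = 6442450944.0 (1.5 * 2^32): tf = 6442450944,
       // hif = floor(1.5) = 1, lof = 6442450944 - 1 * 2^32 = 2147483648, so
       // hi = 0x00000001 and lo = 0x80000000, which merge back to the original
       // value.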
2270   //
2271   auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2272   MachineInstrBuilder Sign;
2273   if (Signed && SrcLT == S32) {
2274     // However, a 32-bit floating point number has only a 23-bit mantissa, which
2275     // is not enough to hold all the significant bits of `lof` if val is
2276     // negative. To avoid the loss of precision, we need to take the absolute
2277     // value after truncating and flip the result back based on the original
2278     // signedness.
2279     Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2280     Trunc = B.buildFAbs(S32, Trunc, Flags);
2281   }
2282   MachineInstrBuilder K0, K1;
2283   if (SrcLT == S64) {
2284     K0 = B.buildFConstant(S64,
2285                           BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2286     K1 = B.buildFConstant(S64,
2287                           BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2288   } else {
2289     K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
2290     K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
2291   }
2292 
2293   auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2294   auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2295   auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2296 
2297   auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2298                                      : B.buildFPTOUI(S32, FloorMul);
2299   auto Lo = B.buildFPTOUI(S32, Fma);
2300 
2301   if (Signed && SrcLT == S32) {
2302     // Flip the result based on the signedness, which is either all 0s or 1s.
2303     Sign = B.buildMerge(S64, {Sign, Sign});
2304     // r := xor({lo, hi}, sign) - sign;
2305     B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
2306   } else
2307     B.buildMerge(Dst, {Lo, Hi});
2308   MI.eraseFromParent();
2309 
2310   return true;
2311 }
2312 
2313 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2314                                                MachineInstr &MI) const {
2315   MachineFunction &MF = Helper.MIRBuilder.getMF();
2316   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2317 
2318   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2319                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2320 
2321   // With ieee_mode disabled, the instructions have the correct behavior
2322   // already for G_FMINNUM/G_FMAXNUM
2323   if (!MFI->getMode().IEEE)
2324     return !IsIEEEOp;
2325 
2326   if (IsIEEEOp)
2327     return true;
2328 
2329   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2330 }
2331 
2332 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2333   MachineInstr &MI, MachineRegisterInfo &MRI,
2334   MachineIRBuilder &B) const {
2335   // TODO: Should move some of this into LegalizerHelper.
2336 
2337   // TODO: Promote dynamic indexing of s16 to s32
2338 
2339   // FIXME: Artifact combiner probably should have replaced the truncated
2340   // constant before this, so we shouldn't need
2341   // getIConstantVRegValWithLookThrough.
2342   Optional<ValueAndVReg> MaybeIdxVal =
2343       getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2344   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2345     return true;
2346   const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2347 
2348   Register Dst = MI.getOperand(0).getReg();
2349   Register Vec = MI.getOperand(1).getReg();
2350 
2351   LLT VecTy = MRI.getType(Vec);
2352   LLT EltTy = VecTy.getElementType();
2353   assert(EltTy == MRI.getType(Dst));
2354 
2355   if (IdxVal < VecTy.getNumElements()) {
2356     auto Unmerge = B.buildUnmerge(EltTy, Vec);
2357     B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2358   } else {
2359     B.buildUndef(Dst);
2360   }
2361 
2362   MI.eraseFromParent();
2363   return true;
2364 }
2365 
2366 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2367   MachineInstr &MI, MachineRegisterInfo &MRI,
2368   MachineIRBuilder &B) const {
2369   // TODO: Should move some of this into LegalizerHelper.
2370 
2371   // TODO: Promote dynamic indexing of s16 to s32
2372 
2373   // FIXME: Artifact combiner probably should have replaced the truncated
2374   // constant before this, so we shouldn't need
2375   // getIConstantVRegValWithLookThrough.
2376   Optional<ValueAndVReg> MaybeIdxVal =
2377       getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2378   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2379     return true;
2380 
2381   int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2382   Register Dst = MI.getOperand(0).getReg();
2383   Register Vec = MI.getOperand(1).getReg();
2384   Register Ins = MI.getOperand(2).getReg();
2385 
2386   LLT VecTy = MRI.getType(Vec);
2387   LLT EltTy = VecTy.getElementType();
2388   assert(EltTy == MRI.getType(Ins));
2389   (void)Ins;
2390 
2391   unsigned NumElts = VecTy.getNumElements();
2392   if (IdxVal < NumElts) {
2393     SmallVector<Register, 8> SrcRegs;
2394     for (unsigned i = 0; i < NumElts; ++i)
2395       SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2396     B.buildUnmerge(SrcRegs, Vec);
2397 
2398     SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2399     B.buildMerge(Dst, SrcRegs);
2400   } else {
2401     B.buildUndef(Dst);
2402   }
2403 
2404   MI.eraseFromParent();
2405   return true;
2406 }
2407 
2408 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2409   MachineInstr &MI, MachineRegisterInfo &MRI,
2410   MachineIRBuilder &B) const {
2411   const LLT V2S16 = LLT::fixed_vector(2, 16);
2412 
2413   Register Dst = MI.getOperand(0).getReg();
2414   Register Src0 = MI.getOperand(1).getReg();
2415   LLT DstTy = MRI.getType(Dst);
2416   LLT SrcTy = MRI.getType(Src0);
2417 
2418   if (SrcTy == V2S16 && DstTy == V2S16 &&
2419       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2420     return true;
2421 
2422   MachineIRBuilder HelperBuilder(MI);
2423   GISelObserverWrapper DummyObserver;
2424   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2425   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2426 }
2427 
2428 bool AMDGPULegalizerInfo::legalizeSinCos(
2429   MachineInstr &MI, MachineRegisterInfo &MRI,
2430   MachineIRBuilder &B) const {
2431 
2432   Register DstReg = MI.getOperand(0).getReg();
2433   Register SrcReg = MI.getOperand(1).getReg();
2434   LLT Ty = MRI.getType(DstReg);
2435   unsigned Flags = MI.getFlags();
2436 
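       // The hardware sin/cos take their input in units of full turns, so scale
       // the argument by 1 / (2 * pi) first. Subtargets with a reduced valid
       // input range additionally wrap the scaled argument into [0, 1) with fract.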
2437   Register TrigVal;
2438   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2439   if (ST.hasTrigReducedRange()) {
2440     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2441     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2442       .addUse(MulVal.getReg(0))
2443       .setMIFlags(Flags).getReg(0);
2444   } else
2445     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2446 
2447   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2448     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2449   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2450     .addUse(TrigVal)
2451     .setMIFlags(Flags);
2452   MI.eraseFromParent();
2453   return true;
2454 }
2455 
2456 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2457                                                   MachineIRBuilder &B,
2458                                                   const GlobalValue *GV,
2459                                                   int64_t Offset,
2460                                                   unsigned GAFlags) const {
2461   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2462   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2463   // to the following code sequence:
2464   //
2465   // For constant address space:
2466   //   s_getpc_b64 s[0:1]
2467   //   s_add_u32 s0, s0, $symbol
2468   //   s_addc_u32 s1, s1, 0
2469   //
2470   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2471   //   a fixup or relocation is emitted to replace $symbol with a literal
2472   //   constant, which is a pc-relative offset from the encoding of the $symbol
2473   //   operand to the global variable.
2474   //
2475   // For global address space:
2476   //   s_getpc_b64 s[0:1]
2477   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2478   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2479   //
2480   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2481   //   fixups or relocations are emitted to replace $symbol@*@lo and
2482   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2483   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2484   //   operand to the global variable.
2485   //
2486   // What we want here is an offset from the value returned by s_getpc
2487   // (which is the address of the s_add_u32 instruction) to the global
2488   // variable, but since the encoding of $symbol starts 4 bytes after the start
2489   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2490   // small. This requires us to add 4 to the global variable offset in order to
2491   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2492   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2493   // instruction.
2494 
2495   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2496 
2497   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2498     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2499 
2500   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2501     .addDef(PCReg);
2502 
2503   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2504   if (GAFlags == SIInstrInfo::MO_NONE)
2505     MIB.addImm(0);
2506   else
2507     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2508 
2509   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2510 
2511   if (PtrTy.getSizeInBits() == 32)
2512     B.buildExtract(DstReg, PCReg, 0);
2513   return true;
2514 }
2515 
2516 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2517   MachineInstr &MI, MachineRegisterInfo &MRI,
2518   MachineIRBuilder &B) const {
2519   Register DstReg = MI.getOperand(0).getReg();
2520   LLT Ty = MRI.getType(DstReg);
2521   unsigned AS = Ty.getAddressSpace();
2522 
2523   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2524   MachineFunction &MF = B.getMF();
2525   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2526 
2527   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2528     if (!MFI->isModuleEntryFunction() &&
2529         !GV->getName().equals("llvm.amdgcn.module.lds")) {
2530       const Function &Fn = MF.getFunction();
2531       DiagnosticInfoUnsupported BadLDSDecl(
2532         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2533         DS_Warning);
2534       Fn.getContext().diagnose(BadLDSDecl);
2535 
2536       // We currently don't have a way to correctly allocate LDS objects that
2537       // aren't directly associated with a kernel. We do force inlining of
2538       // functions that use local objects. However, if these dead functions are
2539       // not eliminated, we don't want a compile time error. Just emit a warning
2540       // and a trap, since there should be no callable path here.
2541       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2542       B.buildUndef(DstReg);
2543       MI.eraseFromParent();
2544       return true;
2545     }
2546 
2547     // TODO: We could emit code to handle the initialization somewhere.
2548     // We ignore the initializer for now and legalize it to allow selection.
2549     // The initializer will anyway get errored out during assembly emission.
2550     const SITargetLowering *TLI = ST.getTargetLowering();
2551     if (!TLI->shouldUseLDSConstAddress(GV)) {
2552       MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2553       return true; // Leave in place.
2554     }
2555 
2556     if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2557       Type *Ty = GV->getValueType();
2558       // HIP uses an unsized array `extern __shared__ T s[]` or a similar
2559       // zero-sized type in other languages to declare dynamic shared memory
2560       // whose size is not known at compile time. It is allocated by the runtime
2561       // and placed directly after the statically allocated objects. All such
2562       // declarations share the same offset.
2563       if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2564         // Adjust alignment for that dynamic shared memory array.
2565         MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2566         LLT S32 = LLT::scalar(32);
2567         auto Sz =
2568             B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2569         B.buildIntToPtr(DstReg, Sz);
2570         MI.eraseFromParent();
2571         return true;
2572       }
2573     }
2574 
2575     B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
2576                                                    *cast<GlobalVariable>(GV)));
2577     MI.eraseFromParent();
2578     return true;
2579   }
2580 
2581   const SITargetLowering *TLI = ST.getTargetLowering();
2582 
2583   if (TLI->shouldEmitFixup(GV)) {
2584     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2585     MI.eraseFromParent();
2586     return true;
2587   }
2588 
2589   if (TLI->shouldEmitPCReloc(GV)) {
2590     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2591     MI.eraseFromParent();
2592     return true;
2593   }
2594 
2595   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2596   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2597 
2598   LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
2599   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2600       MachinePointerInfo::getGOT(MF),
2601       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2602           MachineMemOperand::MOInvariant,
2603       LoadTy, Align(8));
2604 
2605   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2606 
2607   if (Ty.getSizeInBits() == 32) {
2608     // Truncate if this is a 32-bit constant address.
2609     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2610     B.buildExtract(DstReg, Load, 0);
2611   } else
2612     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2613 
2614   MI.eraseFromParent();
2615   return true;
2616 }
2617 
2618 static LLT widenToNextPowerOf2(LLT Ty) {
2619   if (Ty.isVector())
2620     return Ty.changeElementCount(
2621         ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2622   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2623 }
2624 
2625 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2626                                        MachineInstr &MI) const {
2627   MachineIRBuilder &B = Helper.MIRBuilder;
2628   MachineRegisterInfo &MRI = *B.getMRI();
2629   GISelChangeObserver &Observer = Helper.Observer;
2630 
2631   Register PtrReg = MI.getOperand(1).getReg();
2632   LLT PtrTy = MRI.getType(PtrReg);
2633   unsigned AddrSpace = PtrTy.getAddressSpace();
2634 
2635   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2636     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2637     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2638     Observer.changingInstr(MI);
2639     MI.getOperand(1).setReg(Cast.getReg(0));
2640     Observer.changedInstr(MI);
2641     return true;
2642   }
2643 
2644   if (MI.getOpcode() != AMDGPU::G_LOAD)
2645     return false;
2646 
2647   Register ValReg = MI.getOperand(0).getReg();
2648   LLT ValTy = MRI.getType(ValReg);
2649 
2650   MachineMemOperand *MMO = *MI.memoperands_begin();
2651   const unsigned ValSize = ValTy.getSizeInBits();
2652   const LLT MemTy = MMO->getMemoryType();
2653   const Align MemAlign = MMO->getAlign();
2654   const unsigned MemSize = MemTy.getSizeInBits();
2655   const uint64_t AlignInBits = 8 * MemAlign.value();
2656 
2657   // Widen non-power-of-2 loads to the alignment if needed
2658   if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
2659     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2660 
2661     // This was already the correct extending load result type, so just adjust
2662     // the memory type.
2663     if (WideMemSize == ValSize) {
2664       MachineFunction &MF = B.getMF();
2665 
2666       MachineMemOperand *WideMMO =
2667           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2668       Observer.changingInstr(MI);
2669       MI.setMemRefs(MF, {WideMMO});
2670       Observer.changedInstr(MI);
2671       return true;
2672     }
2673 
2674     // Don't bother handling an edge case that should probably never be produced.
2675     if (ValSize > WideMemSize)
2676       return false;
2677 
2678     LLT WideTy = widenToNextPowerOf2(ValTy);
2679 
2680     Register WideLoad;
2681     if (!WideTy.isVector()) {
2682       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2683       B.buildTrunc(ValReg, WideLoad).getReg(0);
2684     } else {
2685       // Extract the subvector.
2686 
2687       if (isRegisterType(ValTy)) {
2688         // If this is a case where G_EXTRACT is legal, use it.
2689         // (e.g. <3 x s32> -> <4 x s32>)
2690         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2691         B.buildExtract(ValReg, WideLoad, 0);
2692       } else {
2693         // For cases where the widened type isn't a nice register value, unmerge
2694         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2695         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2696         B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
2697       }
2698     }
2699 
2700     MI.eraseFromParent();
2701     return true;
2702   }
2703 
2704   return false;
2705 }
2706 
2707 bool AMDGPULegalizerInfo::legalizeFMad(
2708   MachineInstr &MI, MachineRegisterInfo &MRI,
2709   MachineIRBuilder &B) const {
2710   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2711   assert(Ty.isScalar());
2712 
2713   MachineFunction &MF = B.getMF();
2714   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2715 
2716   // TODO: Always legal with future ftz flag.
2717   // FIXME: Do we need just output?
2718   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2719     return true;
2720   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2721     return true;
2722 
2723   MachineIRBuilder HelperBuilder(MI);
2724   GISelObserverWrapper DummyObserver;
2725   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2726   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2727 }
2728 
2729 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2730   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2731   Register DstReg = MI.getOperand(0).getReg();
2732   Register PtrReg = MI.getOperand(1).getReg();
2733   Register CmpVal = MI.getOperand(2).getReg();
2734   Register NewVal = MI.getOperand(3).getReg();
2735 
2736   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2737          "this should not have been custom lowered");
2738 
2739   LLT ValTy = MRI.getType(CmpVal);
2740   LLT VecTy = LLT::fixed_vector(2, ValTy);
2741 
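       // The target pseudo expects the new value and the compare value packed
       // into a single vector operand, with the new value in element 0.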
2742   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2743 
2744   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2745     .addDef(DstReg)
2746     .addUse(PtrReg)
2747     .addUse(PackedVal)
2748     .setMemRefs(MI.memoperands());
2749 
2750   MI.eraseFromParent();
2751   return true;
2752 }
2753 
2754 bool AMDGPULegalizerInfo::legalizeFlog(
2755   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2756   Register Dst = MI.getOperand(0).getReg();
2757   Register Src = MI.getOperand(1).getReg();
2758   LLT Ty = B.getMRI()->getType(Dst);
2759   unsigned Flags = MI.getFlags();
2760 
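       // log_b(x) = log2(x) * (1 / log2(b)). The callers pass ln(2) for G_FLOG
       // and ln(2) / ln(10) for G_FLOG10 as Log2BaseInverted.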
2761   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2762   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2763 
2764   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2765   MI.eraseFromParent();
2766   return true;
2767 }
2768 
2769 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2770                                        MachineIRBuilder &B) const {
2771   Register Dst = MI.getOperand(0).getReg();
2772   Register Src = MI.getOperand(1).getReg();
2773   unsigned Flags = MI.getFlags();
2774   LLT Ty = B.getMRI()->getType(Dst);
2775 
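       // exp(x) = exp2(x * log2(e)).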
2776   auto K = B.buildFConstant(Ty, numbers::log2e);
2777   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2778   B.buildFExp2(Dst, Mul, Flags);
2779   MI.eraseFromParent();
2780   return true;
2781 }
2782 
2783 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2784                                        MachineIRBuilder &B) const {
2785   Register Dst = MI.getOperand(0).getReg();
2786   Register Src0 = MI.getOperand(1).getReg();
2787   Register Src1 = MI.getOperand(2).getReg();
2788   unsigned Flags = MI.getFlags();
2789   LLT Ty = B.getMRI()->getType(Dst);
2790   const LLT S16 = LLT::scalar(16);
2791   const LLT S32 = LLT::scalar(32);
2792 
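       // pow(x, y) = exp2(y * log2(x)). The multiply uses the fmul_legacy
       // intrinsic, which returns 0 for 0 * anything (including infinity).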
2793   if (Ty == S32) {
2794     auto Log = B.buildFLog2(S32, Src0, Flags);
2795     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2796       .addUse(Log.getReg(0))
2797       .addUse(Src1)
2798       .setMIFlags(Flags);
2799     B.buildFExp2(Dst, Mul, Flags);
2800   } else if (Ty == S16) {
2801     // There's no f16 fmul_legacy, so we need to convert for it.
2802     auto Log = B.buildFLog2(S16, Src0, Flags);
2803     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2804     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2805     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2806       .addUse(Ext0.getReg(0))
2807       .addUse(Ext1.getReg(0))
2808       .setMIFlags(Flags);
2809 
2810     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2811   } else
2812     return false;
2813 
2814   MI.eraseFromParent();
2815   return true;
2816 }
2817 
2818 // Find a source register, ignoring any possible source modifiers.
2819 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2820   Register ModSrc = OrigSrc;
2821   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2822     ModSrc = SrcFNeg->getOperand(1).getReg();
2823     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2824       ModSrc = SrcFAbs->getOperand(1).getReg();
2825   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2826     ModSrc = SrcFAbs->getOperand(1).getReg();
2827   return ModSrc;
2828 }
2829 
2830 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2831                                          MachineRegisterInfo &MRI,
2832                                          MachineIRBuilder &B) const {
2833 
2834   const LLT S1 = LLT::scalar(1);
2835   const LLT S64 = LLT::scalar(64);
2836   Register Dst = MI.getOperand(0).getReg();
2837   Register OrigSrc = MI.getOperand(1).getReg();
2838   unsigned Flags = MI.getFlags();
2839   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2840          "this should not have been custom lowered");
2841 
2842   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2843   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2844   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2845   // V_FRACT bug is:
2846   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2847   //
2848   // Convert floor(x) to (x - fract(x))
2849 
2850   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2851     .addUse(OrigSrc)
2852     .setMIFlags(Flags);
2853 
2854   // Give source modifier matching some assistance before obscuring a foldable
2855   // pattern.
2856 
2857   // TODO: Can we avoid the neg on the fract? The input sign to fract
2858   // shouldn't matter.
2859   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2860 
2861   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2862 
2863   Register Min = MRI.createGenericVirtualRegister(S64);
2864 
2865   // We don't need to concern ourselves with the snan handling difference, so
2866   // use the one which will directly select.
2867   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2868   if (MFI->getMode().IEEE)
2869     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2870   else
2871     B.buildFMinNum(Min, Fract, Const, Flags);
2872 
2873   Register CorrectedFract = Min;
2874   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2875     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2876     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2877   }
2878 
2879   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2880   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2881 
2882   MI.eraseFromParent();
2883   return true;
2884 }
2885 
2886 // Turn an illegal packed v2s16 build vector into bit operations.
2887 // TODO: This should probably be a bitcast action in LegalizerHelper.
2888 bool AMDGPULegalizerInfo::legalizeBuildVector(
2889   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2890   Register Dst = MI.getOperand(0).getReg();
2891   const LLT S32 = LLT::scalar(32);
2892   assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
2893 
2894   Register Src0 = MI.getOperand(1).getReg();
2895   Register Src1 = MI.getOperand(2).getReg();
2896   assert(MRI.getType(Src0) == LLT::scalar(16));
2897 
2898   auto Merge = B.buildMerge(S32, {Src0, Src1});
2899   B.buildBitcast(Dst, Merge);
2900 
2901   MI.eraseFromParent();
2902   return true;
2903 }
2904 
2905 // Build a big integer multiply or multiply-add using MAD_64_32 instructions.
2906 //
2907 // Source and accumulation registers must all be 32-bits.
2908 //
2909 // TODO: When the multiply is uniform, we should produce a code sequence
2910 // that is better suited to instruction selection on the SALU. Instead of
2911 // the outer loop going over parts of the result, the outer loop should go
2912 // over parts of one of the factors. This should result in instruction
2913 // selection that makes full use of S_ADDC_U32 instructions.
2914 void AMDGPULegalizerInfo::buildMultiply(
2915     LegalizerHelper &Helper, MutableArrayRef<Register> Accum,
2916     ArrayRef<Register> Src0, ArrayRef<Register> Src1,
2917     bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const {
2918   // Use (possibly empty) vectors of S1 registers to represent the set of
2919   // carries from one pair of positions to the next.
2920   using Carry = SmallVector<Register, 2>;
2921 
2922   MachineIRBuilder &B = Helper.MIRBuilder;
2923 
2924   const LLT S1 = LLT::scalar(1);
2925   const LLT S32 = LLT::scalar(32);
2926   const LLT S64 = LLT::scalar(64);
2927 
2928   Register Zero32;
2929   Register Zero64;
2930 
2931   auto getZero32 = [&]() -> Register {
2932     if (!Zero32)
2933       Zero32 = B.buildConstant(S32, 0).getReg(0);
2934     return Zero32;
2935   };
2936   auto getZero64 = [&]() -> Register {
2937     if (!Zero64)
2938       Zero64 = B.buildConstant(S64, 0).getReg(0);
2939     return Zero64;
2940   };
2941 
2942   // Merge the given carries into the 32-bit LocalAccum, which is modified
2943   // in-place.
2944   //
2945   // Returns the carry-out, which is a single S1 register or null.
2946   auto mergeCarry =
2947       [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
2948         if (CarryIn.empty())
2949           return Register();
2950 
2951         bool HaveCarryOut = true;
2952         Register CarryAccum;
2953         if (CarryIn.size() == 1) {
2954           if (!LocalAccum) {
2955             LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
2956             return Register();
2957           }
2958 
2959           CarryAccum = getZero32();
2960         } else {
2961           CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
2962           for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
2963             CarryAccum =
2964                 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
2965                     .getReg(0);
2966           }
2967 
2968           if (!LocalAccum) {
2969             LocalAccum = getZero32();
2970             HaveCarryOut = false;
2971           }
2972         }
2973 
2974         auto Add =
2975             B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
2976         LocalAccum = Add.getReg(0);
2977         return HaveCarryOut ? Add.getReg(1) : Register();
2978       };
2979 
2980   // Build a multiply-add chain to compute
2981   //
2982   //   LocalAccum + (partial products at DstIndex)
2983   //       + (opportunistic subset of CarryIn)
2984   //
2985   // LocalAccum is an array of one or two 32-bit registers that are updated
2986   // in-place. The incoming registers may be null.
2987   //
2988   // In some edge cases, carry-ins can be consumed "for free". In that case,
2989   // the consumed carry bits are removed from CarryIn in-place.
2990   auto buildMadChain =
2991       [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
2992           -> Carry {
2993         assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
2994                (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
2995 
2996         Carry CarryOut;
2997         unsigned j0 = 0;
2998 
2999         // Use plain 32-bit multiplication for the most significant part of the
3000         // result by default.
3001         if (LocalAccum.size() == 1 &&
3002             (!UsePartialMad64_32 || !CarryIn.empty())) {
3003           do {
3004             unsigned j1 = DstIndex - j0;
3005             auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
3006             if (!LocalAccum[0]) {
3007               LocalAccum[0] = Mul.getReg(0);
3008             } else {
3009               if (CarryIn.empty()) {
3010                 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
3011               } else {
3012                 LocalAccum[0] =
3013                     B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
3014                         .getReg(0);
3015                 CarryIn.pop_back();
3016               }
3017             }
3018             ++j0;
3019           } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
3020         }
3021 
3022         // Build full 64-bit multiplies.
3023         if (j0 <= DstIndex) {
3024           bool HaveSmallAccum = false;
3025           Register Tmp;
3026 
3027           if (LocalAccum[0]) {
3028             if (LocalAccum.size() == 1) {
3029               Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
3030               HaveSmallAccum = true;
3031             } else if (LocalAccum[1]) {
3032               Tmp = B.buildMerge(S64, LocalAccum).getReg(0);
3033               HaveSmallAccum = false;
3034             } else {
3035               Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
3036               HaveSmallAccum = true;
3037             }
3038           } else {
3039             assert(LocalAccum.size() == 1 || !LocalAccum[1]);
3040             Tmp = getZero64();
3041             HaveSmallAccum = true;
3042           }
3043 
3044           do {
3045             unsigned j1 = DstIndex - j0;
3046             auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
3047                                     {Src0[j0], Src1[j1], Tmp});
3048             Tmp = Mad.getReg(0);
3049             if (!HaveSmallAccum)
3050               CarryOut.push_back(Mad.getReg(1));
3051             HaveSmallAccum = false;
3052             ++j0;
3053           } while (j0 <= DstIndex);
3054 
3055           auto Unmerge = B.buildUnmerge(S32, Tmp);
3056           LocalAccum[0] = Unmerge.getReg(0);
3057           if (LocalAccum.size() > 1)
3058             LocalAccum[1] = Unmerge.getReg(1);
3059         }
3060 
3061         return CarryOut;
3062       };
3063 
3064   // Outer multiply loop, iterating over destination parts from least
3065   // significant to most significant parts.
3066   //
3067   // The columns of the following diagram correspond to the destination parts
3068   // affected by one iteration of the outer loop (ignoring boundary
3069   // conditions).
3070   //
3071   //   Dest index relative to 2 * i:      1 0 -1
3072   //                                      ------
3073   //   Carries from previous iteration:     e o
3074   //   Even-aligned partial product sum:  E E .
3075   //   Odd-aligned partial product sum:     O O
3076   //
3077   // 'o' is OddCarry, 'e' is EvenCarry.
3078   // EE and OO are computed from partial products via buildMadChain and use
3079   // accumulation where possible and appropriate.
3080   //
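  // Added illustration (a small sketch): for a 64-bit multiply split into two
  // 32-bit parts a = {a0, a1} and b = {b0, b1}, the product a0*b0 contributes
  // its full 64 bits to indices 0 and 1, while a0*b1 and a1*b0 contribute only
  // their low halves to index 1; buildMadChain pairs the factors so that
  // j0 + j1 == DstIndex.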
3081   Register SeparateOddCarry;
3082   Carry EvenCarry;
3083   Carry OddCarry;
3084 
3085   for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
3086     Carry OddCarryIn = std::move(OddCarry);
3087     Carry EvenCarryIn = std::move(EvenCarry);
3088     OddCarry.clear();
3089     EvenCarry.clear();
3090 
3091     // Partial products at offset 2 * i.
3092     if (2 * i < Accum.size()) {
3093       auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
3094       EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
3095     }
3096 
3097     // Partial products at offset 2 * i - 1.
3098     if (i > 0) {
3099       if (!SeparateOddAlignedProducts) {
3100         auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
3101         OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
3102       } else {
3103         bool IsHighest = 2 * i >= Accum.size();
3104         Register SeparateOddOut[2];
3105         auto LocalAccum = makeMutableArrayRef(SeparateOddOut)
3106                               .take_front(IsHighest ? 1 : 2);
3107         OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
3108 
3109         MachineInstr *Lo;
3110 
3111         if (i == 1) {
3112           if (!IsHighest)
3113             Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
3114           else
3115             Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
3116         } else {
3117           Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
3118                             SeparateOddCarry);
3119         }
3120         Accum[2 * i - 1] = Lo->getOperand(0).getReg();
3121 
3122         if (!IsHighest) {
3123           auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
3124                                 Lo->getOperand(1).getReg());
3125           Accum[2 * i] = Hi.getReg(0);
3126           SeparateOddCarry = Hi.getReg(1);
3127         }
3128       }
3129     }
3130 
3131     // Add in the carries from the previous iteration
3132     if (i > 0) {
3133       if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
3134         EvenCarryIn.push_back(CarryOut);
3135 
3136       if (2 * i < Accum.size()) {
3137         if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
3138           OddCarry.push_back(CarryOut);
3139       }
3140     }
3141   }
3142 }
3143 
3144 // Custom narrowing of wide multiplies using wide multiply-add instructions.
3145 //
3146 // TODO: If the multiply is followed by an addition, we should attempt to
3147 // integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
3148 bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
3149                                       MachineInstr &MI) const {
3150   assert(ST.hasMad64_32());
3151   assert(MI.getOpcode() == TargetOpcode::G_MUL);
3152 
3153   MachineIRBuilder &B = Helper.MIRBuilder;
3154   MachineRegisterInfo &MRI = *B.getMRI();
3155 
3156   Register DstReg = MI.getOperand(0).getReg();
3157   Register Src0 = MI.getOperand(1).getReg();
3158   Register Src1 = MI.getOperand(2).getReg();
3159 
3160   LLT Ty = MRI.getType(DstReg);
3161   assert(Ty.isScalar());
3162 
3163   unsigned Size = Ty.getSizeInBits();
3164   unsigned NumParts = Size / 32;
3165   assert((Size % 32) == 0);
3166   assert(NumParts >= 2);
3167 
3168   // Whether to use MAD_64_32 for partial products whose high half is
3169   // discarded. This avoids some ADD instructions but risks false dependency
3170   // stalls on some subtargets in some cases.
3171   const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
3172 
3173   // Whether to compute odd-aligned partial products separately. This is
3174   // advisable on subtargets where the accumulator of MAD_64_32 must be placed
3175   // in an even-aligned VGPR.
3176   const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
3177 
3178   LLT S32 = LLT::scalar(32);
3179   SmallVector<Register, 2> Src0Parts, Src1Parts;
3180   for (unsigned i = 0; i < NumParts; ++i) {
3181     Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
3182     Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
3183   }
3184   B.buildUnmerge(Src0Parts, Src0);
3185   B.buildUnmerge(Src1Parts, Src1);
3186 
3187   SmallVector<Register, 2> AccumRegs(NumParts);
3188   buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
3189                 SeparateOddAlignedProducts);
3190 
3191   B.buildMerge(DstReg, AccumRegs);
3192   MI.eraseFromParent();
3193   return true;
3194 
3195 }
3196 
3197 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
3198 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
3199 // case with a single min instruction instead of a compare+select.
3200 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
3201                                             MachineRegisterInfo &MRI,
3202                                             MachineIRBuilder &B) const {
3203   Register Dst = MI.getOperand(0).getReg();
3204   Register Src = MI.getOperand(1).getReg();
3205   LLT DstTy = MRI.getType(Dst);
3206   LLT SrcTy = MRI.getType(Src);
3207 
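  // Added note (hedged): G_AMDGPU_FFBH_U32 / G_AMDGPU_FFBL_B32 are expected to
  // return -1 (all ones) for a zero input, so the unsigned min against the bit
  // width below folds the zero case, e.g. ctlz(0:s32) = umin(0xffffffff, 32) = 32.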
3208   unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
3209                         ? AMDGPU::G_AMDGPU_FFBH_U32
3210                         : AMDGPU::G_AMDGPU_FFBL_B32;
3211   auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
3212   B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
3213 
3214   MI.eraseFromParent();
3215   return true;
3216 }
3217 
3218 // Check that this is a G_XOR x, -1
3219 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
3220   if (MI.getOpcode() != TargetOpcode::G_XOR)
3221     return false;
3222   auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
3223   return ConstVal && *ConstVal == -1;
3224 }
3225 
3226 // Return the use branch instruction, or null if the usage is invalid.
3227 static MachineInstr *
3228 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
3229                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
3230   Register CondDef = MI.getOperand(0).getReg();
3231   if (!MRI.hasOneNonDBGUse(CondDef))
3232     return nullptr;
3233 
3234   MachineBasicBlock *Parent = MI.getParent();
3235   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
3236 
3237   if (isNot(MRI, *UseMI)) {
3238     Register NegatedCond = UseMI->getOperand(0).getReg();
3239     if (!MRI.hasOneNonDBGUse(NegatedCond))
3240       return nullptr;
3241 
3242     // We're deleting the def of this value, so we need to remove it.
3243     eraseInstr(*UseMI, MRI);
3244 
3245     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
3246     Negated = true;
3247   }
3248 
3249   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
3250     return nullptr;
3251 
3252   // Make sure the cond br is followed by a G_BR, or is the last instruction.
3253   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
3254   if (Next == Parent->end()) {
3255     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
3256     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
3257       return nullptr;
3258     UncondBrTarget = &*NextMBB;
3259   } else {
3260     if (Next->getOpcode() != AMDGPU::G_BR)
3261       return nullptr;
3262     Br = &*Next;
3263     UncondBrTarget = Br->getOperand(0).getMBB();
3264   }
3265 
3266   return UseMI;
3267 }
3268 
3269 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
3270                                          const ArgDescriptor *Arg,
3271                                          const TargetRegisterClass *ArgRC,
3272                                          LLT ArgTy) const {
3273   MCRegister SrcReg = Arg->getRegister();
3274   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
3275   assert(DstReg.isVirtual() && "Virtual register expected");
3276 
3277   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
3278                                              *ArgRC, B.getDebugLoc(), ArgTy);
3279   if (Arg->isMasked()) {
3280     // TODO: Should we try to emit this once in the entry block?
3281     const LLT S32 = LLT::scalar(32);
3282     const unsigned Mask = Arg->getMask();
3283     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
3284 
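    // Added example (illustrative; the actual mask depends on the argument):
    // with a packed workitem ID whose field occupies bits [19:10],
    // Mask = 0x3ff << 10 gives Shift = 10, so the value is recovered below as
    // (LiveIn >> 10) & 0x3ff.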
3285     Register AndMaskSrc = LiveIn;
3286 
3287     // TODO: Avoid clearing the high bits if we know workitem id y/z are always
3288     // 0.
3289     if (Shift != 0) {
3290       auto ShiftAmt = B.buildConstant(S32, Shift);
3291       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
3292     }
3293 
3294     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
3295   } else {
3296     B.buildCopy(DstReg, LiveIn);
3297   }
3298 
3299   return true;
3300 }
3301 
3302 bool AMDGPULegalizerInfo::loadInputValue(
3303     Register DstReg, MachineIRBuilder &B,
3304     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
3305   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3306   const ArgDescriptor *Arg;
3307   const TargetRegisterClass *ArgRC;
3308   LLT ArgTy;
3309   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
3310 
3311   if (!Arg) {
3312     if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
3313       // The intrinsic may appear when we have a zero-sized kernarg segment, in
3314       // which case the pointer argument may be missing and we use null.
3315       B.buildConstant(DstReg, 0);
3316       return true;
3317     }
3318 
3319     // It's undefined behavior if a function marked with the amdgpu-no-*
3320     // attributes uses the corresponding intrinsic.
3321     B.buildUndef(DstReg);
3322     return true;
3323   }
3324 
3325   if (!Arg->isRegister() || !Arg->getRegister().isValid())
3326     return false; // TODO: Handle these
3327   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
3328 }
3329 
3330 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
3331     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
3332     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
3333   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
3334     return false;
3335 
3336   MI.eraseFromParent();
3337   return true;
3338 }
3339 
3340 static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
3341                                 int64_t C) {
3342   B.buildConstant(MI.getOperand(0).getReg(), C);
3343   MI.eraseFromParent();
3344   return true;
3345 }
3346 
3347 bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
3348     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
3349     unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
3350   unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
3351   if (MaxID == 0)
3352     return replaceWithConstant(B, MI, 0);
3353 
3354   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3355   const ArgDescriptor *Arg;
3356   const TargetRegisterClass *ArgRC;
3357   LLT ArgTy;
3358   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
3359 
3360   Register DstReg = MI.getOperand(0).getReg();
3361   if (!Arg) {
3362     // It's undefined behavior if a function marked with the amdgpu-no-*
3363     // attributes uses the corresponding intrinsic.
3364     B.buildUndef(DstReg);
3365     MI.eraseFromParent();
3366     return true;
3367   }
3368 
3369   if (Arg->isMasked()) {
3370     // Don't bother inserting AssertZext for packed IDs since we're emitting the
3371     // masking operations anyway.
3372     //
3373     // TODO: We could assert the top bit is 0 for the source copy.
3374     if (!loadInputValue(DstReg, B, ArgType))
3375       return false;
3376   } else {
3377     Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
3378     if (!loadInputValue(TmpReg, B, ArgType))
3379       return false;
3380     B.buildAssertZExt(DstReg, TmpReg, 32 - countLeadingZeros(MaxID));
3381   }
3382 
3383   MI.eraseFromParent();
3384   return true;
3385 }
3386 
3387 Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
3388                                                      int64_t Offset) const {
3389   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
3390   Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
3391 
3392   // TODO: If we passed in the base kernel offset we could have a better
3393   // alignment than 4, but we don't really need it.
3394   if (!loadInputValue(KernArgReg, B,
3395                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3396     llvm_unreachable("failed to find kernarg segment ptr");
3397 
3398   auto COffset = B.buildConstant(LLT::scalar(64), Offset);
3399   // TODO: Should get nuw
3400   return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
3401 }
3402 
3403 /// Legalize a value that's loaded from kernel arguments. This is only used by
3404 /// legacy intrinsics.
3405 bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
3406                                                       MachineIRBuilder &B,
3407                                                       uint64_t Offset,
3408                                                       Align Alignment) const {
3409   Register DstReg = MI.getOperand(0).getReg();
3410 
3411   assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
3412          "unexpected kernarg parameter type");
3413 
3414   Register Ptr = getKernargParameterPtr(B, Offset);
3415   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
3416   B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
3417               MachineMemOperand::MODereferenceable |
3418                   MachineMemOperand::MOInvariant);
3419   MI.eraseFromParent();
3420   return true;
3421 }
3422 
3423 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
3424                                        MachineRegisterInfo &MRI,
3425                                        MachineIRBuilder &B) const {
3426   Register Dst = MI.getOperand(0).getReg();
3427   LLT DstTy = MRI.getType(Dst);
3428   LLT S16 = LLT::scalar(16);
3429   LLT S32 = LLT::scalar(32);
3430   LLT S64 = LLT::scalar(64);
3431 
3432   if (DstTy == S16)
3433     return legalizeFDIV16(MI, MRI, B);
3434   if (DstTy == S32)
3435     return legalizeFDIV32(MI, MRI, B);
3436   if (DstTy == S64)
3437     return legalizeFDIV64(MI, MRI, B);
3438 
3439   return false;
3440 }
3441 
3442 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
3443                                                         Register DstDivReg,
3444                                                         Register DstRemReg,
3445                                                         Register X,
3446                                                         Register Y) const {
3447   const LLT S1 = LLT::scalar(1);
3448   const LLT S32 = LLT::scalar(32);
3449 
3450   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
3451   // algorithm used here.
3452 
3453   // Initial estimate of inv(y).
3454   auto FloatY = B.buildUITOFP(S32, Y);
3455   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
3456   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
3457   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
3458   auto Z = B.buildFPTOUI(S32, ScaledY);
3459 
3460   // One round of UNR.
3461   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
3462   auto NegYZ = B.buildMul(S32, NegY, Z);
3463   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
3464 
3465   // Quotient/remainder estimate.
3466   auto Q = B.buildUMulH(S32, X, Z);
3467   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
3468 
3469   // First quotient/remainder refinement.
3470   auto One = B.buildConstant(S32, 1);
3471   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
3472   if (DstDivReg)
3473     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
3474   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
3475 
3476   // Second quotient/remainder refinement.
3477   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
3478   if (DstDivReg)
3479     B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
3480 
3481   if (DstRemReg)
3482     B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
3483 }
3484 
3485 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
3486 //
3487 // Return lo, hi of result
3488 //
3489 // %cvt.lo = G_UITOFP Val.lo
3490 // %cvt.hi = G_UITOFP Val.hi
3491 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
3492 // %rcp = G_AMDGPU_RCP_IFLAG %mad
3493 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
3494 // %mul2 = G_FMUL %mul1, 2**(-32)
3495 // %trunc = G_INTRINSIC_TRUNC %mul2
3496 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
3497 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
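// Added note: the magic constants are f32 bit patterns (0x4f800000 ~ 2^32,
// 0x2f800000 ~ 2^-32, 0xcf800000 ~ -2^32, 0x5f7ffffc ~ just under 2^64), so
// the {lo, hi} pair returned is, roughly, a 64-bit estimate of 2^64 / Val
// used as the starting point of the 64-bit division expansion below.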
3498 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
3499                                                        Register Val) {
3500   const LLT S32 = LLT::scalar(32);
3501   auto Unmerge = B.buildUnmerge(S32, Val);
3502 
3503   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
3504   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
3505 
3506   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
3507                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
3508 
3509   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
3510   auto Mul1 =
3511       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
3512 
3513   // 2**(-32)
3514   auto Mul2 =
3515       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
3516   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
3517 
3518   // -(2**32)
3519   auto Mad2 = B.buildFMAD(S32, Trunc,
3520                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
3521 
3522   auto ResultLo = B.buildFPTOUI(S32, Mad2);
3523   auto ResultHi = B.buildFPTOUI(S32, Trunc);
3524 
3525   return {ResultLo.getReg(0), ResultHi.getReg(0)};
3526 }
3527 
3528 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
3529                                                         Register DstDivReg,
3530                                                         Register DstRemReg,
3531                                                         Register Numer,
3532                                                         Register Denom) const {
3533   const LLT S32 = LLT::scalar(32);
3534   const LLT S64 = LLT::scalar(64);
3535   const LLT S1 = LLT::scalar(1);
3536   Register RcpLo, RcpHi;
3537 
3538   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
3539 
3540   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
3541 
3542   auto Zero64 = B.buildConstant(S64, 0);
3543   auto NegDenom = B.buildSub(S64, Zero64, Denom);
3544 
3545   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
3546   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
3547 
3548   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
3549   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
3550   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
3551 
3552   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
3553   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
3554   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
3555 
3556   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
3557   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
3558   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
3559   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
3560   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
3561 
3562   auto Zero32 = B.buildConstant(S32, 0);
3563   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
3564   auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
3565   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
3566 
3567   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
3568   Register NumerLo = UnmergeNumer.getReg(0);
3569   Register NumerHi = UnmergeNumer.getReg(1);
3570 
3571   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
3572   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
3573   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
3574   Register Mul3_Lo = UnmergeMul3.getReg(0);
3575   Register Mul3_Hi = UnmergeMul3.getReg(1);
3576   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
3577   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
3578   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
3579   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
3580 
3581   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
3582   Register DenomLo = UnmergeDenom.getReg(0);
3583   Register DenomHi = UnmergeDenom.getReg(1);
3584 
3585   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
3586   auto C1 = B.buildSExt(S32, CmpHi);
3587 
3588   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
3589   auto C2 = B.buildSExt(S32, CmpLo);
3590 
3591   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
3592   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
3593 
3594   // TODO: The code here and below could be enclosed in if/endif blocks.
3595   // Currently the control flow is unconditional and we have 4 selects after
3596   // the potential endif that stand in for PHIs.
3597 
3598   // if C3 != 0 ...
3599   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3600   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3601   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3602   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
3603 
3604   auto One64 = B.buildConstant(S64, 1);
3605   auto Add3 = B.buildAdd(S64, MulHi3, One64);
3606 
3607   auto C4 =
3608       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
3609   auto C5 =
3610       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
3611   auto C6 = B.buildSelect(
3612       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
3613 
3614   // if (C6 != 0)
3615   auto Add4 = B.buildAdd(S64, Add3, One64);
3616   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3617 
3618   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3619   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3620   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
3621 
3622   // endif C6
3623   // endif C3
3624 
3625   if (DstDivReg) {
3626     auto Sel1 = B.buildSelect(
3627         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3628     B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3629                   Sel1, MulHi3);
3630   }
3631 
3632   if (DstRemReg) {
3633     auto Sel2 = B.buildSelect(
3634         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3635     B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3636                   Sel2, Sub1);
3637   }
3638 }
3639 
3640 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
3641                                                   MachineRegisterInfo &MRI,
3642                                                   MachineIRBuilder &B) const {
3643   Register DstDivReg, DstRemReg;
3644   switch (MI.getOpcode()) {
3645   default:
3646     llvm_unreachable("Unexpected opcode!");
3647   case AMDGPU::G_UDIV: {
3648     DstDivReg = MI.getOperand(0).getReg();
3649     break;
3650   }
3651   case AMDGPU::G_UREM: {
3652     DstRemReg = MI.getOperand(0).getReg();
3653     break;
3654   }
3655   case AMDGPU::G_UDIVREM: {
3656     DstDivReg = MI.getOperand(0).getReg();
3657     DstRemReg = MI.getOperand(1).getReg();
3658     break;
3659   }
3660   }
3661 
3662   const LLT S64 = LLT::scalar(64);
3663   const LLT S32 = LLT::scalar(32);
3664   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3665   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3666   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3667   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3668 
3669   if (Ty == S32)
3670     legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3671   else if (Ty == S64)
3672     legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3673   else
3674     return false;
3675 
3676   MI.eraseFromParent();
3677   return true;
3678 }
3679 
3680 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
3681                                                 MachineRegisterInfo &MRI,
3682                                                 MachineIRBuilder &B) const {
3683   const LLT S64 = LLT::scalar(64);
3684   const LLT S32 = LLT::scalar(32);
3685 
3686   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3687   if (Ty != S32 && Ty != S64)
3688     return false;
3689 
3690   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3691   Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3692   Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3693 
3694   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3695   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3696   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3697 
3698   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3699   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3700 
3701   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3702   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
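  // Added note: the add/xor pairs above compute |LHS| and |RHS| using the
  // identity abs(x) = (x + sign) ^ sign with sign = x >> (bits - 1); e.g. for
  // x = -5 (s32), sign = -1 and (-5 + -1) ^ -1 = 5, while for x >= 0 the sign
  // is 0 and x is unchanged.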
3703 
3704   Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3705   switch (MI.getOpcode()) {
3706   default:
3707     llvm_unreachable("Unexpected opcode!");
3708   case AMDGPU::G_SDIV: {
3709     DstDivReg = MI.getOperand(0).getReg();
3710     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3711     break;
3712   }
3713   case AMDGPU::G_SREM: {
3714     DstRemReg = MI.getOperand(0).getReg();
3715     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3716     break;
3717   }
3718   case AMDGPU::G_SDIVREM: {
3719     DstDivReg = MI.getOperand(0).getReg();
3720     DstRemReg = MI.getOperand(1).getReg();
3721     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3722     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3723     break;
3724   }
3725   }
3726 
3727   if (Ty == S32)
3728     legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3729   else
3730     legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3731 
3732   if (DstDivReg) {
3733     auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3734     auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3735     B.buildSub(DstDivReg, SignXor, Sign);
3736   }
3737 
3738   if (DstRemReg) {
3739     auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3740     auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3741     B.buildSub(DstRemReg, SignXor, Sign);
3742   }
3743 
3744   MI.eraseFromParent();
3745   return true;
3746 }
3747 
3748 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3749                                                  MachineRegisterInfo &MRI,
3750                                                  MachineIRBuilder &B) const {
3751   Register Res = MI.getOperand(0).getReg();
3752   Register LHS = MI.getOperand(1).getReg();
3753   Register RHS = MI.getOperand(2).getReg();
3754   uint16_t Flags = MI.getFlags();
3755   LLT ResTy = MRI.getType(Res);
3756 
3757   const MachineFunction &MF = B.getMF();
3758   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3759                             MI.getFlag(MachineInstr::FmAfn);
3760 
3761   if (!AllowInaccurateRcp)
3762     return false;
3763 
3764   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3765     // 1 / x -> RCP(x)
3766     if (CLHS->isExactlyValue(1.0)) {
3767       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3768         .addUse(RHS)
3769         .setMIFlags(Flags);
3770 
3771       MI.eraseFromParent();
3772       return true;
3773     }
3774 
3775     // -1 / x -> RCP( FNEG(x) )
3776     if (CLHS->isExactlyValue(-1.0)) {
3777       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3778       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3779         .addUse(FNeg.getReg(0))
3780         .setMIFlags(Flags);
3781 
3782       MI.eraseFromParent();
3783       return true;
3784     }
3785   }
3786 
3787   // x / y -> x * (1.0 / y)
3788   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3789     .addUse(RHS)
3790     .setMIFlags(Flags);
3791   B.buildFMul(Res, LHS, RCP, Flags);
3792 
3793   MI.eraseFromParent();
3794   return true;
3795 }
3796 
3797 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3798                                                    MachineRegisterInfo &MRI,
3799                                                    MachineIRBuilder &B) const {
3800   Register Res = MI.getOperand(0).getReg();
3801   Register X = MI.getOperand(1).getReg();
3802   Register Y = MI.getOperand(2).getReg();
3803   uint16_t Flags = MI.getFlags();
3804   LLT ResTy = MRI.getType(Res);
3805 
3806   const MachineFunction &MF = B.getMF();
3807   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3808                             MI.getFlag(MachineInstr::FmAfn);
3809 
3810   if (!AllowInaccurateRcp)
3811     return false;
3812 
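  // Added sketch of the refinement below: r = rcp(y) is improved twice via
  // r = fma(fma(-y, r, 1.0), r, r)  (i.e. r += r * (1 - y * r)), then the
  // quotient q = x * r gets one correction step q = fma(fma(-y, q, x), r, q).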
3813   auto NegY = B.buildFNeg(ResTy, Y);
3814   auto One = B.buildFConstant(ResTy, 1.0);
3815 
3816   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3817     .addUse(Y)
3818     .setMIFlags(Flags);
3819 
3820   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3821   R = B.buildFMA(ResTy, Tmp0, R, R);
3822 
3823   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3824   R = B.buildFMA(ResTy, Tmp1, R, R);
3825 
3826   auto Ret = B.buildFMul(ResTy, X, R);
3827   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3828 
3829   B.buildFMA(Res, Tmp2, R, Ret);
3830   MI.eraseFromParent();
3831   return true;
3832 }
3833 
3834 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3835                                          MachineRegisterInfo &MRI,
3836                                          MachineIRBuilder &B) const {
3837   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3838     return true;
3839 
3840   Register Res = MI.getOperand(0).getReg();
3841   Register LHS = MI.getOperand(1).getReg();
3842   Register RHS = MI.getOperand(2).getReg();
3843 
3844   uint16_t Flags = MI.getFlags();
3845 
3846   LLT S16 = LLT::scalar(16);
3847   LLT S32 = LLT::scalar(32);
3848 
3849   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3850   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3851 
3852   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3853     .addUse(RHSExt.getReg(0))
3854     .setMIFlags(Flags);
3855 
3856   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3857   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3858 
3859   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3860     .addUse(RDst.getReg(0))
3861     .addUse(RHS)
3862     .addUse(LHS)
3863     .setMIFlags(Flags);
3864 
3865   MI.eraseFromParent();
3866   return true;
3867 }
3868 
3869 // Enable or disable FP32 denorm mode: when 'Enable' is true, emit instructions
3870 // that enable denormals; otherwise, emit instructions that disable them.
3871 static void toggleSPDenormMode(bool Enable,
3872                                MachineIRBuilder &B,
3873                                const GCNSubtarget &ST,
3874                                AMDGPU::SIModeRegisterDefaults Mode) {
3875   // Set SP denorm mode to this value.
3876   unsigned SPDenormMode =
3877     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3878 
3879   if (ST.hasDenormModeInst()) {
3880     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
3881     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3882 
3883     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3884     B.buildInstr(AMDGPU::S_DENORM_MODE)
3885       .addImm(NewDenormModeValue);
3886 
3887   } else {
3888     // Select FP32 bit field in mode register.
3889     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3890                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3891                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3892 
3893     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3894       .addImm(SPDenormMode)
3895       .addImm(SPDenormModeBitField);
3896   }
3897 }
3898 
3899 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3900                                          MachineRegisterInfo &MRI,
3901                                          MachineIRBuilder &B) const {
3902   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3903     return true;
3904 
3905   Register Res = MI.getOperand(0).getReg();
3906   Register LHS = MI.getOperand(1).getReg();
3907   Register RHS = MI.getOperand(2).getReg();
3908   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3909   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3910 
3911   uint16_t Flags = MI.getFlags();
3912 
3913   LLT S32 = LLT::scalar(32);
3914   LLT S1 = LLT::scalar(1);
3915 
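  // Added overview (a hedged sketch of the sequence below): div_scale
  // pre-scales the operands to avoid intermediate over/underflow, the FMA
  // chain performs a Newton-Raphson style refinement of rcp with FP32
  // denormals temporarily enabled, div_fmas consumes the scale condition
  // produced by div_scale, and div_fixup is assumed to patch up the special
  // cases (infinities, NaNs, zero denominators).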
3916   auto One = B.buildFConstant(S32, 1.0f);
3917 
3918   auto DenominatorScaled =
3919     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3920       .addUse(LHS)
3921       .addUse(RHS)
3922       .addImm(0)
3923       .setMIFlags(Flags);
3924   auto NumeratorScaled =
3925     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3926       .addUse(LHS)
3927       .addUse(RHS)
3928       .addImm(1)
3929       .setMIFlags(Flags);
3930 
3931   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3932     .addUse(DenominatorScaled.getReg(0))
3933     .setMIFlags(Flags);
3934   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3935 
3936   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3937   // aren't modeled as reading it.
3938   if (!Mode.allFP32Denormals())
3939     toggleSPDenormMode(true, B, ST, Mode);
3940 
3941   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3942   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3943   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3944   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3945   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3946   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3947 
3948   if (!Mode.allFP32Denormals())
3949     toggleSPDenormMode(false, B, ST, Mode);
3950 
3951   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3952     .addUse(Fma4.getReg(0))
3953     .addUse(Fma1.getReg(0))
3954     .addUse(Fma3.getReg(0))
3955     .addUse(NumeratorScaled.getReg(1))
3956     .setMIFlags(Flags);
3957 
3958   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3959     .addUse(Fmas.getReg(0))
3960     .addUse(RHS)
3961     .addUse(LHS)
3962     .setMIFlags(Flags);
3963 
3964   MI.eraseFromParent();
3965   return true;
3966 }
3967 
3968 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3969                                          MachineRegisterInfo &MRI,
3970                                          MachineIRBuilder &B) const {
3971   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3972     return true;
3973 
3974   Register Res = MI.getOperand(0).getReg();
3975   Register LHS = MI.getOperand(1).getReg();
3976   Register RHS = MI.getOperand(2).getReg();
3977 
3978   uint16_t Flags = MI.getFlags();
3979 
3980   LLT S64 = LLT::scalar(64);
3981   LLT S1 = LLT::scalar(1);
3982 
3983   auto One = B.buildFConstant(S64, 1.0);
3984 
3985   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3986     .addUse(LHS)
3987     .addUse(RHS)
3988     .addImm(0)
3989     .setMIFlags(Flags);
3990 
3991   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3992 
3993   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3994     .addUse(DivScale0.getReg(0))
3995     .setMIFlags(Flags);
3996 
3997   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3998   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3999   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
4000 
4001   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
4002     .addUse(LHS)
4003     .addUse(RHS)
4004     .addImm(1)
4005     .setMIFlags(Flags);
4006 
4007   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
4008   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
4009   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
4010 
4011   Register Scale;
4012   if (!ST.hasUsableDivScaleConditionOutput()) {
4013     // Workaround a hardware bug on SI where the condition output from div_scale
4014     // is not usable.
4015 
4016     LLT S32 = LLT::scalar(32);
4017 
4018     auto NumUnmerge = B.buildUnmerge(S32, LHS);
4019     auto DenUnmerge = B.buildUnmerge(S32, RHS);
4020     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
4021     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
4022 
4023     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
4024                               Scale1Unmerge.getReg(1));
4025     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
4026                               Scale0Unmerge.getReg(1));
4027     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
4028   } else {
4029     Scale = DivScale1.getReg(1);
4030   }
4031 
4032   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
4033     .addUse(Fma4.getReg(0))
4034     .addUse(Fma3.getReg(0))
4035     .addUse(Mul.getReg(0))
4036     .addUse(Scale)
4037     .setMIFlags(Flags);
4038 
4039   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
4040     .addUse(Fmas.getReg(0))
4041     .addUse(RHS)
4042     .addUse(LHS)
4043     .setMIFlags(Flags);
4044 
4045   MI.eraseFromParent();
4046   return true;
4047 }
4048 
4049 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
4050                                                  MachineRegisterInfo &MRI,
4051                                                  MachineIRBuilder &B) const {
4052   Register Res = MI.getOperand(0).getReg();
4053   Register LHS = MI.getOperand(2).getReg();
4054   Register RHS = MI.getOperand(3).getReg();
4055   uint16_t Flags = MI.getFlags();
4056 
4057   LLT S32 = LLT::scalar(32);
4058   LLT S1 = LLT::scalar(1);
4059 
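  // Added note on the constants below (f32 bit patterns): 0x6f800000 is 2^96
  // and 0x2f800000 is 2^-32. When |RHS| is huge (> 2^96) the denominator is
  // pre-scaled by 2^-32 so that rcp stays in range, and the same scale factor
  // multiplies the result, since Sel * (LHS * rcp(RHS * Sel)) == LHS / RHS.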
4060   auto Abs = B.buildFAbs(S32, RHS, Flags);
4061   const APFloat C0Val(1.0f);
4062 
4063   auto C0 = B.buildConstant(S32, 0x6f800000);
4064   auto C1 = B.buildConstant(S32, 0x2f800000);
4065   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
4066 
4067   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
4068   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
4069 
4070   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
4071 
4072   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
4073     .addUse(Mul0.getReg(0))
4074     .setMIFlags(Flags);
4075 
4076   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
4077 
4078   B.buildFMul(Res, Sel, Mul1, Flags);
4079 
4080   MI.eraseFromParent();
4081   return true;
4082 }
4083 
4084 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
4085 // FIXME: Why do we handle this one but not other removed instructions?
4086 //
4087 // Reciprocal square root.  The clamp prevents infinite results, clamping
4088 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
4089 // +-max_float.
4090 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
4091                                                     MachineRegisterInfo &MRI,
4092                                                     MachineIRBuilder &B) const {
4093   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
4094     return true;
4095 
4096   Register Dst = MI.getOperand(0).getReg();
4097   Register Src = MI.getOperand(2).getReg();
4098   auto Flags = MI.getFlags();
4099 
4100   LLT Ty = MRI.getType(Dst);
4101 
4102   const fltSemantics *FltSemantics;
4103   if (Ty == LLT::scalar(32))
4104     FltSemantics = &APFloat::IEEEsingle();
4105   else if (Ty == LLT::scalar(64))
4106     FltSemantics = &APFloat::IEEEdouble();
4107   else
4108     return false;
4109 
4110   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
4111     .addUse(Src)
4112     .setMIFlags(Flags);
4113 
4114   // We don't need to concern ourselves with the snan handling difference, since
4115   // the rsq already quieted (or not) any snan; use the one which will directly select.
4116   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4117   const bool UseIEEE = MFI->getMode().IEEE;
4118 
4119   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
4120   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
4121                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
4122 
4123   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
4124 
4125   if (UseIEEE)
4126     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
4127   else
4128     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
4129   MI.eraseFromParent();
4130   return true;
4131 }
4132 
4133 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
4134   switch (IID) {
4135   case Intrinsic::amdgcn_ds_fadd:
4136     return AMDGPU::G_ATOMICRMW_FADD;
4137   case Intrinsic::amdgcn_ds_fmin:
4138     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
4139   case Intrinsic::amdgcn_ds_fmax:
4140     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
4141   default:
4142     llvm_unreachable("not a DS FP intrinsic");
4143   }
4144 }
4145 
4146 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
4147                                                       MachineInstr &MI,
4148                                                       Intrinsic::ID IID) const {
4149   GISelChangeObserver &Observer = Helper.Observer;
4150   Observer.changingInstr(MI);
4151 
4152   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
4153 
4154   // The remaining operands were used to set fields in the MemOperand on
4155   // construction.
4156   for (int I = 6; I > 3; --I)
4157     MI.removeOperand(I);
4158 
4159   MI.removeOperand(1); // Remove the intrinsic ID.
4160   Observer.changedInstr(MI);
4161   return true;
4162 }
4163 
4164 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
4165                                             MachineRegisterInfo &MRI,
4166                                             MachineIRBuilder &B) const {
4167   uint64_t Offset =
4168     ST.getTargetLowering()->getImplicitParameterOffset(
4169       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
4170   LLT DstTy = MRI.getType(DstReg);
4171   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
4172 
4173   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
4174   if (!loadInputValue(KernargPtrReg, B,
4175                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
4176     return false;
4177 
4178   // FIXME: This should be nuw
4179   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
4180   return true;
4181 }
4182 
4183 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
4184                                                  MachineRegisterInfo &MRI,
4185                                                  MachineIRBuilder &B) const {
4186   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4187   if (!MFI->isEntryFunction()) {
4188     return legalizePreloadedArgIntrin(MI, MRI, B,
4189                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
4190   }
4191 
4192   Register DstReg = MI.getOperand(0).getReg();
4193   if (!getImplicitArgPtr(DstReg, MRI, B))
4194     return false;
4195 
4196   MI.eraseFromParent();
4197   return true;
4198 }
4199 
4200 bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg,
4201                                          MachineRegisterInfo &MRI,
4202                                          MachineIRBuilder &B) const {
4203   Function &F = B.getMF().getFunction();
4204   Optional<uint32_t> KnownSize =
4205       AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
4206   if (KnownSize.has_value())
4207     B.buildConstant(DstReg, KnownSize.value());
4208   return false;
4209 }
4210 
4211 bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI,
4212                                               MachineRegisterInfo &MRI,
4213                                               MachineIRBuilder &B) const {
4214 
4215   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4216   if (!MFI->isEntryFunction()) {
4217     return legalizePreloadedArgIntrin(MI, MRI, B,
4218                                       AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
4219   }
4220 
4221   Register DstReg = MI.getOperand(0).getReg();
4222   if (!getLDSKernelId(DstReg, MRI, B))
4223     return false;
4224 
4225   MI.eraseFromParent();
4226   return true;
4227 }
4228 
4229 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
4230                                               MachineRegisterInfo &MRI,
4231                                               MachineIRBuilder &B,
4232                                               unsigned AddrSpace) const {
4233   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
4234   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
4235   Register Hi32 = Unmerge.getReg(1);
4236 
4237   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
4238   MI.eraseFromParent();
4239   return true;
4240 }
4241 
4242 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
4243 // offset (the offset that is included in bounds checking and swizzling, to be
4244 // split between the instruction's voffset and immoffset fields) and soffset
4245 // (the offset that is excluded from bounds checking and swizzling, to go in
4246 // the instruction's soffset field).  This function takes the first kind of
4247 // offset and figures out how to split it between voffset and immoffset.
4248 std::pair<Register, unsigned>
4249 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
4250                                         Register OrigOffset) const {
4251   const unsigned MaxImm = 4095;
4252   Register BaseReg;
4253   unsigned ImmOffset;
4254   const LLT S32 = LLT::scalar(32);
4255   MachineRegisterInfo &MRI = *B.getMRI();
4256 
4257   std::tie(BaseReg, ImmOffset) =
4258       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
4259 
4260   // If BaseReg is a pointer, convert it to int.
4261   if (MRI.getType(BaseReg).isPointer())
4262     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
4263 
4264   // If the immediate value is too big for the immoffset field, keep only its
4265   // low 12 bits (value & 4095) in the immoffset field so that the value that is
4266   // copied/added for the voffset field is a multiple of 4096, and it stands more
4267   // chance of being CSEd with the copy/add for another similar load/store.
4268   // However, do not do that rounding down to a multiple of 4096 if that is a
4269   // negative number, as it appears to be illegal to have a negative offset
4270   // in the vgpr, even if adding the immediate offset makes it positive.
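  // Added worked example: with ImmOffset = 5000, Overflow = 5000 & ~4095 = 4096
  // is moved into the voffset add below and the immoffset field keeps
  // 5000 - 4096 = 904.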
4271   unsigned Overflow = ImmOffset & ~MaxImm;
4272   ImmOffset -= Overflow;
4273   if ((int32_t)Overflow < 0) {
4274     Overflow += ImmOffset;
4275     ImmOffset = 0;
4276   }
4277 
4278   if (Overflow != 0) {
4279     if (!BaseReg) {
4280       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
4281     } else {
4282       auto OverflowVal = B.buildConstant(S32, Overflow);
4283       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
4284     }
4285   }
4286 
4287   if (!BaseReg)
4288     BaseReg = B.buildConstant(S32, 0).getReg(0);
4289 
4290   return std::make_pair(BaseReg, ImmOffset);
4291 }
4292 
4293 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
4294 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
4295                                           Register VOffset, Register SOffset,
4296                                           unsigned ImmOffset, Register VIndex,
4297                                           MachineRegisterInfo &MRI) const {
4298   Optional<ValueAndVReg> MaybeVOffsetVal =
4299       getIConstantVRegValWithLookThrough(VOffset, MRI);
4300   Optional<ValueAndVReg> MaybeSOffsetVal =
4301       getIConstantVRegValWithLookThrough(SOffset, MRI);
4302   Optional<ValueAndVReg> MaybeVIndexVal =
4303       getIConstantVRegValWithLookThrough(VIndex, MRI);
4304   // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
4305   // update the MMO with that offset. The stride is unknown so we can only do
4306   // this if VIndex is constant 0.
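       // For example (hypothetical constants): VOffset = 16, SOffset = 0,
       // ImmOffset = 8 and VIndex = 0 yields a known combined offset of 24.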
4307   if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
4308       MaybeVIndexVal->Value == 0) {
4309     uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
4310                            MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
4311     MMO->setOffset(TotalOffset);
4312   } else {
4313     // We don't have a constant combined offset to use in the MMO. Give up.
4314     MMO->setValue((Value *)nullptr);
4315   }
4316 }
4317 
4318 /// Handle register layout difference for f16 images for some subtargets.
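     ///
     /// For example (types only): on subtargets with unpacked d16 a <4 x s16>
     /// value is widened to <4 x s32>; on subtargets with the image-store d16
     /// bug a <3 x s16> value is padded to <6 x s16> and bitcast to <3 x s32>.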
4319 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
4320                                              MachineRegisterInfo &MRI,
4321                                              Register Reg,
4322                                              bool ImageStore) const {
4323   const LLT S16 = LLT::scalar(16);
4324   const LLT S32 = LLT::scalar(32);
4325   LLT StoreVT = MRI.getType(Reg);
4326   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
4327 
4328   if (ST.hasUnpackedD16VMem()) {
4329     auto Unmerge = B.buildUnmerge(S16, Reg);
4330 
4331     SmallVector<Register, 4> WideRegs;
4332     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
4333       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
4334 
4335     int NumElts = StoreVT.getNumElements();
4336 
4337     return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
4338         .getReg(0);
4339   }
4340 
4341   if (ImageStore && ST.hasImageStoreD16Bug()) {
4342     if (StoreVT.getNumElements() == 2) {
4343       SmallVector<Register, 4> PackedRegs;
4344       Reg = B.buildBitcast(S32, Reg).getReg(0);
4345       PackedRegs.push_back(Reg);
4346       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
4347       return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
4348           .getReg(0);
4349     }
4350 
4351     if (StoreVT.getNumElements() == 3) {
4352       SmallVector<Register, 4> PackedRegs;
4353       auto Unmerge = B.buildUnmerge(S16, Reg);
4354       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
4355         PackedRegs.push_back(Unmerge.getReg(I));
4356       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
4357       Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
4358       return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
4359     }
4360 
4361     if (StoreVT.getNumElements() == 4) {
4362       SmallVector<Register, 4> PackedRegs;
4363       Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
4364       auto Unmerge = B.buildUnmerge(S32, Reg);
4365       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
4366         PackedRegs.push_back(Unmerge.getReg(I));
4367       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
4368       return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
4369           .getReg(0);
4370     }
4371 
4372     llvm_unreachable("invalid data type");
4373   }
4374 
4375   if (StoreVT == LLT::fixed_vector(3, S16)) {
4376     Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
4377               .getReg(0);
4378   }
4379   return Reg;
4380 }
4381 
4382 Register AMDGPULegalizerInfo::fixStoreSourceType(
4383   MachineIRBuilder &B, Register VData, bool IsFormat) const {
4384   MachineRegisterInfo *MRI = B.getMRI();
4385   LLT Ty = MRI->getType(VData);
4386 
4387   const LLT S16 = LLT::scalar(16);
4388 
4389   // Fix up illegal register types for 8-bit and 16-bit stores.
4390   if (Ty == LLT::scalar(8) || Ty == S16) {
4391     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
4392     return AnyExt;
4393   }
4394 
4395   if (Ty.isVector()) {
4396     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
4397       if (IsFormat)
4398         return handleD16VData(B, *MRI, VData);
4399     }
4400   }
4401 
4402   return VData;
4403 }
4404 
4405 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
4406                                               MachineRegisterInfo &MRI,
4407                                               MachineIRBuilder &B,
4408                                               bool IsTyped,
4409                                               bool IsFormat) const {
4410   Register VData = MI.getOperand(1).getReg();
4411   LLT Ty = MRI.getType(VData);
4412   LLT EltTy = Ty.getScalarType();
4413   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
4414   const LLT S32 = LLT::scalar(32);
4415 
4416   VData = fixStoreSourceType(B, VData, IsFormat);
4417   Register RSrc = MI.getOperand(2).getReg();
4418 
4419   MachineMemOperand *MMO = *MI.memoperands_begin();
4420   const int MemSize = MMO->getSize();
4421 
4422   unsigned ImmOffset;
4423 
4424   // The typed intrinsics add an immediate after the registers.
4425   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
4426 
4427   // The struct intrinsic variants add one additional operand over raw.
4428   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4429   Register VIndex;
4430   int OpOffset = 0;
4431   if (HasVIndex) {
4432     VIndex = MI.getOperand(3).getReg();
4433     OpOffset = 1;
4434   } else {
4435     VIndex = B.buildConstant(S32, 0).getReg(0);
4436   }
4437 
4438   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
4439   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
4440 
4441   unsigned Format = 0;
4442   if (IsTyped) {
4443     Format = MI.getOperand(5 + OpOffset).getImm();
4444     ++OpOffset;
4445   }
4446 
4447   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
4448 
4449   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4450   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
4451 
4452   unsigned Opc;
4453   if (IsTyped) {
4454     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
4455                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
4456   } else if (IsFormat) {
4457     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
4458                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
4459   } else {
4460     switch (MemSize) {
4461     case 1:
4462       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
4463       break;
4464     case 2:
4465       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
4466       break;
4467     default:
4468       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
4469       break;
4470     }
4471   }
4472 
4473   auto MIB = B.buildInstr(Opc)
4474     .addUse(VData)              // vdata
4475     .addUse(RSrc)               // rsrc
4476     .addUse(VIndex)             // vindex
4477     .addUse(VOffset)            // voffset
4478     .addUse(SOffset)            // soffset
4479     .addImm(ImmOffset);         // offset(imm)
4480 
4481   if (IsTyped)
4482     MIB.addImm(Format);
4483 
4484   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4485      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4486      .addMemOperand(MMO);
4487 
4488   MI.eraseFromParent();
4489   return true;
4490 }
4491 
4492 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
4493                                              MachineRegisterInfo &MRI,
4494                                              MachineIRBuilder &B,
4495                                              bool IsFormat,
4496                                              bool IsTyped) const {
4497   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
4498   MachineMemOperand *MMO = *MI.memoperands_begin();
4499   const LLT MemTy = MMO->getMemoryType();
4500   const LLT S32 = LLT::scalar(32);
4501 
4502   Register Dst = MI.getOperand(0).getReg();
4503   Register RSrc = MI.getOperand(2).getReg();
4504 
4505   // The typed intrinsics add an immediate after the registers.
4506   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
4507 
4508   // The struct intrinsic variants add one additional operand over raw.
4509   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4510   Register VIndex;
4511   int OpOffset = 0;
4512   if (HasVIndex) {
4513     VIndex = MI.getOperand(3).getReg();
4514     OpOffset = 1;
4515   } else {
4516     VIndex = B.buildConstant(S32, 0).getReg(0);
4517   }
4518 
4519   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
4520   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
4521 
4522   unsigned Format = 0;
4523   if (IsTyped) {
4524     Format = MI.getOperand(5 + OpOffset).getImm();
4525     ++OpOffset;
4526   }
4527 
4528   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
4529   unsigned ImmOffset;
4530 
4531   LLT Ty = MRI.getType(Dst);
4532   LLT EltTy = Ty.getScalarType();
4533   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
4534   const bool Unpacked = ST.hasUnpackedD16VMem();
4535 
4536   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4537   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
4538 
4539   unsigned Opc;
4540 
4541   if (IsTyped) {
4542     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
4543                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
4544   } else if (IsFormat) {
4545     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
4546                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
4547   } else {
4548     switch (MemTy.getSizeInBits()) {
4549     case 8:
4550       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
4551       break;
4552     case 16:
4553       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
4554       break;
4555     default:
4556       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
4557       break;
4558     }
4559   }
4560 
4561   Register LoadDstReg;
4562 
4563   bool IsExtLoad =
4564       (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
4565   LLT UnpackedTy = Ty.changeElementSize(32);
4566 
4567   if (IsExtLoad)
4568     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
4569   else if (Unpacked && IsD16 && Ty.isVector())
4570     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
4571   else
4572     LoadDstReg = Dst;
4573 
4574   auto MIB = B.buildInstr(Opc)
4575     .addDef(LoadDstReg)         // vdata
4576     .addUse(RSrc)               // rsrc
4577     .addUse(VIndex)             // vindex
4578     .addUse(VOffset)            // voffset
4579     .addUse(SOffset)            // soffset
4580     .addImm(ImmOffset);         // offset(imm)
4581 
4582   if (IsTyped)
4583     MIB.addImm(Format);
4584 
4585   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4586      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4587      .addMemOperand(MMO);
4588 
4589   if (LoadDstReg != Dst) {
4590     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
4591 
4592     // The result was widened for an extending load; truncate back to the
4593     if (IsExtLoad)
4594       B.buildTrunc(Dst, LoadDstReg);
4595     else {
4596       // Repack to original 16-bit vector result
4597       // FIXME: G_TRUNC should work, but legalization currently fails
4598       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
4599       SmallVector<Register, 4> Repack;
4600       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
4601         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
4602       B.buildMerge(Dst, Repack);
4603     }
4604   }
4605 
4606   MI.eraseFromParent();
4607   return true;
4608 }
4609 
4610 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
4611                                                MachineIRBuilder &B,
4612                                                bool IsInc) const {
4613   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
4614                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
4615   B.buildInstr(Opc)
4616     .addDef(MI.getOperand(0).getReg())
4617     .addUse(MI.getOperand(2).getReg())
4618     .addUse(MI.getOperand(3).getReg())
4619     .cloneMemRefs(MI);
4620   MI.eraseFromParent();
4621   return true;
4622 }
4623 
4624 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
4625   switch (IntrID) {
4626   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4627   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4628     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
4629   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4630   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4631     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
4632   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4633   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4634     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
4635   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4636   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4637     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
4638   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4639   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4640     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
4641   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4642   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4643     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
4644   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4645   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4646     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
4647   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4648   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4649     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
4650   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4651   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4652     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
4653   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4654   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4655     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
4656   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4657   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4658     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
4659   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4660   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4661     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
4662   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4663   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4664     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4665   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4666   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4667     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4668   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4669   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4670     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4671   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4672   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4673     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4674   default:
4675     llvm_unreachable("unhandled atomic opcode");
4676   }
4677 }
4678 
4679 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
4680                                                MachineIRBuilder &B,
4681                                                Intrinsic::ID IID) const {
4682   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4683                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4684   const bool HasReturn = MI.getNumExplicitDefs() != 0;
4685 
4686   Register Dst;
4687 
4688   int OpOffset = 0;
4689   if (HasReturn) {
4690     // A few FP atomics do not support return values.
4691     Dst = MI.getOperand(0).getReg();
4692   } else {
4693     OpOffset = -1;
4694   }
4695 
4696   Register VData = MI.getOperand(2 + OpOffset).getReg();
4697   Register CmpVal;
4698 
4699   if (IsCmpSwap) {
4700     CmpVal = MI.getOperand(3 + OpOffset).getReg();
4701     ++OpOffset;
4702   }
4703 
4704   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4705   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4706 
4707   // The struct intrinsic variants add one additional operand over raw.
4708   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4709   Register VIndex;
4710   if (HasVIndex) {
4711     VIndex = MI.getOperand(4 + OpOffset).getReg();
4712     ++OpOffset;
4713   } else {
4714     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4715   }
4716 
4717   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4718   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4719   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4720 
4721   MachineMemOperand *MMO = *MI.memoperands_begin();
4722 
4723   unsigned ImmOffset;
4724   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4725   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());
4726 
4727   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4728 
4729   if (HasReturn)
4730     MIB.addDef(Dst);
4731 
4732   MIB.addUse(VData); // vdata
4733 
4734   if (IsCmpSwap)
4735     MIB.addReg(CmpVal);
4736 
4737   MIB.addUse(RSrc)               // rsrc
4738      .addUse(VIndex)             // vindex
4739      .addUse(VOffset)            // voffset
4740      .addUse(SOffset)            // soffset
4741      .addImm(ImmOffset)          // offset(imm)
4742      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4743      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4744      .addMemOperand(MMO);
4745 
4746   MI.eraseFromParent();
4747   return true;
4748 }
4749 
4750 /// Turn the s16 typed address operands of \p MI into dword sized vectors with
4751 /// s16 typed elements, appending the packed registers to \p PackedAddrs.
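     ///
     /// Sketch of the packing (operand names are illustrative): s16 coordinates
     /// x, y, z become <2 x s16> {x, y} and <2 x s16> {z, undef}; with A16, an
     /// s16 bias is paired with undef so it still occupies a full dword.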
4752 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4753                                       SmallVectorImpl<Register> &PackedAddrs,
4754                                       unsigned ArgOffset,
4755                                       const AMDGPU::ImageDimIntrinsicInfo *Intr,
4756                                       bool IsA16, bool IsG16) {
4757   const LLT S16 = LLT::scalar(16);
4758   const LLT V2S16 = LLT::fixed_vector(2, 16);
4759   auto EndIdx = Intr->VAddrEnd;
4760 
4761   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4762     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4763     if (!SrcOp.isReg())
4764       continue; // _L to _LZ may have eliminated this.
4765 
4766     Register AddrReg = SrcOp.getReg();
4767 
4768     if ((I < Intr->GradientStart) ||
4769         (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4770         (I >= Intr->CoordStart && !IsA16)) {
4771       if ((I < Intr->GradientStart) && IsA16 &&
4772           (B.getMRI()->getType(AddrReg) == S16)) {
4773         assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
4774         // Special handling of bias when A16 is on. Bias is of type half but
4775         // occupies a full 32-bit dword.
4776         PackedAddrs.push_back(
4777             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4778                 .getReg(0));
4779       } else {
4780         assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
4781                "Bias needs to be converted to 16 bit in A16 mode");
4782         // Handle any gradient or coordinate operands that should not be packed
4783         AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4784         PackedAddrs.push_back(AddrReg);
4785       }
4786     } else {
4787       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4788       // derivatives dx/dh and dx/dv are packed with undef.
4789       if (((I + 1) >= EndIdx) ||
4790           ((Intr->NumGradients / 2) % 2 == 1 &&
4791            (I == static_cast<unsigned>(Intr->GradientStart +
4792                                        (Intr->NumGradients / 2) - 1) ||
4793             I == static_cast<unsigned>(Intr->GradientStart +
4794                                        Intr->NumGradients - 1))) ||
4795           // Check for _L to _LZ optimization
4796           !MI.getOperand(ArgOffset + I + 1).isReg()) {
4797         PackedAddrs.push_back(
4798             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4799                 .getReg(0));
4800       } else {
4801         PackedAddrs.push_back(
4802             B.buildBuildVector(
4803                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4804                 .getReg(0));
4805         ++I;
4806       }
4807     }
4808   }
4809 }
4810 
4811 /// Convert from separate vaddr components to a single vector address register,
4812 /// and replace the remaining operands with $noreg.
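     ///
     /// For example (hypothetical operand counts): four s32 address operands are
     /// combined into one <4 x s32> G_BUILD_VECTOR placed in the first vaddr
     /// slot and the other three slots become $noreg; nine operands are first
     /// padded with undef up to 16.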
4813 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4814                                      int DimIdx, int NumVAddrs) {
4815   const LLT S32 = LLT::scalar(32);
4816 
4817   SmallVector<Register, 8> AddrRegs;
4818   for (int I = 0; I != NumVAddrs; ++I) {
4819     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4820     if (SrcOp.isReg()) {
4821       AddrRegs.push_back(SrcOp.getReg());
4822       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4823     }
4824   }
4825 
4826   int NumAddrRegs = AddrRegs.size();
4827   if (NumAddrRegs != 1) {
4828     // Above 8 elements, round up to the next power of 2 (i.e. 16).
4829     if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
4830       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4831       auto Undef = B.buildUndef(S32);
4832       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4833       NumAddrRegs = RoundedNumRegs;
4834     }
4835 
4836     auto VAddr =
4837         B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
4838     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4839   }
4840 
4841   for (int I = 1; I != NumVAddrs; ++I) {
4842     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4843     if (SrcOp.isReg())
4844       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4845   }
4846 }
4847 
4848 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4849 ///
4850 /// Depending on the subtarget, load/store with 16-bit element data need to be
4851 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4852 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4853 /// registers.
4854 ///
4855 /// We don't want to directly select image instructions just yet, but we do
4856 /// want to expose all register repacking to the legalizer/combiners. We also
4857 /// don't want a selected instruction entering RegBankSelect. In order to avoid
4858 /// defining a multitude of intermediate image instructions, directly hack on
4859 /// the intrinsic's arguments. In cases like a16 addresses, this requires
4860 /// padding the now-unnecessary arguments with $noreg.
4861 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4862     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4863     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4864 
4865   const unsigned NumDefs = MI.getNumExplicitDefs();
4866   const unsigned ArgOffset = NumDefs + 1;
4867   bool IsTFE = NumDefs == 2;
4868   // We are only processing the operands of d16 image operations on subtargets
4869   // that use the unpacked register layout, or need to repack the TFE result.
4870 
4871   // TODO: Do we need to guard against already legalized intrinsics?
4872   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4873       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4874 
4875   MachineRegisterInfo *MRI = B.getMRI();
4876   const LLT S32 = LLT::scalar(32);
4877   const LLT S16 = LLT::scalar(16);
4878   const LLT V2S16 = LLT::fixed_vector(2, 16);
4879 
4880   unsigned DMask = 0;
4881   Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
4882   LLT Ty = MRI->getType(VData);
4883 
4884   // Check for 16-bit addresses and pack them if so.
4885   LLT GradTy =
4886       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4887   LLT AddrTy =
4888       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4889   const bool IsG16 = GradTy == S16;
4890   const bool IsA16 = AddrTy == S16;
4891   const bool IsD16 = Ty.getScalarType() == S16;
4892 
4893   int DMaskLanes = 0;
4894   if (!BaseOpcode->Atomic) {
4895     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4896     if (BaseOpcode->Gather4) {
4897       DMaskLanes = 4;
4898     } else if (DMask != 0) {
4899       DMaskLanes = countPopulation(DMask);
4900     } else if (!IsTFE && !BaseOpcode->Store) {
4901       // If dmask is 0, this is a no-op load. This can be eliminated.
4902       B.buildUndef(MI.getOperand(0));
4903       MI.eraseFromParent();
4904       return true;
4905     }
4906   }
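       // Worked example (hypothetical dmask values): dmask = 0b1011 selects
       // three components, so DMaskLanes = 3; gather4 always uses four lanes;
       // a plain load with dmask = 0 was already folded to undef above.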
4907 
4908   Observer.changingInstr(MI);
4909   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4910 
4911   const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
4912                                      : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
4913   const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
4914                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4915   unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;
4916 
4917   // Track that we legalized this
4918   MI.setDesc(B.getTII().get(NewOpcode));
4919 
4920   // We expect to get an error flag since TFE is on and dmask is 0. Force
4921   // dmask to be at least 1, otherwise the instruction will fail.
4922   if (IsTFE && DMask == 0) {
4923     DMask = 0x1;
4924     DMaskLanes = 1;
4925     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4926   }
4927 
4928   if (BaseOpcode->Atomic) {
4929     Register VData0 = MI.getOperand(2).getReg();
4930     LLT Ty = MRI->getType(VData0);
4931 
4932     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4933     if (Ty.isVector())
4934       return false;
4935 
4936     if (BaseOpcode->AtomicX2) {
4937       Register VData1 = MI.getOperand(3).getReg();
4938       // The two values are packed in one register.
4939       LLT PackedTy = LLT::fixed_vector(2, Ty);
4940       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4941       MI.getOperand(2).setReg(Concat.getReg(0));
4942       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4943     }
4944   }
4945 
4946   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4947 
4948   // Rewrite the addressing register layout before doing anything else.
4949   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
4950     // 16-bit gradients are supported, but are tied to the A16 control, so
4951     // both gradients and addresses must be 16-bit.
4952     return false;
4953   }
4954 
4955   if (IsA16 && !ST.hasA16()) {
4956     // A16 not supported
4957     return false;
4958   }
4959 
4960   if (IsA16 || IsG16) {
4961     if (Intr->NumVAddrs > 1) {
4962       SmallVector<Register, 4> PackedRegs;
4963 
4964       packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4965                                 IsG16);
4966 
4967       // See also below in the non-a16 branch
4968       const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
4969                           PackedRegs.size() <= ST.getNSAMaxSize();
4970 
4971       if (!UseNSA && PackedRegs.size() > 1) {
4972         LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
4973         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4974         PackedRegs[0] = Concat.getReg(0);
4975         PackedRegs.resize(1);
4976       }
4977 
4978       const unsigned NumPacked = PackedRegs.size();
4979       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4980         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4981         if (!SrcOp.isReg()) {
4982           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4983           continue;
4984         }
4985 
4986         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4987 
4988         if (I - Intr->VAddrStart < NumPacked)
4989           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4990         else
4991           SrcOp.setReg(AMDGPU::NoRegister);
4992       }
4993     }
4994   } else {
4995     // If the register allocator cannot place the address registers contiguously
4996     // without introducing moves, then using the non-sequential address encoding
4997     // is always preferable, since it saves VALU instructions and is usually a
4998     // wash in terms of code size or even better.
4999     //
5000     // However, we currently have no way of hinting to the register allocator
5001     // that MIMG addresses should be placed contiguously when it is possible to
5002     // do so, so force non-NSA for the common 2-address case as a heuristic.
5003     //
5004     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
5005     // allocation when possible.
5006     //
5007     // TODO: we can actually allow partial NSA where the final register is a
5008     // contiguous set of the remaining addresses.
5009     // This could help where there are more addresses than supported.
5010     const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
5011                         CorrectedNumVAddrs <= ST.getNSAMaxSize();
5012 
5013     if (!UseNSA && Intr->NumVAddrs > 1)
5014       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
5015                                Intr->NumVAddrs);
5016   }
5017 
5018   int Flags = 0;
5019   if (IsA16)
5020     Flags |= 1;
5021   if (IsG16)
5022     Flags |= 2;
5023   MI.addOperand(MachineOperand::CreateImm(Flags));
5024 
5025   if (BaseOpcode->Store) { // No TFE for stores?
5026     // TODO: Handle dmask trim
5027     if (!Ty.isVector() || !IsD16)
5028       return true;
5029 
5030     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
5031     if (RepackedReg != VData) {
5032       MI.getOperand(1).setReg(RepackedReg);
5033     }
5034 
5035     return true;
5036   }
5037 
5038   Register DstReg = MI.getOperand(0).getReg();
5039   const LLT EltTy = Ty.getScalarType();
5040   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
5041 
5042   // Confirm that the return type is large enough for the dmask specified
5043   if (NumElts < DMaskLanes)
5044     return false;
5045 
5046   if (NumElts > 4 || DMaskLanes > 4)
5047     return false;
5048 
5049   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
5050   const LLT AdjustedTy =
5051       Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
5052 
5053   // The raw dword aligned data component of the load. The only legal cases
5054   // where this matters should be when using the packed D16 format, for
5055   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
5056   LLT RoundedTy;
5057 
5058   // S32 vector to cover all data, plus the TFE result element.
5059   LLT TFETy;
5060 
5061   // Register type to use for each loaded component. Will be S32 or V2S16.
5062   LLT RegTy;
5063 
5064   if (IsD16 && ST.hasUnpackedD16VMem()) {
5065     RoundedTy =
5066         LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
5067     TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
5068     RegTy = S32;
5069   } else {
5070     unsigned EltSize = EltTy.getSizeInBits();
5071     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
5072     unsigned RoundedSize = 32 * RoundedElts;
5073     RoundedTy = LLT::scalarOrVector(
5074         ElementCount::getFixed(RoundedSize / EltSize), EltSize);
5075     TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
5076     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
5077   }
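       // Worked example (packed d16, no TFE, assuming a <3 x s16> result):
       // AdjustedTy is 48 bits, so RoundedTy = <4 x s16>, TFETy = <3 x s32> and
       // RegTy = <2 x s16>; with unpacked d16 the same load uses RoundedTy =
       // <3 x s32>, TFETy = <4 x s32> and RegTy = s32.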
5078 
5079   // The return type does not need adjustment.
5080   // TODO: Should we change s16 case to s32 or <2 x s16>?
5081   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
5082     return true;
5083 
5084   Register Dst1Reg;
5085 
5086   // Insert after the instruction.
5087   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
5088 
5089   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
5090   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
5091   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
5092   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
5093 
5094   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
5095 
5096   MI.getOperand(0).setReg(NewResultReg);
5097 
5098   // In the IR, TFE is supposed to be used with a 2 element struct return
5099   // type. The instruction really returns these two values in one contiguous
5100   // register, with one additional dword beyond the loaded data. Rewrite the
5101   // return type to use a single register result.
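       // For instance (hypothetical type): a TFE load returning {<2 x s32>, i32}
       // in the IR is rewritten to define a single <3 x s32> register, which is
       // unmerged below into the data part and the status dword.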
5102 
5103   if (IsTFE) {
5104     Dst1Reg = MI.getOperand(1).getReg();
5105     if (MRI->getType(Dst1Reg) != S32)
5106       return false;
5107 
5108     // TODO: Make sure the TFE operand bit is set.
5109     MI.removeOperand(1);
5110 
5111     // Handle the easy case that requires no repack instructions.
5112     if (Ty == S32) {
5113       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
5114       return true;
5115     }
5116   }
5117 
5118   // Now figure out how to copy the new result register back into the old
5119   // result.
5120   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
5121 
5122   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
5123 
5124   if (ResultNumRegs == 1) {
5125     assert(!IsTFE);
5126     ResultRegs[0] = NewResultReg;
5127   } else {
5128     // We have to repack into a new vector of some kind.
5129     for (int I = 0; I != NumDataRegs; ++I)
5130       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
5131     B.buildUnmerge(ResultRegs, NewResultReg);
5132 
5133     // Drop the final TFE element to get the data part. The TFE result is
5134     // directly written to the right place already.
5135     if (IsTFE)
5136       ResultRegs.resize(NumDataRegs);
5137   }
5138 
5139   // For an s16 scalar result, we form an s32 result with a truncate regardless
5140   // of packed vs. unpacked.
5141   if (IsD16 && !Ty.isVector()) {
5142     B.buildTrunc(DstReg, ResultRegs[0]);
5143     return true;
5144   }
5145 
5146   // Avoid a build/concat_vector of 1 entry.
5147   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
5148     B.buildBitcast(DstReg, ResultRegs[0]);
5149     return true;
5150   }
5151 
5152   assert(Ty.isVector());
5153 
5154   if (IsD16) {
5155     // For packed D16 results with TFE enabled, all the data components are
5156     // S32. Cast back to the expected type.
5157     //
5158     // TODO: We don't really need to load s32 elements. We would only need one
5159     // cast for the TFE result if a multiple of v2s16 was used.
5160     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
5161       for (Register &Reg : ResultRegs)
5162         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
5163     } else if (ST.hasUnpackedD16VMem()) {
5164       for (Register &Reg : ResultRegs)
5165         Reg = B.buildTrunc(S16, Reg).getReg(0);
5166     }
5167   }
5168 
5169   auto padWithUndef = [&](LLT Ty, int NumElts) {
5170     if (NumElts == 0)
5171       return;
5172     Register Undef = B.buildUndef(Ty).getReg(0);
5173     for (int I = 0; I != NumElts; ++I)
5174       ResultRegs.push_back(Undef);
5175   };
5176 
5177   // Pad out any elements eliminated due to the dmask.
5178   LLT ResTy = MRI->getType(ResultRegs[0]);
5179   if (!ResTy.isVector()) {
5180     padWithUndef(ResTy, NumElts - ResultRegs.size());
5181     B.buildBuildVector(DstReg, ResultRegs);
5182     return true;
5183   }
5184 
5185   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
5186   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
5187 
5188   // Deal with the one annoying legal case.
5189   const LLT V3S16 = LLT::fixed_vector(3, 16);
5190   if (Ty == V3S16) {
5191     if (IsTFE) {
5192       if (ResultRegs.size() == 1) {
5193         NewResultReg = ResultRegs[0];
5194       } else if (ResultRegs.size() == 2) {
5195         LLT V4S16 = LLT::fixed_vector(4, 16);
5196         NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
5197       } else {
5198         return false;
5199       }
5200     }
5201 
5202     if (MRI->getType(DstReg).getNumElements() <
5203         MRI->getType(NewResultReg).getNumElements()) {
5204       B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
5205     } else {
5206       B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
5207     }
5208     return true;
5209   }
5210 
5211   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
5212   B.buildConcatVectors(DstReg, ResultRegs);
5213   return true;
5214 }
5215 
5216 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
5217   LegalizerHelper &Helper, MachineInstr &MI) const {
5218   MachineIRBuilder &B = Helper.MIRBuilder;
5219   GISelChangeObserver &Observer = Helper.Observer;
5220 
5221   Register Dst = MI.getOperand(0).getReg();
5222   LLT Ty = B.getMRI()->getType(Dst);
5223   unsigned Size = Ty.getSizeInBits();
5224   MachineFunction &MF = B.getMF();
5225 
5226   Observer.changingInstr(MI);
5227 
5228   if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
5229     Ty = getBitcastRegisterType(Ty);
5230     Helper.bitcastDst(MI, Ty, 0);
5231     Dst = MI.getOperand(0).getReg();
5232     B.setInsertPt(B.getMBB(), MI);
5233   }
5234 
5235   // FIXME: We don't really need this intermediate instruction. The intrinsic
5236   // should be fixed to have a memory operand. Since it's readnone, we're not
5237   // allowed to add one.
5238   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
5239   MI.removeOperand(1); // Remove intrinsic ID
5240 
5241   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
5242   // TODO: Should this use datalayout alignment?
5243   const unsigned MemSize = (Size + 7) / 8;
5244   const Align MemAlign(4);
5245   MachineMemOperand *MMO = MF.getMachineMemOperand(
5246       MachinePointerInfo(),
5247       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
5248           MachineMemOperand::MOInvariant,
5249       MemSize, MemAlign);
5250   MI.addMemOperand(MF, MMO);
5251 
5252   // There are no 96-bit result scalar loads, but widening to 128-bit should
5253   // always be legal. We may need to restore this to a 96-bit result if it turns
5254   // out this needs to be converted to a vector load during RegBankSelect.
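       // For instance (hypothetical result types): a <3 x s32> result is widened
       // to <4 x s32>, and an s96 result is widened to s128.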
5255   if (!isPowerOf2_32(Size)) {
5256     if (Ty.isVector())
5257       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
5258     else
5259       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
5260   }
5261 
5262   Observer.changedInstr(MI);
5263   return true;
5264 }
5265 
5266 // TODO: Move to selection
5267 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
5268                                                 MachineRegisterInfo &MRI,
5269                                                 MachineIRBuilder &B) const {
5270   if (!ST.isTrapHandlerEnabled() ||
5271       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
5272     return legalizeTrapEndpgm(MI, MRI, B);
5273 
5274   if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
5275     switch (*HsaAbiVer) {
5276     case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
5277     case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
5278       return legalizeTrapHsaQueuePtr(MI, MRI, B);
5279     case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
5280     case ELF::ELFABIVERSION_AMDGPU_HSA_V5:
5281       return ST.supportsGetDoorbellID() ?
5282           legalizeTrapHsa(MI, MRI, B) :
5283           legalizeTrapHsaQueuePtr(MI, MRI, B);
5284     }
5285   }
5286 
5287   llvm_unreachable("Unknown trap handler");
5288 }
5289 
5290 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
5291     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5292   B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
5293   MI.eraseFromParent();
5294   return true;
5295 }
5296 
5297 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
5298     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5299   MachineFunction &MF = B.getMF();
5300   const LLT S64 = LLT::scalar(64);
5301 
5302   Register SGPR01(AMDGPU::SGPR0_SGPR1);
5303   // For code object version 5, queue_ptr is passed through implicit kernarg.
5304   if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) {
5305     AMDGPUTargetLowering::ImplicitParameter Param =
5306         AMDGPUTargetLowering::QUEUE_PTR;
5307     uint64_t Offset =
5308         ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
5309 
5310     Register KernargPtrReg = MRI.createGenericVirtualRegister(
5311         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
5312 
5313     if (!loadInputValue(KernargPtrReg, B,
5314                         AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
5315       return false;
5316 
5317     // TODO: can we be smarter about machine pointer info?
5318     MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
5319     MachineMemOperand *MMO = MF.getMachineMemOperand(
5320         PtrInfo,
5321         MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
5322             MachineMemOperand::MOInvariant,
5323         LLT::scalar(64), commonAlignment(Align(64), Offset));
5324 
5325     // Pointer address
5326     Register LoadAddr = MRI.createGenericVirtualRegister(
5327         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
5328     B.buildPtrAdd(LoadAddr, KernargPtrReg,
5329                   B.buildConstant(LLT::scalar(64), Offset).getReg(0));
5330     // Load address
5331     Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
5332     B.buildCopy(SGPR01, Temp);
5333     B.buildInstr(AMDGPU::S_TRAP)
5334         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
5335         .addReg(SGPR01, RegState::Implicit);
5336     MI.eraseFromParent();
5337     return true;
5338   }
5339 
5340   // Pass queue pointer to trap handler as input, and insert trap instruction
5341   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
5342   Register LiveIn =
5343     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
5344   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
5345     return false;
5346 
5347   B.buildCopy(SGPR01, LiveIn);
5348   B.buildInstr(AMDGPU::S_TRAP)
5349       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
5350       .addReg(SGPR01, RegState::Implicit);
5351 
5352   MI.eraseFromParent();
5353   return true;
5354 }
5355 
5356 bool AMDGPULegalizerInfo::legalizeTrapHsa(
5357     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5358   B.buildInstr(AMDGPU::S_TRAP)
5359       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
5360   MI.eraseFromParent();
5361   return true;
5362 }
5363 
5364 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
5365     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5366   // If this is a non-HSA path or the trap handler is disabled, report a
5367   // warning accordingly.
5368   if (!ST.isTrapHandlerEnabled() ||
5369       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
5370     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
5371                                      "debugtrap handler not supported",
5372                                      MI.getDebugLoc(), DS_Warning);
5373     LLVMContext &Ctx = B.getMF().getFunction().getContext();
5374     Ctx.diagnose(NoTrap);
5375   } else {
5376     // Insert debug-trap instruction
5377     B.buildInstr(AMDGPU::S_TRAP)
5378         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
5379   }
5380 
5381   MI.eraseFromParent();
5382   return true;
5383 }
5384 
5385 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
5386                                                MachineIRBuilder &B) const {
5387   MachineRegisterInfo &MRI = *B.getMRI();
5388   const LLT S16 = LLT::scalar(16);
5389   const LLT S32 = LLT::scalar(32);
5390   const LLT V2S16 = LLT::fixed_vector(2, 16);
5391   const LLT V3S32 = LLT::fixed_vector(3, 32);
5392 
5393   Register DstReg = MI.getOperand(0).getReg();
5394   Register NodePtr = MI.getOperand(2).getReg();
5395   Register RayExtent = MI.getOperand(3).getReg();
5396   Register RayOrigin = MI.getOperand(4).getReg();
5397   Register RayDir = MI.getOperand(5).getReg();
5398   Register RayInvDir = MI.getOperand(6).getReg();
5399   Register TDescr = MI.getOperand(7).getReg();
5400 
5401   if (!ST.hasGFX10_AEncoding()) {
5402     DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
5403                                         "intrinsic not supported on subtarget",
5404                                         MI.getDebugLoc());
5405     B.getMF().getFunction().getContext().diagnose(BadIntrin);
5406     return false;
5407   }
5408 
5409   const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
5410   const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
5411   const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
5412   const unsigned NumVDataDwords = 4;
5413   const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
5414   const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
5415   const bool UseNSA = ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize();
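       // Address dword breakdown behind the counts above: node_ptr (1 or 2
       // dwords) + ray_extent (1) + ray_origin (3) + ray_dir and ray_inv_dir
       // (3 + 3 dwords, or 3 total when A16 packs the halves), i.e. 8, 9, 11 or
       // 12 dwords.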
5416   const unsigned BaseOpcodes[2][2] = {
5417       {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
5418       {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
5419        AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
5420   int Opcode;
5421   if (UseNSA) {
5422     Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
5423                                    IsGFX11Plus ? AMDGPU::MIMGEncGfx11NSA
5424                                                : AMDGPU::MIMGEncGfx10NSA,
5425                                    NumVDataDwords, NumVAddrDwords);
5426   } else {
5427     Opcode = AMDGPU::getMIMGOpcode(
5428         BaseOpcodes[Is64][IsA16],
5429         IsGFX11Plus ? AMDGPU::MIMGEncGfx11Default : AMDGPU::MIMGEncGfx10Default,
5430         NumVDataDwords, PowerOf2Ceil(NumVAddrDwords));
5431   }
5432   assert(Opcode != -1);
5433 
5434   SmallVector<Register, 12> Ops;
5435   if (UseNSA && IsGFX11Plus) {
5436     auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
5437       auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
5438       auto Merged = B.buildMerge(
5439           V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
5440       Ops.push_back(Merged.getReg(0));
5441     };
5442 
5443     Ops.push_back(NodePtr);
5444     Ops.push_back(RayExtent);
5445     packLanes(RayOrigin);
5446 
5447     if (IsA16) {
5448       auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
5449       auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
5450       auto MergedDir = B.buildMerge(
5451           V3S32,
5452           {B.buildBitcast(S32, B.buildMerge(V2S16, {UnmergeRayInvDir.getReg(0),
5453                                                     UnmergeRayDir.getReg(0)}))
5454                .getReg(0),
5455            B.buildBitcast(S32, B.buildMerge(V2S16, {UnmergeRayInvDir.getReg(1),
5456                                                     UnmergeRayDir.getReg(1)}))
5457                .getReg(0),
5458            B.buildBitcast(S32, B.buildMerge(V2S16, {UnmergeRayInvDir.getReg(2),
5459                                                     UnmergeRayDir.getReg(2)}))
5460                .getReg(0)});
5461       Ops.push_back(MergedDir.getReg(0));
5462     } else {
5463       packLanes(RayDir);
5464       packLanes(RayInvDir);
5465     }
5466   } else {
5467     if (Is64) {
5468       auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
5469       Ops.push_back(Unmerge.getReg(0));
5470       Ops.push_back(Unmerge.getReg(1));
5471     } else {
5472       Ops.push_back(NodePtr);
5473     }
5474     Ops.push_back(RayExtent);
5475 
5476     auto packLanes = [&Ops, &S32, &B](Register Src) {
5477       auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
5478       Ops.push_back(Unmerge.getReg(0));
5479       Ops.push_back(Unmerge.getReg(1));
5480       Ops.push_back(Unmerge.getReg(2));
5481     };
5482 
5483     packLanes(RayOrigin);
5484     if (IsA16) {
5485       auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
5486       auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
5487       Register R1 = MRI.createGenericVirtualRegister(S32);
5488       Register R2 = MRI.createGenericVirtualRegister(S32);
5489       Register R3 = MRI.createGenericVirtualRegister(S32);
5490       B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
5491       B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
5492       B.buildMerge(R3,
5493                    {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
5494       Ops.push_back(R1);
5495       Ops.push_back(R2);
5496       Ops.push_back(R3);
5497     } else {
5498       packLanes(RayDir);
5499       packLanes(RayInvDir);
5500     }
5501   }
5502 
5503   if (!UseNSA) {
5504     // Build a single vector containing all the operands prepared so far.
5505     LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
5506     Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);
5507     Ops.clear();
5508     Ops.push_back(MergedOps);
5509   }
5510 
5511   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
5512     .addDef(DstReg)
5513     .addImm(Opcode);
5514 
5515   for (Register R : Ops) {
5516     MIB.addUse(R);
5517   }
5518 
5519   MIB.addUse(TDescr)
5520      .addImm(IsA16 ? 1 : 0)
5521      .cloneMemRefs(MI);
5522 
5523   MI.eraseFromParent();
5524   return true;
5525 }
5526 
5527 bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
5528                                                MachineIRBuilder &B) const {
5529   unsigned Opc;
5530   int RoundMode = MI.getOperand(2).getImm();
5531 
5532   if (RoundMode == (int)RoundingMode::TowardPositive)
5533     Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
5534   else if (RoundMode == (int)RoundingMode::TowardNegative)
5535     Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
5536   else
5537     return false;
5538 
5539   B.buildInstr(Opc)
5540       .addDef(MI.getOperand(0).getReg())
5541       .addUse(MI.getOperand(1).getReg());
5542 
5543   MI.eraseFromParent();
5544 
5545   return true;
5546 }
5547 
5548 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
5549                                             MachineInstr &MI) const {
5550   MachineIRBuilder &B = Helper.MIRBuilder;
5551   MachineRegisterInfo &MRI = *B.getMRI();
5552 
5553   // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
5554   auto IntrID = MI.getIntrinsicID();
5555   switch (IntrID) {
5556   case Intrinsic::amdgcn_if:
5557   case Intrinsic::amdgcn_else: {
5558     MachineInstr *Br = nullptr;
5559     MachineBasicBlock *UncondBrTarget = nullptr;
5560     bool Negated = false;
5561     if (MachineInstr *BrCond =
5562             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
5563       const SIRegisterInfo *TRI
5564         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
5565 
5566       Register Def = MI.getOperand(1).getReg();
5567       Register Use = MI.getOperand(3).getReg();
5568 
5569       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
5570 
5571       if (Negated)
5572         std::swap(CondBrTarget, UncondBrTarget);
5573 
5574       B.setInsertPt(B.getMBB(), BrCond->getIterator());
5575       if (IntrID == Intrinsic::amdgcn_if) {
5576         B.buildInstr(AMDGPU::SI_IF)
5577           .addDef(Def)
5578           .addUse(Use)
5579           .addMBB(UncondBrTarget);
5580       } else {
5581         B.buildInstr(AMDGPU::SI_ELSE)
5582             .addDef(Def)
5583             .addUse(Use)
5584             .addMBB(UncondBrTarget);
5585       }
5586 
5587       if (Br) {
5588         Br->getOperand(0).setMBB(CondBrTarget);
5589       } else {
5590         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
5591         // since we're swapping branch targets it needs to be reinserted.
5592         // FIXME: IRTranslator should probably not do this
5593         B.buildBr(*CondBrTarget);
5594       }
5595 
5596       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
5597       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
5598       MI.eraseFromParent();
5599       BrCond->eraseFromParent();
5600       return true;
5601     }
5602 
5603     return false;
5604   }
5605   case Intrinsic::amdgcn_loop: {
5606     MachineInstr *Br = nullptr;
5607     MachineBasicBlock *UncondBrTarget = nullptr;
5608     bool Negated = false;
5609     if (MachineInstr *BrCond =
5610             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
5611       const SIRegisterInfo *TRI
5612         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
5613 
5614       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
5615       Register Reg = MI.getOperand(2).getReg();
5616 
5617       if (Negated)
5618         std::swap(CondBrTarget, UncondBrTarget);
5619 
5620       B.setInsertPt(B.getMBB(), BrCond->getIterator());
5621       B.buildInstr(AMDGPU::SI_LOOP)
5622         .addUse(Reg)
5623         .addMBB(UncondBrTarget);
5624 
5625       if (Br)
5626         Br->getOperand(0).setMBB(CondBrTarget);
5627       else
5628         B.buildBr(*CondBrTarget);
5629 
5630       MI.eraseFromParent();
5631       BrCond->eraseFromParent();
5632       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
5633       return true;
5634     }
5635 
5636     return false;
5637   }
5638   case Intrinsic::amdgcn_kernarg_segment_ptr:
5639     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
5640       // This only makes sense to call in a kernel, so just lower to null.
5641       B.buildConstant(MI.getOperand(0).getReg(), 0);
5642       MI.eraseFromParent();
5643       return true;
5644     }
5645 
5646     return legalizePreloadedArgIntrin(
5647       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
5648   case Intrinsic::amdgcn_implicitarg_ptr:
5649     return legalizeImplicitArgPtr(MI, MRI, B);
5650   case Intrinsic::amdgcn_workitem_id_x:
5651     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
5652                                        AMDGPUFunctionArgInfo::WORKITEM_ID_X);
5653   case Intrinsic::amdgcn_workitem_id_y:
5654     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
5655                                        AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
5656   case Intrinsic::amdgcn_workitem_id_z:
5657     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
5658                                        AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
5659   case Intrinsic::amdgcn_workgroup_id_x:
5660     return legalizePreloadedArgIntrin(MI, MRI, B,
5661                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
5662   case Intrinsic::amdgcn_workgroup_id_y:
5663     return legalizePreloadedArgIntrin(MI, MRI, B,
5664                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
5665   case Intrinsic::amdgcn_workgroup_id_z:
5666     return legalizePreloadedArgIntrin(MI, MRI, B,
5667                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
5668   case Intrinsic::amdgcn_lds_kernel_id:
5669     return legalizePreloadedArgIntrin(MI, MRI, B,
5670                                       AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
5671   case Intrinsic::amdgcn_dispatch_ptr:
5672     return legalizePreloadedArgIntrin(MI, MRI, B,
5673                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
5674   case Intrinsic::amdgcn_queue_ptr:
5675     return legalizePreloadedArgIntrin(MI, MRI, B,
5676                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
5677   case Intrinsic::amdgcn_implicit_buffer_ptr:
5678     return legalizePreloadedArgIntrin(
5679       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
5680   case Intrinsic::amdgcn_dispatch_id:
5681     return legalizePreloadedArgIntrin(MI, MRI, B,
5682                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
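       // The legacy r600 queries below read implicit kernel arguments from
       // the kernarg segment at fixed offsets.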
5683   case Intrinsic::r600_read_ngroups_x:
5684     // TODO: Emit error for hsa
5685     return legalizeKernargMemParameter(MI, B,
5686                                        SI::KernelInputOffsets::NGROUPS_X);
5687   case Intrinsic::r600_read_ngroups_y:
5688     return legalizeKernargMemParameter(MI, B,
5689                                        SI::KernelInputOffsets::NGROUPS_Y);
5690   case Intrinsic::r600_read_ngroups_z:
5691     return legalizeKernargMemParameter(MI, B,
5692                                        SI::KernelInputOffsets::NGROUPS_Z);
5693   case Intrinsic::r600_read_local_size_x:
5694     // TODO: Could insert G_ASSERT_ZEXT from s16
5695     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X);
5696   case Intrinsic::r600_read_local_size_y:
5697     // TODO: Could insert G_ASSERT_ZEXT from s16
5698     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Y);
5699   case Intrinsic::r600_read_local_size_z:
5700     // TODO: Could insert G_ASSERT_ZEXT from s16
5701     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z);
5702   case Intrinsic::r600_read_global_size_x:
5703     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X);
5704   case Intrinsic::r600_read_global_size_y:
5705     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y);
5706   case Intrinsic::r600_read_global_size_z:
5707     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z);
5708   case Intrinsic::amdgcn_fdiv_fast:
5709     return legalizeFDIVFastIntrin(MI, MRI, B);
5710   case Intrinsic::amdgcn_is_shared:
5711     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
5712   case Intrinsic::amdgcn_is_private:
5713     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
5714   case Intrinsic::amdgcn_wavefrontsize: {
5715     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
5716     MI.eraseFromParent();
5717     return true;
5718   }
5719   case Intrinsic::amdgcn_s_buffer_load:
5720     return legalizeSBufferLoad(Helper, MI);
5721   case Intrinsic::amdgcn_raw_buffer_store:
5722   case Intrinsic::amdgcn_struct_buffer_store:
5723     return legalizeBufferStore(MI, MRI, B, /*IsTyped=*/false, /*IsFormat=*/false);
5724   case Intrinsic::amdgcn_raw_buffer_store_format:
5725   case Intrinsic::amdgcn_struct_buffer_store_format:
5726     return legalizeBufferStore(MI, MRI, B, /*IsTyped=*/false, /*IsFormat=*/true);
5727   case Intrinsic::amdgcn_raw_tbuffer_store:
5728   case Intrinsic::amdgcn_struct_tbuffer_store:
5729     return legalizeBufferStore(MI, MRI, B, /*IsTyped=*/true, /*IsFormat=*/true);
5730   case Intrinsic::amdgcn_raw_buffer_load:
5731   case Intrinsic::amdgcn_struct_buffer_load:
5732     return legalizeBufferLoad(MI, MRI, B, /*IsFormat=*/false, /*IsTyped=*/false);
5733   case Intrinsic::amdgcn_raw_buffer_load_format:
5734   case Intrinsic::amdgcn_struct_buffer_load_format:
5735     return legalizeBufferLoad(MI, MRI, B, /*IsFormat=*/true, /*IsTyped=*/false);
5736   case Intrinsic::amdgcn_raw_tbuffer_load:
5737   case Intrinsic::amdgcn_struct_tbuffer_load:
5738     return legalizeBufferLoad(MI, MRI, B, /*IsFormat=*/true, /*IsTyped=*/true);
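       // All raw and struct buffer atomics share one lowering; the intrinsic
       // ID is passed through so legalizeBufferAtomic can select the operation.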
5739   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5740   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5741   case Intrinsic::amdgcn_raw_buffer_atomic_add:
5742   case Intrinsic::amdgcn_struct_buffer_atomic_add:
5743   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5744   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5745   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5746   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5747   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5748   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5749   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5750   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5751   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5752   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5753   case Intrinsic::amdgcn_raw_buffer_atomic_and:
5754   case Intrinsic::amdgcn_struct_buffer_atomic_and:
5755   case Intrinsic::amdgcn_raw_buffer_atomic_or:
5756   case Intrinsic::amdgcn_struct_buffer_atomic_or:
5757   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5758   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
5759   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
5760   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
5761   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
5762   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
5763   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
5764   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
5765   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
5766   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
5767   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
5768   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
5769     return legalizeBufferAtomic(MI, B, IntrID);
5770   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
5771   case Intrinsic::amdgcn_struct_buffer_atomic_fadd: {
5772     Register DstReg = MI.getOperand(0).getReg();
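         // Subtargets without a returning buffer FP atomic can only use the
         // no-return form: if the result is actually used, diagnose it and
         // fold the value to undef so legalization can continue.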
5773     if (!MRI.use_empty(DstReg) &&
5774         !AMDGPU::hasAtomicFaddRtnForTy(ST, MRI.getType(DstReg))) {
5775       Function &F = B.getMF().getFunction();
5776       DiagnosticInfoUnsupported NoFpRet(
5777           F, "return versions of fp atomics not supported", B.getDebugLoc(),
5778           DS_Error);
5779       F.getContext().diagnose(NoFpRet);
5780       B.buildUndef(DstReg);
5781       MI.eraseFromParent();
5782       return true;
5783     }
5784 
5785     return legalizeBufferAtomic(MI, B, IntrID);
5786   }
5787   case Intrinsic::amdgcn_atomic_inc:
5788     return legalizeAtomicIncDec(MI, B, true);
5789   case Intrinsic::amdgcn_atomic_dec:
5790     return legalizeAtomicIncDec(MI, B, false);
5791   case Intrinsic::trap:
5792     return legalizeTrapIntrinsic(MI, MRI, B);
5793   case Intrinsic::debugtrap:
5794     return legalizeDebugTrapIntrinsic(MI, MRI, B);
5795   case Intrinsic::amdgcn_rsq_clamp:
5796     return legalizeRsqClampIntrinsic(MI, MRI, B);
5797   case Intrinsic::amdgcn_ds_fadd:
5798   case Intrinsic::amdgcn_ds_fmin:
5799   case Intrinsic::amdgcn_ds_fmax:
5800     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
5801   case Intrinsic::amdgcn_image_bvh_intersect_ray:
5802     return legalizeBVHIntrinsic(MI, B);
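       // Image dimension intrinsics are table driven; any other intrinsic is
       // left as-is.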
5803   default: {
5804     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5805             AMDGPU::getImageDimIntrinsicInfo(IntrID))
5806       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
5807     return true;
5808   }
5809   }
5810 
5811   return true;
5812 }
5813