1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "llvm/ADT/ScopeExit.h"
23 #include "llvm/BinaryFormat/ELF.h"
24 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/IntrinsicsAMDGPU.h"
29 
30 #define DEBUG_TYPE "amdgpu-legalinfo"
31 
32 using namespace llvm;
33 using namespace LegalizeActions;
34 using namespace LegalizeMutations;
35 using namespace LegalityPredicates;
36 using namespace MIPatternMatch;
37 
38 // Hack until load/store selection patterns support any tuple of legal types.
39 static cl::opt<bool> EnableNewLegality(
40   "amdgpu-global-isel-new-legality",
41   cl::desc("Use GlobalISel desired legality, rather than try to use"
42            "rules compatible with selection patterns"),
43   cl::init(false),
44   cl::ReallyHidden);
45 
46 static constexpr unsigned MaxRegisterSize = 1024;
47 
48 // Round the number of vector elements up to the next power of two.
49 static LLT getPow2VectorType(LLT Ty) {
50   unsigned NElts = Ty.getNumElements();
51   unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
52   return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
53 }
54 
55 // Round the scalar size in bits up to the next power of two.
56 static LLT getPow2ScalarType(LLT Ty) {
57   unsigned Bits = Ty.getSizeInBits();
58   unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
59   return LLT::scalar(Pow2Bits);
60 }
61 
62 /// \returns true if this is an odd-sized vector which should be widened by
63 /// adding an additional element. This is mostly to handle <3 x s16> ->
64 /// <4 x s16>. This excludes s1 vectors, which should always be scalarized.
65 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
66   return [=](const LegalityQuery &Query) {
67     const LLT Ty = Query.Types[TypeIdx];
68     if (!Ty.isVector())
69       return false;
70 
71     const LLT EltTy = Ty.getElementType();
72     const unsigned EltSize = EltTy.getSizeInBits();
73     return Ty.getNumElements() % 2 != 0 &&
74            EltSize > 1 && EltSize < 32 &&
75            Ty.getSizeInBits() % 32 != 0;
76   };
77 }
78 
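// Matches types whose total size in bits is a multiple of 32.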
79 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80   return [=](const LegalityQuery &Query) {
81     const LLT Ty = Query.Types[TypeIdx];
82     return Ty.getSizeInBits() % 32 == 0;
83   };
84 }
85 
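// Matches vectors of 16-bit elements with more than two elements (wider than
// v2s16).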
86 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
87   return [=](const LegalityQuery &Query) {
88     const LLT Ty = Query.Types[TypeIdx];
89     const LLT EltTy = Ty.getScalarType();
90     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
91   };
92 }
93 
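// Widen the vector type at TypeIdx by appending one more element of the same
// element type.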
94 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
95   return [=](const LegalityQuery &Query) {
96     const LLT Ty = Query.Types[TypeIdx];
97     const LLT EltTy = Ty.getElementType();
98     return std::make_pair(TypeIdx,
99                           LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
100   };
101 }
102 
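// Reduce the element count so each resulting piece of the vector is at most
// 64 bits wide.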
103 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
104   return [=](const LegalityQuery &Query) {
105     const LLT Ty = Query.Types[TypeIdx];
106     const LLT EltTy = Ty.getElementType();
107     unsigned Size = Ty.getSizeInBits();
108     unsigned Pieces = (Size + 63) / 64;
109     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
110     return std::make_pair(
111         TypeIdx,
112         LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
113   };
114 }
115 
116 // Increase the number of vector elements so the total size reaches the next
117 // multiple of 32 bits.
118 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
119   return [=](const LegalityQuery &Query) {
120     const LLT Ty = Query.Types[TypeIdx];
121 
122     const LLT EltTy = Ty.getElementType();
123     const int Size = Ty.getSizeInBits();
124     const int EltSize = EltTy.getSizeInBits();
125     const int NextMul32 = (Size + 31) / 32;
126 
127     assert(EltSize < 32);
128 
129     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
130     return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
131   };
132 }
133 
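// Compute the register-compatible type to bitcast to: a scalar for sizes of
// 32 bits or less, otherwise a vector of 32-bit elements.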
134 static LLT getBitcastRegisterType(const LLT Ty) {
135   const unsigned Size = Ty.getSizeInBits();
136 
138   if (Size <= 32) {
139     // <2 x s8> -> s16
140     // <4 x s8> -> s32
141     return LLT::scalar(Size);
142   }
143 
144   return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
145 }
146 
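// Mutation that bitcasts the type at TypeIdx to its register-compatible form
// (see getBitcastRegisterType).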
147 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
148   return [=](const LegalityQuery &Query) {
149     const LLT Ty = Query.Types[TypeIdx];
150     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
151   };
152 }
153 
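// Bitcast the type at TypeIdx to s32 or a vector of 32-bit elements; the total
// size must already be a multiple of 32.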
154 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
155   return [=](const LegalityQuery &Query) {
156     const LLT Ty = Query.Types[TypeIdx];
157     unsigned Size = Ty.getSizeInBits();
158     assert(Size % 32 == 0);
159     return std::make_pair(
160         TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
161   };
162 }
163 
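// Matches vector types with a total size of fewer than Size bits.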
164 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
165   return [=](const LegalityQuery &Query) {
166     const LLT QueryTy = Query.Types[TypeIdx];
167     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
168   };
169 }
170 
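// Matches vector types with a total size of more than Size bits.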
171 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
172   return [=](const LegalityQuery &Query) {
173     const LLT QueryTy = Query.Types[TypeIdx];
174     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
175   };
176 }
177 
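// Matches vector types with an odd number of elements.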
178 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
179   return [=](const LegalityQuery &Query) {
180     const LLT QueryTy = Query.Types[TypeIdx];
181     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
182   };
183 }
184 
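// True if Size is a multiple of 32 bits and does not exceed the maximum
// register size.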
185 static bool isRegisterSize(unsigned Size) {
186   return Size % 32 == 0 && Size <= MaxRegisterSize;
187 }
188 
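// True for element types that pack cleanly into registers: 16-bit elements or
// elements that are a multiple of 32 bits.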
189 static bool isRegisterVectorElementType(LLT EltTy) {
190   const int EltSize = EltTy.getSizeInBits();
191   return EltSize == 16 || EltSize % 32 == 0;
192 }
193 
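// True for vector types whose elements map directly onto registers; 16-bit
// elements additionally require an even element count.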
194 static bool isRegisterVectorType(LLT Ty) {
195   const int EltSize = Ty.getElementType().getSizeInBits();
196   return EltSize == 32 || EltSize == 64 ||
197          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
198          EltSize == 128 || EltSize == 256;
199 }
200 
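// True if the type as a whole fits in a legal register size and, for vectors,
// has a register-compatible element layout.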
201 static bool isRegisterType(LLT Ty) {
202   if (!isRegisterSize(Ty.getSizeInBits()))
203     return false;
204 
205   if (Ty.isVector())
206     return isRegisterVectorType(Ty);
207 
208   return true;
209 }
210 
211 // Any combination of 32 or 64-bit elements up to the maximum register size,
212 // and multiples of v2s16.
213 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
214   return [=](const LegalityQuery &Query) {
215     return isRegisterType(Query.Types[TypeIdx]);
216   };
217 }
218 
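// Matches vectors whose element type is legal on its own (s16 or at least
// 32 bits wide).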
219 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
220   return [=](const LegalityQuery &Query) {
221     const LLT QueryTy = Query.Types[TypeIdx];
222     if (!QueryTy.isVector())
223       return false;
224     const LLT EltTy = QueryTy.getElementType();
225     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
226   };
227 }
228 
229 // If we have a truncating store or an extending load with a data size larger
230 // than 32-bits, we need to reduce to a 32-bit type.
231 static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
232   return [=](const LegalityQuery &Query) {
233     const LLT Ty = Query.Types[TypeIdx];
234     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
235            Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
236   };
237 }
238 
239 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
240 // handle some operations by just promoting the register during
241 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
242 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
243                                     bool IsLoad) {
244   switch (AS) {
245   case AMDGPUAS::PRIVATE_ADDRESS:
246     // FIXME: Private element size.
247     return ST.enableFlatScratch() ? 128 : 32;
248   case AMDGPUAS::LOCAL_ADDRESS:
249     return ST.useDS128() ? 128 : 64;
250   case AMDGPUAS::GLOBAL_ADDRESS:
251   case AMDGPUAS::CONSTANT_ADDRESS:
252   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
253     // Treat constant and global as identical. SMRD loads are sometimes usable for
254     // global loads (ideally constant address space should be eliminated)
255     // depending on the context. Legality cannot be context dependent, but
256     // RegBankSelect can split the load as necessary depending on the pointer
257     // register bank/uniformity and if the memory is invariant or not written in a
258     // kernel.
259     return IsLoad ? 512 : 128;
260   default:
261     // Flat addresses may contextually need to be split to 32-bit parts if they
262     // may alias scratch depending on the subtarget.
263     return 128;
264   }
265 }
266 
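// Check whether the memory size, address space, and alignment of this load or
// store are directly supported, without further splitting or widening.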
267 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
268                                  const LegalityQuery &Query) {
269   const LLT Ty = Query.Types[0];
270 
271   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
272   const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
273 
274   unsigned RegSize = Ty.getSizeInBits();
275   uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
276   uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
277   unsigned AS = Query.Types[1].getAddressSpace();
278 
279   // All of these need to be custom lowered to cast the pointer operand.
280   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
281     return false;
282 
283   // Do not handle extending vector loads.
284   if (Ty.isVector() && MemSize != RegSize)
285     return false;
286 
287   // TODO: We should be able to widen loads if the alignment is high enough, but
288   // we also need to modify the memory access size.
289 #if 0
290   // Accept widening loads based on alignment.
291   if (IsLoad && MemSize < Size)
292     MemSize = std::max(MemSize, Align);
293 #endif
294 
295   // Only 1-byte and 2-byte to 32-bit extloads are valid.
296   if (MemSize != RegSize && RegSize != 32)
297     return false;
298 
299   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
300     return false;
301 
302   switch (MemSize) {
303   case 8:
304   case 16:
305   case 32:
306   case 64:
307   case 128:
308     break;
309   case 96:
310     if (!ST.hasDwordx3LoadStores())
311       return false;
312     break;
313   case 256:
314   case 512:
315     // These may contextually need to be broken down.
316     break;
317   default:
318     return false;
319   }
320 
321   assert(RegSize >= MemSize);
322 
323   if (AlignBits < MemSize) {
324     const SITargetLowering *TLI = ST.getTargetLowering();
325     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
326                                                  Align(AlignBits / 8)))
327       return false;
328   }
329 
330   return true;
331 }
332 
333 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128, etc., so
334 // work around this. Eventually it should ignore the type for loads and only
335 // care about the size. Return true in cases where we will work around this for
336 // now by bitcasting.
337 static bool loadStoreBitcastWorkaround(const LLT Ty) {
338   if (EnableNewLegality)
339     return false;
340 
341   const unsigned Size = Ty.getSizeInBits();
342   if (Size <= 64)
343     return false;
344   if (!Ty.isVector())
345     return true;
346 
347   LLT EltTy = Ty.getElementType();
348   if (EltTy.isPointer())
349     return true;
350 
351   unsigned EltSize = EltTy.getSizeInBits();
352   return EltSize != 32 && EltSize != 64;
353 }
354 
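// Combined legality check for loads and stores: the result must be a register
// type, the access size must be supported, and no bitcast workaround may be
// needed.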
355 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
356   const LLT Ty = Query.Types[0];
357   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
358          !loadStoreBitcastWorkaround(Ty);
359 }
360 
361 /// Return true if a load or store of the type should be lowered with a bitcast
362 /// to a different type.
363 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
364                                        const LLT MemTy) {
365   const unsigned MemSizeInBits = MemTy.getSizeInBits();
366   const unsigned Size = Ty.getSizeInBits();
367   if (Size != MemSizeInBits)
368     return Size <= 32 && Ty.isVector();
369 
370   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
371     return true;
372 
373   // Don't try to handle bitcasting vector ext loads for now.
374   return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
375          (Size <= 32 || isRegisterSize(Size)) &&
376          !isRegisterVectorElementType(Ty.getElementType());
377 }
378 
379 /// Return true if we should legalize a load by widening an odd-sized memory
380 /// access up to the alignment. Note that in this case the memory access itself
381 /// changes, not the size of the result register.
382 static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
383                             uint64_t AlignInBits, unsigned AddrSpace,
384                             unsigned Opcode) {
385   unsigned SizeInBits = MemoryTy.getSizeInBits();
386   // We don't want to widen cases that are naturally legal.
387   if (isPowerOf2_32(SizeInBits))
388     return false;
389 
390   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
391   // end up widening these for a scalar load during RegBankSelect, since there
392   // aren't 96-bit scalar loads.
393   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
394     return false;
395 
396   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
397     return false;
398 
399   // A load is known dereferenceable up to the alignment, so it's legal to widen
400   // to it.
401   //
402   // TODO: Could check dereferenceable for less aligned cases.
403   unsigned RoundedSize = NextPowerOf2(SizeInBits);
404   if (AlignInBits < RoundedSize)
405     return false;
406 
407   // Do not widen if it would introduce a slow unaligned load.
408   const SITargetLowering *TLI = ST.getTargetLowering();
409   bool Fast = false;
410   return TLI->allowsMisalignedMemoryAccessesImpl(
411              RoundedSize, AddrSpace, Align(AlignInBits / 8),
412              MachineMemOperand::MOLoad, &Fast) &&
413          Fast;
414 }
415 
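// LegalityQuery convenience overload of shouldWidenLoad; atomic accesses are
// never widened.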
416 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
417                             unsigned Opcode) {
418   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
419     return false;
420 
421   return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
422                          Query.MMODescrs[0].AlignInBits,
423                          Query.Types[1].getAddressSpace(), Opcode);
424 }
425 
426 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
427                                          const GCNTargetMachine &TM)
428   :  ST(ST_) {
429   using namespace TargetOpcode;
430 
431   auto GetAddrSpacePtr = [&TM](unsigned AS) {
432     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
433   };
434 
435   const LLT S1 = LLT::scalar(1);
436   const LLT S8 = LLT::scalar(8);
437   const LLT S16 = LLT::scalar(16);
438   const LLT S32 = LLT::scalar(32);
439   const LLT S64 = LLT::scalar(64);
440   const LLT S128 = LLT::scalar(128);
441   const LLT S256 = LLT::scalar(256);
442   const LLT S512 = LLT::scalar(512);
443   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
444 
445   const LLT V2S8 = LLT::fixed_vector(2, 8);
446   const LLT V2S16 = LLT::fixed_vector(2, 16);
447   const LLT V4S16 = LLT::fixed_vector(4, 16);
448 
449   const LLT V2S32 = LLT::fixed_vector(2, 32);
450   const LLT V3S32 = LLT::fixed_vector(3, 32);
451   const LLT V4S32 = LLT::fixed_vector(4, 32);
452   const LLT V5S32 = LLT::fixed_vector(5, 32);
453   const LLT V6S32 = LLT::fixed_vector(6, 32);
454   const LLT V7S32 = LLT::fixed_vector(7, 32);
455   const LLT V8S32 = LLT::fixed_vector(8, 32);
456   const LLT V9S32 = LLT::fixed_vector(9, 32);
457   const LLT V10S32 = LLT::fixed_vector(10, 32);
458   const LLT V11S32 = LLT::fixed_vector(11, 32);
459   const LLT V12S32 = LLT::fixed_vector(12, 32);
460   const LLT V13S32 = LLT::fixed_vector(13, 32);
461   const LLT V14S32 = LLT::fixed_vector(14, 32);
462   const LLT V15S32 = LLT::fixed_vector(15, 32);
463   const LLT V16S32 = LLT::fixed_vector(16, 32);
464   const LLT V32S32 = LLT::fixed_vector(32, 32);
465 
466   const LLT V2S64 = LLT::fixed_vector(2, 64);
467   const LLT V3S64 = LLT::fixed_vector(3, 64);
468   const LLT V4S64 = LLT::fixed_vector(4, 64);
469   const LLT V5S64 = LLT::fixed_vector(5, 64);
470   const LLT V6S64 = LLT::fixed_vector(6, 64);
471   const LLT V7S64 = LLT::fixed_vector(7, 64);
472   const LLT V8S64 = LLT::fixed_vector(8, 64);
473   const LLT V16S64 = LLT::fixed_vector(16, 64);
474 
475   std::initializer_list<LLT> AllS32Vectors =
476     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
477      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
478   std::initializer_list<LLT> AllS64Vectors =
479     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
480 
481   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
482   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
483   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
484   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
485   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
486   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
487   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
488 
489   const LLT CodePtr = FlatPtr;
490 
491   const std::initializer_list<LLT> AddrSpaces64 = {
492     GlobalPtr, ConstantPtr, FlatPtr
493   };
494 
495   const std::initializer_list<LLT> AddrSpaces32 = {
496     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
497   };
498 
499   const std::initializer_list<LLT> FPTypesBase = {
500     S32, S64
501   };
502 
503   const std::initializer_list<LLT> FPTypes16 = {
504     S32, S64, S16
505   };
506 
507   const std::initializer_list<LLT> FPTypesPK16 = {
508     S32, S64, S16, V2S16
509   };
510 
511   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
512 
513   // s1 for VCC branches, s32 for SCC branches.
514   getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
515 
516   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
517   // elements for v3s16
518   getActionDefinitionsBuilder(G_PHI)
519     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
520     .legalFor(AllS32Vectors)
521     .legalFor(AllS64Vectors)
522     .legalFor(AddrSpaces64)
523     .legalFor(AddrSpaces32)
524     .legalIf(isPointer(0))
525     .clampScalar(0, S16, S256)
526     .widenScalarToNextPow2(0, 32)
527     .clampMaxNumElements(0, S32, 16)
528     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
529     .scalarize(0);
530 
531   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
532     // Full set of gfx9 features.
533     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
534       .legalFor({S32, S16, V2S16})
535       .minScalar(0, S16)
536       .clampMaxNumElementsStrict(0, S16, 2)
537       .widenScalarToNextMultipleOf(0, 32)
538       .maxScalar(0, S32)
539       .scalarize(0);
540 
541     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
542       .legalFor({S32, S16, V2S16}) // Clamp modifier
543       .minScalarOrElt(0, S16)
544       .clampMaxNumElementsStrict(0, S16, 2)
545       .scalarize(0)
546       .widenScalarToNextPow2(0, 32)
547       .lower();
548   } else if (ST.has16BitInsts()) {
549     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
550       .legalFor({S32, S16})
551       .minScalar(0, S16)
552       .widenScalarToNextMultipleOf(0, 32)
553       .maxScalar(0, S32)
554       .scalarize(0);
555 
556     // Technically the saturating operations require clamp bit support, but this
557     // was introduced at the same time as 16-bit operations.
558     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
559       .legalFor({S32, S16}) // Clamp modifier
560       .minScalar(0, S16)
561       .scalarize(0)
562       .widenScalarToNextPow2(0, 16)
563       .lower();
564 
565     // We're just lowering this, but it helps get a better result to try to
566     // coerce to the desired type first.
567     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
568       .minScalar(0, S16)
569       .scalarize(0)
570       .lower();
571   } else {
572     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
573       .legalFor({S32})
574       .widenScalarToNextMultipleOf(0, 32)
575       .clampScalar(0, S32, S32)
576       .scalarize(0);
577 
578     if (ST.hasIntClamp()) {
579       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
580         .legalFor({S32}) // Clamp modifier.
581         .scalarize(0)
582         .minScalarOrElt(0, S32)
583         .lower();
584     } else {
585       // Clamp bit support was added in VI, along with 16-bit operations.
586       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
587         .minScalar(0, S32)
588         .scalarize(0)
589         .lower();
590     }
591 
592     // FIXME: DAG expansion gets better results. The widening uses the smaller
593     // range values and goes for the min/max lowering directly.
594     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
595       .minScalar(0, S32)
596       .scalarize(0)
597       .lower();
598   }
599 
600   getActionDefinitionsBuilder(
601       {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
602       .customFor({S32, S64})
603       .clampScalar(0, S32, S64)
604       .widenScalarToNextPow2(0, 32)
605       .scalarize(0);
606 
607   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
608                    .legalFor({S32})
609                    .maxScalar(0, S32);
610 
611   if (ST.hasVOP3PInsts()) {
612     Mulh
613       .clampMaxNumElements(0, S8, 2)
614       .lowerFor({V2S8});
615   }
616 
617   Mulh
618     .scalarize(0)
619     .lower();
620 
621   // Report legal for any types we can handle anywhere. For the cases only legal
622   // on the SALU, RegBankSelect will be able to re-legalize.
623   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
624     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
625     .clampScalar(0, S32, S64)
626     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
627     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
628     .widenScalarToNextPow2(0)
629     .scalarize(0);
630 
631   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
632                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
633     .legalFor({{S32, S1}, {S32, S32}})
634     .minScalar(0, S32)
635     // TODO: .scalarize(0)
636     .lower();
637 
638   getActionDefinitionsBuilder(G_BITCAST)
639     // Don't worry about the size constraint.
640     .legalIf(all(isRegisterType(0), isRegisterType(1)))
641     .lower();
642 
643 
644   getActionDefinitionsBuilder(G_CONSTANT)
645     .legalFor({S1, S32, S64, S16, GlobalPtr,
646                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
647     .legalIf(isPointer(0))
648     .clampScalar(0, S32, S64)
649     .widenScalarToNextPow2(0);
650 
651   getActionDefinitionsBuilder(G_FCONSTANT)
652     .legalFor({S32, S64, S16})
653     .clampScalar(0, S16, S64);
654 
655   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
656       .legalIf(isRegisterType(0))
657       // s1 and s16 are special cases because they have legal operations on
658       // them, but don't really occupy registers in the normal way.
659       .legalFor({S1, S16})
660       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
661       .clampScalarOrElt(0, S32, MaxScalar)
662       .widenScalarToNextPow2(0, 32)
663       .clampMaxNumElements(0, S32, 16);
664 
665   getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
666 
667   // If the amount is divergent, we have to do a wave reduction to get the
668   // maximum value, so this is expanded during RegBankSelect.
669   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
670     .legalFor({{PrivatePtr, S32}});
671 
672   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
673     .customIf(typeIsNot(0, PrivatePtr));
674 
675   getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
676 
677   auto &FPOpActions = getActionDefinitionsBuilder(
678     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
679     .legalFor({S32, S64});
680   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
681     .customFor({S32, S64});
682   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
683     .customFor({S32, S64});
684 
685   if (ST.has16BitInsts()) {
686     if (ST.hasVOP3PInsts())
687       FPOpActions.legalFor({S16, V2S16});
688     else
689       FPOpActions.legalFor({S16});
690 
691     TrigActions.customFor({S16});
692     FDIVActions.customFor({S16});
693   }
694 
695   auto &MinNumMaxNum = getActionDefinitionsBuilder({
696       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
697 
698   if (ST.hasVOP3PInsts()) {
699     MinNumMaxNum.customFor(FPTypesPK16)
700       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
701       .clampMaxNumElements(0, S16, 2)
702       .clampScalar(0, S16, S64)
703       .scalarize(0);
704   } else if (ST.has16BitInsts()) {
705     MinNumMaxNum.customFor(FPTypes16)
706       .clampScalar(0, S16, S64)
707       .scalarize(0);
708   } else {
709     MinNumMaxNum.customFor(FPTypesBase)
710       .clampScalar(0, S32, S64)
711       .scalarize(0);
712   }
713 
714   if (ST.hasVOP3PInsts())
715     FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
716 
717   FPOpActions
718     .scalarize(0)
719     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
720 
721   TrigActions
722     .scalarize(0)
723     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
724 
725   FDIVActions
726     .scalarize(0)
727     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
728 
729   getActionDefinitionsBuilder({G_FNEG, G_FABS})
730     .legalFor(FPTypesPK16)
731     .clampMaxNumElementsStrict(0, S16, 2)
732     .scalarize(0)
733     .clampScalar(0, S16, S64);
734 
735   if (ST.has16BitInsts()) {
736     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
737       .legalFor({S32, S64, S16})
738       .scalarize(0)
739       .clampScalar(0, S16, S64);
740   } else {
741     getActionDefinitionsBuilder(G_FSQRT)
742       .legalFor({S32, S64})
743       .scalarize(0)
744       .clampScalar(0, S32, S64);
745 
746     if (ST.hasFractBug()) {
747       getActionDefinitionsBuilder(G_FFLOOR)
748         .customFor({S64})
749         .legalFor({S32, S64})
750         .scalarize(0)
751         .clampScalar(0, S32, S64);
752     } else {
753       getActionDefinitionsBuilder(G_FFLOOR)
754         .legalFor({S32, S64})
755         .scalarize(0)
756         .clampScalar(0, S32, S64);
757     }
758   }
759 
760   getActionDefinitionsBuilder(G_FPTRUNC)
761     .legalFor({{S32, S64}, {S16, S32}})
762     .scalarize(0)
763     .lower();
764 
765   getActionDefinitionsBuilder(G_FPEXT)
766     .legalFor({{S64, S32}, {S32, S16}})
767     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
768     .scalarize(0);
769 
770   getActionDefinitionsBuilder(G_FSUB)
771       // Use actual fsub instruction
772       .legalFor({S32})
773       // Must use fadd + fneg
774       .lowerFor({S64, S16, V2S16})
775       .scalarize(0)
776       .clampScalar(0, S32, S64);
777 
778   // Whether this is legal depends on the floating point mode for the function.
779   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
780   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
781     FMad.customFor({S32, S16});
782   else if (ST.hasMadMacF32Insts())
783     FMad.customFor({S32});
784   else if (ST.hasMadF16())
785     FMad.customFor({S16});
786   FMad.scalarize(0)
787       .lower();
788 
789   auto &FRem = getActionDefinitionsBuilder(G_FREM);
790   if (ST.has16BitInsts()) {
791     FRem.customFor({S16, S32, S64});
792   } else {
793     FRem.minScalar(0, S32)
794         .customFor({S32, S64});
795   }
796   FRem.scalarize(0);
797 
798   // TODO: Do we need to clamp maximum bitwidth?
799   getActionDefinitionsBuilder(G_TRUNC)
800     .legalIf(isScalar(0))
801     .legalFor({{V2S16, V2S32}})
802     .clampMaxNumElements(0, S16, 2)
803     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
804     // situations (like an invalid implicit use), we don't want to infinite loop
805     // in the legalizer.
806     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
807     .alwaysLegal();
808 
809   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
810     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
811                {S32, S1}, {S64, S1}, {S16, S1}})
812     .scalarize(0)
813     .clampScalar(0, S32, S64)
814     .widenScalarToNextPow2(1, 32);
815 
816   // TODO: Split s1->s64 during regbankselect for VALU.
817   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
818                     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
819                     .lowerIf(typeIs(1, S1))
820                     .customFor({{S32, S64}, {S64, S64}});
821   if (ST.has16BitInsts())
822     IToFP.legalFor({{S16, S16}});
823   IToFP.clampScalar(1, S32, S64)
824        .minScalar(0, S32)
825        .scalarize(0)
826        .widenScalarToNextPow2(1);
827 
828   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
829     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
830     .customFor({{S64, S32}, {S64, S64}})
831     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
832   if (ST.has16BitInsts())
833     FPToI.legalFor({{S16, S16}});
834   else
835     FPToI.minScalar(1, S32);
836 
837   FPToI.minScalar(0, S32)
838        .widenScalarToNextPow2(0, 32)
839        .scalarize(0)
840        .lower();
841 
842   // Lower roundeven into G_FRINT
843   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
844     .scalarize(0)
845     .lower();
846 
847   if (ST.has16BitInsts()) {
848     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
849       .legalFor({S16, S32, S64})
850       .clampScalar(0, S16, S64)
851       .scalarize(0);
852   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
853     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
854       .legalFor({S32, S64})
855       .clampScalar(0, S32, S64)
856       .scalarize(0);
857   } else {
858     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
859       .legalFor({S32})
860       .customFor({S64})
861       .clampScalar(0, S32, S64)
862       .scalarize(0);
863   }
864 
865   getActionDefinitionsBuilder(G_PTR_ADD)
866     .legalIf(all(isPointer(0), sameSize(0, 1)))
867     .scalarize(0)
868     .scalarSameSizeAs(1, 0);
869 
870   getActionDefinitionsBuilder(G_PTRMASK)
871     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
872     .scalarSameSizeAs(1, 0)
873     .scalarize(0);
874 
875   auto &CmpBuilder =
876     getActionDefinitionsBuilder(G_ICMP)
877     // The compare output type differs based on the register bank of the output,
878     // so make both s1 and s32 legal.
879     //
880     // Scalar compares producing output in scc will be promoted to s32, as that
881     // is the allocatable register type that will be needed for the copy from
882     // scc. This will be promoted during RegBankSelect, and we assume something
883     // before that won't try to use s32 result types.
884     //
885     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
886     // bank.
887     .legalForCartesianProduct(
888       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
889     .legalForCartesianProduct(
890       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
891   if (ST.has16BitInsts()) {
892     CmpBuilder.legalFor({{S1, S16}});
893   }
894 
895   CmpBuilder
896     .widenScalarToNextPow2(1)
897     .clampScalar(1, S32, S64)
898     .scalarize(0)
899     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
900 
901   getActionDefinitionsBuilder(G_FCMP)
902     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
903     .widenScalarToNextPow2(1)
904     .clampScalar(1, S32, S64)
905     .scalarize(0);
906 
907   // FIXME: fpow has a selection pattern that should move to custom lowering.
908   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
909   if (ST.has16BitInsts())
910     Exp2Ops.legalFor({S32, S16});
911   else
912     Exp2Ops.legalFor({S32});
913   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
914   Exp2Ops.scalarize(0);
915 
916   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
917   if (ST.has16BitInsts())
918     ExpOps.customFor({{S32}, {S16}});
919   else
920     ExpOps.customFor({S32});
921   ExpOps.clampScalar(0, MinScalarFPTy, S32)
922         .scalarize(0);
923 
924   getActionDefinitionsBuilder(G_FPOWI)
925     .clampScalar(0, MinScalarFPTy, S32)
926     .lower();
927 
928   // The 64-bit versions produce 32-bit results, but only on the SALU.
929   getActionDefinitionsBuilder(G_CTPOP)
930     .legalFor({{S32, S32}, {S32, S64}})
931     .clampScalar(0, S32, S32)
932     .widenScalarToNextPow2(1, 32)
933     .clampScalar(1, S32, S64)
934     .scalarize(0)
935     .widenScalarToNextPow2(0, 32);
936 
937 
938   // The hardware instructions return a different result on 0 than the generic
939   // instructions expect. The hardware produces -1, but these produce the
940   // bitwidth.
941   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
942     .scalarize(0)
943     .clampScalar(0, S32, S32)
944     .clampScalar(1, S32, S64)
945     .widenScalarToNextPow2(0, 32)
946     .widenScalarToNextPow2(1, 32)
947     .custom();
948 
949   // The 64-bit versions produce 32-bit results, but only on the SALU.
950   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
951     .legalFor({{S32, S32}, {S32, S64}})
952     .clampScalar(0, S32, S32)
953     .clampScalar(1, S32, S64)
954     .scalarize(0)
955     .widenScalarToNextPow2(0, 32)
956     .widenScalarToNextPow2(1, 32);
957 
958   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
959   // RegBankSelect.
960   getActionDefinitionsBuilder(G_BITREVERSE)
961     .legalFor({S32, S64})
962     .clampScalar(0, S32, S64)
963     .scalarize(0)
964     .widenScalarToNextPow2(0);
965 
966   if (ST.has16BitInsts()) {
967     getActionDefinitionsBuilder(G_BSWAP)
968       .legalFor({S16, S32, V2S16})
969       .clampMaxNumElementsStrict(0, S16, 2)
970       // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
971       // narrowScalar limitation.
972       .widenScalarToNextPow2(0)
973       .clampScalar(0, S16, S32)
974       .scalarize(0);
975 
976     if (ST.hasVOP3PInsts()) {
977       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
978         .legalFor({S32, S16, V2S16})
979         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
980         .clampMaxNumElements(0, S16, 2)
981         .minScalar(0, S16)
982         .widenScalarToNextPow2(0)
983         .scalarize(0)
984         .lower();
985     } else {
986       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
987         .legalFor({S32, S16})
988         .widenScalarToNextPow2(0)
989         .minScalar(0, S16)
990         .scalarize(0)
991         .lower();
992     }
993   } else {
994     // TODO: Should have same legality without v_perm_b32
995     getActionDefinitionsBuilder(G_BSWAP)
996       .legalFor({S32})
997       .lowerIf(scalarNarrowerThan(0, 32))
998       // FIXME: Fixing non-power-of-2 before clamp is a workaround for a
999       // narrowScalar limitation.
1000       .widenScalarToNextPow2(0)
1001       .maxScalar(0, S32)
1002       .scalarize(0)
1003       .lower();
1004 
1005     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1006       .legalFor({S32})
1007       .minScalar(0, S32)
1008       .widenScalarToNextPow2(0)
1009       .scalarize(0)
1010       .lower();
1011   }
1012 
1013   getActionDefinitionsBuilder(G_INTTOPTR)
1014     // List the common cases
1015     .legalForCartesianProduct(AddrSpaces64, {S64})
1016     .legalForCartesianProduct(AddrSpaces32, {S32})
1017     .scalarize(0)
1018     // Accept any address space as long as the size matches
1019     .legalIf(sameSize(0, 1))
1020     .widenScalarIf(smallerThan(1, 0),
1021       [](const LegalityQuery &Query) {
1022         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1023       })
1024     .narrowScalarIf(largerThan(1, 0),
1025       [](const LegalityQuery &Query) {
1026         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1027       });
1028 
1029   getActionDefinitionsBuilder(G_PTRTOINT)
1030     // List the common cases
1031     .legalForCartesianProduct(AddrSpaces64, {S64})
1032     .legalForCartesianProduct(AddrSpaces32, {S32})
1033     .scalarize(0)
1034     // Accept any address space as long as the size matches
1035     .legalIf(sameSize(0, 1))
1036     .widenScalarIf(smallerThan(0, 1),
1037       [](const LegalityQuery &Query) {
1038         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1039       })
1040     .narrowScalarIf(
1041       largerThan(0, 1),
1042       [](const LegalityQuery &Query) {
1043         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1044       });
1045 
1046   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1047     .scalarize(0)
1048     .custom();
1049 
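  // Return true if this memory access must be split into multiple operations:
  // vector extloads, accesses wider than the address space supports, or odd
  // register counts that were not widened.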
1050   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1051                                     bool IsLoad) -> bool {
1052     const LLT DstTy = Query.Types[0];
1053 
1054     // Split vector extloads.
1055     unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1056 
1057     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1058       return true;
1059 
1060     const LLT PtrTy = Query.Types[1];
1061     unsigned AS = PtrTy.getAddressSpace();
1062     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1063       return true;
1064 
1065     // Catch weird sized loads that don't evenly divide into the access sizes
1066     // TODO: May be able to widen depending on alignment etc.
1067     unsigned NumRegs = (MemSize + 31) / 32;
1068     if (NumRegs == 3) {
1069       if (!ST.hasDwordx3LoadStores())
1070         return true;
1071     } else {
1072       // If the alignment allows, these should have been widened.
1073       if (!isPowerOf2_32(NumRegs))
1074         return true;
1075     }
1076 
1077     return false;
1078   };
1079 
1080   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1081   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1082   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1083 
1084   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1085   // LDS
1086   // TODO: Unsupported flat for SI.
1087 
1088   for (unsigned Op : {G_LOAD, G_STORE}) {
1089     const bool IsStore = Op == G_STORE;
1090 
1091     auto &Actions = getActionDefinitionsBuilder(Op);
1092     // Explicitly list some common cases.
1093     // TODO: Does this help compile time at all?
1094     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1095                                       {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1096                                       {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1097                                       {S64, GlobalPtr, S64, GlobalAlign32},
1098                                       {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1099                                       {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1100                                       {S32, GlobalPtr, S8, GlobalAlign8},
1101                                       {S32, GlobalPtr, S16, GlobalAlign16},
1102 
1103                                       {S32, LocalPtr, S32, 32},
1104                                       {S64, LocalPtr, S64, 32},
1105                                       {V2S32, LocalPtr, V2S32, 32},
1106                                       {S32, LocalPtr, S8, 8},
1107                                       {S32, LocalPtr, S16, 16},
1108                                       {V2S16, LocalPtr, S32, 32},
1109 
1110                                       {S32, PrivatePtr, S32, 32},
1111                                       {S32, PrivatePtr, S8, 8},
1112                                       {S32, PrivatePtr, S16, 16},
1113                                       {V2S16, PrivatePtr, S32, 32},
1114 
1115                                       {S32, ConstantPtr, S32, GlobalAlign32},
1116                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1117                                       {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1118                                       {S64, ConstantPtr, S64, GlobalAlign32},
1119                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1120     Actions.legalIf(
1121       [=](const LegalityQuery &Query) -> bool {
1122         return isLoadStoreLegal(ST, Query);
1123       });
1124 
1125     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1126     // 64-bits.
1127     //
1128     // TODO: Should generalize bitcast action into coerce, which will also cover
1129     // inserting addrspacecasts.
1130     Actions.customIf(typeIs(1, Constant32Ptr));
1131 
1132     // Turn any illegal element vectors into something easier to deal
1133     // with. These will ultimately produce 32-bit scalar shifts to extract the
1134     // parts anyway.
1135     //
1136     // For odd 16-bit element vectors, prefer to split those into pieces with
1137     // 16-bit vector parts.
1138     Actions.bitcastIf(
1139       [=](const LegalityQuery &Query) -> bool {
1140         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1141                                           Query.MMODescrs[0].MemoryTy);
1142       }, bitcastToRegisterType(0));
1143 
1144     if (!IsStore) {
1145       // Widen suitably aligned loads by loading extra bytes. The standard
1146       // legalization actions can't properly express widening memory operands.
1147       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1148         return shouldWidenLoad(ST, Query, G_LOAD);
1149       });
1150     }
1151 
1152     // FIXME: load/store narrowing should be moved to lower action
1153     Actions
1154         .narrowScalarIf(
1155             [=](const LegalityQuery &Query) -> bool {
1156               return !Query.Types[0].isVector() &&
1157                      needToSplitMemOp(Query, Op == G_LOAD);
1158             },
1159             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1160               const LLT DstTy = Query.Types[0];
1161               const LLT PtrTy = Query.Types[1];
1162 
1163               const unsigned DstSize = DstTy.getSizeInBits();
1164               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1165 
1166               // Split extloads.
1167               if (DstSize > MemSize)
1168                 return std::make_pair(0, LLT::scalar(MemSize));
1169 
1170               unsigned MaxSize = maxSizeForAddrSpace(ST,
1171                                                      PtrTy.getAddressSpace(),
1172                                                      Op == G_LOAD);
1173               if (MemSize > MaxSize)
1174                 return std::make_pair(0, LLT::scalar(MaxSize));
1175 
1176               uint64_t Align = Query.MMODescrs[0].AlignInBits;
1177               return std::make_pair(0, LLT::scalar(Align));
1178             })
1179         .fewerElementsIf(
1180             [=](const LegalityQuery &Query) -> bool {
1181               return Query.Types[0].isVector() &&
1182                      needToSplitMemOp(Query, Op == G_LOAD);
1183             },
1184             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1185               const LLT DstTy = Query.Types[0];
1186               const LLT PtrTy = Query.Types[1];
1187 
1188               LLT EltTy = DstTy.getElementType();
1189               unsigned MaxSize = maxSizeForAddrSpace(ST,
1190                                                      PtrTy.getAddressSpace(),
1191                                                      Op == G_LOAD);
1192 
1193               // FIXME: Handle widened to power of 2 results better. This ends
1194               // up scalarizing.
1195               // FIXME: 3 element stores scalarized on SI
1196 
1197               // Split if it's too large for the address space.
1198               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1199               if (MemSize > MaxSize) {
1200                 unsigned NumElts = DstTy.getNumElements();
1201                 unsigned EltSize = EltTy.getSizeInBits();
1202 
1203                 if (MaxSize % EltSize == 0) {
1204                   return std::make_pair(
1205                       0, LLT::scalarOrVector(
1206                              ElementCount::getFixed(MaxSize / EltSize), EltTy));
1207                 }
1208 
1209                 unsigned NumPieces = MemSize / MaxSize;
1210 
1211                 // FIXME: Refine when odd breakdowns handled
1212                 // The scalars will need to be re-legalized.
1213                 if (NumPieces == 1 || NumPieces >= NumElts ||
1214                     NumElts % NumPieces != 0)
1215                   return std::make_pair(0, EltTy);
1216 
1217                 return std::make_pair(
1218                     0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
1219               }
1220 
1221               // FIXME: We could probably handle weird extending loads better.
1222               if (DstTy.getSizeInBits() > MemSize)
1223                 return std::make_pair(0, EltTy);
1224 
1225               unsigned EltSize = EltTy.getSizeInBits();
1226               unsigned DstSize = DstTy.getSizeInBits();
1227               if (!isPowerOf2_32(DstSize)) {
1228                 // We're probably decomposing an odd sized store. Try to split
1229                 // to the widest type. TODO: Account for alignment. As-is it
1230                 // should be OK, since the new parts will be further legalized.
1231                 unsigned FloorSize = PowerOf2Floor(DstSize);
1232                 return std::make_pair(
1233                     0, LLT::scalarOrVector(
1234                            ElementCount::getFixed(FloorSize / EltSize), EltTy));
1235               }
1236 
1237               // May need relegalization for the scalars.
1238               return std::make_pair(0, EltTy);
1239             })
1240     .minScalar(0, S32)
1241     .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
1242     .widenScalarToNextPow2(0)
1243     .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1244     .lower();
1245   }
1246 
1247   // FIXME: Unaligned accesses not lowered.
1248   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1249                        .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1250                                                   {S32, GlobalPtr, S16, 2 * 8},
1251                                                   {S32, LocalPtr, S8, 8},
1252                                                   {S32, LocalPtr, S16, 16},
1253                                                   {S32, PrivatePtr, S8, 8},
1254                                                   {S32, PrivatePtr, S16, 16},
1255                                                   {S32, ConstantPtr, S8, 8},
1256                                                   {S32, ConstantPtr, S16, 2 * 8}})
1257                        .legalIf(
1258                          [=](const LegalityQuery &Query) -> bool {
1259                            return isLoadStoreLegal(ST, Query);
1260                          });
1261 
1262   if (ST.hasFlatAddressSpace()) {
1263     ExtLoads.legalForTypesWithMemDesc(
1264         {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1265   }
1266 
1267   // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1268   // 64-bits.
1269   //
1270   // TODO: Should generalize bitcast action into coerce, which will also cover
1271   // inserting addrspacecasts.
1272   ExtLoads.customIf(typeIs(1, Constant32Ptr));
1273 
1274   ExtLoads.clampScalar(0, S32, S32)
1275           .widenScalarToNextPow2(0)
1276           .lower();
1277 
1278   auto &Atomics = getActionDefinitionsBuilder(
1279     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1280      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1281      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1282      G_ATOMICRMW_UMIN})
1283     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1284                {S64, GlobalPtr}, {S64, LocalPtr},
1285                {S32, RegionPtr}, {S64, RegionPtr}});
1286   if (ST.hasFlatAddressSpace()) {
1287     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1288   }
1289 
1290   auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1291   if (ST.hasLDSFPAtomicAdd()) {
1292     Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1293     if (ST.hasGFX90AInsts())
1294       Atomic.legalFor({{S64, LocalPtr}});
1295   }
1296   if (ST.hasAtomicFaddInsts())
1297     Atomic.legalFor({{S32, GlobalPtr}});
1298 
1299   if (ST.hasGFX90AInsts()) {
1300     // These are legal with some caveats, and should have undergone expansion in
1301     // the IR in most situations
1302     // TODO: Move atomic expansion into legalizer
1303     // TODO: Also supports <2 x f16>
1304     Atomic.legalFor({
1305         {S32, GlobalPtr},
1306         {S64, GlobalPtr},
1307         {S64, FlatPtr}
1308       });
1309   }
1310 
1311   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and
1312   // output demarshalling.
1313   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1314     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1315                 {S32, FlatPtr}, {S64, FlatPtr}})
1316     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1317                {S32, RegionPtr}, {S64, RegionPtr}});
1318   // TODO: Pointer types, any 32-bit or 64-bit vector
1319 
1320   // Condition should be s32 for scalar, s1 for vector.
1321   getActionDefinitionsBuilder(G_SELECT)
1322       .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1323                                  LocalPtr, FlatPtr, PrivatePtr,
1324                                  LLT::fixed_vector(2, LocalPtr),
1325                                  LLT::fixed_vector(2, PrivatePtr)},
1326                                 {S1, S32})
1327       .clampScalar(0, S16, S64)
1328       .scalarize(1)
1329       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1330       .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1331       .clampMaxNumElements(0, S32, 2)
1332       .clampMaxNumElements(0, LocalPtr, 2)
1333       .clampMaxNumElements(0, PrivatePtr, 2)
1334       .scalarize(0)
1335       .widenScalarToNextPow2(0)
1336       .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1337 
1338   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1339   // be more flexible with the shift amount type.
1340   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1341     .legalFor({{S32, S32}, {S64, S32}});
1342   if (ST.has16BitInsts()) {
1343     if (ST.hasVOP3PInsts()) {
1344       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1345             .clampMaxNumElements(0, S16, 2);
1346     } else
1347       Shifts.legalFor({{S16, S16}});
1348 
1349     // TODO: Support 16-bit shift amounts for all types
1350     Shifts.widenScalarIf(
1351       [=](const LegalityQuery &Query) {
1352         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1353         // 32-bit amount.
1354         const LLT ValTy = Query.Types[0];
1355         const LLT AmountTy = Query.Types[1];
1356         return ValTy.getSizeInBits() <= 16 &&
1357                AmountTy.getSizeInBits() < 16;
1358       }, changeTo(1, S16));
1359     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1360     Shifts.clampScalar(1, S32, S32);
1361     Shifts.widenScalarToNextPow2(0, 16);
1362     Shifts.clampScalar(0, S16, S64);
1363 
1364     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1365       .minScalar(0, S16)
1366       .scalarize(0)
1367       .lower();
1368   } else {
1369     // Make sure we legalize the shift amount type first, as the general
1370     // expansion for the shifted type will produce much worse code if it hasn't
1371     // been truncated already.
1372     Shifts.clampScalar(1, S32, S32);
1373     Shifts.widenScalarToNextPow2(0, 32);
1374     Shifts.clampScalar(0, S32, S64);
1375 
1376     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1377       .minScalar(0, S32)
1378       .scalarize(0)
1379       .lower();
1380   }
1381   Shifts.scalarize(0);
1382 
1383   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1384     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1385     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1386     unsigned IdxTypeIdx = 2;
1387 
1388     getActionDefinitionsBuilder(Op)
1389       .customIf([=](const LegalityQuery &Query) {
1390           const LLT EltTy = Query.Types[EltTypeIdx];
1391           const LLT VecTy = Query.Types[VecTypeIdx];
1392           const LLT IdxTy = Query.Types[IdxTypeIdx];
1393           const unsigned EltSize = EltTy.getSizeInBits();
1394           return (EltSize == 32 || EltSize == 64) &&
1395                   VecTy.getSizeInBits() % 32 == 0 &&
1396                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1397                   IdxTy.getSizeInBits() == 32;
1398         })
1399       .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1400                  bitcastToVectorElement32(VecTypeIdx))
1401       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1402       .bitcastIf(
1403         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1404         [=](const LegalityQuery &Query) {
1405           // For > 64-bit element types, try to turn this into a 64-bit
1406           // element vector since we may be able to do better indexing
1407           // if this is scalar. If not, fall back to 32.
1408           const LLT EltTy = Query.Types[EltTypeIdx];
1409           const LLT VecTy = Query.Types[VecTypeIdx];
1410           const unsigned DstEltSize = EltTy.getSizeInBits();
1411           const unsigned VecSize = VecTy.getSizeInBits();
1412 
1413           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1414           return std::make_pair(
1415               VecTypeIdx,
1416               LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1417         })
1418       .clampScalar(EltTypeIdx, S32, S64)
1419       .clampScalar(VecTypeIdx, S32, S64)
1420       .clampScalar(IdxTypeIdx, S32, S32)
1421       .clampMaxNumElements(VecTypeIdx, S32, 32)
1422       // TODO: Clamp elements for 64-bit vectors?
1423       // It should only be necessary with variable indexes.
1424       // As a last resort, lower to the stack
1425       .lower();
1426   }
1427 
1428   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1429     .unsupportedIf([=](const LegalityQuery &Query) {
1430         const LLT &EltTy = Query.Types[1].getElementType();
1431         return Query.Types[0] != EltTy;
1432       });
1433 
1434   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1435     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1436     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1437 
1438     // FIXME: Doesn't handle extract of illegal sizes.
1439     getActionDefinitionsBuilder(Op)
1440       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1441       .lowerIf([=](const LegalityQuery &Query) {
1442           // Sub-vector (or single element) insert and extract.
1443           // TODO: verify immediate offset here since lower only works with
1444           // whole elements.
1445           const LLT BigTy = Query.Types[BigTyIdx];
1446           return BigTy.isVector();
1447         })
1448       // FIXME: Multiples of 16 should not be legal.
1449       .legalIf([=](const LegalityQuery &Query) {
1450           const LLT BigTy = Query.Types[BigTyIdx];
1451           const LLT LitTy = Query.Types[LitTyIdx];
1452           return (BigTy.getSizeInBits() % 32 == 0) &&
1453                  (LitTy.getSizeInBits() % 16 == 0);
1454         })
1455       .widenScalarIf(
1456         [=](const LegalityQuery &Query) {
1457           const LLT BigTy = Query.Types[BigTyIdx];
1458           return (BigTy.getScalarSizeInBits() < 16);
1459         },
1460         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1461       .widenScalarIf(
1462         [=](const LegalityQuery &Query) {
1463           const LLT LitTy = Query.Types[LitTyIdx];
1464           return (LitTy.getScalarSizeInBits() < 16);
1465         },
1466         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1467       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1468       .widenScalarToNextPow2(BigTyIdx, 32);
1469 
1470   }
1471 
1472   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1473     .legalForCartesianProduct(AllS32Vectors, {S32})
1474     .legalForCartesianProduct(AllS64Vectors, {S64})
1475     .clampNumElements(0, V16S32, V32S32)
1476     .clampNumElements(0, V2S64, V16S64)
1477     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1478 
1479   if (ST.hasScalarPackInsts()) {
1480     BuildVector
1481       // FIXME: Should probably widen s1 vectors straight to s32
1482       .minScalarOrElt(0, S16)
1483       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1484       .minScalar(1, S32);
1485 
1486     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1487       .legalFor({V2S16, S32})
1488       .lower();
1489     BuildVector.minScalarOrElt(0, S32);
1490   } else {
1491     BuildVector.customFor({V2S16, S16});
1492     BuildVector.minScalarOrElt(0, S32);
1493 
1494     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1495       .customFor({V2S16, S32})
1496       .lower();
1497   }
1498 
1499   BuildVector.legalIf(isRegisterType(0));
1500 
1501   // FIXME: Clamp maximum size
1502   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1503     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1504     .clampMaxNumElements(0, S32, 32)
1505     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1506     .clampMaxNumElements(0, S16, 64);
1507 
1508   // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1509   // pre-legalize.
1510   if (ST.hasVOP3PInsts()) {
1511     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1512       .customFor({V2S16, V2S16})
1513       .lower();
1514   } else
1515     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1516 
1517   // Merge/Unmerge
1518   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1519     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1520     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1521 
1522     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1523       const LLT Ty = Query.Types[TypeIdx];
1524       if (Ty.isVector()) {
1525         const LLT &EltTy = Ty.getElementType();
1526         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1527           return true;
1528         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1529           return true;
1530       }
1531       return false;
1532     };
1533 
1534     auto &Builder = getActionDefinitionsBuilder(Op)
1535       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1536       .lowerFor({{S16, V2S16}})
1537       .lowerIf([=](const LegalityQuery &Query) {
1538           const LLT BigTy = Query.Types[BigTyIdx];
1539           return BigTy.getSizeInBits() == 32;
1540         })
1541       // Try to widen to s16 first for small types.
1542       // TODO: Only do this on targets with legal s16 shifts
1543       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1544       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1545       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1546       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1547                            elementTypeIs(1, S16)),
1548                        changeTo(1, V2S16))
1549       // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1550       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1551       // valid.
1552       .clampScalar(LitTyIdx, S32, S512)
1553       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1554       // Break up vectors with weird elements into scalars
1555       .fewerElementsIf(
1556         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1557         scalarize(0))
1558       .fewerElementsIf(
1559         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1560         scalarize(1))
1561       .clampScalar(BigTyIdx, S32, MaxScalar);
1562 
1563     if (Op == G_MERGE_VALUES) {
1564       Builder.widenScalarIf(
1565         // TODO: Use 16-bit shifts if legal for 8-bit values?
1566         [=](const LegalityQuery &Query) {
1567           const LLT Ty = Query.Types[LitTyIdx];
1568           return Ty.getSizeInBits() < 32;
1569         },
1570         changeTo(LitTyIdx, S32));
1571     }
1572 
1573     Builder.widenScalarIf(
1574       [=](const LegalityQuery &Query) {
1575         const LLT Ty = Query.Types[BigTyIdx];
1576         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1577           Ty.getSizeInBits() % 16 != 0;
1578       },
1579       [=](const LegalityQuery &Query) {
1580         // Pick the next power of 2, or a multiple of 64 over 128,
1581         // whichever is smaller.
1582         const LLT &Ty = Query.Types[BigTyIdx];
1583         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1584         if (NewSizeInBits >= 256) {
1585           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1586           if (RoundedTo < NewSizeInBits)
1587             NewSizeInBits = RoundedTo;
1588         }
1589         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1590       })
1591       // Any vectors left are the wrong size. Scalarize them.
1592       .scalarize(0)
1593       .scalarize(1);
1594   }
1595 
1596   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1597   // RegBankSelect.
1598   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1599     .legalFor({{S32}, {S64}});
1600 
1601   if (ST.hasVOP3PInsts()) {
1602     SextInReg.lowerFor({{V2S16}})
1603       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1604       // get more vector shift opportunities, since we'll get those when
1605       // expanded.
1606       .clampMaxNumElementsStrict(0, S16, 2);
1607   } else if (ST.has16BitInsts()) {
1608     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1609   } else {
1610     // Prefer to promote to s32 before lowering if we don't have 16-bit
1611     // shifts. This avoids a lot of intermediate truncate and extend operations.
1612     SextInReg.lowerFor({{S32}, {S64}});
1613   }
1614 
1615   SextInReg
1616     .scalarize(0)
1617     .clampScalar(0, S32, S64)
1618     .lower();
1619 
1620   getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1621     .scalarize(0)
1622     .lower();
1623 
1624   // TODO: Only try to form v2s16 with legal packed instructions.
1625   getActionDefinitionsBuilder(G_FSHR)
1626     .legalFor({{S32, S32}})
1627     .lowerFor({{V2S16, V2S16}})
1628     .clampMaxNumElementsStrict(0, S16, 2)
1629     .scalarize(0)
1630     .lower();
1631 
1632   if (ST.hasVOP3PInsts()) {
1633     getActionDefinitionsBuilder(G_FSHL)
1634       .lowerFor({{V2S16, V2S16}})
1635       .clampMaxNumElementsStrict(0, S16, 2)
1636       .scalarize(0)
1637       .lower();
1638   } else {
1639     getActionDefinitionsBuilder(G_FSHL)
1640       .scalarize(0)
1641       .lower();
1642   }
1643 
1644   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1645     .legalFor({S64});
1646 
1647   getActionDefinitionsBuilder(G_FENCE)
1648     .alwaysLegal();
1649 
1650   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1651       .scalarize(0)
1652       .minScalar(0, S32)
1653       .lower();
1654 
1655   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1656       .legalFor({{S32, S32}, {S64, S32}})
1657       .clampScalar(1, S32, S32)
1658       .clampScalar(0, S32, S64)
1659       .widenScalarToNextPow2(0)
1660       .scalarize(0);
1661 
1662   getActionDefinitionsBuilder({
1663       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1664       G_FCOPYSIGN,
1665 
1666       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1667       G_ATOMICRMW_NAND,
1668       G_ATOMICRMW_FSUB,
1669       G_READ_REGISTER,
1670       G_WRITE_REGISTER,
1671 
1672       G_SADDO, G_SSUBO,
1673 
1674        // TODO: Implement
1675       G_FMINIMUM, G_FMAXIMUM}).lower();
1676 
1677   getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
1678       .lower();
1679 
1680   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1681         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1682         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1683     .unsupported();
1684 
1685   getLegacyLegalizerInfo().computeTables();
1686   verify(*ST.getInstrInfo());
1687 }
1688 
1689 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1690                                          MachineInstr &MI) const {
1691   MachineIRBuilder &B = Helper.MIRBuilder;
1692   MachineRegisterInfo &MRI = *B.getMRI();
1693 
1694   switch (MI.getOpcode()) {
1695   case TargetOpcode::G_ADDRSPACE_CAST:
1696     return legalizeAddrSpaceCast(MI, MRI, B);
1697   case TargetOpcode::G_FRINT:
1698     return legalizeFrint(MI, MRI, B);
1699   case TargetOpcode::G_FCEIL:
1700     return legalizeFceil(MI, MRI, B);
1701   case TargetOpcode::G_FREM:
1702     return legalizeFrem(MI, MRI, B);
1703   case TargetOpcode::G_INTRINSIC_TRUNC:
1704     return legalizeIntrinsicTrunc(MI, MRI, B);
1705   case TargetOpcode::G_SITOFP:
1706     return legalizeITOFP(MI, MRI, B, true);
1707   case TargetOpcode::G_UITOFP:
1708     return legalizeITOFP(MI, MRI, B, false);
1709   case TargetOpcode::G_FPTOSI:
1710     return legalizeFPTOI(MI, MRI, B, true);
1711   case TargetOpcode::G_FPTOUI:
1712     return legalizeFPTOI(MI, MRI, B, false);
1713   case TargetOpcode::G_FMINNUM:
1714   case TargetOpcode::G_FMAXNUM:
1715   case TargetOpcode::G_FMINNUM_IEEE:
1716   case TargetOpcode::G_FMAXNUM_IEEE:
1717     return legalizeMinNumMaxNum(Helper, MI);
1718   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1719     return legalizeExtractVectorElt(MI, MRI, B);
1720   case TargetOpcode::G_INSERT_VECTOR_ELT:
1721     return legalizeInsertVectorElt(MI, MRI, B);
1722   case TargetOpcode::G_SHUFFLE_VECTOR:
1723     return legalizeShuffleVector(MI, MRI, B);
1724   case TargetOpcode::G_FSIN:
1725   case TargetOpcode::G_FCOS:
1726     return legalizeSinCos(MI, MRI, B);
1727   case TargetOpcode::G_GLOBAL_VALUE:
1728     return legalizeGlobalValue(MI, MRI, B);
1729   case TargetOpcode::G_LOAD:
1730   case TargetOpcode::G_SEXTLOAD:
1731   case TargetOpcode::G_ZEXTLOAD:
1732     return legalizeLoad(Helper, MI);
1733   case TargetOpcode::G_FMAD:
1734     return legalizeFMad(MI, MRI, B);
1735   case TargetOpcode::G_FDIV:
1736     return legalizeFDIV(MI, MRI, B);
1737   case TargetOpcode::G_UDIV:
1738   case TargetOpcode::G_UREM:
1739   case TargetOpcode::G_UDIVREM:
1740     return legalizeUnsignedDIV_REM(MI, MRI, B);
1741   case TargetOpcode::G_SDIV:
1742   case TargetOpcode::G_SREM:
1743   case TargetOpcode::G_SDIVREM:
1744     return legalizeSignedDIV_REM(MI, MRI, B);
1745   case TargetOpcode::G_ATOMIC_CMPXCHG:
1746     return legalizeAtomicCmpXChg(MI, MRI, B);
1747   case TargetOpcode::G_FLOG:
1748     return legalizeFlog(MI, B, numbers::ln2f);
1749   case TargetOpcode::G_FLOG10:
1750     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1751   case TargetOpcode::G_FEXP:
1752     return legalizeFExp(MI, B);
1753   case TargetOpcode::G_FPOW:
1754     return legalizeFPow(MI, B);
1755   case TargetOpcode::G_FFLOOR:
1756     return legalizeFFloor(MI, MRI, B);
1757   case TargetOpcode::G_BUILD_VECTOR:
1758     return legalizeBuildVector(MI, MRI, B);
1759   case TargetOpcode::G_CTLZ:
1760   case TargetOpcode::G_CTTZ:
1761     return legalizeCTLZ_CTTZ(MI, MRI, B);
1762   default:
1763     return false;
1764   }
1765 
1766   llvm_unreachable("expected switch to return");
1767 }
1768 
1769 Register AMDGPULegalizerInfo::getSegmentAperture(
1770   unsigned AS,
1771   MachineRegisterInfo &MRI,
1772   MachineIRBuilder &B) const {
1773   MachineFunction &MF = B.getMF();
1774   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1775   const LLT S32 = LLT::scalar(32);
1776 
1777   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1778 
1779   if (ST.hasApertureRegs()) {
1780     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1781     // getreg.
1782     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1783         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1784         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1785     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1786         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1787         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1788     unsigned Encoding =
1789         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1790         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1791         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1792 
1793     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1794 
1795     B.buildInstr(AMDGPU::S_GETREG_B32)
1796       .addDef(GetReg)
1797       .addImm(Encoding);
1798     MRI.setType(GetReg, S32);
1799 
1800     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1801     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1802   }
1803 
1804   Register QueuePtr = MRI.createGenericVirtualRegister(
1805     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1806 
1807   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1808     return Register();
1809 
1810   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1811   // private_segment_aperture_base_hi.
1812   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1813 
1814   // TODO: can we be smarter about machine pointer info?
1815   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1816   MachineMemOperand *MMO = MF.getMachineMemOperand(
1817       PtrInfo,
1818       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1819           MachineMemOperand::MOInvariant,
1820       LLT::scalar(32), commonAlignment(Align(64), StructOffset));
1821 
1822   Register LoadAddr;
1823 
1824   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1825   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1826 }
1827 
1828 /// Return true if the value is a known valid address, such that a null check is
1829 /// not necessary.
1830 static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
1831                            const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
1832   MachineInstr *Def = MRI.getVRegDef(Val);
1833   switch (Def->getOpcode()) {
1834   case AMDGPU::G_FRAME_INDEX:
1835   case AMDGPU::G_GLOBAL_VALUE:
1836   case AMDGPU::G_BLOCK_ADDR:
1837     return true;
1838   case AMDGPU::G_CONSTANT: {
1839     const ConstantInt *CI = Def->getOperand(1).getCImm();
1840     return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
1841   }
1842   default:
1843     return false;
1844   }
1845 
1846   return false;
1847 }
1848 
1849 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1850   MachineInstr &MI, MachineRegisterInfo &MRI,
1851   MachineIRBuilder &B) const {
1852   MachineFunction &MF = B.getMF();
1853 
1854   const LLT S32 = LLT::scalar(32);
1855   Register Dst = MI.getOperand(0).getReg();
1856   Register Src = MI.getOperand(1).getReg();
1857 
1858   LLT DstTy = MRI.getType(Dst);
1859   LLT SrcTy = MRI.getType(Src);
1860   unsigned DestAS = DstTy.getAddressSpace();
1861   unsigned SrcAS = SrcTy.getAddressSpace();
1862 
1863   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1864   // vector element.
1865   assert(!DstTy.isVector());
1866 
1867   const AMDGPUTargetMachine &TM
1868     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1869 
1870   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1871     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1872     return true;
1873   }
1874 
1875   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1876     // Truncate.
1877     B.buildExtract(Dst, Src, 0);
1878     MI.eraseFromParent();
1879     return true;
1880   }
1881 
1882   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1883     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1884     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1885 
1886     // FIXME: This is a bit ugly due to creating a merge of 2 pointers into
1887     // another pointer type. Merge operands are required to be the same type,
1888     // but creating an extra ptrtoint would be kind of pointless.
1889     auto HighAddr = B.buildConstant(
1890       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1891     B.buildMerge(Dst, {Src, HighAddr});
1892     MI.eraseFromParent();
1893     return true;
1894   }
1895 
1896   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1897     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1898            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1899 
1900     if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
1901       // Extract low 32-bits of the pointer.
1902       B.buildExtract(Dst, Src, 0);
1903       MI.eraseFromParent();
1904       return true;
1905     }
1906 
1907     unsigned NullVal = TM.getNullPointerValue(DestAS);
1908 
1909     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1910     auto FlatNull = B.buildConstant(SrcTy, 0);
1911 
1912     // Extract low 32-bits of the pointer.
1913     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1914 
1915     auto CmpRes =
1916         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1917     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1918 
1919     MI.eraseFromParent();
1920     return true;
1921   }
1922 
1923   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1924     return false;
1925 
1926   if (!ST.hasFlatAddressSpace())
1927     return false;
1928 
1929   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1930   if (!ApertureReg.isValid())
1931     return false;
1932 
1933   // Coerce the type of the low half of the result so we can use merge_values.
1934   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1935 
1936   // TODO: Should we allow mismatched types but matching sizes in merges to
1937   // avoid the ptrtoint?
1938   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1939 
1940   if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
1941     B.buildCopy(Dst, BuildPtr);
1942     MI.eraseFromParent();
1943     return true;
1944   }
1945 
1946   auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1947   auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1948 
1949   auto CmpRes =
1950       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1951 
1952   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1953 
1954   MI.eraseFromParent();
1955   return true;
1956 }
1957 
1958 bool AMDGPULegalizerInfo::legalizeFrint(
1959   MachineInstr &MI, MachineRegisterInfo &MRI,
1960   MachineIRBuilder &B) const {
1961   Register Src = MI.getOperand(1).getReg();
1962   LLT Ty = MRI.getType(Src);
1963   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1964 
1965   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1966   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
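  // Roughly, the trick here is that for |src| < 2^52 adding and then
  // subtracting a constant of magnitude 2^52 (copysigned to src) leaves no
  // mantissa bits for the fractional part, so the add/sub pair rounds src to
  // a nearby integer; inputs with |src| > 0x1.fffffffffffffp+51 are already
  // integral and are selected through unchanged below.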
1967 
1968   auto C1 = B.buildFConstant(Ty, C1Val);
1969   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1970 
1971   // TODO: Should this propagate fast-math-flags?
1972   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1973   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1974 
1975   auto C2 = B.buildFConstant(Ty, C2Val);
1976   auto Fabs = B.buildFAbs(Ty, Src);
1977 
1978   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1979   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1980   MI.eraseFromParent();
1981   return true;
1982 }
1983 
1984 bool AMDGPULegalizerInfo::legalizeFceil(
1985   MachineInstr &MI, MachineRegisterInfo &MRI,
1986   MachineIRBuilder &B) const {
1987 
1988   const LLT S1 = LLT::scalar(1);
1989   const LLT S64 = LLT::scalar(64);
1990 
1991   Register Src = MI.getOperand(1).getReg();
1992   assert(MRI.getType(Src) == S64);
1993 
1994   // result = trunc(src)
1995   // if (src > 0.0 && src != result)
1996   //   result += 1.0
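  //
  // For example, ceil(2.3): trunc gives 2.0, and since 2.3 > 0.0 and
  // 2.3 != 2.0 the result becomes 3.0. For ceil(-2.3): trunc gives -2.0, the
  // condition fails, and -2.0 is already the ceiling.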
1997 
1998   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1999 
2000   const auto Zero = B.buildFConstant(S64, 0.0);
2001   const auto One = B.buildFConstant(S64, 1.0);
2002   auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2003   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2004   auto And = B.buildAnd(S1, Gt0, NeTrunc);
2005   auto Add = B.buildSelect(S64, And, One, Zero);
2006 
2007   // TODO: Should this propagate fast-math-flags?
2008   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2009   MI.eraseFromParent();
2010   return true;
2011 }
2012 
2013 bool AMDGPULegalizerInfo::legalizeFrem(
2014   MachineInstr &MI, MachineRegisterInfo &MRI,
2015   MachineIRBuilder &B) const {
2016     Register DstReg = MI.getOperand(0).getReg();
2017     Register Src0Reg = MI.getOperand(1).getReg();
2018     Register Src1Reg = MI.getOperand(2).getReg();
2019     auto Flags = MI.getFlags();
2020     LLT Ty = MRI.getType(DstReg);
2021 
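    // frem(x, y) = x - trunc(x / y) * y, expressed below as
    // fma(-trunc(x / y), y, x).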
2022     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2023     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2024     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2025     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2026     MI.eraseFromParent();
2027     return true;
2028 }
2029 
2030 static MachineInstrBuilder extractF64Exponent(Register Hi,
2031                                               MachineIRBuilder &B) {
2032   const unsigned FractBits = 52;
2033   const unsigned ExpBits = 11;
2034   LLT S32 = LLT::scalar(32);
2035 
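  // In an IEEE double the exponent field occupies bits [62:52]; within the
  // high 32-bit word that is bits [30:20], hence the bitfield extract at
  // offset FractBits - 32 with width ExpBits, followed by subtracting the
  // exponent bias of 1023.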
2036   auto Const0 = B.buildConstant(S32, FractBits - 32);
2037   auto Const1 = B.buildConstant(S32, ExpBits);
2038 
2039   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
2040     .addUse(Hi)
2041     .addUse(Const0.getReg(0))
2042     .addUse(Const1.getReg(0));
2043 
2044   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2045 }
2046 
2047 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
2048   MachineInstr &MI, MachineRegisterInfo &MRI,
2049   MachineIRBuilder &B) const {
2050   const LLT S1 = LLT::scalar(1);
2051   const LLT S32 = LLT::scalar(32);
2052   const LLT S64 = LLT::scalar(64);
2053 
2054   Register Src = MI.getOperand(1).getReg();
2055   assert(MRI.getType(Src) == S64);
2056 
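  // Roughly: the fraction mask shifted right by the unbiased exponent covers
  // exactly the mantissa bits that represent the fractional part, so ANDing
  // the source with its inverse truncates toward zero. Exponents below 0
  // keep only the sign bit (giving +/-0), and exponents above 51 mean the
  // value is already an integer and is passed through unchanged.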
2057   // TODO: Should this use extract since the low half is unused?
2058   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2059   Register Hi = Unmerge.getReg(1);
2060 
2061   // Extract the upper half, since this is where we will find the sign and
2062   // exponent.
2063   auto Exp = extractF64Exponent(Hi, B);
2064 
2065   const unsigned FractBits = 52;
2066 
2067   // Extract the sign bit.
2068   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2069   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2070 
2071   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2072 
2073   const auto Zero32 = B.buildConstant(S32, 0);
2074 
2075   // Extend back to 64-bits.
2076   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
2077 
2078   auto Shr = B.buildAShr(S64, FractMask, Exp);
2079   auto Not = B.buildNot(S64, Shr);
2080   auto Tmp0 = B.buildAnd(S64, Src, Not);
2081   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2082 
2083   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2084   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2085 
2086   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2087   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2088   MI.eraseFromParent();
2089   return true;
2090 }
2091 
2092 bool AMDGPULegalizerInfo::legalizeITOFP(
2093   MachineInstr &MI, MachineRegisterInfo &MRI,
2094   MachineIRBuilder &B, bool Signed) const {
2095 
2096   Register Dst = MI.getOperand(0).getReg();
2097   Register Src = MI.getOperand(1).getReg();
2098 
2099   const LLT S64 = LLT::scalar(64);
2100   const LLT S32 = LLT::scalar(32);
2101 
2102   assert(MRI.getType(Src) == S64);
2103 
2104   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2105   auto ThirtyTwo = B.buildConstant(S32, 32);
2106 
2107   if (MRI.getType(Dst) == S64) {
2108     auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2109                         : B.buildUITOFP(S64, Unmerge.getReg(1));
2110 
2111     auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2112     auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2113                      .addUse(CvtHi.getReg(0))
2114                      .addUse(ThirtyTwo.getReg(0));
2115 
2116     // TODO: Should this propagate fast-math-flags?
2117     B.buildFAdd(Dst, LdExp, CvtLo);
2118     MI.eraseFromParent();
2119     return true;
2120   }
2121 
2122   assert(MRI.getType(Dst) == S32);
2123 
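  // Sketch of the s64 -> s32 path: shift the 64-bit value left so its
  // significant bits land in the high word, convert that word with a sticky
  // bit ORed in from the low word (so rounding still sees any discarded
  // bits), then rescale the result with ldexp by 32 minus the shift amount.
  // For the signed case the shift amount is chosen so that a sign bit is
  // kept in the high word.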
2124   auto One = B.buildConstant(S32, 1);
2125 
2126   MachineInstrBuilder ShAmt;
2127   if (Signed) {
2128     auto ThirtyOne = B.buildConstant(S32, 31);
2129     auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2130     auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2131     auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2132     auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
2133                                /*HasSideEffects=*/false)
2134                   .addUse(Unmerge.getReg(1));
2135     auto LS2 = B.buildSub(S32, LS, One);
2136     ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2137   } else
2138     ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2139   auto Norm = B.buildShl(S64, Src, ShAmt);
2140   auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2141   auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2142   auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2143   auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2144   auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2145   B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
2146                    /*HasSideEffects=*/false)
2147       .addUse(FVal.getReg(0))
2148       .addUse(Scale.getReg(0));
2149   MI.eraseFromParent();
2150   return true;
2151 }
2152 
2153 // TODO: Copied from DAG implementation. Verify logic and document how this
2154 // actually works.
2155 bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2156                                         MachineRegisterInfo &MRI,
2157                                         MachineIRBuilder &B,
2158                                         bool Signed) const {
2159 
2160   Register Dst = MI.getOperand(0).getReg();
2161   Register Src = MI.getOperand(1).getReg();
2162 
2163   const LLT S64 = LLT::scalar(64);
2164   const LLT S32 = LLT::scalar(32);
2165 
2166   const LLT SrcLT = MRI.getType(Src);
2167   assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2168 
2169   unsigned Flags = MI.getFlags();
2170 
2171   // The basic idea of converting a floating point number into a pair of 32-bit
2172   // integers is illustrated as follows:
2173   //
2174   //     tf := trunc(val);
2175   //    hif := floor(tf * 2^-32);
2176   //    lof := tf - hif * 2^32; // lof is always positive due to floor.
2177   //     hi := fptoi(hif);
2178   //     lo := fptoi(lof);
2179   //
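  // For example, with val = 2^33 + 5: tf = 8589934597.0, hif = floor(tf *
  // 2^-32) = 2.0 and lof = 5.0, so hi = 2 and lo = 5, which reassemble into
  // the original value.
  //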
2180   auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2181   MachineInstrBuilder Sign;
2182   if (Signed && SrcLT == S32) {
2183     // However, a 32-bit floating point number has only a 23-bit mantissa,
2184     // which is not enough to hold all the significant bits of `lof` if val is
2185     // negative. To avoid the loss of precision, we need to take the absolute
2186     // value after truncating and flip the result back based on the original
2187     // signedness.
2188     Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2189     Trunc = B.buildFAbs(S32, Trunc, Flags);
2190   }
2191   MachineInstrBuilder K0, K1;
2192   if (SrcLT == S64) {
2193     K0 = B.buildFConstant(S64,
2194                           BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2195     K1 = B.buildFConstant(S64,
2196                           BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2197   } else {
2198     K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
2199     K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
2200   }
2201 
2202   auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2203   auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2204   auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2205 
2206   auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2207                                      : B.buildFPTOUI(S32, FloorMul);
2208   auto Lo = B.buildFPTOUI(S32, Fma);
2209 
2210   if (Signed && SrcLT == S32) {
2211     // Flip the result based on the signedness, which is either all 0s or 1s.
2212     Sign = B.buildMerge(S64, {Sign, Sign});
2213     // r := xor({lo, hi}, sign) - sign;
2214     B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
2215   } else
2216     B.buildMerge(Dst, {Lo, Hi});
2217   MI.eraseFromParent();
2218 
2219   return true;
2220 }
2221 
2222 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2223                                                MachineInstr &MI) const {
2224   MachineFunction &MF = Helper.MIRBuilder.getMF();
2225   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2226 
2227   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2228                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2229 
2230   // With ieee_mode disabled, the instructions have the correct behavior
2231   // already for G_FMINNUM/G_FMAXNUM
2232   if (!MFI->getMode().IEEE)
2233     return !IsIEEEOp;
2234 
2235   if (IsIEEEOp)
2236     return true;
2237 
2238   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2239 }
2240 
2241 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2242   MachineInstr &MI, MachineRegisterInfo &MRI,
2243   MachineIRBuilder &B) const {
2244   // TODO: Should move some of this into LegalizerHelper.
2245 
2246   // TODO: Promote dynamic indexing of s16 to s32
2247 
2248   // FIXME: Artifact combiner probably should have replaced the truncated
2249   // constant before this, so we shouldn't need
2250   // getIConstantVRegValWithLookThrough.
2251   Optional<ValueAndVReg> MaybeIdxVal =
2252       getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2253   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2254     return true;
2255   const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2256 
2257   Register Dst = MI.getOperand(0).getReg();
2258   Register Vec = MI.getOperand(1).getReg();
2259 
2260   LLT VecTy = MRI.getType(Vec);
2261   LLT EltTy = VecTy.getElementType();
2262   assert(EltTy == MRI.getType(Dst));
2263 
2264   if (IdxVal < VecTy.getNumElements()) {
2265     auto Unmerge = B.buildUnmerge(EltTy, Vec);
2266     B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2267   } else {
2268     B.buildUndef(Dst);
2269   }
2270 
2271   MI.eraseFromParent();
2272   return true;
2273 }
2274 
2275 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2276   MachineInstr &MI, MachineRegisterInfo &MRI,
2277   MachineIRBuilder &B) const {
2278   // TODO: Should move some of this into LegalizerHelper.
2279 
2280   // TODO: Promote dynamic indexing of s16 to s32
2281 
2282   // FIXME: Artifact combiner probably should have replaced the truncated
2283   // constant before this, so we shouldn't need
2284   // getIConstantVRegValWithLookThrough.
2285   Optional<ValueAndVReg> MaybeIdxVal =
2286       getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2287   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2288     return true;
2289 
2290   int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2291   Register Dst = MI.getOperand(0).getReg();
2292   Register Vec = MI.getOperand(1).getReg();
2293   Register Ins = MI.getOperand(2).getReg();
2294 
2295   LLT VecTy = MRI.getType(Vec);
2296   LLT EltTy = VecTy.getElementType();
2297   assert(EltTy == MRI.getType(Ins));
2298   (void)Ins;
2299 
2300   unsigned NumElts = VecTy.getNumElements();
2301   if (IdxVal < NumElts) {
2302     SmallVector<Register, 8> SrcRegs;
2303     for (unsigned i = 0; i < NumElts; ++i)
2304       SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2305     B.buildUnmerge(SrcRegs, Vec);
2306 
2307     SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2308     B.buildMerge(Dst, SrcRegs);
2309   } else {
2310     B.buildUndef(Dst);
2311   }
2312 
2313   MI.eraseFromParent();
2314   return true;
2315 }
2316 
2317 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2318   MachineInstr &MI, MachineRegisterInfo &MRI,
2319   MachineIRBuilder &B) const {
2320   const LLT V2S16 = LLT::fixed_vector(2, 16);
2321 
2322   Register Dst = MI.getOperand(0).getReg();
2323   Register Src0 = MI.getOperand(1).getReg();
2324   LLT DstTy = MRI.getType(Dst);
2325   LLT SrcTy = MRI.getType(Src0);
2326 
2327   if (SrcTy == V2S16 && DstTy == V2S16 &&
2328       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2329     return true;
2330 
2331   MachineIRBuilder HelperBuilder(MI);
2332   GISelObserverWrapper DummyObserver;
2333   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2334   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2335 }
2336 
2337 bool AMDGPULegalizerInfo::legalizeSinCos(
2338   MachineInstr &MI, MachineRegisterInfo &MRI,
2339   MachineIRBuilder &B) const {
2340 
2341   Register DstReg = MI.getOperand(0).getReg();
2342   Register SrcReg = MI.getOperand(1).getReg();
2343   LLT Ty = MRI.getType(DstReg);
2344   unsigned Flags = MI.getFlags();
2345 
2346   Register TrigVal;
2347   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2348   if (ST.hasTrigReducedRange()) {
2349     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2350     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2351       .addUse(MulVal.getReg(0))
2352       .setMIFlags(Flags).getReg(0);
2353   } else
2354     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2355 
2356   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2357     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2358   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2359     .addUse(TrigVal)
2360     .setMIFlags(Flags);
2361   MI.eraseFromParent();
2362   return true;
2363 }
2364 
2365 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2366                                                   MachineIRBuilder &B,
2367                                                   const GlobalValue *GV,
2368                                                   int64_t Offset,
2369                                                   unsigned GAFlags) const {
2370   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2371   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2372   // to the following code sequence:
2373   //
2374   // For constant address space:
2375   //   s_getpc_b64 s[0:1]
2376   //   s_add_u32 s0, s0, $symbol
2377   //   s_addc_u32 s1, s1, 0
2378   //
2379   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2380   //   a fixup or relocation is emitted to replace $symbol with a literal
2381   //   constant, which is a pc-relative offset from the encoding of the $symbol
2382   //   operand to the global variable.
2383   //
2384   // For global address space:
2385   //   s_getpc_b64 s[0:1]
2386   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2387   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2388   //
2389   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2390   //   fixups or relocations are emitted to replace $symbol@*@lo and
2391   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2392   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2393   //   operand to the global variable.
2394   //
2395   // What we want here is an offset from the value returned by s_getpc
2396   // (which is the address of the s_add_u32 instruction) to the global
2397   // variable, but since the encoding of $symbol starts 4 bytes after the start
2398   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2399   // small. This requires us to add 4 to the global variable offset in order to
2400   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2401   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2402   // instruction.
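  //
  // Concretely, if s_getpc_b64 returns address P (the address of the
  // following s_add_u32), the $symbol literal of the s_add_u32 is encoded at
  // P + 4 and the literal of the s_addc_u32 at P + 12, which is why 4 and 12
  // are added to the global address operands below.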
2403 
2404   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2405 
2406   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2407     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2408 
2409   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2410     .addDef(PCReg);
2411 
2412   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2413   if (GAFlags == SIInstrInfo::MO_NONE)
2414     MIB.addImm(0);
2415   else
2416     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2417 
2418   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2419 
2420   if (PtrTy.getSizeInBits() == 32)
2421     B.buildExtract(DstReg, PCReg, 0);
2422   return true;
2423 }
2424 
2425 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2426   MachineInstr &MI, MachineRegisterInfo &MRI,
2427   MachineIRBuilder &B) const {
2428   Register DstReg = MI.getOperand(0).getReg();
2429   LLT Ty = MRI.getType(DstReg);
2430   unsigned AS = Ty.getAddressSpace();
2431 
2432   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2433   MachineFunction &MF = B.getMF();
2434   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2435 
2436   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2437     if (!MFI->isModuleEntryFunction() &&
2438         !GV->getName().equals("llvm.amdgcn.module.lds")) {
2439       const Function &Fn = MF.getFunction();
2440       DiagnosticInfoUnsupported BadLDSDecl(
2441         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2442         DS_Warning);
2443       Fn.getContext().diagnose(BadLDSDecl);
2444 
2445       // We currently don't have a way to correctly allocate LDS objects that
2446       // aren't directly associated with a kernel. We do force inlining of
2447       // functions that use local objects. However, if these dead functions are
2448       // not eliminated, we don't want a compile time error. Just emit a warning
2449       // and a trap, since there should be no callable path here.
2450       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2451       B.buildUndef(DstReg);
2452       MI.eraseFromParent();
2453       return true;
2454     }
2455 
2456     // TODO: We could emit code to handle the initialization somewhere.
2457     // We ignore the initializer for now and legalize it to allow selection.
2458     // The initializer will be rejected during assembly emission anyway.
2459     const SITargetLowering *TLI = ST.getTargetLowering();
2460     if (!TLI->shouldUseLDSConstAddress(GV)) {
2461       MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2462       return true; // Leave in place;
2463     }
2464 
2465     if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2466       Type *Ty = GV->getValueType();
2467       // HIP uses an unsized array `extern __shared__ T s[]` or a similar
2468       // zero-sized type in other languages to declare dynamic shared memory
2469       // whose size is not known at compile time. Such variables are allocated
2470       // by the runtime and placed directly after the statically allocated
2471       // ones; they all share the same offset.
2472       if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2473         // Adjust alignment for that dynamic shared memory array.
2474         MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2475         LLT S32 = LLT::scalar(32);
2476         auto Sz =
2477             B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2478         B.buildIntToPtr(DstReg, Sz);
2479         MI.eraseFromParent();
2480         return true;
2481       }
2482     }
2483 
2484     B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
2485                                                    *cast<GlobalVariable>(GV)));
2486     MI.eraseFromParent();
2487     return true;
2488   }
2489 
2490   const SITargetLowering *TLI = ST.getTargetLowering();
2491 
2492   if (TLI->shouldEmitFixup(GV)) {
2493     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2494     MI.eraseFromParent();
2495     return true;
2496   }
2497 
2498   if (TLI->shouldEmitPCReloc(GV)) {
2499     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2500     MI.eraseFromParent();
2501     return true;
2502   }
2503 
2504   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2505   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2506 
2507   LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
2508   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2509       MachinePointerInfo::getGOT(MF),
2510       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2511           MachineMemOperand::MOInvariant,
2512       LoadTy, Align(8));
2513 
2514   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2515 
2516   if (Ty.getSizeInBits() == 32) {
2517     // Truncate if this is a 32-bit constant address.
2518     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2519     B.buildExtract(DstReg, Load, 0);
2520   } else
2521     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2522 
2523   MI.eraseFromParent();
2524   return true;
2525 }
2526 
2527 static LLT widenToNextPowerOf2(LLT Ty) {
2528   if (Ty.isVector())
2529     return Ty.changeElementCount(
2530         ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2531   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2532 }
2533 
2534 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2535                                        MachineInstr &MI) const {
2536   MachineIRBuilder &B = Helper.MIRBuilder;
2537   MachineRegisterInfo &MRI = *B.getMRI();
2538   GISelChangeObserver &Observer = Helper.Observer;
2539 
2540   Register PtrReg = MI.getOperand(1).getReg();
2541   LLT PtrTy = MRI.getType(PtrReg);
2542   unsigned AddrSpace = PtrTy.getAddressSpace();
2543 
2544   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2545     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2546     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2547     Observer.changingInstr(MI);
2548     MI.getOperand(1).setReg(Cast.getReg(0));
2549     Observer.changedInstr(MI);
2550     return true;
2551   }
2552 
2553   if (MI.getOpcode() != AMDGPU::G_LOAD)
2554     return false;
2555 
2556   Register ValReg = MI.getOperand(0).getReg();
2557   LLT ValTy = MRI.getType(ValReg);
2558 
2559   MachineMemOperand *MMO = *MI.memoperands_begin();
2560   const unsigned ValSize = ValTy.getSizeInBits();
2561   const LLT MemTy = MMO->getMemoryType();
2562   const Align MemAlign = MMO->getAlign();
2563   const unsigned MemSize = MemTy.getSizeInBits();
2564   const uint64_t AlignInBits = 8 * MemAlign.value();
2565 
2566   // Widen non-power-of-2 loads to the alignment if needed
2567   if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
2568     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2569 
2570     // This was already the correct extending load result type, so just adjust
2571     // the memory type.
2572     if (WideMemSize == ValSize) {
2573       MachineFunction &MF = B.getMF();
2574 
2575       MachineMemOperand *WideMMO =
2576           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2577       Observer.changingInstr(MI);
2578       MI.setMemRefs(MF, {WideMMO});
2579       Observer.changedInstr(MI);
2580       return true;
2581     }
2582 
2583     // Don't bother handling an edge case that should probably never be produced.
2584     if (ValSize > WideMemSize)
2585       return false;
2586 
2587     LLT WideTy = widenToNextPowerOf2(ValTy);
2588 
2589     Register WideLoad;
2590     if (!WideTy.isVector()) {
2591       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2592       B.buildTrunc(ValReg, WideLoad).getReg(0);
2593     } else {
2594       // Extract the subvector.
2595 
2596       if (isRegisterType(ValTy)) {
2597         // If this is a case where G_EXTRACT is legal, use it.
2598         // (e.g. <3 x s32> -> <4 x s32>)
2599         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2600         B.buildExtract(ValReg, WideLoad, 0);
2601       } else {
2602         // For cases where the widened type isn't a nice register value, unmerge
2603         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2604         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2605         B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
2606       }
2607     }
2608 
2609     MI.eraseFromParent();
2610     return true;
2611   }
2612 
2613   return false;
2614 }
2615 
2616 bool AMDGPULegalizerInfo::legalizeFMad(
2617   MachineInstr &MI, MachineRegisterInfo &MRI,
2618   MachineIRBuilder &B) const {
2619   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2620   assert(Ty.isScalar());
2621 
2622   MachineFunction &MF = B.getMF();
2623   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2624 
2625   // TODO: Always legal with future ftz flag.
2626   // FIXME: Do we need just output?
2627   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2628     return true;
2629   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2630     return true;
2631 
2632   MachineIRBuilder HelperBuilder(MI);
2633   GISelObserverWrapper DummyObserver;
2634   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2635   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2636 }
2637 
2638 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2639   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2640   Register DstReg = MI.getOperand(0).getReg();
2641   Register PtrReg = MI.getOperand(1).getReg();
2642   Register CmpVal = MI.getOperand(2).getReg();
2643   Register NewVal = MI.getOperand(3).getReg();
2644 
2645   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2646          "this should not have been custom lowered");
2647 
2648   LLT ValTy = MRI.getType(CmpVal);
2649   LLT VecTy = LLT::fixed_vector(2, ValTy);
2650 
2651   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2652 
2653   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2654     .addDef(DstReg)
2655     .addUse(PtrReg)
2656     .addUse(PackedVal)
2657     .setMemRefs(MI.memoperands());
2658 
2659   MI.eraseFromParent();
2660   return true;
2661 }
2662 
2663 bool AMDGPULegalizerInfo::legalizeFlog(
2664   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2665   Register Dst = MI.getOperand(0).getReg();
2666   Register Src = MI.getOperand(1).getReg();
2667   LLT Ty = B.getMRI()->getType(Dst);
2668   unsigned Flags = MI.getFlags();
2669 
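  // log_b(x) = log2(x) * (1 / log2(b)) = log2(x) * (ln(2) / ln(b)); the
  // callers pass ln(2) for G_FLOG and ln(2) / ln(10) for G_FLOG10 as
  // Log2BaseInverted.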
2670   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2671   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2672 
2673   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2674   MI.eraseFromParent();
2675   return true;
2676 }
2677 
2678 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2679                                        MachineIRBuilder &B) const {
2680   Register Dst = MI.getOperand(0).getReg();
2681   Register Src = MI.getOperand(1).getReg();
2682   unsigned Flags = MI.getFlags();
2683   LLT Ty = B.getMRI()->getType(Dst);
2684 
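  // exp(x) = exp2(x * log2(e)).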
2685   auto K = B.buildFConstant(Ty, numbers::log2e);
2686   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2687   B.buildFExp2(Dst, Mul, Flags);
2688   MI.eraseFromParent();
2689   return true;
2690 }
2691 
2692 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2693                                        MachineIRBuilder &B) const {
2694   Register Dst = MI.getOperand(0).getReg();
2695   Register Src0 = MI.getOperand(1).getReg();
2696   Register Src1 = MI.getOperand(2).getReg();
2697   unsigned Flags = MI.getFlags();
2698   LLT Ty = B.getMRI()->getType(Dst);
2699   const LLT S16 = LLT::scalar(16);
2700   const LLT S32 = LLT::scalar(32);
2701 
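  // pow(x, y) is expanded as exp2(y * log2(x)). The multiply uses
  // fmul_legacy, which returns 0 when either operand is 0; presumably this
  // is closer to the expected pow() edge-case behavior than an ordinary
  // multiply.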
2702   if (Ty == S32) {
2703     auto Log = B.buildFLog2(S32, Src0, Flags);
2704     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2705       .addUse(Log.getReg(0))
2706       .addUse(Src1)
2707       .setMIFlags(Flags);
2708     B.buildFExp2(Dst, Mul, Flags);
2709   } else if (Ty == S16) {
2710     // There's no f16 fmul_legacy, so we need to extend to f32 for it.
2711     auto Log = B.buildFLog2(S16, Src0, Flags);
2712     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2713     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2714     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2715       .addUse(Ext0.getReg(0))
2716       .addUse(Ext1.getReg(0))
2717       .setMIFlags(Flags);
2718 
2719     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2720   } else
2721     return false;
2722 
2723   MI.eraseFromParent();
2724   return true;
2725 }
2726 
2727 // Find a source register, ignoring any possible source modifiers.
2728 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2729   Register ModSrc = OrigSrc;
2730   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2731     ModSrc = SrcFNeg->getOperand(1).getReg();
2732     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2733       ModSrc = SrcFAbs->getOperand(1).getReg();
2734   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2735     ModSrc = SrcFAbs->getOperand(1).getReg();
2736   return ModSrc;
2737 }
2738 
2739 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2740                                          MachineRegisterInfo &MRI,
2741                                          MachineIRBuilder &B) const {
2742 
2743   const LLT S1 = LLT::scalar(1);
2744   const LLT S64 = LLT::scalar(64);
2745   Register Dst = MI.getOperand(0).getReg();
2746   Register OrigSrc = MI.getOperand(1).getReg();
2747   unsigned Flags = MI.getFlags();
2748   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2749          "this should not have been custom lowered");
2750 
2751   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2752   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2753   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2754   // V_FRACT bug is:
2755   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2756   //
2757   // Convert floor(x) to (x - fract(x))
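  //
  // For example, floor(-1.3): fract(-1.3) = -1.3 - floor(-1.3) = 0.7, so the
  // result is -1.3 - 0.7 = -2.0.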
2758 
2759   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2760     .addUse(OrigSrc)
2761     .setMIFlags(Flags);
2762 
2763   // Give source modifier matching some assistance before obscuring a foldable
2764   // pattern.
2765 
2766   // TODO: Can we avoid the neg on the fract? The input sign to fract
2767   // shouldn't matter?
2768   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2769 
2770   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2771 
2772   Register Min = MRI.createGenericVirtualRegister(S64);
2773 
2774   // We don't need to concern ourselves with the snan handling difference, so
2775   // use the one which will directly select.
2776   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2777   if (MFI->getMode().IEEE)
2778     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2779   else
2780     B.buildFMinNum(Min, Fract, Const, Flags);
2781 
2782   Register CorrectedFract = Min;
2783   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2784     auto IsNan = B.buildFCmp(CmpInst::FCMP_UNO, S1, ModSrc, ModSrc, Flags);
2785     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2786   }
2787 
2788   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2789   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2790 
2791   MI.eraseFromParent();
2792   return true;
2793 }
2794 
2795 // Turn an illegal packed v2s16 build vector into bit operations.
2796 // TODO: This should probably be a bitcast action in LegalizerHelper.
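// For example, building <2 x s16> from %a and %b becomes a 32-bit
// G_MERGE_VALUES with %a in the low half and %b in the high half, followed
// by a G_BITCAST to <2 x s16>.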
2797 bool AMDGPULegalizerInfo::legalizeBuildVector(
2798   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2799   Register Dst = MI.getOperand(0).getReg();
2800   const LLT S32 = LLT::scalar(32);
2801   assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
2802 
2803   Register Src0 = MI.getOperand(1).getReg();
2804   Register Src1 = MI.getOperand(2).getReg();
2805   assert(MRI.getType(Src0) == LLT::scalar(16));
2806 
2807   auto Merge = B.buildMerge(S32, {Src0, Src1});
2808   B.buildBitcast(Dst, Merge);
2809 
2810   MI.eraseFromParent();
2811   return true;
2812 }
2813 
2814 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
2815 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
2816 // case with a single min instruction instead of a compare+select.
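// The ffbh/ffbl operations return -1 (all ones) for a zero input, so
// umin(result, bitwidth) yields the expected bitwidth in that case while
// leaving nonzero inputs unchanged.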
2817 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
2818                                             MachineRegisterInfo &MRI,
2819                                             MachineIRBuilder &B) const {
2820   Register Dst = MI.getOperand(0).getReg();
2821   Register Src = MI.getOperand(1).getReg();
2822   LLT DstTy = MRI.getType(Dst);
2823   LLT SrcTy = MRI.getType(Src);
2824 
2825   unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
2826                         ? AMDGPU::G_AMDGPU_FFBH_U32
2827                         : AMDGPU::G_AMDGPU_FFBL_B32;
2828   auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
2829   B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
2830 
2831   MI.eraseFromParent();
2832   return true;
2833 }
2834 
2835 // Check that this is a G_XOR x, -1
2836 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2837   if (MI.getOpcode() != TargetOpcode::G_XOR)
2838     return false;
2839   auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2840   return ConstVal && *ConstVal == -1;
2841 }
2842 
2843 // Return the use branch instruction, otherwise null if the usage is invalid.
2844 static MachineInstr *
2845 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2846                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2847   Register CondDef = MI.getOperand(0).getReg();
2848   if (!MRI.hasOneNonDBGUse(CondDef))
2849     return nullptr;
2850 
2851   MachineBasicBlock *Parent = MI.getParent();
2852   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2853 
2854   if (isNot(MRI, *UseMI)) {
2855     Register NegatedCond = UseMI->getOperand(0).getReg();
2856     if (!MRI.hasOneNonDBGUse(NegatedCond))
2857       return nullptr;
2858 
2859     // The intrinsic defining this condition will be rewritten, so the now-dead NOT must be removed as well.
2860     eraseInstr(*UseMI, MRI);
2861 
2862     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2863     Negated = true;
2864   }
2865 
2866   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2867     return nullptr;
2868 
2869   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2870   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2871   if (Next == Parent->end()) {
2872     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2873     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2874       return nullptr;
2875     UncondBrTarget = &*NextMBB;
2876   } else {
2877     if (Next->getOpcode() != AMDGPU::G_BR)
2878       return nullptr;
2879     Br = &*Next;
2880     UncondBrTarget = Br->getOperand(0).getMBB();
2881   }
2882 
2883   return UseMI;
2884 }
2885 
2886 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2887                                          const ArgDescriptor *Arg,
2888                                          const TargetRegisterClass *ArgRC,
2889                                          LLT ArgTy) const {
2890   MCRegister SrcReg = Arg->getRegister();
2891   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2892   assert(DstReg.isVirtual() && "Virtual register expected");
2893 
2894   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
2895                                              *ArgRC, B.getDebugLoc(), ArgTy);
2896   if (Arg->isMasked()) {
2897     // TODO: Should we try to emit this once in the entry block?
2898     const LLT S32 = LLT::scalar(32);
2899     const unsigned Mask = Arg->getMask();
2900     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2901 
2902     Register AndMaskSrc = LiveIn;
2903 
2904     // TODO: Avoid clearing the high bits if we know workitem id y/z are always
2905     // 0.
2906     if (Shift != 0) {
2907       auto ShiftAmt = B.buildConstant(S32, Shift);
2908       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2909     }
2910 
2911     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2912   } else {
2913     B.buildCopy(DstReg, LiveIn);
2914   }
2915 
2916   return true;
2917 }
2918 
2919 bool AMDGPULegalizerInfo::loadInputValue(
2920     Register DstReg, MachineIRBuilder &B,
2921     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2922   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2923   const ArgDescriptor *Arg;
2924   const TargetRegisterClass *ArgRC;
2925   LLT ArgTy;
2926   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2927 
2928   if (!Arg) {
2929     if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
2930       // The intrinsic may appear when we have a 0 sized kernarg segment, in which
2931       // case the pointer argument may be missing and we use null.
2932       B.buildConstant(DstReg, 0);
2933       return true;
2934     }
2935 
2936     // It's undefined behavior if a function marked with the amdgpu-no-*
2937     // attributes uses the corresponding intrinsic.
2938     B.buildUndef(DstReg);
2939     return true;
2940   }
2941 
2942   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2943     return false; // TODO: Handle these
2944   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2945 }
2946 
2947 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2948     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2949     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2950   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2951     return false;
2952 
2953   MI.eraseFromParent();
2954   return true;
2955 }
2956 
2957 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2958                                        MachineRegisterInfo &MRI,
2959                                        MachineIRBuilder &B) const {
2960   Register Dst = MI.getOperand(0).getReg();
2961   LLT DstTy = MRI.getType(Dst);
2962   LLT S16 = LLT::scalar(16);
2963   LLT S32 = LLT::scalar(32);
2964   LLT S64 = LLT::scalar(64);
2965 
2966   if (DstTy == S16)
2967     return legalizeFDIV16(MI, MRI, B);
2968   if (DstTy == S32)
2969     return legalizeFDIV32(MI, MRI, B);
2970   if (DstTy == S64)
2971     return legalizeFDIV64(MI, MRI, B);
2972 
2973   return false;
2974 }
2975 
2976 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
2977                                                         Register DstDivReg,
2978                                                         Register DstRemReg,
2979                                                         Register X,
2980                                                         Register Y) const {
2981   const LLT S1 = LLT::scalar(1);
2982   const LLT S32 = LLT::scalar(32);
2983 
2984   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2985   // algorithm used here.
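  // Roughly, in scalar form (a sketch of the sequence built below, where
  // mulhi is the high 32 bits of the 64-bit product):
  //   z  = (uint32_t)((1.0f / (float)y) * 0x1.fffffcp31f);  // ~2**32 / y
  //   z += mulhi(z, (0 - y) * z);                           // one refinement
  //   q  = mulhi(x, z);  r = x - q * y;
  //   if (r >= y) { q += 1; r -= y; }                       // two corrections
  //   if (r >= y) { q += 1; r -= y; }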
2986 
2987   // Initial estimate of inv(y).
2988   auto FloatY = B.buildUITOFP(S32, Y);
2989   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2990   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2991   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2992   auto Z = B.buildFPTOUI(S32, ScaledY);
2993 
2994   // One round of UNR (unsigned Newton-Raphson) refinement.
2995   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2996   auto NegYZ = B.buildMul(S32, NegY, Z);
2997   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2998 
2999   // Quotient/remainder estimate.
3000   auto Q = B.buildUMulH(S32, X, Z);
3001   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
3002 
3003   // First quotient/remainder refinement.
3004   auto One = B.buildConstant(S32, 1);
3005   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
3006   if (DstDivReg)
3007     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
3008   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
3009 
3010   // Second quotient/remainder refinement.
3011   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
3012   if (DstDivReg)
3013     B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
3014 
3015   if (DstRemReg)
3016     B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
3017 }
3018 
3019 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
3020 //
3021 // Return lo, hi of result
3022 //
3023 // %cvt.lo = G_UITOFP Val.lo
3024 // %cvt.hi = G_UITOFP Val.hi
3025 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
3026 // %rcp = G_AMDGPU_RCP_IFLAG %mad
3027 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
3028 // %mul2 = G_FMUL %mul1, 2**(-32)
3029 // %trunc = G_INTRINSIC_TRUNC %mul2
3030 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
3031 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
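// In effect this approximates 2**64 / Val in single precision and returns it
// split into {low, high} 32-bit halves. The magic constants are the float
// encodings of 2**32 (0x4f800000), just under 2**64 (0x5f7ffffc),
// 2**-32 (0x2f800000) and -(2**32) (0xcf800000).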
3032 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
3033                                                        Register Val) {
3034   const LLT S32 = LLT::scalar(32);
3035   auto Unmerge = B.buildUnmerge(S32, Val);
3036 
3037   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
3038   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
3039 
3040   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
3041                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
3042 
3043   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
3044   auto Mul1 =
3045       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
3046 
3047   // 2**(-32)
3048   auto Mul2 =
3049       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
3050   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
3051 
3052   // -(2**32)
3053   auto Mad2 = B.buildFMAD(S32, Trunc,
3054                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
3055 
3056   auto ResultLo = B.buildFPTOUI(S32, Mad2);
3057   auto ResultHi = B.buildFPTOUI(S32, Trunc);
3058 
3059   return {ResultLo.getReg(0), ResultHi.getReg(0)};
3060 }
3061 
3062 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
3063                                                         Register DstDivReg,
3064                                                         Register DstRemReg,
3065                                                         Register Numer,
3066                                                         Register Denom) const {
3067   const LLT S32 = LLT::scalar(32);
3068   const LLT S64 = LLT::scalar(64);
3069   const LLT S1 = LLT::scalar(1);
3070   Register RcpLo, RcpHi;
3071 
3072   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
3073 
3074   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
3075 
3076   auto Zero64 = B.buildConstant(S64, 0);
3077   auto NegDenom = B.buildSub(S64, Zero64, Denom);
3078 
3079   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
3080   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
3081 
3082   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
3083   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
3084   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
3085 
3086   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
3087   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
3088   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
3089 
3090   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
3091   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
3092   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
3093   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
3094   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
3095 
3096   auto Zero32 = B.buildConstant(S32, 0);
3097   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
3098   auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
3099   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
3100 
3101   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
3102   Register NumerLo = UnmergeNumer.getReg(0);
3103   Register NumerHi = UnmergeNumer.getReg(1);
3104 
3105   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
3106   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
3107   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
3108   Register Mul3_Lo = UnmergeMul3.getReg(0);
3109   Register Mul3_Hi = UnmergeMul3.getReg(1);
3110   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
3111   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
3112   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
3113   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
3114 
3115   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
3116   Register DenomLo = UnmergeDenom.getReg(0);
3117   Register DenomHi = UnmergeDenom.getReg(1);
3118 
3119   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
3120   auto C1 = B.buildSExt(S32, CmpHi);
3121 
3122   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
3123   auto C2 = B.buildSExt(S32, CmpLo);
3124 
3125   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
3126   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
3127 
3128   // TODO: Here and below, portions of the code could be enclosed in if/endif.
3129   // Currently the control flow is unconditional and we have 4 selects after the
3130   // potential endif to substitute PHIs.
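  // C3 is effectively sext(Sub1 >= Denom) computed from the 32-bit halves, and
  // C6 is the same test for Sub2; together they drive the usual "subtract the
  // denominator at most twice" correction below.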
3131 
3132   // if C3 != 0 ...
3133   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
3134   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
3135   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3136   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
3137 
3138   auto One64 = B.buildConstant(S64, 1);
3139   auto Add3 = B.buildAdd(S64, MulHi3, One64);
3140 
3141   auto C4 =
3142       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
3143   auto C5 =
3144       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
3145   auto C6 = B.buildSelect(
3146       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
3147 
3148   // if (C6 != 0)
3149   auto Add4 = B.buildAdd(S64, Add3, One64);
3150   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
3151 
3152   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
3153   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3154   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
3155 
3156   // endif C6
3157   // endif C3
3158 
3159   if (DstDivReg) {
3160     auto Sel1 = B.buildSelect(
3161         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3162     B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3163                   Sel1, MulHi3);
3164   }
3165 
3166   if (DstRemReg) {
3167     auto Sel2 = B.buildSelect(
3168         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3169     B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3170                   Sel2, Sub1);
3171   }
3172 }
3173 
3174 bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
3175                                                   MachineRegisterInfo &MRI,
3176                                                   MachineIRBuilder &B) const {
3177   Register DstDivReg, DstRemReg;
3178   switch (MI.getOpcode()) {
3179   default:
3180     llvm_unreachable("Unexpected opcode!");
3181   case AMDGPU::G_UDIV: {
3182     DstDivReg = MI.getOperand(0).getReg();
3183     break;
3184   }
3185   case AMDGPU::G_UREM: {
3186     DstRemReg = MI.getOperand(0).getReg();
3187     break;
3188   }
3189   case AMDGPU::G_UDIVREM: {
3190     DstDivReg = MI.getOperand(0).getReg();
3191     DstRemReg = MI.getOperand(1).getReg();
3192     break;
3193   }
3194   }
3195 
3196   const LLT S64 = LLT::scalar(64);
3197   const LLT S32 = LLT::scalar(32);
3198   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3199   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3200   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3201   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3202 
3203   if (Ty == S32)
3204     legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
3205   else if (Ty == S64)
3206     legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
3207   else
3208     return false;
3209 
3210   MI.eraseFromParent();
3211   return true;
3212 }
3213 
3214 bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
3215                                                 MachineRegisterInfo &MRI,
3216                                                 MachineIRBuilder &B) const {
3217   const LLT S64 = LLT::scalar(64);
3218   const LLT S32 = LLT::scalar(32);
3219 
3220   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3221   if (Ty != S32 && Ty != S64)
3222     return false;
3223 
3224   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3225   Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3226   Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3227 
3228   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3229   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3230   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3231 
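  // (x + sign) ^ sign, with sign = x >> (bits - 1), is the branchless |x|;
  // the quotient's sign is LHSign ^ RHSign and the remainder takes LHS's sign,
  // which is reapplied after the unsigned expansion below.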
3232   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3233   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3234 
3235   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3236   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3237 
3238   Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3239   switch (MI.getOpcode()) {
3240   default:
3241     llvm_unreachable("Unexpected opcode!");
3242   case AMDGPU::G_SDIV: {
3243     DstDivReg = MI.getOperand(0).getReg();
3244     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3245     break;
3246   }
3247   case AMDGPU::G_SREM: {
3248     DstRemReg = MI.getOperand(0).getReg();
3249     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3250     break;
3251   }
3252   case AMDGPU::G_SDIVREM: {
3253     DstDivReg = MI.getOperand(0).getReg();
3254     DstRemReg = MI.getOperand(1).getReg();
3255     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3256     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3257     break;
3258   }
3259   }
3260 
3261   if (Ty == S32)
3262     legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3263   else
3264     legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
3265 
3266   if (DstDivReg) {
3267     auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3268     auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3269     B.buildSub(DstDivReg, SignXor, Sign);
3270   }
3271 
3272   if (DstRemReg) {
3273     auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3274     auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3275     B.buildSub(DstRemReg, SignXor, Sign);
3276   }
3277 
3278   MI.eraseFromParent();
3279   return true;
3280 }
3281 
3282 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3283                                                  MachineRegisterInfo &MRI,
3284                                                  MachineIRBuilder &B) const {
3285   Register Res = MI.getOperand(0).getReg();
3286   Register LHS = MI.getOperand(1).getReg();
3287   Register RHS = MI.getOperand(2).getReg();
3288   uint16_t Flags = MI.getFlags();
3289   LLT ResTy = MRI.getType(Res);
3290 
3291   const MachineFunction &MF = B.getMF();
3292   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3293                             MI.getFlag(MachineInstr::FmAfn);
3294 
3295   if (!AllowInaccurateRcp)
3296     return false;
3297 
3298   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3299     // 1 / x -> RCP(x)
3300     if (CLHS->isExactlyValue(1.0)) {
3301       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3302         .addUse(RHS)
3303         .setMIFlags(Flags);
3304 
3305       MI.eraseFromParent();
3306       return true;
3307     }
3308 
3309     // -1 / x -> RCP( FNEG(x) )
3310     if (CLHS->isExactlyValue(-1.0)) {
3311       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3312       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3313         .addUse(FNeg.getReg(0))
3314         .setMIFlags(Flags);
3315 
3316       MI.eraseFromParent();
3317       return true;
3318     }
3319   }
3320 
3321   // x / y -> x * (1.0 / y)
3322   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3323     .addUse(RHS)
3324     .setMIFlags(Flags);
3325   B.buildFMul(Res, LHS, RCP, Flags);
3326 
3327   MI.eraseFromParent();
3328   return true;
3329 }
3330 
3331 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3332                                                    MachineRegisterInfo &MRI,
3333                                                    MachineIRBuilder &B) const {
3334   Register Res = MI.getOperand(0).getReg();
3335   Register X = MI.getOperand(1).getReg();
3336   Register Y = MI.getOperand(2).getReg();
3337   uint16_t Flags = MI.getFlags();
3338   LLT ResTy = MRI.getType(Res);
3339 
3340   const MachineFunction &MF = B.getMF();
3341   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3342                             MI.getFlag(MachineInstr::FmAfn);
3343 
3344   if (!AllowInaccurateRcp)
3345     return false;
3346 
3347   auto NegY = B.buildFNeg(ResTy, Y);
3348   auto One = B.buildFConstant(ResTy, 1.0);
3349 
3350   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3351     .addUse(Y)
3352     .setMIFlags(Flags);
3353 
3354   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3355   R = B.buildFMA(ResTy, Tmp0, R, R);
3356 
3357   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3358   R = B.buildFMA(ResTy, Tmp1, R, R);
3359 
3360   auto Ret = B.buildFMul(ResTy, X, R);
3361   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3362 
3363   B.buildFMA(Res, Tmp2, R, Ret);
3364   MI.eraseFromParent();
3365   return true;
3366 }
3367 
3368 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3369                                          MachineRegisterInfo &MRI,
3370                                          MachineIRBuilder &B) const {
3371   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3372     return true;
3373 
3374   Register Res = MI.getOperand(0).getReg();
3375   Register LHS = MI.getOperand(1).getReg();
3376   Register RHS = MI.getOperand(2).getReg();
3377 
3378   uint16_t Flags = MI.getFlags();
3379 
3380   LLT S16 = LLT::scalar(16);
3381   LLT S32 = LLT::scalar(32);
3382 
3383   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3384   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3385 
3386   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3387     .addUse(RHSExt.getReg(0))
3388     .setMIFlags(Flags);
3389 
3390   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3391   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3392 
3393   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3394     .addUse(RDst.getReg(0))
3395     .addUse(RHS)
3396     .addUse(LHS)
3397     .setMIFlags(Flags);
3398 
3399   MI.eraseFromParent();
3400   return true;
3401 }
3402 
3403 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3404 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
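// On subtargets with S_DENORM_MODE, the 4-bit immediate packs the FP32 control
// into its low two bits and the FP64/FP16 control into the next two. Otherwise
// the FP32 bits are written with S_SETREG into the MODE hardware register
// (offset 4, width 2), as encoded below.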
3405 static void toggleSPDenormMode(bool Enable,
3406                                MachineIRBuilder &B,
3407                                const GCNSubtarget &ST,
3408                                AMDGPU::SIModeRegisterDefaults Mode) {
3409   // Set SP denorm mode to this value.
3410   unsigned SPDenormMode =
3411     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3412 
3413   if (ST.hasDenormModeInst()) {
3414     // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
3415     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3416 
3417     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3418     B.buildInstr(AMDGPU::S_DENORM_MODE)
3419       .addImm(NewDenormModeValue);
3420 
3421   } else {
3422     // Select FP32 bit field in mode register.
3423     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3424                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3425                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3426 
3427     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3428       .addImm(SPDenormMode)
3429       .addImm(SPDenormModeBitField);
3430   }
3431 }
3432 
3433 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3434                                          MachineRegisterInfo &MRI,
3435                                          MachineIRBuilder &B) const {
3436   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3437     return true;
3438 
3439   Register Res = MI.getOperand(0).getReg();
3440   Register LHS = MI.getOperand(1).getReg();
3441   Register RHS = MI.getOperand(2).getReg();
3442   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3443   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3444 
3445   uint16_t Flags = MI.getFlags();
3446 
3447   LLT S32 = LLT::scalar(32);
3448   LLT S1 = LLT::scalar(1);
3449 
3450   auto One = B.buildFConstant(S32, 1.0f);
3451 
3452   auto DenominatorScaled =
3453     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3454       .addUse(LHS)
3455       .addUse(RHS)
3456       .addImm(0)
3457       .setMIFlags(Flags);
3458   auto NumeratorScaled =
3459     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3460       .addUse(LHS)
3461       .addUse(RHS)
3462       .addImm(1)
3463       .setMIFlags(Flags);
3464 
3465   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3466     .addUse(DenominatorScaled.getReg(0))
3467     .setMIFlags(Flags);
3468   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3469 
3470   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3471   // aren't modeled as reading it.
3472   if (!Mode.allFP32Denormals())
3473     toggleSPDenormMode(true, B, ST, Mode);
3474 
3475   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3476   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3477   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3478   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3479   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3480   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3481 
3482   if (!Mode.allFP32Denormals())
3483     toggleSPDenormMode(false, B, ST, Mode);
3484 
3485   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3486     .addUse(Fma4.getReg(0))
3487     .addUse(Fma1.getReg(0))
3488     .addUse(Fma3.getReg(0))
3489     .addUse(NumeratorScaled.getReg(1))
3490     .setMIFlags(Flags);
3491 
3492   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3493     .addUse(Fmas.getReg(0))
3494     .addUse(RHS)
3495     .addUse(LHS)
3496     .setMIFlags(Flags);
3497 
3498   MI.eraseFromParent();
3499   return true;
3500 }
3501 
3502 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3503                                          MachineRegisterInfo &MRI,
3504                                          MachineIRBuilder &B) const {
3505   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3506     return true;
3507 
3508   Register Res = MI.getOperand(0).getReg();
3509   Register LHS = MI.getOperand(1).getReg();
3510   Register RHS = MI.getOperand(2).getReg();
3511 
3512   uint16_t Flags = MI.getFlags();
3513 
3514   LLT S64 = LLT::scalar(64);
3515   LLT S1 = LLT::scalar(1);
3516 
3517   auto One = B.buildFConstant(S64, 1.0);
3518 
3519   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3520     .addUse(LHS)
3521     .addUse(RHS)
3522     .addImm(0)
3523     .setMIFlags(Flags);
3524 
3525   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3526 
3527   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3528     .addUse(DivScale0.getReg(0))
3529     .setMIFlags(Flags);
3530 
3531   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3532   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3533   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3534 
3535   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3536     .addUse(LHS)
3537     .addUse(RHS)
3538     .addImm(1)
3539     .setMIFlags(Flags);
3540 
3541   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3542   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3543   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3544 
3545   Register Scale;
3546   if (!ST.hasUsableDivScaleConditionOutput()) {
3547     // Workaround a hardware bug on SI where the condition output from div_scale
3548     // is not usable.
3549 
3550     LLT S32 = LLT::scalar(32);
3551 
3552     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3553     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3554     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3555     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3556 
3557     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3558                               Scale1Unmerge.getReg(1));
3559     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3560                               Scale0Unmerge.getReg(1));
3561     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3562   } else {
3563     Scale = DivScale1.getReg(1);
3564   }
3565 
3566   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3567     .addUse(Fma4.getReg(0))
3568     .addUse(Fma3.getReg(0))
3569     .addUse(Mul.getReg(0))
3570     .addUse(Scale)
3571     .setMIFlags(Flags);
3572 
3573   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3574     .addUse(Fmas.getReg(0))
3575     .addUse(RHS)
3576     .addUse(LHS)
3577     .setMIFlags(Flags);
3578 
3579   MI.eraseFromParent();
3580   return true;
3581 }
3582 
3583 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3584                                                  MachineRegisterInfo &MRI,
3585                                                  MachineIRBuilder &B) const {
3586   Register Res = MI.getOperand(0).getReg();
3587   Register LHS = MI.getOperand(2).getReg();
3588   Register RHS = MI.getOperand(3).getReg();
3589   uint16_t Flags = MI.getFlags();
3590 
3591   LLT S32 = LLT::scalar(32);
3592   LLT S1 = LLT::scalar(1);
3593 
3594   auto Abs = B.buildFAbs(S32, RHS, Flags);
3595   const APFloat C0Val(1.0f);
3596 
3597   auto C0 = B.buildConstant(S32, 0x6f800000);
3598   auto C1 = B.buildConstant(S32, 0x2f800000);
3599   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3600 
3601   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3602   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3603 
3604   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3605 
3606   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3607     .addUse(Mul0.getReg(0))
3608     .setMIFlags(Flags);
3609 
3610   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3611 
3612   B.buildFMul(Res, Sel, Mul1, Flags);
3613 
3614   MI.eraseFromParent();
3615   return true;
3616 }
3617 
3618 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3619 // FIXME: Why do we handle this one but not other removed instructions?
3620 //
3621 // Reciprocal square root.  The clamp prevents infinite results, clamping
3622 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3623 // +-max_float.
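// In other words (a sketch): dst = fmax(fmin(rsq(src), +max_float), -max_float),
// using the IEEE or non-IEEE min/max variants depending on the function's mode.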
3624 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3625                                                     MachineRegisterInfo &MRI,
3626                                                     MachineIRBuilder &B) const {
3627   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3628     return true;
3629 
3630   Register Dst = MI.getOperand(0).getReg();
3631   Register Src = MI.getOperand(2).getReg();
3632   auto Flags = MI.getFlags();
3633 
3634   LLT Ty = MRI.getType(Dst);
3635 
3636   const fltSemantics *FltSemantics;
3637   if (Ty == LLT::scalar(32))
3638     FltSemantics = &APFloat::IEEEsingle();
3639   else if (Ty == LLT::scalar(64))
3640     FltSemantics = &APFloat::IEEEdouble();
3641   else
3642     return false;
3643 
3644   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3645     .addUse(Src)
3646     .setMIFlags(Flags);
3647 
3648   // We don't need to concern ourselves with the snan handling difference, as
3649   // the rsq has already quieted (or not); use the variant that selects directly.
3650   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3651   const bool UseIEEE = MFI->getMode().IEEE;
3652 
3653   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3654   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3655                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3656 
3657   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3658 
3659   if (UseIEEE)
3660     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3661   else
3662     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3663   MI.eraseFromParent();
3664   return true;
3665 }
3666 
3667 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3668   switch (IID) {
3669   case Intrinsic::amdgcn_ds_fadd:
3670     return AMDGPU::G_ATOMICRMW_FADD;
3671   case Intrinsic::amdgcn_ds_fmin:
3672     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3673   case Intrinsic::amdgcn_ds_fmax:
3674     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3675   default:
3676     llvm_unreachable("not a DS FP intrinsic");
3677   }
3678 }
3679 
3680 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3681                                                       MachineInstr &MI,
3682                                                       Intrinsic::ID IID) const {
3683   GISelChangeObserver &Observer = Helper.Observer;
3684   Observer.changingInstr(MI);
3685 
3686   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3687 
3688   // The remaining operands were used to set fields in the MemOperand on
3689   // construction.
3690   for (int I = 6; I > 3; --I)
3691     MI.RemoveOperand(I);
3692 
3693   MI.RemoveOperand(1); // Remove the intrinsic ID.
3694   Observer.changedInstr(MI);
3695   return true;
3696 }
3697 
3698 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3699                                             MachineRegisterInfo &MRI,
3700                                             MachineIRBuilder &B) const {
3701   uint64_t Offset =
3702     ST.getTargetLowering()->getImplicitParameterOffset(
3703       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3704   LLT DstTy = MRI.getType(DstReg);
3705   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3706 
3707   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3708   if (!loadInputValue(KernargPtrReg, B,
3709                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3710     return false;
3711 
3712   // FIXME: This should be nuw
3713   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3714   return true;
3715 }
3716 
3717 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3718                                                  MachineRegisterInfo &MRI,
3719                                                  MachineIRBuilder &B) const {
3720   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3721   if (!MFI->isEntryFunction()) {
3722     return legalizePreloadedArgIntrin(MI, MRI, B,
3723                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3724   }
3725 
3726   Register DstReg = MI.getOperand(0).getReg();
3727   if (!getImplicitArgPtr(DstReg, MRI, B))
3728     return false;
3729 
3730   MI.eraseFromParent();
3731   return true;
3732 }
3733 
3734 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3735                                               MachineRegisterInfo &MRI,
3736                                               MachineIRBuilder &B,
3737                                               unsigned AddrSpace) const {
3738   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3739   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3740   Register Hi32 = Unmerge.getReg(1);
3741 
3742   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3743   MI.eraseFromParent();
3744   return true;
3745 }
3746 
3747 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3748 // offset (the offset that is included in bounds checking and swizzling, to be
3749 // split between the instruction's voffset and immoffset fields) and soffset
3750 // (the offset that is excluded from bounds checking and swizzling, to go in
3751 // the instruction's soffset field).  This function takes the first kind of
3752 // offset and figures out how to split it between voffset and immoffset.
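// For example, an offset of %base + 5000 splits into an immoffset of 904 and a
// voffset of %base + 4096, keeping the vgpr contribution a multiple of 4096.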
3753 std::pair<Register, unsigned>
3754 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3755                                         Register OrigOffset) const {
3756   const unsigned MaxImm = 4095;
3757   Register BaseReg;
3758   unsigned ImmOffset;
3759   const LLT S32 = LLT::scalar(32);
3760   MachineRegisterInfo &MRI = *B.getMRI();
3761 
3762   std::tie(BaseReg, ImmOffset) =
3763       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
3764 
3765   // If BaseReg is a pointer, convert it to int.
3766   if (MRI.getType(BaseReg).isPointer())
3767     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
3768 
3769   // If the immediate value is too big for the immoffset field, keep only its
3770   // low 12 bits (value & 4095) there and move the 4096-aligned remainder into
3771   // the voffset, so the value copied/added for the voffset field is a multiple
3772   // of 4096 and stands more chance of being CSEd with another load/store's.
3773   // However, do not do that rounding down to a multiple of 4096 if that is a
3774   // negative number, as it appears to be illegal to have a negative offset
3775   // in the vgpr, even if adding the immediate offset makes it positive.
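  // For example, a constant offset of -8 would naively become a vgpr part of
  // -4096 with an immoffset of 4088; instead the whole -8 goes into the voffset
  // add and the immoffset becomes 0.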
3776   unsigned Overflow = ImmOffset & ~MaxImm;
3777   ImmOffset -= Overflow;
3778   if ((int32_t)Overflow < 0) {
3779     Overflow += ImmOffset;
3780     ImmOffset = 0;
3781   }
3782 
3783   if (Overflow != 0) {
3784     if (!BaseReg) {
3785       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3786     } else {
3787       auto OverflowVal = B.buildConstant(S32, Overflow);
3788       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3789     }
3790   }
3791 
3792   if (!BaseReg)
3793     BaseReg = B.buildConstant(S32, 0).getReg(0);
3794 
3795   return std::make_pair(BaseReg, ImmOffset);
3796 }
3797 
3798 /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
3799 void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
3800                                           Register VOffset, Register SOffset,
3801                                           unsigned ImmOffset, Register VIndex,
3802                                           MachineRegisterInfo &MRI) const {
3803   Optional<ValueAndVReg> MaybeVOffsetVal =
3804       getIConstantVRegValWithLookThrough(VOffset, MRI);
3805   Optional<ValueAndVReg> MaybeSOffsetVal =
3806       getIConstantVRegValWithLookThrough(SOffset, MRI);
3807   Optional<ValueAndVReg> MaybeVIndexVal =
3808       getIConstantVRegValWithLookThrough(VIndex, MRI);
3809   // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
3810   // update the MMO with that offset. The stride is unknown so we can only do
3811   // this if VIndex is constant 0.
3812   if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
3813       MaybeVIndexVal->Value == 0) {
3814     uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
3815                            MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
3816     MMO->setOffset(TotalOffset);
3817   } else {
3818     // We don't have a constant combined offset to use in the MMO. Give up.
3819     MMO->setValue((Value *)nullptr);
3820   }
3821 }
3822 
3823 /// Handle register layout difference for f16 images for some subtargets.
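/// For example (a sketch): with unpacked D16 VMEM, a <4 x s16> source becomes a
/// <4 x s32> built from G_ANYEXTs of the unmerged elements, one per dword, while
/// packed subtargets mostly just pad <3 x s16> out to <4 x s16> (with extra
/// repacking on subtargets that have the image-store D16 bug).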
3824 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3825                                              MachineRegisterInfo &MRI,
3826                                              Register Reg,
3827                                              bool ImageStore) const {
3828   const LLT S16 = LLT::scalar(16);
3829   const LLT S32 = LLT::scalar(32);
3830   LLT StoreVT = MRI.getType(Reg);
3831   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3832 
3833   if (ST.hasUnpackedD16VMem()) {
3834     auto Unmerge = B.buildUnmerge(S16, Reg);
3835 
3836     SmallVector<Register, 4> WideRegs;
3837     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3838       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3839 
3840     int NumElts = StoreVT.getNumElements();
3841 
3842     return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
3843         .getReg(0);
3844   }
3845 
3846   if (ImageStore && ST.hasImageStoreD16Bug()) {
3847     if (StoreVT.getNumElements() == 2) {
3848       SmallVector<Register, 4> PackedRegs;
3849       Reg = B.buildBitcast(S32, Reg).getReg(0);
3850       PackedRegs.push_back(Reg);
3851       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3852       return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
3853           .getReg(0);
3854     }
3855 
3856     if (StoreVT.getNumElements() == 3) {
3857       SmallVector<Register, 4> PackedRegs;
3858       auto Unmerge = B.buildUnmerge(S16, Reg);
3859       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3860         PackedRegs.push_back(Unmerge.getReg(I));
3861       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3862       Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
3863       return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
3864     }
3865 
3866     if (StoreVT.getNumElements() == 4) {
3867       SmallVector<Register, 4> PackedRegs;
3868       Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
3869       auto Unmerge = B.buildUnmerge(S32, Reg);
3870       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3871         PackedRegs.push_back(Unmerge.getReg(I));
3872       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3873       return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
3874           .getReg(0);
3875     }
3876 
3877     llvm_unreachable("invalid data type");
3878   }
3879 
3880   if (StoreVT == LLT::fixed_vector(3, S16)) {
3881     Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
3882               .getReg(0);
3883   }
3884   return Reg;
3885 }
3886 
3887 Register AMDGPULegalizerInfo::fixStoreSourceType(
3888   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3889   MachineRegisterInfo *MRI = B.getMRI();
3890   LLT Ty = MRI->getType(VData);
3891 
3892   const LLT S16 = LLT::scalar(16);
3893 
3894   // Fixup illegal register types for i8 and i16 stores.
3895   if (Ty == LLT::scalar(8) || Ty == S16) {
3896     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3897     return AnyExt;
3898   }
3899 
3900   if (Ty.isVector()) {
3901     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3902       if (IsFormat)
3903         return handleD16VData(B, *MRI, VData);
3904     }
3905   }
3906 
3907   return VData;
3908 }
3909 
3910 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3911                                               MachineRegisterInfo &MRI,
3912                                               MachineIRBuilder &B,
3913                                               bool IsTyped,
3914                                               bool IsFormat) const {
3915   Register VData = MI.getOperand(1).getReg();
3916   LLT Ty = MRI.getType(VData);
3917   LLT EltTy = Ty.getScalarType();
3918   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3919   const LLT S32 = LLT::scalar(32);
3920 
3921   VData = fixStoreSourceType(B, VData, IsFormat);
3922   Register RSrc = MI.getOperand(2).getReg();
3923 
3924   MachineMemOperand *MMO = *MI.memoperands_begin();
3925   const int MemSize = MMO->getSize();
3926 
3927   unsigned ImmOffset;
3928 
3929   // The typed intrinsics add an immediate after the registers.
3930   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3931 
3932   // The struct intrinsic variants add one additional operand over raw.
3933   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3934   Register VIndex;
3935   int OpOffset = 0;
3936   if (HasVIndex) {
3937     VIndex = MI.getOperand(3).getReg();
3938     OpOffset = 1;
3939   } else {
3940     VIndex = B.buildConstant(S32, 0).getReg(0);
3941   }
3942 
3943   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3944   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3945 
3946   unsigned Format = 0;
3947   if (IsTyped) {
3948     Format = MI.getOperand(5 + OpOffset).getImm();
3949     ++OpOffset;
3950   }
3951 
3952   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3953 
3954   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
3955   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
3956 
3957   unsigned Opc;
3958   if (IsTyped) {
3959     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3960                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3961   } else if (IsFormat) {
3962     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3963                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3964   } else {
3965     switch (MemSize) {
3966     case 1:
3967       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3968       break;
3969     case 2:
3970       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3971       break;
3972     default:
3973       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3974       break;
3975     }
3976   }
3977 
3978   auto MIB = B.buildInstr(Opc)
3979     .addUse(VData)              // vdata
3980     .addUse(RSrc)               // rsrc
3981     .addUse(VIndex)             // vindex
3982     .addUse(VOffset)            // voffset
3983     .addUse(SOffset)            // soffset
3984     .addImm(ImmOffset);         // offset(imm)
3985 
3986   if (IsTyped)
3987     MIB.addImm(Format);
3988 
3989   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3990      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3991      .addMemOperand(MMO);
3992 
3993   MI.eraseFromParent();
3994   return true;
3995 }
3996 
3997 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3998                                              MachineRegisterInfo &MRI,
3999                                              MachineIRBuilder &B,
4000                                              bool IsFormat,
4001                                              bool IsTyped) const {
4002   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
4003   MachineMemOperand *MMO = *MI.memoperands_begin();
4004   const LLT MemTy = MMO->getMemoryType();
4005   const LLT S32 = LLT::scalar(32);
4006 
4007   Register Dst = MI.getOperand(0).getReg();
4008   Register RSrc = MI.getOperand(2).getReg();
4009 
4010   // The typed intrinsics add an immediate after the registers.
4011   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
4012 
4013   // The struct intrinsic variants add one additional operand over raw.
4014   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4015   Register VIndex;
4016   int OpOffset = 0;
4017   if (HasVIndex) {
4018     VIndex = MI.getOperand(3).getReg();
4019     OpOffset = 1;
4020   } else {
4021     VIndex = B.buildConstant(S32, 0).getReg(0);
4022   }
4023 
4024   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
4025   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
4026 
4027   unsigned Format = 0;
4028   if (IsTyped) {
4029     Format = MI.getOperand(5 + OpOffset).getImm();
4030     ++OpOffset;
4031   }
4032 
4033   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
4034   unsigned ImmOffset;
4035 
4036   LLT Ty = MRI.getType(Dst);
4037   LLT EltTy = Ty.getScalarType();
4038   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
4039   const bool Unpacked = ST.hasUnpackedD16VMem();
4040 
4041   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4042   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
4043 
4044   unsigned Opc;
4045 
4046   if (IsTyped) {
4047     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
4048                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
4049   } else if (IsFormat) {
4050     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
4051                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
4052   } else {
4053     switch (MemTy.getSizeInBits()) {
4054     case 8:
4055       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
4056       break;
4057     case 16:
4058       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
4059       break;
4060     default:
4061       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
4062       break;
4063     }
4064   }
4065 
4066   Register LoadDstReg;
4067 
4068   bool IsExtLoad =
4069       (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
4070   LLT UnpackedTy = Ty.changeElementSize(32);
4071 
4072   if (IsExtLoad)
4073     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
4074   else if (Unpacked && IsD16 && Ty.isVector())
4075     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
4076   else
4077     LoadDstReg = Dst;
4078 
4079   auto MIB = B.buildInstr(Opc)
4080     .addDef(LoadDstReg)         // vdata
4081     .addUse(RSrc)               // rsrc
4082     .addUse(VIndex)             // vindex
4083     .addUse(VOffset)            // voffset
4084     .addUse(SOffset)            // soffset
4085     .addImm(ImmOffset);         // offset(imm)
4086 
4087   if (IsTyped)
4088     MIB.addImm(Format);
4089 
4090   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4091      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4092      .addMemOperand(MMO);
4093 
4094   if (LoadDstReg != Dst) {
4095     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
4096 
4097     // The result was widened for an extending load; truncate it back down.
4098     if (IsExtLoad)
4099       B.buildTrunc(Dst, LoadDstReg);
4100     else {
4101       // Repack to original 16-bit vector result
4102       // FIXME: G_TRUNC should work, but legalization currently fails
4103       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
4104       SmallVector<Register, 4> Repack;
4105       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
4106         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
4107       B.buildMerge(Dst, Repack);
4108     }
4109   }
4110 
4111   MI.eraseFromParent();
4112   return true;
4113 }
4114 
4115 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
4116                                                MachineIRBuilder &B,
4117                                                bool IsInc) const {
4118   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
4119                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
4120   B.buildInstr(Opc)
4121     .addDef(MI.getOperand(0).getReg())
4122     .addUse(MI.getOperand(2).getReg())
4123     .addUse(MI.getOperand(3).getReg())
4124     .cloneMemRefs(MI);
4125   MI.eraseFromParent();
4126   return true;
4127 }
4128 
4129 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
4130   switch (IntrID) {
4131   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4132   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4133     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
4134   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4135   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4136     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
4137   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4138   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4139     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
4140   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4141   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4142     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
4143   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4144   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4145     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
4146   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4147   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4148     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
4149   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4150   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4151     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
4152   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4153   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4154     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
4155   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4156   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4157     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
4158   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4159   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4160     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
4161   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4162   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4163     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
4164   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4165   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4166     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
4167   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4168   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4169     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4170   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4171   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4172     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4173   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4174   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4175     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4176   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4177   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4178     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
4179   default:
4180     llvm_unreachable("unhandled atomic opcode");
4181   }
4182 }
4183 
4184 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
4185                                                MachineIRBuilder &B,
4186                                                Intrinsic::ID IID) const {
4187   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
4188                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4189   const bool HasReturn = MI.getNumExplicitDefs() != 0;
4190 
4191   Register Dst;
4192 
4193   int OpOffset = 0;
4194   if (HasReturn) {
4195     // A few FP atomics do not support return values.
4196     Dst = MI.getOperand(0).getReg();
4197   } else {
4198     OpOffset = -1;
4199   }
4200 
4201   Register VData = MI.getOperand(2 + OpOffset).getReg();
4202   Register CmpVal;
4203 
4204   if (IsCmpSwap) {
4205     CmpVal = MI.getOperand(3 + OpOffset).getReg();
4206     ++OpOffset;
4207   }
4208 
4209   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4210   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
4211 
4212   // The struct intrinsic variants add one additional operand over raw.
4213   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
4214   Register VIndex;
4215   if (HasVIndex) {
4216     VIndex = MI.getOperand(4 + OpOffset).getReg();
4217     ++OpOffset;
4218   } else {
4219     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
4220   }
4221 
4222   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
4223   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
4224   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
4225 
4226   MachineMemOperand *MMO = *MI.memoperands_begin();
4227 
4228   unsigned ImmOffset;
4229   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4230   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());
4231 
4232   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4233 
4234   if (HasReturn)
4235     MIB.addDef(Dst);
4236 
4237   MIB.addUse(VData); // vdata
4238 
4239   if (IsCmpSwap)
4240     MIB.addReg(CmpVal);
4241 
4242   MIB.addUse(RSrc)               // rsrc
4243      .addUse(VIndex)             // vindex
4244      .addUse(VOffset)            // voffset
4245      .addUse(SOffset)            // soffset
4246      .addImm(ImmOffset)          // offset(imm)
4247      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
4248      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4249      .addMemOperand(MMO);
4250 
4251   MI.eraseFromParent();
4252   return true;
4253 }
4254 
4255 /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
4256 /// vector with s16 typed elements.
4257 static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4258                                       SmallVectorImpl<Register> &PackedAddrs,
4259                                       unsigned ArgOffset,
4260                                       const AMDGPU::ImageDimIntrinsicInfo *Intr,
4261                                       bool IsA16, bool IsG16) {
4262   const LLT S16 = LLT::scalar(16);
4263   const LLT V2S16 = LLT::fixed_vector(2, 16);
4264   auto EndIdx = Intr->VAddrEnd;
4265 
4266   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4267     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4268     if (!SrcOp.isReg())
4269       continue; // _L to _LZ may have eliminated this.
4270 
4271     Register AddrReg = SrcOp.getReg();
4272 
4273     if ((I < Intr->GradientStart) ||
4274         (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4275         (I >= Intr->CoordStart && !IsA16)) {
4276       if ((I < Intr->GradientStart) && IsA16 &&
4277           (B.getMRI()->getType(AddrReg) == S16)) {
4278         assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
4279         // Special handling of bias when A16 is on. Bias is of type half but
4280         // occupies a full 32-bit slot.
4281         PackedAddrs.push_back(
4282             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4283                 .getReg(0));
4284       } else {
4285         assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
4286                "Bias needs to be converted to 16 bit in A16 mode");
4287         // Handle any gradient or coordinate operands that should not be packed
4288         AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4289         PackedAddrs.push_back(AddrReg);
4290       }
4291     } else {
4292       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4293       // derivatives dx/dh and dx/dv are packed with undef.
4294       if (((I + 1) >= EndIdx) ||
4295           ((Intr->NumGradients / 2) % 2 == 1 &&
4296            (I == static_cast<unsigned>(Intr->GradientStart +
4297                                        (Intr->NumGradients / 2) - 1) ||
4298             I == static_cast<unsigned>(Intr->GradientStart +
4299                                        Intr->NumGradients - 1))) ||
4300           // Check for _L to _LZ optimization
4301           !MI.getOperand(ArgOffset + I + 1).isReg()) {
4302         PackedAddrs.push_back(
4303             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4304                 .getReg(0));
4305       } else {
4306         PackedAddrs.push_back(
4307             B.buildBuildVector(
4308                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4309                 .getReg(0));
4310         ++I;
4311       }
4312     }
4313   }
4314 }
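// Illustrative sketch (hand-written, register names made up): for an a16 2-D
// sample with s16 coordinates %s and %t, the loop above appends a single
// packed register,
//
//   %st:_(<2 x s16>) = G_BUILD_VECTOR %s(s16), %t(s16)
//
// whereas a trailing odd coordinate (or the odd trailing gradient component)
// is paired with an undef half:
//
//   %last:_(<2 x s16>) = G_BUILD_VECTOR %r(s16), %undef(s16)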
4315 
4316 /// Convert from separate vaddr components to a single vector address register,
4317 /// and replace the remaining operands with $noreg.
4318 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4319                                      int DimIdx, int NumVAddrs) {
4320   const LLT S32 = LLT::scalar(32);
4321 
4322   SmallVector<Register, 8> AddrRegs;
4323   for (int I = 0; I != NumVAddrs; ++I) {
4324     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4325     if (SrcOp.isReg()) {
4326       AddrRegs.push_back(SrcOp.getReg());
4327       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4328     }
4329   }
4330 
4331   int NumAddrRegs = AddrRegs.size();
4332   if (NumAddrRegs != 1) {
4333     // Above 8 elements, round up to the next power of 2 (i.e. 16).
4334     if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
4335       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4336       auto Undef = B.buildUndef(S32);
4337       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4338       NumAddrRegs = RoundedNumRegs;
4339     }
4340 
4341     auto VAddr =
4342         B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
4343     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4344   }
4345 
4346   for (int I = 1; I != NumVAddrs; ++I) {
4347     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4348     if (SrcOp.isReg())
4349       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4350   }
4351 }
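// Illustrative sketch (hand-written, register names made up): three s32
// address components %x, %y, %z become
//
//   %vaddr:_(<3 x s32>) = G_BUILD_VECTOR %x, %y, %z
//
// with %vaddr placed in the first vaddr operand and the remaining two vaddr
// operands set to $noreg. For more than 8 components with a non-power-of-2
// count, the vector is first padded with undef elements up to the next power
// of 2 (e.g. 9 -> 16).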
4352 
4353 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4354 ///
4355 /// Depending on the subtarget, load/store with 16-bit element data need to be
4356 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4357 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4358 /// registers.
4359 ///
4360 /// We don't want to directly select image instructions just yet, but we also
4361 /// want to expose all register repacking to the legalizer/combiners. We also
4362 /// don't want a selected instruction entering RegBankSelect. To avoid defining
4363 /// a multitude of intermediate image instructions, directly hack on the
4364 /// intrinsic's arguments. In cases like a16 addresses, this requires padding
4365 /// the now-unnecessary arguments with $noreg.
4366 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4367     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4368     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4369 
4370   const unsigned NumDefs = MI.getNumExplicitDefs();
4371   const unsigned ArgOffset = NumDefs + 1;
4372   bool IsTFE = NumDefs == 2;
4373   // We are only processing the operands of d16 image operations on subtargets
4374   // that use the unpacked register layout, or need to repack the TFE result.
4375 
4376   // TODO: Do we need to guard against already legalized intrinsics?
4377   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4378       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4379 
4380   MachineRegisterInfo *MRI = B.getMRI();
4381   const LLT S32 = LLT::scalar(32);
4382   const LLT S16 = LLT::scalar(16);
4383   const LLT V2S16 = LLT::fixed_vector(2, 16);
4384 
4385   unsigned DMask = 0;
4386   Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
4387   LLT Ty = MRI->getType(VData);
4388 
4389   // Check for 16-bit addresses and pack them if so.
4390   LLT GradTy =
4391       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4392   LLT AddrTy =
4393       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4394   const bool IsG16 = GradTy == S16;
4395   const bool IsA16 = AddrTy == S16;
4396   const bool IsD16 = Ty.getScalarType() == S16;
4397 
4398   int DMaskLanes = 0;
4399   if (!BaseOpcode->Atomic) {
4400     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4401     if (BaseOpcode->Gather4) {
4402       DMaskLanes = 4;
4403     } else if (DMask != 0) {
4404       DMaskLanes = countPopulation(DMask);
4405     } else if (!IsTFE && !BaseOpcode->Store) {
4406       // If dmask is 0, this is a no-op load. This can be eliminated.
4407       B.buildUndef(MI.getOperand(0));
4408       MI.eraseFromParent();
4409       return true;
4410     }
4411   }
4412 
4413   Observer.changingInstr(MI);
4414   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4415 
4416   const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
4417                                      : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
4418   const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
4419                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4420   unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;
4421 
4422   // Track that we legalized this
4423   MI.setDesc(B.getTII().get(NewOpcode));
4424 
4425   // Expecting to get an error flag since TFE is on and dmask is 0. Force
4426   // dmask to be at least 1, otherwise the instruction will fail.
4427   if (IsTFE && DMask == 0) {
4428     DMask = 0x1;
4429     DMaskLanes = 1;
4430     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4431   }
4432 
4433   if (BaseOpcode->Atomic) {
4434     Register VData0 = MI.getOperand(2).getReg();
4435     LLT Ty = MRI->getType(VData0);
4436 
4437     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4438     if (Ty.isVector())
4439       return false;
4440 
4441     if (BaseOpcode->AtomicX2) {
4442       Register VData1 = MI.getOperand(3).getReg();
4443       // The two values are packed in one register.
4444       LLT PackedTy = LLT::fixed_vector(2, Ty);
4445       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4446       MI.getOperand(2).setReg(Concat.getReg(0));
4447       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4448     }
4449   }
4450 
4451   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4452 
4453   // Rewrite the addressing register layout before doing anything else.
4454   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
4455     // 16-bit gradients are supported, but they are tied to the A16 control,
4456     // so both gradients and addresses must be 16 bit.
4457     return false;
4458   }
4459 
4460   if (IsA16 && !ST.hasA16()) {
4461     // A16 not supported
4462     return false;
4463   }
4464 
4465   if (IsA16 || IsG16) {
4466     if (Intr->NumVAddrs > 1) {
4467       SmallVector<Register, 4> PackedRegs;
4468 
4469       packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4470                                 IsG16);
4471 
4472       // See also below in the non-a16 branch
4473       const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
4474                           PackedRegs.size() <= ST.getNSAMaxSize();
4475 
4476       if (!UseNSA && PackedRegs.size() > 1) {
4477         LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
4478         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4479         PackedRegs[0] = Concat.getReg(0);
4480         PackedRegs.resize(1);
4481       }
4482 
4483       const unsigned NumPacked = PackedRegs.size();
4484       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4485         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4486         if (!SrcOp.isReg()) {
4487           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4488           continue;
4489         }
4490 
4491         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4492 
4493         if (I - Intr->VAddrStart < NumPacked)
4494           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4495         else
4496           SrcOp.setReg(AMDGPU::NoRegister);
4497       }
4498     }
4499   } else {
4500     // If the register allocator cannot place the address registers contiguously
4501     // without introducing moves, then using the non-sequential address encoding
4502     // is always preferable, since it saves VALU instructions and is usually a
4503     // wash in terms of code size or even better.
4504     //
4505     // However, we currently have no way of hinting to the register allocator
4506     // that MIMG addresses should be placed contiguously when it is possible to
4507     // do so, so force non-NSA for the common 2-address case as a heuristic.
4508     //
4509     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4510     // allocation when possible.
4511     const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
4512                         CorrectedNumVAddrs <= ST.getNSAMaxSize();
4513 
4514     if (!UseNSA && Intr->NumVAddrs > 1)
4515       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4516                                Intr->NumVAddrs);
4517   }
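  // In concrete terms (illustrative only): with 2 address dwords the
  // addresses are always packed into a single <2 x s32> vector; with 3 up to
  // the NSA maximum on an NSA-capable subtarget the separate registers are
  // left as-is; beyond that limit, or without NSA, they are packed into one
  // vector operand.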
4518 
4519   int Flags = 0;
4520   if (IsA16)
4521     Flags |= 1;
4522   if (IsG16)
4523     Flags |= 2;
4524   MI.addOperand(MachineOperand::CreateImm(Flags));
4525 
4526   if (BaseOpcode->Store) { // No TFE for stores?
4527     // TODO: Handle dmask trim
4528     if (!Ty.isVector() || !IsD16)
4529       return true;
4530 
4531     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4532     if (RepackedReg != VData) {
4533       MI.getOperand(1).setReg(RepackedReg);
4534     }
4535 
4536     return true;
4537   }
4538 
4539   Register DstReg = MI.getOperand(0).getReg();
4540   const LLT EltTy = Ty.getScalarType();
4541   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4542 
4543   // Confirm that the return type is large enough for the dmask specified
4544   if (NumElts < DMaskLanes)
4545     return false;
4546 
4547   if (NumElts > 4 || DMaskLanes > 4)
4548     return false;
4549 
4550   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4551   const LLT AdjustedTy =
4552       Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
4553 
4554   // The raw dword aligned data component of the load. The only legal cases
4555   // where this matters should be when using the packed D16 format, for
4556   // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
4557   LLT RoundedTy;
4558 
4559   // S32 vector to cover all data, plus the TFE result element.
4560   LLT TFETy;
4561 
4562   // Register type to use for each loaded component. Will be S32 or V2S16.
4563   LLT RegTy;
4564 
4565   if (IsD16 && ST.hasUnpackedD16VMem()) {
4566     RoundedTy =
4567         LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
4568     TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
4569     RegTy = S32;
4570   } else {
4571     unsigned EltSize = EltTy.getSizeInBits();
4572     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4573     unsigned RoundedSize = 32 * RoundedElts;
4574     RoundedTy = LLT::scalarOrVector(
4575         ElementCount::getFixed(RoundedSize / EltSize), EltSize);
4576     TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
4577     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4578   }
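  // Worked example (illustrative only): a D16 load with 3 dmask lanes, i.e.
  // AdjustedTy = <3 x s16>. On an unpacked-D16 subtarget this gives
  // RoundedTy = <3 x s32>, TFETy = <4 x s32>, RegTy = s32. On a packed-D16
  // subtarget RoundedElts = 2, so RoundedTy = <4 x s16>, TFETy = <3 x s32>,
  // and RegTy = <2 x s16> (or s32 when TFE is in use).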
4579 
4580   // The return type does not need adjustment.
4581   // TODO: Should we change s16 case to s32 or <2 x s16>?
4582   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4583     return true;
4584 
4585   Register Dst1Reg;
4586 
4587   // Insert after the instruction.
4588   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4589 
4590   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4591   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4592   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4593   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4594 
4595   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4596 
4597   MI.getOperand(0).setReg(NewResultReg);
4598 
4599   // In the IR, TFE is supposed to be used with a 2 element struct return
4600   // type. The instruction really returns these two values in one contiguous
4601   // register, with one additional dword beyond the loaded data. Rewrite the
4602   // return type to use a single register result.
4603 
4604   if (IsTFE) {
4605     Dst1Reg = MI.getOperand(1).getReg();
4606     if (MRI->getType(Dst1Reg) != S32)
4607       return false;
4608 
4609     // TODO: Make sure the TFE operand bit is set.
4610     MI.RemoveOperand(1);
4611 
4612     // Handle the easy case that requires no repack instructions.
4613     if (Ty == S32) {
4614       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4615       return true;
4616     }
4617   }
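  // Illustrative sketch (hand-written, register names made up) for the easy
  // s32 TFE case handled above: the {i32, i32} struct return in the IR is
  // rewritten so the instruction produces one <2 x s32> register, which is
  // then split back into the data and status values:
  //
  //   %data:_(s32), %status:_(s32) = G_UNMERGE_VALUES %new:_(<2 x s32>)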
4618 
4619   // Now figure out how to copy the new result register back into the old
4620   // result.
4621   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4622 
4623   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
4624 
4625   if (ResultNumRegs == 1) {
4626     assert(!IsTFE);
4627     ResultRegs[0] = NewResultReg;
4628   } else {
4629     // We have to repack into a new vector of some kind.
4630     for (int I = 0; I != NumDataRegs; ++I)
4631       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4632     B.buildUnmerge(ResultRegs, NewResultReg);
4633 
4634     // Drop the final TFE element to get the data part. The TFE result is
4635     // directly written to the right place already.
4636     if (IsTFE)
4637       ResultRegs.resize(NumDataRegs);
4638   }
4639 
4640   // For an s16 scalar result, we form an s32 result with a truncate regardless
4641   // of packed vs. unpacked.
4642   if (IsD16 && !Ty.isVector()) {
4643     B.buildTrunc(DstReg, ResultRegs[0]);
4644     return true;
4645   }
4646 
4647   // Avoid a build/concat_vector of 1 entry.
4648   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4649     B.buildBitcast(DstReg, ResultRegs[0]);
4650     return true;
4651   }
4652 
4653   assert(Ty.isVector());
4654 
4655   if (IsD16) {
4656     // For packed D16 results with TFE enabled, all the data components are
4657     // S32. Cast back to the expected type.
4658     //
4659     // TODO: We don't really need to load s32 elements. We would only need one
4660     // cast for the TFE result if a multiple of v2s16 was used.
4661     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4662       for (Register &Reg : ResultRegs)
4663         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4664     } else if (ST.hasUnpackedD16VMem()) {
4665       for (Register &Reg : ResultRegs)
4666         Reg = B.buildTrunc(S16, Reg).getReg(0);
4667     }
4668   }
4669 
4670   auto padWithUndef = [&](LLT Ty, int NumElts) {
4671     if (NumElts == 0)
4672       return;
4673     Register Undef = B.buildUndef(Ty).getReg(0);
4674     for (int I = 0; I != NumElts; ++I)
4675       ResultRegs.push_back(Undef);
4676   };
4677 
4678   // Pad out any elements eliminated due to the dmask.
4679   LLT ResTy = MRI->getType(ResultRegs[0]);
4680   if (!ResTy.isVector()) {
4681     padWithUndef(ResTy, NumElts - ResultRegs.size());
4682     B.buildBuildVector(DstReg, ResultRegs);
4683     return true;
4684   }
4685 
4686   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4687   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4688 
4689   // Deal with the one annoying legal case.
4690   const LLT V3S16 = LLT::fixed_vector(3, 16);
4691   if (Ty == V3S16) {
4692     if (IsTFE) {
4693       if (ResultRegs.size() == 1) {
4694         NewResultReg = ResultRegs[0];
4695       } else if (ResultRegs.size() == 2) {
4696         LLT V4S16 = LLT::fixed_vector(4, 16);
4697         NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
4698       } else {
4699         return false;
4700       }
4701     }
4702 
4703     if (MRI->getType(DstReg).getNumElements() <
4704         MRI->getType(NewResultReg).getNumElements()) {
4705       B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
4706     } else {
4707       B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
4708     }
4709     return true;
4710   }
4711 
4712   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4713   B.buildConcatVectors(DstReg, ResultRegs);
4714   return true;
4715 }
4716 
4717 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4718   LegalizerHelper &Helper, MachineInstr &MI) const {
4719   MachineIRBuilder &B = Helper.MIRBuilder;
4720   GISelChangeObserver &Observer = Helper.Observer;
4721 
4722   Register Dst = MI.getOperand(0).getReg();
4723   LLT Ty = B.getMRI()->getType(Dst);
4724   unsigned Size = Ty.getSizeInBits();
4725   MachineFunction &MF = B.getMF();
4726 
4727   Observer.changingInstr(MI);
4728 
4729   if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
4730     Ty = getBitcastRegisterType(Ty);
4731     Helper.bitcastDst(MI, Ty, 0);
4732     Dst = MI.getOperand(0).getReg();
4733     B.setInsertPt(B.getMBB(), MI);
4734   }
4735 
4736   // FIXME: We don't really need this intermediate instruction. The intrinsic
4737   // should be fixed to have a memory operand. Since it's readnone, we're not
4738   // allowed to add one.
4739   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4740   MI.RemoveOperand(1); // Remove intrinsic ID
4741 
4742   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4743   // TODO: Should this use datalayout alignment?
4744   const unsigned MemSize = (Size + 7) / 8;
4745   const Align MemAlign(4);
4746   MachineMemOperand *MMO = MF.getMachineMemOperand(
4747       MachinePointerInfo(),
4748       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4749           MachineMemOperand::MOInvariant,
4750       MemSize, MemAlign);
4751   MI.addMemOperand(MF, MMO);
4752 
4753   // There are no 96-bit result scalar loads, but widening to 128-bit should
4754   // always be legal. We may need to restore this to a 96-bit result if it turns
4755   // out this needs to be converted to a vector load during RegBankSelect.
4756   if (!isPowerOf2_32(Size)) {
4757     if (Ty.isVector())
4758       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4759     else
4760       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4761   }
4762 
4763   Observer.changedInstr(MI);
4764   return true;
4765 }
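// Illustrative sketch (hand-written): an s_buffer_load producing a 96-bit
// result gets a 12-byte, 4-byte-aligned MMO attached and its result widened
// to 128 bits (s128 for a scalar result, <4 x s32> for a <3 x s32> vector),
// since 96 is not a power of 2.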
4766 
4767 // TODO: Move to selection
4768 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4769                                                 MachineRegisterInfo &MRI,
4770                                                 MachineIRBuilder &B) const {
4771   if (!ST.isTrapHandlerEnabled() ||
4772       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4773     return legalizeTrapEndpgm(MI, MRI, B);
4774 
4775   if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4776     switch (*HsaAbiVer) {
4777     case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4778     case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4779       return legalizeTrapHsaQueuePtr(MI, MRI, B);
4780     case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4781     case ELF::ELFABIVERSION_AMDGPU_HSA_V5:
4782       return ST.supportsGetDoorbellID() ?
4783           legalizeTrapHsa(MI, MRI, B) :
4784           legalizeTrapHsaQueuePtr(MI, MRI, B);
4785     }
4786   }
4787 
4788   llvm_unreachable("Unknown trap handler");
4789 }
4790 
4791 bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4792     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4793   B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4794   MI.eraseFromParent();
4795   return true;
4796 }
4797 
4798 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4799     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4800   // Pass queue pointer to trap handler as input, and insert trap instruction
4801   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4802   Register LiveIn =
4803     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4804   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4805     return false;
4806 
4807   Register SGPR01(AMDGPU::SGPR0_SGPR1);
4808   B.buildCopy(SGPR01, LiveIn);
4809   B.buildInstr(AMDGPU::S_TRAP)
4810       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
4811       .addReg(SGPR01, RegState::Implicit);
4812 
4813   MI.eraseFromParent();
4814   return true;
4815 }
4816 
4817 bool AMDGPULegalizerInfo::legalizeTrapHsa(
4818     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4819   B.buildInstr(AMDGPU::S_TRAP)
4820       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
4821   MI.eraseFromParent();
4822   return true;
4823 }
4824 
4825 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4826     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4827   // If this is a non-HSA path or the trap handler is disabled, report a
4828   // warning accordingly.
4829   if (!ST.isTrapHandlerEnabled() ||
4830       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
4831     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4832                                      "debugtrap handler not supported",
4833                                      MI.getDebugLoc(), DS_Warning);
4834     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4835     Ctx.diagnose(NoTrap);
4836   } else {
4837     // Insert debug-trap instruction
4838     B.buildInstr(AMDGPU::S_TRAP)
4839         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
4840   }
4841 
4842   MI.eraseFromParent();
4843   return true;
4844 }
4845 
4846 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4847                                                MachineIRBuilder &B) const {
4848   MachineRegisterInfo &MRI = *B.getMRI();
4849   const LLT S16 = LLT::scalar(16);
4850   const LLT S32 = LLT::scalar(32);
4851 
4852   Register DstReg = MI.getOperand(0).getReg();
4853   Register NodePtr = MI.getOperand(2).getReg();
4854   Register RayExtent = MI.getOperand(3).getReg();
4855   Register RayOrigin = MI.getOperand(4).getReg();
4856   Register RayDir = MI.getOperand(5).getReg();
4857   Register RayInvDir = MI.getOperand(6).getReg();
4858   Register TDescr = MI.getOperand(7).getReg();
4859 
4860   if (!ST.hasGFX10_AEncoding()) {
4861     DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
4862                                         "intrinsic not supported on subtarget",
4863                                         MI.getDebugLoc());
4864     B.getMF().getFunction().getContext().diagnose(BadIntrin);
4865     return false;
4866   }
4867 
4868   const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4869   const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
4870   const unsigned NumVDataDwords = 4;
4871   const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
4872   const bool UseNSA =
4873       ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize();
4874   const unsigned BaseOpcodes[2][2] = {
4875       {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
4876       {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
4877        AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
4878   int Opcode;
4879   if (UseNSA) {
4880     Opcode =
4881         AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA,
4882                               NumVDataDwords, NumVAddrDwords);
4883   } else {
4884     Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
4885                                    AMDGPU::MIMGEncGfx10Default, NumVDataDwords,
4886                                    PowerOf2Ceil(NumVAddrDwords));
4887   }
4888   assert(Opcode != -1);
4889 
4890   SmallVector<Register, 12> Ops;
4891   if (Is64) {
4892     auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4893     Ops.push_back(Unmerge.getReg(0));
4894     Ops.push_back(Unmerge.getReg(1));
4895   } else {
4896     Ops.push_back(NodePtr);
4897   }
4898   Ops.push_back(RayExtent);
4899 
4900   auto packLanes = [&Ops, &S32, &B](Register Src) {
4901     auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
4902     Ops.push_back(Unmerge.getReg(0));
4903     Ops.push_back(Unmerge.getReg(1));
4904     Ops.push_back(Unmerge.getReg(2));
4905   };
4906 
4907   packLanes(RayOrigin);
4908   if (IsA16) {
4909     auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
4910     auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
4911     Register R1 = MRI.createGenericVirtualRegister(S32);
4912     Register R2 = MRI.createGenericVirtualRegister(S32);
4913     Register R3 = MRI.createGenericVirtualRegister(S32);
4914     B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4915     B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4916     B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4917     Ops.push_back(R1);
4918     Ops.push_back(R2);
4919     Ops.push_back(R3);
4920   } else {
4921     packLanes(RayDir);
4922     packLanes(RayInvDir);
4923   }
4924 
4925   if (!UseNSA) {
4926     // Build a single vector containing all the operands prepared so far.
4927     LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
4928     Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);
4929     Ops.clear();
4930     Ops.push_back(MergedOps);
4931   }
4932 
4933   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4934     .addDef(DstReg)
4935     .addImm(Opcode);
4936 
4937   for (Register R : Ops) {
4938     MIB.addUse(R);
4939   }
4940 
4941   MIB.addUse(TDescr)
4942      .addImm(IsA16 ? 1 : 0)
4943      .cloneMemRefs(MI);
4944 
4945   MI.eraseFromParent();
4946   return true;
4947 }
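// Illustrative breakdown (hand-written) of the address sizing above: with a
// 64-bit node pointer and a16 rays, the address takes 9 dwords: 2 for the
// node pointer, 1 for the extent, 3 for the origin, and 3 holding the packed
// direction/inverse-direction halves. With NSA those remain 9 separate
// registers; without NSA they are merged into a single vector operand and
// the opcode is selected for PowerOf2Ceil(9) = 16 address dwords.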
4948 
4949 static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C) {
4950   B.buildConstant(MI.getOperand(0).getReg(), C);
4951   MI.eraseFromParent();
4952   return true;
4953 }
4954 
4955 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4956                                             MachineInstr &MI) const {
4957   MachineIRBuilder &B = Helper.MIRBuilder;
4958   MachineRegisterInfo &MRI = *B.getMRI();
4959 
4960   // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
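  // Illustrative sketch (hand-written, register names made up) for the
  // amdgcn_if case: the G_BRCOND consuming the intrinsic's boolean result is
  // erased and replaced with
  //
  //   %exec.mask = SI_IF %cond, %bb.<uncond-target>
  //
  // followed by a branch (retargeted or rebuilt) to the original conditional
  // target; both %exec.mask and %cond are constrained to the wave mask
  // register class.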
4961   auto IntrID = MI.getIntrinsicID();
4962   switch (IntrID) {
4963   case Intrinsic::amdgcn_if:
4964   case Intrinsic::amdgcn_else: {
4965     MachineInstr *Br = nullptr;
4966     MachineBasicBlock *UncondBrTarget = nullptr;
4967     bool Negated = false;
4968     if (MachineInstr *BrCond =
4969             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4970       const SIRegisterInfo *TRI
4971         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4972 
4973       Register Def = MI.getOperand(1).getReg();
4974       Register Use = MI.getOperand(3).getReg();
4975 
4976       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4977 
4978       if (Negated)
4979         std::swap(CondBrTarget, UncondBrTarget);
4980 
4981       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4982       if (IntrID == Intrinsic::amdgcn_if) {
4983         B.buildInstr(AMDGPU::SI_IF)
4984           .addDef(Def)
4985           .addUse(Use)
4986           .addMBB(UncondBrTarget);
4987       } else {
4988         B.buildInstr(AMDGPU::SI_ELSE)
4989             .addDef(Def)
4990             .addUse(Use)
4991             .addMBB(UncondBrTarget);
4992       }
4993 
4994       if (Br) {
4995         Br->getOperand(0).setMBB(CondBrTarget);
4996       } else {
4997         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4998         // since we're swapping branch targets it needs to be reinserted.
4999         // FIXME: IRTranslator should probably not do this
5000         B.buildBr(*CondBrTarget);
5001       }
5002 
5003       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
5004       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
5005       MI.eraseFromParent();
5006       BrCond->eraseFromParent();
5007       return true;
5008     }
5009 
5010     return false;
5011   }
5012   case Intrinsic::amdgcn_loop: {
5013     MachineInstr *Br = nullptr;
5014     MachineBasicBlock *UncondBrTarget = nullptr;
5015     bool Negated = false;
5016     if (MachineInstr *BrCond =
5017             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
5018       const SIRegisterInfo *TRI
5019         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
5020 
5021       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
5022       Register Reg = MI.getOperand(2).getReg();
5023 
5024       if (Negated)
5025         std::swap(CondBrTarget, UncondBrTarget);
5026 
5027       B.setInsertPt(B.getMBB(), BrCond->getIterator());
5028       B.buildInstr(AMDGPU::SI_LOOP)
5029         .addUse(Reg)
5030         .addMBB(UncondBrTarget);
5031 
5032       if (Br)
5033         Br->getOperand(0).setMBB(CondBrTarget);
5034       else
5035         B.buildBr(*CondBrTarget);
5036 
5037       MI.eraseFromParent();
5038       BrCond->eraseFromParent();
5039       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
5040       return true;
5041     }
5042 
5043     return false;
5044   }
5045   case Intrinsic::amdgcn_kernarg_segment_ptr:
5046     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
5047       // This only makes sense to call in a kernel, so just lower to null.
5048       B.buildConstant(MI.getOperand(0).getReg(), 0);
5049       MI.eraseFromParent();
5050       return true;
5051     }
5052 
5053     return legalizePreloadedArgIntrin(
5054       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
5055   case Intrinsic::amdgcn_implicitarg_ptr:
5056     return legalizeImplicitArgPtr(MI, MRI, B);
5057   case Intrinsic::amdgcn_workitem_id_x:
5058     if (ST.getMaxWorkitemID(B.getMF().getFunction(), 0) == 0)
5059       return replaceWithConstant(B, MI, 0);
5060     return legalizePreloadedArgIntrin(MI, MRI, B,
5061                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
5062   case Intrinsic::amdgcn_workitem_id_y:
5063     if (ST.getMaxWorkitemID(B.getMF().getFunction(), 1) == 0)
5064       return replaceWithConstant(B, MI, 0);
5065 
5066     return legalizePreloadedArgIntrin(MI, MRI, B,
5067                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
5068   case Intrinsic::amdgcn_workitem_id_z:
5069     if (ST.getMaxWorkitemID(B.getMF().getFunction(), 2) == 0)
5070       return replaceWithConstant(B, MI, 0);
5071 
5072     return legalizePreloadedArgIntrin(MI, MRI, B,
5073                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
5074   case Intrinsic::amdgcn_workgroup_id_x:
5075     return legalizePreloadedArgIntrin(MI, MRI, B,
5076                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
5077   case Intrinsic::amdgcn_workgroup_id_y:
5078     return legalizePreloadedArgIntrin(MI, MRI, B,
5079                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
5080   case Intrinsic::amdgcn_workgroup_id_z:
5081     return legalizePreloadedArgIntrin(MI, MRI, B,
5082                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
5083   case Intrinsic::amdgcn_dispatch_ptr:
5084     return legalizePreloadedArgIntrin(MI, MRI, B,
5085                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
5086   case Intrinsic::amdgcn_queue_ptr:
5087     return legalizePreloadedArgIntrin(MI, MRI, B,
5088                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
5089   case Intrinsic::amdgcn_implicit_buffer_ptr:
5090     return legalizePreloadedArgIntrin(
5091       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
5092   case Intrinsic::amdgcn_dispatch_id:
5093     return legalizePreloadedArgIntrin(MI, MRI, B,
5094                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
5095   case Intrinsic::amdgcn_fdiv_fast:
5096     return legalizeFDIVFastIntrin(MI, MRI, B);
5097   case Intrinsic::amdgcn_is_shared:
5098     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
5099   case Intrinsic::amdgcn_is_private:
5100     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
5101   case Intrinsic::amdgcn_wavefrontsize: {
5102     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
5103     MI.eraseFromParent();
5104     return true;
5105   }
5106   case Intrinsic::amdgcn_s_buffer_load:
5107     return legalizeSBufferLoad(Helper, MI);
5108   case Intrinsic::amdgcn_raw_buffer_store:
5109   case Intrinsic::amdgcn_struct_buffer_store:
5110     return legalizeBufferStore(MI, MRI, B, false, false);
5111   case Intrinsic::amdgcn_raw_buffer_store_format:
5112   case Intrinsic::amdgcn_struct_buffer_store_format:
5113     return legalizeBufferStore(MI, MRI, B, false, true);
5114   case Intrinsic::amdgcn_raw_tbuffer_store:
5115   case Intrinsic::amdgcn_struct_tbuffer_store:
5116     return legalizeBufferStore(MI, MRI, B, true, true);
5117   case Intrinsic::amdgcn_raw_buffer_load:
5118   case Intrinsic::amdgcn_struct_buffer_load:
5119     return legalizeBufferLoad(MI, MRI, B, false, false);
5120   case Intrinsic::amdgcn_raw_buffer_load_format:
5121   case Intrinsic::amdgcn_struct_buffer_load_format:
5122     return legalizeBufferLoad(MI, MRI, B, true, false);
5123   case Intrinsic::amdgcn_raw_tbuffer_load:
5124   case Intrinsic::amdgcn_struct_tbuffer_load:
5125     return legalizeBufferLoad(MI, MRI, B, true, true);
5126   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5127   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5128   case Intrinsic::amdgcn_raw_buffer_atomic_add:
5129   case Intrinsic::amdgcn_struct_buffer_atomic_add:
5130   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5131   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5132   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5133   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5134   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5135   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5136   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5137   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5138   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5139   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5140   case Intrinsic::amdgcn_raw_buffer_atomic_and:
5141   case Intrinsic::amdgcn_struct_buffer_atomic_and:
5142   case Intrinsic::amdgcn_raw_buffer_atomic_or:
5143   case Intrinsic::amdgcn_struct_buffer_atomic_or:
5144   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5145   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
5146   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
5147   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
5148   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
5149   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
5150   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
5151   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
5152   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
5153   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
5154   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
5155   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
5156     return legalizeBufferAtomic(MI, B, IntrID);
5157   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
5158   case Intrinsic::amdgcn_struct_buffer_atomic_fadd: {
5159     Register DstReg = MI.getOperand(0).getReg();
5160     if (!MRI.use_empty(DstReg) && !ST.hasGFX90AInsts()) {
5161       Function &F = B.getMF().getFunction();
5162       DiagnosticInfoUnsupported NoFpRet(
5163           F, "return versions of fp atomics not supported", B.getDebugLoc(),
5164           DS_Error);
5165       F.getContext().diagnose(NoFpRet);
5166       B.buildUndef(DstReg);
5167       MI.eraseFromParent();
5168       return true;
5169     }
5170 
5171     return legalizeBufferAtomic(MI, B, IntrID);
5172   }
5173   case Intrinsic::amdgcn_atomic_inc:
5174     return legalizeAtomicIncDec(MI, B, true);
5175   case Intrinsic::amdgcn_atomic_dec:
5176     return legalizeAtomicIncDec(MI, B, false);
5177   case Intrinsic::trap:
5178     return legalizeTrapIntrinsic(MI, MRI, B);
5179   case Intrinsic::debugtrap:
5180     return legalizeDebugTrapIntrinsic(MI, MRI, B);
5181   case Intrinsic::amdgcn_rsq_clamp:
5182     return legalizeRsqClampIntrinsic(MI, MRI, B);
5183   case Intrinsic::amdgcn_ds_fadd:
5184   case Intrinsic::amdgcn_ds_fmin:
5185   case Intrinsic::amdgcn_ds_fmax:
5186     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
5187   case Intrinsic::amdgcn_image_bvh_intersect_ray:
5188     return legalizeBVHIntrinsic(MI, B);
5189   default: {
5190     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5191             AMDGPU::getImageDimIntrinsicInfo(IntrID))
5192       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
5193     return true;
5194   }
5195   }
5196 
5197   return true;
5198 }
5199