xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (revision 0d8fe2373503aeac48492f28073049a8bfa4feb5)
1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUInstrInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "llvm/ADT/ScopeExit.h"
22 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
25 #include "llvm/IR/DiagnosticInfo.h"
26 #include "llvm/IR/IntrinsicsAMDGPU.h"
27 
28 #define DEBUG_TYPE "amdgpu-legalinfo"
29 
30 using namespace llvm;
31 using namespace LegalizeActions;
32 using namespace LegalizeMutations;
33 using namespace LegalityPredicates;
34 using namespace MIPatternMatch;
35 
36 // Hack until load/store selection patterns support any tuple of legal types.
37 static cl::opt<bool> EnableNewLegality(
38   "amdgpu-global-isel-new-legality",
39   cl::desc("Use GlobalISel desired legality, rather than try to use "
40            "rules compatible with selection patterns"),
41   cl::init(false),
42   cl::ReallyHidden);
43 
44 static constexpr unsigned MaxRegisterSize = 1024;
45 
46 // Round the number of elements up to the next power of two.
47 static LLT getPow2VectorType(LLT Ty) {
48   unsigned NElts = Ty.getNumElements();
49   unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
50   return Ty.changeNumElements(Pow2NElts);
51 }
52 
53 // Round the scalar size up to the next power of two bits.
54 static LLT getPow2ScalarType(LLT Ty) {
55   unsigned Bits = Ty.getSizeInBits();
56   unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
57   return LLT::scalar(Pow2Bits);
58 }
59 
60 /// \returns true if this is an odd-sized vector which should be widened by
61 /// adding an additional element. This is mostly to handle <3 x s16> -> <4 x s16>.
62 /// This excludes s1 vectors, which should always be scalarized.
63 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
64   return [=](const LegalityQuery &Query) {
65     const LLT Ty = Query.Types[TypeIdx];
66     if (!Ty.isVector())
67       return false;
68 
69     const LLT EltTy = Ty.getElementType();
70     const unsigned EltSize = EltTy.getSizeInBits();
71     return Ty.getNumElements() % 2 != 0 &&
72            EltSize > 1 && EltSize < 32 &&
73            Ty.getSizeInBits() % 32 != 0;
74   };
75 }
76 
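// Matches any type whose total size in bits is a multiple of 32.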
77 static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
78   return [=](const LegalityQuery &Query) {
79     const LLT Ty = Query.Types[TypeIdx];
80     return Ty.getSizeInBits() % 32 == 0;
81   };
82 }
83 
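// Matches vectors of 16-bit elements with more than two elements, i.e. wider
// than v2s16.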
84 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
85   return [=](const LegalityQuery &Query) {
86     const LLT Ty = Query.Types[TypeIdx];
87     const LLT EltTy = Ty.getScalarType();
88     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
89   };
90 }
91 
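// Mutation that widens the vector at TypeIdx by one element, e.g.
// <3 x s16> -> <4 x s16>.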
92 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
93   return [=](const LegalityQuery &Query) {
94     const LLT Ty = Query.Types[TypeIdx];
95     const LLT EltTy = Ty.getElementType();
96     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
97   };
98 }
99 
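// Mutation that reduces the element count at TypeIdx so that each resulting
// piece is at most 64 bits wide.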
100 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
101   return [=](const LegalityQuery &Query) {
102     const LLT Ty = Query.Types[TypeIdx];
103     const LLT EltTy = Ty.getElementType();
104     unsigned Size = Ty.getSizeInBits();
105     unsigned Pieces = (Size + 63) / 64;
106     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
107     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
108   };
109 }
110 
111 // Increase the number of vector elements so the total size reaches the next
112 // multiple of 32 bits.
113 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
114   return [=](const LegalityQuery &Query) {
115     const LLT Ty = Query.Types[TypeIdx];
116 
117     const LLT EltTy = Ty.getElementType();
118     const int Size = Ty.getSizeInBits();
119     const int EltSize = EltTy.getSizeInBits();
120     const int NextMul32 = (Size + 31) / 32;
121 
122     assert(EltSize < 32);
123 
124     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
125     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
126   };
127 }
128 
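// Pick an equivalently sized type that maps onto whole registers: a single
// scalar for sizes up to 32 bits, otherwise a scalar or vector of 32-bit
// elements.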
129 static LLT getBitcastRegisterType(const LLT Ty) {
130   const unsigned Size = Ty.getSizeInBits();
131 
133   if (Size <= 32) {
134     // <2 x s8> -> s16
135     // <4 x s8> -> s32
136     return LLT::scalar(Size);
137   }
138 
139   return LLT::scalarOrVector(Size / 32, 32);
140 }
141 
142 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
143   return [=](const LegalityQuery &Query) {
144     const LLT Ty = Query.Types[TypeIdx];
145     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
146   };
147 }
148 
149 static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
150   return [=](const LegalityQuery &Query) {
151     const LLT Ty = Query.Types[TypeIdx];
152     unsigned Size = Ty.getSizeInBits();
153     assert(Size % 32 == 0);
154     return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
155   };
156 }
157 
158 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
159   return [=](const LegalityQuery &Query) {
160     const LLT QueryTy = Query.Types[TypeIdx];
161     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
162   };
163 }
164 
165 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
166   return [=](const LegalityQuery &Query) {
167     const LLT QueryTy = Query.Types[TypeIdx];
168     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
169   };
170 }
171 
172 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
173   return [=](const LegalityQuery &Query) {
174     const LLT QueryTy = Query.Types[TypeIdx];
175     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
176   };
177 }
178 
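// True for any total size that is a multiple of 32 bits, up to the maximum
// register size.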
179 static bool isRegisterSize(unsigned Size) {
180   return Size % 32 == 0 && Size <= MaxRegisterSize;
181 }
182 
183 static bool isRegisterVectorElementType(LLT EltTy) {
184   const int EltSize = EltTy.getSizeInBits();
185   return EltSize == 16 || EltSize % 32 == 0;
186 }
187 
188 static bool isRegisterVectorType(LLT Ty) {
189   const int EltSize = Ty.getElementType().getSizeInBits();
190   return EltSize == 32 || EltSize == 64 ||
191          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
192          EltSize == 128 || EltSize == 256;
193 }
194 
195 static bool isRegisterType(LLT Ty) {
196   if (!isRegisterSize(Ty.getSizeInBits()))
197     return false;
198 
199   if (Ty.isVector())
200     return isRegisterVectorType(Ty);
201 
202   return true;
203 }
204 
205 // Any combination of 32 or 64-bit elements up to the maximum register size,
206 // and multiples of v2s16.
207 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
208   return [=](const LegalityQuery &Query) {
209     return isRegisterType(Query.Types[TypeIdx]);
210   };
211 }
212 
213 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
214   return [=](const LegalityQuery &Query) {
215     const LLT QueryTy = Query.Types[TypeIdx];
216     if (!QueryTy.isVector())
217       return false;
218     const LLT EltTy = QueryTy.getElementType();
219     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
220   };
221 }
222 
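// Matches truncating stores of a scalar wider than 32 bits, where the memory
// size is smaller than the stored value.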
223 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
224   return [=](const LegalityQuery &Query) {
225     const LLT Ty = Query.Types[TypeIdx];
226     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
227            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
228   };
229 }
230 
231 // TODO: Should load to s16 be legal? Most loads extend to 32 bits, but we
232 // handle some operations by just promoting the register during
233 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
234 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
235                                     bool IsLoad) {
236   switch (AS) {
237   case AMDGPUAS::PRIVATE_ADDRESS:
238     // FIXME: Private element size.
239     return ST.enableFlatScratch() ? 128 : 32;
240   case AMDGPUAS::LOCAL_ADDRESS:
241     return ST.useDS128() ? 128 : 64;
242   case AMDGPUAS::GLOBAL_ADDRESS:
243   case AMDGPUAS::CONSTANT_ADDRESS:
244   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
245     // Treat constant and global as identical. SMRD loads are sometimes usable for
246     // global loads (ideally constant address space should be eliminated)
247     // depending on the context. Legality cannot be context dependent, but
248     // RegBankSelect can split the load as necessary depending on the pointer
249     // register bank/uniformity and if the memory is invariant or not written in a
250     // kernel.
251     return IsLoad ? 512 : 128;
252   default:
253     // Flat addresses may contextually need to be split to 32-bit parts if they
254     // may alias scratch depending on the subtarget.
255     return 128;
256   }
257 }
258 
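// Return true if the queried combination of register type, memory size,
// alignment and address space is one the target can handle directly.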
259 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
260                                  const LegalityQuery &Query,
261                                  unsigned Opcode) {
262   const LLT Ty = Query.Types[0];
263 
264   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
265   const bool IsLoad = Opcode != AMDGPU::G_STORE;
266 
267   unsigned RegSize = Ty.getSizeInBits();
268   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
269   unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
270   unsigned AS = Query.Types[1].getAddressSpace();
271 
272   // All of these need to be custom lowered to cast the pointer operand.
273   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
274     return false;
275 
276   // TODO: We should be able to widen loads if the alignment is high enough, but
277   // we also need to modify the memory access size.
278 #if 0
279   // Accept widening loads based on alignment.
280   if (IsLoad && MemSize < Size)
281     MemSize = std::max(MemSize, Align);
282 #endif
283 
284   // Only 1-byte and 2-byte to 32-bit extloads are valid.
285   if (MemSize != RegSize && RegSize != 32)
286     return false;
287 
288   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
289     return false;
290 
291   switch (MemSize) {
292   case 8:
293   case 16:
294   case 32:
295   case 64:
296   case 128:
297     break;
298   case 96:
299     if (!ST.hasDwordx3LoadStores())
300       return false;
301     break;
302   case 256:
303   case 512:
304     // These may contextually need to be broken down.
305     break;
306   default:
307     return false;
308   }
309 
310   assert(RegSize >= MemSize);
311 
312   if (AlignBits < MemSize) {
313     const SITargetLowering *TLI = ST.getTargetLowering();
314     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
315                                                  Align(AlignBits / 8)))
316       return false;
317   }
318 
319   return true;
320 }
321 
322 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
323 // work around this. Eventually it should ignore the type for loads and only
324 // care about the size. Return true in cases where we will work around this for
325 // now by bitcasting.
326 static bool loadStoreBitcastWorkaround(const LLT Ty) {
327   if (EnableNewLegality)
328     return false;
329 
330   const unsigned Size = Ty.getSizeInBits();
331   if (Size <= 64)
332     return false;
333   if (!Ty.isVector())
334     return true;
335 
336   LLT EltTy = Ty.getElementType();
337   if (EltTy.isPointer())
338     return true;
339 
340   unsigned EltSize = EltTy.getSizeInBits();
341   return EltSize != 32 && EltSize != 64;
342 }
343 
344 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
345                              unsigned Opcode) {
346   const LLT Ty = Query.Types[0];
347   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
348          !loadStoreBitcastWorkaround(Ty);
349 }
350 
351 /// Return true if a load or store of the type should be lowered with a bitcast
352 /// to a different type.
353 static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
354                                        const unsigned MemSizeInBits) {
355   const unsigned Size = Ty.getSizeInBits();
356   if (Size != MemSizeInBits)
357     return Size <= 32 && Ty.isVector();
358 
359   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
360     return true;
361   return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
362          !isRegisterVectorElementType(Ty.getElementType());
363 }
364 
365 /// Return true if we should legalize a load by widening an odd-sized memory
366 /// access up to the alignment. Note that in this case the memory access itself
367 /// changes, not the size of the result register.
368 static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
369                             unsigned AlignInBits, unsigned AddrSpace,
370                             unsigned Opcode) {
371   // We don't want to widen cases that are naturally legal.
372   if (isPowerOf2_32(SizeInBits))
373     return false;
374 
375   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
376   // end up widening these for a scalar load during RegBankSelect, since there
377   // aren't 96-bit scalar loads.
378   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
379     return false;
380 
381   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
382     return false;
383 
384   // A load is known dereferenceable up to the alignment, so it's legal to widen
385   // to it.
386   //
387   // TODO: Could check dereferenceable for less aligned cases.
388   unsigned RoundedSize = NextPowerOf2(SizeInBits);
389   if (AlignInBits < RoundedSize)
390     return false;
391 
392   // Do not widen if it would introduce a slow unaligned load.
393   const SITargetLowering *TLI = ST.getTargetLowering();
394   bool Fast = false;
395   return TLI->allowsMisalignedMemoryAccessesImpl(
396              RoundedSize, AddrSpace, Align(AlignInBits / 8),
397              MachineMemOperand::MOLoad, &Fast) &&
398          Fast;
399 }
400 
401 static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
402                             unsigned Opcode) {
403   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
404     return false;
405 
406   return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
407                          Query.MMODescrs[0].AlignInBits,
408                          Query.Types[1].getAddressSpace(), Opcode);
409 }
410 
411 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
412                                          const GCNTargetMachine &TM)
413   :  ST(ST_) {
414   using namespace TargetOpcode;
415 
416   auto GetAddrSpacePtr = [&TM](unsigned AS) {
417     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
418   };
419 
420   const LLT S1 = LLT::scalar(1);
421   const LLT S8 = LLT::scalar(8);
422   const LLT S16 = LLT::scalar(16);
423   const LLT S32 = LLT::scalar(32);
424   const LLT S64 = LLT::scalar(64);
425   const LLT S128 = LLT::scalar(128);
426   const LLT S256 = LLT::scalar(256);
427   const LLT S512 = LLT::scalar(512);
428   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
429 
430   const LLT V2S8 = LLT::vector(2, 8);
431   const LLT V2S16 = LLT::vector(2, 16);
432   const LLT V4S16 = LLT::vector(4, 16);
433 
434   const LLT V2S32 = LLT::vector(2, 32);
435   const LLT V3S32 = LLT::vector(3, 32);
436   const LLT V4S32 = LLT::vector(4, 32);
437   const LLT V5S32 = LLT::vector(5, 32);
438   const LLT V6S32 = LLT::vector(6, 32);
439   const LLT V7S32 = LLT::vector(7, 32);
440   const LLT V8S32 = LLT::vector(8, 32);
441   const LLT V9S32 = LLT::vector(9, 32);
442   const LLT V10S32 = LLT::vector(10, 32);
443   const LLT V11S32 = LLT::vector(11, 32);
444   const LLT V12S32 = LLT::vector(12, 32);
445   const LLT V13S32 = LLT::vector(13, 32);
446   const LLT V14S32 = LLT::vector(14, 32);
447   const LLT V15S32 = LLT::vector(15, 32);
448   const LLT V16S32 = LLT::vector(16, 32);
449   const LLT V32S32 = LLT::vector(32, 32);
450 
451   const LLT V2S64 = LLT::vector(2, 64);
452   const LLT V3S64 = LLT::vector(3, 64);
453   const LLT V4S64 = LLT::vector(4, 64);
454   const LLT V5S64 = LLT::vector(5, 64);
455   const LLT V6S64 = LLT::vector(6, 64);
456   const LLT V7S64 = LLT::vector(7, 64);
457   const LLT V8S64 = LLT::vector(8, 64);
458   const LLT V16S64 = LLT::vector(16, 64);
459 
460   std::initializer_list<LLT> AllS32Vectors =
461     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
462      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
463   std::initializer_list<LLT> AllS64Vectors =
464     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
465 
466   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
467   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
468   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
469   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
470   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
471   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
472   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
473 
474   const LLT CodePtr = FlatPtr;
475 
476   const std::initializer_list<LLT> AddrSpaces64 = {
477     GlobalPtr, ConstantPtr, FlatPtr
478   };
479 
480   const std::initializer_list<LLT> AddrSpaces32 = {
481     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
482   };
483 
484   const std::initializer_list<LLT> FPTypesBase = {
485     S32, S64
486   };
487 
488   const std::initializer_list<LLT> FPTypes16 = {
489     S32, S64, S16
490   };
491 
492   const std::initializer_list<LLT> FPTypesPK16 = {
493     S32, S64, S16, V2S16
494   };
495 
496   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
497 
498   setAction({G_BRCOND, S1}, Legal); // VCC branches
499   setAction({G_BRCOND, S32}, Legal); // SCC branches
500 
501   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
502   // elements for v3s16
503   getActionDefinitionsBuilder(G_PHI)
504     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
505     .legalFor(AllS32Vectors)
506     .legalFor(AllS64Vectors)
507     .legalFor(AddrSpaces64)
508     .legalFor(AddrSpaces32)
509     .legalIf(isPointer(0))
510     .clampScalar(0, S16, S256)
511     .widenScalarToNextPow2(0, 32)
512     .clampMaxNumElements(0, S32, 16)
513     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
514     .scalarize(0);
515 
516   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
517     // Full set of gfx9 features.
518     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
519       .legalFor({S32, S16, V2S16})
520       .clampScalar(0, S16, S32)
521       .clampMaxNumElements(0, S16, 2)
522       .scalarize(0)
523       .widenScalarToNextPow2(0, 32);
524 
525     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
526       .legalFor({S32, S16, V2S16}) // Clamp modifier
527       .minScalarOrElt(0, S16)
528       .clampMaxNumElements(0, S16, 2)
529       .scalarize(0)
530       .widenScalarToNextPow2(0, 32)
531       .lower();
532   } else if (ST.has16BitInsts()) {
533     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
534       .legalFor({S32, S16})
535       .clampScalar(0, S16, S32)
536       .scalarize(0)
537       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
538 
539     // Technically the saturating operations require clamp bit support, but this
540     // was introduced at the same time as 16-bit operations.
541     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
542       .legalFor({S32, S16}) // Clamp modifier
543       .minScalar(0, S16)
544       .scalarize(0)
545       .widenScalarToNextPow2(0, 16)
546       .lower();
547 
548     // We're just lowering this, but it helps get a better result to try to
549     // coerce to the desired type first.
550     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
551       .minScalar(0, S16)
552       .scalarize(0)
553       .lower();
554   } else {
555     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
556       .legalFor({S32})
557       .clampScalar(0, S32, S32)
558       .scalarize(0);
559 
560     if (ST.hasIntClamp()) {
561       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
562         .legalFor({S32}) // Clamp modifier.
563         .scalarize(0)
564         .minScalarOrElt(0, S32)
565         .lower();
566     } else {
567       // Clamp bit support was added in VI, along with 16-bit operations.
568       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
569         .minScalar(0, S32)
570         .scalarize(0)
571         .lower();
572     }
573 
574     // FIXME: DAG expansion gets better results. The widening uses the smaller
575     // range values and goes for the min/max lowering directly.
576     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
577       .minScalar(0, S32)
578       .scalarize(0)
579       .lower();
580   }
581 
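  // There are no native integer division instructions; 32- and 64-bit division
  // and remainder are expanded during custom lowering.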
582   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
583     .customFor({S32, S64})
584     .clampScalar(0, S32, S64)
585     .widenScalarToNextPow2(0, 32)
586     .scalarize(0);
587 
588   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
589                    .legalFor({S32})
590                    .maxScalarOrElt(0, S32);
591 
592   if (ST.hasVOP3PInsts()) {
593     Mulh
594       .clampMaxNumElements(0, S8, 2)
595       .lowerFor({V2S8});
596   }
597 
598   Mulh
599     .scalarize(0)
600     .lower();
601 
602   // Report legal for any types we can handle anywhere. For the cases only legal
603   // on the SALU, RegBankSelect will be able to re-legalize.
604   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
605     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
606     .clampScalar(0, S32, S64)
607     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
608     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
609     .widenScalarToNextPow2(0)
610     .scalarize(0);
611 
612   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
613                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
614     .legalFor({{S32, S1}, {S32, S32}})
615     .minScalar(0, S32)
616     // TODO: .scalarize(0)
617     .lower();
618 
619   getActionDefinitionsBuilder(G_BITCAST)
620     // Don't worry about the size constraint.
621     .legalIf(all(isRegisterType(0), isRegisterType(1)))
622     .lower();
623 
624 
625   getActionDefinitionsBuilder(G_CONSTANT)
626     .legalFor({S1, S32, S64, S16, GlobalPtr,
627                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
628     .legalIf(isPointer(0))
629     .clampScalar(0, S32, S64)
630     .widenScalarToNextPow2(0);
631 
632   getActionDefinitionsBuilder(G_FCONSTANT)
633     .legalFor({S32, S64, S16})
634     .clampScalar(0, S16, S64);
635 
636   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
637       .legalIf(isRegisterType(0))
638       // s1 and s16 are special cases because they have legal operations on
639       // them, but don't really occupy registers in the normal way.
640       .legalFor({S1, S16})
641       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
642       .clampScalarOrElt(0, S32, MaxScalar)
643       .widenScalarToNextPow2(0, 32)
644       .clampMaxNumElements(0, S32, 16);
645 
646   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
647 
648   // If the amount is divergent, we have to do a wave reduction to get the
649   // maximum value, so this is expanded during RegBankSelect.
650   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
651     .legalFor({{PrivatePtr, S32}});
652 
653   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
654     .customIf(typeIsNot(0, PrivatePtr));
655 
656   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
657 
658   auto &FPOpActions = getActionDefinitionsBuilder(
659     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
660     .legalFor({S32, S64});
661   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
662     .customFor({S32, S64});
663   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
664     .customFor({S32, S64});
665 
666   if (ST.has16BitInsts()) {
667     if (ST.hasVOP3PInsts())
668       FPOpActions.legalFor({S16, V2S16});
669     else
670       FPOpActions.legalFor({S16});
671 
672     TrigActions.customFor({S16});
673     FDIVActions.customFor({S16});
674   }
675 
676   auto &MinNumMaxNum = getActionDefinitionsBuilder({
677       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
678 
679   if (ST.hasVOP3PInsts()) {
680     MinNumMaxNum.customFor(FPTypesPK16)
681       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
682       .clampMaxNumElements(0, S16, 2)
683       .clampScalar(0, S16, S64)
684       .scalarize(0);
685   } else if (ST.has16BitInsts()) {
686     MinNumMaxNum.customFor(FPTypes16)
687       .clampScalar(0, S16, S64)
688       .scalarize(0);
689   } else {
690     MinNumMaxNum.customFor(FPTypesBase)
691       .clampScalar(0, S32, S64)
692       .scalarize(0);
693   }
694 
695   if (ST.hasVOP3PInsts())
696     FPOpActions.clampMaxNumElements(0, S16, 2);
697 
698   FPOpActions
699     .scalarize(0)
700     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
701 
702   TrigActions
703     .scalarize(0)
704     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
705 
706   FDIVActions
707     .scalarize(0)
708     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
709 
710   getActionDefinitionsBuilder({G_FNEG, G_FABS})
711     .legalFor(FPTypesPK16)
712     .clampMaxNumElements(0, S16, 2)
713     .scalarize(0)
714     .clampScalar(0, S16, S64);
715 
716   if (ST.has16BitInsts()) {
717     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
718       .legalFor({S32, S64, S16})
719       .scalarize(0)
720       .clampScalar(0, S16, S64);
721   } else {
722     getActionDefinitionsBuilder(G_FSQRT)
723       .legalFor({S32, S64})
724       .scalarize(0)
725       .clampScalar(0, S32, S64);
726 
727     if (ST.hasFractBug()) {
728       getActionDefinitionsBuilder(G_FFLOOR)
729         .customFor({S64})
730         .legalFor({S32, S64})
731         .scalarize(0)
732         .clampScalar(0, S32, S64);
733     } else {
734       getActionDefinitionsBuilder(G_FFLOOR)
735         .legalFor({S32, S64})
736         .scalarize(0)
737         .clampScalar(0, S32, S64);
738     }
739   }
740 
741   getActionDefinitionsBuilder(G_FPTRUNC)
742     .legalFor({{S32, S64}, {S16, S32}})
743     .scalarize(0)
744     .lower();
745 
746   getActionDefinitionsBuilder(G_FPEXT)
747     .legalFor({{S64, S32}, {S32, S16}})
748     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
749     .scalarize(0);
750 
751   getActionDefinitionsBuilder(G_FSUB)
752       // Use actual fsub instruction
753       .legalFor({S32})
754       // Must use fadd + fneg
755       .lowerFor({S64, S16, V2S16})
756       .scalarize(0)
757       .clampScalar(0, S32, S64);
758 
759   // Whether this is legal depends on the floating point mode for the function.
760   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
761   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
762     FMad.customFor({S32, S16});
763   else if (ST.hasMadMacF32Insts())
764     FMad.customFor({S32});
765   else if (ST.hasMadF16())
766     FMad.customFor({S16});
767   FMad.scalarize(0)
768       .lower();
769 
770   auto &FRem = getActionDefinitionsBuilder(G_FREM);
771   if (ST.has16BitInsts()) {
772     FRem.customFor({S16, S32, S64});
773   } else {
774     FRem.minScalar(0, S32)
775         .customFor({S32, S64});
776   }
777   FRem.scalarize(0);
778 
779   // TODO: Do we need to clamp maximum bitwidth?
780   getActionDefinitionsBuilder(G_TRUNC)
781     .legalIf(isScalar(0))
782     .legalFor({{V2S16, V2S32}})
783     .clampMaxNumElements(0, S16, 2)
784     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
785     // situations (like an invalid implicit use), we don't want to infinite loop
786     // in the legalizer.
787     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
788     .alwaysLegal();
789 
790   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
791     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
792                {S32, S1}, {S64, S1}, {S16, S1}})
793     .scalarize(0)
794     .clampScalar(0, S32, S64)
795     .widenScalarToNextPow2(1, 32);
796 
797   // TODO: Split s1->s64 during regbankselect for VALU.
798   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
799     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
800     .lowerFor({{S32, S64}})
801     .lowerIf(typeIs(1, S1))
802     .customFor({{S64, S64}});
803   if (ST.has16BitInsts())
804     IToFP.legalFor({{S16, S16}});
805   IToFP.clampScalar(1, S32, S64)
806        .minScalar(0, S32)
807        .scalarize(0)
808        .widenScalarToNextPow2(1);
809 
810   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
811     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
812     .customFor({{S64, S64}})
813     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
814   if (ST.has16BitInsts())
815     FPToI.legalFor({{S16, S16}});
816   else
817     FPToI.minScalar(1, S32);
818 
819   FPToI.minScalar(0, S32)
820        .scalarize(0)
821        .lower();
822 
823   // Lower roundeven into G_FRINT
824   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
825     .scalarize(0)
826     .lower();
827 
828   if (ST.has16BitInsts()) {
829     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
830       .legalFor({S16, S32, S64})
831       .clampScalar(0, S16, S64)
832       .scalarize(0);
833   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
834     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
835       .legalFor({S32, S64})
836       .clampScalar(0, S32, S64)
837       .scalarize(0);
838   } else {
839     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
840       .legalFor({S32})
841       .customFor({S64})
842       .clampScalar(0, S32, S64)
843       .scalarize(0);
844   }
845 
846   getActionDefinitionsBuilder(G_PTR_ADD)
847     .legalIf(all(isPointer(0), sameSize(0, 1)))
848     .scalarize(0)
849     .scalarSameSizeAs(1, 0);
850 
851   getActionDefinitionsBuilder(G_PTRMASK)
852     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
853     .scalarSameSizeAs(1, 0)
854     .scalarize(0);
855 
856   auto &CmpBuilder =
857     getActionDefinitionsBuilder(G_ICMP)
858     // The compare output type differs based on the register bank of the output,
859     // so make both s1 and s32 legal.
860     //
861     // Scalar compares producing output in scc will be promoted to s32, as that
862     // is the allocatable register type that will be needed for the copy from
863     // scc. This will be promoted during RegBankSelect, and we assume something
864     // before that won't try to use s32 result types.
865     //
866     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
867     // bank.
868     .legalForCartesianProduct(
869       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
870     .legalForCartesianProduct(
871       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
872   if (ST.has16BitInsts()) {
873     CmpBuilder.legalFor({{S1, S16}});
874   }
875 
876   CmpBuilder
877     .widenScalarToNextPow2(1)
878     .clampScalar(1, S32, S64)
879     .scalarize(0)
880     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
881 
882   getActionDefinitionsBuilder(G_FCMP)
883     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
884     .widenScalarToNextPow2(1)
885     .clampScalar(1, S32, S64)
886     .scalarize(0);
887 
888   // FIXME: fpow has a selection pattern that should move to custom lowering.
889   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
890   if (ST.has16BitInsts())
891     Exp2Ops.legalFor({S32, S16});
892   else
893     Exp2Ops.legalFor({S32});
894   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
895   Exp2Ops.scalarize(0);
896 
897   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
898   if (ST.has16BitInsts())
899     ExpOps.customFor({{S32}, {S16}});
900   else
901     ExpOps.customFor({S32});
902   ExpOps.clampScalar(0, MinScalarFPTy, S32)
903         .scalarize(0);
904 
905   getActionDefinitionsBuilder(G_FPOWI)
906     .clampScalar(0, MinScalarFPTy, S32)
907     .lower();
908 
909   // The 64-bit versions produce 32-bit results, but only on the SALU.
910   getActionDefinitionsBuilder(G_CTPOP)
911     .legalFor({{S32, S32}, {S32, S64}})
912     .clampScalar(0, S32, S32)
913     .clampScalar(1, S32, S64)
914     .scalarize(0)
915     .widenScalarToNextPow2(0, 32)
916     .widenScalarToNextPow2(1, 32);
917 
918   // The hardware instructions return a different result on 0 than the generic
919   // instructions expect. The hardware produces -1, but these produce the
920   // bitwidth.
921   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
922     .scalarize(0)
923     .clampScalar(0, S32, S32)
924     .clampScalar(1, S32, S64)
925     .widenScalarToNextPow2(0, 32)
926     .widenScalarToNextPow2(1, 32)
927     .lower();
928 
929   // The 64-bit versions produce 32-bit results, but only on the SALU.
930   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
931     .legalFor({{S32, S32}, {S32, S64}})
932     .clampScalar(0, S32, S32)
933     .clampScalar(1, S32, S64)
934     .scalarize(0)
935     .widenScalarToNextPow2(0, 32)
936     .widenScalarToNextPow2(1, 32);
937 
938   getActionDefinitionsBuilder(G_BITREVERSE)
939     .legalFor({S32})
940     .clampScalar(0, S32, S32)
941     .scalarize(0);
942 
943   if (ST.has16BitInsts()) {
944     getActionDefinitionsBuilder(G_BSWAP)
945       .legalFor({S16, S32, V2S16})
946       .clampMaxNumElements(0, S16, 2)
947       // FIXME: Fixing non-power-of-2 before clamp is a workaround for
948       // narrowScalar limitation.
949       .widenScalarToNextPow2(0)
950       .clampScalar(0, S16, S32)
951       .scalarize(0);
952 
953     if (ST.hasVOP3PInsts()) {
954       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
955         .legalFor({S32, S16, V2S16})
956         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
957         .clampMaxNumElements(0, S16, 2)
958         .minScalar(0, S16)
959         .widenScalarToNextPow2(0)
960         .scalarize(0)
961         .lower();
962     } else {
963       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
964         .legalFor({S32, S16})
965         .widenScalarToNextPow2(0)
966         .minScalar(0, S16)
967         .scalarize(0)
968         .lower();
969     }
970   } else {
971     // TODO: Should have same legality without v_perm_b32
972     getActionDefinitionsBuilder(G_BSWAP)
973       .legalFor({S32})
974       .lowerIf(scalarNarrowerThan(0, 32))
975       // FIXME: Fixing non-power-of-2 before clamp is a workaround for
976       // narrowScalar limitation.
977       .widenScalarToNextPow2(0)
978       .maxScalar(0, S32)
979       .scalarize(0)
980       .lower();
981 
982     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
983       .legalFor({S32})
984       .minScalar(0, S32)
985       .widenScalarToNextPow2(0)
986       .scalarize(0)
987       .lower();
988   }
989 
990   getActionDefinitionsBuilder(G_INTTOPTR)
991     // List the common cases
992     .legalForCartesianProduct(AddrSpaces64, {S64})
993     .legalForCartesianProduct(AddrSpaces32, {S32})
994     .scalarize(0)
995     // Accept any address space as long as the size matches
996     .legalIf(sameSize(0, 1))
997     .widenScalarIf(smallerThan(1, 0),
998       [](const LegalityQuery &Query) {
999         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1000       })
1001     .narrowScalarIf(largerThan(1, 0),
1002       [](const LegalityQuery &Query) {
1003         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1004       });
1005 
1006   getActionDefinitionsBuilder(G_PTRTOINT)
1007     // List the common cases
1008     .legalForCartesianProduct(AddrSpaces64, {S64})
1009     .legalForCartesianProduct(AddrSpaces32, {S32})
1010     .scalarize(0)
1011     // Accept any address space as long as the size matches
1012     .legalIf(sameSize(0, 1))
1013     .widenScalarIf(smallerThan(0, 1),
1014       [](const LegalityQuery &Query) {
1015         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1016       })
1017     .narrowScalarIf(
1018       largerThan(0, 1),
1019       [](const LegalityQuery &Query) {
1020         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1021       });
1022 
1023   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1024     .scalarize(0)
1025     .custom();
1026 
1027   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1028                                     bool IsLoad) -> bool {
1029     const LLT DstTy = Query.Types[0];
1030 
1031     // Split vector extloads.
1032     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1033     unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
1034 
1035     if (MemSize < DstTy.getSizeInBits())
1036       MemSize = std::max(MemSize, AlignBits);
1037 
1038     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1039       return true;
1040 
1041     const LLT PtrTy = Query.Types[1];
1042     unsigned AS = PtrTy.getAddressSpace();
1043     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
1044       return true;
1045 
1046     // Catch weird sized loads that don't evenly divide into the access sizes
1047     // TODO: May be able to widen depending on alignment etc.
1048     unsigned NumRegs = (MemSize + 31) / 32;
1049     if (NumRegs == 3) {
1050       if (!ST.hasDwordx3LoadStores())
1051         return true;
1052     } else {
1053       // If the alignment allows, these should have been widened.
1054       if (!isPowerOf2_32(NumRegs))
1055         return true;
1056     }
1057 
1058     if (AlignBits < MemSize) {
1059       const SITargetLowering *TLI = ST.getTargetLowering();
1060       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
1061                                                       Align(AlignBits / 8));
1062     }
1063 
1064     return false;
1065   };
1066 
1067   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1068   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1069   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1070 
1071   // TODO: Refine based on subtargets which support unaligned access or 128-bit
1072   // LDS
1073   // TODO: Unsupported flat for SI.
1074 
1075   for (unsigned Op : {G_LOAD, G_STORE}) {
1076     const bool IsStore = Op == G_STORE;
1077 
1078     auto &Actions = getActionDefinitionsBuilder(Op);
1079     // Explicitly list some common cases.
1080     // TODO: Does this help compile time at all?
1081     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
1082                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
1083                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
1084                                       {S64, GlobalPtr, 64, GlobalAlign32},
1085                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
1086                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
1087                                       {S32, GlobalPtr, 8, GlobalAlign8},
1088                                       {S32, GlobalPtr, 16, GlobalAlign16},
1089 
1090                                       {S32, LocalPtr, 32, 32},
1091                                       {S64, LocalPtr, 64, 32},
1092                                       {V2S32, LocalPtr, 64, 32},
1093                                       {S32, LocalPtr, 8, 8},
1094                                       {S32, LocalPtr, 16, 16},
1095                                       {V2S16, LocalPtr, 32, 32},
1096 
1097                                       {S32, PrivatePtr, 32, 32},
1098                                       {S32, PrivatePtr, 8, 8},
1099                                       {S32, PrivatePtr, 16, 16},
1100                                       {V2S16, PrivatePtr, 32, 32},
1101 
1102                                       {S32, ConstantPtr, 32, GlobalAlign32},
1103                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
1104                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
1105                                       {S64, ConstantPtr, 64, GlobalAlign32},
1106                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
1107     Actions.legalIf(
1108       [=](const LegalityQuery &Query) -> bool {
1109         return isLoadStoreLegal(ST, Query, Op);
1110       });
1111 
1112     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1113   // 64 bits.
1114     //
1115     // TODO: Should generalize bitcast action into coerce, which will also cover
1116     // inserting addrspacecasts.
1117     Actions.customIf(typeIs(1, Constant32Ptr));
1118 
1119     // Turn any illegal element vectors into something easier to deal
1120     // with. These will ultimately produce 32-bit scalar shifts to extract the
1121     // parts anyway.
1122     //
1123     // For odd 16-bit element vectors, prefer to split those into pieces with
1124     // 16-bit vector parts.
1125     Actions.bitcastIf(
1126       [=](const LegalityQuery &Query) -> bool {
1127         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1128                                           Query.MMODescrs[0].SizeInBits);
1129       }, bitcastToRegisterType(0));
1130 
1131     if (!IsStore) {
1132       // Widen suitably aligned loads by loading extra bytes. The standard
1133       // legalization actions can't properly express widening memory operands.
1134       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1135         return shouldWidenLoad(ST, Query, G_LOAD);
1136       });
1137     }
1138 
1139     // FIXME: load/store narrowing should be moved to lower action
1140     Actions
1141         .narrowScalarIf(
1142             [=](const LegalityQuery &Query) -> bool {
1143               return !Query.Types[0].isVector() &&
1144                      needToSplitMemOp(Query, Op == G_LOAD);
1145             },
1146             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1147               const LLT DstTy = Query.Types[0];
1148               const LLT PtrTy = Query.Types[1];
1149 
1150               const unsigned DstSize = DstTy.getSizeInBits();
1151               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1152 
1153               // Split extloads.
1154               if (DstSize > MemSize)
1155                 return std::make_pair(0, LLT::scalar(MemSize));
1156 
1157               if (!isPowerOf2_32(DstSize)) {
1158                 // We're probably decomposing an odd sized store. Try to split
1159                 // to the widest type. TODO: Account for alignment. As-is it
1160                 // should be OK, since the new parts will be further legalized.
1161                 unsigned FloorSize = PowerOf2Floor(DstSize);
1162                 return std::make_pair(0, LLT::scalar(FloorSize));
1163               }
1164 
1165               if (DstSize > 32 && (DstSize % 32 != 0)) {
1166                 // FIXME: Need a way to specify non-extload of larger size if
1167                 // suitably aligned.
1168                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1169               }
1170 
1171               unsigned MaxSize = maxSizeForAddrSpace(ST,
1172                                                      PtrTy.getAddressSpace(),
1173                                                      Op == G_LOAD);
1174               if (MemSize > MaxSize)
1175                 return std::make_pair(0, LLT::scalar(MaxSize));
1176 
1177               unsigned Align = Query.MMODescrs[0].AlignInBits;
1178               return std::make_pair(0, LLT::scalar(Align));
1179             })
1180         .fewerElementsIf(
1181             [=](const LegalityQuery &Query) -> bool {
1182               return Query.Types[0].isVector() &&
1183                      needToSplitMemOp(Query, Op == G_LOAD);
1184             },
1185             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1186               const LLT DstTy = Query.Types[0];
1187               const LLT PtrTy = Query.Types[1];
1188 
1189               LLT EltTy = DstTy.getElementType();
1190               unsigned MaxSize = maxSizeForAddrSpace(ST,
1191                                                      PtrTy.getAddressSpace(),
1192                                                      Op == G_LOAD);
1193 
1194               // FIXME: Handle widened to power of 2 results better. This ends
1195               // up scalarizing.
1196               // FIXME: 3 element stores scalarized on SI
1197 
1198               // Split if it's too large for the address space.
1199               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1200                 unsigned NumElts = DstTy.getNumElements();
1201                 unsigned EltSize = EltTy.getSizeInBits();
1202 
1203                 if (MaxSize % EltSize == 0) {
1204                   return std::make_pair(
1205                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1206                 }
1207 
1208                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1209 
1210                 // FIXME: Refine when odd breakdowns handled
1211                 // The scalars will need to be re-legalized.
1212                 if (NumPieces == 1 || NumPieces >= NumElts ||
1213                     NumElts % NumPieces != 0)
1214                   return std::make_pair(0, EltTy);
1215 
1216                 return std::make_pair(0,
1217                                       LLT::vector(NumElts / NumPieces, EltTy));
1218               }
1219 
1220               // FIXME: We could probably handle weird extending loads better.
1221               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1222               if (DstTy.getSizeInBits() > MemSize)
1223                 return std::make_pair(0, EltTy);
1224 
1225               unsigned EltSize = EltTy.getSizeInBits();
1226               unsigned DstSize = DstTy.getSizeInBits();
1227               if (!isPowerOf2_32(DstSize)) {
1228                 // We're probably decomposing an odd sized store. Try to split
1229                 // to the widest type. TODO: Account for alignment. As-is it
1230                 // should be OK, since the new parts will be further legalized.
1231                 unsigned FloorSize = PowerOf2Floor(DstSize);
1232                 return std::make_pair(
1233                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1234               }
1235 
1236               // Need to split because of alignment.
1237               unsigned Align = Query.MMODescrs[0].AlignInBits;
1238               if (EltSize > Align &&
1239                   (EltSize / Align < DstTy.getNumElements())) {
1240                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1241               }
1242 
1243               // May need relegalization for the scalars.
1244               return std::make_pair(0, EltTy);
1245             })
1246     .lowerIfMemSizeNotPow2()
1247     .minScalar(0, S32);
1248 
1249     if (IsStore)
1250       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1251 
1252     Actions
1253         .widenScalarToNextPow2(0)
1254         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1255         .lower();
1256   }
1257 
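  // Sign- and zero-extending loads are only legal when extending an 8- or
  // 16-bit memory access to a 32-bit result.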
1258   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1259                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1260                                                   {S32, GlobalPtr, 16, 2 * 8},
1261                                                   {S32, LocalPtr, 8, 8},
1262                                                   {S32, LocalPtr, 16, 16},
1263                                                   {S32, PrivatePtr, 8, 8},
1264                                                   {S32, PrivatePtr, 16, 16},
1265                                                   {S32, ConstantPtr, 8, 8},
1266                                                   {S32, ConstantPtr, 16, 2 * 8}});
1267   if (ST.hasFlatAddressSpace()) {
1268     ExtLoads.legalForTypesWithMemDesc(
1269         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1270   }
1271 
1272   ExtLoads.clampScalar(0, S32, S32)
1273           .widenScalarToNextPow2(0)
1274           .unsupportedIfMemSizeNotPow2()
1275           .lower();
1276 
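  // 32- and 64-bit atomic read-modify-write operations are legal on global,
  // LDS and region pointers; flat pointers are added below when the subtarget
  // has a flat address space.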
1277   auto &Atomics = getActionDefinitionsBuilder(
1278     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1279      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1280      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1281      G_ATOMICRMW_UMIN})
1282     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1283                {S64, GlobalPtr}, {S64, LocalPtr},
1284                {S32, RegionPtr}, {S64, RegionPtr}});
1285   if (ST.hasFlatAddressSpace()) {
1286     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1287   }
1288 
1289   if (ST.hasLDSFPAtomics()) {
1290     getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1291       .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1292   }
1293 
1294   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and output
1295   // demarshalling.
1296   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1297     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1298                 {S32, FlatPtr}, {S64, FlatPtr}})
1299     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1300                {S32, RegionPtr}, {S64, RegionPtr}});
1301   // TODO: Pointer types, any 32-bit or 64-bit vector
1302 
1303   // Condition should be s32 for scalar, s1 for vector.
1304   getActionDefinitionsBuilder(G_SELECT)
1305     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1306           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1307           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1308     .clampScalar(0, S16, S64)
1309     .scalarize(1)
1310     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1311     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1312     .clampMaxNumElements(0, S32, 2)
1313     .clampMaxNumElements(0, LocalPtr, 2)
1314     .clampMaxNumElements(0, PrivatePtr, 2)
1315     .scalarize(0)
1316     .widenScalarToNextPow2(0)
1317     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1318 
1319   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1320   // be more flexible with the shift amount type.
1321   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1322     .legalFor({{S32, S32}, {S64, S32}});
1323   if (ST.has16BitInsts()) {
1324     if (ST.hasVOP3PInsts()) {
1325       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1326             .clampMaxNumElements(0, S16, 2);
1327     } else
1328       Shifts.legalFor({{S16, S16}});
1329 
1330     // TODO: Support 16-bit shift amounts for all types
1331     Shifts.widenScalarIf(
1332       [=](const LegalityQuery &Query) {
1333         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1334         // 32-bit amount.
1335         const LLT ValTy = Query.Types[0];
1336         const LLT AmountTy = Query.Types[1];
1337         return ValTy.getSizeInBits() <= 16 &&
1338                AmountTy.getSizeInBits() < 16;
1339       }, changeTo(1, S16));
1340     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1341     Shifts.clampScalar(1, S32, S32);
1342     Shifts.clampScalar(0, S16, S64);
1343     Shifts.widenScalarToNextPow2(0, 16);
1344 
1345     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1346       .minScalar(0, S16)
1347       .scalarize(0)
1348       .lower();
1349   } else {
1350     // Make sure we legalize the shift amount type first, as the general
1351     // expansion for the shifted type will produce much worse code if it hasn't
1352     // been truncated already.
1353     Shifts.clampScalar(1, S32, S32);
1354     Shifts.clampScalar(0, S32, S64);
1355     Shifts.widenScalarToNextPow2(0, 32);
1356 
1357     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1358       .minScalar(0, S32)
1359       .scalarize(0)
1360       .lower();
1361   }
1362   Shifts.scalarize(0);
1363 
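  // Dynamic vector indexing: vectors with 32- or 64-bit elements that fit in
  // registers are custom lowered; other element sizes are first bitcast to
  // vectors of 32- or 64-bit elements, and remaining cases are lowered
  // (ultimately via the stack).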
1364   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1365     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1366     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1367     unsigned IdxTypeIdx = 2;
1368 
1369     getActionDefinitionsBuilder(Op)
1370       .customIf([=](const LegalityQuery &Query) {
1371           const LLT EltTy = Query.Types[EltTypeIdx];
1372           const LLT VecTy = Query.Types[VecTypeIdx];
1373           const LLT IdxTy = Query.Types[IdxTypeIdx];
1374           const unsigned EltSize = EltTy.getSizeInBits();
1375           return (EltSize == 32 || EltSize == 64) &&
1376                   VecTy.getSizeInBits() % 32 == 0 &&
1377                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1378                   IdxTy.getSizeInBits() == 32;
1379         })
1380       .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1381                  bitcastToVectorElement32(VecTypeIdx))
1382       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1383       .bitcastIf(
1384         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1385         [=](const LegalityQuery &Query) {
1386           // For > 64-bit element types, try to turn this into a 64-bit
1387           // element vector since we may be able to do better indexing
1388           // if this is scalar. If not, fall back to 32.
1389           const LLT EltTy = Query.Types[EltTypeIdx];
1390           const LLT VecTy = Query.Types[VecTypeIdx];
1391           const unsigned DstEltSize = EltTy.getSizeInBits();
1392           const unsigned VecSize = VecTy.getSizeInBits();
1393 
1394           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1395           return std::make_pair(
1396             VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
1397         })
1398       .clampScalar(EltTypeIdx, S32, S64)
1399       .clampScalar(VecTypeIdx, S32, S64)
1400       .clampScalar(IdxTypeIdx, S32, S32)
1401       .clampMaxNumElements(VecTypeIdx, S32, 32)
1402       // TODO: Clamp elements for 64-bit vectors?
1403       // It should only be necessary with variable indexes.
1404       // As a last resort, lower to the stack
1405       .lower();
1406   }
1407 
1408   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1409     .unsupportedIf([=](const LegalityQuery &Query) {
1410         const LLT &EltTy = Query.Types[1].getElementType();
1411         return Query.Types[0] != EltTy;
1412       });
1413 
1414   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1415     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1416     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1417 
1418     // FIXME: Doesn't handle extract of illegal sizes.
1419     getActionDefinitionsBuilder(Op)
1420       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1421       // FIXME: Multiples of 16 should not be legal.
1422       .legalIf([=](const LegalityQuery &Query) {
1423           const LLT BigTy = Query.Types[BigTyIdx];
1424           const LLT LitTy = Query.Types[LitTyIdx];
1425           return (BigTy.getSizeInBits() % 32 == 0) &&
1426                  (LitTy.getSizeInBits() % 16 == 0);
1427         })
1428       .widenScalarIf(
1429         [=](const LegalityQuery &Query) {
1430           const LLT BigTy = Query.Types[BigTyIdx];
1431           return (BigTy.getScalarSizeInBits() < 16);
1432         },
1433         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1434       .widenScalarIf(
1435         [=](const LegalityQuery &Query) {
1436           const LLT LitTy = Query.Types[LitTyIdx];
1437           return (LitTy.getScalarSizeInBits() < 16);
1438         },
1439         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1440       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1441       .widenScalarToNextPow2(BigTyIdx, 32);
1442 
1443   }
1444 
1445   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1446     .legalForCartesianProduct(AllS32Vectors, {S32})
1447     .legalForCartesianProduct(AllS64Vectors, {S64})
1448     .clampNumElements(0, V16S32, V32S32)
1449     .clampNumElements(0, V2S64, V16S64)
1450     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1451 
1452   if (ST.hasScalarPackInsts()) {
1453     BuildVector
1454       // FIXME: Should probably widen s1 vectors straight to s32
1455       .minScalarOrElt(0, S16)
1456       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1457       .minScalar(1, S32);
1458 
1459     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1460       .legalFor({V2S16, S32})
1461       .lower();
1462     BuildVector.minScalarOrElt(0, S32);
1463   } else {
1464     BuildVector.customFor({V2S16, S16});
1465     BuildVector.minScalarOrElt(0, S32);
1466 
1467     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1468       .customFor({V2S16, S32})
1469       .lower();
1470   }
1471 
1472   BuildVector.legalIf(isRegisterType(0));
1473 
1474   // FIXME: Clamp maximum size
1475   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1476     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1477     .clampMaxNumElements(0, S32, 32)
1478     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1479     .clampMaxNumElements(0, S16, 64);
1480 
1481   // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1482   // pre-legalize.
1483   if (ST.hasVOP3PInsts()) {
1484     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1485       .customFor({V2S16, V2S16})
1486       .lower();
1487   } else
1488     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1489 
1490   // Merge/Unmerge
1491   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1492     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1493     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1494 
1495     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1496       const LLT Ty = Query.Types[TypeIdx];
1497       if (Ty.isVector()) {
1498         const LLT &EltTy = Ty.getElementType();
1499         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1500           return true;
1501         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1502           return true;
1503       }
1504       return false;
1505     };
1506 
1507     auto &Builder = getActionDefinitionsBuilder(Op)
1508       .legalIf(all(isRegisterType(0), isRegisterType(1)))
1509       .lowerFor({{S16, V2S16}})
1510       .lowerIf([=](const LegalityQuery &Query) {
1511           const LLT BigTy = Query.Types[BigTyIdx];
1512           return BigTy.getSizeInBits() == 32;
1513         })
1514       // Try to widen to s16 first for small types.
1515       // TODO: Only do this on targets with legal s16 shifts
1516       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1517       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1518       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1519       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1520                            elementTypeIs(1, S16)),
1521                        changeTo(1, V2S16))
1522       // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1523       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1524       // valid.
1525       .clampScalar(LitTyIdx, S32, S512)
1526       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1527       // Break up vectors with weird elements into scalars
1528       .fewerElementsIf(
1529         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1530         scalarize(0))
1531       .fewerElementsIf(
1532         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1533         scalarize(1))
1534       .clampScalar(BigTyIdx, S32, MaxScalar);
1535 
1536     if (Op == G_MERGE_VALUES) {
1537       Builder.widenScalarIf(
1538         // TODO: Use 16-bit shifts if legal for 8-bit values?
1539         [=](const LegalityQuery &Query) {
1540           const LLT Ty = Query.Types[LitTyIdx];
1541           return Ty.getSizeInBits() < 32;
1542         },
1543         changeTo(LitTyIdx, S32));
1544     }
1545 
1546     Builder.widenScalarIf(
1547       [=](const LegalityQuery &Query) {
1548         const LLT Ty = Query.Types[BigTyIdx];
1549         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1550           Ty.getSizeInBits() % 16 != 0;
1551       },
1552       [=](const LegalityQuery &Query) {
1553         // Pick the next power of 2, or a multiple of 64 for sizes over
1554         // 128 bits, whichever is smaller.
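             // For example, s65 widens to s128 (the next power of 2), while
             // s300 widens to s320, since alignTo<64>(301) == 320 is smaller
             // than the next power of 2 (512).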
1555         const LLT &Ty = Query.Types[BigTyIdx];
1556         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1557         if (NewSizeInBits >= 256) {
1558           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1559           if (RoundedTo < NewSizeInBits)
1560             NewSizeInBits = RoundedTo;
1561         }
1562         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1563       })
1564       // Any vectors left are the wrong size. Scalarize them.
1565       .scalarize(0)
1566       .scalarize(1);
1567   }
1568 
1569   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1570   // RegBankSelect.
1571   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1572     .legalFor({{S32}, {S64}});
1573 
1574   if (ST.hasVOP3PInsts()) {
1575     SextInReg.lowerFor({{V2S16}})
1576       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1577       // get more vector shift opportunities, since we'll get those when
1578       // expanded.
1579       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1580   } else if (ST.has16BitInsts()) {
1581     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1582   } else {
1583     // Prefer to promote to s32 before lowering if we don't have 16-bit
1584     // shifts. This avoids a lot of intermediate truncate and extend operations.
1585     SextInReg.lowerFor({{S32}, {S64}});
1586   }
1587 
1588   SextInReg
1589     .scalarize(0)
1590     .clampScalar(0, S32, S64)
1591     .lower();
1592 
1593   getActionDefinitionsBuilder(G_FSHR)
1594     .legalFor({{S32, S32}})
1595     .scalarize(0)
1596     .lower();
1597 
1598   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1599     .legalFor({S64});
1600 
1601   getActionDefinitionsBuilder(G_FENCE)
1602     .alwaysLegal();
1603 
1604   getActionDefinitionsBuilder({
1605       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1606       G_FCOPYSIGN,
1607 
1608       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1609       G_ATOMICRMW_NAND,
1610       G_ATOMICRMW_FSUB,
1611       G_READ_REGISTER,
1612       G_WRITE_REGISTER,
1613 
1614       G_SADDO, G_SSUBO,
1615 
1616       // TODO: Implement
1617       G_FMINIMUM, G_FMAXIMUM,
1618       G_FSHL
1619     }).lower();
1620 
1621   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1622         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1623         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1624     .unsupported();
1625 
1626   computeTables();
1627   verify(*ST.getInstrInfo());
1628 }
1629 
1630 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1631                                          MachineInstr &MI) const {
1632   MachineIRBuilder &B = Helper.MIRBuilder;
1633   MachineRegisterInfo &MRI = *B.getMRI();
1634 
1635   switch (MI.getOpcode()) {
1636   case TargetOpcode::G_ADDRSPACE_CAST:
1637     return legalizeAddrSpaceCast(MI, MRI, B);
1638   case TargetOpcode::G_FRINT:
1639     return legalizeFrint(MI, MRI, B);
1640   case TargetOpcode::G_FCEIL:
1641     return legalizeFceil(MI, MRI, B);
1642   case TargetOpcode::G_FREM:
1643     return legalizeFrem(MI, MRI, B);
1644   case TargetOpcode::G_INTRINSIC_TRUNC:
1645     return legalizeIntrinsicTrunc(MI, MRI, B);
1646   case TargetOpcode::G_SITOFP:
1647     return legalizeITOFP(MI, MRI, B, true);
1648   case TargetOpcode::G_UITOFP:
1649     return legalizeITOFP(MI, MRI, B, false);
1650   case TargetOpcode::G_FPTOSI:
1651     return legalizeFPTOI(MI, MRI, B, true);
1652   case TargetOpcode::G_FPTOUI:
1653     return legalizeFPTOI(MI, MRI, B, false);
1654   case TargetOpcode::G_FMINNUM:
1655   case TargetOpcode::G_FMAXNUM:
1656   case TargetOpcode::G_FMINNUM_IEEE:
1657   case TargetOpcode::G_FMAXNUM_IEEE:
1658     return legalizeMinNumMaxNum(Helper, MI);
1659   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1660     return legalizeExtractVectorElt(MI, MRI, B);
1661   case TargetOpcode::G_INSERT_VECTOR_ELT:
1662     return legalizeInsertVectorElt(MI, MRI, B);
1663   case TargetOpcode::G_SHUFFLE_VECTOR:
1664     return legalizeShuffleVector(MI, MRI, B);
1665   case TargetOpcode::G_FSIN:
1666   case TargetOpcode::G_FCOS:
1667     return legalizeSinCos(MI, MRI, B);
1668   case TargetOpcode::G_GLOBAL_VALUE:
1669     return legalizeGlobalValue(MI, MRI, B);
1670   case TargetOpcode::G_LOAD:
1671     return legalizeLoad(Helper, MI);
1672   case TargetOpcode::G_FMAD:
1673     return legalizeFMad(MI, MRI, B);
1674   case TargetOpcode::G_FDIV:
1675     return legalizeFDIV(MI, MRI, B);
1676   case TargetOpcode::G_UDIV:
1677   case TargetOpcode::G_UREM:
1678     return legalizeUDIV_UREM(MI, MRI, B);
1679   case TargetOpcode::G_SDIV:
1680   case TargetOpcode::G_SREM:
1681     return legalizeSDIV_SREM(MI, MRI, B);
1682   case TargetOpcode::G_ATOMIC_CMPXCHG:
1683     return legalizeAtomicCmpXChg(MI, MRI, B);
1684   case TargetOpcode::G_FLOG:
1685     return legalizeFlog(MI, B, numbers::ln2f);
1686   case TargetOpcode::G_FLOG10:
1687     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1688   case TargetOpcode::G_FEXP:
1689     return legalizeFExp(MI, B);
1690   case TargetOpcode::G_FPOW:
1691     return legalizeFPow(MI, B);
1692   case TargetOpcode::G_FFLOOR:
1693     return legalizeFFloor(MI, MRI, B);
1694   case TargetOpcode::G_BUILD_VECTOR:
1695     return legalizeBuildVector(MI, MRI, B);
1696   default:
1697     return false;
1698   }
1699 
1700   llvm_unreachable("expected switch to return");
1701 }
1702 
1703 Register AMDGPULegalizerInfo::getSegmentAperture(
1704   unsigned AS,
1705   MachineRegisterInfo &MRI,
1706   MachineIRBuilder &B) const {
1707   MachineFunction &MF = B.getMF();
1708   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1709   const LLT S32 = LLT::scalar(32);
1710 
1711   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1712 
1713   if (ST.hasApertureRegs()) {
1714     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1715     // getreg.
1716     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1717         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1718         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1719     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1720         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1721         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1722     unsigned Encoding =
1723         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1724         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1725         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1726 
1727     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1728 
1729     B.buildInstr(AMDGPU::S_GETREG_B32)
1730       .addDef(GetReg)
1731       .addImm(Encoding);
1732     MRI.setType(GetReg, S32);
1733 
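         // The 16-bit hwreg field holds the aperture base shifted right by 16,
         // so shifting it back up by the field width (WidthM1 + 1 == 16)
         // reconstructs the 32-bit aperture, i.e. the high 32 bits of a flat
         // address within this segment.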
1734     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1735     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1736   }
1737 
1738   Register QueuePtr = MRI.createGenericVirtualRegister(
1739     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1740 
1741   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
1742     return Register();
1743 
1744   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1745   // private_segment_aperture_base_hi.
1746   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1747 
1748   // TODO: can we be smarter about machine pointer info?
1749   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1750   MachineMemOperand *MMO = MF.getMachineMemOperand(
1751       PtrInfo,
1752       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1753           MachineMemOperand::MOInvariant,
1754       4, commonAlignment(Align(64), StructOffset));
1755 
1756   Register LoadAddr;
1757 
1758   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1759   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1760 }
1761 
1762 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1763   MachineInstr &MI, MachineRegisterInfo &MRI,
1764   MachineIRBuilder &B) const {
1765   MachineFunction &MF = B.getMF();
1766 
1767   const LLT S32 = LLT::scalar(32);
1768   Register Dst = MI.getOperand(0).getReg();
1769   Register Src = MI.getOperand(1).getReg();
1770 
1771   LLT DstTy = MRI.getType(Dst);
1772   LLT SrcTy = MRI.getType(Src);
1773   unsigned DestAS = DstTy.getAddressSpace();
1774   unsigned SrcAS = SrcTy.getAddressSpace();
1775 
1776   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1777   // vector element.
1778   assert(!DstTy.isVector());
1779 
1780   const AMDGPUTargetMachine &TM
1781     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1782 
1783   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
1784     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1785     return true;
1786   }
1787 
1788   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1789     // Truncate.
1790     B.buildExtract(Dst, Src, 0);
1791     MI.eraseFromParent();
1792     return true;
1793   }
1794 
1795   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1796     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1797     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1798 
1799     // FIXME: This is a bit ugly due to creating a merge of 2 pointers into
1800     // another pointer. Merge operands are required to be the same type, but
1801     // creating an extra ptrtoint would be kind of pointless.
1802     auto HighAddr = B.buildConstant(
1803       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1804     B.buildMerge(Dst, {Src, HighAddr});
1805     MI.eraseFromParent();
1806     return true;
1807   }
1808 
1809   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1810     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1811            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1812     unsigned NullVal = TM.getNullPointerValue(DestAS);
1813 
1814     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1815     auto FlatNull = B.buildConstant(SrcTy, 0);
1816 
1817     // Extract low 32-bits of the pointer.
1818     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1819 
1820     auto CmpRes =
1821         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1822     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1823 
1824     MI.eraseFromParent();
1825     return true;
1826   }
1827 
1828   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1829     return false;
1830 
1831   if (!ST.hasFlatAddressSpace())
1832     return false;
1833 
1834   auto SegmentNull =
1835       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1836   auto FlatNull =
1837       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1838 
1839   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1840   if (!ApertureReg.isValid())
1841     return false;
1842 
1843   auto CmpRes =
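       // Local/private -> flat: build the 64-bit flat pointer from the 32-bit
       // segment offset (low half) and the segment aperture (high half), and
       // map the segment null value to the flat null pointer.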
1844       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1845 
1846   // Coerce the type of the low half of the result so we can use merge_values.
1847   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1848 
1849   // TODO: Should we allow mismatched types but matching sizes in merges to
1850   // avoid the ptrtoint?
1851   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1852   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1853 
1854   MI.eraseFromParent();
1855   return true;
1856 }
1857 
1858 bool AMDGPULegalizerInfo::legalizeFrint(
1859   MachineInstr &MI, MachineRegisterInfo &MRI,
1860   MachineIRBuilder &B) const {
1861   Register Src = MI.getOperand(1).getReg();
1862   LLT Ty = MRI.getType(Src);
1863   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1864 
1865   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1866   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
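       // C1 is 2^52: doubles with magnitude >= 2^52 have no fraction bits, so
       // adding and then subtracting copysign(2^52, src) rounds src to the
       // nearest integer. C2 is just below 2^52 and gates the final select so
       // that sources too large to have a fractional part pass through
       // unchanged.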
1867 
1868   auto C1 = B.buildFConstant(Ty, C1Val);
1869   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1870 
1871   // TODO: Should this propagate fast-math-flags?
1872   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1873   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1874 
1875   auto C2 = B.buildFConstant(Ty, C2Val);
1876   auto Fabs = B.buildFAbs(Ty, Src);
1877 
1878   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1879   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1880   MI.eraseFromParent();
1881   return true;
1882 }
1883 
1884 bool AMDGPULegalizerInfo::legalizeFceil(
1885   MachineInstr &MI, MachineRegisterInfo &MRI,
1886   MachineIRBuilder &B) const {
1887 
1888   const LLT S1 = LLT::scalar(1);
1889   const LLT S64 = LLT::scalar(64);
1890 
1891   Register Src = MI.getOperand(1).getReg();
1892   assert(MRI.getType(Src) == S64);
1893 
1894   // result = trunc(src)
1895   // if (src > 0.0 && src != result)
1896   //   result += 1.0
1897 
1898   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1899 
1900   const auto Zero = B.buildFConstant(S64, 0.0);
1901   const auto One = B.buildFConstant(S64, 1.0);
1902   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1903   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1904   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1905   auto Add = B.buildSelect(S64, And, One, Zero);
1906 
1907   // TODO: Should this propagate fast-math-flags?
1908   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
       MI.eraseFromParent();
1909   return true;
1910 }
1911 
1912 bool AMDGPULegalizerInfo::legalizeFrem(
1913   MachineInstr &MI, MachineRegisterInfo &MRI,
1914   MachineIRBuilder &B) const {
1915     Register DstReg = MI.getOperand(0).getReg();
1916     Register Src0Reg = MI.getOperand(1).getReg();
1917     Register Src1Reg = MI.getOperand(2).getReg();
1918     auto Flags = MI.getFlags();
1919     LLT Ty = MRI.getType(DstReg);
1920 
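         // rem = x - trunc(x / y) * y, computed as fma(-trunc(x / y), y, x).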
1921     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
1922     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
1923     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
1924     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
1925     MI.eraseFromParent();
1926     return true;
1927 }
1928 
1929 static MachineInstrBuilder extractF64Exponent(Register Hi,
1930                                               MachineIRBuilder &B) {
1931   const unsigned FractBits = 52;
1932   const unsigned ExpBits = 11;
1933   LLT S32 = LLT::scalar(32);
1934 
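       // The exponent occupies bits [62:52] of the f64, i.e. bits [30:20] of
       // the high word: bfe extracts ExpBits (11) bits starting at bit
       // FractBits - 32 == 20, then the bias of 1023 is subtracted.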
1935   auto Const0 = B.buildConstant(S32, FractBits - 32);
1936   auto Const1 = B.buildConstant(S32, ExpBits);
1937 
1938   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1939     .addUse(Hi)
1940     .addUse(Const0.getReg(0))
1941     .addUse(Const1.getReg(0));
1942 
1943   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1944 }
1945 
1946 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1947   MachineInstr &MI, MachineRegisterInfo &MRI,
1948   MachineIRBuilder &B) const {
1949   const LLT S1 = LLT::scalar(1);
1950   const LLT S32 = LLT::scalar(32);
1951   const LLT S64 = LLT::scalar(64);
1952 
1953   Register Src = MI.getOperand(1).getReg();
1954   assert(MRI.getType(Src) == S64);
1955 
1956   // TODO: Should this use extract since the low half is unused?
1957   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1958   Register Hi = Unmerge.getReg(1);
1959 
1960   // Extract the upper half, since this is where we will find the sign and
1961   // exponent.
1962   auto Exp = extractF64Exponent(Hi, B);
1963 
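       // trunc for f64, in terms of the unbiased exponent Exp:
       //   Exp < 0  -> the result is +/-0.0 (keep only the sign bit)
       //   Exp > 51 -> the value is already an integer; return it unchanged
       //   else     -> clear the low 52 - Exp fraction bits of the mantissa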
1964   const unsigned FractBits = 52;
1965 
1966   // Extract the sign bit.
1967   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1968   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1969 
1970   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1971 
1972   const auto Zero32 = B.buildConstant(S32, 0);
1973 
1974   // Extend back to 64-bits.
1975   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1976 
1977   auto Shr = B.buildAShr(S64, FractMask, Exp);
1978   auto Not = B.buildNot(S64, Shr);
1979   auto Tmp0 = B.buildAnd(S64, Src, Not);
1980   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1981 
1982   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1983   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1984 
1985   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1986   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1987   MI.eraseFromParent();
1988   return true;
1989 }
1990 
1991 bool AMDGPULegalizerInfo::legalizeITOFP(
1992   MachineInstr &MI, MachineRegisterInfo &MRI,
1993   MachineIRBuilder &B, bool Signed) const {
1994 
1995   Register Dst = MI.getOperand(0).getReg();
1996   Register Src = MI.getOperand(1).getReg();
1997 
1998   const LLT S64 = LLT::scalar(64);
1999   const LLT S32 = LLT::scalar(32);
2000 
2001   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2002 
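       // Convert the two 32-bit halves separately and recombine as
       // hi * 2^32 + lo. Only the high half carries the sign; the low half is
       // always converted as unsigned, and ldexp applies the exact 2^32 scale.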
2003   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2004 
2005   auto CvtHi = Signed ?
2006     B.buildSITOFP(S64, Unmerge.getReg(1)) :
2007     B.buildUITOFP(S64, Unmerge.getReg(1));
2008 
2009   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2010 
2011   auto ThirtyTwo = B.buildConstant(S32, 32);
2012   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
2013     .addUse(CvtHi.getReg(0))
2014     .addUse(ThirtyTwo.getReg(0));
2015 
2016   // TODO: Should this propagate fast-math-flags?
2017   B.buildFAdd(Dst, LdExp, CvtLo);
2018   MI.eraseFromParent();
2019   return true;
2020 }
2021 
2022 // TODO: Copied from DAG implementation. Verify logic and document how this
2023 // actually works.
2024 bool AMDGPULegalizerInfo::legalizeFPTOI(
2025   MachineInstr &MI, MachineRegisterInfo &MRI,
2026   MachineIRBuilder &B, bool Signed) const {
2027 
2028   Register Dst = MI.getOperand(0).getReg();
2029   Register Src = MI.getOperand(1).getReg();
2030 
2031   const LLT S64 = LLT::scalar(64);
2032   const LLT S32 = LLT::scalar(32);
2033 
2034   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
2035 
2036   unsigned Flags = MI.getFlags();
2037 
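       // Split trunc(Src) into 32-bit halves before converting:
       //   Hi = floor(trunc(Src) * 2^-32)   (K0 = 0x3df0000000000000 = 2^-32)
       //   Lo = fma(Hi, -2^32, trunc(Src))  (K1 = 0xc1f0000000000000 = -2^32)
       // Lo is always non-negative, so it is converted as unsigned.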
2038   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
2039   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
2040   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
2041 
2042   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
2043   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
2044   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
2045 
2046   auto Hi = Signed ?
2047     B.buildFPTOSI(S32, FloorMul) :
2048     B.buildFPTOUI(S32, FloorMul);
2049   auto Lo = B.buildFPTOUI(S32, Fma);
2050 
2051   B.buildMerge(Dst, { Lo, Hi });
2052   MI.eraseFromParent();
2053 
2054   return true;
2055 }
2056 
2057 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
2058                                                MachineInstr &MI) const {
2059   MachineFunction &MF = Helper.MIRBuilder.getMF();
2060   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2061 
2062   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
2063                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
2064 
2065   // With ieee_mode disabled, the instructions have the correct behavior
2066   // already for G_FMINNUM/G_FMAXNUM
2067   if (!MFI->getMode().IEEE)
2068     return !IsIEEEOp;
2069 
2070   if (IsIEEEOp)
2071     return true;
2072 
2073   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
2074 }
2075 
2076 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
2077   MachineInstr &MI, MachineRegisterInfo &MRI,
2078   MachineIRBuilder &B) const {
2079   // TODO: Should move some of this into LegalizerHelper.
2080 
2081   // TODO: Promote dynamic indexing of s16 to s32
2082 
2083   // FIXME: Artifact combiner probably should have replaced the truncated
2084   // constant before this, so we shouldn't need
2085   // getConstantVRegValWithLookThrough.
2086   Optional<ValueAndVReg> MaybeIdxVal =
2087       getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2088   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2089     return true;
2090   const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2091 
2092   Register Dst = MI.getOperand(0).getReg();
2093   Register Vec = MI.getOperand(1).getReg();
2094 
2095   LLT VecTy = MRI.getType(Vec);
2096   LLT EltTy = VecTy.getElementType();
2097   assert(EltTy == MRI.getType(Dst));
2098 
2099   if (IdxVal < VecTy.getNumElements())
2100     B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
2101   else
2102     B.buildUndef(Dst);
2103 
2104   MI.eraseFromParent();
2105   return true;
2106 }
2107 
2108 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
2109   MachineInstr &MI, MachineRegisterInfo &MRI,
2110   MachineIRBuilder &B) const {
2111   // TODO: Should move some of this into LegalizerHelper.
2112 
2113   // TODO: Promote dynamic indexing of s16 to s32
2114 
2115   // FIXME: Artifact combiner probably should have replaced the truncated
2116   // constant before this, so we shouldn't need
2117   // getConstantVRegValWithLookThrough.
2118   Optional<ValueAndVReg> MaybeIdxVal =
2119       getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2120   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
2121     return true;
2122 
2123   int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
2124   Register Dst = MI.getOperand(0).getReg();
2125   Register Vec = MI.getOperand(1).getReg();
2126   Register Ins = MI.getOperand(2).getReg();
2127 
2128   LLT VecTy = MRI.getType(Vec);
2129   LLT EltTy = VecTy.getElementType();
2130   assert(EltTy == MRI.getType(Ins));
2131 
2132   if (IdxVal < VecTy.getNumElements())
2133     B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
2134   else
2135     B.buildUndef(Dst);
2136 
2137   MI.eraseFromParent();
2138   return true;
2139 }
2140 
2141 bool AMDGPULegalizerInfo::legalizeShuffleVector(
2142   MachineInstr &MI, MachineRegisterInfo &MRI,
2143   MachineIRBuilder &B) const {
2144   const LLT V2S16 = LLT::vector(2, 16);
2145 
2146   Register Dst = MI.getOperand(0).getReg();
2147   Register Src0 = MI.getOperand(1).getReg();
2148   LLT DstTy = MRI.getType(Dst);
2149   LLT SrcTy = MRI.getType(Src0);
2150 
2151   if (SrcTy == V2S16 && DstTy == V2S16 &&
2152       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
2153     return true;
2154 
2155   MachineIRBuilder HelperBuilder(MI);
2156   GISelObserverWrapper DummyObserver;
2157   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
2158   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
2159 }
2160 
2161 bool AMDGPULegalizerInfo::legalizeSinCos(
2162   MachineInstr &MI, MachineRegisterInfo &MRI,
2163   MachineIRBuilder &B) const {
2164 
2165   Register DstReg = MI.getOperand(0).getReg();
2166   Register SrcReg = MI.getOperand(1).getReg();
2167   LLT Ty = MRI.getType(DstReg);
2168   unsigned Flags = MI.getFlags();
2169 
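       // The hardware sin/cos take their input in units of full turns, so the
       // argument is scaled by 1/(2*pi) first. On subtargets where the
       // instructions only accept a reduced input range (hasTrigReducedRange),
       // the scaled value is additionally reduced to [0, 1) with fract.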
2170   Register TrigVal;
2171   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
2172   if (ST.hasTrigReducedRange()) {
2173     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2174     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
2175       .addUse(MulVal.getReg(0))
2176       .setMIFlags(Flags).getReg(0);
2177   } else
2178     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2179 
2180   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2181     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2182   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2183     .addUse(TrigVal)
2184     .setMIFlags(Flags);
2185   MI.eraseFromParent();
2186   return true;
2187 }
2188 
2189 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2190                                                   MachineIRBuilder &B,
2191                                                   const GlobalValue *GV,
2192                                                   int64_t Offset,
2193                                                   unsigned GAFlags) const {
2194   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2195   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2196   // to the following code sequence:
2197   //
2198   // For constant address space:
2199   //   s_getpc_b64 s[0:1]
2200   //   s_add_u32 s0, s0, $symbol
2201   //   s_addc_u32 s1, s1, 0
2202   //
2203   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2204   //   a fixup or relocation is emitted to replace $symbol with a literal
2205   //   constant, which is a pc-relative offset from the encoding of the $symbol
2206   //   operand to the global variable.
2207   //
2208   // For global address space:
2209   //   s_getpc_b64 s[0:1]
2210   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2211   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2212   //
2213   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2214   //   fixups or relocations are emitted to replace $symbol@*@lo and
2215   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2216   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2217   //   operand to the global variable.
2218   //
2219   // What we want here is an offset from the value returned by s_getpc
2220   // (which is the address of the s_add_u32 instruction) to the global
2221   // variable, but since the encoding of $symbol starts 4 bytes after the start
2222   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2223   // small. This requires us to add 4 to the global variable offset in order to
2224   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2225   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2226   // instruction.
2227 
2228   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2229 
2230   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2231     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2232 
2233   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2234     .addDef(PCReg);
2235 
2236   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2237   if (GAFlags == SIInstrInfo::MO_NONE)
2238     MIB.addImm(0);
2239   else
2240     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
2241 
2242   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2243 
2244   if (PtrTy.getSizeInBits() == 32)
2245     B.buildExtract(DstReg, PCReg, 0);
2246   return true;
2247 }
2248 
2249 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2250   MachineInstr &MI, MachineRegisterInfo &MRI,
2251   MachineIRBuilder &B) const {
2252   Register DstReg = MI.getOperand(0).getReg();
2253   LLT Ty = MRI.getType(DstReg);
2254   unsigned AS = Ty.getAddressSpace();
2255 
2256   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2257   MachineFunction &MF = B.getMF();
2258   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2259 
2260   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2261     if (!MFI->isModuleEntryFunction()) {
2262       const Function &Fn = MF.getFunction();
2263       DiagnosticInfoUnsupported BadLDSDecl(
2264         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2265         DS_Warning);
2266       Fn.getContext().diagnose(BadLDSDecl);
2267 
2268       // We currently don't have a way to correctly allocate LDS objects that
2269       // aren't directly associated with a kernel. We do force inlining of
2270       // functions that use local objects. However, if these dead functions are
2271       // not eliminated, we don't want a compile time error. Just emit a warning
2272       // and a trap, since there should be no callable path here.
2273       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2274       B.buildUndef(DstReg);
2275       MI.eraseFromParent();
2276       return true;
2277     }
2278 
2279     // TODO: We could emit code to handle the initialization somewhere.
2280     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2281       const SITargetLowering *TLI = ST.getTargetLowering();
2282       if (!TLI->shouldUseLDSConstAddress(GV)) {
2283         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2284         return true; // Leave in place;
2285       }
2286 
2287       if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2288         Type *Ty = GV->getValueType();
2289         // HIP uses an unsized array `extern __shared__ T s[]` or similar
2290         // zero-sized type in other languages to declare the dynamic shared
2291         // memory whose size is not known at compile time. They will be
2292         // allocated by the runtime and placed directly after the static
2293         // allocated ones. They all share the same offset.
2294         if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2295           // Adjust alignment for that dynamic shared memory array.
2296           MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2297           LLT S32 = LLT::scalar(32);
2298           auto Sz =
2299               B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2300           B.buildIntToPtr(DstReg, Sz);
2301           MI.eraseFromParent();
2302           return true;
2303         }
2304       }
2305 
2306       B.buildConstant(
2307           DstReg,
2308           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2309       MI.eraseFromParent();
2310       return true;
2311     }
2312 
2313     const Function &Fn = MF.getFunction();
2314     DiagnosticInfoUnsupported BadInit(
2315       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2316     Fn.getContext().diagnose(BadInit);
2317     return true;
2318   }
2319 
2320   const SITargetLowering *TLI = ST.getTargetLowering();
2321 
2322   if (TLI->shouldEmitFixup(GV)) {
2323     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2324     MI.eraseFromParent();
2325     return true;
2326   }
2327 
2328   if (TLI->shouldEmitPCReloc(GV)) {
2329     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2330     MI.eraseFromParent();
2331     return true;
2332   }
2333 
2334   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2335   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2336 
2337   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2338       MachinePointerInfo::getGOT(MF),
2339       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2340           MachineMemOperand::MOInvariant,
2341       8 /*Size*/, Align(8));
2342 
2343   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2344 
2345   if (Ty.getSizeInBits() == 32) {
2346     // Truncate if this is a 32-bit constant address.
2347     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2348     B.buildExtract(DstReg, Load, 0);
2349   } else
2350     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2351 
2352   MI.eraseFromParent();
2353   return true;
2354 }
2355 
2356 static LLT widenToNextPowerOf2(LLT Ty) {
2357   if (Ty.isVector())
2358     return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements()));
2359   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2360 }
2361 
2362 bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2363                                        MachineInstr &MI) const {
2364   MachineIRBuilder &B = Helper.MIRBuilder;
2365   MachineRegisterInfo &MRI = *B.getMRI();
2366   GISelChangeObserver &Observer = Helper.Observer;
2367 
2368   Register PtrReg = MI.getOperand(1).getReg();
2369   LLT PtrTy = MRI.getType(PtrReg);
2370   unsigned AddrSpace = PtrTy.getAddressSpace();
2371 
2372   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
2373     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2374     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
2375     Observer.changingInstr(MI);
2376     MI.getOperand(1).setReg(Cast.getReg(0));
2377     Observer.changedInstr(MI);
2378     return true;
2379   }
2380 
2381   Register ValReg = MI.getOperand(0).getReg();
2382   LLT ValTy = MRI.getType(ValReg);
2383 
2384   MachineMemOperand *MMO = *MI.memoperands_begin();
2385   const unsigned ValSize = ValTy.getSizeInBits();
2386   const unsigned MemSize = 8 * MMO->getSize();
2387   const Align MemAlign = MMO->getAlign();
2388   const unsigned AlignInBits = 8 * MemAlign.value();
2389 
2390   // Widen non-power-of-2 loads to the alignment if needed
2391   if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) {
2392     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
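         // e.g. a sufficiently aligned 96-bit (<3 x s32>) load is widened to
         // 128 bits; the extra element is dropped again below with a trunc,
         // G_EXTRACT or unmerge, depending on the result type.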
2393 
2394     // This was already the correct extending load result type, so just adjust
2395     // the memory type.
2396     if (WideMemSize == ValSize) {
2397       MachineFunction &MF = B.getMF();
2398 
2399       MachineMemOperand *WideMMO =
2400           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2401       Observer.changingInstr(MI);
2402       MI.setMemRefs(MF, {WideMMO});
2403       Observer.changedInstr(MI);
2404       return true;
2405     }
2406 
2407     // Don't bother handling an edge case that should probably never be produced.
2408     if (ValSize > WideMemSize)
2409       return false;
2410 
2411     LLT WideTy = widenToNextPowerOf2(ValTy);
2412 
2413     Register WideLoad;
2414     if (!WideTy.isVector()) {
2415       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2416       B.buildTrunc(ValReg, WideLoad).getReg(0);
2417     } else {
2418       // Extract the subvector.
2419 
2420       if (isRegisterType(ValTy)) {
2421         // If this a case where G_EXTRACT is legal, use it.
2422         // (e.g. <3 x s32> -> <4 x s32>)
2423         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2424         B.buildExtract(ValReg, WideLoad, 0);
2425       } else {
2426         // For cases where the widened type isn't a nice register value, unmerge
2427         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2428         B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2429         WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2430         B.setInsertPt(B.getMBB(), MI.getIterator());
2431         B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2432       }
2433     }
2434 
2435     MI.eraseFromParent();
2436     return true;
2437   }
2438 
2439   return false;
2440 }
2441 
2442 bool AMDGPULegalizerInfo::legalizeFMad(
2443   MachineInstr &MI, MachineRegisterInfo &MRI,
2444   MachineIRBuilder &B) const {
2445   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2446   assert(Ty.isScalar());
2447 
2448   MachineFunction &MF = B.getMF();
2449   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2450 
2451   // TODO: Always legal with future ftz flag.
2452   // FIXME: Do we need just output?
2453   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2454     return true;
2455   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2456     return true;
2457 
2458   MachineIRBuilder HelperBuilder(MI);
2459   GISelObserverWrapper DummyObserver;
2460   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2461   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2462 }
2463 
2464 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2465   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2466   Register DstReg = MI.getOperand(0).getReg();
2467   Register PtrReg = MI.getOperand(1).getReg();
2468   Register CmpVal = MI.getOperand(2).getReg();
2469   Register NewVal = MI.getOperand(3).getReg();
2470 
2471   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2472          "this should not have been custom lowered");
2473 
2474   LLT ValTy = MRI.getType(CmpVal);
2475   LLT VecTy = LLT::vector(2, ValTy);
2476 
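       // The target cmpxchg pseudo expects the new value and the compare value
       // packed into a single two-element vector operand (new value in element
       // 0, compare value in element 1), mirroring the hardware's data/cmp
       // source register pair.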
2477   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2478 
2479   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2480     .addDef(DstReg)
2481     .addUse(PtrReg)
2482     .addUse(PackedVal)
2483     .setMemRefs(MI.memoperands());
2484 
2485   MI.eraseFromParent();
2486   return true;
2487 }
2488 
2489 bool AMDGPULegalizerInfo::legalizeFlog(
2490   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2491   Register Dst = MI.getOperand(0).getReg();
2492   Register Src = MI.getOperand(1).getReg();
2493   LLT Ty = B.getMRI()->getType(Dst);
2494   unsigned Flags = MI.getFlags();
2495 
2496   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2497   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2498 
2499   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2500   MI.eraseFromParent();
2501   return true;
2502 }
2503 
2504 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2505                                        MachineIRBuilder &B) const {
2506   Register Dst = MI.getOperand(0).getReg();
2507   Register Src = MI.getOperand(1).getReg();
2508   unsigned Flags = MI.getFlags();
2509   LLT Ty = B.getMRI()->getType(Dst);
2510 
2511   auto K = B.buildFConstant(Ty, numbers::log2e);
2512   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2513   B.buildFExp2(Dst, Mul, Flags);
2514   MI.eraseFromParent();
2515   return true;
2516 }
2517 
2518 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2519                                        MachineIRBuilder &B) const {
2520   Register Dst = MI.getOperand(0).getReg();
2521   Register Src0 = MI.getOperand(1).getReg();
2522   Register Src1 = MI.getOperand(2).getReg();
2523   unsigned Flags = MI.getFlags();
2524   LLT Ty = B.getMRI()->getType(Dst);
2525   const LLT S16 = LLT::scalar(16);
2526   const LLT S32 = LLT::scalar(32);
2527 
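       // pow(x, y) is expanded as exp2(log2(x) * y). The multiply uses
       // fmul_legacy (0 * anything == 0), presumably so that a zero exponent
       // still yields exp2(0) == 1 even when log2(x) is infinite.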
2528   if (Ty == S32) {
2529     auto Log = B.buildFLog2(S32, Src0, Flags);
2530     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2531       .addUse(Log.getReg(0))
2532       .addUse(Src1)
2533       .setMIFlags(Flags);
2534     B.buildFExp2(Dst, Mul, Flags);
2535   } else if (Ty == S16) {
2536     // There's no f16 fmul_legacy, so we need to convert for it.
2537     auto Log = B.buildFLog2(S16, Src0, Flags);
2538     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2539     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2540     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2541       .addUse(Ext0.getReg(0))
2542       .addUse(Ext1.getReg(0))
2543       .setMIFlags(Flags);
2544 
2545     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2546   } else
2547     return false;
2548 
2549   MI.eraseFromParent();
2550   return true;
2551 }
2552 
2553 // Find a source register, ignoring any possible source modifiers.
2554 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2555   Register ModSrc = OrigSrc;
2556   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2557     ModSrc = SrcFNeg->getOperand(1).getReg();
2558     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2559       ModSrc = SrcFAbs->getOperand(1).getReg();
2560   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2561     ModSrc = SrcFAbs->getOperand(1).getReg();
2562   return ModSrc;
2563 }
2564 
2565 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2566                                          MachineRegisterInfo &MRI,
2567                                          MachineIRBuilder &B) const {
2568 
2569   const LLT S1 = LLT::scalar(1);
2570   const LLT S64 = LLT::scalar(64);
2571   Register Dst = MI.getOperand(0).getReg();
2572   Register OrigSrc = MI.getOperand(1).getReg();
2573   unsigned Flags = MI.getFlags();
2574   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2575          "this should not have been custom lowered");
2576 
2577   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2578   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2579   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2580   // V_FRACT bug is:
2581   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2582   //
2583   // Convert floor(x) to (x - fract(x))
2584 
2585   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2586     .addUse(OrigSrc)
2587     .setMIFlags(Flags);
2588 
2589   // Give source modifier matching some assistance before obscuring a foldable
2590   // pattern.
2591 
2592   // TODO: We can avoid the neg on the fract? The input sign to fract
2593   // shouldn't matter?
2594   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2595 
2596   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2597 
2598   Register Min = MRI.createGenericVirtualRegister(S64);
2599 
2600   // We don't need to concern ourselves with the snan handling difference, so
2601   // use the one which will directly select.
2602   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2603   if (MFI->getMode().IEEE)
2604     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2605   else
2606     B.buildFMinNum(Min, Fract, Const, Flags);
2607 
2608   Register CorrectedFract = Min;
2609   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2610     auto IsNan = B.buildFCmp(CmpInst::FCMP_UNO, S1, ModSrc, ModSrc, Flags);
2611     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2612   }
2613 
2614   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2615   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2616 
2617   MI.eraseFromParent();
2618   return true;
2619 }
2620 
2621 // Turn an illegal packed v2s16 build vector into bit operations.
2622 // TODO: This should probably be a bitcast action in LegalizerHelper.
2623 bool AMDGPULegalizerInfo::legalizeBuildVector(
2624   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2625   Register Dst = MI.getOperand(0).getReg();
2626   const LLT S32 = LLT::scalar(32);
2627   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2628 
2629   Register Src0 = MI.getOperand(1).getReg();
2630   Register Src1 = MI.getOperand(2).getReg();
2631   assert(MRI.getType(Src0) == LLT::scalar(16));
2632 
2633   auto Merge = B.buildMerge(S32, {Src0, Src1});
2634   B.buildBitcast(Dst, Merge);
2635 
2636   MI.eraseFromParent();
2637   return true;
2638 }
2639 
2640 // Check that this is a G_XOR x, -1
2641 static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2642   if (MI.getOpcode() != TargetOpcode::G_XOR)
2643     return false;
2644   auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2645   return ConstVal && *ConstVal == -1;
2646 }
2647 
2648 // Return the use branch instruction, otherwise null if the usage is invalid.
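     // The expected pattern is:
     //   %cond = G_INTRINSIC ...
     //   [%ncond = G_XOR %cond, -1]        ; optional negation
     //   G_BRCOND %cond|%ncond, %bb.target
     //   [G_BR %bb.other]                  ; or fall through to the next block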
2649 static MachineInstr *
2650 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2651                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
2652   Register CondDef = MI.getOperand(0).getReg();
2653   if (!MRI.hasOneNonDBGUse(CondDef))
2654     return nullptr;
2655 
2656   MachineBasicBlock *Parent = MI.getParent();
2657   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2658 
2659   if (isNot(MRI, *UseMI)) {
2660     Register NegatedCond = UseMI->getOperand(0).getReg();
2661     if (!MRI.hasOneNonDBGUse(NegatedCond))
2662       return nullptr;
2663 
2664     // We're deleting the def of this value, so we need to remove it.
2665     UseMI->eraseFromParent();
2666 
2667     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2668     Negated = true;
2669   }
2670 
2671   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2672     return nullptr;
2673 
2674   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2675   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
2676   if (Next == Parent->end()) {
2677     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2678     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2679       return nullptr;
2680     UncondBrTarget = &*NextMBB;
2681   } else {
2682     if (Next->getOpcode() != AMDGPU::G_BR)
2683       return nullptr;
2684     Br = &*Next;
2685     UncondBrTarget = Br->getOperand(0).getMBB();
2686   }
2687 
2688   return UseMI;
2689 }
2690 
2691 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2692                                          const ArgDescriptor *Arg,
2693                                          const TargetRegisterClass *ArgRC,
2694                                          LLT ArgTy) const {
2695   MCRegister SrcReg = Arg->getRegister();
2696   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
2697   assert(DstReg.isVirtual() && "Virtual register expected");
2698 
2699   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2700                                              ArgTy);
2701   if (Arg->isMasked()) {
2702     // TODO: Should we try to emit this once in the entry block?
2703     const LLT S32 = LLT::scalar(32);
2704     const unsigned Mask = Arg->getMask();
2705     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2706 
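         // e.g. for a field packed into bits [19:10], Mask == 0xffc00: shift
         // the live-in right by 10, then AND with Mask >> 10 == 0x3ff to
         // isolate the field.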
2707     Register AndMaskSrc = LiveIn;
2708 
2709     if (Shift != 0) {
2710       auto ShiftAmt = B.buildConstant(S32, Shift);
2711       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2712     }
2713 
2714     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2715   } else {
2716     B.buildCopy(DstReg, LiveIn);
2717   }
2718 
2719   return true;
2720 }
2721 
2722 bool AMDGPULegalizerInfo::loadInputValue(
2723     Register DstReg, MachineIRBuilder &B,
2724     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2725   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2726   const ArgDescriptor *Arg;
2727   const TargetRegisterClass *ArgRC;
2728   LLT ArgTy;
2729   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2730 
2731   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2732     return false; // TODO: Handle these
2733   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2734 }
2735 
2736 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2737     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2738     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2739   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
2740     return false;
2741 
2742   MI.eraseFromParent();
2743   return true;
2744 }
2745 
2746 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2747                                        MachineRegisterInfo &MRI,
2748                                        MachineIRBuilder &B) const {
2749   Register Dst = MI.getOperand(0).getReg();
2750   LLT DstTy = MRI.getType(Dst);
2751   LLT S16 = LLT::scalar(16);
2752   LLT S32 = LLT::scalar(32);
2753   LLT S64 = LLT::scalar(64);
2754 
2755   if (DstTy == S16)
2756     return legalizeFDIV16(MI, MRI, B);
2757   if (DstTy == S32)
2758     return legalizeFDIV32(MI, MRI, B);
2759   if (DstTy == S64)
2760     return legalizeFDIV64(MI, MRI, B);
2761 
2762   return false;
2763 }
2764 
2765 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2766                                                   Register DstReg,
2767                                                   Register X,
2768                                                   Register Y,
2769                                                   bool IsDiv) const {
2770   const LLT S1 = LLT::scalar(1);
2771   const LLT S32 = LLT::scalar(32);
2772 
2773   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2774   // algorithm used here.
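       // In short: form z ~= 2^32 / y from a floating-point reciprocal, refine
       // it with one Newton-Raphson step, take q = umulh(x, z) as a quotient
       // underestimate, then correct the quotient and remainder by comparing
       // the running remainder against y (at most twice).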
2775 
2776   // Initial estimate of inv(y).
2777   auto FloatY = B.buildUITOFP(S32, Y);
2778   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2779   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2780   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2781   auto Z = B.buildFPTOUI(S32, ScaledY);
2782 
2783   // One round of UNR.
2784   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2785   auto NegYZ = B.buildMul(S32, NegY, Z);
2786   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2787 
2788   // Quotient/remainder estimate.
2789   auto Q = B.buildUMulH(S32, X, Z);
2790   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2791 
2792   // First quotient/remainder refinement.
2793   auto One = B.buildConstant(S32, 1);
2794   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2795   if (IsDiv)
2796     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2797   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2798 
2799   // Second quotient/remainder refinement.
2800   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2801   if (IsDiv)
2802     B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
2803   else
2804     B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
2805 }
2806 
2807 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2808                                               MachineRegisterInfo &MRI,
2809                                               MachineIRBuilder &B) const {
2810   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2811   Register DstReg = MI.getOperand(0).getReg();
2812   Register Num = MI.getOperand(1).getReg();
2813   Register Den = MI.getOperand(2).getReg();
2814   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2815   MI.eraseFromParent();
2816   return true;
2817 }
2818 
2819 // Build integer reciprocal sequence around V_RCP_IFLAG_F32
2820 //
2821 // Return lo, hi of result
2822 //
2823 // %cvt.lo = G_UITOFP Val.lo
2824 // %cvt.hi = G_UITOFP Val.hi
2825 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2826 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2827 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2828 // %mul2 = G_FMUL %mul1, 2**(-32)
2829 // %trunc = G_INTRINSIC_TRUNC %mul2
2830 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2831 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
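     //
     // The f32 constants decode as 0x4f800000 == 2^32, 0x5f7ffffc just below
     // 2^64, 0x2f800000 == 2^-32 and 0xcf800000 == -(2^32); the slightly low
     // scale appears intended to keep the reciprocal estimate an underestimate.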
2832 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2833                                                        Register Val) {
2834   const LLT S32 = LLT::scalar(32);
2835   auto Unmerge = B.buildUnmerge(S32, Val);
2836 
2837   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2838   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2839 
2840   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2841                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2842 
2843   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2844   auto Mul1 =
2845       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2846 
2847   // 2**(-32)
2848   auto Mul2 =
2849       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2850   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2851 
2852   // -(2**32)
2853   auto Mad2 = B.buildFMAD(S32, Trunc,
2854                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2855 
2856   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2857   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2858 
2859   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2860 }
2861 
2862 void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
2863                                                   Register DstReg,
2864                                                   Register Numer,
2865                                                   Register Denom,
2866                                                   bool IsDiv) const {
2867   const LLT S32 = LLT::scalar(32);
2868   const LLT S64 = LLT::scalar(64);
2869   const LLT S1 = LLT::scalar(1);
2870   Register RcpLo, RcpHi;
2871 
2872   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2873 
2874   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2875 
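  // What follows is two Newton-Raphson style refinements of the 64-bit
  // fixed-point reciprocal estimate (rcp += umulhi(rcp, rcp * -denom)), done
  // on 32-bit halves with explicit carries, then a quotient/remainder
  // estimate and up to two conditional corrections.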
2876   auto Zero64 = B.buildConstant(S64, 0);
2877   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2878 
2879   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2880   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2881 
2882   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2883   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2884   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2885 
2886   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2887   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2888   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2889   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2890 
2891   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2892   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2893   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2894   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2895   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2896 
2897   auto Zero32 = B.buildConstant(S32, 0);
2898   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2899   auto Add2_HiC =
2900       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2901   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2902   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2903 
2904   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2905   Register NumerLo = UnmergeNumer.getReg(0);
2906   Register NumerHi = UnmergeNumer.getReg(1);
2907 
2908   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2909   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2910   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2911   Register Mul3_Lo = UnmergeMul3.getReg(0);
2912   Register Mul3_Hi = UnmergeMul3.getReg(1);
2913   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2914   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2915   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2916   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2917 
2918   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2919   Register DenomLo = UnmergeDenom.getReg(0);
2920   Register DenomHi = UnmergeDenom.getReg(1);
2921 
2922   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2923   auto C1 = B.buildSExt(S32, CmpHi);
2924 
2925   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2926   auto C2 = B.buildSExt(S32, CmpLo);
2927 
2928   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2929   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2930 
2931   // TODO: Here and below, portions of the code could be enclosed in if/endif
2932   // blocks. Currently the control flow is unconditional, and we have 4 selects
2933   // after the potential endif to substitute for PHIs.
2934 
2935   // if C3 != 0 ...
2936   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2937   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2938   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2939   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2940 
2941   auto One64 = B.buildConstant(S64, 1);
2942   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2943 
2944   auto C4 =
2945       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2946   auto C5 =
2947       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2948   auto C6 = B.buildSelect(
2949       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2950 
2951   // if (C6 != 0)
2952   auto Add4 = B.buildAdd(S64, Add3, One64);
2953   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2954 
2955   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2956   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2957   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2958 
2959   // endif C6
2960   // endif C3
2961 
2962   if (IsDiv) {
2963     auto Sel1 = B.buildSelect(
2964         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2965     B.buildSelect(DstReg,
2966                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2967   } else {
2968     auto Sel2 = B.buildSelect(
2969         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2970     B.buildSelect(DstReg,
2971                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2972   }
2973 }
2974 
2975 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2976                                             MachineRegisterInfo &MRI,
2977                                             MachineIRBuilder &B) const {
2978   const LLT S64 = LLT::scalar(64);
2979   const LLT S32 = LLT::scalar(32);
2980   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2981   Register DstReg = MI.getOperand(0).getReg();
2982   Register Num = MI.getOperand(1).getReg();
2983   Register Den = MI.getOperand(2).getReg();
2984   LLT Ty = MRI.getType(DstReg);
2985 
2986   if (Ty == S32)
2987     legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2988   else if (Ty == S64)
2989     legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
2990   else
2991     return false;
2992 
2993   MI.eraseFromParent();
2994   return true;
2995 
2996 }
2997 
2998 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2999                                             MachineRegisterInfo &MRI,
3000                                             MachineIRBuilder &B) const {
3001   const LLT S64 = LLT::scalar(64);
3002   const LLT S32 = LLT::scalar(32);
3003 
3004   Register DstReg = MI.getOperand(0).getReg();
3005   const LLT Ty = MRI.getType(DstReg);
3006   if (Ty != S32 && Ty != S64)
3007     return false;
3008 
3009   const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
3010 
3011   Register LHS = MI.getOperand(1).getReg();
3012   Register RHS = MI.getOperand(2).getReg();
3013 
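  // Reduce to the unsigned case: with sign(v) = v >> (bits - 1) (all ones for
  // negative v), |v| = (v + sign(v)) ^ sign(v). The unsigned result is then
  // corrected the same way below: the quotient takes sign(LHS) ^ sign(RHS),
  // and the remainder takes the sign of LHS.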
3014   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
3015   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
3016   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
3017 
3018   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
3019   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
3020 
3021   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
3022   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
3023 
3024   Register UDivRem = MRI.createGenericVirtualRegister(Ty);
3025   if (Ty == S32)
3026     legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
3027   else
3028     legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
3029 
3030   Register Sign;
3031   if (IsDiv)
3032     Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3033   else
3034     Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3035 
3036   UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
3037   B.buildSub(DstReg, UDivRem, Sign);
3038 
3039   MI.eraseFromParent();
3040   return true;
3041 }
3042 
3043 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
3044                                                  MachineRegisterInfo &MRI,
3045                                                  MachineIRBuilder &B) const {
3046   Register Res = MI.getOperand(0).getReg();
3047   Register LHS = MI.getOperand(1).getReg();
3048   Register RHS = MI.getOperand(2).getReg();
3049   uint16_t Flags = MI.getFlags();
3050   LLT ResTy = MRI.getType(Res);
3051 
3052   const MachineFunction &MF = B.getMF();
3053   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3054                             MI.getFlag(MachineInstr::FmAfn);
3055 
3056   if (!AllowInaccurateRcp)
3057     return false;
3058 
3059   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
3060     // 1 / x -> RCP(x)
3061     if (CLHS->isExactlyValue(1.0)) {
3062       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3063         .addUse(RHS)
3064         .setMIFlags(Flags);
3065 
3066       MI.eraseFromParent();
3067       return true;
3068     }
3069 
3070     // -1 / x -> RCP( FNEG(x) )
3071     if (CLHS->isExactlyValue(-1.0)) {
3072       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
3073       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
3074         .addUse(FNeg.getReg(0))
3075         .setMIFlags(Flags);
3076 
3077       MI.eraseFromParent();
3078       return true;
3079     }
3080   }
3081 
3082   // x / y -> x * (1.0 / y)
3083   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3084     .addUse(RHS)
3085     .setMIFlags(Flags);
3086   B.buildFMul(Res, LHS, RCP, Flags);
3087 
3088   MI.eraseFromParent();
3089   return true;
3090 }
3091 
3092 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3093                                                    MachineRegisterInfo &MRI,
3094                                                    MachineIRBuilder &B) const {
3095   Register Res = MI.getOperand(0).getReg();
3096   Register X = MI.getOperand(1).getReg();
3097   Register Y = MI.getOperand(2).getReg();
3098   uint16_t Flags = MI.getFlags();
3099   LLT ResTy = MRI.getType(Res);
3100 
3101   const MachineFunction &MF = B.getMF();
3102   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3103                             MI.getFlag(MachineInstr::FmAfn);
3104 
3105   if (!AllowInaccurateRcp)
3106     return false;
3107 
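  // x/y ~= x * rcp(y): refine the reciprocal estimate with two Newton-Raphson
  // iterations (r = r + r * (1 - y*r)), then apply one residual correction to
  // the product (ret = ret + r * (x - y*ret)).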
3108   auto NegY = B.buildFNeg(ResTy, Y);
3109   auto One = B.buildFConstant(ResTy, 1.0);
3110 
3111   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3112     .addUse(Y)
3113     .setMIFlags(Flags);
3114 
3115   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3116   R = B.buildFMA(ResTy, Tmp0, R, R);
3117 
3118   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3119   R = B.buildFMA(ResTy, Tmp1, R, R);
3120 
3121   auto Ret = B.buildFMul(ResTy, X, R);
3122   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3123 
3124   B.buildFMA(Res, Tmp2, R, Ret);
3125   MI.eraseFromParent();
3126   return true;
3127 }
3128 
3129 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3130                                          MachineRegisterInfo &MRI,
3131                                          MachineIRBuilder &B) const {
3132   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3133     return true;
3134 
3135   Register Res = MI.getOperand(0).getReg();
3136   Register LHS = MI.getOperand(1).getReg();
3137   Register RHS = MI.getOperand(2).getReg();
3138 
3139   uint16_t Flags = MI.getFlags();
3140 
3141   LLT S16 = LLT::scalar(16);
3142   LLT S32 = LLT::scalar(32);
3143 
3144   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3145   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3146 
3147   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3148     .addUse(RHSExt.getReg(0))
3149     .setMIFlags(Flags);
3150 
3151   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3152   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3153 
3154   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3155     .addUse(RDst.getReg(0))
3156     .addUse(RHS)
3157     .addUse(LHS)
3158     .setMIFlags(Flags);
3159 
3160   MI.eraseFromParent();
3161   return true;
3162 }
3163 
3164 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3165 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
3166 static void toggleSPDenormMode(bool Enable,
3167                                MachineIRBuilder &B,
3168                                const GCNSubtarget &ST,
3169                                AMDGPU::SIModeRegisterDefaults Mode) {
3170   // Set SP denorm mode to this value.
3171   unsigned SPDenormMode =
3172     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3173 
3174   if (ST.hasDenormModeInst()) {
3175     // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
3176     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3177 
3178     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3179     B.buildInstr(AMDGPU::S_DENORM_MODE)
3180       .addImm(NewDenormModeValue);
3181 
3182   } else {
3183     // Select FP32 bit field in mode register.
3184     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3185                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3186                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
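    // This encodes hwreg(HW_REG_MODE, 4, 2): a two-bit field at offset 4 in
    // the MODE register, which holds the FP32 denorm controls.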
3187 
3188     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3189       .addImm(SPDenormMode)
3190       .addImm(SPDenormModeBitField);
3191   }
3192 }
3193 
3194 bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3195                                          MachineRegisterInfo &MRI,
3196                                          MachineIRBuilder &B) const {
3197   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3198     return true;
3199 
3200   Register Res = MI.getOperand(0).getReg();
3201   Register LHS = MI.getOperand(1).getReg();
3202   Register RHS = MI.getOperand(2).getReg();
3203   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3204   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3205 
3206   uint16_t Flags = MI.getFlags();
3207 
3208   LLT S32 = LLT::scalar(32);
3209   LLT S1 = LLT::scalar(1);
3210 
3211   auto One = B.buildFConstant(S32, 1.0f);
3212 
3213   auto DenominatorScaled =
3214     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3215       .addUse(LHS)
3216       .addUse(RHS)
3217       .addImm(0)
3218       .setMIFlags(Flags);
3219   auto NumeratorScaled =
3220     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3221       .addUse(LHS)
3222       .addUse(RHS)
3223       .addImm(1)
3224       .setMIFlags(Flags);
3225 
3226   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3227     .addUse(DenominatorScaled.getReg(0))
3228     .setMIFlags(Flags);
3229   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3230 
3231   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3232   // aren't modeled as reading it.
3233   if (!Mode.allFP32Denormals())
3234     toggleSPDenormMode(true, B, ST, Mode);
3235 
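  // Newton-Raphson refinement of the scaled reciprocal and quotient:
  //   Fma0 = 1 - d*r, Fma1 = r + r*Fma0 (refined rcp), Mul = n*Fma1,
  //   Fma2 = n - d*Mul, Fma3 = Mul + Fma1*Fma2, Fma4 = n - d*Fma3.
  // div_fmas then applies the final correction and div_fixup undoes the
  // scaling introduced by div_scale.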
3236   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3237   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3238   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3239   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3240   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3241   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3242 
3243   if (!Mode.allFP32Denormals())
3244     toggleSPDenormMode(false, B, ST, Mode);
3245 
3246   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3247     .addUse(Fma4.getReg(0))
3248     .addUse(Fma1.getReg(0))
3249     .addUse(Fma3.getReg(0))
3250     .addUse(NumeratorScaled.getReg(1))
3251     .setMIFlags(Flags);
3252 
3253   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3254     .addUse(Fmas.getReg(0))
3255     .addUse(RHS)
3256     .addUse(LHS)
3257     .setMIFlags(Flags);
3258 
3259   MI.eraseFromParent();
3260   return true;
3261 }
3262 
3263 bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3264                                          MachineRegisterInfo &MRI,
3265                                          MachineIRBuilder &B) const {
3266   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3267     return true;
3268 
3269   Register Res = MI.getOperand(0).getReg();
3270   Register LHS = MI.getOperand(1).getReg();
3271   Register RHS = MI.getOperand(2).getReg();
3272 
3273   uint16_t Flags = MI.getFlags();
3274 
3275   LLT S64 = LLT::scalar(64);
3276   LLT S1 = LLT::scalar(1);
3277 
3278   auto One = B.buildFConstant(S64, 1.0);
3279 
3280   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3281     .addUse(LHS)
3282     .addUse(RHS)
3283     .addImm(0)
3284     .setMIFlags(Flags);
3285 
3286   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3287 
3288   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3289     .addUse(DivScale0.getReg(0))
3290     .setMIFlags(Flags);
3291 
3292   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3293   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3294   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3295 
3296   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3297     .addUse(LHS)
3298     .addUse(RHS)
3299     .addImm(1)
3300     .setMIFlags(Flags);
3301 
3302   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3303   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3304   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3305 
3306   Register Scale;
3307   if (!ST.hasUsableDivScaleConditionOutput()) {
3308     // Work around a hardware bug on SI where the condition output from div_scale
3309     // is not usable.
3310 
3311     LLT S32 = LLT::scalar(32);
3312 
3313     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3314     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3315     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3316     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3317 
3318     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3319                               Scale1Unmerge.getReg(1));
3320     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3321                               Scale0Unmerge.getReg(1));
3322     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3323   } else {
3324     Scale = DivScale1.getReg(1);
3325   }
3326 
3327   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3328     .addUse(Fma4.getReg(0))
3329     .addUse(Fma3.getReg(0))
3330     .addUse(Mul.getReg(0))
3331     .addUse(Scale)
3332     .setMIFlags(Flags);
3333 
3334   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3335     .addUse(Fmas.getReg(0))
3336     .addUse(RHS)
3337     .addUse(LHS)
3338     .setMIFlags(Flags);
3339 
3340   MI.eraseFromParent();
3341   return true;
3342 }
3343 
3344 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3345                                                  MachineRegisterInfo &MRI,
3346                                                  MachineIRBuilder &B) const {
3347   Register Res = MI.getOperand(0).getReg();
3348   Register LHS = MI.getOperand(2).getReg();
3349   Register RHS = MI.getOperand(3).getReg();
3350   uint16_t Flags = MI.getFlags();
3351 
3352   LLT S32 = LLT::scalar(32);
3353   LLT S1 = LLT::scalar(1);
3354 
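  // Pre-scale very large denominators so the reciprocal stays in range:
  //   s   = |rhs| > 2**96 ? 2**(-32) : 1.0   (0x6f800000 / 0x2f800000 below)
  //   res = s * (lhs * rcp(rhs * s))
  // The scale factor cancels mathematically, so the result is still lhs/rhs.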
3355   auto Abs = B.buildFAbs(S32, RHS, Flags);
3356   const APFloat C0Val(1.0f);
3357 
3358   auto C0 = B.buildConstant(S32, 0x6f800000);
3359   auto C1 = B.buildConstant(S32, 0x2f800000);
3360   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3361 
3362   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3363   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3364 
3365   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3366 
3367   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3368     .addUse(Mul0.getReg(0))
3369     .setMIFlags(Flags);
3370 
3371   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3372 
3373   B.buildFMul(Res, Sel, Mul1, Flags);
3374 
3375   MI.eraseFromParent();
3376   return true;
3377 }
3378 
3379 // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3380 // FIXME: Why do we handle this one but not other removed instructions?
3381 //
3382 // Reciprocal square root.  The clamp prevents infinite results, clamping
3383 // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3384 // +-max_float.
3385 bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3386                                                     MachineRegisterInfo &MRI,
3387                                                     MachineIRBuilder &B) const {
3388   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3389     return true;
3390 
3391   Register Dst = MI.getOperand(0).getReg();
3392   Register Src = MI.getOperand(2).getReg();
3393   auto Flags = MI.getFlags();
3394 
3395   LLT Ty = MRI.getType(Dst);
3396 
3397   const fltSemantics *FltSemantics;
3398   if (Ty == LLT::scalar(32))
3399     FltSemantics = &APFloat::IEEEsingle();
3400   else if (Ty == LLT::scalar(64))
3401     FltSemantics = &APFloat::IEEEdouble();
3402   else
3403     return false;
3404 
3405   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3406     .addUse(Src)
3407     .setMIFlags(Flags);
3408 
3409   // We don't need to concern ourselves with the snan handling difference, since
3410   // the rsq already quieted it (or not); use the variant that will directly select.
3411   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3412   const bool UseIEEE = MFI->getMode().IEEE;
3413 
3414   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3415   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3416                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3417 
3418   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3419 
3420   if (UseIEEE)
3421     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3422   else
3423     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3424   MI.eraseFromParent();
3425   return true;
3426 }
3427 
3428 static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3429   switch (IID) {
3430   case Intrinsic::amdgcn_ds_fadd:
3431     return AMDGPU::G_ATOMICRMW_FADD;
3432   case Intrinsic::amdgcn_ds_fmin:
3433     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3434   case Intrinsic::amdgcn_ds_fmax:
3435     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3436   default:
3437     llvm_unreachable("not a DS FP intrinsic");
3438   }
3439 }
3440 
3441 bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3442                                                       MachineInstr &MI,
3443                                                       Intrinsic::ID IID) const {
3444   GISelChangeObserver &Observer = Helper.Observer;
3445   Observer.changingInstr(MI);
3446 
3447   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3448 
3449   // The remaining operands were used to set fields in the MemOperand on
3450   // construction.
3451   for (int I = 6; I > 3; --I)
3452     MI.RemoveOperand(I);
3453 
3454   MI.RemoveOperand(1); // Remove the intrinsic ID.
3455   Observer.changedInstr(MI);
3456   return true;
3457 }
3458 
3459 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3460                                             MachineRegisterInfo &MRI,
3461                                             MachineIRBuilder &B) const {
3462   uint64_t Offset =
3463     ST.getTargetLowering()->getImplicitParameterOffset(
3464       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3465   LLT DstTy = MRI.getType(DstReg);
3466   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3467 
3468   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3469   if (!loadInputValue(KernargPtrReg, B,
3470                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3471     return false;
3472 
3473   // FIXME: This should be nuw
3474   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3475   return true;
3476 }
3477 
3478 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3479                                                  MachineRegisterInfo &MRI,
3480                                                  MachineIRBuilder &B) const {
3481   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3482   if (!MFI->isEntryFunction()) {
3483     return legalizePreloadedArgIntrin(MI, MRI, B,
3484                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3485   }
3486 
3487   Register DstReg = MI.getOperand(0).getReg();
3488   if (!getImplicitArgPtr(DstReg, MRI, B))
3489     return false;
3490 
3491   MI.eraseFromParent();
3492   return true;
3493 }
3494 
3495 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3496                                               MachineRegisterInfo &MRI,
3497                                               MachineIRBuilder &B,
3498                                               unsigned AddrSpace) const {
3499   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3500   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3501   Register Hi32 = Unmerge.getReg(1);
3502 
3503   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3504   MI.eraseFromParent();
3505   return true;
3506 }
3507 
3508 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3509 // offset (the offset that is included in bounds checking and swizzling, to be
3510 // split between the instruction's voffset and immoffset fields) and soffset
3511 // (the offset that is excluded from bounds checking and swizzling, to go in
3512 // the instruction's soffset field).  This function takes the first kind of
3513 // offset and figures out how to split it between voffset and immoffset.
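//
// For example, with the 4095 immediate limit used below, a constant offset of
// 8200 is split into an 8192 contribution to voffset and an immoffset of 8,
// while an offset of 4095 fits entirely in immoffset.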
3514 std::tuple<Register, unsigned, unsigned>
3515 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3516                                         Register OrigOffset) const {
3517   const unsigned MaxImm = 4095;
3518   Register BaseReg;
3519   unsigned TotalConstOffset;
3520   const LLT S32 = LLT::scalar(32);
3521 
3522   std::tie(BaseReg, TotalConstOffset) =
3523       AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
3524 
3525   unsigned ImmOffset = TotalConstOffset;
3526 
3527   // If the immediate value is too big for the immoffset field, put the value
3528   // and -4096 into the immoffset field so that the value that is copied/added
3529   // for the voffset field is a multiple of 4096, and it stands more chance
3530   // of being CSEd with the copy/add for another similar load/store.
3531   // However, do not do that rounding down to a multiple of 4096 if that is a
3532   // negative number, as it appears to be illegal to have a negative offset
3533   // in the vgpr, even if adding the immediate offset makes it positive.
3534   unsigned Overflow = ImmOffset & ~MaxImm;
3535   ImmOffset -= Overflow;
3536   if ((int32_t)Overflow < 0) {
3537     Overflow += ImmOffset;
3538     ImmOffset = 0;
3539   }
3540 
3541   if (Overflow != 0) {
3542     if (!BaseReg) {
3543       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3544     } else {
3545       auto OverflowVal = B.buildConstant(S32, Overflow);
3546       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3547     }
3548   }
3549 
3550   if (!BaseReg)
3551     BaseReg = B.buildConstant(S32, 0).getReg(0);
3552 
3553   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3554 }
3555 
3556 /// Handle register layout difference for f16 images for some subtargets.
3557 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3558                                              MachineRegisterInfo &MRI,
3559                                              Register Reg,
3560                                              bool ImageStore) const {
3561   const LLT S16 = LLT::scalar(16);
3562   const LLT S32 = LLT::scalar(32);
3563   LLT StoreVT = MRI.getType(Reg);
3564   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3565 
3566   if (ST.hasUnpackedD16VMem()) {
3567     auto Unmerge = B.buildUnmerge(S16, Reg);
3568 
3569     SmallVector<Register, 4> WideRegs;
3570     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3571       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3572 
3573     int NumElts = StoreVT.getNumElements();
3574 
3575     return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3576   }
3577 
3578   if (ImageStore && ST.hasImageStoreD16Bug()) {
3579     if (StoreVT.getNumElements() == 2) {
3580       SmallVector<Register, 4> PackedRegs;
3581       Reg = B.buildBitcast(S32, Reg).getReg(0);
3582       PackedRegs.push_back(Reg);
3583       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3584       return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0);
3585     }
3586 
3587     if (StoreVT.getNumElements() == 3) {
3588       SmallVector<Register, 4> PackedRegs;
3589       auto Unmerge = B.buildUnmerge(S16, Reg);
3590       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3591         PackedRegs.push_back(Unmerge.getReg(I));
3592       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3593       Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0);
3594       return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0);
3595     }
3596 
3597     if (StoreVT.getNumElements() == 4) {
3598       SmallVector<Register, 4> PackedRegs;
3599       Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0);
3600       auto Unmerge = B.buildUnmerge(S32, Reg);
3601       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3602         PackedRegs.push_back(Unmerge.getReg(I));
3603       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3604       return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0);
3605     }
3606 
3607     llvm_unreachable("invalid data type");
3608   }
3609 
3610   return Reg;
3611 }
3612 
3613 Register AMDGPULegalizerInfo::fixStoreSourceType(
3614   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3615   MachineRegisterInfo *MRI = B.getMRI();
3616   LLT Ty = MRI->getType(VData);
3617 
3618   const LLT S16 = LLT::scalar(16);
3619 
3620   // Fixup illegal register types for i8 stores.
3621   if (Ty == LLT::scalar(8) || Ty == S16) {
3622     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3623     return AnyExt;
3624   }
3625 
3626   if (Ty.isVector()) {
3627     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3628       if (IsFormat)
3629         return handleD16VData(B, *MRI, VData);
3630     }
3631   }
3632 
3633   return VData;
3634 }
3635 
3636 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3637                                               MachineRegisterInfo &MRI,
3638                                               MachineIRBuilder &B,
3639                                               bool IsTyped,
3640                                               bool IsFormat) const {
3641   Register VData = MI.getOperand(1).getReg();
3642   LLT Ty = MRI.getType(VData);
3643   LLT EltTy = Ty.getScalarType();
3644   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3645   const LLT S32 = LLT::scalar(32);
3646 
3647   VData = fixStoreSourceType(B, VData, IsFormat);
3648   Register RSrc = MI.getOperand(2).getReg();
3649 
3650   MachineMemOperand *MMO = *MI.memoperands_begin();
3651   const int MemSize = MMO->getSize();
3652 
3653   unsigned ImmOffset;
3654   unsigned TotalOffset;
3655 
3656   // The typed intrinsics add an immediate after the registers.
3657   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3658 
3659   // The struct intrinsic variants add one additional operand over raw.
3660   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3661   Register VIndex;
3662   int OpOffset = 0;
3663   if (HasVIndex) {
3664     VIndex = MI.getOperand(3).getReg();
3665     OpOffset = 1;
3666   }
3667 
3668   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3669   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3670 
3671   unsigned Format = 0;
3672   if (IsTyped) {
3673     Format = MI.getOperand(5 + OpOffset).getImm();
3674     ++OpOffset;
3675   }
3676 
3677   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3678 
3679   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3680   if (TotalOffset != 0)
3681     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3682 
3683   unsigned Opc;
3684   if (IsTyped) {
3685     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3686                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3687   } else if (IsFormat) {
3688     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3689                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3690   } else {
3691     switch (MemSize) {
3692     case 1:
3693       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3694       break;
3695     case 2:
3696       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3697       break;
3698     default:
3699       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3700       break;
3701     }
3702   }
3703 
3704   if (!VIndex)
3705     VIndex = B.buildConstant(S32, 0).getReg(0);
3706 
3707   auto MIB = B.buildInstr(Opc)
3708     .addUse(VData)              // vdata
3709     .addUse(RSrc)               // rsrc
3710     .addUse(VIndex)             // vindex
3711     .addUse(VOffset)            // voffset
3712     .addUse(SOffset)            // soffset
3713     .addImm(ImmOffset);         // offset(imm)
3714 
3715   if (IsTyped)
3716     MIB.addImm(Format);
3717 
3718   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3719      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3720      .addMemOperand(MMO);
3721 
3722   MI.eraseFromParent();
3723   return true;
3724 }
3725 
3726 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3727                                              MachineRegisterInfo &MRI,
3728                                              MachineIRBuilder &B,
3729                                              bool IsFormat,
3730                                              bool IsTyped) const {
3731   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3732   MachineMemOperand *MMO = *MI.memoperands_begin();
3733   const int MemSize = MMO->getSize();
3734   const LLT S32 = LLT::scalar(32);
3735 
3736   Register Dst = MI.getOperand(0).getReg();
3737   Register RSrc = MI.getOperand(2).getReg();
3738 
3739   // The typed intrinsics add an immediate after the registers.
3740   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3741 
3742   // The struct intrinsic variants add one additional operand over raw.
3743   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3744   Register VIndex;
3745   int OpOffset = 0;
3746   if (HasVIndex) {
3747     VIndex = MI.getOperand(3).getReg();
3748     OpOffset = 1;
3749   }
3750 
3751   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3752   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3753 
3754   unsigned Format = 0;
3755   if (IsTyped) {
3756     Format = MI.getOperand(5 + OpOffset).getImm();
3757     ++OpOffset;
3758   }
3759 
3760   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3761   unsigned ImmOffset;
3762   unsigned TotalOffset;
3763 
3764   LLT Ty = MRI.getType(Dst);
3765   LLT EltTy = Ty.getScalarType();
3766   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3767   const bool Unpacked = ST.hasUnpackedD16VMem();
3768 
3769   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3770   if (TotalOffset != 0)
3771     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3772 
3773   unsigned Opc;
3774 
3775   if (IsTyped) {
3776     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3777                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3778   } else if (IsFormat) {
3779     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3780                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3781   } else {
3782     switch (MemSize) {
3783     case 1:
3784       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3785       break;
3786     case 2:
3787       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3788       break;
3789     default:
3790       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3791       break;
3792     }
3793   }
3794 
3795   Register LoadDstReg;
3796 
3797   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3798   LLT UnpackedTy = Ty.changeElementSize(32);
3799 
3800   if (IsExtLoad)
3801     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3802   else if (Unpacked && IsD16 && Ty.isVector())
3803     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3804   else
3805     LoadDstReg = Dst;
3806 
3807   if (!VIndex)
3808     VIndex = B.buildConstant(S32, 0).getReg(0);
3809 
3810   auto MIB = B.buildInstr(Opc)
3811     .addDef(LoadDstReg)         // vdata
3812     .addUse(RSrc)               // rsrc
3813     .addUse(VIndex)             // vindex
3814     .addUse(VOffset)            // voffset
3815     .addUse(SOffset)            // soffset
3816     .addImm(ImmOffset);         // offset(imm)
3817 
3818   if (IsTyped)
3819     MIB.addImm(Format);
3820 
3821   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3822      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3823      .addMemOperand(MMO);
3824 
3825   if (LoadDstReg != Dst) {
3826     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3827 
3828     // The result for an extending load was widened; truncate it back down.
3829     if (IsExtLoad)
3830       B.buildTrunc(Dst, LoadDstReg);
3831     else {
3832       // Repack to original 16-bit vector result
3833       // FIXME: G_TRUNC should work, but legalization currently fails
3834       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3835       SmallVector<Register, 4> Repack;
3836       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3837         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3838       B.buildMerge(Dst, Repack);
3839     }
3840   }
3841 
3842   MI.eraseFromParent();
3843   return true;
3844 }
3845 
3846 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3847                                                MachineIRBuilder &B,
3848                                                bool IsInc) const {
3849   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3850                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3851   B.buildInstr(Opc)
3852     .addDef(MI.getOperand(0).getReg())
3853     .addUse(MI.getOperand(2).getReg())
3854     .addUse(MI.getOperand(3).getReg())
3855     .cloneMemRefs(MI);
3856   MI.eraseFromParent();
3857   return true;
3858 }
3859 
3860 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3861   switch (IntrID) {
3862   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3863   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3864     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3865   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3866   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3867     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3868   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3869   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3870     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3871   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3872   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3873     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3874   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3875   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3876     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3877   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3878   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3879     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3880   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3881   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3882     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3883   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3884   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3885     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3886   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3887   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3888     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3889   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3890   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3891     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3892   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3893   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3894     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3895   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3896   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3897     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3898   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3899   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3900     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3901   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
3902   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
3903     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
3904   default:
3905     llvm_unreachable("unhandled atomic opcode");
3906   }
3907 }
3908 
3909 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3910                                                MachineIRBuilder &B,
3911                                                Intrinsic::ID IID) const {
3912   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3913                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3914   const bool HasReturn = MI.getNumExplicitDefs() != 0;
3915 
3916   Register Dst;
3917 
3918   int OpOffset = 0;
3919   if (HasReturn) {
3920     // A few FP atomics do not support return values.
3921     Dst = MI.getOperand(0).getReg();
3922   } else {
3923     OpOffset = -1;
3924   }
3925 
3926   Register VData = MI.getOperand(2 + OpOffset).getReg();
3927   Register CmpVal;
3928 
3929   if (IsCmpSwap) {
3930     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3931     ++OpOffset;
3932   }
3933 
3934   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3935   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
3936 
3937   // The struct intrinsic variants add one additional operand over raw.
3938   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3939   Register VIndex;
3940   if (HasVIndex) {
3941     VIndex = MI.getOperand(4 + OpOffset).getReg();
3942     ++OpOffset;
3943   }
3944 
3945   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3946   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3947   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3948 
3949   MachineMemOperand *MMO = *MI.memoperands_begin();
3950 
3951   unsigned ImmOffset;
3952   unsigned TotalOffset;
3953   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3954   if (TotalOffset != 0)
3955     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3956 
3957   if (!VIndex)
3958     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3959 
3960   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
3961 
3962   if (HasReturn)
3963     MIB.addDef(Dst);
3964 
3965   MIB.addUse(VData); // vdata
3966 
3967   if (IsCmpSwap)
3968     MIB.addReg(CmpVal);
3969 
3970   MIB.addUse(RSrc)               // rsrc
3971      .addUse(VIndex)             // vindex
3972      .addUse(VOffset)            // voffset
3973      .addUse(SOffset)            // soffset
3974      .addImm(ImmOffset)          // offset(imm)
3975      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3976      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3977      .addMemOperand(MMO);
3978 
3979   MI.eraseFromParent();
3980   return true;
3981 }
3982 
3983 /// Pack the s16 typed address operands of \p MI into dword sized (<2 x s16>)
3984 /// registers, appending the packed registers to \p PackedAddrs.
3985 static void packImageA16AddressToDwords(
3986     MachineIRBuilder &B, MachineInstr &MI,
3987     SmallVectorImpl<Register> &PackedAddrs, unsigned ArgOffset,
3988     const AMDGPU::ImageDimIntrinsicInfo *Intr, unsigned EndIdx) {
3989   const LLT S16 = LLT::scalar(16);
3990   const LLT V2S16 = LLT::vector(2, 16);
3991 
3992   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
3993     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
3994     if (!SrcOp.isReg())
3995       continue; // _L to _LZ may have eliminated this.
3996 
3997     Register AddrReg = SrcOp.getReg();
3998 
3999     if (I < Intr->GradientStart) {
4000       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
4001       PackedAddrs.push_back(AddrReg);
4002     } else {
4003       // dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
4004       // the derivatives dx/dh and dx/dv are packed with undef.
4005       if (((I + 1) >= EndIdx) ||
4006           ((Intr->NumGradients / 2) % 2 == 1 &&
4007            (I == static_cast<unsigned>(Intr->GradientStart +
4008                                        (Intr->NumGradients / 2) - 1) ||
4009             I == static_cast<unsigned>(Intr->GradientStart +
4010                                        Intr->NumGradients - 1))) ||
4011           // Check for _L to _LZ optimization
4012           !MI.getOperand(ArgOffset + I + 1).isReg()) {
4013         PackedAddrs.push_back(
4014             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
4015                 .getReg(0));
4016       } else {
4017         PackedAddrs.push_back(
4018             B.buildBuildVector(
4019                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
4020                 .getReg(0));
4021         ++I;
4022       }
4023     }
4024   }
4025 }
4026 
4027 /// Convert from separate vaddr components to a single vector address register,
4028 /// and replace the remaining operands with $noreg.
4029 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
4030                                      int DimIdx, int NumVAddrs) {
4031   const LLT S32 = LLT::scalar(32);
4032 
4033   SmallVector<Register, 8> AddrRegs;
4034   for (int I = 0; I != NumVAddrs; ++I) {
4035     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4036     if (SrcOp.isReg()) {
4037       AddrRegs.push_back(SrcOp.getReg());
4038       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
4039     }
4040   }
4041 
4042   int NumAddrRegs = AddrRegs.size();
4043   if (NumAddrRegs != 1) {
4044     // Round up to 8 elements for v5-v7
4045     // FIXME: Missing intermediate sized register classes and instructions.
4046     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
4047       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
4048       auto Undef = B.buildUndef(S32);
4049       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
4050       NumAddrRegs = RoundedNumRegs;
4051     }
4052 
4053     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
4054     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
4055   }
4056 
4057   for (int I = 1; I != NumVAddrs; ++I) {
4058     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
4059     if (SrcOp.isReg())
4060       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
4061   }
4062 }
4063 
4064 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
4065 ///
4066 /// Depending on the subtarget, load/store with 16-bit element data need to be
4067 /// rewritten to use the low half of 32-bit registers, or directly use a packed
4068 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
4069 /// registers.
4070 ///
4071 /// We don't want to directly select image instructions just yet, but also want
4072 /// to expose all register repacking to the legalizer/combiners. We also don't
4073 /// want a selected instruction entering RegBankSelect. In order to avoid
4074 /// defining a multitude of intermediate image instructions, directly hack on
4075 /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
4076 /// now unnecessary arguments with $noreg.
4077 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4078     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4079     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
4080 
4081   const unsigned NumDefs = MI.getNumExplicitDefs();
4082   const unsigned ArgOffset = NumDefs + 1;
4083   bool IsTFE = NumDefs == 2;
4084   // We are only processing the operands of d16 image operations on subtargets
4085   // that use the unpacked register layout, or need to repack the TFE result.
4086 
4087   // TODO: Do we need to guard against already legalized intrinsics?
4088   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4089       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4090 
4091   MachineRegisterInfo *MRI = B.getMRI();
4092   const LLT S32 = LLT::scalar(32);
4093   const LLT S16 = LLT::scalar(16);
4094   const LLT V2S16 = LLT::vector(2, 16);
4095 
4096   unsigned DMask = 0;
4097 
4098   // Check for 16 bit addresses and pack if true.
4099   LLT GradTy =
4100       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4101   LLT AddrTy =
4102       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
4103   const bool IsG16 = GradTy == S16;
4104   const bool IsA16 = AddrTy == S16;
4105 
4106   int DMaskLanes = 0;
4107   if (!BaseOpcode->Atomic) {
4108     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
4109     if (BaseOpcode->Gather4) {
4110       DMaskLanes = 4;
4111     } else if (DMask != 0) {
4112       DMaskLanes = countPopulation(DMask);
4113     } else if (!IsTFE && !BaseOpcode->Store) {
4114       // If dmask is 0, this is a no-op load. This can be eliminated.
4115       B.buildUndef(MI.getOperand(0));
4116       MI.eraseFromParent();
4117       return true;
4118     }
4119   }
4120 
4121   Observer.changingInstr(MI);
4122   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
4123 
4124   unsigned NewOpcode = NumDefs == 0 ?
4125     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4126 
4127   // Track that we legalized this
4128   MI.setDesc(B.getTII().get(NewOpcode));
4129 
4130   // Expecting to get an error flag since TFC is on and dmask is 0. Force
4131   // dmask to be at least 1, otherwise the instruction will fail.
4132   if (IsTFE && DMask == 0) {
4133     DMask = 0x1;
4134     DMaskLanes = 1;
4135     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
4136   }
4137 
4138   if (BaseOpcode->Atomic) {
4139     Register VData0 = MI.getOperand(2).getReg();
4140     LLT Ty = MRI->getType(VData0);
4141 
4142     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
4143     if (Ty.isVector())
4144       return false;
4145 
4146     if (BaseOpcode->AtomicX2) {
4147       Register VData1 = MI.getOperand(3).getReg();
4148       // The two values are packed in one register.
4149       LLT PackedTy = LLT::vector(2, Ty);
4150       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
4151       MI.getOperand(2).setReg(Concat.getReg(0));
4152       MI.getOperand(3).setReg(AMDGPU::NoRegister);
4153     }
4154   }
4155 
4156   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
4157 
4158   // Optimize _L to _LZ when _L is zero
4159   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4160           AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
4161     const ConstantFP *ConstantLod;
4162 
4163     if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4164                  m_GFCst(ConstantLod))) {
4165       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4166         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4167         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4168             AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ,
4169                                                       Intr->Dim);
4170 
4171         // The starting indexes should remain in the same place.
4172         --CorrectedNumVAddrs;
4173 
4174         MI.getOperand(MI.getNumExplicitDefs())
4175             .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4176         MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4177         Intr = NewImageDimIntr;
4178       }
4179     }
4180   }
4181 
4182   // Optimize _mip away, when 'lod' is zero
4183   if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
4184     int64_t ConstantLod;
4185     if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4186                  m_ICst(ConstantLod))) {
4187       if (ConstantLod == 0) {
4188         // TODO: Change the intrinsic opcode and remove the operand instead of
4189         // replacing it with 0, as is done for the _L to _LZ handling above.
4190         MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
4191         --CorrectedNumVAddrs;
4192       }
4193     }
4194   }
4195 
4196   // Rewrite the addressing register layout before doing anything else.
4197   if (IsA16 || IsG16) {
4198     if (IsA16) {
4199       // Target must support the feature and gradients need to be 16 bit too
4200       if (!ST.hasA16() || !IsG16)
4201         return false;
4202     } else if (!ST.hasG16())
4203       return false;
4204 
4205     if (Intr->NumVAddrs > 1) {
4206       SmallVector<Register, 4> PackedRegs;
4207       // Don't compress addresses for G16
4208       const int PackEndIdx = IsA16 ? Intr->VAddrEnd : Intr->CoordStart;
4209       packImageA16AddressToDwords(B, MI, PackedRegs, ArgOffset, Intr,
4210                                   PackEndIdx);
4211 
4212       if (!IsA16) {
4213         // Add uncompressed address
4214         for (unsigned I = Intr->CoordStart; I < Intr->VAddrEnd; I++) {
4215           Register AddrReg = MI.getOperand(ArgOffset + I).getReg();
4216           assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
4217           PackedRegs.push_back(AddrReg);
4218         }
4219       }
4220 
4221       // See also below in the non-a16 branch
4222       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
4223 
4224       if (!UseNSA && PackedRegs.size() > 1) {
4225         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
4226         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
4227         PackedRegs[0] = Concat.getReg(0);
4228         PackedRegs.resize(1);
4229       }
4230 
4231       const unsigned NumPacked = PackedRegs.size();
4232       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4233         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
4234         if (!SrcOp.isReg()) {
4235           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
4236           continue;
4237         }
4238 
4239         assert(SrcOp.getReg() != AMDGPU::NoRegister);
4240 
4241         if (I - Intr->VAddrStart < NumPacked)
4242           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
4243         else
4244           SrcOp.setReg(AMDGPU::NoRegister);
4245       }
4246     }
4247   } else {
4248     // If the register allocator cannot place the address registers contiguously
4249     // without introducing moves, then using the non-sequential address encoding
4250     // is always preferable, since it saves VALU instructions and is usually a
4251     // wash in terms of code size or even better.
4252     //
4253     // However, we currently have no way of hinting to the register allocator
4254     // that MIMG addresses should be placed contiguously when it is possible to
4255     // do so, so force non-NSA for the common 2-address case as a heuristic.
4256     //
4257     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4258     // allocation when possible.
4259     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
4260 
4261     if (!UseNSA && Intr->NumVAddrs > 1)
4262       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4263                                Intr->NumVAddrs);
4264   }
4265 
4266   int Flags = 0;
4267   if (IsA16)
4268     Flags |= 1;
4269   if (IsG16)
4270     Flags |= 2;
4271   MI.addOperand(MachineOperand::CreateImm(Flags));
4272 
4273   if (BaseOpcode->Store) { // No TFE for stores?
4274     // TODO: Handle dmask trim
4275     Register VData = MI.getOperand(1).getReg();
4276     LLT Ty = MRI->getType(VData);
4277     if (!Ty.isVector() || Ty.getElementType() != S16)
4278       return true;
4279 
4280     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
4281     if (RepackedReg != VData) {
4282       MI.getOperand(1).setReg(RepackedReg);
4283     }
4284 
4285     return true;
4286   }
4287 
4288   Register DstReg = MI.getOperand(0).getReg();
4289   LLT Ty = MRI->getType(DstReg);
4290   const LLT EltTy = Ty.getScalarType();
4291   const bool IsD16 = Ty.getScalarType() == S16;
4292   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
4293 
4294   // Confirm that the return type is large enough for the dmask specified
4295   if (NumElts < DMaskLanes)
4296     return false;
4297 
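       // The MIMG dmask field is only 4 bits, so at most 4 components can be
       // loaded.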
4298   if (NumElts > 4 || DMaskLanes > 4)
4299     return false;
4300 
4301   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4302   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
4303 
4304   // The raw dword aligned data component of the load. The only legal cases
4305   // where this matters should be when using the packed D16 format, for
4306     // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
4307   LLT RoundedTy;
4308 
4309   // S32 vector to cover all data, plus TFE result element.
4310   LLT TFETy;
4311 
4312   // Register type to use for each loaded component. Will be S32 or V2S16.
4313   LLT RegTy;
4314 
4315   if (IsD16 && ST.hasUnpackedD16VMem()) {
4316     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
4317     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
4318     RegTy = S32;
4319   } else {
4320     unsigned EltSize = EltTy.getSizeInBits();
4321     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
4322     unsigned RoundedSize = 32 * RoundedElts;
4323     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
4324     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
4325     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
4326   }
4327 
4328   // The return type does not need adjustment.
4329   // TODO: Should we change s16 case to s32 or <2 x s16>?
4330   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
4331     return true;
4332 
4333   Register Dst1Reg;
4334 
4335   // Insert after the instruction.
4336   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
4337 
4338   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
4339   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
4340   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
4341   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
4342 
4343   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
4344 
4345   MI.getOperand(0).setReg(NewResultReg);
4346 
4347   // In the IR, TFE is supposed to be used with a 2-element struct return
4348   // type. The instruction really returns these two values in one contiguous
4349   // register, with one additional dword beyond the loaded data. Rewrite the
4350   // return type to use a single register result.
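       // For example, with a full dmask a TFE load of <4 x s32> produces one
       // <5 x s32> value: four data dwords followed by the status dword, which
       // are split back apart below.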
4351 
4352   if (IsTFE) {
4353     Dst1Reg = MI.getOperand(1).getReg();
4354     if (MRI->getType(Dst1Reg) != S32)
4355       return false;
4356 
4357     // TODO: Make sure the TFE operand bit is set.
4358     MI.RemoveOperand(1);
4359 
4360     // Handle the easy case that requires no repack instructions.
4361     if (Ty == S32) {
4362       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
4363       return true;
4364     }
4365   }
4366 
4367   // Now figure out how to copy the new result register back into the old
4368   // result.
4369   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
4370 
4371   const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
4372 
4373   if (ResultNumRegs == 1) {
4374     assert(!IsTFE);
4375     ResultRegs[0] = NewResultReg;
4376   } else {
4377     // We have to repack into a new vector of some kind.
4378     for (int I = 0; I != NumDataRegs; ++I)
4379       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
4380     B.buildUnmerge(ResultRegs, NewResultReg);
4381 
4382     // Drop the final TFE element to get the data part. The TFE result is
4383     // directly written to the right place already.
4384     if (IsTFE)
4385       ResultRegs.resize(NumDataRegs);
4386   }
4387 
4388   // For an s16 scalar result, we form an s32 result with a truncate regardless
4389   // of packed vs. unpacked.
4390   if (IsD16 && !Ty.isVector()) {
4391     B.buildTrunc(DstReg, ResultRegs[0]);
4392     return true;
4393   }
4394 
4395   // Avoid a build/concat_vector of 1 entry.
4396   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4397     B.buildBitcast(DstReg, ResultRegs[0]);
4398     return true;
4399   }
4400 
4401   assert(Ty.isVector());
4402 
4403   if (IsD16) {
4404     // For packed D16 results with TFE enabled, all the data components are
4405     // S32. Cast back to the expected type.
4406     //
4407     // TODO: We don't really need to load s32 elements. We would only need one
4408     // cast for the TFE result if a multiple of v2s16 was used.
4409     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4410       for (Register &Reg : ResultRegs)
4411         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4412     } else if (ST.hasUnpackedD16VMem()) {
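           // With unpacked D16 each 16-bit element was loaded into its own
           // 32-bit register, so truncate each piece back to s16.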
4413       for (Register &Reg : ResultRegs)
4414         Reg = B.buildTrunc(S16, Reg).getReg(0);
4415     }
4416   }
4417 
4418   auto padWithUndef = [&](LLT Ty, int NumElts) {
4419     if (NumElts == 0)
4420       return;
4421     Register Undef = B.buildUndef(Ty).getReg(0);
4422     for (int I = 0; I != NumElts; ++I)
4423       ResultRegs.push_back(Undef);
4424   };
4425 
4426   // Pad out any elements eliminated due to the dmask.
4427   LLT ResTy = MRI->getType(ResultRegs[0]);
4428   if (!ResTy.isVector()) {
4429     padWithUndef(ResTy, NumElts - ResultRegs.size());
4430     B.buildBuildVector(DstReg, ResultRegs);
4431     return true;
4432   }
4433 
4434   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4435   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4436 
4437   // Deal with the one annoying legal case.
4438   const LLT V3S16 = LLT::vector(3, 16);
4439   if (Ty == V3S16) {
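         // Pad the <2 x s16> pieces out to three registers, concatenate them
         // into a <6 x s16>, then unmerge and keep the first <3 x s16> half.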
4440     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4441     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4442     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4443     return true;
4444   }
4445 
4446   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4447   B.buildConcatVectors(DstReg, ResultRegs);
4448   return true;
4449 }
4450 
4451 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4452   LegalizerHelper &Helper, MachineInstr &MI) const {
4453   MachineIRBuilder &B = Helper.MIRBuilder;
4454   GISelChangeObserver &Observer = Helper.Observer;
4455 
4456   Register Dst = MI.getOperand(0).getReg();
4457   LLT Ty = B.getMRI()->getType(Dst);
4458   unsigned Size = Ty.getSizeInBits();
4459   MachineFunction &MF = B.getMF();
4460 
4461   Observer.changingInstr(MI);
4462 
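       // Some result types are legalized by bitcasting the destination to an
       // equivalently sized register type first.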
4463   if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
4464     Ty = getBitcastRegisterType(Ty);
4465     Helper.bitcastDst(MI, Ty, 0);
4466     Dst = MI.getOperand(0).getReg();
4467     B.setInsertPt(B.getMBB(), MI);
4468   }
4469 
4470   // FIXME: We don't really need this intermediate instruction. The intrinsic
4471   // should be fixed to have a memory operand. Since it's readnone, we're not
4472   // allowed to add one.
4473   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4474   MI.RemoveOperand(1); // Remove intrinsic ID
4475 
4476   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4477   // TODO: Should this use datalayout alignment?
4478   const unsigned MemSize = (Size + 7) / 8;
4479   const Align MemAlign(4);
4480   MachineMemOperand *MMO = MF.getMachineMemOperand(
4481       MachinePointerInfo(),
4482       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4483           MachineMemOperand::MOInvariant,
4484       MemSize, MemAlign);
4485   MI.addMemOperand(MF, MMO);
4486 
4487   // There are no 96-bit result scalar loads, but widening to 128-bit should
4488   // always be legal. We may need to restore this to a 96-bit result if it turns
4489   // out this needs to be converted to a vector load during RegBankSelect.
4490   if (!isPowerOf2_32(Size)) {
4491     if (Ty.isVector())
4492       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4493     else
4494       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4495   }
4496 
4497   Observer.changedInstr(MI);
4498   return true;
4499 }
4500 
4501 // TODO: Move to selection
4502 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4503                                                 MachineRegisterInfo &MRI,
4504                                                 MachineIRBuilder &B) const {
4505   // If this is a non-HSA path or the trap handler is disabled, insert s_endpgm.
4506   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4507       !ST.isTrapHandlerEnabled()) {
4508     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4509   } else {
4510     // Pass queue pointer to trap handler as input, and insert trap instruction
4511     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4512     MachineRegisterInfo &MRI = *B.getMRI();
4513 
4514     Register LiveIn =
4515       MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4516     if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
4517       return false;
4518 
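         // The trap handler ABI takes the queue pointer in SGPR0/SGPR1, so copy
         // it there and attach the pair as an implicit use of s_trap.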
4519     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4520     B.buildCopy(SGPR01, LiveIn);
4521     B.buildInstr(AMDGPU::S_TRAP)
4522         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4523         .addReg(SGPR01, RegState::Implicit);
4524   }
4525 
4526   MI.eraseFromParent();
4527   return true;
4528 }
4529 
4530 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4531     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4532   // If this is a non-HSA path or the trap handler is disabled, report a
4533   // warning accordingly.
4534   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4535       !ST.isTrapHandlerEnabled()) {
4536     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4537                                      "debugtrap handler not supported",
4538                                      MI.getDebugLoc(), DS_Warning);
4539     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4540     Ctx.diagnose(NoTrap);
4541   } else {
4542     // Insert debug-trap instruction
4543     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4544   }
4545 
4546   MI.eraseFromParent();
4547   return true;
4548 }
4549 
4550 bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4551                                                MachineIRBuilder &B) const {
4552   MachineRegisterInfo &MRI = *B.getMRI();
4553   const LLT S16 = LLT::scalar(16);
4554   const LLT S32 = LLT::scalar(32);
4555 
4556   Register DstReg = MI.getOperand(0).getReg();
4557   Register NodePtr = MI.getOperand(2).getReg();
4558   Register RayExtent = MI.getOperand(3).getReg();
4559   Register RayOrigin = MI.getOperand(4).getReg();
4560   Register RayDir = MI.getOperand(5).getReg();
4561   Register RayInvDir = MI.getOperand(6).getReg();
4562   Register TDescr = MI.getOperand(7).getReg();
4563 
4564   bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4565   bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
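       // Pick the NSA intersect_ray variant based on the node pointer width and
       // whether the ray operands are 16-bit (a16).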
4566   unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
4567                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
4568                           : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
4569                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
4570 
4571   SmallVector<Register, 12> Ops;
4572   if (Is64) {
4573     auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4574     Ops.push_back(Unmerge.getReg(0));
4575     Ops.push_back(Unmerge.getReg(1));
4576   } else {
4577     Ops.push_back(NodePtr);
4578   }
4579   Ops.push_back(RayExtent);
4580 
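       // Each ray vector is unmerged from <4 x s32>; only the x, y and z lanes
       // are forwarded to the instruction.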
4581   auto packLanes = [&Ops, &S32, &B] (Register Src) {
4582     auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
4583     Ops.push_back(Unmerge.getReg(0));
4584     Ops.push_back(Unmerge.getReg(1));
4585     Ops.push_back(Unmerge.getReg(2));
4586   };
4587 
4588   packLanes(RayOrigin);
4589   if (IsA16) {
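         // Pack the six 16-bit direction/inverse-direction components pairwise
         // into three 32-bit registers: (dir.xy), (dir.z, invdir.x), (invdir.yz).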
4590     auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
4591     auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
4592     Register R1 = MRI.createGenericVirtualRegister(S32);
4593     Register R2 = MRI.createGenericVirtualRegister(S32);
4594     Register R3 = MRI.createGenericVirtualRegister(S32);
4595     B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4596     B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4597     B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4598     Ops.push_back(R1);
4599     Ops.push_back(R2);
4600     Ops.push_back(R3);
4601   } else {
4602     packLanes(RayDir);
4603     packLanes(RayInvDir);
4604   }
4605 
4606   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4607     .addDef(DstReg)
4608     .addImm(Opcode);
4609 
4610   for (Register R : Ops) {
4611     MIB.addUse(R);
4612   }
4613 
4614   MIB.addUse(TDescr)
4615      .addImm(IsA16 ? 1 : 0)
4616      .cloneMemRefs(MI);
4617 
4618   MI.eraseFromParent();
4619   return true;
4620 }
4621 
4622 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4623                                             MachineInstr &MI) const {
4624   MachineIRBuilder &B = Helper.MIRBuilder;
4625   MachineRegisterInfo &MRI = *B.getMRI();
4626 
4627   // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
4628   auto IntrID = MI.getIntrinsicID();
4629   switch (IntrID) {
4630   case Intrinsic::amdgcn_if:
4631   case Intrinsic::amdgcn_else: {
4632     MachineInstr *Br = nullptr;
4633     MachineBasicBlock *UncondBrTarget = nullptr;
4634     bool Negated = false;
4635     if (MachineInstr *BrCond =
4636             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4637       const SIRegisterInfo *TRI
4638         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4639 
4640       Register Def = MI.getOperand(1).getReg();
4641       Register Use = MI.getOperand(3).getReg();
4642 
4643       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4644 
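           // A negated branch condition means the conditional and unconditional
           // destinations trade places.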
4645       if (Negated)
4646         std::swap(CondBrTarget, UncondBrTarget);
4647 
4648       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4649       if (IntrID == Intrinsic::amdgcn_if) {
4650         B.buildInstr(AMDGPU::SI_IF)
4651           .addDef(Def)
4652           .addUse(Use)
4653           .addMBB(UncondBrTarget);
4654       } else {
4655         B.buildInstr(AMDGPU::SI_ELSE)
4656             .addDef(Def)
4657             .addUse(Use)
4658             .addMBB(UncondBrTarget);
4659       }
4660 
4661       if (Br) {
4662         Br->getOperand(0).setMBB(CondBrTarget);
4663       } else {
4664         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4665         // since we're swapping branch targets it needs to be reinserted.
4666         // FIXME: IRTranslator should probably not do this
4667         B.buildBr(*CondBrTarget);
4668       }
4669 
4670       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4671       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4672       MI.eraseFromParent();
4673       BrCond->eraseFromParent();
4674       return true;
4675     }
4676 
4677     return false;
4678   }
4679   case Intrinsic::amdgcn_loop: {
4680     MachineInstr *Br = nullptr;
4681     MachineBasicBlock *UncondBrTarget = nullptr;
4682     bool Negated = false;
4683     if (MachineInstr *BrCond =
4684             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
4685       const SIRegisterInfo *TRI
4686         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4687 
4688       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4689       Register Reg = MI.getOperand(2).getReg();
4690 
4691       if (Negated)
4692         std::swap(CondBrTarget, UncondBrTarget);
4693 
4694       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4695       B.buildInstr(AMDGPU::SI_LOOP)
4696         .addUse(Reg)
4697         .addMBB(UncondBrTarget);
4698 
4699       if (Br)
4700         Br->getOperand(0).setMBB(CondBrTarget);
4701       else
4702         B.buildBr(*CondBrTarget);
4703 
4704       MI.eraseFromParent();
4705       BrCond->eraseFromParent();
4706       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4707       return true;
4708     }
4709 
4710     return false;
4711   }
4712   case Intrinsic::amdgcn_kernarg_segment_ptr:
4713     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4714       // This only makes sense to call in a kernel, so just lower to null.
4715       B.buildConstant(MI.getOperand(0).getReg(), 0);
4716       MI.eraseFromParent();
4717       return true;
4718     }
4719 
4720     return legalizePreloadedArgIntrin(
4721       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4722   case Intrinsic::amdgcn_implicitarg_ptr:
4723     return legalizeImplicitArgPtr(MI, MRI, B);
4724   case Intrinsic::amdgcn_workitem_id_x:
4725     return legalizePreloadedArgIntrin(MI, MRI, B,
4726                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4727   case Intrinsic::amdgcn_workitem_id_y:
4728     return legalizePreloadedArgIntrin(MI, MRI, B,
4729                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4730   case Intrinsic::amdgcn_workitem_id_z:
4731     return legalizePreloadedArgIntrin(MI, MRI, B,
4732                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4733   case Intrinsic::amdgcn_workgroup_id_x:
4734     return legalizePreloadedArgIntrin(MI, MRI, B,
4735                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4736   case Intrinsic::amdgcn_workgroup_id_y:
4737     return legalizePreloadedArgIntrin(MI, MRI, B,
4738                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4739   case Intrinsic::amdgcn_workgroup_id_z:
4740     return legalizePreloadedArgIntrin(MI, MRI, B,
4741                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4742   case Intrinsic::amdgcn_dispatch_ptr:
4743     return legalizePreloadedArgIntrin(MI, MRI, B,
4744                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
4745   case Intrinsic::amdgcn_queue_ptr:
4746     return legalizePreloadedArgIntrin(MI, MRI, B,
4747                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
4748   case Intrinsic::amdgcn_implicit_buffer_ptr:
4749     return legalizePreloadedArgIntrin(
4750       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4751   case Intrinsic::amdgcn_dispatch_id:
4752     return legalizePreloadedArgIntrin(MI, MRI, B,
4753                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
4754   case Intrinsic::amdgcn_fdiv_fast:
4755     return legalizeFDIVFastIntrin(MI, MRI, B);
4756   case Intrinsic::amdgcn_is_shared:
4757     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4758   case Intrinsic::amdgcn_is_private:
4759     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4760   case Intrinsic::amdgcn_wavefrontsize: {
4761     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4762     MI.eraseFromParent();
4763     return true;
4764   }
4765   case Intrinsic::amdgcn_s_buffer_load:
4766     return legalizeSBufferLoad(Helper, MI);
4767   case Intrinsic::amdgcn_raw_buffer_store:
4768   case Intrinsic::amdgcn_struct_buffer_store:
4769     return legalizeBufferStore(MI, MRI, B, false, false);
4770   case Intrinsic::amdgcn_raw_buffer_store_format:
4771   case Intrinsic::amdgcn_struct_buffer_store_format:
4772     return legalizeBufferStore(MI, MRI, B, false, true);
4773   case Intrinsic::amdgcn_raw_tbuffer_store:
4774   case Intrinsic::amdgcn_struct_tbuffer_store:
4775     return legalizeBufferStore(MI, MRI, B, true, true);
4776   case Intrinsic::amdgcn_raw_buffer_load:
4777   case Intrinsic::amdgcn_struct_buffer_load:
4778     return legalizeBufferLoad(MI, MRI, B, false, false);
4779   case Intrinsic::amdgcn_raw_buffer_load_format:
4780   case Intrinsic::amdgcn_struct_buffer_load_format:
4781     return legalizeBufferLoad(MI, MRI, B, true, false);
4782   case Intrinsic::amdgcn_raw_tbuffer_load:
4783   case Intrinsic::amdgcn_struct_tbuffer_load:
4784     return legalizeBufferLoad(MI, MRI, B, true, true);
4785   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4786   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4787   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4788   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4789   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4790   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4791   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4792   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4793   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4794   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4795   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4796   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4797   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4798   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4799   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4800   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4801   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4802   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4803   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4804   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4805   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4806   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4807   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4808   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4809   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4810   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4811   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4812   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4813     return legalizeBufferAtomic(MI, B, IntrID);
4814   case Intrinsic::amdgcn_atomic_inc:
4815     return legalizeAtomicIncDec(MI, B, true);
4816   case Intrinsic::amdgcn_atomic_dec:
4817     return legalizeAtomicIncDec(MI, B, false);
4818   case Intrinsic::trap:
4819     return legalizeTrapIntrinsic(MI, MRI, B);
4820   case Intrinsic::debugtrap:
4821     return legalizeDebugTrapIntrinsic(MI, MRI, B);
4822   case Intrinsic::amdgcn_rsq_clamp:
4823     return legalizeRsqClampIntrinsic(MI, MRI, B);
4824   case Intrinsic::amdgcn_ds_fadd:
4825   case Intrinsic::amdgcn_ds_fmin:
4826   case Intrinsic::amdgcn_ds_fmax:
4827     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
4828   case Intrinsic::amdgcn_image_bvh_intersect_ray:
4829     return legalizeBVHIntrinsic(MI, B);
4830   default: {
4831     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
4832             AMDGPU::getImageDimIntrinsicInfo(IntrID))
4833       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
4834     return true;
4835   }
4836   }
4837 
4838   return true;
4839 }
4840