1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
16 // from the Visual C++ cmath / math.h headers:
17 // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
18 #define _USE_MATH_DEFINES
19 #endif
20 
21 #include "AMDGPU.h"
22 #include "AMDGPULegalizerInfo.h"
23 #include "AMDGPUTargetMachine.h"
24 #include "SIMachineFunctionInfo.h"
25 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
26 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27 #include "llvm/CodeGen/TargetOpcodes.h"
28 #include "llvm/CodeGen/ValueTypes.h"
29 #include "llvm/IR/DerivedTypes.h"
30 #include "llvm/IR/DiagnosticInfo.h"
31 #include "llvm/IR/Type.h"
32 #include "llvm/Support/Debug.h"
33 
34 #define DEBUG_TYPE "amdgpu-legalinfo"
35 
36 using namespace llvm;
37 using namespace LegalizeActions;
38 using namespace LegalizeMutations;
39 using namespace LegalityPredicates;
40 
41 
42 static LegalityPredicate isMultiple32(unsigned TypeIdx,
43                                       unsigned MaxSize = 1024) {
44   return [=](const LegalityQuery &Query) {
45     const LLT Ty = Query.Types[TypeIdx];
46     const LLT EltTy = Ty.getScalarType();
47     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
48   };
49 }
50 
51 static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
52   return [=](const LegalityQuery &Query) {
53     return Query.Types[TypeIdx].getSizeInBits() == Size;
54   };
55 }
56 
57 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
58   return [=](const LegalityQuery &Query) {
59     const LLT Ty = Query.Types[TypeIdx];
60     return Ty.isVector() &&
61            Ty.getNumElements() % 2 != 0 &&
62            Ty.getElementType().getSizeInBits() < 32 &&
63            Ty.getSizeInBits() % 32 != 0;
64   };
65 }
66 
67 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
68   return [=](const LegalityQuery &Query) {
69     const LLT Ty = Query.Types[TypeIdx];
70     const LLT EltTy = Ty.getScalarType();
71     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
72   };
73 }
74 
75 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
76   return [=](const LegalityQuery &Query) {
77     const LLT Ty = Query.Types[TypeIdx];
78     const LLT EltTy = Ty.getElementType();
79     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
80   };
81 }
82 
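// Reduce the number of elements so that each piece of the result is at most
// (roughly) 64 bits wide, keeping the original element type.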
83 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
84   return [=](const LegalityQuery &Query) {
85     const LLT Ty = Query.Types[TypeIdx];
86     const LLT EltTy = Ty.getElementType();
87     unsigned Size = Ty.getSizeInBits();
88     unsigned Pieces = (Size + 63) / 64;
89     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
90     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
91   };
92 }
93 
94 // Increase the number of vector elements to reach the next multiple of 32-bit
95 // type.
96 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
97   return [=](const LegalityQuery &Query) {
98     const LLT Ty = Query.Types[TypeIdx];
99 
100     const LLT EltTy = Ty.getElementType();
101     const int Size = Ty.getSizeInBits();
102     const int EltSize = EltTy.getSizeInBits();
103     const int NextMul32 = (Size + 31) / 32;
104 
105     assert(EltSize < 32);
106 
107     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
108     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
109   };
110 }
111 
112 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
113   return [=](const LegalityQuery &Query) {
114     const LLT QueryTy = Query.Types[TypeIdx];
115     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
116   };
117 }
118 
119 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
120   return [=](const LegalityQuery &Query) {
121     const LLT QueryTy = Query.Types[TypeIdx];
122     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
123   };
124 }
125 
126 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
127   return [=](const LegalityQuery &Query) {
128     const LLT QueryTy = Query.Types[TypeIdx];
129     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
130   };
131 }
132 
133 // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
134 // v2s16.
135 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
136   return [=](const LegalityQuery &Query) {
137     const LLT Ty = Query.Types[TypeIdx];
138     if (Ty.isVector()) {
139       const int EltSize = Ty.getElementType().getSizeInBits();
140       return EltSize == 32 || EltSize == 64 ||
141             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
142              EltSize == 128 || EltSize == 256;
143     }
144 
145     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
146   };
147 }
148 
149 static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
150   return [=](const LegalityQuery &Query) {
151     return Query.Types[TypeIdx].getElementType() == Type;
152   };
153 }
154 
155 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
156   return [=](const LegalityQuery &Query) {
157     const LLT Ty = Query.Types[TypeIdx];
158     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
159            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
160   };
161 }
162 
163 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
164                                          const GCNTargetMachine &TM)
165   :  ST(ST_) {
166   using namespace TargetOpcode;
167 
168   auto GetAddrSpacePtr = [&TM](unsigned AS) {
169     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
170   };
171 
172   const LLT S1 = LLT::scalar(1);
173   const LLT S8 = LLT::scalar(8);
174   const LLT S16 = LLT::scalar(16);
175   const LLT S32 = LLT::scalar(32);
176   const LLT S64 = LLT::scalar(64);
177   const LLT S96 = LLT::scalar(96);
178   const LLT S128 = LLT::scalar(128);
179   const LLT S256 = LLT::scalar(256);
180   const LLT S1024 = LLT::scalar(1024);
181 
182   const LLT V2S16 = LLT::vector(2, 16);
183   const LLT V4S16 = LLT::vector(4, 16);
184 
185   const LLT V2S32 = LLT::vector(2, 32);
186   const LLT V3S32 = LLT::vector(3, 32);
187   const LLT V4S32 = LLT::vector(4, 32);
188   const LLT V5S32 = LLT::vector(5, 32);
189   const LLT V6S32 = LLT::vector(6, 32);
190   const LLT V7S32 = LLT::vector(7, 32);
191   const LLT V8S32 = LLT::vector(8, 32);
192   const LLT V9S32 = LLT::vector(9, 32);
193   const LLT V10S32 = LLT::vector(10, 32);
194   const LLT V11S32 = LLT::vector(11, 32);
195   const LLT V12S32 = LLT::vector(12, 32);
196   const LLT V13S32 = LLT::vector(13, 32);
197   const LLT V14S32 = LLT::vector(14, 32);
198   const LLT V15S32 = LLT::vector(15, 32);
199   const LLT V16S32 = LLT::vector(16, 32);
200   const LLT V32S32 = LLT::vector(32, 32);
201 
202   const LLT V2S64 = LLT::vector(2, 64);
203   const LLT V3S64 = LLT::vector(3, 64);
204   const LLT V4S64 = LLT::vector(4, 64);
205   const LLT V5S64 = LLT::vector(5, 64);
206   const LLT V6S64 = LLT::vector(6, 64);
207   const LLT V7S64 = LLT::vector(7, 64);
208   const LLT V8S64 = LLT::vector(8, 64);
209   const LLT V16S64 = LLT::vector(16, 64);
210 
211   std::initializer_list<LLT> AllS32Vectors =
212     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
213      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
214   std::initializer_list<LLT> AllS64Vectors =
215     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
216 
217   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
218   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
219   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
220   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
221   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
222   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
223   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
224 
225   const LLT CodePtr = FlatPtr;
226 
227   const std::initializer_list<LLT> AddrSpaces64 = {
228     GlobalPtr, ConstantPtr, FlatPtr
229   };
230 
231   const std::initializer_list<LLT> AddrSpaces32 = {
232     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
233   };
234 
235   const std::initializer_list<LLT> FPTypesBase = {
236     S32, S64
237   };
238 
239   const std::initializer_list<LLT> FPTypes16 = {
240     S32, S64, S16
241   };
242 
243   const std::initializer_list<LLT> FPTypesPK16 = {
244     S32, S64, S16, V2S16
245   };
246 
247   setAction({G_BRCOND, S1}, Legal);
248 
249   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
250   // elements for v3s16
251   getActionDefinitionsBuilder(G_PHI)
252     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
253     .legalFor(AllS32Vectors)
254     .legalFor(AllS64Vectors)
255     .legalFor(AddrSpaces64)
256     .legalFor(AddrSpaces32)
257     .clampScalar(0, S32, S256)
258     .widenScalarToNextPow2(0, 32)
259     .clampMaxNumElements(0, S32, 16)
260     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
261     .legalIf(isPointer(0));
262 
263   if (ST.has16BitInsts()) {
264     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
265       .legalFor({S32, S16})
266       .clampScalar(0, S16, S32)
267       .scalarize(0);
268   } else {
269     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
270       .legalFor({S32})
271       .clampScalar(0, S32, S32)
272       .scalarize(0);
273   }
274 
275   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
276     .legalFor({S32})
277     .clampScalar(0, S32, S32)
278     .scalarize(0);
279 
280   // Report legal for any types we can handle anywhere. For the cases only legal
281   // on the SALU, RegBankSelect will be able to re-legalize.
282   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
283     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
284     .clampScalar(0, S32, S64)
285     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
286     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
287     .widenScalarToNextPow2(0)
288     .scalarize(0);
289 
290   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
291                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
292     .legalFor({{S32, S1}})
293     .clampScalar(0, S32, S32)
294     .scalarize(0); // TODO: Implement.
295 
296   getActionDefinitionsBuilder({G_SADDO, G_SSUBO})
297     .lower();
298 
299   getActionDefinitionsBuilder(G_BITCAST)
300     // Don't worry about the size constraint.
301     .legalIf(all(isRegisterType(0), isRegisterType(1)))
302     // FIXME: Testing hack
303     .legalForCartesianProduct({S16, LLT::vector(2, 8), });
304 
305   getActionDefinitionsBuilder(G_FCONSTANT)
306     .legalFor({S32, S64, S16})
307     .clampScalar(0, S16, S64);
308 
309   getActionDefinitionsBuilder(G_IMPLICIT_DEF)
310     .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
311                ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
312     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
313     .clampScalarOrElt(0, S32, S1024)
314     .legalIf(isMultiple32(0))
315     .widenScalarToNextPow2(0, 32)
316     .clampMaxNumElements(0, S32, 16);
317 
318 
319   // FIXME: i1 operands to intrinsics should always be legal, but other i1
320   // values may not be legal.  We need to figure out how to distinguish
321   // between these two scenarios.
322   getActionDefinitionsBuilder(G_CONSTANT)
323     .legalFor({S1, S32, S64, S16, GlobalPtr,
324                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
325     .clampScalar(0, S32, S64)
326     .widenScalarToNextPow2(0)
327     .legalIf(isPointer(0));
328 
329   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
330   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
331     .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});
332 
333 
334   auto &FPOpActions = getActionDefinitionsBuilder(
335     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
336     .legalFor({S32, S64});
337   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
338     .customFor({S32, S64});
339   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
340     .customFor({S32, S64});
341 
342   if (ST.has16BitInsts()) {
343     if (ST.hasVOP3PInsts())
344       FPOpActions.legalFor({S16, V2S16});
345     else
346       FPOpActions.legalFor({S16});
347 
348     TrigActions.customFor({S16});
349     FDIVActions.customFor({S16});
350   }
351 
352   auto &MinNumMaxNum = getActionDefinitionsBuilder({
353       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
354 
355   if (ST.hasVOP3PInsts()) {
356     MinNumMaxNum.customFor(FPTypesPK16)
357       .clampMaxNumElements(0, S16, 2)
358       .clampScalar(0, S16, S64)
359       .scalarize(0);
360   } else if (ST.has16BitInsts()) {
361     MinNumMaxNum.customFor(FPTypes16)
362       .clampScalar(0, S16, S64)
363       .scalarize(0);
364   } else {
365     MinNumMaxNum.customFor(FPTypesBase)
366       .clampScalar(0, S32, S64)
367       .scalarize(0);
368   }
369 
370   if (ST.hasVOP3PInsts())
371     FPOpActions.clampMaxNumElements(0, S16, 2);
372 
373   FPOpActions
374     .scalarize(0)
375     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
376 
377   TrigActions
378     .scalarize(0)
379     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
380 
381   FDIVActions
382     .scalarize(0)
383     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
384 
385   getActionDefinitionsBuilder({G_FNEG, G_FABS})
386     .legalFor(FPTypesPK16)
387     .clampMaxNumElements(0, S16, 2)
388     .scalarize(0)
389     .clampScalar(0, S16, S64);
390 
391   // TODO: Implement
392   getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
393 
394   if (ST.has16BitInsts()) {
395     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
396       .legalFor({S32, S64, S16})
397       .scalarize(0)
398       .clampScalar(0, S16, S64);
399   } else {
400     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
401       .legalFor({S32, S64})
402       .scalarize(0)
403       .clampScalar(0, S32, S64);
404   }
405 
406   getActionDefinitionsBuilder(G_FPTRUNC)
407     .legalFor({{S32, S64}, {S16, S32}})
408     .scalarize(0);
409 
410   getActionDefinitionsBuilder(G_FPEXT)
411     .legalFor({{S64, S32}, {S32, S16}})
412     .lowerFor({{S64, S16}}) // FIXME: Implement
413     .scalarize(0);
414 
415   // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
416   getActionDefinitionsBuilder(G_FCOPYSIGN).lower();
417 
418   getActionDefinitionsBuilder(G_FSUB)
419       // Use actual fsub instruction
420       .legalFor({S32})
421       // Must use fadd + fneg
422       .lowerFor({S64, S16, V2S16})
423       .scalarize(0)
424       .clampScalar(0, S32, S64);
425 
426   // Whether this is legal depends on the floating point mode for the function.
427   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
428   if (ST.hasMadF16())
429     FMad.customFor({S32, S16});
430   else
431     FMad.customFor({S32});
432   FMad.scalarize(0)
433       .lower();
434 
435   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
436     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
437                {S32, S1}, {S64, S1}, {S16, S1},
438                {S96, S32},
439                // FIXME: Hack
440                {S64, LLT::scalar(33)},
441                {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
442     .scalarize(0);
443 
444   // TODO: Split s1->s64 during regbankselect for VALU.
445   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
446     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}, {S32, S1}, {S16, S1}, {S64, S1}})
447     .lowerFor({{S32, S64}})
448     .customFor({{S64, S64}});
449   if (ST.has16BitInsts())
450     IToFP.legalFor({{S16, S16}});
451   IToFP.clampScalar(1, S32, S64)
452        .scalarize(0);
453 
454   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
455     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}});
456   if (ST.has16BitInsts())
457     FPToI.legalFor({{S16, S16}});
458   else
459     FPToI.minScalar(1, S32);
460 
461   FPToI.minScalar(0, S32)
462        .scalarize(0);
463 
464   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
465     .legalFor({S32, S64})
466     .scalarize(0);
467 
468   if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
469     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
470       .legalFor({S32, S64})
471       .clampScalar(0, S32, S64)
472       .scalarize(0);
473   } else {
474     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
475       .legalFor({S32})
476       .customFor({S64})
477       .clampScalar(0, S32, S64)
478       .scalarize(0);
479   }
480 
481   getActionDefinitionsBuilder(G_GEP)
482     .legalForCartesianProduct(AddrSpaces64, {S64})
483     .legalForCartesianProduct(AddrSpaces32, {S32})
484     .scalarize(0);
485 
486   getActionDefinitionsBuilder(G_PTR_MASK)
487     .scalarize(0)
488     .alwaysLegal();
489 
490   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
491 
492   auto &CmpBuilder =
493     getActionDefinitionsBuilder(G_ICMP)
494     .legalForCartesianProduct(
495       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
496     .legalFor({{S1, S32}, {S1, S64}});
497   if (ST.has16BitInsts()) {
498     CmpBuilder.legalFor({{S1, S16}});
499   }
500 
501   CmpBuilder
502     .widenScalarToNextPow2(1)
503     .clampScalar(1, S32, S64)
504     .scalarize(0)
505     .legalIf(all(typeIs(0, S1), isPointer(1)));
506 
507   getActionDefinitionsBuilder(G_FCMP)
508     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
509     .widenScalarToNextPow2(1)
510     .clampScalar(1, S32, S64)
511     .scalarize(0);
512 
513   // FIXME: fexp, flog2, flog10 need to be custom lowered.
514   getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
515                                G_FLOG, G_FLOG2, G_FLOG10})
516     .legalFor({S32})
517     .scalarize(0);
518 
519   // The 64-bit versions produce 32-bit results, but only on the SALU.
520   getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
521                                G_CTTZ, G_CTTZ_ZERO_UNDEF,
522                                G_CTPOP})
523     .legalFor({{S32, S32}, {S32, S64}})
524     .clampScalar(0, S32, S32)
525     .clampScalar(1, S32, S64)
526     .scalarize(0)
527     .widenScalarToNextPow2(0, 32)
528     .widenScalarToNextPow2(1, 32);
529 
530   // TODO: Expand for > s32
531   getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
532     .legalFor({S32})
533     .clampScalar(0, S32, S32)
534     .scalarize(0);
535 
536   if (ST.has16BitInsts()) {
537     if (ST.hasVOP3PInsts()) {
538       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
539         .legalFor({S32, S16, V2S16})
540         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
541         .clampMaxNumElements(0, S16, 2)
542         .clampScalar(0, S16, S32)
543         .widenScalarToNextPow2(0)
544         .scalarize(0);
545     } else {
546       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
547         .legalFor({S32, S16})
548         .widenScalarToNextPow2(0)
549         .clampScalar(0, S16, S32)
550         .scalarize(0);
551     }
552   } else {
553     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
554       .legalFor({S32})
555       .clampScalar(0, S32, S32)
556       .widenScalarToNextPow2(0)
557       .scalarize(0);
558   }
559 
560   auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
561     return [=](const LegalityQuery &Query) {
562       return Query.Types[TypeIdx0].getSizeInBits() <
563              Query.Types[TypeIdx1].getSizeInBits();
564     };
565   };
566 
567   auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
568     return [=](const LegalityQuery &Query) {
569       return Query.Types[TypeIdx0].getSizeInBits() >
570              Query.Types[TypeIdx1].getSizeInBits();
571     };
572   };
573 
574   getActionDefinitionsBuilder(G_INTTOPTR)
575     // List the common cases
576     .legalForCartesianProduct(AddrSpaces64, {S64})
577     .legalForCartesianProduct(AddrSpaces32, {S32})
578     .scalarize(0)
579     // Accept any address space as long as the size matches
580     .legalIf(sameSize(0, 1))
581     .widenScalarIf(smallerThan(1, 0),
582       [](const LegalityQuery &Query) {
583         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
584       })
585     .narrowScalarIf(greaterThan(1, 0),
586       [](const LegalityQuery &Query) {
587         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
588       });
589 
590   getActionDefinitionsBuilder(G_PTRTOINT)
591     // List the common cases
592     .legalForCartesianProduct(AddrSpaces64, {S64})
593     .legalForCartesianProduct(AddrSpaces32, {S32})
594     .scalarize(0)
595     // Accept any address space as long as the size matches
596     .legalIf(sameSize(0, 1))
597     .widenScalarIf(smallerThan(0, 1),
598       [](const LegalityQuery &Query) {
599         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
600       })
601     .narrowScalarIf(
602       greaterThan(0, 1),
603       [](const LegalityQuery &Query) {
604         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
605       });
606 
607   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
608     .scalarize(0)
609     .custom();
610 
611   // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
612   // handle some operations by just promoting the register during
613   // selection. There are also d16 loads on GFX9+ which preserve the high bits.
614   auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned {
615     switch (AS) {
616     // FIXME: Private element size.
617     case AMDGPUAS::PRIVATE_ADDRESS:
618       return 32;
619     // FIXME: Check subtarget
620     case AMDGPUAS::LOCAL_ADDRESS:
621       return ST.useDS128() ? 128 : 64;
622 
623     // Treat constant and global as identical. SMRD loads are sometimes usable
624     // for global loads (ideally constant address space should be eliminated)
625     // depending on the context. Legality cannot be context dependent, but
626     // RegBankSelect can split the load as necessary depending on the pointer
627     // register bank/uniformity and on whether the memory is invariant or not
628     // written in a kernel.
629     case AMDGPUAS::CONSTANT_ADDRESS:
630     case AMDGPUAS::GLOBAL_ADDRESS:
631       return 512;
632     default:
633       return 128;
634     }
635   };
636 
637   const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool {
638     const LLT DstTy = Query.Types[0];
639 
640     // Split vector extloads.
641     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
642     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
643       return true;
644 
645     const LLT PtrTy = Query.Types[1];
646     unsigned AS = PtrTy.getAddressSpace();
647     if (MemSize > maxSizeForAddrSpace(AS))
648       return true;
649 
650     // Catch weird sized loads that don't evenly divide into the access sizes
651     // TODO: May be able to widen depending on alignment etc.
652     unsigned NumRegs = MemSize / 32;
653     if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
654       return true;
655 
656     unsigned Align = Query.MMODescrs[0].AlignInBits;
657     if (Align < MemSize) {
658       const SITargetLowering *TLI = ST.getTargetLowering();
659       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
660     }
661 
662     return false;
663   };
664 
665   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
666   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
667   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
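  // A required alignment of 0 means the access is legal at any alignment (the
  // subtarget supports unaligned buffer access).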
668 
669   // TODO: Refine based on subtargets which support unaligned access or 128-bit
670   // LDS
671   // TODO: Unsupported flat for SI.
672 
673   for (unsigned Op : {G_LOAD, G_STORE}) {
674     const bool IsStore = Op == G_STORE;
675 
676     auto &Actions = getActionDefinitionsBuilder(Op);
677     // Whitelist the common cases.
678     // TODO: Pointer loads
679     // TODO: Wide constant loads
680     // TODO: Only CI+ has 3x loads
681     // TODO: Loads to s16 on gfx9
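    // Each entry below is {register type, pointer type, memory size in bits,
    // minimum alignment in bits}.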
682     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
683                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
684                                       {V3S32, GlobalPtr, 96, GlobalAlign32},
685                                       {S96, GlobalPtr, 96, GlobalAlign32},
686                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
687                                       {S128, GlobalPtr, 128, GlobalAlign32},
688                                       {S64, GlobalPtr, 64, GlobalAlign32},
689                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
690                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
691                                       {S32, GlobalPtr, 8, GlobalAlign8},
692                                       {S32, GlobalPtr, 16, GlobalAlign16},
693 
694                                       {S32, LocalPtr, 32, 32},
695                                       {S64, LocalPtr, 64, 32},
696                                       {V2S32, LocalPtr, 64, 32},
697                                       {S32, LocalPtr, 8, 8},
698                                       {S32, LocalPtr, 16, 16},
699                                       {V2S16, LocalPtr, 32, 32},
700 
701                                       {S32, PrivatePtr, 32, 32},
702                                       {S32, PrivatePtr, 8, 8},
703                                       {S32, PrivatePtr, 16, 16},
704                                       {V2S16, PrivatePtr, 32, 32},
705 
706                                       {S32, FlatPtr, 32, GlobalAlign32},
707                                       {S32, FlatPtr, 16, GlobalAlign16},
708                                       {S32, FlatPtr, 8, GlobalAlign8},
709                                       {V2S16, FlatPtr, 32, GlobalAlign32},
710 
711                                       {S32, ConstantPtr, 32, GlobalAlign32},
712                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
713                                       {V3S32, ConstantPtr, 96, GlobalAlign32},
714                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
715                                       {S64, ConstantPtr, 64, GlobalAlign32},
716                                       {S128, ConstantPtr, 128, GlobalAlign32},
717                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
718     Actions
719         .customIf(typeIs(1, Constant32Ptr))
720         .narrowScalarIf(
721             [=](const LegalityQuery &Query) -> bool {
722               return !Query.Types[0].isVector() && needToSplitLoad(Query);
723             },
724             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
725               const LLT DstTy = Query.Types[0];
726               const LLT PtrTy = Query.Types[1];
727 
728               const unsigned DstSize = DstTy.getSizeInBits();
729               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
730 
731               // Split extloads.
732               if (DstSize > MemSize)
733                 return std::make_pair(0, LLT::scalar(MemSize));
734 
735               if (DstSize > 32 && (DstSize % 32 != 0)) {
736                 // FIXME: Need a way to specify non-extload of larger size if
737                 // suitably aligned.
738                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
739               }
740 
741               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
742               if (MemSize > MaxSize)
743                 return std::make_pair(0, LLT::scalar(MaxSize));
744 
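              // Misaligned access: narrow to a piece no wider than the known
              // alignment (expressed in bits).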
745               unsigned Align = Query.MMODescrs[0].AlignInBits;
746               return std::make_pair(0, LLT::scalar(Align));
747             })
748         .fewerElementsIf(
749             [=](const LegalityQuery &Query) -> bool {
750               return Query.Types[0].isVector() && needToSplitLoad(Query);
751             },
752             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
753               const LLT DstTy = Query.Types[0];
754               const LLT PtrTy = Query.Types[1];
755 
756               LLT EltTy = DstTy.getElementType();
757               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
758 
759               // Split if it's too large for the address space.
760               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
761                 unsigned NumElts = DstTy.getNumElements();
762                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
763 
764                 // FIXME: Refine when odd breakdowns handled
765                 // The scalars will need to be re-legalized.
766                 if (NumPieces == 1 || NumPieces >= NumElts ||
767                     NumElts % NumPieces != 0)
768                   return std::make_pair(0, EltTy);
769 
770                 return std::make_pair(0,
771                                       LLT::vector(NumElts / NumPieces, EltTy));
772               }
773 
774               // Need to split because of alignment.
775               unsigned Align = Query.MMODescrs[0].AlignInBits;
776               unsigned EltSize = EltTy.getSizeInBits();
777               if (EltSize > Align &&
778                   (EltSize / Align < DstTy.getNumElements())) {
779                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
780               }
781 
782               // May need relegalization for the scalars.
783               return std::make_pair(0, EltTy);
784             })
785         .minScalar(0, S32);
786 
787     if (IsStore)
788       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
789 
790     // TODO: Need a bitcast lower option?
791     Actions
792         .legalIf([=](const LegalityQuery &Query) {
793           const LLT Ty0 = Query.Types[0];
794           unsigned Size = Ty0.getSizeInBits();
795           unsigned MemSize = Query.MMODescrs[0].SizeInBits;
796           unsigned Align = Query.MMODescrs[0].AlignInBits;
797 
798           // No extending vector loads.
799           if (Size > MemSize && Ty0.isVector())
800             return false;
801 
802           // FIXME: Widening store from alignment not valid.
803           if (MemSize < Size)
804             MemSize = std::max(MemSize, Align);
805 
806           switch (MemSize) {
807           case 8:
808           case 16:
809             return Size == 32;
810           case 32:
811           case 64:
812           case 128:
813             return true;
814           case 96:
815             return ST.hasDwordx3LoadStores();
816           case 256:
817           case 512:
818             return true;
819           default:
820             return false;
821           }
822         })
823         .widenScalarToNextPow2(0)
824         // TODO: v3s32->v4s32 with alignment
825         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
826   }
827 
828   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
829                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
830                                                   {S32, GlobalPtr, 16, 2 * 8},
831                                                   {S32, LocalPtr, 8, 8},
832                                                   {S32, LocalPtr, 16, 16},
833                                                   {S32, PrivatePtr, 8, 8},
834                                                   {S32, PrivatePtr, 16, 16},
835                                                   {S32, ConstantPtr, 8, 8},
836                                                   {S32, ConstantPtr, 16, 2 * 8}});
837   if (ST.hasFlatAddressSpace()) {
838     ExtLoads.legalForTypesWithMemDesc(
839         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
840   }
841 
842   ExtLoads.clampScalar(0, S32, S32)
843           .widenScalarToNextPow2(0)
844           .unsupportedIfMemSizeNotPow2()
845           .lower();
846 
847   auto &Atomics = getActionDefinitionsBuilder(
848     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
849      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
850      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
851      G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG})
852     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
853                {S64, GlobalPtr}, {S64, LocalPtr}});
854   if (ST.hasFlatAddressSpace()) {
855     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
856   }
857 
858   getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
859     .legalFor({{S32, LocalPtr}});
860 
861   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
862     .lower();
863 
864   // TODO: Pointer types, any 32-bit or 64-bit vector
865   getActionDefinitionsBuilder(G_SELECT)
866     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
867           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
868           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
869     .clampScalar(0, S16, S64)
870     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
871     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
872     .scalarize(1)
873     .clampMaxNumElements(0, S32, 2)
874     .clampMaxNumElements(0, LocalPtr, 2)
875     .clampMaxNumElements(0, PrivatePtr, 2)
876     .scalarize(0)
877     .widenScalarToNextPow2(0)
878     .legalIf(all(isPointer(0), typeIs(1, S1)));
879 
880   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
881   // be more flexible with the shift amount type.
882   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
883     .legalFor({{S32, S32}, {S64, S32}});
884   if (ST.has16BitInsts()) {
885     if (ST.hasVOP3PInsts()) {
886       Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
887             .clampMaxNumElements(0, S16, 2);
888     } else
889       Shifts.legalFor({{S16, S32}, {S16, S16}});
890 
891     Shifts.clampScalar(1, S16, S32);
892     Shifts.clampScalar(0, S16, S64);
893     Shifts.widenScalarToNextPow2(0, 16);
894   } else {
895     // Make sure we legalize the shift amount type first, as the general
896     // expansion for the shifted type will produce much worse code if it hasn't
897     // been truncated already.
898     Shifts.clampScalar(1, S32, S32);
899     Shifts.clampScalar(0, S32, S64);
900     Shifts.widenScalarToNextPow2(0, 32);
901   }
902   Shifts.scalarize(0);
903 
904   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
905     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
906     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
907     unsigned IdxTypeIdx = 2;
908 
909     getActionDefinitionsBuilder(Op)
910       .customIf([=](const LegalityQuery &Query) {
911           const LLT EltTy = Query.Types[EltTypeIdx];
912           const LLT VecTy = Query.Types[VecTypeIdx];
913           const LLT IdxTy = Query.Types[IdxTypeIdx];
914           return (EltTy.getSizeInBits() == 16 ||
915                   EltTy.getSizeInBits() % 32 == 0) &&
916                  VecTy.getSizeInBits() % 32 == 0 &&
917                  VecTy.getSizeInBits() <= 1024 &&
918                  IdxTy.getSizeInBits() == 32;
919         })
920       .clampScalar(EltTypeIdx, S32, S64)
921       .clampScalar(VecTypeIdx, S32, S64)
922       .clampScalar(IdxTypeIdx, S32, S32);
923   }
924 
925   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
926     .unsupportedIf([=](const LegalityQuery &Query) {
927         const LLT &EltTy = Query.Types[1].getElementType();
928         return Query.Types[0] != EltTy;
929       });
930 
931   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
932     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
933     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
934 
935     // FIXME: Doesn't handle extract of illegal sizes.
936     getActionDefinitionsBuilder(Op)
937       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
938       // FIXME: Multiples of 16 should not be legal.
939       .legalIf([=](const LegalityQuery &Query) {
940           const LLT BigTy = Query.Types[BigTyIdx];
941           const LLT LitTy = Query.Types[LitTyIdx];
942           return (BigTy.getSizeInBits() % 32 == 0) &&
943                  (LitTy.getSizeInBits() % 16 == 0);
944         })
945       .widenScalarIf(
946         [=](const LegalityQuery &Query) {
947           const LLT BigTy = Query.Types[BigTyIdx];
948           return (BigTy.getScalarSizeInBits() < 16);
949         },
950         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
951       .widenScalarIf(
952         [=](const LegalityQuery &Query) {
953           const LLT LitTy = Query.Types[LitTyIdx];
954           return (LitTy.getScalarSizeInBits() < 16);
955         },
956         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
957       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
958       .widenScalarToNextPow2(BigTyIdx, 32);
959 
960   }
961 
962   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
963     .legalForCartesianProduct(AllS32Vectors, {S32})
964     .legalForCartesianProduct(AllS64Vectors, {S64})
965     .clampNumElements(0, V16S32, V32S32)
966     .clampNumElements(0, V2S64, V16S64)
967     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
968 
969   if (ST.hasScalarPackInsts())
970     BuildVector.legalFor({V2S16, S32});
971 
972   BuildVector
973     .minScalarSameAs(1, 0)
974     .legalIf(isRegisterType(0))
975     .minScalarOrElt(0, S32);
976 
977   if (ST.hasScalarPackInsts()) {
978     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
979       .legalFor({V2S16, S32})
980       .lower();
981   } else {
982     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
983       .lower();
984   }
985 
986   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
987     .legalIf(isRegisterType(0));
988 
989   // TODO: Don't fully scalarize v2s16 pieces
990   getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
991 
992   // Merge/Unmerge
993   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
994     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
995     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
996 
997     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
998       const LLT &Ty = Query.Types[TypeIdx];
999       if (Ty.isVector()) {
1000         const LLT &EltTy = Ty.getElementType();
1001         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
1002           return true;
1003         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1004           return true;
1005       }
1006       return false;
1007     };
1008 
1009     auto &Builder = getActionDefinitionsBuilder(Op)
1010       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1011       // Clamp the little scalar to s16-s256 and make it a power of 2. It's not
1012       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1013       // valid.
1014       .clampScalar(LitTyIdx, S16, S256)
1015       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1016       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1017       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1018                            elementTypeIs(1, S16)),
1019                        changeTo(1, V2S16))
1020       // Break up vectors with weird elements into scalars
1021       .fewerElementsIf(
1022         [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
1023         scalarize(0))
1024       .fewerElementsIf(
1025         [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
1026         scalarize(1))
1027       .clampScalar(BigTyIdx, S32, S1024)
1028       .lowerFor({{S16, V2S16}});
1029 
1030     if (Op == G_MERGE_VALUES) {
1031       Builder.widenScalarIf(
1032         // TODO: Use 16-bit shifts if legal for 8-bit values?
1033         [=](const LegalityQuery &Query) {
1034           const LLT Ty = Query.Types[LitTyIdx];
1035           return Ty.getSizeInBits() < 32;
1036         },
1037         changeTo(LitTyIdx, S32));
1038     }
1039 
1040     Builder.widenScalarIf(
1041       [=](const LegalityQuery &Query) {
1042         const LLT Ty = Query.Types[BigTyIdx];
1043         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1044           Ty.getSizeInBits() % 16 != 0;
1045       },
1046       [=](const LegalityQuery &Query) {
1047         // Pick the next power of 2, or a multiple of 64 for sizes over 128 bits,
1048         // whichever is smaller.
1049         const LLT &Ty = Query.Types[BigTyIdx];
1050         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1051         if (NewSizeInBits >= 256) {
1052           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1053           if (RoundedTo < NewSizeInBits)
1054             NewSizeInBits = RoundedTo;
1055         }
1056         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1057       })
1058       .legalIf([=](const LegalityQuery &Query) {
1059           const LLT &BigTy = Query.Types[BigTyIdx];
1060           const LLT &LitTy = Query.Types[LitTyIdx];
1061 
1062           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1063             return false;
1064           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1065             return false;
1066 
1067           return BigTy.getSizeInBits() % 16 == 0 &&
1068                  LitTy.getSizeInBits() % 16 == 0 &&
1069                  BigTy.getSizeInBits() <= 1024;
1070         })
1071       // Any vectors left are the wrong size. Scalarize them.
1072       .scalarize(0)
1073       .scalarize(1);
1074   }
1075 
1076   getActionDefinitionsBuilder(G_SEXT_INREG).lower();
1077 
1078   computeTables();
1079   verify(*ST.getInstrInfo());
1080 }
1081 
1082 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
1083                                          MachineRegisterInfo &MRI,
1084                                          MachineIRBuilder &B,
1085                                          GISelChangeObserver &Observer) const {
1086   switch (MI.getOpcode()) {
1087   case TargetOpcode::G_ADDRSPACE_CAST:
1088     return legalizeAddrSpaceCast(MI, MRI, B);
1089   case TargetOpcode::G_FRINT:
1090     return legalizeFrint(MI, MRI, B);
1091   case TargetOpcode::G_FCEIL:
1092     return legalizeFceil(MI, MRI, B);
1093   case TargetOpcode::G_INTRINSIC_TRUNC:
1094     return legalizeIntrinsicTrunc(MI, MRI, B);
1095   case TargetOpcode::G_SITOFP:
1096     return legalizeITOFP(MI, MRI, B, true);
1097   case TargetOpcode::G_UITOFP:
1098     return legalizeITOFP(MI, MRI, B, false);
1099   case TargetOpcode::G_FMINNUM:
1100   case TargetOpcode::G_FMAXNUM:
1101   case TargetOpcode::G_FMINNUM_IEEE:
1102   case TargetOpcode::G_FMAXNUM_IEEE:
1103     return legalizeMinNumMaxNum(MI, MRI, B);
1104   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1105     return legalizeExtractVectorElt(MI, MRI, B);
1106   case TargetOpcode::G_INSERT_VECTOR_ELT:
1107     return legalizeInsertVectorElt(MI, MRI, B);
1108   case TargetOpcode::G_FSIN:
1109   case TargetOpcode::G_FCOS:
1110     return legalizeSinCos(MI, MRI, B);
1111   case TargetOpcode::G_GLOBAL_VALUE:
1112     return legalizeGlobalValue(MI, MRI, B);
1113   case TargetOpcode::G_LOAD:
1114     return legalizeLoad(MI, MRI, B, Observer);
1115   case TargetOpcode::G_FMAD:
1116     return legalizeFMad(MI, MRI, B);
1117   case TargetOpcode::G_FDIV:
1118     return legalizeFDIV(MI, MRI, B);
1119   default:
1120     return false;
1121   }
1122 
1123   llvm_unreachable("expected switch to return");
1124 }
1125 
1126 Register AMDGPULegalizerInfo::getSegmentAperture(
1127   unsigned AS,
1128   MachineRegisterInfo &MRI,
1129   MachineIRBuilder &B) const {
1130   MachineFunction &MF = B.getMF();
1131   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1132   const LLT S32 = LLT::scalar(32);
1133 
1134   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1135 
1136   if (ST.hasApertureRegs()) {
1137     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1138     // getreg.
1139     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1140         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1141         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1142     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1143         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1144         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1145     unsigned Encoding =
1146         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1147         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1148         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1149 
1150     Register ApertureReg = MRI.createGenericVirtualRegister(S32);
1151     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1152 
1153     B.buildInstr(AMDGPU::S_GETREG_B32)
1154       .addDef(GetReg)
1155       .addImm(Encoding);
1156     MRI.setType(GetReg, S32);
1157 
1158     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
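    // The aperture field (WidthM1 + 1 == 16 bits) comes back in the low bits
    // of GetReg; shift it left by the field width to form the 32-bit aperture
    // base.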
1159     B.buildInstr(TargetOpcode::G_SHL)
1160       .addDef(ApertureReg)
1161       .addUse(GetReg)
1162       .addUse(ShiftAmt.getReg(0));
1163 
1164     return ApertureReg;
1165   }
1166 
1167   Register QueuePtr = MRI.createGenericVirtualRegister(
1168     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1169 
1170   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1171   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1172     return Register();
1173 
1174   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1175   // private_segment_aperture_base_hi.
1176   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1177 
1178   // FIXME: Don't use undef
1179   Value *V = UndefValue::get(PointerType::get(
1180                                Type::getInt8Ty(MF.getFunction().getContext()),
1181                                AMDGPUAS::CONSTANT_ADDRESS));
1182 
1183   MachinePointerInfo PtrInfo(V, StructOffset);
1184   MachineMemOperand *MMO = MF.getMachineMemOperand(
1185     PtrInfo,
1186     MachineMemOperand::MOLoad |
1187     MachineMemOperand::MODereferenceable |
1188     MachineMemOperand::MOInvariant,
1189     4,
1190     MinAlign(64, StructOffset));
1191 
1192   Register LoadResult = MRI.createGenericVirtualRegister(S32);
1193   Register LoadAddr;
1194 
1195   B.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1196   B.buildLoad(LoadResult, LoadAddr, *MMO);
1197   return LoadResult;
1198 }
1199 
1200 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1201   MachineInstr &MI, MachineRegisterInfo &MRI,
1202   MachineIRBuilder &B) const {
1203   MachineFunction &MF = B.getMF();
1204 
1205   B.setInstr(MI);
1206 
1207   const LLT S32 = LLT::scalar(32);
1208   Register Dst = MI.getOperand(0).getReg();
1209   Register Src = MI.getOperand(1).getReg();
1210 
1211   LLT DstTy = MRI.getType(Dst);
1212   LLT SrcTy = MRI.getType(Src);
1213   unsigned DestAS = DstTy.getAddressSpace();
1214   unsigned SrcAS = SrcTy.getAddressSpace();
1215 
1216   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1217   // vector element.
1218   assert(!DstTy.isVector());
1219 
1220   const AMDGPUTargetMachine &TM
1221     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1222 
1223   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1224   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1225     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1226     return true;
1227   }
1228 
1229   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1230     // Truncate.
1231     B.buildExtract(Dst, Src, 0);
1232     MI.eraseFromParent();
1233     return true;
1234   }
1235 
1236   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1237     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1238     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1239 
1240     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1241     // another. Merge operands are required to be the same type, but creating an
1242     // extra ptrtoint would be kind of pointless.
1243     auto HighAddr = B.buildConstant(
1244       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1245     B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
1246     MI.eraseFromParent();
1247     return true;
1248   }
1249 
1250   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1251     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1252            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1253     unsigned NullVal = TM.getNullPointerValue(DestAS);
1254 
1255     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1256     auto FlatNull = B.buildConstant(SrcTy, 0);
1257 
1258     Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);
1259 
1260     // Extract low 32-bits of the pointer.
1261     B.buildExtract(PtrLo32, Src, 0);
1262 
1263     Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1264     B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
1265     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1266 
1267     MI.eraseFromParent();
1268     return true;
1269   }
1270 
1271   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1272     return false;
1273 
1274   if (!ST.hasFlatAddressSpace())
1275     return false;
1276 
1277   auto SegmentNull =
1278       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1279   auto FlatNull =
1280       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1281 
1282   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1283   if (!ApertureReg.isValid())
1284     return false;
1285 
1286   Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
1287   B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));
1288 
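  // Build the flat pointer as {segment offset (low 32 bits), aperture base
  // (high 32 bits)}; a null segment pointer is mapped to the flat null value
  // by the select below.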
1289   Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);
1290 
1291   // Coerce the type of the low half of the result so we can use merge_values.
1292   Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
1293   B.buildInstr(TargetOpcode::G_PTRTOINT)
1294     .addDef(SrcAsInt)
1295     .addUse(Src);
1296 
1297   // TODO: Should we allow mismatched types but matching sizes in merges to
1298   // avoid the ptrtoint?
1299   B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
1300   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));
1301 
1302   MI.eraseFromParent();
1303   return true;
1304 }
1305 
1306 bool AMDGPULegalizerInfo::legalizeFrint(
1307   MachineInstr &MI, MachineRegisterInfo &MRI,
1308   MachineIRBuilder &B) const {
1309   B.setInstr(MI);
1310 
1311   Register Src = MI.getOperand(1).getReg();
1312   LLT Ty = MRI.getType(Src);
1313   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1314 
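  // Round by adding and then subtracting 2^52 with the sign of Src: in double
  // precision this forces the fractional bits to be rounded away. Values with
  // |Src| > 0x1.fffffffffffffp+51 are already integers and are passed through
  // unchanged by the select below.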
1315   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1316   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1317 
1318   auto C1 = B.buildFConstant(Ty, C1Val);
1319   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1320 
1321   // TODO: Should this propagate fast-math-flags?
1322   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1323   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1324 
1325   auto C2 = B.buildFConstant(Ty, C2Val);
1326   auto Fabs = B.buildFAbs(Ty, Src);
1327 
1328   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1329   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1330   return true;
1331 }
1332 
1333 bool AMDGPULegalizerInfo::legalizeFceil(
1334   MachineInstr &MI, MachineRegisterInfo &MRI,
1335   MachineIRBuilder &B) const {
1336   B.setInstr(MI);
1337 
1338   const LLT S1 = LLT::scalar(1);
1339   const LLT S64 = LLT::scalar(64);
1340 
1341   Register Src = MI.getOperand(1).getReg();
1342   assert(MRI.getType(Src) == S64);
1343 
1344   // result = trunc(src)
1345   // if (src > 0.0 && src != result)
1346   //   result += 1.0
1347 
1348   auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});
1349 
1350   const auto Zero = B.buildFConstant(S64, 0.0);
1351   const auto One = B.buildFConstant(S64, 1.0);
1352   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1353   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1354   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1355   auto Add = B.buildSelect(S64, And, One, Zero);
1356 
1357   // TODO: Should this propagate fast-math-flags?
1358   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1359   return true;
1360 }
1361 
1362 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1363                                               MachineIRBuilder &B) {
1364   const unsigned FractBits = 52;
1365   const unsigned ExpBits = 11;
1366   LLT S32 = LLT::scalar(32);
1367 
1368   auto Const0 = B.buildConstant(S32, FractBits - 32);
1369   auto Const1 = B.buildConstant(S32, ExpBits);
1370 
1371   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi) // Source to extract the exponent field from.
1372     .addUse(Const0.getReg(0))
1373     .addUse(Const1.getReg(0));
1374 
1375   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1376 }
1377 
1378 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1379   MachineInstr &MI, MachineRegisterInfo &MRI,
1380   MachineIRBuilder &B) const {
1381   B.setInstr(MI);
1382 
1383   const LLT S1 = LLT::scalar(1);
1384   const LLT S32 = LLT::scalar(32);
1385   const LLT S64 = LLT::scalar(64);
1386 
1387   Register Src = MI.getOperand(1).getReg();
1388   assert(MRI.getType(Src) == S64);
1389 
1390   // TODO: Should this use extract since the low half is unused?
1391   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1392   Register Hi = Unmerge.getReg(1);
1393 
1394   // Extract the upper half, since this is where we will find the sign and
1395   // exponent.
1396   auto Exp = extractF64Exponent(Hi, B);
1397 
1398   const unsigned FractBits = 52;
1399 
1400   // Extract the sign bit.
1401   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1402   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1403 
1404   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1405 
1406   const auto Zero32 = B.buildConstant(S32, 0);
1407 
1408   // Extend back to 64-bits.
1409   auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});
1410 
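  // (FractMask >> Exp) covers the fraction bits that must be dropped for this
  // exponent, so Src & ~(FractMask >> Exp) truncates toward zero. Exp < 0 means
  // |Src| < 1, so only the sign bit is kept; Exp > 51 means Src is already an
  // integer.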
1411   auto Shr = B.buildAShr(S64, FractMask, Exp);
1412   auto Not = B.buildNot(S64, Shr);
1413   auto Tmp0 = B.buildAnd(S64, Src, Not);
1414   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1415 
1416   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1417   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1418 
1419   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1420   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1421   return true;
1422 }
1423 
1424 bool AMDGPULegalizerInfo::legalizeITOFP(
1425   MachineInstr &MI, MachineRegisterInfo &MRI,
1426   MachineIRBuilder &B, bool Signed) const {
1427   B.setInstr(MI);
1428 
1429   Register Dst = MI.getOperand(0).getReg();
1430   Register Src = MI.getOperand(1).getReg();
1431 
1432   const LLT S64 = LLT::scalar(64);
1433   const LLT S32 = LLT::scalar(32);
1434 
1435   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1436 
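  // Convert the two 32-bit halves separately and recombine as
  // fp(Hi) * 2^32 + fp(Lo); only the high-half conversion is signed.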
1437   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1438 
1439   auto CvtHi = Signed ?
1440     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1441     B.buildUITOFP(S64, Unmerge.getReg(1));
1442 
1443   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1444 
1445   auto ThirtyTwo = B.buildConstant(S32, 32);
1446   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1447     .addUse(CvtHi.getReg(0))
1448     .addUse(ThirtyTwo.getReg(0));
1449 
1450   // TODO: Should this propagate fast-math-flags?
1451   B.buildFAdd(Dst, LdExp, CvtLo);
1452   MI.eraseFromParent();
1453   return true;
1454 }
1455 
1456 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1457   MachineInstr &MI, MachineRegisterInfo &MRI,
1458   MachineIRBuilder &B) const {
1459   MachineFunction &MF = B.getMF();
1460   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1461 
1462   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1463                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1464 
  // With ieee_mode disabled, the instructions already have the correct
  // behavior for G_FMINNUM/G_FMAXNUM.
1467   if (!MFI->getMode().IEEE)
1468     return !IsIEEEOp;
1469 
1470   if (IsIEEEOp)
1471     return true;
1472 
1473   MachineIRBuilder HelperBuilder(MI);
1474   GISelObserverWrapper DummyObserver;
1475   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1476   HelperBuilder.setInstr(MI);
1477   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1478 }
1479 
1480 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1481   MachineInstr &MI, MachineRegisterInfo &MRI,
1482   MachineIRBuilder &B) const {
1483   // TODO: Should move some of this into LegalizerHelper.
1484 
1485   // TODO: Promote dynamic indexing of s16 to s32
1486   // TODO: Dynamic s64 indexing is only legal for SGPR.
1487   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
1488   if (!IdxVal) // Dynamic case will be selected to register indexing.
1489     return true;
1490 
1491   Register Dst = MI.getOperand(0).getReg();
1492   Register Vec = MI.getOperand(1).getReg();
1493 
1494   LLT VecTy = MRI.getType(Vec);
1495   LLT EltTy = VecTy.getElementType();
1496   assert(EltTy == MRI.getType(Dst));
1497 
1498   B.setInstr(MI);
1499 
1500   if (IdxVal.getValue() < VecTy.getNumElements())
1501     B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
1502   else
1503     B.buildUndef(Dst);
1504 
1505   MI.eraseFromParent();
1506   return true;
1507 }
1508 
1509 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1510   MachineInstr &MI, MachineRegisterInfo &MRI,
1511   MachineIRBuilder &B) const {
1512   // TODO: Should move some of this into LegalizerHelper.
1513 
1514   // TODO: Promote dynamic indexing of s16 to s32
1515   // TODO: Dynamic s64 indexing is only legal for SGPR.
1516   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
1517   if (!IdxVal) // Dynamic case will be selected to register indexing.
1518     return true;
1519 
1520   Register Dst = MI.getOperand(0).getReg();
1521   Register Vec = MI.getOperand(1).getReg();
1522   Register Ins = MI.getOperand(2).getReg();
1523 
1524   LLT VecTy = MRI.getType(Vec);
1525   LLT EltTy = VecTy.getElementType();
1526   assert(EltTy == MRI.getType(Ins));
1527 
1528   B.setInstr(MI);
1529 
1530   if (IdxVal.getValue() < VecTy.getNumElements())
1531     B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
1532   else
1533     B.buildUndef(Dst);
1534 
1535   MI.eraseFromParent();
1536   return true;
1537 }
1538 
1539 bool AMDGPULegalizerInfo::legalizeSinCos(
1540   MachineInstr &MI, MachineRegisterInfo &MRI,
1541   MachineIRBuilder &B) const {
1542   B.setInstr(MI);
1543 
1544   Register DstReg = MI.getOperand(0).getReg();
1545   Register SrcReg = MI.getOperand(1).getReg();
1546   LLT Ty = MRI.getType(DstReg);
1547   unsigned Flags = MI.getFlags();
1548 
1549   Register TrigVal;
1550   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
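  // The hardware sin/cos intrinsics take their input scaled by 1/(2*pi),
  // i.e. in fractions of a full period rather than radians, so pre-multiply
  // the source. Subtargets with a reduced trig input range additionally wrap
  // the scaled value into [0, 1) with amdgcn.fract.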
1551   if (ST.hasTrigReducedRange()) {
1552     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1553     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1554       .addUse(MulVal.getReg(0))
1555       .setMIFlags(Flags).getReg(0);
1556   } else
1557     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
1558 
1559   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
1560     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
1561   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
1562     .addUse(TrigVal)
1563     .setMIFlags(Flags);
1564   MI.eraseFromParent();
1565   return true;
1566 }
1567 
1568 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
1569   Register DstReg, LLT PtrTy,
1570   MachineIRBuilder &B, const GlobalValue *GV,
1571   unsigned Offset, unsigned GAFlags) const {
1572   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
1573   // to the following code sequence:
1574   //
1575   // For constant address space:
1576   //   s_getpc_b64 s[0:1]
1577   //   s_add_u32 s0, s0, $symbol
1578   //   s_addc_u32 s1, s1, 0
1579   //
1580   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1581   //   a fixup or relocation is emitted to replace $symbol with a literal
1582   //   constant, which is a pc-relative offset from the encoding of the $symbol
1583   //   operand to the global variable.
1584   //
1585   // For global address space:
1586   //   s_getpc_b64 s[0:1]
1587   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
1588   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
1589   //
1590   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
1591   //   fixups or relocations are emitted to replace $symbol@*@lo and
1592   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
1593   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
1594   //   operand to the global variable.
1595   //
1596   // What we want here is an offset from the value returned by s_getpc
1597   // (which is the address of the s_add_u32 instruction) to the global
1598   // variable, but since the encoding of $symbol starts 4 bytes after the start
1599   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
1600   // small. This requires us to add 4 to the global variable offset in order to
1601   // compute the correct address.
1602 
1603   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1604 
1605   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
1606     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
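  // For a 32-bit destination pointer, the full 64-bit PC-relative address is
  // still materialized in a temporary 64-bit SGPR pair and the low 32 bits
  // are extracted into DstReg below.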
1607 
1608   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
1609     .addDef(PCReg);
1610 
1611   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
1612   if (GAFlags == SIInstrInfo::MO_NONE)
1613     MIB.addImm(0);
1614   else
1615     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
1616 
1617   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
1618 
1619   if (PtrTy.getSizeInBits() == 32)
1620     B.buildExtract(DstReg, PCReg, 0);
1621   return true;
}
1623 
1624 bool AMDGPULegalizerInfo::legalizeGlobalValue(
1625   MachineInstr &MI, MachineRegisterInfo &MRI,
1626   MachineIRBuilder &B) const {
1627   Register DstReg = MI.getOperand(0).getReg();
1628   LLT Ty = MRI.getType(DstReg);
1629   unsigned AS = Ty.getAddressSpace();
1630 
1631   const GlobalValue *GV = MI.getOperand(1).getGlobal();
1632   MachineFunction &MF = B.getMF();
1633   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1634   B.setInstr(MI);
1635 
1636   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1637     if (!MFI->isEntryFunction()) {
1638       const Function &Fn = MF.getFunction();
1639       DiagnosticInfoUnsupported BadLDSDecl(
1640         Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
1641       Fn.getContext().diagnose(BadLDSDecl);
1642     }
1643 
1644     // TODO: We could emit code to handle the initialization somewhere.
1645     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
1646       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
1647       MI.eraseFromParent();
1648       return true;
1649     }
1650 
1651     const Function &Fn = MF.getFunction();
1652     DiagnosticInfoUnsupported BadInit(
1653       Fn, "unsupported initializer for address space", MI.getDebugLoc());
1654     Fn.getContext().diagnose(BadInit);
1655     return true;
1656   }
1657 
1658   const SITargetLowering *TLI = ST.getTargetLowering();
1659 
1660   if (TLI->shouldEmitFixup(GV)) {
1661     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
1662     MI.eraseFromParent();
1663     return true;
1664   }
1665 
1666   if (TLI->shouldEmitPCReloc(GV)) {
1667     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
1668     MI.eraseFromParent();
1669     return true;
1670   }
1671 
1672   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1673   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
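  // Globals that are not emitted with a direct PC-relative relocation are
  // resolved through the GOT: form a PC-relative address of the GOT entry
  // and load the global's actual address from it.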
1674 
1675   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
1676     MachinePointerInfo::getGOT(MF),
1677     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1678     MachineMemOperand::MOInvariant,
1679     8 /*Size*/, 8 /*Align*/);
1680 
1681   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
1682 
1683   if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
1685     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
1686     B.buildExtract(DstReg, Load, 0);
1687   } else
1688     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
1689 
1690   MI.eraseFromParent();
1691   return true;
1692 }
1693 
1694 bool AMDGPULegalizerInfo::legalizeLoad(
1695   MachineInstr &MI, MachineRegisterInfo &MRI,
1696   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
1697   B.setInstr(MI);
1698   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
1699   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
1700   Observer.changingInstr(MI);
1701   MI.getOperand(1).setReg(Cast.getReg(0));
1702   Observer.changedInstr(MI);
1703   return true;
1704 }
1705 
1706 bool AMDGPULegalizerInfo::legalizeFMad(
1707   MachineInstr &MI, MachineRegisterInfo &MRI,
1708   MachineIRBuilder &B) const {
1709   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
1710   assert(Ty.isScalar());
1711 
1712   // TODO: Always legal with future ftz flag.
1713   if (Ty == LLT::scalar(32) && !ST.hasFP32Denormals())
1714     return true;
1715   if (Ty == LLT::scalar(16) && !ST.hasFP16Denormals())
1716     return true;
1717 
1718   MachineFunction &MF = B.getMF();
1719 
1720   MachineIRBuilder HelperBuilder(MI);
1721   GISelObserverWrapper DummyObserver;
1722   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1723   HelperBuilder.setMBB(*MI.getParent());
1724   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
1725 }
1726 
// Return the use branch instruction, or null if the usage is invalid.
1728 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
1729                                        MachineRegisterInfo &MRI) {
1730   Register CondDef = MI.getOperand(0).getReg();
1731   if (!MRI.hasOneNonDBGUse(CondDef))
1732     return nullptr;
1733 
1734   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
1735   return UseMI.getParent() == MI.getParent() &&
1736     UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
1737 }
1738 
1739 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
1740                                                 Register Reg, LLT Ty) const {
1741   Register LiveIn = MRI.getLiveInVirtReg(Reg);
1742   if (LiveIn)
1743     return LiveIn;
1744 
1745   Register NewReg = MRI.createGenericVirtualRegister(Ty);
1746   MRI.addLiveIn(Reg, NewReg);
1747   return NewReg;
1748 }
1749 
1750 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
1751                                          const ArgDescriptor *Arg) const {
1752   if (!Arg->isRegister() || !Arg->getRegister().isValid())
1753     return false; // TODO: Handle these
1754 
1755   assert(Arg->getRegister().isPhysical());
1756 
1757   MachineRegisterInfo &MRI = *B.getMRI();
1758 
1759   LLT Ty = MRI.getType(DstReg);
1760   Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
1761 
1762   if (Arg->isMasked()) {
1763     // TODO: Should we try to emit this once in the entry block?
1764     const LLT S32 = LLT::scalar(32);
1765     const unsigned Mask = Arg->getMask();
1766     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
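    // For example, a mask of 0xFFC00 selects a 10-bit field starting at bit
    // 10: Shift = 10, the live-in value is logically shifted right by 10 and
    // then ANDed with Mask >> Shift = 0x3FF.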
1767 
1768     Register AndMaskSrc = LiveIn;
1769 
1770     if (Shift != 0) {
1771       auto ShiftAmt = B.buildConstant(S32, Shift);
1772       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
1773     }
1774 
1775     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
1776   } else
1777     B.buildCopy(DstReg, LiveIn);
1778 
  // Insert the argument copy if it doesn't already exist.
1780   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
1781   if (!MRI.getVRegDef(LiveIn)) {
1782     // FIXME: Should have scoped insert pt
1783     MachineBasicBlock &OrigInsBB = B.getMBB();
1784     auto OrigInsPt = B.getInsertPt();
1785 
1786     MachineBasicBlock &EntryMBB = B.getMF().front();
1787     EntryMBB.addLiveIn(Arg->getRegister());
1788     B.setInsertPt(EntryMBB, EntryMBB.begin());
1789     B.buildCopy(LiveIn, Arg->getRegister());
1790 
1791     B.setInsertPt(OrigInsBB, OrigInsPt);
1792   }
1793 
1794   return true;
1795 }
1796 
1797 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
1798   MachineInstr &MI,
1799   MachineRegisterInfo &MRI,
1800   MachineIRBuilder &B,
1801   AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
1802   B.setInstr(MI);
1803 
1804   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1805 
1806   const ArgDescriptor *Arg;
1807   const TargetRegisterClass *RC;
1808   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
1809   if (!Arg) {
1810     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
1811     return false;
1812   }
1813 
1814   if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
1815     MI.eraseFromParent();
1816     return true;
1817   }
1818 
1819   return false;
1820 }
1821 
1822 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
1823                                        MachineRegisterInfo &MRI,
1824                                        MachineIRBuilder &B) const {
1825   B.setInstr(MI);
1826 
1827   if (legalizeFastUnsafeFDIV(MI, MRI, B))
1828     return true;
1829 
1830   return false;
1831 }
1832 
1833 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
1834                                                  MachineRegisterInfo &MRI,
1835                                                  MachineIRBuilder &B) const {
1836   Register Res = MI.getOperand(0).getReg();
1837   Register LHS = MI.getOperand(1).getReg();
1838   Register RHS = MI.getOperand(2).getReg();
1839 
1840   uint16_t Flags = MI.getFlags();
1841 
1842   LLT ResTy = MRI.getType(Res);
1843   LLT S32 = LLT::scalar(32);
1844   LLT S64 = LLT::scalar(64);
1845 
1846   const MachineFunction &MF = B.getMF();
1847   bool Unsafe =
1848     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
1849 
1850   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
1851     return false;
1852 
1853   if (!Unsafe && ResTy == S32 && ST.hasFP32Denormals())
1854     return false;
1855 
1856   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
1857     // 1 / x -> RCP(x)
1858     if (CLHS->isExactlyValue(1.0)) {
1859       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
1860         .addUse(RHS)
1861         .setMIFlags(Flags);
1862 
1863       MI.eraseFromParent();
1864       return true;
1865     }
1866 
1867     // -1 / x -> RCP( FNEG(x) )
1868     if (CLHS->isExactlyValue(-1.0)) {
1869       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
1870       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
1871         .addUse(FNeg.getReg(0))
1872         .setMIFlags(Flags);
1873 
1874       MI.eraseFromParent();
1875       return true;
1876     }
1877   }
1878 
1879   // x / y -> x * (1.0 / y)
1880   if (Unsafe) {
1881     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
1882       .addUse(RHS)
1883       .setMIFlags(Flags);
1884     B.buildFMul(Res, LHS, RCP, Flags);
1885 
1886     MI.eraseFromParent();
1887     return true;
1888   }
1889 
1890   return false;
1891 }
1892 
1893 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
1894                                                  MachineRegisterInfo &MRI,
1895                                                  MachineIRBuilder &B) const {
1896   B.setInstr(MI);
1897   Register Res = MI.getOperand(0).getReg();
1898   Register LHS = MI.getOperand(2).getReg();
1899   Register RHS = MI.getOperand(3).getReg();
1900   uint16_t Flags = MI.getFlags();
1901 
1902   LLT S32 = LLT::scalar(32);
1903   LLT S1 = LLT::scalar(1);
1904 
1905   auto Abs = B.buildFAbs(S32, RHS, Flags);
1906   const APFloat C0Val(1.0f);
1907 
1908   auto C0 = B.buildConstant(S32, 0x6f800000);
1909   auto C1 = B.buildConstant(S32, 0x2f800000);
1910   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
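  // 0x6f800000 is 2^96 and 0x2f800000 is 2^-32 as f32 bit patterns. If |RHS|
  // exceeds 2^96, the denominator is pre-scaled by 2^-32 so its reciprocal
  // does not underflow, and the same factor is applied to the final product;
  // the scales cancel: Sel * LHS * rcp(RHS * Sel) ~= LHS / RHS.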
1911 
1912   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
1913   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
1914 
1915   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
1916 
1917   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
1918     .addUse(Mul0.getReg(0))
1919     .setMIFlags(Flags);
1920 
1921   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
1922 
1923   B.buildFMul(Res, Sel, Mul1, Flags);
1924 
1925   MI.eraseFromParent();
1926   return true;
1927 }
1928 
1929 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
1930                                                  MachineRegisterInfo &MRI,
1931                                                  MachineIRBuilder &B) const {
1932   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1933   if (!MFI->isEntryFunction()) {
1934     return legalizePreloadedArgIntrin(MI, MRI, B,
1935                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
1936   }
1937 
1938   B.setInstr(MI);
1939 
1940   uint64_t Offset =
1941     ST.getTargetLowering()->getImplicitParameterOffset(
1942       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
1943   Register DstReg = MI.getOperand(0).getReg();
1944   LLT DstTy = MRI.getType(DstReg);
1945   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
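  // In an entry function the implicit argument pointer is the kernarg
  // segment pointer advanced past the explicit kernel arguments, so reuse
  // the preloaded kernarg pointer and add the computed byte offset.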
1946 
1947   const ArgDescriptor *Arg;
1948   const TargetRegisterClass *RC;
1949   std::tie(Arg, RC)
1950     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1951   if (!Arg)
1952     return false;
1953 
1954   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
1955   if (!loadInputValue(KernargPtrReg, B, Arg))
1956     return false;
1957 
1958   B.buildGEP(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
1959   MI.eraseFromParent();
1960   return true;
1961 }
1962 
1963 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
1964                                               MachineRegisterInfo &MRI,
1965                                               MachineIRBuilder &B,
1966                                               unsigned AddrSpace) const {
1967   B.setInstr(MI);
1968   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
1969   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
1970   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
1971   MI.eraseFromParent();
1972   return true;
1973 }
1974 
1975 /// Handle register layout difference for f16 images for some subtargets.
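/// On subtargets with unpacked D16 memory instructions each 16-bit element
/// occupies the low half of its own 32-bit register, so a vector of s16
/// values is rewritten here as a vector of any-extended s32 elements.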
1976 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
1977                                              MachineRegisterInfo &MRI,
1978                                              Register Reg) const {
1979   if (!ST.hasUnpackedD16VMem())
1980     return Reg;
1981 
1982   const LLT S16 = LLT::scalar(16);
1983   const LLT S32 = LLT::scalar(32);
1984   LLT StoreVT = MRI.getType(Reg);
1985   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
1986 
1987   auto Unmerge = B.buildUnmerge(S16, Reg);
1988 
1989   SmallVector<Register, 4> WideRegs;
1990   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
1991     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
1992 
1993   int NumElts = StoreVT.getNumElements();
1994 
1995   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
1996 }
1997 
1998 bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
1999                                                  MachineRegisterInfo &MRI,
2000                                                  MachineIRBuilder &B,
2001                                                  bool IsFormat) const {
2002   // TODO: Reject f16 format on targets where unsupported.
2003   Register VData = MI.getOperand(1).getReg();
2004   LLT Ty = MRI.getType(VData);
2005 
2006   B.setInstr(MI);
2007 
2008   const LLT S32 = LLT::scalar(32);
2009   const LLT S16 = LLT::scalar(16);
2010 
  // Fixup illegal register types for i8 and i16 stores.
2012   if (Ty == LLT::scalar(8) || Ty == S16) {
2013     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
2014     MI.getOperand(1).setReg(AnyExt);
2015     return true;
2016   }
2017 
2018   if (Ty.isVector()) {
2019     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
2020       if (IsFormat)
2021         MI.getOperand(1).setReg(handleD16VData(B, MRI, VData));
2022       return true;
2023     }
2024 
2025     return Ty.getElementType() == S32 && Ty.getNumElements() <= 4;
2026   }
2027 
2028   return Ty == S32;
2029 }
2030 
2031 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
2032                                             MachineRegisterInfo &MRI,
2033                                             MachineIRBuilder &B) const {
  // Replace the G_BRCOND use of the intrinsic with the exec-manipulating
  // branch pseudos.
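  // A valid use is a single G_BRCOND of the intrinsic's boolean result in
  // the same block; that pair is rewritten into a single SI_IF or SI_LOOP
  // pseudo which both manipulates EXEC and performs the branch.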
2035   switch (MI.getIntrinsicID()) {
2036   case Intrinsic::amdgcn_if: {
2037     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
2038       const SIRegisterInfo *TRI
2039         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
2040 
2041       B.setInstr(*BrCond);
2042       Register Def = MI.getOperand(1).getReg();
2043       Register Use = MI.getOperand(3).getReg();
2044       B.buildInstr(AMDGPU::SI_IF)
2045         .addDef(Def)
2046         .addUse(Use)
2047         .addMBB(BrCond->getOperand(1).getMBB());
2048 
2049       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
2050       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
2051       MI.eraseFromParent();
2052       BrCond->eraseFromParent();
2053       return true;
2054     }
2055 
2056     return false;
2057   }
2058   case Intrinsic::amdgcn_loop: {
2059     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
2060       const SIRegisterInfo *TRI
2061         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
2062 
2063       B.setInstr(*BrCond);
2064       Register Reg = MI.getOperand(2).getReg();
2065       B.buildInstr(AMDGPU::SI_LOOP)
2066         .addUse(Reg)
2067         .addMBB(BrCond->getOperand(1).getMBB());
2068       MI.eraseFromParent();
2069       BrCond->eraseFromParent();
2070       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
2071       return true;
2072     }
2073 
2074     return false;
2075   }
2076   case Intrinsic::amdgcn_kernarg_segment_ptr:
2077     return legalizePreloadedArgIntrin(
2078       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2079   case Intrinsic::amdgcn_implicitarg_ptr:
2080     return legalizeImplicitArgPtr(MI, MRI, B);
2081   case Intrinsic::amdgcn_workitem_id_x:
2082     return legalizePreloadedArgIntrin(MI, MRI, B,
2083                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
2084   case Intrinsic::amdgcn_workitem_id_y:
2085     return legalizePreloadedArgIntrin(MI, MRI, B,
2086                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
2087   case Intrinsic::amdgcn_workitem_id_z:
2088     return legalizePreloadedArgIntrin(MI, MRI, B,
2089                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
2090   case Intrinsic::amdgcn_workgroup_id_x:
2091     return legalizePreloadedArgIntrin(MI, MRI, B,
2092                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
2093   case Intrinsic::amdgcn_workgroup_id_y:
2094     return legalizePreloadedArgIntrin(MI, MRI, B,
2095                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
2096   case Intrinsic::amdgcn_workgroup_id_z:
2097     return legalizePreloadedArgIntrin(MI, MRI, B,
2098                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
2099   case Intrinsic::amdgcn_dispatch_ptr:
2100     return legalizePreloadedArgIntrin(MI, MRI, B,
2101                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
2102   case Intrinsic::amdgcn_queue_ptr:
2103     return legalizePreloadedArgIntrin(MI, MRI, B,
2104                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
2105   case Intrinsic::amdgcn_implicit_buffer_ptr:
2106     return legalizePreloadedArgIntrin(
2107       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
2108   case Intrinsic::amdgcn_dispatch_id:
2109     return legalizePreloadedArgIntrin(MI, MRI, B,
2110                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
2111   case Intrinsic::amdgcn_fdiv_fast:
2112     return legalizeFDIVFastIntrin(MI, MRI, B);
2113   case Intrinsic::amdgcn_is_shared:
2114     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
2115   case Intrinsic::amdgcn_is_private:
2116     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
2117   case Intrinsic::amdgcn_wavefrontsize: {
2118     B.setInstr(MI);
2119     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
2120     MI.eraseFromParent();
2121     return true;
2122   }
2123   case Intrinsic::amdgcn_raw_buffer_store:
2124     return legalizeRawBufferStore(MI, MRI, B, false);
2125   case Intrinsic::amdgcn_raw_buffer_store_format:
2126     return legalizeRawBufferStore(MI, MRI, B, true);
2127   default:
2128     return true;
2129   }
2130 
2131   return true;
2132 }
2133