xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (revision 47ce20aef1e636e601ef26a4bc7e05c64a000640)
1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPU.h"
15 #include "AMDGPULegalizerInfo.h"
16 #include "AMDGPUTargetMachine.h"
17 #include "SIMachineFunctionInfo.h"
18 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
19 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
20 #include "llvm/CodeGen/TargetOpcodes.h"
21 #include "llvm/CodeGen/ValueTypes.h"
22 #include "llvm/IR/DerivedTypes.h"
23 #include "llvm/IR/Type.h"
24 #include "llvm/Support/Debug.h"
25 
26 #define DEBUG_TYPE "amdgpu-legalinfo"
27 
28 using namespace llvm;
29 using namespace LegalizeActions;
30 using namespace LegalizeMutations;
31 using namespace LegalityPredicates;
32 
33 
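// Shared legality predicates and mutations used by the rules below.
// isMultiple32: scalars or vectors whose element size is a multiple of 32 bits
// and whose total size is at most MaxSize bits.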
34 static LegalityPredicate isMultiple32(unsigned TypeIdx,
35                                       unsigned MaxSize = 512) {
36   return [=](const LegalityQuery &Query) {
37     const LLT Ty = Query.Types[TypeIdx];
38     const LLT EltTy = Ty.getScalarType();
39     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
40   };
41 }
42 
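// Matches vectors with an odd number of sub-32-bit elements (e.g. v3s16);
// such types get padded with one extra element via oneMoreElement() below.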
43 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
44   return [=](const LegalityQuery &Query) {
45     const LLT Ty = Query.Types[TypeIdx];
46     return Ty.isVector() &&
47            Ty.getNumElements() % 2 != 0 &&
48            Ty.getElementType().getSizeInBits() < 32;
49   };
50 }
51 
52 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
53   return [=](const LegalityQuery &Query) {
54     const LLT Ty = Query.Types[TypeIdx];
55     const LLT EltTy = Ty.getElementType();
56     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
57   };
58 }
59 
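// Mutation that splits a vector wider than 64 bits into pieces of at most
// 64 bits each; used together with vectorWiderThan() for the bitwise ops.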
60 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
61   return [=](const LegalityQuery &Query) {
62     const LLT Ty = Query.Types[TypeIdx];
63     const LLT EltTy = Ty.getElementType();
64     unsigned Size = Ty.getSizeInBits();
65     unsigned Pieces = (Size + 63) / 64;
66     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
67     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
68   };
69 }
70 
71 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
72   return [=](const LegalityQuery &Query) {
73     const LLT QueryTy = Query.Types[TypeIdx];
74     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
75   };
76 }
77 
78 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
79   return [=](const LegalityQuery &Query) {
80     const LLT QueryTy = Query.Types[TypeIdx];
81     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
82   };
83 }
84 
85 // Any combination of 32/64/128/256-bit elements, multiples of v2s16, and
86 // scalars that are a multiple of 32 bits up to 512 bits.
87 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
88   return [=](const LegalityQuery &Query) {
89     const LLT Ty = Query.Types[TypeIdx];
90     if (Ty.isVector()) {
91       const int EltSize = Ty.getElementType().getSizeInBits();
92       return EltSize == 32 || EltSize == 64 ||
93             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
94              EltSize == 128 || EltSize == 256;
95     }
96 
97     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 512;
98   };
99 }
100 
101 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
102                                          const GCNTargetMachine &TM)
103   :  ST(ST_) {
104   using namespace TargetOpcode;
105 
106   auto GetAddrSpacePtr = [&TM](unsigned AS) {
107     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
108   };
109 
110   const LLT S1 = LLT::scalar(1);
111   const LLT S8 = LLT::scalar(8);
112   const LLT S16 = LLT::scalar(16);
113   const LLT S32 = LLT::scalar(32);
114   const LLT S64 = LLT::scalar(64);
115   const LLT S128 = LLT::scalar(128);
116   const LLT S256 = LLT::scalar(256);
117   const LLT S512 = LLT::scalar(512);
118 
119   const LLT V2S16 = LLT::vector(2, 16);
120   const LLT V4S16 = LLT::vector(4, 16);
121 
122   const LLT V2S32 = LLT::vector(2, 32);
123   const LLT V3S32 = LLT::vector(3, 32);
124   const LLT V4S32 = LLT::vector(4, 32);
125   const LLT V5S32 = LLT::vector(5, 32);
126   const LLT V6S32 = LLT::vector(6, 32);
127   const LLT V7S32 = LLT::vector(7, 32);
128   const LLT V8S32 = LLT::vector(8, 32);
129   const LLT V9S32 = LLT::vector(9, 32);
130   const LLT V10S32 = LLT::vector(10, 32);
131   const LLT V11S32 = LLT::vector(11, 32);
132   const LLT V12S32 = LLT::vector(12, 32);
133   const LLT V13S32 = LLT::vector(13, 32);
134   const LLT V14S32 = LLT::vector(14, 32);
135   const LLT V15S32 = LLT::vector(15, 32);
136   const LLT V16S32 = LLT::vector(16, 32);
137 
138   const LLT V2S64 = LLT::vector(2, 64);
139   const LLT V3S64 = LLT::vector(3, 64);
140   const LLT V4S64 = LLT::vector(4, 64);
141   const LLT V5S64 = LLT::vector(5, 64);
142   const LLT V6S64 = LLT::vector(6, 64);
143   const LLT V7S64 = LLT::vector(7, 64);
144   const LLT V8S64 = LLT::vector(8, 64);
145 
146   std::initializer_list<LLT> AllS32Vectors =
147     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
148      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32};
149   std::initializer_list<LLT> AllS64Vectors =
150     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64};
151 
152   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
153   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
154   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
155   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
156   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
157 
158   const LLT CodePtr = FlatPtr;
159 
160   const std::initializer_list<LLT> AddrSpaces64 = {
161     GlobalPtr, ConstantPtr, FlatPtr
162   };
163 
164   const std::initializer_list<LLT> AddrSpaces32 = {
165     LocalPtr, PrivatePtr
166   };
167 
168   const std::initializer_list<LLT> FPTypesBase = {
169     S32, S64
170   };
171 
172   const std::initializer_list<LLT> FPTypes16 = {
173     S32, S64, S16
174   };
175 
176   const std::initializer_list<LLT> FPTypesPK16 = {
177     S32, S64, S16, V2S16
178   };
179 
180   setAction({G_BRCOND, S1}, Legal);
181 
182   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
183   // elements for v3s16
184   getActionDefinitionsBuilder(G_PHI)
185     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
186     .legalFor(AllS32Vectors)
187     .legalFor(AllS64Vectors)
188     .legalFor(AddrSpaces64)
189     .legalFor(AddrSpaces32)
190     .clampScalar(0, S32, S256)
191     .widenScalarToNextPow2(0, 32)
192     .clampMaxNumElements(0, S32, 16)
193     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
194     .legalIf(isPointer(0));
195 
196   if (ST.has16BitInsts()) {
197     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
198       .legalFor({S32, S16})
199       .clampScalar(0, S16, S32)
200       .scalarize(0);
201   } else {
202     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
203       .legalFor({S32})
204       .clampScalar(0, S32, S32)
205       .scalarize(0);
206   }
207 
208   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
209     .legalFor({S32})
210     .clampScalar(0, S32, S32)
211     .scalarize(0);
212 
213   // Report legal for any types we can handle anywhere. For the cases only legal
214   // on the SALU, RegBankSelect will be able to re-legalize.
215   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
216     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
217     .clampScalar(0, S32, S64)
218     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
219     .fewerElementsIf(vectorWiderThan(0, 32), fewerEltsToSize64Vector(0))
220     .widenScalarToNextPow2(0)
221     .scalarize(0);
222 
223   getActionDefinitionsBuilder({G_UADDO, G_SADDO, G_USUBO, G_SSUBO,
224                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
225     .legalFor({{S32, S1}})
226     .clampScalar(0, S32, S32);
227 
228   getActionDefinitionsBuilder(G_BITCAST)
229     .legalForCartesianProduct({S32, V2S16})
230     .legalForCartesianProduct({S64, V2S32, V4S16})
231     .legalForCartesianProduct({V2S64, V4S32})
232     // Don't worry about the size constraint.
233     .legalIf(all(isPointer(0), isPointer(1)));
234 
235   if (ST.has16BitInsts()) {
236     getActionDefinitionsBuilder(G_FCONSTANT)
237       .legalFor({S32, S64, S16})
238       .clampScalar(0, S16, S64);
239   } else {
240     getActionDefinitionsBuilder(G_FCONSTANT)
241       .legalFor({S32, S64})
242       .clampScalar(0, S32, S64);
243   }
244 
245   getActionDefinitionsBuilder(G_IMPLICIT_DEF)
246     .legalFor({S1, S32, S64, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
247                ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
248     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
249     .clampScalarOrElt(0, S32, S512)
250     .legalIf(isMultiple32(0))
251     .widenScalarToNextPow2(0, 32)
252     .clampMaxNumElements(0, S32, 16);
253 
254 
255   // FIXME: i1 operands to intrinsics should always be legal, but other i1
256   // values may not be legal.  We need to figure out how to distinguish
257   // between these two scenarios.
258   getActionDefinitionsBuilder(G_CONSTANT)
259     .legalFor({S1, S32, S64, GlobalPtr,
260                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
261     .clampScalar(0, S32, S64)
262     .widenScalarToNextPow2(0)
263     .legalIf(isPointer(0));
264 
265   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
266 
267   auto &FPOpActions = getActionDefinitionsBuilder(
268     { G_FADD, G_FMUL, G_FNEG, G_FABS, G_FMA, G_FCANONICALIZE})
269     .legalFor({S32, S64});
270 
271   if (ST.has16BitInsts()) {
272     if (ST.hasVOP3PInsts())
273       FPOpActions.legalFor({S16, V2S16});
274     else
275       FPOpActions.legalFor({S16});
276   }
277 
278   auto &MinNumMaxNum = getActionDefinitionsBuilder({
279       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
280 
281   if (ST.hasVOP3PInsts()) {
282     MinNumMaxNum.customFor(FPTypesPK16)
283       .clampMaxNumElements(0, S16, 2)
284       .clampScalar(0, S16, S64)
285       .scalarize(0);
286   } else if (ST.has16BitInsts()) {
287     MinNumMaxNum.customFor(FPTypes16)
288       .clampScalar(0, S16, S64)
289       .scalarize(0);
290   } else {
291     MinNumMaxNum.customFor(FPTypesBase)
292       .clampScalar(0, S32, S64)
293       .scalarize(0);
294   }
295 
296   // TODO: Implement
297   getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
298 
299   if (ST.hasVOP3PInsts())
300     FPOpActions.clampMaxNumElements(0, S16, 2);
301   FPOpActions
302     .scalarize(0)
303     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
304 
305   if (ST.has16BitInsts()) {
306     getActionDefinitionsBuilder(G_FSQRT)
307       .legalFor({S32, S64, S16})
308       .scalarize(0)
309       .clampScalar(0, S16, S64);
310   } else {
311     getActionDefinitionsBuilder(G_FSQRT)
312       .legalFor({S32, S64})
313       .scalarize(0)
314       .clampScalar(0, S32, S64);
315   }
316 
317   getActionDefinitionsBuilder(G_FPTRUNC)
318     .legalFor({{S32, S64}, {S16, S32}})
319     .scalarize(0);
320 
321   getActionDefinitionsBuilder(G_FPEXT)
322     .legalFor({{S64, S32}, {S32, S16}})
323     .lowerFor({{S64, S16}}) // FIXME: Implement
324     .scalarize(0);
325 
326   // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
327   getActionDefinitionsBuilder(G_FCOPYSIGN).lower();
328 
329   getActionDefinitionsBuilder(G_FSUB)
330       // Use actual fsub instruction
331       .legalFor({S32})
332       // Must use fadd + fneg
333       .lowerFor({S64, S16, V2S16})
334       .scalarize(0)
335       .clampScalar(0, S32, S64);
336 
337   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
338     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
339                {S32, S1}, {S64, S1}, {S16, S1},
340                // FIXME: Hack
341                {S64, LLT::scalar(33)},
342                {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
343     .scalarize(0);
344 
345   getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
346     .legalFor({{S32, S32}, {S64, S32}})
347     .lowerFor({{S32, S64}})
348     .customFor({{S64, S64}})
349     .scalarize(0);
350 
351   getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
352     .legalFor({{S32, S32}, {S32, S64}})
353     .scalarize(0);
354 
355   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
356     .legalFor({S32, S64})
357     .scalarize(0);
358 
359   if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
360     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
361       .legalFor({S32, S64})
362       .clampScalar(0, S32, S64)
363       .scalarize(0);
364   } else {
365     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
366       .legalFor({S32})
367       .customFor({S64})
368       .clampScalar(0, S32, S64)
369       .scalarize(0);
370   }
371 
372   getActionDefinitionsBuilder(G_GEP)
373     .legalForCartesianProduct(AddrSpaces64, {S64})
374     .legalForCartesianProduct(AddrSpaces32, {S32})
375     .scalarize(0);
376 
377   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
378 
379   auto &CmpBuilder =
380     getActionDefinitionsBuilder(G_ICMP)
381     .legalForCartesianProduct(
382       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
383     .legalFor({{S1, S32}, {S1, S64}});
384   if (ST.has16BitInsts()) {
385     CmpBuilder.legalFor({{S1, S16}});
386   }
387 
388   CmpBuilder
389     .widenScalarToNextPow2(1)
390     .clampScalar(1, S32, S64)
391     .scalarize(0)
392     .legalIf(all(typeIs(0, S1), isPointer(1)));
393 
394   getActionDefinitionsBuilder(G_FCMP)
395     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
396     .widenScalarToNextPow2(1)
397     .clampScalar(1, S32, S64)
398     .scalarize(0);
399 
400   // FIXME: fexp, flog2, flog10 need to be custom lowered.
401   getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
402                                G_FLOG, G_FLOG2, G_FLOG10})
403     .legalFor({S32})
404     .scalarize(0);
405 
406   // The 64-bit versions produce 32-bit results, but only on the SALU.
407   getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
408                                G_CTTZ, G_CTTZ_ZERO_UNDEF,
409                                G_CTPOP})
410     .legalFor({{S32, S32}, {S32, S64}})
411     .clampScalar(0, S32, S32)
412     .clampScalar(1, S32, S64)
413     .scalarize(0)
414     .widenScalarToNextPow2(0, 32)
415     .widenScalarToNextPow2(1, 32);
416 
417   // TODO: Expand for > s32
418   getActionDefinitionsBuilder(G_BSWAP)
419     .legalFor({S32})
420     .clampScalar(0, S32, S32)
421     .scalarize(0);
422 
423   if (ST.has16BitInsts()) {
424     if (ST.hasVOP3PInsts()) {
425       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
426         .legalFor({S32, S16, V2S16})
427         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
428         .clampMaxNumElements(0, S16, 2)
429         .clampScalar(0, S16, S32)
430         .widenScalarToNextPow2(0)
431         .scalarize(0);
432     } else {
433       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
434         .legalFor({S32, S16})
435         .widenScalarToNextPow2(0)
436         .clampScalar(0, S16, S32)
437         .scalarize(0);
438     }
439   } else {
440     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
441       .legalFor({S32})
442       .clampScalar(0, S32, S32)
443       .widenScalarToNextPow2(0)
444       .scalarize(0);
445   }
446 
447   auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
448     return [=](const LegalityQuery &Query) {
449       return Query.Types[TypeIdx0].getSizeInBits() <
450              Query.Types[TypeIdx1].getSizeInBits();
451     };
452   };
453 
454   auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
455     return [=](const LegalityQuery &Query) {
456       return Query.Types[TypeIdx0].getSizeInBits() >
457              Query.Types[TypeIdx1].getSizeInBits();
458     };
459   };
460 
461   getActionDefinitionsBuilder(G_INTTOPTR)
462     // List the common cases
463     .legalForCartesianProduct(AddrSpaces64, {S64})
464     .legalForCartesianProduct(AddrSpaces32, {S32})
465     .scalarize(0)
466     // Accept any address space as long as the size matches
467     .legalIf(sameSize(0, 1))
468     .widenScalarIf(smallerThan(1, 0),
469       [](const LegalityQuery &Query) {
470         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
471       })
472     .narrowScalarIf(greaterThan(1, 0),
473       [](const LegalityQuery &Query) {
474         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
475       });
476 
477   getActionDefinitionsBuilder(G_PTRTOINT)
478     // List the common cases
479     .legalForCartesianProduct(AddrSpaces64, {S64})
480     .legalForCartesianProduct(AddrSpaces32, {S32})
481     .scalarize(0)
482     // Accept any address space as long as the size matches
483     .legalIf(sameSize(0, 1))
484     .widenScalarIf(smallerThan(0, 1),
485       [](const LegalityQuery &Query) {
486         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
487       })
488     .narrowScalarIf(
489       greaterThan(0, 1),
490       [](const LegalityQuery &Query) {
491         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
492       });
493 
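  // Address space casts need the segment aperture (from the aperture registers
  // or the queue pointer), so route them through legalizeCustom().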
494   if (ST.hasFlatAddressSpace()) {
495     getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
496       .scalarize(0)
497       .custom();
498   }
499 
500   // TODO: Should load to s16 be legal? Most loads extend to 32 bits, but we
501   // handle some operations by just promoting the register during
502   // selection. There are also d16 loads on GFX9+ which preserve the high bits.
503   getActionDefinitionsBuilder({G_LOAD, G_STORE})
504     .narrowScalarIf([](const LegalityQuery &Query) {
505         unsigned Size = Query.Types[0].getSizeInBits();
506         unsigned MemSize = Query.MMODescrs[0].SizeInBits;
507         return (Size > 32 && MemSize < Size);
508       },
509       [](const LegalityQuery &Query) {
510         return std::make_pair(0, LLT::scalar(32));
511       })
512     .fewerElementsIf([=](const LegalityQuery &Query) {
513         unsigned MemSize = Query.MMODescrs[0].SizeInBits;
514         return (MemSize == 96) &&
515                Query.Types[0].isVector() &&
516                !ST.hasDwordx3LoadStores();
517       },
518       [=](const LegalityQuery &Query) {
519         return std::make_pair(0, V2S32);
520       })
521     .legalIf([=](const LegalityQuery &Query) {
522         const LLT &Ty0 = Query.Types[0];
523 
524         unsigned Size = Ty0.getSizeInBits();
525         unsigned MemSize = Query.MMODescrs[0].SizeInBits;
526         if (Size < 32 || (Size > 32 && MemSize < Size))
527           return false;
528 
529         if (Ty0.isVector() && Size != MemSize)
530           return false;
531 
532         // TODO: Decompose private loads into 4-byte components.
533         // TODO: Illegal flat loads on SI
534         switch (MemSize) {
535         case 8:
536         case 16:
537           return Size == 32;
538         case 32:
539         case 64:
540         case 128:
541           return true;
542 
543         case 96:
544           return ST.hasDwordx3LoadStores();
545 
546         case 256:
547         case 512:
548           // TODO: Possibly support loads of i256 and i512. This will require
549           // adding i256 and i512 types to MVT in order to be able to use
550           // TableGen.
551           // TODO: Add support for other vector types; this will require
552           //       defining more value mappings for the new types.
553           return Ty0.isVector() && (Ty0.getScalarType().getSizeInBits() == 32 ||
554                                     Ty0.getScalarType().getSizeInBits() == 64);
555 
556         default:
557           return false;
558         }
559       })
560     .clampScalar(0, S32, S64);
561 
562 
563   // FIXME: Handle alignment requirements.
564   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
565     .legalForTypesWithMemDesc({
566         {S32, GlobalPtr, 8, 8},
567         {S32, GlobalPtr, 16, 8},
568         {S32, LocalPtr, 8, 8},
569         {S32, LocalPtr, 16, 8},
570         {S32, PrivatePtr, 8, 8},
571         {S32, PrivatePtr, 16, 8}});
572   if (ST.hasFlatAddressSpace()) {
573     ExtLoads.legalForTypesWithMemDesc({{S32, FlatPtr, 8, 8},
574                                        {S32, FlatPtr, 16, 8}});
575   }
576 
577   ExtLoads.clampScalar(0, S32, S32)
578           .widenScalarToNextPow2(0)
579           .unsupportedIfMemSizeNotPow2()
580           .lower();
581 
582   auto &Atomics = getActionDefinitionsBuilder(
583     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
584      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
585      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
586      G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG})
587     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
588                {S64, GlobalPtr}, {S64, LocalPtr}});
589   if (ST.hasFlatAddressSpace()) {
590     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
591   }
592 
593   // TODO: Pointer types, any 32-bit or 64-bit vector
594   getActionDefinitionsBuilder(G_SELECT)
595     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
596           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
597           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
598     .clampScalar(0, S16, S64)
599     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
600     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
601     .scalarize(1)
602     .clampMaxNumElements(0, S32, 2)
603     .clampMaxNumElements(0, LocalPtr, 2)
604     .clampMaxNumElements(0, PrivatePtr, 2)
605     .scalarize(0)
606     .widenScalarToNextPow2(0)
607     .legalIf(all(isPointer(0), typeIs(1, S1)));
608 
609   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
610   // be more flexible with the shift amount type.
611   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
612     .legalFor({{S32, S32}, {S64, S32}});
613   if (ST.has16BitInsts()) {
614     if (ST.hasVOP3PInsts()) {
615       Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
616             .clampMaxNumElements(0, S16, 2);
617     } else
618       Shifts.legalFor({{S16, S32}, {S16, S16}});
619 
620     Shifts.clampScalar(1, S16, S32);
621     Shifts.clampScalar(0, S16, S64);
622     Shifts.widenScalarToNextPow2(0, 16);
623   } else {
624     // Make sure we legalize the shift amount type first, as the general
625     // expansion for the shifted type will produce much worse code if it hasn't
626     // been truncated already.
627     Shifts.clampScalar(1, S32, S32);
628     Shifts.clampScalar(0, S32, S64);
629     Shifts.widenScalarToNextPow2(0, 32);
630   }
631   Shifts.scalarize(0);
632 
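  // Vector element access is marked custom and handled by
  // legalizeExtractVectorElt / legalizeInsertVectorElt: constant indices become
  // static extracts/inserts, while dynamic indices are left for register
  // indexing during selection.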
633   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
634     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
635     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
636     unsigned IdxTypeIdx = 2;
637 
638     getActionDefinitionsBuilder(Op)
639       .customIf([=](const LegalityQuery &Query) {
640           const LLT EltTy = Query.Types[EltTypeIdx];
641           const LLT VecTy = Query.Types[VecTypeIdx];
642           const LLT IdxTy = Query.Types[IdxTypeIdx];
643           return (EltTy.getSizeInBits() == 16 ||
644                   EltTy.getSizeInBits() % 32 == 0) &&
645                  VecTy.getSizeInBits() % 32 == 0 &&
646                  VecTy.getSizeInBits() <= 512 &&
647                  IdxTy.getSizeInBits() == 32;
648         })
649       .clampScalar(EltTypeIdx, S32, S64)
650       .clampScalar(VecTypeIdx, S32, S64)
651       .clampScalar(IdxTypeIdx, S32, S32);
652   }
653 
654   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
655     .unsupportedIf([=](const LegalityQuery &Query) {
656         const LLT &EltTy = Query.Types[1].getElementType();
657         return Query.Types[0] != EltTy;
658       });
659 
660   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
661     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
662     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
663 
664     // FIXME: Doesn't handle extract of illegal sizes.
665     getActionDefinitionsBuilder(Op)
666       .legalIf([=](const LegalityQuery &Query) {
667           const LLT BigTy = Query.Types[BigTyIdx];
668           const LLT LitTy = Query.Types[LitTyIdx];
669           return (BigTy.getSizeInBits() % 32 == 0) &&
670                  (LitTy.getSizeInBits() % 16 == 0);
671         })
672       .widenScalarIf(
673         [=](const LegalityQuery &Query) {
674           const LLT BigTy = Query.Types[BigTyIdx];
675           return (BigTy.getScalarSizeInBits() < 16);
676         },
677         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
678       .widenScalarIf(
679         [=](const LegalityQuery &Query) {
680           const LLT LitTy = Query.Types[LitTyIdx];
681           return (LitTy.getScalarSizeInBits() < 16);
682         },
683         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
684       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
685       .widenScalarToNextPow2(BigTyIdx, 32);
686 
687   }
688 
689   getActionDefinitionsBuilder(G_BUILD_VECTOR)
690       .legalForCartesianProduct(AllS32Vectors, {S32})
691       .legalForCartesianProduct(AllS64Vectors, {S64})
692       .clampNumElements(0, V16S32, V16S32)
693       .clampNumElements(0, V2S64, V8S64)
694       .minScalarSameAs(1, 0)
695       .legalIf(isRegisterType(0))
696       .minScalarOrElt(0, S32);
697 
698   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
699     .legalIf(isRegisterType(0));
700 
701   // Merge/Unmerge
702   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
703     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
704     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
705 
706     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
707       const LLT &Ty = Query.Types[TypeIdx];
708       if (Ty.isVector()) {
709         const LLT &EltTy = Ty.getElementType();
710         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
711           return true;
712         if (!isPowerOf2_32(EltTy.getSizeInBits()))
713           return true;
714       }
715       return false;
716     };
717 
718     getActionDefinitionsBuilder(Op)
719       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
720       // Clamp the little scalar to s16-s256 and make it a power of 2. It's not
721       // worth considering the multiples of 64 since 2*192 and 2*384 are not
722       // valid.
723       .clampScalar(LitTyIdx, S16, S256)
724       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
725 
726       // Break up vectors with weird elements into scalars
727       .fewerElementsIf(
728         [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
729         scalarize(0))
730       .fewerElementsIf(
731         [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
732         scalarize(1))
733       .clampScalar(BigTyIdx, S32, S512)
734       .widenScalarIf(
735         [=](const LegalityQuery &Query) {
736           const LLT &Ty = Query.Types[BigTyIdx];
737           return !isPowerOf2_32(Ty.getSizeInBits()) &&
738                  Ty.getSizeInBits() % 16 != 0;
739         },
740         [=](const LegalityQuery &Query) {
741           // Pick the next power of 2, or a multiple of 64 over 128, whichever
742           // is smaller.
743           const LLT &Ty = Query.Types[BigTyIdx];
744           unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
745           if (NewSizeInBits >= 256) {
746             unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
747             if (RoundedTo < NewSizeInBits)
748               NewSizeInBits = RoundedTo;
749           }
750           return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
751         })
752       .legalIf([=](const LegalityQuery &Query) {
753           const LLT &BigTy = Query.Types[BigTyIdx];
754           const LLT &LitTy = Query.Types[LitTyIdx];
755 
756           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
757             return false;
758           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
759             return false;
760 
761           return BigTy.getSizeInBits() % 16 == 0 &&
762                  LitTy.getSizeInBits() % 16 == 0 &&
763                  BigTy.getSizeInBits() <= 512;
764         })
765       // Any vectors left are the wrong size. Scalarize them.
766       .scalarize(0)
767       .scalarize(1);
768   }
769 
770   computeTables();
771   verify(*ST.getInstrInfo());
772 }
773 
774 bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
775                                          MachineRegisterInfo &MRI,
776                                          MachineIRBuilder &MIRBuilder,
777                                          GISelChangeObserver &Observer) const {
778   switch (MI.getOpcode()) {
779   case TargetOpcode::G_ADDRSPACE_CAST:
780     return legalizeAddrSpaceCast(MI, MRI, MIRBuilder);
781   case TargetOpcode::G_FRINT:
782     return legalizeFrint(MI, MRI, MIRBuilder);
783   case TargetOpcode::G_FCEIL:
784     return legalizeFceil(MI, MRI, MIRBuilder);
785   case TargetOpcode::G_INTRINSIC_TRUNC:
786     return legalizeIntrinsicTrunc(MI, MRI, MIRBuilder);
787   case TargetOpcode::G_SITOFP:
788     return legalizeITOFP(MI, MRI, MIRBuilder, true);
789   case TargetOpcode::G_UITOFP:
790     return legalizeITOFP(MI, MRI, MIRBuilder, false);
791   case TargetOpcode::G_FMINNUM:
792   case TargetOpcode::G_FMAXNUM:
793   case TargetOpcode::G_FMINNUM_IEEE:
794   case TargetOpcode::G_FMAXNUM_IEEE:
795     return legalizeMinNumMaxNum(MI, MRI, MIRBuilder);
796   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
797     return legalizeExtractVectorElt(MI, MRI, MIRBuilder);
798   case TargetOpcode::G_INSERT_VECTOR_ELT:
799     return legalizeInsertVectorElt(MI, MRI, MIRBuilder);
800   default:
801     return false;
802   }
803 
804   llvm_unreachable("expected switch to return");
805 }
806 
807 Register AMDGPULegalizerInfo::getSegmentAperture(
808   unsigned AS,
809   MachineRegisterInfo &MRI,
810   MachineIRBuilder &MIRBuilder) const {
811   MachineFunction &MF = MIRBuilder.getMF();
812   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
813   const LLT S32 = LLT::scalar(32);
814 
815   if (ST.hasApertureRegs()) {
816     // FIXME: Use inline constants (src_{shared, private}_base) instead of
817     // getreg.
818     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
819         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
820         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
821     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
822         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
823         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
824     unsigned Encoding =
825         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
826         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
827         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
828 
829     Register ApertureReg = MRI.createGenericVirtualRegister(S32);
830     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
831 
832     MIRBuilder.buildInstr(AMDGPU::S_GETREG_B32)
833       .addDef(GetReg)
834       .addImm(Encoding);
835     MRI.setType(GetReg, S32);
836 
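    // S_GETREG returns the aperture base shifted right by the field width;
    // shift it back up to recover the 32-bit high half of the aperture address.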
837     auto ShiftAmt = MIRBuilder.buildConstant(S32, WidthM1 + 1);
838     MIRBuilder.buildInstr(TargetOpcode::G_SHL)
839       .addDef(ApertureReg)
840       .addUse(GetReg)
841       .addUse(ShiftAmt.getReg(0));
842 
843     return ApertureReg;
844   }
845 
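  // No aperture registers: load the 32-bit aperture base from the queue
  // descriptor (amd_queue_t) via the queue pointer instead.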
846   Register QueuePtr = MRI.createGenericVirtualRegister(
847     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
848 
849   // FIXME: Placeholder until we can track the input registers.
850   MIRBuilder.buildConstant(QueuePtr, 0xdeadbeef);
851 
852   // Offset into amd_queue_t for group_segment_aperture_base_hi /
853   // private_segment_aperture_base_hi.
854   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
855 
856   // FIXME: Don't use undef
857   Value *V = UndefValue::get(PointerType::get(
858                                Type::getInt8Ty(MF.getFunction().getContext()),
859                                AMDGPUAS::CONSTANT_ADDRESS));
860 
861   MachinePointerInfo PtrInfo(V, StructOffset);
862   MachineMemOperand *MMO = MF.getMachineMemOperand(
863     PtrInfo,
864     MachineMemOperand::MOLoad |
865     MachineMemOperand::MODereferenceable |
866     MachineMemOperand::MOInvariant,
867     4,
868     MinAlign(64, StructOffset));
869 
870   Register LoadResult = MRI.createGenericVirtualRegister(S32);
871   Register LoadAddr;
872 
873   MIRBuilder.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
874   MIRBuilder.buildLoad(LoadResult, LoadAddr, *MMO);
875   return LoadResult;
876 }
877 
878 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
879   MachineInstr &MI, MachineRegisterInfo &MRI,
880   MachineIRBuilder &MIRBuilder) const {
881   MachineFunction &MF = MIRBuilder.getMF();
882 
883   MIRBuilder.setInstr(MI);
884 
885   Register Dst = MI.getOperand(0).getReg();
886   Register Src = MI.getOperand(1).getReg();
887 
888   LLT DstTy = MRI.getType(Dst);
889   LLT SrcTy = MRI.getType(Src);
890   unsigned DestAS = DstTy.getAddressSpace();
891   unsigned SrcAS = SrcTy.getAddressSpace();
892 
893   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
894   // vector element.
895   assert(!DstTy.isVector());
896 
897   const AMDGPUTargetMachine &TM
898     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
899 
900   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
901   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
902     MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BITCAST));
903     return true;
904   }
905 
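  // Flat -> local/private: the segment pointer is the low 32 bits of the flat
  // pointer, except that a null flat pointer must map to the segment's null
  // value.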
906   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
907     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
908            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
909     unsigned NullVal = TM.getNullPointerValue(DestAS);
910 
911     auto SegmentNull = MIRBuilder.buildConstant(DstTy, NullVal);
912     auto FlatNull = MIRBuilder.buildConstant(SrcTy, 0);
913 
914     Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);
915 
916     // Extract the low 32 bits of the pointer.
917     MIRBuilder.buildExtract(PtrLo32, Src, 0);
918 
919     Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
920     MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
921     MIRBuilder.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
922 
923     MI.eraseFromParent();
924     return true;
925   }
926 
927   assert(SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
928          SrcAS == AMDGPUAS::PRIVATE_ADDRESS);
929 
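  // Local/private -> flat: merge the 32-bit segment offset (low half) with the
  // segment aperture base (high half), mapping the segment null value to the
  // null flat pointer.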
930   auto SegmentNull =
931       MIRBuilder.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
932   auto FlatNull =
933       MIRBuilder.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
934 
935   Register ApertureReg = getSegmentAperture(DestAS, MRI, MIRBuilder);
936 
937   Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
938   MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));
939 
940   Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);
941 
942   // Coerce the type of the low half of the result so we can use merge_values.
943   Register SrcAsInt = MRI.createGenericVirtualRegister(LLT::scalar(32));
944   MIRBuilder.buildInstr(TargetOpcode::G_PTRTOINT)
945     .addDef(SrcAsInt)
946     .addUse(Src);
947 
948   // TODO: Should we allow mismatched types but matching sizes in merges to
949   // avoid the ptrtoint?
950   MIRBuilder.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
951   MIRBuilder.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));
952 
953   MI.eraseFromParent();
954   return true;
955 }
956 
957 bool AMDGPULegalizerInfo::legalizeFrint(
958   MachineInstr &MI, MachineRegisterInfo &MRI,
959   MachineIRBuilder &MIRBuilder) const {
960   MIRBuilder.setInstr(MI);
961 
962   Register Src = MI.getOperand(1).getReg();
963   LLT Ty = MRI.getType(Src);
964   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
965 
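  // Round using the 2^52 trick: adding and then subtracting 0x1.0p+52 (carrying
  // the sign of the source) leaves the value rounded to an integer, since
  // doubles of that magnitude have no fraction bits. Sources whose magnitude
  // already exceeds 0x1.fffffffffffffp+51 are integers and are passed through.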
966   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
967   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
968 
969   auto C1 = MIRBuilder.buildFConstant(Ty, C1Val);
970   auto CopySign = MIRBuilder.buildFCopysign(Ty, C1, Src);
971 
972   // TODO: Should this propagate fast-math-flags?
973   auto Tmp1 = MIRBuilder.buildFAdd(Ty, Src, CopySign);
974   auto Tmp2 = MIRBuilder.buildFSub(Ty, Tmp1, CopySign);
975 
976   auto C2 = MIRBuilder.buildFConstant(Ty, C2Val);
977   auto Fabs = MIRBuilder.buildFAbs(Ty, Src);
978 
979   auto Cond = MIRBuilder.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
980   MIRBuilder.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
981   return true;
982 }
983 
984 bool AMDGPULegalizerInfo::legalizeFceil(
985   MachineInstr &MI, MachineRegisterInfo &MRI,
986   MachineIRBuilder &B) const {
987   B.setInstr(MI);
988 
989   const LLT S1 = LLT::scalar(1);
990   const LLT S64 = LLT::scalar(64);
991 
992   Register Src = MI.getOperand(1).getReg();
993   assert(MRI.getType(Src) == S64);
994 
995   // result = trunc(src)
996   // if (src > 0.0 && src != result)
997   //   result += 1.0
998 
999   auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});
1000 
1001   const auto Zero = B.buildFConstant(S64, 0.0);
1002   const auto One = B.buildFConstant(S64, 1.0);
1003   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1004   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1005   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1006   auto Add = B.buildSelect(S64, And, One, Zero);
1007 
1008   // TODO: Should this propagate fast-math-flags?
1009   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1010   return true;
1011 }
1012 
1013 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1014                                               MachineIRBuilder &B) {
1015   const unsigned FractBits = 52;
1016   const unsigned ExpBits = 11;
1017   LLT S32 = LLT::scalar(32);
1018 
1019   auto Const0 = B.buildConstant(S32, FractBits - 32);
1020   auto Const1 = B.buildConstant(S32, ExpBits);
1021 
1022   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
    .addUse(Hi) // Source operand: the high word of the f64 (sign and exponent).
1023     .addUse(Const0.getReg(0))
1024     .addUse(Const1.getReg(0));
1025 
1026   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1027 }
1028 
1029 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1030   MachineInstr &MI, MachineRegisterInfo &MRI,
1031   MachineIRBuilder &B) const {
1032   B.setInstr(MI);
1033 
1034   const LLT S1 = LLT::scalar(1);
1035   const LLT S32 = LLT::scalar(32);
1036   const LLT S64 = LLT::scalar(64);
1037 
1038   Register Src = MI.getOperand(1).getReg();
1039   assert(MRI.getType(Src) == S64);
1040 
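  // Truncate toward zero by masking off the fraction bits that fall below the
  // exponent: a negative exponent leaves only a signed zero, exponents above 51
  // mean the value is already integral, and everything in between keeps just
  // the integer part of the mantissa.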
1041   // TODO: Should this use extract since the low half is unused?
1042   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1043   Register Hi = Unmerge.getReg(1);
1044 
1045   // Extract the upper half, since this is where we will find the sign and
1046   // exponent.
1047   auto Exp = extractF64Exponent(Hi, B);
1048 
1049   const unsigned FractBits = 52;
1050 
1051   // Extract the sign bit.
1052   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1053   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1054 
1055   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1056 
1057   const auto Zero32 = B.buildConstant(S32, 0);
1058 
1059   // Extend back to 64 bits.
1060   auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});
1061 
1062   auto Shr = B.buildAShr(S64, FractMask, Exp);
1063   auto Not = B.buildNot(S64, Shr);
1064   auto Tmp0 = B.buildAnd(S64, Src, Not);
1065   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1066 
1067   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1068   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1069 
1070   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1071   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1072   return true;
1073 }
1074 
1075 bool AMDGPULegalizerInfo::legalizeITOFP(
1076   MachineInstr &MI, MachineRegisterInfo &MRI,
1077   MachineIRBuilder &B, bool Signed) const {
1078   B.setInstr(MI);
1079 
1080   Register Dst = MI.getOperand(0).getReg();
1081   Register Src = MI.getOperand(1).getReg();
1082 
1083   const LLT S64 = LLT::scalar(64);
1084   const LLT S32 = LLT::scalar(32);
1085 
1086   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1087 
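  // Convert the 64-bit integer as two 32-bit halves: the high half (signed or
  // unsigned) is converted and scaled by 2^32 with ldexp, then the low half is
  // added in as an unsigned contribution.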
1088   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1089 
1090   auto CvtHi = Signed ?
1091     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1092     B.buildUITOFP(S64, Unmerge.getReg(1));
1093 
1094   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1095 
1096   auto ThirtyTwo = B.buildConstant(S32, 32);
1097   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1098     .addUse(CvtHi.getReg(0))
1099     .addUse(ThirtyTwo.getReg(0));
1100 
1101   // TODO: Should this propagate fast-math-flags?
1102   B.buildFAdd(Dst, LdExp, CvtLo);
1103   MI.eraseFromParent();
1104   return true;
1105 }
1106 
1107 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
1108   MachineInstr &MI, MachineRegisterInfo &MRI,
1109   MachineIRBuilder &B) const {
1110   MachineFunction &MF = B.getMF();
1111   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1112 
1113   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1114                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1115 
1116   // With ieee_mode disabled, the instructions have the correct behavior
1117   // already for G_FMINNUM/G_FMAXNUM
1118   if (!MFI->getMode().IEEE)
1119     return !IsIEEEOp;
1120 
1121   if (IsIEEEOp)
1122     return true;
1123 
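  // In IEEE mode the hardware min/max follow the IEEE-variant semantics, so the
  // plain G_FMINNUM/G_FMAXNUM are expanded by the generic LegalizerHelper
  // (presumably in terms of the IEEE opcodes with canonicalized inputs).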
1124   MachineIRBuilder HelperBuilder(MI);
1125   GISelObserverWrapper DummyObserver;
1126   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
1127   HelperBuilder.setMBB(*MI.getParent());
1128   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1129 }
1130 
1131 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1132   MachineInstr &MI, MachineRegisterInfo &MRI,
1133   MachineIRBuilder &B) const {
1134   // TODO: Should move some of this into LegalizerHelper.
1135 
1136   // TODO: Promote dynamic indexing of s16 to s32
1137   // TODO: Dynamic s64 indexing is only legal for SGPR.
1138   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
1139   if (!IdxVal) // Dynamic case will be selected to register indexing.
1140     return true;
1141 
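  // A constant index becomes a static G_EXTRACT at the element's bit offset; an
  // out-of-bounds index produces G_IMPLICIT_DEF.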
1142   Register Dst = MI.getOperand(0).getReg();
1143   Register Vec = MI.getOperand(1).getReg();
1144 
1145   LLT VecTy = MRI.getType(Vec);
1146   LLT EltTy = VecTy.getElementType();
1147   assert(EltTy == MRI.getType(Dst));
1148 
1149   B.setInstr(MI);
1150 
1151   if (IdxVal.getValue() < VecTy.getNumElements())
1152     B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
1153   else
1154     B.buildUndef(Dst);
1155 
1156   MI.eraseFromParent();
1157   return true;
1158 }
1159 
1160 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1161   MachineInstr &MI, MachineRegisterInfo &MRI,
1162   MachineIRBuilder &B) const {
1163   // TODO: Should move some of this into LegalizerHelper.
1164 
1165   // TODO: Promote dynamic indexing of s16 to s32
1166   // TODO: Dynamic s64 indexing is only legal for SGPR.
1167   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
1168   if (!IdxVal) // Dynamic case will be selected to register indexing.
1169     return true;
1170 
1171   Register Dst = MI.getOperand(0).getReg();
1172   Register Vec = MI.getOperand(1).getReg();
1173   Register Ins = MI.getOperand(2).getReg();
1174 
1175   LLT VecTy = MRI.getType(Vec);
1176   LLT EltTy = VecTy.getElementType();
1177   assert(EltTy == MRI.getType(Ins));
1178 
1179   B.setInstr(MI);
1180 
1181   if (IdxVal.getValue() < VecTy.getNumElements())
1182     B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
1183   else
1184     B.buildUndef(Dst);
1185 
1186   MI.eraseFromParent();
1187   return true;
1188 }
1189 
1190 // Return the use branch instruction, or null if the usage is invalid.
1191 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
1192                                        MachineRegisterInfo &MRI) {
1193   Register CondDef = MI.getOperand(0).getReg();
1194   if (!MRI.hasOneNonDBGUse(CondDef))
1195     return nullptr;
1196 
1197   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
1198   return UseMI.getParent() == MI.getParent() &&
1199     UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr;
1200 }
1201 
1202 Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
1203                                                 Register Reg, LLT Ty) const {
1204   Register LiveIn = MRI.getLiveInVirtReg(Reg);
1205   if (LiveIn)
1206     return LiveIn;
1207 
1208   Register NewReg = MRI.createGenericVirtualRegister(Ty);
1209   MRI.addLiveIn(Reg, NewReg);
1210   return NewReg;
1211 }
1212 
1213 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
1214                                          const ArgDescriptor *Arg) const {
1215   if (!Arg->isRegister())
1216     return false; // TODO: Handle these
1217 
1218   assert(Arg->getRegister() != 0);
1219   assert(Arg->getRegister().isPhysical());
1220 
1221   MachineRegisterInfo &MRI = *B.getMRI();
1222 
1223   LLT Ty = MRI.getType(DstReg);
1224   Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
1225 
1226   if (Arg->isMasked()) {
1227     // TODO: Should we try to emit this once in the entry block?
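    // A masked argument occupies a bit-field of the register: shift the field
    // down to bit 0 and mask off the bits above it.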
1228     const LLT S32 = LLT::scalar(32);
1229     const unsigned Mask = Arg->getMask();
1230     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
1231 
1232     auto ShiftAmt = B.buildConstant(S32, Shift);
1233     auto LShr = B.buildLShr(S32, LiveIn, ShiftAmt);
1234     B.buildAnd(DstReg, LShr, B.buildConstant(S32, Mask >> Shift));
1235   } else
1236     B.buildCopy(DstReg, LiveIn);
1237 
1238   // Insert the argument copy if it doesn't already exist.
1239   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
1240   if (!MRI.getVRegDef(LiveIn)) {
1241     MachineBasicBlock &EntryMBB = B.getMF().front();
1242     EntryMBB.addLiveIn(Arg->getRegister());
1243     B.setInsertPt(EntryMBB, EntryMBB.begin());
1244     B.buildCopy(LiveIn, Arg->getRegister());
1245   }
1246 
1247   return true;
1248 }
1249 
1250 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
1251   MachineInstr &MI,
1252   MachineRegisterInfo &MRI,
1253   MachineIRBuilder &B,
1254   AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
1255   B.setInstr(MI);
1256 
1257   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1258 
1259   const ArgDescriptor *Arg;
1260   const TargetRegisterClass *RC;
1261   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
1262   if (!Arg) {
1263     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
1264     return false;
1265   }
1266 
1267   if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
1268     MI.eraseFromParent();
1269     return true;
1270   }
1271 
1272   return false;
1273 }
1274 
1275 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
1276                                                  MachineRegisterInfo &MRI,
1277                                                  MachineIRBuilder &B) const {
1278   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
1279   if (!MFI->isEntryFunction()) {
1280     return legalizePreloadedArgIntrin(MI, MRI, B,
1281                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
1282   }
1283 
1284   B.setInstr(MI);
1285 
1286   uint64_t Offset =
1287     ST.getTargetLowering()->getImplicitParameterOffset(
1288       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
1289   Register DstReg = MI.getOperand(0).getReg();
1290   LLT DstTy = MRI.getType(DstReg);
1291   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
1292 
1293   const ArgDescriptor *Arg;
1294   const TargetRegisterClass *RC;
1295   std::tie(Arg, RC)
1296     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1297   if (!Arg)
1298     return false;
1299 
1300   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
1301   if (!loadInputValue(KernargPtrReg, B, Arg))
1302     return false;
1303 
1304   B.buildGEP(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
1305   MI.eraseFromParent();
1306   return true;
1307 }
1308 
1309 bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
1310                                             MachineRegisterInfo &MRI,
1311                                             MachineIRBuilder &B) const {
1312   // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
1313   switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
1314   case Intrinsic::amdgcn_if: {
1315     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
1316       const SIRegisterInfo *TRI
1317         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
1318 
1319       B.setInstr(*BrCond);
1320       Register Def = MI.getOperand(1).getReg();
1321       Register Use = MI.getOperand(3).getReg();
1322       B.buildInstr(AMDGPU::SI_IF)
1323         .addDef(Def)
1324         .addUse(Use)
1325         .addMBB(BrCond->getOperand(1).getMBB());
1326 
1327       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
1328       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
1329       MI.eraseFromParent();
1330       BrCond->eraseFromParent();
1331       return true;
1332     }
1333 
1334     return false;
1335   }
1336   case Intrinsic::amdgcn_loop: {
1337     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
1338       const SIRegisterInfo *TRI
1339         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
1340 
1341       B.setInstr(*BrCond);
1342       Register Reg = MI.getOperand(2).getReg();
1343       B.buildInstr(AMDGPU::SI_LOOP)
1344         .addUse(Reg)
1345         .addMBB(BrCond->getOperand(1).getMBB());
1346       MI.eraseFromParent();
1347       BrCond->eraseFromParent();
1348       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
1349       return true;
1350     }
1351 
1352     return false;
1353   }
1354   case Intrinsic::amdgcn_kernarg_segment_ptr:
1355     return legalizePreloadedArgIntrin(
1356       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1357   case Intrinsic::amdgcn_implicitarg_ptr:
1358     return legalizeImplicitArgPtr(MI, MRI, B);
1359   case Intrinsic::amdgcn_workitem_id_x:
1360     return legalizePreloadedArgIntrin(MI, MRI, B,
1361                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
1362   case Intrinsic::amdgcn_workitem_id_y:
1363     return legalizePreloadedArgIntrin(MI, MRI, B,
1364                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
1365   case Intrinsic::amdgcn_workitem_id_z:
1366     return legalizePreloadedArgIntrin(MI, MRI, B,
1367                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
1368   case Intrinsic::amdgcn_workgroup_id_x:
1369     return legalizePreloadedArgIntrin(MI, MRI, B,
1370                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
1371   case Intrinsic::amdgcn_workgroup_id_y:
1372     return legalizePreloadedArgIntrin(MI, MRI, B,
1373                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
1374   case Intrinsic::amdgcn_workgroup_id_z:
1375     return legalizePreloadedArgIntrin(MI, MRI, B,
1376                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
1377   case Intrinsic::amdgcn_dispatch_ptr:
1378     return legalizePreloadedArgIntrin(MI, MRI, B,
1379                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
1380   case Intrinsic::amdgcn_queue_ptr:
1381     return legalizePreloadedArgIntrin(MI, MRI, B,
1382                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
1383   case Intrinsic::amdgcn_implicit_buffer_ptr:
1384     return legalizePreloadedArgIntrin(
1385       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
1386   case Intrinsic::amdgcn_dispatch_id:
1387     return legalizePreloadedArgIntrin(MI, MRI, B,
1388                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
1389   default:
1390     return true;
1391   }
1392 
1393   return true;
1394 }
1395