xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (revision dc318a4ffabcbfa23bb56a33403aad36e6de30af)
1 //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
/// This file implements the targeting of the MachineLegalizer class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPULegalizerInfo.h"
15 
16 #include "AMDGPU.h"
17 #include "AMDGPUGlobalISelUtils.h"
18 #include "AMDGPUTargetMachine.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "llvm/ADT/ScopeExit.h"
21 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
22 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/TargetOpcodes.h"
25 #include "llvm/CodeGen/ValueTypes.h"
26 #include "llvm/IR/DerivedTypes.h"
27 #include "llvm/IR/DiagnosticInfo.h"
28 #include "llvm/IR/Type.h"
29 #include "llvm/Support/Debug.h"
30 
31 #define DEBUG_TYPE "amdgpu-legalinfo"
32 
33 using namespace llvm;
34 using namespace LegalizeActions;
35 using namespace LegalizeMutations;
36 using namespace LegalityPredicates;
37 using namespace MIPatternMatch;
38 
39 // Hack until load/store selection patterns support any tuple of legal types.
40 static cl::opt<bool> EnableNewLegality(
41   "amdgpu-global-isel-new-legality",
42   cl::desc("Use GlobalISel desired legality, rather than try to use"
43            "rules compatible with selection patterns"),
44   cl::init(false),
45   cl::ReallyHidden);
46 
// Upper bound, in bits, on the sizes accepted by isRegisterSize(); also used
// to build the MaxScalar type in the legalizer rules.
static constexpr unsigned MaxRegisterSize = 1024;
48 
49 // Round the number of elements to the next power of two elements
50 static LLT getPow2VectorType(LLT Ty) {
51   unsigned NElts = Ty.getNumElements();
52   unsigned Pow2NElts = 1 <<  Log2_32_Ceil(NElts);
53   return Ty.changeNumElements(Pow2NElts);
54 }
55 
56 // Round the number of bits to the next power of two bits
57 static LLT getPow2ScalarType(LLT Ty) {
58   unsigned Bits = Ty.getSizeInBits();
59   unsigned Pow2Bits = 1 <<  Log2_32_Ceil(Bits);
60   return LLT::scalar(Pow2Bits);
61 }
62 
63 static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
64   return [=](const LegalityQuery &Query) {
65     const LLT Ty = Query.Types[TypeIdx];
66     return Ty.isVector() &&
67            Ty.getNumElements() % 2 != 0 &&
68            Ty.getElementType().getSizeInBits() < 32 &&
69            Ty.getSizeInBits() % 32 != 0;
70   };
71 }
72 
73 static LegalityPredicate isWideVec16(unsigned TypeIdx) {
74   return [=](const LegalityQuery &Query) {
75     const LLT Ty = Query.Types[TypeIdx];
76     const LLT EltTy = Ty.getScalarType();
77     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
78   };
79 }
80 
81 static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
82   return [=](const LegalityQuery &Query) {
83     const LLT Ty = Query.Types[TypeIdx];
84     const LLT EltTy = Ty.getElementType();
85     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
86   };
87 }
88 
89 static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
90   return [=](const LegalityQuery &Query) {
91     const LLT Ty = Query.Types[TypeIdx];
92     const LLT EltTy = Ty.getElementType();
93     unsigned Size = Ty.getSizeInBits();
94     unsigned Pieces = (Size + 63) / 64;
95     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
96     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
97   };
98 }
99 
100 // Increase the number of vector elements to reach the next multiple of 32-bit
101 // type.
102 static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
103   return [=](const LegalityQuery &Query) {
104     const LLT Ty = Query.Types[TypeIdx];
105 
106     const LLT EltTy = Ty.getElementType();
107     const int Size = Ty.getSizeInBits();
108     const int EltSize = EltTy.getSizeInBits();
109     const int NextMul32 = (Size + 31) / 32;
110 
111     assert(EltSize < 32);
112 
113     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
114     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
115   };
116 }
117 
118 static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
119   return [=](const LegalityQuery &Query) {
120     const LLT Ty = Query.Types[TypeIdx];
121     unsigned Size = Ty.getSizeInBits();
122 
123     LLT CoercedTy;
124     if (Size <= 32) {
125       // <2 x s8> -> s16
126       // <4 x s8> -> s32
127       CoercedTy = LLT::scalar(Size);
128     } else
129       CoercedTy = LLT::scalarOrVector(Size / 32, 32);
130 
131     return std::make_pair(TypeIdx, CoercedTy);
132   };
133 }
134 
135 static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
136   return [=](const LegalityQuery &Query) {
137     const LLT QueryTy = Query.Types[TypeIdx];
138     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
139   };
140 }
141 
142 static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
143   return [=](const LegalityQuery &Query) {
144     const LLT QueryTy = Query.Types[TypeIdx];
145     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
146   };
147 }
148 
149 static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
150   return [=](const LegalityQuery &Query) {
151     const LLT QueryTy = Query.Types[TypeIdx];
152     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
153   };
154 }
155 
156 static bool isRegisterSize(unsigned Size) {
157   return Size % 32 == 0 && Size <= MaxRegisterSize;
158 }
159 
160 static bool isRegisterVectorElementType(LLT EltTy) {
161   const int EltSize = EltTy.getSizeInBits();
162   return EltSize == 16 || EltSize % 32 == 0;
163 }
164 
165 static bool isRegisterVectorType(LLT Ty) {
166   const int EltSize = Ty.getElementType().getSizeInBits();
167   return EltSize == 32 || EltSize == 64 ||
168          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
169          EltSize == 128 || EltSize == 256;
170 }
171 
172 static bool isRegisterType(LLT Ty) {
173   if (!isRegisterSize(Ty.getSizeInBits()))
174     return false;
175 
176   if (Ty.isVector())
177     return isRegisterVectorType(Ty);
178 
179   return true;
180 }
181 
182 // Any combination of 32 or 64-bit elements up the maximum register size, and
183 // multiples of v2s16.
184 static LegalityPredicate isRegisterType(unsigned TypeIdx) {
185   return [=](const LegalityQuery &Query) {
186     return isRegisterType(Query.Types[TypeIdx]);
187   };
188 }
189 
190 static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
191   return [=](const LegalityQuery &Query) {
192     const LLT QueryTy = Query.Types[TypeIdx];
193     if (!QueryTy.isVector())
194       return false;
195     const LLT EltTy = QueryTy.getElementType();
196     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
197   };
198 }
199 
200 static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
201   return [=](const LegalityQuery &Query) {
202     const LLT Ty = Query.Types[TypeIdx];
203     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
204            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
205   };
206 }
207 
208 // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
209 // handle some operations by just promoting the register during
210 // selection. There are also d16 loads on GFX9+ which preserve the high bits.
211 static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
212                                     bool IsLoad) {
213   switch (AS) {
214   case AMDGPUAS::PRIVATE_ADDRESS:
215     // FIXME: Private element size.
216     return 32;
217   case AMDGPUAS::LOCAL_ADDRESS:
218     return ST.useDS128() ? 128 : 64;
219   case AMDGPUAS::GLOBAL_ADDRESS:
220   case AMDGPUAS::CONSTANT_ADDRESS:
221   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
222     // Treat constant and global as identical. SMRD loads are sometimes usable for
223     // global loads (ideally constant address space should be eliminated)
224     // depending on the context. Legality cannot be context dependent, but
225     // RegBankSelect can split the load as necessary depending on the pointer
226     // register bank/uniformity and if the memory is invariant or not written in a
227     // kernel.
228     return IsLoad ? 512 : 128;
229   default:
230     // Flat addresses may contextually need to be split to 32-bit parts if they
231     // may alias scratch depending on the subtarget.
232     return 128;
233   }
234 }
235 
236 static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
237                                  const LegalityQuery &Query,
238                                  unsigned Opcode) {
239   const LLT Ty = Query.Types[0];
240 
241   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
242   const bool IsLoad = Opcode != AMDGPU::G_STORE;
243 
244   unsigned RegSize = Ty.getSizeInBits();
245   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
246   unsigned Align = Query.MMODescrs[0].AlignInBits;
247   unsigned AS = Query.Types[1].getAddressSpace();
248 
249   // All of these need to be custom lowered to cast the pointer operand.
250   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
251     return false;
252 
253   // TODO: We should be able to widen loads if the alignment is high enough, but
254   // we also need to modify the memory access size.
255 #if 0
256   // Accept widening loads based on alignment.
257   if (IsLoad && MemSize < Size)
258     MemSize = std::max(MemSize, Align);
259 #endif
260 
261   // Only 1-byte and 2-byte to 32-bit extloads are valid.
262   if (MemSize != RegSize && RegSize != 32)
263     return false;
264 
265   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
266     return false;
267 
268   switch (MemSize) {
269   case 8:
270   case 16:
271   case 32:
272   case 64:
273   case 128:
274     break;
275   case 96:
276     if (!ST.hasDwordx3LoadStores())
277       return false;
278     break;
279   case 256:
280   case 512:
281     // These may contextually need to be broken down.
282     break;
283   default:
284     return false;
285   }
286 
287   assert(RegSize >= MemSize);
288 
289   if (Align < MemSize) {
290     const SITargetLowering *TLI = ST.getTargetLowering();
291     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8))
292       return false;
293   }
294 
295   return true;
296 }
297 
298 // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
299 // workaround this. Eventually it should ignore the type for loads and only care
300 // about the size. Return true in cases where we will workaround this for now by
301 // bitcasting.
302 static bool loadStoreBitcastWorkaround(const LLT Ty) {
303   if (EnableNewLegality)
304     return false;
305 
306   const unsigned Size = Ty.getSizeInBits();
307   if (Size <= 64)
308     return false;
309   if (!Ty.isVector())
310     return true;
311   unsigned EltSize = Ty.getElementType().getSizeInBits();
312   return EltSize != 32 && EltSize != 64;
313 }
314 
315 static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
316                              unsigned Opcode) {
317   const LLT Ty = Query.Types[0];
318   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
319          !loadStoreBitcastWorkaround(Ty);
320 }
321 
322 AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
323                                          const GCNTargetMachine &TM)
324   :  ST(ST_) {
325   using namespace TargetOpcode;
326 
327   auto GetAddrSpacePtr = [&TM](unsigned AS) {
328     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
329   };
330 
331   const LLT S1 = LLT::scalar(1);
332   const LLT S16 = LLT::scalar(16);
333   const LLT S32 = LLT::scalar(32);
334   const LLT S64 = LLT::scalar(64);
335   const LLT S128 = LLT::scalar(128);
336   const LLT S256 = LLT::scalar(256);
337   const LLT S512 = LLT::scalar(512);
338   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
339 
340   const LLT V2S16 = LLT::vector(2, 16);
341   const LLT V4S16 = LLT::vector(4, 16);
342 
343   const LLT V2S32 = LLT::vector(2, 32);
344   const LLT V3S32 = LLT::vector(3, 32);
345   const LLT V4S32 = LLT::vector(4, 32);
346   const LLT V5S32 = LLT::vector(5, 32);
347   const LLT V6S32 = LLT::vector(6, 32);
348   const LLT V7S32 = LLT::vector(7, 32);
349   const LLT V8S32 = LLT::vector(8, 32);
350   const LLT V9S32 = LLT::vector(9, 32);
351   const LLT V10S32 = LLT::vector(10, 32);
352   const LLT V11S32 = LLT::vector(11, 32);
353   const LLT V12S32 = LLT::vector(12, 32);
354   const LLT V13S32 = LLT::vector(13, 32);
355   const LLT V14S32 = LLT::vector(14, 32);
356   const LLT V15S32 = LLT::vector(15, 32);
357   const LLT V16S32 = LLT::vector(16, 32);
358   const LLT V32S32 = LLT::vector(32, 32);
359 
360   const LLT V2S64 = LLT::vector(2, 64);
361   const LLT V3S64 = LLT::vector(3, 64);
362   const LLT V4S64 = LLT::vector(4, 64);
363   const LLT V5S64 = LLT::vector(5, 64);
364   const LLT V6S64 = LLT::vector(6, 64);
365   const LLT V7S64 = LLT::vector(7, 64);
366   const LLT V8S64 = LLT::vector(8, 64);
367   const LLT V16S64 = LLT::vector(16, 64);
368 
369   std::initializer_list<LLT> AllS32Vectors =
370     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
371      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
372   std::initializer_list<LLT> AllS64Vectors =
373     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
374 
375   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
376   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
377   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
378   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
379   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
380   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
381   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
382 
383   const LLT CodePtr = FlatPtr;
384 
385   const std::initializer_list<LLT> AddrSpaces64 = {
386     GlobalPtr, ConstantPtr, FlatPtr
387   };
388 
389   const std::initializer_list<LLT> AddrSpaces32 = {
390     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
391   };
392 
393   const std::initializer_list<LLT> FPTypesBase = {
394     S32, S64
395   };
396 
397   const std::initializer_list<LLT> FPTypes16 = {
398     S32, S64, S16
399   };
400 
401   const std::initializer_list<LLT> FPTypesPK16 = {
402     S32, S64, S16, V2S16
403   };
404 
405   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
406 
407   setAction({G_BRCOND, S1}, Legal); // VCC branches
408   setAction({G_BRCOND, S32}, Legal); // SCC branches
409 
410   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
411   // elements for v3s16
412   getActionDefinitionsBuilder(G_PHI)
413     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
414     .legalFor(AllS32Vectors)
415     .legalFor(AllS64Vectors)
416     .legalFor(AddrSpaces64)
417     .legalFor(AddrSpaces32)
418     .clampScalar(0, S32, S256)
419     .widenScalarToNextPow2(0, 32)
420     .clampMaxNumElements(0, S32, 16)
421     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
422     .legalIf(isPointer(0));
423 
424   if (ST.hasVOP3PInsts()) {
425     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
426       .legalFor({S32, S16, V2S16})
427       .clampScalar(0, S16, S32)
428       .clampMaxNumElements(0, S16, 2)
429       .scalarize(0)
430       .widenScalarToNextPow2(0, 32);
431   } else if (ST.has16BitInsts()) {
432     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
433       .legalFor({S32, S16})
434       .clampScalar(0, S16, S32)
435       .scalarize(0)
436       .widenScalarToNextPow2(0, 32);
437   } else {
438     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
439       .legalFor({S32})
440       .clampScalar(0, S32, S32)
441       .scalarize(0);
442   }
443 
444   // FIXME: Not really legal. Placeholder for custom lowering.
445   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
446     .customFor({S32, S64})
447     .clampScalar(0, S32, S64)
448     .widenScalarToNextPow2(0, 32)
449     .scalarize(0);
450 
451   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
452     .legalFor({S32})
453     .clampScalar(0, S32, S32)
454     .scalarize(0);
455 
456   // Report legal for any types we can handle anywhere. For the cases only legal
457   // on the SALU, RegBankSelect will be able to re-legalize.
458   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
459     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
460     .clampScalar(0, S32, S64)
461     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
462     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
463     .widenScalarToNextPow2(0)
464     .scalarize(0);
465 
466   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
467                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
468     .legalFor({{S32, S1}, {S32, S32}})
469     .minScalar(0, S32)
470     // TODO: .scalarize(0)
471     .lower();
472 
473   getActionDefinitionsBuilder(G_BITCAST)
474     // Don't worry about the size constraint.
475     .legalIf(all(isRegisterType(0), isRegisterType(1)))
476     .lower();
477 
478 
479   getActionDefinitionsBuilder(G_CONSTANT)
480     .legalFor({S1, S32, S64, S16, GlobalPtr,
481                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
482     .clampScalar(0, S32, S64)
483     .widenScalarToNextPow2(0)
484     .legalIf(isPointer(0));
485 
486   getActionDefinitionsBuilder(G_FCONSTANT)
487     .legalFor({S32, S64, S16})
488     .clampScalar(0, S16, S64);
489 
490   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
491       .legalIf(isRegisterType(0))
492       // s1 and s16 are special cases because they have legal operations on
493       // them, but don't really occupy registers in the normal way.
494       .legalFor({S1, S16})
495       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
496       .clampScalarOrElt(0, S32, MaxScalar)
497       .widenScalarToNextPow2(0, 32)
498       .clampMaxNumElements(0, S32, 16);
499 
500   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
501 
502   // If the amount is divergent, we have to do a wave reduction to get the
503   // maximum value, so this is expanded during RegBankSelect.
504   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
505     .legalFor({{PrivatePtr, S32}});
506 
507   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
508     .unsupportedFor({PrivatePtr})
509     .custom();
510   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
511 
512   auto &FPOpActions = getActionDefinitionsBuilder(
513     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
514     .legalFor({S32, S64});
515   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
516     .customFor({S32, S64});
517   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
518     .customFor({S32, S64});
519 
520   if (ST.has16BitInsts()) {
521     if (ST.hasVOP3PInsts())
522       FPOpActions.legalFor({S16, V2S16});
523     else
524       FPOpActions.legalFor({S16});
525 
526     TrigActions.customFor({S16});
527     FDIVActions.customFor({S16});
528   }
529 
530   auto &MinNumMaxNum = getActionDefinitionsBuilder({
531       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
532 
533   if (ST.hasVOP3PInsts()) {
534     MinNumMaxNum.customFor(FPTypesPK16)
535       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
536       .clampMaxNumElements(0, S16, 2)
537       .clampScalar(0, S16, S64)
538       .scalarize(0);
539   } else if (ST.has16BitInsts()) {
540     MinNumMaxNum.customFor(FPTypes16)
541       .clampScalar(0, S16, S64)
542       .scalarize(0);
543   } else {
544     MinNumMaxNum.customFor(FPTypesBase)
545       .clampScalar(0, S32, S64)
546       .scalarize(0);
547   }
548 
549   if (ST.hasVOP3PInsts())
550     FPOpActions.clampMaxNumElements(0, S16, 2);
551 
552   FPOpActions
553     .scalarize(0)
554     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
555 
556   TrigActions
557     .scalarize(0)
558     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
559 
560   FDIVActions
561     .scalarize(0)
562     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
563 
564   getActionDefinitionsBuilder({G_FNEG, G_FABS})
565     .legalFor(FPTypesPK16)
566     .clampMaxNumElements(0, S16, 2)
567     .scalarize(0)
568     .clampScalar(0, S16, S64);
569 
570   if (ST.has16BitInsts()) {
571     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
572       .legalFor({S32, S64, S16})
573       .scalarize(0)
574       .clampScalar(0, S16, S64);
575   } else {
576     getActionDefinitionsBuilder(G_FSQRT)
577       .legalFor({S32, S64})
578       .scalarize(0)
579       .clampScalar(0, S32, S64);
580 
581     if (ST.hasFractBug()) {
582       getActionDefinitionsBuilder(G_FFLOOR)
583         .customFor({S64})
584         .legalFor({S32, S64})
585         .scalarize(0)
586         .clampScalar(0, S32, S64);
587     } else {
588       getActionDefinitionsBuilder(G_FFLOOR)
589         .legalFor({S32, S64})
590         .scalarize(0)
591         .clampScalar(0, S32, S64);
592     }
593   }
594 
595   getActionDefinitionsBuilder(G_FPTRUNC)
596     .legalFor({{S32, S64}, {S16, S32}})
597     .scalarize(0)
598     .lower();
599 
600   getActionDefinitionsBuilder(G_FPEXT)
601     .legalFor({{S64, S32}, {S32, S16}})
602     .lowerFor({{S64, S16}}) // FIXME: Implement
603     .scalarize(0);
604 
605   getActionDefinitionsBuilder(G_FSUB)
606       // Use actual fsub instruction
607       .legalFor({S32})
608       // Must use fadd + fneg
609       .lowerFor({S64, S16, V2S16})
610       .scalarize(0)
611       .clampScalar(0, S32, S64);
612 
613   // Whether this is legal depends on the floating point mode for the function.
614   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
615   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
616     FMad.customFor({S32, S16});
617   else if (ST.hasMadMacF32Insts())
618     FMad.customFor({S32});
619   else if (ST.hasMadF16())
620     FMad.customFor({S16});
621   FMad.scalarize(0)
622       .lower();
623 
624   // TODO: Do we need to clamp maximum bitwidth?
625   getActionDefinitionsBuilder(G_TRUNC)
626     .legalIf(isScalar(0))
627     .legalFor({{V2S16, V2S32}})
628     .clampMaxNumElements(0, S16, 2)
629     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
630     // situations (like an invalid implicit use), we don't want to infinite loop
631     // in the legalizer.
632     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
633     .alwaysLegal();
634 
635   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
636     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
637                {S32, S1}, {S64, S1}, {S16, S1}})
638     .scalarize(0)
639     .clampScalar(0, S32, S64)
640     .widenScalarToNextPow2(1, 32);
641 
642   // TODO: Split s1->s64 during regbankselect for VALU.
643   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
644     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
645     .lowerFor({{S32, S64}})
646     .lowerIf(typeIs(1, S1))
647     .customFor({{S64, S64}});
648   if (ST.has16BitInsts())
649     IToFP.legalFor({{S16, S16}});
650   IToFP.clampScalar(1, S32, S64)
651        .scalarize(0)
652        .widenScalarToNextPow2(1);
653 
654   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
655     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
656     .customFor({{S64, S64}});
657   if (ST.has16BitInsts())
658     FPToI.legalFor({{S16, S16}});
659   else
660     FPToI.minScalar(1, S32);
661 
662   FPToI.minScalar(0, S32)
663        .scalarize(0)
664        .lower();
665 
666   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
667     .scalarize(0)
668     .lower();
669 
670   if (ST.has16BitInsts()) {
671     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
672       .legalFor({S16, S32, S64})
673       .clampScalar(0, S16, S64)
674       .scalarize(0);
675   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
676     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
677       .legalFor({S32, S64})
678       .clampScalar(0, S32, S64)
679       .scalarize(0);
680   } else {
681     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
682       .legalFor({S32})
683       .customFor({S64})
684       .clampScalar(0, S32, S64)
685       .scalarize(0);
686   }
687 
688   // FIXME: Clamp offset operand.
689   getActionDefinitionsBuilder(G_PTR_ADD)
690     .legalIf(isPointer(0))
691     .scalarize(0);
692 
693   getActionDefinitionsBuilder(G_PTRMASK)
694     .legalIf(typeInSet(1, {S64, S32}))
695     .minScalar(1, S32)
696     .maxScalarIf(sizeIs(0, 32), 1, S32)
697     .maxScalarIf(sizeIs(0, 64), 1, S64)
698     .scalarize(0);
699 
700   auto &CmpBuilder =
701     getActionDefinitionsBuilder(G_ICMP)
702     // The compare output type differs based on the register bank of the output,
703     // so make both s1 and s32 legal.
704     //
705     // Scalar compares producing output in scc will be promoted to s32, as that
706     // is the allocatable register type that will be needed for the copy from
707     // scc. This will be promoted during RegBankSelect, and we assume something
708     // before that won't try to use s32 result types.
709     //
710     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
711     // bank.
712     .legalForCartesianProduct(
713       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
714     .legalForCartesianProduct(
715       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
716   if (ST.has16BitInsts()) {
717     CmpBuilder.legalFor({{S1, S16}});
718   }
719 
720   CmpBuilder
721     .widenScalarToNextPow2(1)
722     .clampScalar(1, S32, S64)
723     .scalarize(0)
724     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
725 
726   getActionDefinitionsBuilder(G_FCMP)
727     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
728     .widenScalarToNextPow2(1)
729     .clampScalar(1, S32, S64)
730     .scalarize(0);
731 
732   // FIXME: fpow has a selection pattern that should move to custom lowering.
733   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
734   if (ST.has16BitInsts())
735     Exp2Ops.legalFor({S32, S16});
736   else
737     Exp2Ops.legalFor({S32});
738   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
739   Exp2Ops.scalarize(0);
740 
741   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
742   if (ST.has16BitInsts())
743     ExpOps.customFor({{S32}, {S16}});
744   else
745     ExpOps.customFor({S32});
746   ExpOps.clampScalar(0, MinScalarFPTy, S32)
747         .scalarize(0);
748 
749   // The 64-bit versions produce 32-bit results, but only on the SALU.
750   getActionDefinitionsBuilder(G_CTPOP)
751     .legalFor({{S32, S32}, {S32, S64}})
752     .clampScalar(0, S32, S32)
753     .clampScalar(1, S32, S64)
754     .scalarize(0)
755     .widenScalarToNextPow2(0, 32)
756     .widenScalarToNextPow2(1, 32);
757 
758   // The hardware instructions return a different result on 0 than the generic
759   // instructions expect. The hardware produces -1, but these produce the
760   // bitwidth.
761   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
762     .scalarize(0)
763     .clampScalar(0, S32, S32)
764     .clampScalar(1, S32, S64)
765     .widenScalarToNextPow2(0, 32)
766     .widenScalarToNextPow2(1, 32)
767     .lower();
768 
769   // The 64-bit versions produce 32-bit results, but only on the SALU.
770   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
771     .legalFor({{S32, S32}, {S32, S64}})
772     .clampScalar(0, S32, S32)
773     .clampScalar(1, S32, S64)
774     .scalarize(0)
775     .widenScalarToNextPow2(0, 32)
776     .widenScalarToNextPow2(1, 32);
777 
778   getActionDefinitionsBuilder(G_BITREVERSE)
779     .legalFor({S32})
780     .clampScalar(0, S32, S32)
781     .scalarize(0);
782 
783   if (ST.has16BitInsts()) {
784     getActionDefinitionsBuilder(G_BSWAP)
785       .legalFor({S16, S32, V2S16})
786       .clampMaxNumElements(0, S16, 2)
787       // FIXME: Fixing non-power-of-2 before clamp is workaround for
788       // narrowScalar limitation.
789       .widenScalarToNextPow2(0)
790       .clampScalar(0, S16, S32)
791       .scalarize(0);
792 
793     if (ST.hasVOP3PInsts()) {
794       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
795         .legalFor({S32, S16, V2S16})
796         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
797         .clampMaxNumElements(0, S16, 2)
798         .minScalar(0, S16)
799         .widenScalarToNextPow2(0)
800         .scalarize(0)
801         .lower();
802     } else {
803       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
804         .legalFor({S32, S16})
805         .widenScalarToNextPow2(0)
806         .minScalar(0, S16)
807         .scalarize(0)
808         .lower();
809     }
810   } else {
811     // TODO: Should have same legality without v_perm_b32
812     getActionDefinitionsBuilder(G_BSWAP)
813       .legalFor({S32})
814       .lowerIf(scalarNarrowerThan(0, 32))
815       // FIXME: Fixing non-power-of-2 before clamp is workaround for
816       // narrowScalar limitation.
817       .widenScalarToNextPow2(0)
818       .maxScalar(0, S32)
819       .scalarize(0)
820       .lower();
821 
822     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
823       .legalFor({S32})
824       .minScalar(0, S32)
825       .widenScalarToNextPow2(0)
826       .scalarize(0)
827       .lower();
828   }
829 
830   getActionDefinitionsBuilder(G_INTTOPTR)
831     // List the common cases
832     .legalForCartesianProduct(AddrSpaces64, {S64})
833     .legalForCartesianProduct(AddrSpaces32, {S32})
834     .scalarize(0)
835     // Accept any address space as long as the size matches
836     .legalIf(sameSize(0, 1))
837     .widenScalarIf(smallerThan(1, 0),
838       [](const LegalityQuery &Query) {
839         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
840       })
841     .narrowScalarIf(largerThan(1, 0),
842       [](const LegalityQuery &Query) {
843         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
844       });
845 
846   getActionDefinitionsBuilder(G_PTRTOINT)
847     // List the common cases
848     .legalForCartesianProduct(AddrSpaces64, {S64})
849     .legalForCartesianProduct(AddrSpaces32, {S32})
850     .scalarize(0)
851     // Accept any address space as long as the size matches
852     .legalIf(sameSize(0, 1))
853     .widenScalarIf(smallerThan(0, 1),
854       [](const LegalityQuery &Query) {
855         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
856       })
857     .narrowScalarIf(
858       largerThan(0, 1),
859       [](const LegalityQuery &Query) {
860         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
861       });
862 
863   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
864     .scalarize(0)
865     .custom();
866 
867   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
868                                     bool IsLoad) -> bool {
869     const LLT DstTy = Query.Types[0];
870 
871     // Split vector extloads.
872     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
873     unsigned Align = Query.MMODescrs[0].AlignInBits;
874 
875     if (MemSize < DstTy.getSizeInBits())
876       MemSize = std::max(MemSize, Align);
877 
878     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
879       return true;
880 
881     const LLT PtrTy = Query.Types[1];
882     unsigned AS = PtrTy.getAddressSpace();
883     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
884       return true;
885 
886     // Catch weird sized loads that don't evenly divide into the access sizes
887     // TODO: May be able to widen depending on alignment etc.
888     unsigned NumRegs = (MemSize + 31) / 32;
889     if (NumRegs == 3) {
890       if (!ST.hasDwordx3LoadStores())
891         return true;
892     } else {
893       // If the alignment allows, these should have been widened.
894       if (!isPowerOf2_32(NumRegs))
895         return true;
896     }
897 
898     if (Align < MemSize) {
899       const SITargetLowering *TLI = ST.getTargetLowering();
900       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
901     }
902 
903     return false;
904   };
905 
906   const auto shouldWidenLoadResult = [=](const LegalityQuery &Query,
907                                          unsigned Opc) -> bool {
908     unsigned Size = Query.Types[0].getSizeInBits();
909     if (isPowerOf2_32(Size))
910       return false;
911 
912     if (Size == 96 && ST.hasDwordx3LoadStores())
913       return false;
914 
915     unsigned AddrSpace = Query.Types[1].getAddressSpace();
916     if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc))
917       return false;
918 
919     unsigned Align = Query.MMODescrs[0].AlignInBits;
920     unsigned RoundedSize = NextPowerOf2(Size);
921     return (Align >= RoundedSize);
922   };
923 
924   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
925   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
926   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
927 
928   // TODO: Refine based on subtargets which support unaligned access or 128-bit
929   // LDS
930   // TODO: Unsupported flat for SI.
931 
932   for (unsigned Op : {G_LOAD, G_STORE}) {
933     const bool IsStore = Op == G_STORE;
934 
935     auto &Actions = getActionDefinitionsBuilder(Op);
936     // Explicitly list some common cases.
937     // TODO: Does this help compile time at all?
938     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
939                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
940                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
941                                       {S64, GlobalPtr, 64, GlobalAlign32},
942                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
943                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
944                                       {S32, GlobalPtr, 8, GlobalAlign8},
945                                       {S32, GlobalPtr, 16, GlobalAlign16},
946 
947                                       {S32, LocalPtr, 32, 32},
948                                       {S64, LocalPtr, 64, 32},
949                                       {V2S32, LocalPtr, 64, 32},
950                                       {S32, LocalPtr, 8, 8},
951                                       {S32, LocalPtr, 16, 16},
952                                       {V2S16, LocalPtr, 32, 32},
953 
954                                       {S32, PrivatePtr, 32, 32},
955                                       {S32, PrivatePtr, 8, 8},
956                                       {S32, PrivatePtr, 16, 16},
957                                       {V2S16, PrivatePtr, 32, 32},
958 
959                                       {S32, ConstantPtr, 32, GlobalAlign32},
960                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
961                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
962                                       {S64, ConstantPtr, 64, GlobalAlign32},
963                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
964     Actions.legalIf(
965       [=](const LegalityQuery &Query) -> bool {
966         return isLoadStoreLegal(ST, Query, Op);
967       });
968 
969     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
970     // 64-bits.
971     //
972     // TODO: Should generalize bitcast action into coerce, which will also cover
973     // inserting addrspacecasts.
974     Actions.customIf(typeIs(1, Constant32Ptr));
975 
976     // Turn any illegal element vectors into something easier to deal
977     // with. These will ultimately produce 32-bit scalar shifts to extract the
978     // parts anyway.
979     //
980     // For odd 16-bit element vectors, prefer to split those into pieces with
981     // 16-bit vector parts.
982     Actions.bitcastIf(
983       [=](const LegalityQuery &Query) -> bool {
984         const LLT Ty = Query.Types[0];
985         const unsigned Size = Ty.getSizeInBits();
986 
987         if (Size != Query.MMODescrs[0].SizeInBits)
988           return Size <= 32 && Ty.isVector();
989 
990         if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
991           return true;
992         return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
993                !isRegisterVectorElementType(Ty.getElementType());
994       }, bitcastToRegisterType(0));
995 
996     Actions
997         .customIf(typeIs(1, Constant32Ptr))
998         // Widen suitably aligned loads by loading extra elements.
999         .moreElementsIf([=](const LegalityQuery &Query) {
1000             const LLT Ty = Query.Types[0];
1001             return Op == G_LOAD && Ty.isVector() &&
1002                    shouldWidenLoadResult(Query, Op);
1003           }, moreElementsToNextPow2(0))
1004         .widenScalarIf([=](const LegalityQuery &Query) {
1005             const LLT Ty = Query.Types[0];
1006             return Op == G_LOAD && !Ty.isVector() &&
1007                    shouldWidenLoadResult(Query, Op);
1008           }, widenScalarOrEltToNextPow2(0))
1009         .narrowScalarIf(
1010             [=](const LegalityQuery &Query) -> bool {
1011               return !Query.Types[0].isVector() &&
1012                      needToSplitMemOp(Query, Op == G_LOAD);
1013             },
1014             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1015               const LLT DstTy = Query.Types[0];
1016               const LLT PtrTy = Query.Types[1];
1017 
1018               const unsigned DstSize = DstTy.getSizeInBits();
1019               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1020 
1021               // Split extloads.
1022               if (DstSize > MemSize)
1023                 return std::make_pair(0, LLT::scalar(MemSize));
1024 
1025               if (!isPowerOf2_32(DstSize)) {
1026                 // We're probably decomposing an odd sized store. Try to split
1027                 // to the widest type. TODO: Account for alignment. As-is it
1028                 // should be OK, since the new parts will be further legalized.
1029                 unsigned FloorSize = PowerOf2Floor(DstSize);
1030                 return std::make_pair(0, LLT::scalar(FloorSize));
1031               }
1032 
1033               if (DstSize > 32 && (DstSize % 32 != 0)) {
1034                 // FIXME: Need a way to specify non-extload of larger size if
1035                 // suitably aligned.
1036                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
1037               }
1038 
1039               unsigned MaxSize = maxSizeForAddrSpace(ST,
1040                                                      PtrTy.getAddressSpace(),
1041                                                      Op == G_LOAD);
1042               if (MemSize > MaxSize)
1043                 return std::make_pair(0, LLT::scalar(MaxSize));
1044 
1045               unsigned Align = Query.MMODescrs[0].AlignInBits;
1046               return std::make_pair(0, LLT::scalar(Align));
1047             })
1048         .fewerElementsIf(
1049             [=](const LegalityQuery &Query) -> bool {
1050               return Query.Types[0].isVector() &&
1051                      needToSplitMemOp(Query, Op == G_LOAD);
1052             },
1053             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1054               const LLT DstTy = Query.Types[0];
1055               const LLT PtrTy = Query.Types[1];
1056 
1057               LLT EltTy = DstTy.getElementType();
1058               unsigned MaxSize = maxSizeForAddrSpace(ST,
1059                                                      PtrTy.getAddressSpace(),
1060                                                      Op == G_LOAD);
1061 
1062               // FIXME: Handle widened to power of 2 results better. This ends
1063               // up scalarizing.
1064               // FIXME: 3 element stores scalarized on SI
1065 
1066               // Split if it's too large for the address space.
1067               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
1068                 unsigned NumElts = DstTy.getNumElements();
1069                 unsigned EltSize = EltTy.getSizeInBits();
1070 
1071                 if (MaxSize % EltSize == 0) {
1072                   return std::make_pair(
1073                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1074                 }
1075 
1076                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
1077 
1078                 // FIXME: Refine when odd breakdowns handled
1079                 // The scalars will need to be re-legalized.
1080                 if (NumPieces == 1 || NumPieces >= NumElts ||
1081                     NumElts % NumPieces != 0)
1082                   return std::make_pair(0, EltTy);
1083 
1084                 return std::make_pair(0,
1085                                       LLT::vector(NumElts / NumPieces, EltTy));
1086               }
1087 
1088               // FIXME: We could probably handle weird extending loads better.
1089               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1090               if (DstTy.getSizeInBits() > MemSize)
1091                 return std::make_pair(0, EltTy);
1092 
1093               unsigned EltSize = EltTy.getSizeInBits();
1094               unsigned DstSize = DstTy.getSizeInBits();
1095               if (!isPowerOf2_32(DstSize)) {
1096                 // We're probably decomposing an odd sized store. Try to split
1097                 // to the widest type. TODO: Account for alignment. As-is it
1098                 // should be OK, since the new parts will be further legalized.
1099                 unsigned FloorSize = PowerOf2Floor(DstSize);
1100                 return std::make_pair(
1101                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1102               }
1103 
1104               // Need to split because of alignment.
1105               unsigned Align = Query.MMODescrs[0].AlignInBits;
1106               if (EltSize > Align &&
1107                   (EltSize / Align < DstTy.getNumElements())) {
1108                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
1109               }
1110 
1111               // May need relegalization for the scalars.
1112               return std::make_pair(0, EltTy);
1113             })
1114         .minScalar(0, S32);
1115 
1116     if (IsStore)
1117       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
1118 
1119     // TODO: Need a bitcast lower option?
1120     Actions
1121         .widenScalarToNextPow2(0)
1122         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
1123   }
1124 
1125   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1126                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
1127                                                   {S32, GlobalPtr, 16, 2 * 8},
1128                                                   {S32, LocalPtr, 8, 8},
1129                                                   {S32, LocalPtr, 16, 16},
1130                                                   {S32, PrivatePtr, 8, 8},
1131                                                   {S32, PrivatePtr, 16, 16},
1132                                                   {S32, ConstantPtr, 8, 8},
1133                                                   {S32, ConstantPtr, 16, 2 * 8}});
1134   if (ST.hasFlatAddressSpace()) {
1135     ExtLoads.legalForTypesWithMemDesc(
1136         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
1137   }
1138 
1139   ExtLoads.clampScalar(0, S32, S32)
1140           .widenScalarToNextPow2(0)
1141           .unsupportedIfMemSizeNotPow2()
1142           .lower();
1143 
1144   auto &Atomics = getActionDefinitionsBuilder(
1145     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1146      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1147      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1148      G_ATOMICRMW_UMIN})
1149     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1150                {S64, GlobalPtr}, {S64, LocalPtr}});
1151   if (ST.hasFlatAddressSpace()) {
1152     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1153   }
1154 
1155   if (ST.hasLDSFPAtomics()) {
1156     getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1157       .legalFor({{S32, LocalPtr}});
1158   }
1159 
1160   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1161   // demarshalling
1162   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1163     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1164                 {S32, FlatPtr}, {S64, FlatPtr}})
1165     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1166                {S32, RegionPtr}, {S64, RegionPtr}});
1167   // TODO: Pointer types, any 32-bit or 64-bit vector
1168 
1169   // Condition should be s32 for scalar, s1 for vector.
1170   getActionDefinitionsBuilder(G_SELECT)
1171     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
1172           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1173           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
1174     .clampScalar(0, S16, S64)
1175     .scalarize(1)
1176     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1177     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1178     .clampMaxNumElements(0, S32, 2)
1179     .clampMaxNumElements(0, LocalPtr, 2)
1180     .clampMaxNumElements(0, PrivatePtr, 2)
1181     .scalarize(0)
1182     .widenScalarToNextPow2(0)
1183     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1184 
1185   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1186   // be more flexible with the shift amount type.
1187   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1188     .legalFor({{S32, S32}, {S64, S32}});
1189   if (ST.has16BitInsts()) {
1190     if (ST.hasVOP3PInsts()) {
1191       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1192             .clampMaxNumElements(0, S16, 2);
1193     } else
1194       Shifts.legalFor({{S16, S16}});
1195 
1196     // TODO: Support 16-bit shift amounts for all types
1197     Shifts.widenScalarIf(
1198       [=](const LegalityQuery &Query) {
1199         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1200         // 32-bit amount.
1201         const LLT ValTy = Query.Types[0];
1202         const LLT AmountTy = Query.Types[1];
1203         return ValTy.getSizeInBits() <= 16 &&
1204                AmountTy.getSizeInBits() < 16;
1205       }, changeTo(1, S16));
1206     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1207     Shifts.clampScalar(1, S32, S32);
1208     Shifts.clampScalar(0, S16, S64);
1209     Shifts.widenScalarToNextPow2(0, 16);
1210   } else {
1211     // Make sure we legalize the shift amount type first, as the general
1212     // expansion for the shifted type will produce much worse code if it hasn't
1213     // been truncated already.
1214     Shifts.clampScalar(1, S32, S32);
1215     Shifts.clampScalar(0, S32, S64);
1216     Shifts.widenScalarToNextPow2(0, 32);
1217   }
1218   Shifts.scalarize(0);
1219 
1220   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1221     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1222     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1223     unsigned IdxTypeIdx = 2;
1224 
1225     getActionDefinitionsBuilder(Op)
1226       .customIf([=](const LegalityQuery &Query) {
1227           const LLT EltTy = Query.Types[EltTypeIdx];
1228           const LLT VecTy = Query.Types[VecTypeIdx];
1229           const LLT IdxTy = Query.Types[IdxTypeIdx];
1230           return (EltTy.getSizeInBits() == 16 ||
1231                   EltTy.getSizeInBits() % 32 == 0) &&
1232                  VecTy.getSizeInBits() % 32 == 0 &&
1233                  VecTy.getSizeInBits() <= MaxRegisterSize &&
1234                  IdxTy.getSizeInBits() == 32;
1235         })
1236       .clampScalar(EltTypeIdx, S32, S64)
1237       .clampScalar(VecTypeIdx, S32, S64)
1238       .clampScalar(IdxTypeIdx, S32, S32);
1239   }
1240 
1241   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1242     .unsupportedIf([=](const LegalityQuery &Query) {
1243         const LLT &EltTy = Query.Types[1].getElementType();
1244         return Query.Types[0] != EltTy;
1245       });
1246 
1247   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1248     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1249     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1250 
1251     // FIXME: Doesn't handle extract of illegal sizes.
1252     getActionDefinitionsBuilder(Op)
1253       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
1254       // FIXME: Multiples of 16 should not be legal.
1255       .legalIf([=](const LegalityQuery &Query) {
1256           const LLT BigTy = Query.Types[BigTyIdx];
1257           const LLT LitTy = Query.Types[LitTyIdx];
1258           return (BigTy.getSizeInBits() % 32 == 0) &&
1259                  (LitTy.getSizeInBits() % 16 == 0);
1260         })
1261       .widenScalarIf(
1262         [=](const LegalityQuery &Query) {
1263           const LLT BigTy = Query.Types[BigTyIdx];
1264           return (BigTy.getScalarSizeInBits() < 16);
1265         },
1266         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
1267       .widenScalarIf(
1268         [=](const LegalityQuery &Query) {
1269           const LLT LitTy = Query.Types[LitTyIdx];
1270           return (LitTy.getScalarSizeInBits() < 16);
1271         },
1272         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
1273       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1274       .widenScalarToNextPow2(BigTyIdx, 32);
1275 
1276   }
1277 
1278   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
1279     .legalForCartesianProduct(AllS32Vectors, {S32})
1280     .legalForCartesianProduct(AllS64Vectors, {S64})
1281     .clampNumElements(0, V16S32, V32S32)
1282     .clampNumElements(0, V2S64, V16S64)
1283     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
1284 
1285   if (ST.hasScalarPackInsts()) {
1286     BuildVector
1287       // FIXME: Should probably widen s1 vectors straight to s32
1288       .minScalarOrElt(0, S16)
1289       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1290       .minScalar(1, S32);
1291 
1292     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1293       .legalFor({V2S16, S32})
1294       .lower();
1295     BuildVector.minScalarOrElt(0, S32);
1296   } else {
1297     BuildVector.customFor({V2S16, S16});
1298     BuildVector.minScalarOrElt(0, S32);
1299 
1300     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1301       .customFor({V2S16, S32})
1302       .lower();
1303   }
1304 
1305   BuildVector.legalIf(isRegisterType(0));
1306 
1307   // FIXME: Clamp maximum size
1308   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1309     .legalIf(isRegisterType(0));
1310 
1311   // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1312   // pre-legalize.
1313   if (ST.hasVOP3PInsts()) {
1314     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1315       .customFor({V2S16, V2S16})
1316       .lower();
1317   } else
1318     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
1319 
1320   // Merge/Unmerge
1321   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
1322     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
1323     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
1324 
1325     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1326       const LLT Ty = Query.Types[TypeIdx];
1327       if (Ty.isVector()) {
1328         const LLT &EltTy = Ty.getElementType();
1329         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
1330           return true;
1331         if (!isPowerOf2_32(EltTy.getSizeInBits()))
1332           return true;
1333       }
1334       return false;
1335     };
1336 
1337     auto &Builder = getActionDefinitionsBuilder(Op)
1338       .lowerFor({{S16, V2S16}})
1339       .lowerIf([=](const LegalityQuery &Query) {
1340           const LLT BigTy = Query.Types[BigTyIdx];
1341           return BigTy.getSizeInBits() == 32;
1342         })
1343       // Try to widen to s16 first for small types.
1344       // TODO: Only do this on targets with legal s16 shifts
1345       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
1346       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
1347       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1348       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
1349                            elementTypeIs(1, S16)),
1350                        changeTo(1, V2S16))
1351       // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1352       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1353       // valid.
1354       .clampScalar(LitTyIdx, S32, S512)
1355       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
1356       // Break up vectors with weird elements into scalars
1357       .fewerElementsIf(
1358         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
1359         scalarize(0))
1360       .fewerElementsIf(
1361         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
1362         scalarize(1))
1363       .clampScalar(BigTyIdx, S32, MaxScalar);
1364 
1365     if (Op == G_MERGE_VALUES) {
1366       Builder.widenScalarIf(
1367         // TODO: Use 16-bit shifts if legal for 8-bit values?
1368         [=](const LegalityQuery &Query) {
1369           const LLT Ty = Query.Types[LitTyIdx];
1370           return Ty.getSizeInBits() < 32;
1371         },
1372         changeTo(LitTyIdx, S32));
1373     }
1374 
1375     Builder.widenScalarIf(
1376       [=](const LegalityQuery &Query) {
1377         const LLT Ty = Query.Types[BigTyIdx];
1378         return !isPowerOf2_32(Ty.getSizeInBits()) &&
1379           Ty.getSizeInBits() % 16 != 0;
1380       },
1381       [=](const LegalityQuery &Query) {
1382         // Pick the next power of 2, or a multiple of 64 over 128.
1383         // Whichever is smaller.
1384         const LLT &Ty = Query.Types[BigTyIdx];
1385         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
1386         if (NewSizeInBits >= 256) {
1387           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
1388           if (RoundedTo < NewSizeInBits)
1389             NewSizeInBits = RoundedTo;
1390         }
1391         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
1392       })
1393       .legalIf([=](const LegalityQuery &Query) {
1394           const LLT &BigTy = Query.Types[BigTyIdx];
1395           const LLT &LitTy = Query.Types[LitTyIdx];
1396 
1397           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
1398             return false;
1399           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
1400             return false;
1401 
1402           return BigTy.getSizeInBits() % 16 == 0 &&
1403                  LitTy.getSizeInBits() % 16 == 0 &&
1404                  BigTy.getSizeInBits() <= MaxRegisterSize;
1405         })
1406       // Any vectors left are the wrong size. Scalarize them.
1407       .scalarize(0)
1408       .scalarize(1);
1409   }
1410 
1411   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1412   // RegBankSelect.
1413   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1414     .legalFor({{S32}, {S64}});
1415 
1416   if (ST.hasVOP3PInsts()) {
1417     SextInReg.lowerFor({{V2S16}})
1418       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1419       // get more vector shift opportunities, since we'll get those when
1420       // expanded.
1421       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1422   } else if (ST.has16BitInsts()) {
1423     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1424   } else {
1425     // Prefer to promote to s32 before lowering if we don't have 16-bit
1426     // shifts. This avoids a lot of intermediate truncate and extend operations.
1427     SextInReg.lowerFor({{S32}, {S64}});
1428   }
1429 
1430   // FIXME: Placeholder rule. Really depends on whether the clamp modifier is
1431   // available, and is selectively legal for s16, s32, v2s16.
1432   getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT, G_UADDSAT, G_USUBSAT})
1433     .scalarize(0)
1434     .clampScalar(0, S16, S32);
1435 
1436   SextInReg
1437     .scalarize(0)
1438     .clampScalar(0, S32, S64)
1439     .lower();
1440 
1441   getActionDefinitionsBuilder(G_FSHR)
1442     .legalFor({{S32, S32}})
1443     .scalarize(0)
1444     .lower();
1445 
1446   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1447     .legalFor({S64});
1448 
1449   getActionDefinitionsBuilder({
1450       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1451       G_FCOPYSIGN,
1452 
1453       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1454       G_READ_REGISTER,
1455       G_WRITE_REGISTER,
1456 
1457       G_SADDO, G_SSUBO,
1458 
1459        // TODO: Implement
1460       G_FMINIMUM, G_FMAXIMUM,
1461       G_FSHL
1462     }).lower();
1463 
1464   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1465         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1466         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1467     .unsupported();
1468 
1469   computeTables();
1470   verify(*ST.getInstrInfo());
1471 }
1472 
1473 bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1474                                          MachineInstr &MI) const {
1475   MachineIRBuilder &B = Helper.MIRBuilder;
1476   MachineRegisterInfo &MRI = *B.getMRI();
1477   GISelChangeObserver &Observer = Helper.Observer;
1478 
1479   switch (MI.getOpcode()) {
1480   case TargetOpcode::G_ADDRSPACE_CAST:
1481     return legalizeAddrSpaceCast(MI, MRI, B);
1482   case TargetOpcode::G_FRINT:
1483     return legalizeFrint(MI, MRI, B);
1484   case TargetOpcode::G_FCEIL:
1485     return legalizeFceil(MI, MRI, B);
1486   case TargetOpcode::G_INTRINSIC_TRUNC:
1487     return legalizeIntrinsicTrunc(MI, MRI, B);
1488   case TargetOpcode::G_SITOFP:
1489     return legalizeITOFP(MI, MRI, B, true);
1490   case TargetOpcode::G_UITOFP:
1491     return legalizeITOFP(MI, MRI, B, false);
1492   case TargetOpcode::G_FPTOSI:
1493     return legalizeFPTOI(MI, MRI, B, true);
1494   case TargetOpcode::G_FPTOUI:
1495     return legalizeFPTOI(MI, MRI, B, false);
1496   case TargetOpcode::G_FMINNUM:
1497   case TargetOpcode::G_FMAXNUM:
1498   case TargetOpcode::G_FMINNUM_IEEE:
1499   case TargetOpcode::G_FMAXNUM_IEEE:
1500     return legalizeMinNumMaxNum(Helper, MI);
1501   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1502     return legalizeExtractVectorElt(MI, MRI, B);
1503   case TargetOpcode::G_INSERT_VECTOR_ELT:
1504     return legalizeInsertVectorElt(MI, MRI, B);
1505   case TargetOpcode::G_SHUFFLE_VECTOR:
1506     return legalizeShuffleVector(MI, MRI, B);
1507   case TargetOpcode::G_FSIN:
1508   case TargetOpcode::G_FCOS:
1509     return legalizeSinCos(MI, MRI, B);
1510   case TargetOpcode::G_GLOBAL_VALUE:
1511     return legalizeGlobalValue(MI, MRI, B);
1512   case TargetOpcode::G_LOAD:
1513     return legalizeLoad(MI, MRI, B, Observer);
1514   case TargetOpcode::G_FMAD:
1515     return legalizeFMad(MI, MRI, B);
1516   case TargetOpcode::G_FDIV:
1517     return legalizeFDIV(MI, MRI, B);
1518   case TargetOpcode::G_UDIV:
1519   case TargetOpcode::G_UREM:
1520     return legalizeUDIV_UREM(MI, MRI, B);
1521   case TargetOpcode::G_SDIV:
1522   case TargetOpcode::G_SREM:
1523     return legalizeSDIV_SREM(MI, MRI, B);
1524   case TargetOpcode::G_ATOMIC_CMPXCHG:
1525     return legalizeAtomicCmpXChg(MI, MRI, B);
1526   case TargetOpcode::G_FLOG:
1527     return legalizeFlog(MI, B, numbers::ln2f);
1528   case TargetOpcode::G_FLOG10:
1529     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1530   case TargetOpcode::G_FEXP:
1531     return legalizeFExp(MI, B);
1532   case TargetOpcode::G_FPOW:
1533     return legalizeFPow(MI, B);
1534   case TargetOpcode::G_FFLOOR:
1535     return legalizeFFloor(MI, MRI, B);
1536   case TargetOpcode::G_BUILD_VECTOR:
1537     return legalizeBuildVector(MI, MRI, B);
1538   default:
1539     return false;
1540   }
1541 
1542   llvm_unreachable("expected switch to return");
1543 }
1544 
1545 Register AMDGPULegalizerInfo::getSegmentAperture(
1546   unsigned AS,
1547   MachineRegisterInfo &MRI,
1548   MachineIRBuilder &B) const {
1549   MachineFunction &MF = B.getMF();
1550   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1551   const LLT S32 = LLT::scalar(32);
1552 
1553   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
1554 
1555   if (ST.hasApertureRegs()) {
1556     // FIXME: Use inline constants (src_{shared, private}_base) instead of
1557     // getreg.
1558     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
1559         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
1560         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
1561     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
1562         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
1563         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
1564     unsigned Encoding =
1565         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
1566         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
1567         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
1568 
1569     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1570 
1571     B.buildInstr(AMDGPU::S_GETREG_B32)
1572       .addDef(GetReg)
1573       .addImm(Encoding);
1574     MRI.setType(GetReg, S32);
1575 
1576     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1577     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
1578   }
1579 
1580   Register QueuePtr = MRI.createGenericVirtualRegister(
1581     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
1582 
1583   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1584   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
1585     return Register();
1586 
1587   // Offset into amd_queue_t for group_segment_aperture_base_hi /
1588   // private_segment_aperture_base_hi.
1589   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
1590 
1591   // TODO: can we be smarter about machine pointer info?
1592   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1593   MachineMemOperand *MMO = MF.getMachineMemOperand(
1594       PtrInfo,
1595       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1596           MachineMemOperand::MOInvariant,
1597       4, commonAlignment(Align(64), StructOffset));
1598 
1599   Register LoadAddr;
1600 
1601   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1602   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
1603 }
1604 
1605 bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
1606   MachineInstr &MI, MachineRegisterInfo &MRI,
1607   MachineIRBuilder &B) const {
1608   MachineFunction &MF = B.getMF();
1609 
1610   const LLT S32 = LLT::scalar(32);
1611   Register Dst = MI.getOperand(0).getReg();
1612   Register Src = MI.getOperand(1).getReg();
1613 
1614   LLT DstTy = MRI.getType(Dst);
1615   LLT SrcTy = MRI.getType(Src);
1616   unsigned DestAS = DstTy.getAddressSpace();
1617   unsigned SrcAS = SrcTy.getAddressSpace();
1618 
1619   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
1620   // vector element.
1621   assert(!DstTy.isVector());
1622 
1623   const AMDGPUTargetMachine &TM
1624     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
1625 
1626   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1627   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
1628     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
1629     return true;
1630   }
1631 
1632   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1633     // Truncate.
1634     B.buildExtract(Dst, Src, 0);
1635     MI.eraseFromParent();
1636     return true;
1637   }
1638 
1639   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
1640     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1641     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
1642 
1643     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
1644     // another. Merge operands are required to be the same type, but creating an
1645     // extra ptrtoint would be kind of pointless.
1646     auto HighAddr = B.buildConstant(
1647       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1648     B.buildMerge(Dst, {Src, HighAddr});
1649     MI.eraseFromParent();
1650     return true;
1651   }
1652 
1653   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
1654     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
1655            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1656     unsigned NullVal = TM.getNullPointerValue(DestAS);
1657 
1658     auto SegmentNull = B.buildConstant(DstTy, NullVal);
1659     auto FlatNull = B.buildConstant(SrcTy, 0);
1660 
1661     // Extract low 32-bits of the pointer.
1662     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
1663 
1664     auto CmpRes =
1665         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
1666     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
1667 
1668     MI.eraseFromParent();
1669     return true;
1670   }
1671 
1672   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
1673     return false;
1674 
1675   if (!ST.hasFlatAddressSpace())
1676     return false;
1677 
1678   auto SegmentNull =
1679       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1680   auto FlatNull =
1681       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1682 
1683   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
1684   if (!ApertureReg.isValid())
1685     return false;
1686 
1687   auto CmpRes =
1688       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1689 
1690   // Coerce the type of the low half of the result so we can use merge_values.
1691   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
1692 
1693   // TODO: Should we allow mismatched types but matching sizes in merges to
1694   // avoid the ptrtoint?
1695   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1696   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
1697 
1698   MI.eraseFromParent();
1699   return true;
1700 }
1701 
1702 bool AMDGPULegalizerInfo::legalizeFrint(
1703   MachineInstr &MI, MachineRegisterInfo &MRI,
1704   MachineIRBuilder &B) const {
1705   Register Src = MI.getOperand(1).getReg();
1706   LLT Ty = MRI.getType(Src);
1707   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
1708 
1709   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1710   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1711 
1712   auto C1 = B.buildFConstant(Ty, C1Val);
1713   auto CopySign = B.buildFCopysign(Ty, C1, Src);
1714 
1715   // TODO: Should this propagate fast-math-flags?
1716   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
1717   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
1718 
1719   auto C2 = B.buildFConstant(Ty, C2Val);
1720   auto Fabs = B.buildFAbs(Ty, Src);
1721 
1722   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
1723   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1724   return true;
1725 }
1726 
1727 bool AMDGPULegalizerInfo::legalizeFceil(
1728   MachineInstr &MI, MachineRegisterInfo &MRI,
1729   MachineIRBuilder &B) const {
1730 
1731   const LLT S1 = LLT::scalar(1);
1732   const LLT S64 = LLT::scalar(64);
1733 
1734   Register Src = MI.getOperand(1).getReg();
1735   assert(MRI.getType(Src) == S64);
1736 
1737   // result = trunc(src)
1738   // if (src > 0.0 && src != result)
1739   //   result += 1.0
1740 
1741   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
1742 
1743   const auto Zero = B.buildFConstant(S64, 0.0);
1744   const auto One = B.buildFConstant(S64, 1.0);
1745   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
1746   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
1747   auto And = B.buildAnd(S1, Lt0, NeTrunc);
1748   auto Add = B.buildSelect(S64, And, One, Zero);
1749 
1750   // TODO: Should this propagate fast-math-flags?
1751   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
1752   return true;
1753 }
1754 
1755 static MachineInstrBuilder extractF64Exponent(unsigned Hi,
1756                                               MachineIRBuilder &B) {
1757   const unsigned FractBits = 52;
1758   const unsigned ExpBits = 11;
1759   LLT S32 = LLT::scalar(32);
1760 
1761   auto Const0 = B.buildConstant(S32, FractBits - 32);
1762   auto Const1 = B.buildConstant(S32, ExpBits);
1763 
1764   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1765     .addUse(Const0.getReg(0))
1766     .addUse(Const1.getReg(0));
1767 
1768   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
1769 }
1770 
1771 bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
1772   MachineInstr &MI, MachineRegisterInfo &MRI,
1773   MachineIRBuilder &B) const {
1774   const LLT S1 = LLT::scalar(1);
1775   const LLT S32 = LLT::scalar(32);
1776   const LLT S64 = LLT::scalar(64);
1777 
1778   Register Src = MI.getOperand(1).getReg();
1779   assert(MRI.getType(Src) == S64);
1780 
1781   // TODO: Should this use extract since the low half is unused?
1782   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1783   Register Hi = Unmerge.getReg(1);
1784 
1785   // Extract the upper half, since this is where we will find the sign and
1786   // exponent.
1787   auto Exp = extractF64Exponent(Hi, B);
1788 
1789   const unsigned FractBits = 52;
1790 
1791   // Extract the sign bit.
1792   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
1793   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
1794 
1795   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
1796 
1797   const auto Zero32 = B.buildConstant(S32, 0);
1798 
1799   // Extend back to 64-bits.
1800   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
1801 
1802   auto Shr = B.buildAShr(S64, FractMask, Exp);
1803   auto Not = B.buildNot(S64, Shr);
1804   auto Tmp0 = B.buildAnd(S64, Src, Not);
1805   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
1806 
1807   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
1808   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
1809 
1810   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
1811   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1812   return true;
1813 }
1814 
1815 bool AMDGPULegalizerInfo::legalizeITOFP(
1816   MachineInstr &MI, MachineRegisterInfo &MRI,
1817   MachineIRBuilder &B, bool Signed) const {
1818 
1819   Register Dst = MI.getOperand(0).getReg();
1820   Register Src = MI.getOperand(1).getReg();
1821 
1822   const LLT S64 = LLT::scalar(64);
1823   const LLT S32 = LLT::scalar(32);
1824 
1825   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1826 
1827   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
1828 
1829   auto CvtHi = Signed ?
1830     B.buildSITOFP(S64, Unmerge.getReg(1)) :
1831     B.buildUITOFP(S64, Unmerge.getReg(1));
1832 
1833   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
1834 
1835   auto ThirtyTwo = B.buildConstant(S32, 32);
1836   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
1837     .addUse(CvtHi.getReg(0))
1838     .addUse(ThirtyTwo.getReg(0));
1839 
1840   // TODO: Should this propagate fast-math-flags?
1841   B.buildFAdd(Dst, LdExp, CvtLo);
1842   MI.eraseFromParent();
1843   return true;
1844 }
1845 
1846 // TODO: Copied from DAG implementation. Verify logic and document how this
1847 // actually works.
1848 bool AMDGPULegalizerInfo::legalizeFPTOI(
1849   MachineInstr &MI, MachineRegisterInfo &MRI,
1850   MachineIRBuilder &B, bool Signed) const {
1851 
1852   Register Dst = MI.getOperand(0).getReg();
1853   Register Src = MI.getOperand(1).getReg();
1854 
1855   const LLT S64 = LLT::scalar(64);
1856   const LLT S32 = LLT::scalar(32);
1857 
1858   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1859 
1860   unsigned Flags = MI.getFlags();
1861 
1862   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
1863   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1864   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
1865 
1866   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1867   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1868   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1869 
1870   auto Hi = Signed ?
1871     B.buildFPTOSI(S32, FloorMul) :
1872     B.buildFPTOUI(S32, FloorMul);
1873   auto Lo = B.buildFPTOUI(S32, Fma);
1874 
1875   B.buildMerge(Dst, { Lo, Hi });
1876   MI.eraseFromParent();
1877 
1878   return true;
1879 }
1880 
1881 bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
1882                                                MachineInstr &MI) const {
1883   MachineFunction &MF = Helper.MIRBuilder.getMF();
1884   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1885 
1886   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
1887                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
1888 
1889   // With ieee_mode disabled, the instructions have the correct behavior
1890   // already for G_FMINNUM/G_FMAXNUM
1891   if (!MFI->getMode().IEEE)
1892     return !IsIEEEOp;
1893 
1894   if (IsIEEEOp)
1895     return true;
1896 
1897   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
1898 }
1899 
1900 bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
1901   MachineInstr &MI, MachineRegisterInfo &MRI,
1902   MachineIRBuilder &B) const {
1903   // TODO: Should move some of this into LegalizerHelper.
1904 
1905   // TODO: Promote dynamic indexing of s16 to s32
1906 
1907   // FIXME: Artifact combiner probably should have replaced the truncated
1908   // constant before this, so we shouldn't need
1909   // getConstantVRegValWithLookThrough.
1910   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1911     MI.getOperand(2).getReg(), MRI);
1912   if (!IdxVal) // Dynamic case will be selected to register indexing.
1913     return true;
1914 
1915   Register Dst = MI.getOperand(0).getReg();
1916   Register Vec = MI.getOperand(1).getReg();
1917 
1918   LLT VecTy = MRI.getType(Vec);
1919   LLT EltTy = VecTy.getElementType();
1920   assert(EltTy == MRI.getType(Dst));
1921 
1922   if (IdxVal->Value < VecTy.getNumElements())
1923     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
1924   else
1925     B.buildUndef(Dst);
1926 
1927   MI.eraseFromParent();
1928   return true;
1929 }
1930 
1931 bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
1932   MachineInstr &MI, MachineRegisterInfo &MRI,
1933   MachineIRBuilder &B) const {
1934   // TODO: Should move some of this into LegalizerHelper.
1935 
1936   // TODO: Promote dynamic indexing of s16 to s32
1937 
1938   // FIXME: Artifact combiner probably should have replaced the truncated
1939   // constant before this, so we shouldn't need
1940   // getConstantVRegValWithLookThrough.
1941   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1942     MI.getOperand(3).getReg(), MRI);
1943   if (!IdxVal) // Dynamic case will be selected to register indexing.
1944     return true;
1945 
1946   Register Dst = MI.getOperand(0).getReg();
1947   Register Vec = MI.getOperand(1).getReg();
1948   Register Ins = MI.getOperand(2).getReg();
1949 
1950   LLT VecTy = MRI.getType(Vec);
1951   LLT EltTy = VecTy.getElementType();
1952   assert(EltTy == MRI.getType(Ins));
1953 
1954   if (IdxVal->Value < VecTy.getNumElements())
1955     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
1956   else
1957     B.buildUndef(Dst);
1958 
1959   MI.eraseFromParent();
1960   return true;
1961 }
1962 
1963 bool AMDGPULegalizerInfo::legalizeShuffleVector(
1964   MachineInstr &MI, MachineRegisterInfo &MRI,
1965   MachineIRBuilder &B) const {
1966   const LLT V2S16 = LLT::vector(2, 16);
1967 
1968   Register Dst = MI.getOperand(0).getReg();
1969   Register Src0 = MI.getOperand(1).getReg();
1970   LLT DstTy = MRI.getType(Dst);
1971   LLT SrcTy = MRI.getType(Src0);
1972 
1973   if (SrcTy == V2S16 && DstTy == V2S16 &&
1974       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
1975     return true;
1976 
1977   MachineIRBuilder HelperBuilder(MI);
1978   GISelObserverWrapper DummyObserver;
1979   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
1980   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
1981 }
1982 
1983 bool AMDGPULegalizerInfo::legalizeSinCos(
1984   MachineInstr &MI, MachineRegisterInfo &MRI,
1985   MachineIRBuilder &B) const {
1986 
1987   Register DstReg = MI.getOperand(0).getReg();
1988   Register SrcReg = MI.getOperand(1).getReg();
1989   LLT Ty = MRI.getType(DstReg);
1990   unsigned Flags = MI.getFlags();
1991 
1992   Register TrigVal;
1993   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
1994   if (ST.hasTrigReducedRange()) {
1995     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
1996     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
1997       .addUse(MulVal.getReg(0))
1998       .setMIFlags(Flags).getReg(0);
1999   } else
2000     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
2001 
2002   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
2003     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2004   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
2005     .addUse(TrigVal)
2006     .setMIFlags(Flags);
2007   MI.eraseFromParent();
2008   return true;
2009 }
2010 
2011 bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2012                                                   MachineIRBuilder &B,
2013                                                   const GlobalValue *GV,
2014                                                   int64_t Offset,
2015                                                   unsigned GAFlags) const {
2016   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
2017   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
2018   // to the following code sequence:
2019   //
2020   // For constant address space:
2021   //   s_getpc_b64 s[0:1]
2022   //   s_add_u32 s0, s0, $symbol
2023   //   s_addc_u32 s1, s1, 0
2024   //
2025   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2026   //   a fixup or relocation is emitted to replace $symbol with a literal
2027   //   constant, which is a pc-relative offset from the encoding of the $symbol
2028   //   operand to the global variable.
2029   //
2030   // For global address space:
2031   //   s_getpc_b64 s[0:1]
2032   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
2033   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
2034   //
2035   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
2036   //   fixups or relocations are emitted to replace $symbol@*@lo and
2037   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
2038   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
2039   //   operand to the global variable.
2040   //
2041   // What we want here is an offset from the value returned by s_getpc
2042   // (which is the address of the s_add_u32 instruction) to the global
2043   // variable, but since the encoding of $symbol starts 4 bytes after the start
2044   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
2045   // small. This requires us to add 4 to the global variable offset in order to
2046   // compute the correct address.
2047 
2048   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2049 
2050   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
2051     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
2052 
2053   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
2054     .addDef(PCReg);
2055 
2056   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
2057   if (GAFlags == SIInstrInfo::MO_NONE)
2058     MIB.addImm(0);
2059   else
2060     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
2061 
2062   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
2063 
2064   if (PtrTy.getSizeInBits() == 32)
2065     B.buildExtract(DstReg, PCReg, 0);
2066   return true;
2067  }
2068 
2069 bool AMDGPULegalizerInfo::legalizeGlobalValue(
2070   MachineInstr &MI, MachineRegisterInfo &MRI,
2071   MachineIRBuilder &B) const {
2072   Register DstReg = MI.getOperand(0).getReg();
2073   LLT Ty = MRI.getType(DstReg);
2074   unsigned AS = Ty.getAddressSpace();
2075 
2076   const GlobalValue *GV = MI.getOperand(1).getGlobal();
2077   MachineFunction &MF = B.getMF();
2078   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2079 
2080   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2081     if (!MFI->isEntryFunction()) {
2082       const Function &Fn = MF.getFunction();
2083       DiagnosticInfoUnsupported BadLDSDecl(
2084         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2085         DS_Warning);
2086       Fn.getContext().diagnose(BadLDSDecl);
2087 
2088       // We currently don't have a way to correctly allocate LDS objects that
2089       // aren't directly associated with a kernel. We do force inlining of
2090       // functions that use local objects. However, if these dead functions are
2091       // not eliminated, we don't want a compile time error. Just emit a warning
2092       // and a trap, since there should be no callable path here.
2093       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2094       B.buildUndef(DstReg);
2095       MI.eraseFromParent();
2096       return true;
2097     }
2098 
2099     // TODO: We could emit code to handle the initialization somewhere.
2100     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2101       const SITargetLowering *TLI = ST.getTargetLowering();
2102       if (!TLI->shouldUseLDSConstAddress(GV)) {
2103         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2104         return true; // Leave in place;
2105       }
2106 
2107       B.buildConstant(
2108           DstReg,
2109           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
2110       MI.eraseFromParent();
2111       return true;
2112     }
2113 
2114     const Function &Fn = MF.getFunction();
2115     DiagnosticInfoUnsupported BadInit(
2116       Fn, "unsupported initializer for address space", MI.getDebugLoc());
2117     Fn.getContext().diagnose(BadInit);
2118     return true;
2119   }
2120 
2121   const SITargetLowering *TLI = ST.getTargetLowering();
2122 
2123   if (TLI->shouldEmitFixup(GV)) {
2124     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
2125     MI.eraseFromParent();
2126     return true;
2127   }
2128 
2129   if (TLI->shouldEmitPCReloc(GV)) {
2130     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
2131     MI.eraseFromParent();
2132     return true;
2133   }
2134 
2135   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2136   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
2137 
2138   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
2139       MachinePointerInfo::getGOT(MF),
2140       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
2141           MachineMemOperand::MOInvariant,
2142       8 /*Size*/, Align(8));
2143 
2144   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
2145 
2146   if (Ty.getSizeInBits() == 32) {
2147     // Truncate if this is a 32-bit constant adrdess.
2148     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
2149     B.buildExtract(DstReg, Load, 0);
2150   } else
2151     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
2152 
2153   MI.eraseFromParent();
2154   return true;
2155 }
2156 
2157 bool AMDGPULegalizerInfo::legalizeLoad(
2158   MachineInstr &MI, MachineRegisterInfo &MRI,
2159   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
2160   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2161   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
2162   Observer.changingInstr(MI);
2163   MI.getOperand(1).setReg(Cast.getReg(0));
2164   Observer.changedInstr(MI);
2165   return true;
2166 }
2167 
2168 bool AMDGPULegalizerInfo::legalizeFMad(
2169   MachineInstr &MI, MachineRegisterInfo &MRI,
2170   MachineIRBuilder &B) const {
2171   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2172   assert(Ty.isScalar());
2173 
2174   MachineFunction &MF = B.getMF();
2175   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2176 
2177   // TODO: Always legal with future ftz flag.
2178   // FIXME: Do we need just output?
2179   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
2180     return true;
2181   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
2182     return true;
2183 
2184   MachineIRBuilder HelperBuilder(MI);
2185   GISelObserverWrapper DummyObserver;
2186   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
2187   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
2188 }
2189 
2190 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2191   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2192   Register DstReg = MI.getOperand(0).getReg();
2193   Register PtrReg = MI.getOperand(1).getReg();
2194   Register CmpVal = MI.getOperand(2).getReg();
2195   Register NewVal = MI.getOperand(3).getReg();
2196 
2197   assert(SITargetLowering::isFlatGlobalAddrSpace(
2198            MRI.getType(PtrReg).getAddressSpace()) &&
2199          "this should not have been custom lowered");
2200 
2201   LLT ValTy = MRI.getType(CmpVal);
2202   LLT VecTy = LLT::vector(2, ValTy);
2203 
2204   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2205 
2206   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2207     .addDef(DstReg)
2208     .addUse(PtrReg)
2209     .addUse(PackedVal)
2210     .setMemRefs(MI.memoperands());
2211 
2212   MI.eraseFromParent();
2213   return true;
2214 }
2215 
2216 bool AMDGPULegalizerInfo::legalizeFlog(
2217   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2218   Register Dst = MI.getOperand(0).getReg();
2219   Register Src = MI.getOperand(1).getReg();
2220   LLT Ty = B.getMRI()->getType(Dst);
2221   unsigned Flags = MI.getFlags();
2222 
2223   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2224   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2225 
2226   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2227   MI.eraseFromParent();
2228   return true;
2229 }
2230 
2231 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2232                                        MachineIRBuilder &B) const {
2233   Register Dst = MI.getOperand(0).getReg();
2234   Register Src = MI.getOperand(1).getReg();
2235   unsigned Flags = MI.getFlags();
2236   LLT Ty = B.getMRI()->getType(Dst);
2237 
2238   auto K = B.buildFConstant(Ty, numbers::log2e);
2239   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2240   B.buildFExp2(Dst, Mul, Flags);
2241   MI.eraseFromParent();
2242   return true;
2243 }
2244 
2245 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2246                                        MachineIRBuilder &B) const {
2247   Register Dst = MI.getOperand(0).getReg();
2248   Register Src0 = MI.getOperand(1).getReg();
2249   Register Src1 = MI.getOperand(2).getReg();
2250   unsigned Flags = MI.getFlags();
2251   LLT Ty = B.getMRI()->getType(Dst);
2252   const LLT S16 = LLT::scalar(16);
2253   const LLT S32 = LLT::scalar(32);
2254 
2255   if (Ty == S32) {
2256     auto Log = B.buildFLog2(S32, Src0, Flags);
2257     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2258       .addUse(Log.getReg(0))
2259       .addUse(Src1)
2260       .setMIFlags(Flags);
2261     B.buildFExp2(Dst, Mul, Flags);
2262   } else if (Ty == S16) {
2263     // There's no f16 fmul_legacy, so we need to convert for it.
2264     auto Log = B.buildFLog2(S16, Src0, Flags);
2265     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2266     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2267     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2268       .addUse(Ext0.getReg(0))
2269       .addUse(Ext1.getReg(0))
2270       .setMIFlags(Flags);
2271 
2272     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2273   } else
2274     return false;
2275 
2276   MI.eraseFromParent();
2277   return true;
2278 }
2279 
2280 // Find a source register, ignoring any possible source modifiers.
2281 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2282   Register ModSrc = OrigSrc;
2283   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2284     ModSrc = SrcFNeg->getOperand(1).getReg();
2285     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2286       ModSrc = SrcFAbs->getOperand(1).getReg();
2287   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2288     ModSrc = SrcFAbs->getOperand(1).getReg();
2289   return ModSrc;
2290 }
2291 
2292 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2293                                          MachineRegisterInfo &MRI,
2294                                          MachineIRBuilder &B) const {
2295 
2296   const LLT S1 = LLT::scalar(1);
2297   const LLT S64 = LLT::scalar(64);
2298   Register Dst = MI.getOperand(0).getReg();
2299   Register OrigSrc = MI.getOperand(1).getReg();
2300   unsigned Flags = MI.getFlags();
2301   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2302          "this should not have been custom lowered");
2303 
2304   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2305   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2306   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2307   // V_FRACT bug is:
2308   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2309   //
2310   // Convert floor(x) to (x - fract(x))
2311 
2312   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2313     .addUse(OrigSrc)
2314     .setMIFlags(Flags);
2315 
2316   // Give source modifier matching some assistance before obscuring a foldable
2317   // pattern.
2318 
2319   // TODO: We can avoid the neg on the fract? The input sign to fract
2320   // shouldn't matter?
2321   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2322 
2323   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2324 
2325   Register Min = MRI.createGenericVirtualRegister(S64);
2326 
2327   // We don't need to concern ourselves with the snan handling difference, so
2328   // use the one which will directly select.
2329   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2330   if (MFI->getMode().IEEE)
2331     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2332   else
2333     B.buildFMinNum(Min, Fract, Const, Flags);
2334 
2335   Register CorrectedFract = Min;
2336   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2337     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2338     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2339   }
2340 
2341   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2342   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2343 
2344   MI.eraseFromParent();
2345   return true;
2346 }
2347 
2348 // Turn an illegal packed v2s16 build vector into bit operations.
2349 // TODO: This should probably be a bitcast action in LegalizerHelper.
2350 bool AMDGPULegalizerInfo::legalizeBuildVector(
2351   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2352   Register Dst = MI.getOperand(0).getReg();
2353   const LLT S32 = LLT::scalar(32);
2354   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2355 
2356   Register Src0 = MI.getOperand(1).getReg();
2357   Register Src1 = MI.getOperand(2).getReg();
2358   assert(MRI.getType(Src0) == LLT::scalar(16));
2359 
2360   auto Merge = B.buildMerge(S32, {Src0, Src1});
2361   B.buildBitcast(Dst, Merge);
2362 
2363   MI.eraseFromParent();
2364   return true;
2365 }
2366 
2367 // Return the use branch instruction, otherwise null if the usage is invalid.
2368 static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2369                                        MachineRegisterInfo &MRI,
2370                                        MachineInstr *&Br,
2371                                        MachineBasicBlock *&UncondBrTarget) {
2372   Register CondDef = MI.getOperand(0).getReg();
2373   if (!MRI.hasOneNonDBGUse(CondDef))
2374     return nullptr;
2375 
2376   MachineBasicBlock *Parent = MI.getParent();
2377   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2378   if (UseMI.getParent() != Parent ||
2379       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2380     return nullptr;
2381 
2382   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2383   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2384   if (Next == Parent->end()) {
2385     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2386     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2387       return nullptr;
2388     UncondBrTarget = &*NextMBB;
2389   } else {
2390     if (Next->getOpcode() != AMDGPU::G_BR)
2391       return nullptr;
2392     Br = &*Next;
2393     UncondBrTarget = Br->getOperand(0).getMBB();
2394   }
2395 
2396   return &UseMI;
2397 }
2398 
2399 Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B,
2400                                                MachineRegisterInfo &MRI,
2401                                                Register LiveIn,
2402                                                Register PhyReg) const {
2403   assert(PhyReg.isPhysical() && "Physical register expected");
2404 
2405   // Insert the live-in copy, if required, by defining destination virtual
2406   // register.
2407   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
2408   if (!MRI.getVRegDef(LiveIn)) {
2409     // FIXME: Should have scoped insert pt
2410     MachineBasicBlock &OrigInsBB = B.getMBB();
2411     auto OrigInsPt = B.getInsertPt();
2412 
2413     MachineBasicBlock &EntryMBB = B.getMF().front();
2414     EntryMBB.addLiveIn(PhyReg);
2415     B.setInsertPt(EntryMBB, EntryMBB.begin());
2416     B.buildCopy(LiveIn, PhyReg);
2417 
2418     B.setInsertPt(OrigInsBB, OrigInsPt);
2419   }
2420 
2421   return LiveIn;
2422 }
2423 
2424 Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
2425                                                 MachineRegisterInfo &MRI,
2426                                                 Register PhyReg, LLT Ty,
2427                                                 bool InsertLiveInCopy) const {
2428   assert(PhyReg.isPhysical() && "Physical register expected");
2429 
2430   // Get or create virtual live-in regester
2431   Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
2432   if (!LiveIn) {
2433     LiveIn = MRI.createGenericVirtualRegister(Ty);
2434     MRI.addLiveIn(PhyReg, LiveIn);
2435   }
2436 
2437   // When the actual true copy required is from virtual register to physical
2438   // register (to be inserted later), live-in copy insertion from physical
2439   // to register virtual register is not required
2440   if (!InsertLiveInCopy)
2441     return LiveIn;
2442 
2443   return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
2444 }
2445 
2446 const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
2447     MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2448   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2449   const ArgDescriptor *Arg;
2450   const TargetRegisterClass *RC;
2451   LLT ArgTy;
2452   std::tie(Arg, RC, ArgTy) = MFI->getPreloadedValue(ArgType);
2453   if (!Arg) {
2454     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2455     return nullptr;
2456   }
2457   return Arg;
2458 }
2459 
2460 bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2461                                          const ArgDescriptor *Arg) const {
2462   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2463     return false; // TODO: Handle these
2464 
2465   Register SrcReg = Arg->getRegister();
2466   assert(SrcReg.isPhysical() && "Physical register expected");
2467   assert(DstReg.isVirtual() && "Virtual register expected");
2468 
2469   MachineRegisterInfo &MRI = *B.getMRI();
2470 
2471   LLT Ty = MRI.getType(DstReg);
2472   Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);
2473 
2474   if (Arg->isMasked()) {
2475     // TODO: Should we try to emit this once in the entry block?
2476     const LLT S32 = LLT::scalar(32);
2477     const unsigned Mask = Arg->getMask();
2478     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
2479 
2480     Register AndMaskSrc = LiveIn;
2481 
2482     if (Shift != 0) {
2483       auto ShiftAmt = B.buildConstant(S32, Shift);
2484       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
2485     }
2486 
2487     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2488   } else {
2489     B.buildCopy(DstReg, LiveIn);
2490   }
2491 
2492   return true;
2493 }
2494 
2495 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2496     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
2497     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2498 
2499   const ArgDescriptor *Arg = getArgDescriptor(B, ArgType);
2500   if (!Arg)
2501     return false;
2502 
2503   if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg))
2504     return false;
2505 
2506   MI.eraseFromParent();
2507   return true;
2508 }
2509 
2510 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
2511                                        MachineRegisterInfo &MRI,
2512                                        MachineIRBuilder &B) const {
2513   Register Dst = MI.getOperand(0).getReg();
2514   LLT DstTy = MRI.getType(Dst);
2515   LLT S16 = LLT::scalar(16);
2516   LLT S32 = LLT::scalar(32);
2517   LLT S64 = LLT::scalar(64);
2518 
2519   if (legalizeFastUnsafeFDIV(MI, MRI, B))
2520     return true;
2521 
2522   if (DstTy == S16)
2523     return legalizeFDIV16(MI, MRI, B);
2524   if (DstTy == S32)
2525     return legalizeFDIV32(MI, MRI, B);
2526   if (DstTy == S64)
2527     return legalizeFDIV64(MI, MRI, B);
2528 
2529   return false;
2530 }
2531 
2532 void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2533                                                   Register DstReg,
2534                                                   Register X,
2535                                                   Register Y,
2536                                                   bool IsDiv) const {
2537   const LLT S1 = LLT::scalar(1);
2538   const LLT S32 = LLT::scalar(32);
2539 
2540   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2541   // algorithm used here.
2542 
2543   // Initial estimate of inv(y).
2544   auto FloatY = B.buildUITOFP(S32, Y);
2545   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2546   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2547   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2548   auto Z = B.buildFPTOUI(S32, ScaledY);
2549 
2550   // One round of UNR.
2551   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2552   auto NegYZ = B.buildMul(S32, NegY, Z);
2553   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2554 
2555   // Quotient/remainder estimate.
2556   auto Q = B.buildUMulH(S32, X, Z);
2557   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2558 
2559   // First quotient/remainder refinement.
2560   auto One = B.buildConstant(S32, 1);
2561   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2562   if (IsDiv)
2563     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2564   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2565 
2566   // Second quotient/remainder refinement.
2567   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2568   if (IsDiv)
2569     B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
2570   else
2571     B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
2572 }
2573 
2574 bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2575                                               MachineRegisterInfo &MRI,
2576                                               MachineIRBuilder &B) const {
2577   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2578   Register DstReg = MI.getOperand(0).getReg();
2579   Register Num = MI.getOperand(1).getReg();
2580   Register Den = MI.getOperand(2).getReg();
2581   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2582   MI.eraseFromParent();
2583   return true;
2584 }
2585 
2586 // Build integer reciprocal sequence arounud V_RCP_IFLAG_F32
2587 //
2588 // Return lo, hi of result
2589 //
2590 // %cvt.lo = G_UITOFP Val.lo
2591 // %cvt.hi = G_UITOFP Val.hi
2592 // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2593 // %rcp = G_AMDGPU_RCP_IFLAG %mad
2594 // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2595 // %mul2 = G_FMUL %mul1, 2**(-32)
2596 // %trunc = G_INTRINSIC_TRUNC %mul2
2597 // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2598 // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2599 static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2600                                                        Register Val) {
2601   const LLT S32 = LLT::scalar(32);
2602   auto Unmerge = B.buildUnmerge(S32, Val);
2603 
2604   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2605   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2606 
2607   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2608                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2609 
2610   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2611   auto Mul1 =
2612       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2613 
2614   // 2**(-32)
2615   auto Mul2 =
2616       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2617   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2618 
2619   // -(2**32)
2620   auto Mad2 = B.buildFMAD(S32, Trunc,
2621                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2622 
2623   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2624   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2625 
2626   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2627 }
2628 
/// Emit the expansion of a 64-bit unsigned division or remainder of \p Numer
/// by \p Denom. The quotient (if \p IsDiv) or the remainder (otherwise) is
/// written to \p DstReg.
///
/// The sequence starts from the reciprocal estimate produced by
/// emitReciprocalU64, refines it twice, forms a quotient estimate with a
/// high multiply and then applies up to two conditional corrections. All
/// candidate values are computed unconditionally and the final result is
/// picked with selects.
void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
                                                  Register DstReg,
                                                  Register Numer,
                                                  Register Denom,
                                                  bool IsDiv) const {
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S1 = LLT::scalar(1);
  Register RcpLo, RcpHi;

  // Initial reciprocal estimate, as separate 32-bit halves and merged.
  std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);

  auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});

  auto Zero64 = B.buildConstant(S64, 0);
  auto NegDenom = B.buildSub(S64, Zero64, Denom);

  // First refinement: Add1 = Rcp + umulh(Rcp, -Denom * Rcp), done as a
  // 32-bit add with carry chain.
  auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
  auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);

  auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
  Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
  Register MulHi1_Hi = UnmergeMulHi1.getReg(1);

  auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
  auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
  // High-half sum without the carry from the low half; that carry is fed
  // into Add2_HiC below instead.
  auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
  auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});

  // Second refinement: Add2 = Add1 + umulh(Add1, -Denom * Add1).
  auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
  auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
  auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
  Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
  Register MulHi2_Hi = UnmergeMulHi2.getReg(1);

  auto Zero32 = B.buildConstant(S32, 0);
  auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
  // The high half is formed in two steps: first add the carry out of
  // Add1_Lo, then the carry out of Add2_Lo.
  auto Add2_HiC =
      B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
  auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
  auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});

  auto UnmergeNumer = B.buildUnmerge(S32, Numer);
  Register NumerLo = UnmergeNumer.getReg(0);
  Register NumerHi = UnmergeNumer.getReg(1);

  // Quotient estimate MulHi3 = umulh(Numer, Add2) and the matching remainder
  // estimate Sub1 = Numer - Denom * MulHi3.
  auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
  auto Mul3 = B.buildMul(S64, Denom, MulHi3);
  auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
  Register Mul3_Lo = UnmergeMul3.getReg(0);
  Register Mul3_Hi = UnmergeMul3.getReg(1);
  auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
  auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
  // High-half difference without the borrow from the low half; the borrow
  // is applied when Sub2_Mi is computed.
  auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
  auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});

  auto UnmergeDenom = B.buildUnmerge(S32, Denom);
  Register DenomLo = UnmergeDenom.getReg(0);
  Register DenomHi = UnmergeDenom.getReg(1);

  // C3 is nonzero (a sign-extended i1) when Sub1 >= Denom as an unsigned
  // 64-bit compare: the low-half result is used when the high halves are
  // equal, the high-half result otherwise.
  auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
  auto C1 = B.buildSExt(S32, CmpHi);

  auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
  auto C2 = B.buildSExt(S32, CmpLo);

  auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
  auto C3 = B.buildSelect(S32, CmpEq, C2, C1);

  // TODO: Here and below portions of the code can be enclosed into if/endif.
  // Currently control flow is unconditional and we have 4 selects after
  // potential endif to substitute PHIs.

  // if C3 != 0 ...
  // First correction: remainder Sub2 = Sub1 - Denom, quotient Add3 =
  // MulHi3 + 1.
  auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
  auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
  auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
  auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});

  auto One64 = B.buildConstant(S64, 1);
  auto Add3 = B.buildAdd(S64, MulHi3, One64);

  // C6 is the same unsigned >= test as C3, applied to Sub2.
  auto C4 =
      B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
  auto C5 =
      B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
  auto C6 = B.buildSelect(
      S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);

  // if (C6 != 0)
  // Second correction: remainder Sub3 = Sub2 - Denom, quotient Add4 =
  // Add3 + 1.
  auto Add4 = B.buildAdd(S64, Add3, One64);
  auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);

  auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
  auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
  auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});

  // endif C6
  // endif C3

  // Pick the result: the uncorrected value when C3 is zero, otherwise the
  // once- or twice-corrected value depending on C6.
  if (IsDiv) {
    auto Sel1 = B.buildSelect(
        S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
    B.buildSelect(DstReg,
                  B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
  } else {
    auto Sel2 = B.buildSelect(
        S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
    B.buildSelect(DstReg,
                  B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
  }
}
2741 
2742 bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2743                                             MachineRegisterInfo &MRI,
2744                                             MachineIRBuilder &B) const {
2745   const LLT S64 = LLT::scalar(64);
2746   const LLT S32 = LLT::scalar(32);
2747   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2748   Register DstReg = MI.getOperand(0).getReg();
2749   Register Num = MI.getOperand(1).getReg();
2750   Register Den = MI.getOperand(2).getReg();
2751   LLT Ty = MRI.getType(DstReg);
2752 
2753   if (Ty == S32)
2754     legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2755   else if (Ty == S64)
2756     legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
2757   else
2758     return false;
2759 
2760   MI.eraseFromParent();
2761   return true;
2762 
2763 }
2764 
2765 bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2766                                             MachineRegisterInfo &MRI,
2767                                             MachineIRBuilder &B) const {
2768   const LLT S64 = LLT::scalar(64);
2769   const LLT S32 = LLT::scalar(32);
2770 
2771   Register DstReg = MI.getOperand(0).getReg();
2772   const LLT Ty = MRI.getType(DstReg);
2773   if (Ty != S32 && Ty != S64)
2774     return false;
2775 
2776   const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
2777 
2778   Register LHS = MI.getOperand(1).getReg();
2779   Register RHS = MI.getOperand(2).getReg();
2780 
2781   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
2782   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
2783   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
2784 
2785   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
2786   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
2787 
2788   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
2789   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
2790 
2791   Register UDivRem = MRI.createGenericVirtualRegister(Ty);
2792   if (Ty == S32)
2793     legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
2794   else
2795     legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
2796 
2797   Register Sign;
2798   if (IsDiv)
2799     Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
2800   else
2801     Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
2802 
2803   UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
2804   B.buildSub(DstReg, UDivRem, Sign);
2805 
2806   MI.eraseFromParent();
2807   return true;
2808 }
2809 
2810 bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
2811                                                  MachineRegisterInfo &MRI,
2812                                                  MachineIRBuilder &B) const {
2813   Register Res = MI.getOperand(0).getReg();
2814   Register LHS = MI.getOperand(1).getReg();
2815   Register RHS = MI.getOperand(2).getReg();
2816 
2817   uint16_t Flags = MI.getFlags();
2818 
2819   LLT ResTy = MRI.getType(Res);
2820   LLT S32 = LLT::scalar(32);
2821   LLT S64 = LLT::scalar(64);
2822 
2823   const MachineFunction &MF = B.getMF();
2824   bool Unsafe =
2825     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
2826 
2827   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
2828     return false;
2829 
2830   if (!Unsafe && ResTy == S32 &&
2831       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
2832     return false;
2833 
2834   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
2835     // 1 / x -> RCP(x)
2836     if (CLHS->isExactlyValue(1.0)) {
2837       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2838         .addUse(RHS)
2839         .setMIFlags(Flags);
2840 
2841       MI.eraseFromParent();
2842       return true;
2843     }
2844 
2845     // -1 / x -> RCP( FNEG(x) )
2846     if (CLHS->isExactlyValue(-1.0)) {
2847       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
2848       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
2849         .addUse(FNeg.getReg(0))
2850         .setMIFlags(Flags);
2851 
2852       MI.eraseFromParent();
2853       return true;
2854     }
2855   }
2856 
2857   // x / y -> x * (1.0 / y)
2858   if (Unsafe) {
2859     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
2860       .addUse(RHS)
2861       .setMIFlags(Flags);
2862     B.buildFMul(Res, LHS, RCP, Flags);
2863 
2864     MI.eraseFromParent();
2865     return true;
2866   }
2867 
2868   return false;
2869 }
2870 
2871 bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2872                                          MachineRegisterInfo &MRI,
2873                                          MachineIRBuilder &B) const {
2874   Register Res = MI.getOperand(0).getReg();
2875   Register LHS = MI.getOperand(1).getReg();
2876   Register RHS = MI.getOperand(2).getReg();
2877 
2878   uint16_t Flags = MI.getFlags();
2879 
2880   LLT S16 = LLT::scalar(16);
2881   LLT S32 = LLT::scalar(32);
2882 
2883   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2884   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2885 
2886   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2887     .addUse(RHSExt.getReg(0))
2888     .setMIFlags(Flags);
2889 
2890   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2891   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2892 
2893   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2894     .addUse(RDst.getReg(0))
2895     .addUse(RHS)
2896     .addUse(LHS)
2897     .setMIFlags(Flags);
2898 
2899   MI.eraseFromParent();
2900   return true;
2901 }
2902 
2903 // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
2904 // to enable denorm mode. When 'Enable' is false, disable denorm mode.
2905 static void toggleSPDenormMode(bool Enable,
2906                                MachineIRBuilder &B,
2907                                const GCNSubtarget &ST,
2908                                AMDGPU::SIModeRegisterDefaults Mode) {
2909   // Set SP denorm mode to this value.
2910   unsigned SPDenormMode =
2911     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
2912 
2913   if (ST.hasDenormModeInst()) {
2914     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2915     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
2916 
2917     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2918     B.buildInstr(AMDGPU::S_DENORM_MODE)
2919       .addImm(NewDenormModeValue);
2920 
2921   } else {
2922     // Select FP32 bit field in mode register.
2923     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2924                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2925                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2926 
2927     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2928       .addImm(SPDenormMode)
2929       .addImm(SPDenormModeBitField);
2930   }
2931 }
2932 
/// Expand a 32-bit floating-point division using the amdgcn div_scale /
/// rcp / div_fmas / div_fixup intrinsics and a chain of FMAs.
///
/// If the function's mode does not already enable all FP32 denormals, they
/// are switched on around the FMA chain and switched back afterwards. \p MI
/// is erased.
///
/// \returns true unconditionally.
bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();

  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S32, 1.0f);

  // Two div_scale calls on the same operands; only the trailing immediate
  // (0 vs. 1) differs. Each produces an s32 value and an s1 flag.
  auto DenominatorScaled =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
      .addUse(LHS)
      .addUse(RHS)
      .addImm(0)
      .setMIFlags(Flags);
  auto NumeratorScaled =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
      .addUse(LHS)
      .addUse(RHS)
      .addImm(1)
      .setMIFlags(Flags);

  // Reciprocal approximation of the scaled denominator, and its negation for
  // use in the error terms below.
  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(DenominatorScaled.getReg(0))
    .setMIFlags(Flags);
  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);

  // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
  // aren't modeled as reading it.
  if (!Mode.allFP32Denormals())
    toggleSPDenormMode(true, B, ST, Mode);

  // Fma0/Fma1 refine the reciprocal; Mul is the first quotient estimate;
  // Fma2/Fma3 refine the quotient; Fma4 is the residual of the refined
  // quotient.
  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

  // Put the FP32 denormal mode back to the function's default.
  if (!Mode.allFP32Denormals())
    toggleSPDenormMode(false, B, ST, Mode);

  // div_fmas consumes the s1 flag produced by the second div_scale.
  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
    .addUse(Fma4.getReg(0))
    .addUse(Fma1.getReg(0))
    .addUse(Fma3.getReg(0))
    .addUse(NumeratorScaled.getReg(1))
    .setMIFlags(Flags);

  // div_fixup receives the original (unscaled) operands and defines Res.
  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
    .addUse(Fmas.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}
2998 
/// Expand a 64-bit floating-point division using the amdgcn div_scale /
/// rcp / div_fmas / div_fixup intrinsics and a chain of FMAs.
///
/// On subtargets where the condition output of div_scale is not usable, the
/// flag passed to div_fmas is recomputed from the high halves of the operands
/// and the scaled values. \p MI is erased.
///
/// \returns true unconditionally.
bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S64 = LLT::scalar(64);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S64, 1.0);

  // First div_scale (immediate 0); its value feeds the reciprocal below.
  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
    .addUse(LHS)
    .addUse(RHS)
    .addImm(0)
    .setMIFlags(Flags);

  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);

  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
    .addUse(DivScale0.getReg(0))
    .setMIFlags(Flags);

  // Refinement steps on the reciprocal approximation.
  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);

  // Second div_scale (immediate 1); its value is multiplied by the refined
  // reciprocal to form the quotient estimate.
  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
    .addUse(LHS)
    .addUse(RHS)
    .addImm(1)
    .setMIFlags(Flags);

  // Fma3 is the refined reciprocal, Mul the quotient estimate and Fma4 its
  // residual.
  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);

  Register Scale;
  if (!ST.hasUsableDivScaleConditionOutput()) {
    // Workaround a hardware bug on SI where the condition output from div_scale
    // is not usable.

    LLT S32 = LLT::scalar(32);

    auto NumUnmerge = B.buildUnmerge(S32, LHS);
    auto DenUnmerge = B.buildUnmerge(S32, RHS);
    auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
    auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);

    // Compare the high 32 bits of each operand with the high 32 bits of the
    // corresponding div_scale result; the flag is the XOR of the two
    // equality tests.
    auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
                              Scale1Unmerge.getReg(1));
    auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
                              Scale0Unmerge.getReg(1));
    Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
  } else {
    // Use the s1 output of the second div_scale directly.
    Scale = DivScale1.getReg(1);
  }

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
    .addUse(Fma4.getReg(0))
    .addUse(Fma3.getReg(0))
    .addUse(Mul.getReg(0))
    .addUse(Scale)
    .setMIFlags(Flags);

  // div_fixup receives the original (unscaled) operands and defines Res.
  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
    .addUse(Fmas.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}
3076 
3077 bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
3078                                                  MachineRegisterInfo &MRI,
3079                                                  MachineIRBuilder &B) const {
3080   Register Res = MI.getOperand(0).getReg();
3081   Register LHS = MI.getOperand(2).getReg();
3082   Register RHS = MI.getOperand(3).getReg();
3083   uint16_t Flags = MI.getFlags();
3084 
3085   LLT S32 = LLT::scalar(32);
3086   LLT S1 = LLT::scalar(1);
3087 
3088   auto Abs = B.buildFAbs(S32, RHS, Flags);
3089   const APFloat C0Val(1.0f);
3090 
3091   auto C0 = B.buildConstant(S32, 0x6f800000);
3092   auto C1 = B.buildConstant(S32, 0x2f800000);
3093   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
3094 
3095   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
3096   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
3097 
3098   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
3099 
3100   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3101     .addUse(Mul0.getReg(0))
3102     .setMIFlags(Flags);
3103 
3104   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
3105 
3106   B.buildFMul(Res, Sel, Mul1, Flags);
3107 
3108   MI.eraseFromParent();
3109   return true;
3110 }
3111 
3112 bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
3113                                                  MachineRegisterInfo &MRI,
3114                                                  MachineIRBuilder &B) const {
3115   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3116   if (!MFI->isEntryFunction()) {
3117     return legalizePreloadedArgIntrin(MI, MRI, B,
3118                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3119   }
3120 
3121   uint64_t Offset =
3122     ST.getTargetLowering()->getImplicitParameterOffset(
3123       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3124   Register DstReg = MI.getOperand(0).getReg();
3125   LLT DstTy = MRI.getType(DstReg);
3126   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3127 
3128   const ArgDescriptor *Arg;
3129   const TargetRegisterClass *RC;
3130   LLT ArgTy;
3131   std::tie(Arg, RC, ArgTy) =
3132       MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
3133   if (!Arg)
3134     return false;
3135 
3136   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3137   if (!loadInputValue(KernargPtrReg, B, Arg))
3138     return false;
3139 
3140   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3141   MI.eraseFromParent();
3142   return true;
3143 }
3144 
3145 bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
3146                                               MachineRegisterInfo &MRI,
3147                                               MachineIRBuilder &B,
3148                                               unsigned AddrSpace) const {
3149   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3150   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
3151   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
3152   MI.eraseFromParent();
3153   return true;
3154 }
3155 
3156 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3157 // offset (the offset that is included in bounds checking and swizzling, to be
3158 // split between the instruction's voffset and immoffset fields) and soffset
3159 // (the offset that is excluded from bounds checking and swizzling, to go in
3160 // the instruction's soffset field).  This function takes the first kind of
3161 // offset and figures out how to split it between voffset and immoffset.
3162 std::tuple<Register, unsigned, unsigned>
3163 AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3164                                         Register OrigOffset) const {
3165   const unsigned MaxImm = 4095;
3166   Register BaseReg;
3167   unsigned TotalConstOffset;
3168   MachineInstr *OffsetDef;
3169   const LLT S32 = LLT::scalar(32);
3170 
3171   std::tie(BaseReg, TotalConstOffset, OffsetDef)
3172     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
3173 
3174   unsigned ImmOffset = TotalConstOffset;
3175 
3176   // If the immediate value is too big for the immoffset field, put the value
3177   // and -4096 into the immoffset field so that the value that is copied/added
3178   // for the voffset field is a multiple of 4096, and it stands more chance
3179   // of being CSEd with the copy/add for another similar load/store.
3180   // However, do not do that rounding down to a multiple of 4096 if that is a
3181   // negative number, as it appears to be illegal to have a negative offset
3182   // in the vgpr, even if adding the immediate offset makes it positive.
3183   unsigned Overflow = ImmOffset & ~MaxImm;
3184   ImmOffset -= Overflow;
3185   if ((int32_t)Overflow < 0) {
3186     Overflow += ImmOffset;
3187     ImmOffset = 0;
3188   }
3189 
3190   if (Overflow != 0) {
3191     if (!BaseReg) {
3192       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3193     } else {
3194       auto OverflowVal = B.buildConstant(S32, Overflow);
3195       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3196     }
3197   }
3198 
3199   if (!BaseReg)
3200     BaseReg = B.buildConstant(S32, 0).getReg(0);
3201 
3202   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3203 }
3204 
3205 /// Handle register layout difference for f16 images for some subtargets.
3206 Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
3207                                              MachineRegisterInfo &MRI,
3208                                              Register Reg) const {
3209   if (!ST.hasUnpackedD16VMem())
3210     return Reg;
3211 
3212   const LLT S16 = LLT::scalar(16);
3213   const LLT S32 = LLT::scalar(32);
3214   LLT StoreVT = MRI.getType(Reg);
3215   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
3216 
3217   auto Unmerge = B.buildUnmerge(S16, Reg);
3218 
3219   SmallVector<Register, 4> WideRegs;
3220   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3221     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
3222 
3223   int NumElts = StoreVT.getNumElements();
3224 
3225   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
3226 }
3227 
3228 Register AMDGPULegalizerInfo::fixStoreSourceType(
3229   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3230   MachineRegisterInfo *MRI = B.getMRI();
3231   LLT Ty = MRI->getType(VData);
3232 
3233   const LLT S16 = LLT::scalar(16);
3234 
3235   // Fixup illegal register types for i8 stores.
3236   if (Ty == LLT::scalar(8) || Ty == S16) {
3237     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3238     return AnyExt;
3239   }
3240 
3241   if (Ty.isVector()) {
3242     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
3243       if (IsFormat)
3244         return handleD16VData(B, *MRI, VData);
3245     }
3246   }
3247 
3248   return VData;
3249 }
3250 
3251 bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3252                                               MachineRegisterInfo &MRI,
3253                                               MachineIRBuilder &B,
3254                                               bool IsTyped,
3255                                               bool IsFormat) const {
3256   Register VData = MI.getOperand(1).getReg();
3257   LLT Ty = MRI.getType(VData);
3258   LLT EltTy = Ty.getScalarType();
3259   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3260   const LLT S32 = LLT::scalar(32);
3261 
3262   VData = fixStoreSourceType(B, VData, IsFormat);
3263   Register RSrc = MI.getOperand(2).getReg();
3264 
3265   MachineMemOperand *MMO = *MI.memoperands_begin();
3266   const int MemSize = MMO->getSize();
3267 
3268   unsigned ImmOffset;
3269   unsigned TotalOffset;
3270 
3271   // The typed intrinsics add an immediate after the registers.
3272   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3273 
3274   // The struct intrinsic variants add one additional operand over raw.
3275   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3276   Register VIndex;
3277   int OpOffset = 0;
3278   if (HasVIndex) {
3279     VIndex = MI.getOperand(3).getReg();
3280     OpOffset = 1;
3281   }
3282 
3283   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3284   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3285 
3286   unsigned Format = 0;
3287   if (IsTyped) {
3288     Format = MI.getOperand(5 + OpOffset).getImm();
3289     ++OpOffset;
3290   }
3291 
3292   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3293 
3294   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3295   if (TotalOffset != 0)
3296     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3297 
3298   unsigned Opc;
3299   if (IsTyped) {
3300     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3301                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3302   } else if (IsFormat) {
3303     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3304                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3305   } else {
3306     switch (MemSize) {
3307     case 1:
3308       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3309       break;
3310     case 2:
3311       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3312       break;
3313     default:
3314       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3315       break;
3316     }
3317   }
3318 
3319   if (!VIndex)
3320     VIndex = B.buildConstant(S32, 0).getReg(0);
3321 
3322   auto MIB = B.buildInstr(Opc)
3323     .addUse(VData)              // vdata
3324     .addUse(RSrc)               // rsrc
3325     .addUse(VIndex)             // vindex
3326     .addUse(VOffset)            // voffset
3327     .addUse(SOffset)            // soffset
3328     .addImm(ImmOffset);         // offset(imm)
3329 
3330   if (IsTyped)
3331     MIB.addImm(Format);
3332 
3333   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3334      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3335      .addMemOperand(MMO);
3336 
3337   MI.eraseFromParent();
3338   return true;
3339 }
3340 
3341 bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3342                                              MachineRegisterInfo &MRI,
3343                                              MachineIRBuilder &B,
3344                                              bool IsFormat,
3345                                              bool IsTyped) const {
3346   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3347   MachineMemOperand *MMO = *MI.memoperands_begin();
3348   const int MemSize = MMO->getSize();
3349   const LLT S32 = LLT::scalar(32);
3350 
3351   Register Dst = MI.getOperand(0).getReg();
3352   Register RSrc = MI.getOperand(2).getReg();
3353 
3354   // The typed intrinsics add an immediate after the registers.
3355   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3356 
3357   // The struct intrinsic variants add one additional operand over raw.
3358   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3359   Register VIndex;
3360   int OpOffset = 0;
3361   if (HasVIndex) {
3362     VIndex = MI.getOperand(3).getReg();
3363     OpOffset = 1;
3364   }
3365 
3366   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3367   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3368 
3369   unsigned Format = 0;
3370   if (IsTyped) {
3371     Format = MI.getOperand(5 + OpOffset).getImm();
3372     ++OpOffset;
3373   }
3374 
3375   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3376   unsigned ImmOffset;
3377   unsigned TotalOffset;
3378 
3379   LLT Ty = MRI.getType(Dst);
3380   LLT EltTy = Ty.getScalarType();
3381   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3382   const bool Unpacked = ST.hasUnpackedD16VMem();
3383 
3384   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3385   if (TotalOffset != 0)
3386     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3387 
3388   unsigned Opc;
3389 
3390   if (IsTyped) {
3391     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3392                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3393   } else if (IsFormat) {
3394     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3395                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3396   } else {
3397     switch (MemSize) {
3398     case 1:
3399       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3400       break;
3401     case 2:
3402       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3403       break;
3404     default:
3405       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3406       break;
3407     }
3408   }
3409 
3410   Register LoadDstReg;
3411 
3412   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3413   LLT UnpackedTy = Ty.changeElementSize(32);
3414 
3415   if (IsExtLoad)
3416     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3417   else if (Unpacked && IsD16 && Ty.isVector())
3418     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3419   else
3420     LoadDstReg = Dst;
3421 
3422   if (!VIndex)
3423     VIndex = B.buildConstant(S32, 0).getReg(0);
3424 
3425   auto MIB = B.buildInstr(Opc)
3426     .addDef(LoadDstReg)         // vdata
3427     .addUse(RSrc)               // rsrc
3428     .addUse(VIndex)             // vindex
3429     .addUse(VOffset)            // voffset
3430     .addUse(SOffset)            // soffset
3431     .addImm(ImmOffset);         // offset(imm)
3432 
3433   if (IsTyped)
3434     MIB.addImm(Format);
3435 
3436   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3437      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3438      .addMemOperand(MMO);
3439 
3440   if (LoadDstReg != Dst) {
3441     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3442 
3443     // Widen result for extending loads was widened.
3444     if (IsExtLoad)
3445       B.buildTrunc(Dst, LoadDstReg);
3446     else {
3447       // Repack to original 16-bit vector result
3448       // FIXME: G_TRUNC should work, but legalization currently fails
3449       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3450       SmallVector<Register, 4> Repack;
3451       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3452         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3453       B.buildMerge(Dst, Repack);
3454     }
3455   }
3456 
3457   MI.eraseFromParent();
3458   return true;
3459 }
3460 
3461 bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3462                                                MachineIRBuilder &B,
3463                                                bool IsInc) const {
3464   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3465                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3466   B.buildInstr(Opc)
3467     .addDef(MI.getOperand(0).getReg())
3468     .addUse(MI.getOperand(2).getReg())
3469     .addUse(MI.getOperand(3).getReg())
3470     .cloneMemRefs(MI);
3471   MI.eraseFromParent();
3472   return true;
3473 }
3474 
3475 static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3476   switch (IntrID) {
3477   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3478   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3479     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3480   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3481   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3482     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3483   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3484   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3485     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3486   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3487   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3488     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3489   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3490   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3491     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3492   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3493   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3494     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3495   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3496   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3497     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3498   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3499   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3500     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3501   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3502   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3503     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3504   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3505   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3506     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3507   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3508   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3509     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3510   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3511   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3512     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3513   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3514   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3515     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3516   default:
3517     llvm_unreachable("unhandled atomic opcode");
3518   }
3519 }
3520 
3521 bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3522                                                MachineIRBuilder &B,
3523                                                Intrinsic::ID IID) const {
3524   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3525                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3526 
3527   Register Dst = MI.getOperand(0).getReg();
3528   Register VData = MI.getOperand(2).getReg();
3529 
3530   Register CmpVal;
3531   int OpOffset = 0;
3532 
3533   if (IsCmpSwap) {
3534     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3535     ++OpOffset;
3536   }
3537 
3538   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3539   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
3540 
3541   // The struct intrinsic variants add one additional operand over raw.
3542   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3543   Register VIndex;
3544   if (HasVIndex) {
3545     VIndex = MI.getOperand(4 + OpOffset).getReg();
3546     ++OpOffset;
3547   }
3548 
3549   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3550   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3551   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3552 
3553   MachineMemOperand *MMO = *MI.memoperands_begin();
3554 
3555   unsigned ImmOffset;
3556   unsigned TotalOffset;
3557   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3558   if (TotalOffset != 0)
3559     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3560 
3561   if (!VIndex)
3562     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3563 
3564   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
3565     .addDef(Dst)
3566     .addUse(VData); // vdata
3567 
3568   if (IsCmpSwap)
3569     MIB.addReg(CmpVal);
3570 
3571   MIB.addUse(RSrc)               // rsrc
3572      .addUse(VIndex)             // vindex
3573      .addUse(VOffset)            // voffset
3574      .addUse(SOffset)            // soffset
3575      .addImm(ImmOffset)          // offset(imm)
3576      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3577      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3578      .addMemOperand(MMO);
3579 
3580   MI.eraseFromParent();
3581   return true;
3582 }
3583 
3584 /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized
3585 /// vector with s16 typed elements.
3586 static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
3587                                         SmallVectorImpl<Register> &PackedAddrs,
3588                                         int AddrIdx, int DimIdx, int EndIdx,
3589                                         int NumGradients) {
3590   const LLT S16 = LLT::scalar(16);
3591   const LLT V2S16 = LLT::vector(2, 16);
3592 
3593   for (int I = AddrIdx; I < EndIdx; ++I) {
3594     MachineOperand &SrcOp = MI.getOperand(I);
3595     if (!SrcOp.isReg())
3596       continue; // _L to _LZ may have eliminated this.
3597 
3598     Register AddrReg = SrcOp.getReg();
3599 
3600     if (I < DimIdx) {
3601       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
3602       PackedAddrs.push_back(AddrReg);
3603     } else {
3604       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
3605       // derivatives dx/dh and dx/dv are packed with undef.
3606       if (((I + 1) >= EndIdx) ||
3607           ((NumGradients / 2) % 2 == 1 &&
3608            (I == DimIdx + (NumGradients / 2) - 1 ||
3609             I == DimIdx + NumGradients - 1)) ||
3610           // Check for _L to _LZ optimization
3611           !MI.getOperand(I + 1).isReg()) {
3612         PackedAddrs.push_back(
3613             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3614                 .getReg(0));
3615       } else {
3616         PackedAddrs.push_back(
3617             B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
3618                 .getReg(0));
3619         ++I;
3620       }
3621     }
3622   }
3623 }
3624 
3625 /// Convert from separate vaddr components to a single vector address register,
3626 /// and replace the remaining operands with $noreg.
3627 static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3628                                      int DimIdx, int NumVAddrs) {
3629   const LLT S32 = LLT::scalar(32);
3630 
3631   SmallVector<Register, 8> AddrRegs;
3632   for (int I = 0; I != NumVAddrs; ++I) {
3633     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3634     if (SrcOp.isReg()) {
3635       AddrRegs.push_back(SrcOp.getReg());
3636       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3637     }
3638   }
3639 
3640   int NumAddrRegs = AddrRegs.size();
3641   if (NumAddrRegs != 1) {
3642     // Round up to 8 elements for v5-v7
3643     // FIXME: Missing intermediate sized register classes and instructions.
3644     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3645       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3646       auto Undef = B.buildUndef(S32);
3647       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3648       NumAddrRegs = RoundedNumRegs;
3649     }
3650 
3651     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3652     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3653   }
3654 
3655   for (int I = 1; I != NumVAddrs; ++I) {
3656     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3657     if (SrcOp.isReg())
3658       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3659   }
3660 }
3661 
3662 /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3663 ///
3664 /// Depending on the subtarget, load/store with 16-bit element data need to be
3665 /// rewritten to use the low half of 32-bit registers, or directly use a packed
3666 /// layout. 16-bit addresses should also sometimes be packed into 32-bit
3667 /// registers.
3668 ///
3669 /// We don't want to directly select image instructions just yet, but also want
3670 /// to exposes all register repacking to the legalizer/combiners. We also don't
3671 /// want a selected instrution entering RegBankSelect. In order to avoid
3672 /// defining a multitude of intermediate image instructions, directly hack on
3673 /// the intrinsic's arguments. In cases like a16 addreses, this requires padding
3674 /// now unnecessary arguments with $noreg.
3675 bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3676     MachineInstr &MI, MachineIRBuilder &B,
3677     GISelChangeObserver &Observer,
3678     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3679 
3680   const int NumDefs = MI.getNumExplicitDefs();
3681   bool IsTFE = NumDefs == 2;
3682   // We are only processing the operands of d16 image operations on subtargets
3683   // that use the unpacked register layout, or need to repack the TFE result.
3684 
3685   // TODO: Do we need to guard against already legalized intrinsics?
3686   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3687     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3688 
3689   MachineRegisterInfo *MRI = B.getMRI();
3690   const LLT S32 = LLT::scalar(32);
3691   const LLT S16 = LLT::scalar(16);
3692   const LLT V2S16 = LLT::vector(2, 16);
3693 
3694   // Index of first address argument
3695   const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
3696 
3697   int NumVAddrs, NumGradients;
3698   std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
3699   const int DMaskIdx = BaseOpcode->Atomic ? -1 :
3700     getDMaskIdx(BaseOpcode, NumDefs);
3701   unsigned DMask = 0;
3702 
3703   // Check for 16 bit addresses and pack if true.
3704   int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
3705   LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg());
3706   LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg());
3707   const bool IsG16 = GradTy == S16;
3708   const bool IsA16 = AddrTy == S16;
3709 
3710   int DMaskLanes = 0;
3711   if (!BaseOpcode->Atomic) {
3712     DMask = MI.getOperand(DMaskIdx).getImm();
3713     if (BaseOpcode->Gather4) {
3714       DMaskLanes = 4;
3715     } else if (DMask != 0) {
3716       DMaskLanes = countPopulation(DMask);
3717     } else if (!IsTFE && !BaseOpcode->Store) {
3718       // If dmask is 0, this is a no-op load. This can be eliminated.
3719       B.buildUndef(MI.getOperand(0));
3720       MI.eraseFromParent();
3721       return true;
3722     }
3723   }
3724 
3725   Observer.changingInstr(MI);
3726   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
3727 
3728   unsigned NewOpcode = NumDefs == 0 ?
3729     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
3730 
3731   // Track that we legalized this
3732   MI.setDesc(B.getTII().get(NewOpcode));
3733 
3734   // Expecting to get an error flag since TFC is on - and dmask is 0 Force
3735   // dmask to be at least 1 otherwise the instruction will fail
3736   if (IsTFE && DMask == 0) {
3737     DMask = 0x1;
3738     DMaskLanes = 1;
3739     MI.getOperand(DMaskIdx).setImm(DMask);
3740   }
3741 
3742   if (BaseOpcode->Atomic) {
3743     Register VData0 = MI.getOperand(2).getReg();
3744     LLT Ty = MRI->getType(VData0);
3745 
3746     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
3747     if (Ty.isVector())
3748       return false;
3749 
3750     if (BaseOpcode->AtomicX2) {
3751       Register VData1 = MI.getOperand(3).getReg();
3752       // The two values are packed in one register.
3753       LLT PackedTy = LLT::vector(2, Ty);
3754       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
3755       MI.getOperand(2).setReg(Concat.getReg(0));
3756       MI.getOperand(3).setReg(AMDGPU::NoRegister);
3757     }
3758   }
3759 
3760   int CorrectedNumVAddrs = NumVAddrs;
3761 
3762   // Optimize _L to _LZ when _L is zero
3763   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
3764         AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
3765     const ConstantFP *ConstantLod;
3766     const int LodIdx = AddrIdx + NumVAddrs - 1;
3767 
3768     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
3769       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
3770         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
3771         ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
3772           LZMappingInfo->LZ, ImageDimIntr->Dim);
3773 
3774         // The starting indexes should remain in the same place.
3775         --NumVAddrs;
3776         --CorrectedNumVAddrs;
3777 
3778         MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
3779           static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
3780         MI.RemoveOperand(LodIdx);
3781       }
3782     }
3783   }
3784 
3785   // Optimize _mip away, when 'lod' is zero
3786   if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
3787     int64_t ConstantLod;
3788     const int LodIdx = AddrIdx + NumVAddrs - 1;
3789 
3790     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
3791       if (ConstantLod == 0) {
3792         // TODO: Change intrinsic opcode and remove operand instead or replacing
3793         // it with 0, as the _L to _LZ handling is done above.
3794         MI.getOperand(LodIdx).ChangeToImmediate(0);
3795         --CorrectedNumVAddrs;
3796       }
3797     }
3798   }
3799 
3800   // Rewrite the addressing register layout before doing anything else.
3801   if (IsA16 || IsG16) {
3802     if (IsA16) {
3803       // Target must support the feature and gradients need to be 16 bit too
3804       if (!ST.hasA16() || !IsG16)
3805         return false;
3806     } else if (!ST.hasG16())
3807       return false;
3808 
3809     if (NumVAddrs > 1) {
3810       SmallVector<Register, 4> PackedRegs;
3811       // Don't compress addresses for G16
3812       const int PackEndIdx =
3813           IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients);
3814       packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx,
3815                                   PackEndIdx, NumGradients);
3816 
3817       if (!IsA16) {
3818         // Add uncompressed address
3819         for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) {
3820           int AddrReg = MI.getOperand(I).getReg();
3821           assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
3822           PackedRegs.push_back(AddrReg);
3823         }
3824       }
3825 
3826       // See also below in the non-a16 branch
3827       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
3828 
3829       if (!UseNSA && PackedRegs.size() > 1) {
3830         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
3831         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
3832         PackedRegs[0] = Concat.getReg(0);
3833         PackedRegs.resize(1);
3834       }
3835 
3836       const int NumPacked = PackedRegs.size();
3837       for (int I = 0; I != NumVAddrs; ++I) {
3838         MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
3839         if (!SrcOp.isReg()) {
3840           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
3841           continue;
3842         }
3843 
3844         assert(SrcOp.getReg() != AMDGPU::NoRegister);
3845 
3846         if (I < NumPacked)
3847           SrcOp.setReg(PackedRegs[I]);
3848         else
3849           SrcOp.setReg(AMDGPU::NoRegister);
3850       }
3851     }
3852   } else {
3853     // If the register allocator cannot place the address registers contiguously
3854     // without introducing moves, then using the non-sequential address encoding
3855     // is always preferable, since it saves VALU instructions and is usually a
3856     // wash in terms of code size or even better.
3857     //
3858     // However, we currently have no way of hinting to the register allocator
3859     // that MIMG addresses should be placed contiguously when it is possible to
3860     // do so, so force non-NSA for the common 2-address case as a heuristic.
3861     //
3862     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
3863     // allocation when possible.
3864     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
3865 
3866     if (!UseNSA && NumVAddrs > 1)
3867       convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
3868   }
3869 
3870   int Flags = 0;
3871   if (IsA16)
3872     Flags |= 1;
3873   if (IsG16)
3874     Flags |= 2;
3875   MI.addOperand(MachineOperand::CreateImm(Flags));
3876 
3877   if (BaseOpcode->Store) { // No TFE for stores?
3878     // TODO: Handle dmask trim
3879     Register VData = MI.getOperand(1).getReg();
3880     LLT Ty = MRI->getType(VData);
3881     if (!Ty.isVector() || Ty.getElementType() != S16)
3882       return true;
3883 
3884     Register RepackedReg = handleD16VData(B, *MRI, VData);
3885     if (RepackedReg != VData) {
3886       MI.getOperand(1).setReg(RepackedReg);
3887     }
3888 
3889     return true;
3890   }
3891 
3892   Register DstReg = MI.getOperand(0).getReg();
3893   LLT Ty = MRI->getType(DstReg);
3894   const LLT EltTy = Ty.getScalarType();
3895   const bool IsD16 = Ty.getScalarType() == S16;
3896   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
3897 
3898   // Confirm that the return type is large enough for the dmask specified
3899   if (NumElts < DMaskLanes)
3900     return false;
3901 
3902   if (NumElts > 4 || DMaskLanes > 4)
3903     return false;
3904 
3905   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
3906   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
3907 
3908   // The raw dword aligned data component of the load. The only legal cases
3909   // where this matters should be when using the packed D16 format, for
3910   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
3911   LLT RoundedTy;
3912 
3913   // S32 vector to to cover all data, plus TFE result element.
3914   LLT TFETy;
3915 
3916   // Register type to use for each loaded component. Will be S32 or V2S16.
3917   LLT RegTy;
3918 
3919   if (IsD16 && ST.hasUnpackedD16VMem()) {
3920     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
3921     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
3922     RegTy = S32;
3923   } else {
3924     unsigned EltSize = EltTy.getSizeInBits();
3925     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
3926     unsigned RoundedSize = 32 * RoundedElts;
3927     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3928     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3929     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
3930   }
3931 
3932   // The return type does not need adjustment.
3933   // TODO: Should we change s16 case to s32 or <2 x s16>?
3934   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
3935     return true;
3936 
3937   Register Dst1Reg;
3938 
3939   // Insert after the instruction.
3940   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3941 
3942   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
3943   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
3944   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
3945   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
3946 
3947   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
3948 
3949   MI.getOperand(0).setReg(NewResultReg);
3950 
3951   // In the IR, TFE is supposed to be used with a 2 element struct return
3952   // type. The intruction really returns these two values in one contiguous
3953   // register, with one additional dword beyond the loaded data. Rewrite the
3954   // return type to use a single register result.
3955 
3956   if (IsTFE) {
3957     Dst1Reg = MI.getOperand(1).getReg();
3958     if (MRI->getType(Dst1Reg) != S32)
3959       return false;
3960 
3961     // TODO: Make sure the TFE operand bit is set.
3962     MI.RemoveOperand(1);
3963 
3964     // Handle the easy case that requires no repack instructions.
3965     if (Ty == S32) {
3966       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
3967       return true;
3968     }
3969   }
3970 
3971   // Now figure out how to copy the new result register back into the old
3972   // result.
3973   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
3974 
3975   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
3976 
3977   if (ResultNumRegs == 1) {
3978     assert(!IsTFE);
3979     ResultRegs[0] = NewResultReg;
3980   } else {
3981     // We have to repack into a new vector of some kind.
3982     for (int I = 0; I != NumDataRegs; ++I)
3983       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
3984     B.buildUnmerge(ResultRegs, NewResultReg);
3985 
3986     // Drop the final TFE element to get the data part. The TFE result is
3987     // directly written to the right place already.
3988     if (IsTFE)
3989       ResultRegs.resize(NumDataRegs);
3990   }
3991 
3992   // For an s16 scalar result, we form an s32 result with a truncate regardless
3993   // of packed vs. unpacked.
3994   if (IsD16 && !Ty.isVector()) {
3995     B.buildTrunc(DstReg, ResultRegs[0]);
3996     return true;
3997   }
3998 
3999   // Avoid a build/concat_vector of 1 entry.
4000   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4001     B.buildBitcast(DstReg, ResultRegs[0]);
4002     return true;
4003   }
4004 
4005   assert(Ty.isVector());
4006 
4007   if (IsD16) {
4008     // For packed D16 results with TFE enabled, all the data components are
4009     // S32. Cast back to the expected type.
4010     //
4011     // TODO: We don't really need to use load s32 elements. We would only need one
4012     // cast for the TFE result if a multiple of v2s16 was used.
4013     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4014       for (Register &Reg : ResultRegs)
4015         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4016     } else if (ST.hasUnpackedD16VMem()) {
4017       for (Register &Reg : ResultRegs)
4018         Reg = B.buildTrunc(S16, Reg).getReg(0);
4019     }
4020   }
4021 
4022   auto padWithUndef = [&](LLT Ty, int NumElts) {
4023     if (NumElts == 0)
4024       return;
4025     Register Undef = B.buildUndef(Ty).getReg(0);
4026     for (int I = 0; I != NumElts; ++I)
4027       ResultRegs.push_back(Undef);
4028   };
4029 
4030   // Pad out any elements eliminated due to the dmask.
4031   LLT ResTy = MRI->getType(ResultRegs[0]);
4032   if (!ResTy.isVector()) {
4033     padWithUndef(ResTy, NumElts - ResultRegs.size());
4034     B.buildBuildVector(DstReg, ResultRegs);
4035     return true;
4036   }
4037 
4038   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4039   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4040 
4041   // Deal with the one annoying legal case.
4042   const LLT V3S16 = LLT::vector(3, 16);
4043   if (Ty == V3S16) {
4044     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4045     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4046     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4047     return true;
4048   }
4049 
4050   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4051   B.buildConcatVectors(DstReg, ResultRegs);
4052   return true;
4053 }
4054 
4055 bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4056   MachineInstr &MI, MachineIRBuilder &B,
4057   GISelChangeObserver &Observer) const {
4058   Register Dst = MI.getOperand(0).getReg();
4059   LLT Ty = B.getMRI()->getType(Dst);
4060   unsigned Size = Ty.getSizeInBits();
4061   MachineFunction &MF = B.getMF();
4062 
4063   Observer.changingInstr(MI);
4064 
4065   // FIXME: We don't really need this intermediate instruction. The intrinsic
4066   // should be fixed to have a memory operand. Since it's readnone, we're not
4067   // allowed to add one.
4068   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4069   MI.RemoveOperand(1); // Remove intrinsic ID
4070 
4071   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4072   // TODO: Should this use datalayout alignment?
4073   const unsigned MemSize = (Size + 7) / 8;
4074   const Align MemAlign(4);
4075   MachineMemOperand *MMO = MF.getMachineMemOperand(
4076       MachinePointerInfo(),
4077       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4078           MachineMemOperand::MOInvariant,
4079       MemSize, MemAlign);
4080   MI.addMemOperand(MF, MMO);
4081 
4082   // There are no 96-bit result scalar loads, but widening to 128-bit should
4083   // always be legal. We may need to restore this to a 96-bit result if it turns
4084   // out this needs to be converted to a vector load during RegBankSelect.
4085   if (!isPowerOf2_32(Size)) {
4086     LegalizerHelper Helper(MF, *this, Observer, B);
4087 
4088     if (Ty.isVector())
4089       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4090     else
4091       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4092   }
4093 
4094   Observer.changedInstr(MI);
4095   return true;
4096 }
4097 
4098 bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
4099                                                 MachineRegisterInfo &MRI,
4100                                                 MachineIRBuilder &B) const {
4101   // Is non-HSA path or trap-handler disabled? then, insert s_endpgm instruction
4102   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4103       !ST.isTrapHandlerEnabled()) {
4104     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4105   } else {
4106     // Pass queue pointer to trap handler as input, and insert trap instruction
4107     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4108     const ArgDescriptor *Arg =
4109         getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
4110     if (!Arg)
4111       return false;
4112     MachineRegisterInfo &MRI = *B.getMRI();
4113     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4114     Register LiveIn = getLiveInRegister(
4115         B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
4116         /*InsertLiveInCopy=*/false);
4117     if (!loadInputValue(LiveIn, B, Arg))
4118       return false;
4119     B.buildCopy(SGPR01, LiveIn);
4120     B.buildInstr(AMDGPU::S_TRAP)
4121         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4122         .addReg(SGPR01, RegState::Implicit);
4123   }
4124 
4125   MI.eraseFromParent();
4126   return true;
4127 }
4128 
4129 bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4130     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4131   // Is non-HSA path or trap-handler disabled? then, report a warning
4132   // accordingly
4133   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4134       !ST.isTrapHandlerEnabled()) {
4135     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4136                                      "debugtrap handler not supported",
4137                                      MI.getDebugLoc(), DS_Warning);
4138     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4139     Ctx.diagnose(NoTrap);
4140   } else {
4141     // Insert debug-trap instruction
4142     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4143   }
4144 
4145   MI.eraseFromParent();
4146   return true;
4147 }
4148 
4149 bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4150                                             MachineInstr &MI) const {
4151   MachineIRBuilder &B = Helper.MIRBuilder;
4152   MachineRegisterInfo &MRI = *B.getMRI();
4153 
4154   // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
4155   auto IntrID = MI.getIntrinsicID();
4156   switch (IntrID) {
4157   case Intrinsic::amdgcn_if:
4158   case Intrinsic::amdgcn_else: {
4159     MachineInstr *Br = nullptr;
4160     MachineBasicBlock *UncondBrTarget = nullptr;
4161     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4162       const SIRegisterInfo *TRI
4163         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4164 
4165       Register Def = MI.getOperand(1).getReg();
4166       Register Use = MI.getOperand(3).getReg();
4167 
4168       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4169       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4170       if (IntrID == Intrinsic::amdgcn_if) {
4171         B.buildInstr(AMDGPU::SI_IF)
4172           .addDef(Def)
4173           .addUse(Use)
4174           .addMBB(UncondBrTarget);
4175       } else {
4176         B.buildInstr(AMDGPU::SI_ELSE)
4177           .addDef(Def)
4178           .addUse(Use)
4179           .addMBB(UncondBrTarget)
4180           .addImm(0);
4181       }
4182 
4183       if (Br) {
4184         Br->getOperand(0).setMBB(CondBrTarget);
4185       } else {
4186         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4187         // since we're swapping branch targets it needs to be reinserted.
4188         // FIXME: IRTranslator should probably not do this
4189         B.buildBr(*CondBrTarget);
4190       }
4191 
4192       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
4193       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
4194       MI.eraseFromParent();
4195       BrCond->eraseFromParent();
4196       return true;
4197     }
4198 
4199     return false;
4200   }
4201   case Intrinsic::amdgcn_loop: {
4202     MachineInstr *Br = nullptr;
4203     MachineBasicBlock *UncondBrTarget = nullptr;
4204     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
4205       const SIRegisterInfo *TRI
4206         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
4207 
4208       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4209       Register Reg = MI.getOperand(2).getReg();
4210 
4211       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4212       B.buildInstr(AMDGPU::SI_LOOP)
4213         .addUse(Reg)
4214         .addMBB(UncondBrTarget);
4215 
4216       if (Br)
4217         Br->getOperand(0).setMBB(CondBrTarget);
4218       else
4219         B.buildBr(*CondBrTarget);
4220 
4221       MI.eraseFromParent();
4222       BrCond->eraseFromParent();
4223       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
4224       return true;
4225     }
4226 
4227     return false;
4228   }
4229   case Intrinsic::amdgcn_kernarg_segment_ptr:
4230     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4231       // This only makes sense to call in a kernel, so just lower to null.
4232       B.buildConstant(MI.getOperand(0).getReg(), 0);
4233       MI.eraseFromParent();
4234       return true;
4235     }
4236 
4237     return legalizePreloadedArgIntrin(
4238       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
4239   case Intrinsic::amdgcn_implicitarg_ptr:
4240     return legalizeImplicitArgPtr(MI, MRI, B);
4241   case Intrinsic::amdgcn_workitem_id_x:
4242     return legalizePreloadedArgIntrin(MI, MRI, B,
4243                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
4244   case Intrinsic::amdgcn_workitem_id_y:
4245     return legalizePreloadedArgIntrin(MI, MRI, B,
4246                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
4247   case Intrinsic::amdgcn_workitem_id_z:
4248     return legalizePreloadedArgIntrin(MI, MRI, B,
4249                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
4250   case Intrinsic::amdgcn_workgroup_id_x:
4251     return legalizePreloadedArgIntrin(MI, MRI, B,
4252                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
4253   case Intrinsic::amdgcn_workgroup_id_y:
4254     return legalizePreloadedArgIntrin(MI, MRI, B,
4255                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
4256   case Intrinsic::amdgcn_workgroup_id_z:
4257     return legalizePreloadedArgIntrin(MI, MRI, B,
4258                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
4259   case Intrinsic::amdgcn_dispatch_ptr:
4260     return legalizePreloadedArgIntrin(MI, MRI, B,
4261                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
4262   case Intrinsic::amdgcn_queue_ptr:
4263     return legalizePreloadedArgIntrin(MI, MRI, B,
4264                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
4265   case Intrinsic::amdgcn_implicit_buffer_ptr:
4266     return legalizePreloadedArgIntrin(
4267       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
4268   case Intrinsic::amdgcn_dispatch_id:
4269     return legalizePreloadedArgIntrin(MI, MRI, B,
4270                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
4271   case Intrinsic::amdgcn_fdiv_fast:
4272     return legalizeFDIVFastIntrin(MI, MRI, B);
4273   case Intrinsic::amdgcn_is_shared:
4274     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
4275   case Intrinsic::amdgcn_is_private:
4276     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
4277   case Intrinsic::amdgcn_wavefrontsize: {
4278     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
4279     MI.eraseFromParent();
4280     return true;
4281   }
4282   case Intrinsic::amdgcn_s_buffer_load:
4283     return legalizeSBufferLoad(MI, B, Helper.Observer);
4284   case Intrinsic::amdgcn_raw_buffer_store:
4285   case Intrinsic::amdgcn_struct_buffer_store:
4286     return legalizeBufferStore(MI, MRI, B, false, false);
4287   case Intrinsic::amdgcn_raw_buffer_store_format:
4288   case Intrinsic::amdgcn_struct_buffer_store_format:
4289     return legalizeBufferStore(MI, MRI, B, false, true);
4290   case Intrinsic::amdgcn_raw_tbuffer_store:
4291   case Intrinsic::amdgcn_struct_tbuffer_store:
4292     return legalizeBufferStore(MI, MRI, B, true, true);
4293   case Intrinsic::amdgcn_raw_buffer_load:
4294   case Intrinsic::amdgcn_struct_buffer_load:
4295     return legalizeBufferLoad(MI, MRI, B, false, false);
4296   case Intrinsic::amdgcn_raw_buffer_load_format:
4297   case Intrinsic::amdgcn_struct_buffer_load_format:
4298     return legalizeBufferLoad(MI, MRI, B, true, false);
4299   case Intrinsic::amdgcn_raw_tbuffer_load:
4300   case Intrinsic::amdgcn_struct_tbuffer_load:
4301     return legalizeBufferLoad(MI, MRI, B, true, true);
4302   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4303   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4304   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4305   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4306   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4307   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4308   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4309   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4310   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4311   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4312   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4313   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4314   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4315   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4316   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4317   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4318   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4319   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4320   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4321   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4322   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4323   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4324   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4325   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4326   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4327   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4328     return legalizeBufferAtomic(MI, B, IntrID);
4329   case Intrinsic::amdgcn_atomic_inc:
4330     return legalizeAtomicIncDec(MI, B, true);
4331   case Intrinsic::amdgcn_atomic_dec:
4332     return legalizeAtomicIncDec(MI, B, false);
4333   case Intrinsic::trap:
4334     return legalizeTrapIntrinsic(MI, MRI, B);
4335   case Intrinsic::debugtrap:
4336     return legalizeDebugTrapIntrinsic(MI, MRI, B);
4337   default: {
4338     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
4339             AMDGPU::getImageDimIntrinsicInfo(IntrID))
4340       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
4341     return true;
4342   }
4343   }
4344 
4345   return true;
4346 }
4347