xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (revision 5ffd83dbcc34f10e07f6d3e968ae6365869615f4)
10b57cec5SDimitry Andric //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
20b57cec5SDimitry Andric //
30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
60b57cec5SDimitry Andric //
70b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
80b57cec5SDimitry Andric /// \file
90b57cec5SDimitry Andric /// This file implements the targeting of the Machinelegalizer class for
100b57cec5SDimitry Andric /// AMDGPU.
110b57cec5SDimitry Andric /// \todo This should be generated by TableGen.
120b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
130b57cec5SDimitry Andric 
14*5ffd83dbSDimitry Andric #include "AMDGPULegalizerInfo.h"
158bcb0991SDimitry Andric 
160b57cec5SDimitry Andric #include "AMDGPU.h"
17*5ffd83dbSDimitry Andric #include "AMDGPUGlobalISelUtils.h"
180b57cec5SDimitry Andric #include "AMDGPUTargetMachine.h"
190b57cec5SDimitry Andric #include "SIMachineFunctionInfo.h"
20*5ffd83dbSDimitry Andric #include "llvm/ADT/ScopeExit.h"
210b57cec5SDimitry Andric #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
220b57cec5SDimitry Andric #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
23*5ffd83dbSDimitry Andric #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
240b57cec5SDimitry Andric #include "llvm/CodeGen/TargetOpcodes.h"
250b57cec5SDimitry Andric #include "llvm/CodeGen/ValueTypes.h"
260b57cec5SDimitry Andric #include "llvm/IR/DerivedTypes.h"
278bcb0991SDimitry Andric #include "llvm/IR/DiagnosticInfo.h"
280b57cec5SDimitry Andric #include "llvm/IR/Type.h"
290b57cec5SDimitry Andric #include "llvm/Support/Debug.h"
300b57cec5SDimitry Andric 
310b57cec5SDimitry Andric #define DEBUG_TYPE "amdgpu-legalinfo"
320b57cec5SDimitry Andric 
330b57cec5SDimitry Andric using namespace llvm;
340b57cec5SDimitry Andric using namespace LegalizeActions;
350b57cec5SDimitry Andric using namespace LegalizeMutations;
360b57cec5SDimitry Andric using namespace LegalityPredicates;
37*5ffd83dbSDimitry Andric using namespace MIPatternMatch;
380b57cec5SDimitry Andric 
39*5ffd83dbSDimitry Andric // Hack until load/store selection patterns support any tuple of legal types.
40*5ffd83dbSDimitry Andric static cl::opt<bool> EnableNewLegality(
41*5ffd83dbSDimitry Andric   "amdgpu-global-isel-new-legality",
42*5ffd83dbSDimitry Andric   cl::desc("Use GlobalISel desired legality, rather than try to use"
43*5ffd83dbSDimitry Andric            "rules compatible with selection patterns"),
44*5ffd83dbSDimitry Andric   cl::init(false),
45*5ffd83dbSDimitry Andric   cl::ReallyHidden);
460b57cec5SDimitry Andric 
47*5ffd83dbSDimitry Andric static constexpr unsigned MaxRegisterSize = 1024;
48*5ffd83dbSDimitry Andric 
49*5ffd83dbSDimitry Andric // Round the number of elements to the next power of two elements
50*5ffd83dbSDimitry Andric static LLT getPow2VectorType(LLT Ty) {
51*5ffd83dbSDimitry Andric   unsigned NElts = Ty.getNumElements();
52*5ffd83dbSDimitry Andric   unsigned Pow2NElts = 1 <<  Log2_32_Ceil(NElts);
53*5ffd83dbSDimitry Andric   return Ty.changeNumElements(Pow2NElts);
540b57cec5SDimitry Andric }
550b57cec5SDimitry Andric 
56*5ffd83dbSDimitry Andric // Round the number of bits to the next power of two bits
57*5ffd83dbSDimitry Andric static LLT getPow2ScalarType(LLT Ty) {
58*5ffd83dbSDimitry Andric   unsigned Bits = Ty.getSizeInBits();
59*5ffd83dbSDimitry Andric   unsigned Pow2Bits = 1 <<  Log2_32_Ceil(Bits);
60*5ffd83dbSDimitry Andric   return LLT::scalar(Pow2Bits);
618bcb0991SDimitry Andric }
628bcb0991SDimitry Andric 
630b57cec5SDimitry Andric static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
640b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
650b57cec5SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
660b57cec5SDimitry Andric     return Ty.isVector() &&
670b57cec5SDimitry Andric            Ty.getNumElements() % 2 != 0 &&
688bcb0991SDimitry Andric            Ty.getElementType().getSizeInBits() < 32 &&
698bcb0991SDimitry Andric            Ty.getSizeInBits() % 32 != 0;
708bcb0991SDimitry Andric   };
718bcb0991SDimitry Andric }
728bcb0991SDimitry Andric 
738bcb0991SDimitry Andric static LegalityPredicate isWideVec16(unsigned TypeIdx) {
748bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
758bcb0991SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
768bcb0991SDimitry Andric     const LLT EltTy = Ty.getScalarType();
778bcb0991SDimitry Andric     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
780b57cec5SDimitry Andric   };
790b57cec5SDimitry Andric }
800b57cec5SDimitry Andric 
810b57cec5SDimitry Andric static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
820b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
830b57cec5SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
840b57cec5SDimitry Andric     const LLT EltTy = Ty.getElementType();
850b57cec5SDimitry Andric     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
860b57cec5SDimitry Andric   };
870b57cec5SDimitry Andric }
880b57cec5SDimitry Andric 
890b57cec5SDimitry Andric static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
900b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
910b57cec5SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
920b57cec5SDimitry Andric     const LLT EltTy = Ty.getElementType();
930b57cec5SDimitry Andric     unsigned Size = Ty.getSizeInBits();
940b57cec5SDimitry Andric     unsigned Pieces = (Size + 63) / 64;
950b57cec5SDimitry Andric     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
960b57cec5SDimitry Andric     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
970b57cec5SDimitry Andric   };
980b57cec5SDimitry Andric }
990b57cec5SDimitry Andric 
1008bcb0991SDimitry Andric // Increase the number of vector elements to reach the next multiple of 32-bit
1018bcb0991SDimitry Andric // type.
1028bcb0991SDimitry Andric static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
1038bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
1048bcb0991SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
1058bcb0991SDimitry Andric 
1068bcb0991SDimitry Andric     const LLT EltTy = Ty.getElementType();
1078bcb0991SDimitry Andric     const int Size = Ty.getSizeInBits();
1088bcb0991SDimitry Andric     const int EltSize = EltTy.getSizeInBits();
1098bcb0991SDimitry Andric     const int NextMul32 = (Size + 31) / 32;
1108bcb0991SDimitry Andric 
1118bcb0991SDimitry Andric     assert(EltSize < 32);
1128bcb0991SDimitry Andric 
1138bcb0991SDimitry Andric     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
1148bcb0991SDimitry Andric     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
1158bcb0991SDimitry Andric   };
1168bcb0991SDimitry Andric }
1178bcb0991SDimitry Andric 
118*5ffd83dbSDimitry Andric static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
119*5ffd83dbSDimitry Andric   return [=](const LegalityQuery &Query) {
120*5ffd83dbSDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
121*5ffd83dbSDimitry Andric     unsigned Size = Ty.getSizeInBits();
122*5ffd83dbSDimitry Andric 
123*5ffd83dbSDimitry Andric     LLT CoercedTy;
124*5ffd83dbSDimitry Andric     if (Size <= 32) {
125*5ffd83dbSDimitry Andric       // <2 x s8> -> s16
126*5ffd83dbSDimitry Andric       // <4 x s8> -> s32
127*5ffd83dbSDimitry Andric       CoercedTy = LLT::scalar(Size);
128*5ffd83dbSDimitry Andric     } else
129*5ffd83dbSDimitry Andric       CoercedTy = LLT::scalarOrVector(Size / 32, 32);
130*5ffd83dbSDimitry Andric 
131*5ffd83dbSDimitry Andric     return std::make_pair(TypeIdx, CoercedTy);
132*5ffd83dbSDimitry Andric   };
133*5ffd83dbSDimitry Andric }
134*5ffd83dbSDimitry Andric 
1358bcb0991SDimitry Andric static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
1368bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
1378bcb0991SDimitry Andric     const LLT QueryTy = Query.Types[TypeIdx];
1388bcb0991SDimitry Andric     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
1398bcb0991SDimitry Andric   };
1408bcb0991SDimitry Andric }
1418bcb0991SDimitry Andric 
1420b57cec5SDimitry Andric static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
1430b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
1440b57cec5SDimitry Andric     const LLT QueryTy = Query.Types[TypeIdx];
1450b57cec5SDimitry Andric     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
1460b57cec5SDimitry Andric   };
1470b57cec5SDimitry Andric }
1480b57cec5SDimitry Andric 
1490b57cec5SDimitry Andric static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
1500b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
1510b57cec5SDimitry Andric     const LLT QueryTy = Query.Types[TypeIdx];
1520b57cec5SDimitry Andric     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
1530b57cec5SDimitry Andric   };
1540b57cec5SDimitry Andric }
1550b57cec5SDimitry Andric 
156*5ffd83dbSDimitry Andric static bool isRegisterSize(unsigned Size) {
157*5ffd83dbSDimitry Andric   return Size % 32 == 0 && Size <= MaxRegisterSize;
158*5ffd83dbSDimitry Andric }
159*5ffd83dbSDimitry Andric 
160*5ffd83dbSDimitry Andric static bool isRegisterVectorElementType(LLT EltTy) {
161*5ffd83dbSDimitry Andric   const int EltSize = EltTy.getSizeInBits();
162*5ffd83dbSDimitry Andric   return EltSize == 16 || EltSize % 32 == 0;
163*5ffd83dbSDimitry Andric }
164*5ffd83dbSDimitry Andric 
165*5ffd83dbSDimitry Andric static bool isRegisterVectorType(LLT Ty) {
1660b57cec5SDimitry Andric   const int EltSize = Ty.getElementType().getSizeInBits();
1670b57cec5SDimitry Andric   return EltSize == 32 || EltSize == 64 ||
1680b57cec5SDimitry Andric          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
1690b57cec5SDimitry Andric          EltSize == 128 || EltSize == 256;
1700b57cec5SDimitry Andric }
1710b57cec5SDimitry Andric 
172*5ffd83dbSDimitry Andric static bool isRegisterType(LLT Ty) {
173*5ffd83dbSDimitry Andric   if (!isRegisterSize(Ty.getSizeInBits()))
174*5ffd83dbSDimitry Andric     return false;
175*5ffd83dbSDimitry Andric 
176*5ffd83dbSDimitry Andric   if (Ty.isVector())
177*5ffd83dbSDimitry Andric     return isRegisterVectorType(Ty);
178*5ffd83dbSDimitry Andric 
179*5ffd83dbSDimitry Andric   return true;
180*5ffd83dbSDimitry Andric }
181*5ffd83dbSDimitry Andric 
182*5ffd83dbSDimitry Andric // Any combination of 32 or 64-bit elements up the maximum register size, and
183*5ffd83dbSDimitry Andric // multiples of v2s16.
184*5ffd83dbSDimitry Andric static LegalityPredicate isRegisterType(unsigned TypeIdx) {
185*5ffd83dbSDimitry Andric   return [=](const LegalityQuery &Query) {
186*5ffd83dbSDimitry Andric     return isRegisterType(Query.Types[TypeIdx]);
1878bcb0991SDimitry Andric   };
1888bcb0991SDimitry Andric }
1898bcb0991SDimitry Andric 
190*5ffd83dbSDimitry Andric static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
1918bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
192*5ffd83dbSDimitry Andric     const LLT QueryTy = Query.Types[TypeIdx];
193*5ffd83dbSDimitry Andric     if (!QueryTy.isVector())
194*5ffd83dbSDimitry Andric       return false;
195*5ffd83dbSDimitry Andric     const LLT EltTy = QueryTy.getElementType();
196*5ffd83dbSDimitry Andric     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
1978bcb0991SDimitry Andric   };
1988bcb0991SDimitry Andric }
1998bcb0991SDimitry Andric 
2008bcb0991SDimitry Andric static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
2018bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
2028bcb0991SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
2038bcb0991SDimitry Andric     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
2048bcb0991SDimitry Andric            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
2050b57cec5SDimitry Andric   };
2060b57cec5SDimitry Andric }
2070b57cec5SDimitry Andric 
208*5ffd83dbSDimitry Andric // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
209*5ffd83dbSDimitry Andric // handle some operations by just promoting the register during
210*5ffd83dbSDimitry Andric // selection. There are also d16 loads on GFX9+ which preserve the high bits.
211*5ffd83dbSDimitry Andric static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
212*5ffd83dbSDimitry Andric                                     bool IsLoad) {
213*5ffd83dbSDimitry Andric   switch (AS) {
214*5ffd83dbSDimitry Andric   case AMDGPUAS::PRIVATE_ADDRESS:
215*5ffd83dbSDimitry Andric     // FIXME: Private element size.
216*5ffd83dbSDimitry Andric     return 32;
217*5ffd83dbSDimitry Andric   case AMDGPUAS::LOCAL_ADDRESS:
218*5ffd83dbSDimitry Andric     return ST.useDS128() ? 128 : 64;
219*5ffd83dbSDimitry Andric   case AMDGPUAS::GLOBAL_ADDRESS:
220*5ffd83dbSDimitry Andric   case AMDGPUAS::CONSTANT_ADDRESS:
221*5ffd83dbSDimitry Andric   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
222*5ffd83dbSDimitry Andric     // Treat constant and global as identical. SMRD loads are sometimes usable for
223*5ffd83dbSDimitry Andric     // global loads (ideally constant address space should be eliminated)
224*5ffd83dbSDimitry Andric     // depending on the context. Legality cannot be context dependent, but
225*5ffd83dbSDimitry Andric     // RegBankSelect can split the load as necessary depending on the pointer
226*5ffd83dbSDimitry Andric     // register bank/uniformity and if the memory is invariant or not written in a
227*5ffd83dbSDimitry Andric     // kernel.
228*5ffd83dbSDimitry Andric     return IsLoad ? 512 : 128;
229*5ffd83dbSDimitry Andric   default:
230*5ffd83dbSDimitry Andric     // Flat addresses may contextually need to be split to 32-bit parts if they
231*5ffd83dbSDimitry Andric     // may alias scratch depending on the subtarget.
232*5ffd83dbSDimitry Andric     return 128;
233*5ffd83dbSDimitry Andric   }
234*5ffd83dbSDimitry Andric }
235*5ffd83dbSDimitry Andric 
236*5ffd83dbSDimitry Andric static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
237*5ffd83dbSDimitry Andric                                  const LegalityQuery &Query,
238*5ffd83dbSDimitry Andric                                  unsigned Opcode) {
239*5ffd83dbSDimitry Andric   const LLT Ty = Query.Types[0];
240*5ffd83dbSDimitry Andric 
241*5ffd83dbSDimitry Andric   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
242*5ffd83dbSDimitry Andric   const bool IsLoad = Opcode != AMDGPU::G_STORE;
243*5ffd83dbSDimitry Andric 
244*5ffd83dbSDimitry Andric   unsigned RegSize = Ty.getSizeInBits();
245*5ffd83dbSDimitry Andric   unsigned MemSize = Query.MMODescrs[0].SizeInBits;
246*5ffd83dbSDimitry Andric   unsigned Align = Query.MMODescrs[0].AlignInBits;
247*5ffd83dbSDimitry Andric   unsigned AS = Query.Types[1].getAddressSpace();
248*5ffd83dbSDimitry Andric 
249*5ffd83dbSDimitry Andric   // All of these need to be custom lowered to cast the pointer operand.
250*5ffd83dbSDimitry Andric   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
251*5ffd83dbSDimitry Andric     return false;
252*5ffd83dbSDimitry Andric 
253*5ffd83dbSDimitry Andric   // TODO: We should be able to widen loads if the alignment is high enough, but
254*5ffd83dbSDimitry Andric   // we also need to modify the memory access size.
255*5ffd83dbSDimitry Andric #if 0
256*5ffd83dbSDimitry Andric   // Accept widening loads based on alignment.
257*5ffd83dbSDimitry Andric   if (IsLoad && MemSize < Size)
258*5ffd83dbSDimitry Andric     MemSize = std::max(MemSize, Align);
259*5ffd83dbSDimitry Andric #endif
260*5ffd83dbSDimitry Andric 
261*5ffd83dbSDimitry Andric   // Only 1-byte and 2-byte to 32-bit extloads are valid.
262*5ffd83dbSDimitry Andric   if (MemSize != RegSize && RegSize != 32)
263*5ffd83dbSDimitry Andric     return false;
264*5ffd83dbSDimitry Andric 
265*5ffd83dbSDimitry Andric   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
266*5ffd83dbSDimitry Andric     return false;
267*5ffd83dbSDimitry Andric 
268*5ffd83dbSDimitry Andric   switch (MemSize) {
269*5ffd83dbSDimitry Andric   case 8:
270*5ffd83dbSDimitry Andric   case 16:
271*5ffd83dbSDimitry Andric   case 32:
272*5ffd83dbSDimitry Andric   case 64:
273*5ffd83dbSDimitry Andric   case 128:
274*5ffd83dbSDimitry Andric     break;
275*5ffd83dbSDimitry Andric   case 96:
276*5ffd83dbSDimitry Andric     if (!ST.hasDwordx3LoadStores())
277*5ffd83dbSDimitry Andric       return false;
278*5ffd83dbSDimitry Andric     break;
279*5ffd83dbSDimitry Andric   case 256:
280*5ffd83dbSDimitry Andric   case 512:
281*5ffd83dbSDimitry Andric     // These may contextually need to be broken down.
282*5ffd83dbSDimitry Andric     break;
283*5ffd83dbSDimitry Andric   default:
284*5ffd83dbSDimitry Andric     return false;
285*5ffd83dbSDimitry Andric   }
286*5ffd83dbSDimitry Andric 
287*5ffd83dbSDimitry Andric   assert(RegSize >= MemSize);
288*5ffd83dbSDimitry Andric 
289*5ffd83dbSDimitry Andric   if (Align < MemSize) {
290*5ffd83dbSDimitry Andric     const SITargetLowering *TLI = ST.getTargetLowering();
291*5ffd83dbSDimitry Andric     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8))
292*5ffd83dbSDimitry Andric       return false;
293*5ffd83dbSDimitry Andric   }
294*5ffd83dbSDimitry Andric 
295*5ffd83dbSDimitry Andric   return true;
296*5ffd83dbSDimitry Andric }
297*5ffd83dbSDimitry Andric 
298*5ffd83dbSDimitry Andric // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
299*5ffd83dbSDimitry Andric // workaround this. Eventually it should ignore the type for loads and only care
300*5ffd83dbSDimitry Andric // about the size. Return true in cases where we will workaround this for now by
301*5ffd83dbSDimitry Andric // bitcasting.
302*5ffd83dbSDimitry Andric static bool loadStoreBitcastWorkaround(const LLT Ty) {
303*5ffd83dbSDimitry Andric   if (EnableNewLegality)
304*5ffd83dbSDimitry Andric     return false;
305*5ffd83dbSDimitry Andric 
306*5ffd83dbSDimitry Andric   const unsigned Size = Ty.getSizeInBits();
307*5ffd83dbSDimitry Andric   if (Size <= 64)
308*5ffd83dbSDimitry Andric     return false;
309*5ffd83dbSDimitry Andric   if (!Ty.isVector())
310*5ffd83dbSDimitry Andric     return true;
311*5ffd83dbSDimitry Andric   unsigned EltSize = Ty.getElementType().getSizeInBits();
312*5ffd83dbSDimitry Andric   return EltSize != 32 && EltSize != 64;
313*5ffd83dbSDimitry Andric }
314*5ffd83dbSDimitry Andric 
315*5ffd83dbSDimitry Andric static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
316*5ffd83dbSDimitry Andric                              unsigned Opcode) {
317*5ffd83dbSDimitry Andric   const LLT Ty = Query.Types[0];
318*5ffd83dbSDimitry Andric   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
319*5ffd83dbSDimitry Andric          !loadStoreBitcastWorkaround(Ty);
320*5ffd83dbSDimitry Andric }
321*5ffd83dbSDimitry Andric 
3220b57cec5SDimitry Andric AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
3230b57cec5SDimitry Andric                                          const GCNTargetMachine &TM)
3240b57cec5SDimitry Andric   :  ST(ST_) {
3250b57cec5SDimitry Andric   using namespace TargetOpcode;
3260b57cec5SDimitry Andric 
3270b57cec5SDimitry Andric   auto GetAddrSpacePtr = [&TM](unsigned AS) {
3280b57cec5SDimitry Andric     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
3290b57cec5SDimitry Andric   };
3300b57cec5SDimitry Andric 
3310b57cec5SDimitry Andric   const LLT S1 = LLT::scalar(1);
3320b57cec5SDimitry Andric   const LLT S16 = LLT::scalar(16);
3330b57cec5SDimitry Andric   const LLT S32 = LLT::scalar(32);
3340b57cec5SDimitry Andric   const LLT S64 = LLT::scalar(64);
3350b57cec5SDimitry Andric   const LLT S128 = LLT::scalar(128);
3360b57cec5SDimitry Andric   const LLT S256 = LLT::scalar(256);
337*5ffd83dbSDimitry Andric   const LLT S512 = LLT::scalar(512);
338*5ffd83dbSDimitry Andric   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
3390b57cec5SDimitry Andric 
3400b57cec5SDimitry Andric   const LLT V2S16 = LLT::vector(2, 16);
3410b57cec5SDimitry Andric   const LLT V4S16 = LLT::vector(4, 16);
3420b57cec5SDimitry Andric 
3430b57cec5SDimitry Andric   const LLT V2S32 = LLT::vector(2, 32);
3440b57cec5SDimitry Andric   const LLT V3S32 = LLT::vector(3, 32);
3450b57cec5SDimitry Andric   const LLT V4S32 = LLT::vector(4, 32);
3460b57cec5SDimitry Andric   const LLT V5S32 = LLT::vector(5, 32);
3470b57cec5SDimitry Andric   const LLT V6S32 = LLT::vector(6, 32);
3480b57cec5SDimitry Andric   const LLT V7S32 = LLT::vector(7, 32);
3490b57cec5SDimitry Andric   const LLT V8S32 = LLT::vector(8, 32);
3500b57cec5SDimitry Andric   const LLT V9S32 = LLT::vector(9, 32);
3510b57cec5SDimitry Andric   const LLT V10S32 = LLT::vector(10, 32);
3520b57cec5SDimitry Andric   const LLT V11S32 = LLT::vector(11, 32);
3530b57cec5SDimitry Andric   const LLT V12S32 = LLT::vector(12, 32);
3540b57cec5SDimitry Andric   const LLT V13S32 = LLT::vector(13, 32);
3550b57cec5SDimitry Andric   const LLT V14S32 = LLT::vector(14, 32);
3560b57cec5SDimitry Andric   const LLT V15S32 = LLT::vector(15, 32);
3570b57cec5SDimitry Andric   const LLT V16S32 = LLT::vector(16, 32);
3588bcb0991SDimitry Andric   const LLT V32S32 = LLT::vector(32, 32);
3590b57cec5SDimitry Andric 
3600b57cec5SDimitry Andric   const LLT V2S64 = LLT::vector(2, 64);
3610b57cec5SDimitry Andric   const LLT V3S64 = LLT::vector(3, 64);
3620b57cec5SDimitry Andric   const LLT V4S64 = LLT::vector(4, 64);
3630b57cec5SDimitry Andric   const LLT V5S64 = LLT::vector(5, 64);
3640b57cec5SDimitry Andric   const LLT V6S64 = LLT::vector(6, 64);
3650b57cec5SDimitry Andric   const LLT V7S64 = LLT::vector(7, 64);
3660b57cec5SDimitry Andric   const LLT V8S64 = LLT::vector(8, 64);
3678bcb0991SDimitry Andric   const LLT V16S64 = LLT::vector(16, 64);
3680b57cec5SDimitry Andric 
3690b57cec5SDimitry Andric   std::initializer_list<LLT> AllS32Vectors =
3700b57cec5SDimitry Andric     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
3718bcb0991SDimitry Andric      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
3720b57cec5SDimitry Andric   std::initializer_list<LLT> AllS64Vectors =
3738bcb0991SDimitry Andric     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
3740b57cec5SDimitry Andric 
3750b57cec5SDimitry Andric   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
3760b57cec5SDimitry Andric   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
3778bcb0991SDimitry Andric   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
3780b57cec5SDimitry Andric   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
3798bcb0991SDimitry Andric   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
3800b57cec5SDimitry Andric   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
3810b57cec5SDimitry Andric   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
3820b57cec5SDimitry Andric 
3830b57cec5SDimitry Andric   const LLT CodePtr = FlatPtr;
3840b57cec5SDimitry Andric 
3850b57cec5SDimitry Andric   const std::initializer_list<LLT> AddrSpaces64 = {
3860b57cec5SDimitry Andric     GlobalPtr, ConstantPtr, FlatPtr
3870b57cec5SDimitry Andric   };
3880b57cec5SDimitry Andric 
3890b57cec5SDimitry Andric   const std::initializer_list<LLT> AddrSpaces32 = {
3908bcb0991SDimitry Andric     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
3910b57cec5SDimitry Andric   };
3920b57cec5SDimitry Andric 
3930b57cec5SDimitry Andric   const std::initializer_list<LLT> FPTypesBase = {
3940b57cec5SDimitry Andric     S32, S64
3950b57cec5SDimitry Andric   };
3960b57cec5SDimitry Andric 
3970b57cec5SDimitry Andric   const std::initializer_list<LLT> FPTypes16 = {
3980b57cec5SDimitry Andric     S32, S64, S16
3990b57cec5SDimitry Andric   };
4000b57cec5SDimitry Andric 
4010b57cec5SDimitry Andric   const std::initializer_list<LLT> FPTypesPK16 = {
4020b57cec5SDimitry Andric     S32, S64, S16, V2S16
4030b57cec5SDimitry Andric   };
4040b57cec5SDimitry Andric 
405*5ffd83dbSDimitry Andric   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
406*5ffd83dbSDimitry Andric 
407480093f4SDimitry Andric   setAction({G_BRCOND, S1}, Legal); // VCC branches
408480093f4SDimitry Andric   setAction({G_BRCOND, S32}, Legal); // SCC branches
4090b57cec5SDimitry Andric 
4100b57cec5SDimitry Andric   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
4110b57cec5SDimitry Andric   // elements for v3s16
4120b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_PHI)
4130b57cec5SDimitry Andric     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
4140b57cec5SDimitry Andric     .legalFor(AllS32Vectors)
4150b57cec5SDimitry Andric     .legalFor(AllS64Vectors)
4160b57cec5SDimitry Andric     .legalFor(AddrSpaces64)
4170b57cec5SDimitry Andric     .legalFor(AddrSpaces32)
4180b57cec5SDimitry Andric     .clampScalar(0, S32, S256)
4190b57cec5SDimitry Andric     .widenScalarToNextPow2(0, 32)
4200b57cec5SDimitry Andric     .clampMaxNumElements(0, S32, 16)
4210b57cec5SDimitry Andric     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
4220b57cec5SDimitry Andric     .legalIf(isPointer(0));
4230b57cec5SDimitry Andric 
424*5ffd83dbSDimitry Andric   if (ST.hasVOP3PInsts()) {
425*5ffd83dbSDimitry Andric     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
426*5ffd83dbSDimitry Andric       .legalFor({S32, S16, V2S16})
427*5ffd83dbSDimitry Andric       .clampScalar(0, S16, S32)
428*5ffd83dbSDimitry Andric       .clampMaxNumElements(0, S16, 2)
429*5ffd83dbSDimitry Andric       .scalarize(0)
430*5ffd83dbSDimitry Andric       .widenScalarToNextPow2(0, 32);
431*5ffd83dbSDimitry Andric   } else if (ST.has16BitInsts()) {
4320b57cec5SDimitry Andric     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
4330b57cec5SDimitry Andric       .legalFor({S32, S16})
4340b57cec5SDimitry Andric       .clampScalar(0, S16, S32)
435*5ffd83dbSDimitry Andric       .scalarize(0)
436*5ffd83dbSDimitry Andric       .widenScalarToNextPow2(0, 32);
4370b57cec5SDimitry Andric   } else {
4380b57cec5SDimitry Andric     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
4390b57cec5SDimitry Andric       .legalFor({S32})
4400b57cec5SDimitry Andric       .clampScalar(0, S32, S32)
4410b57cec5SDimitry Andric       .scalarize(0);
4420b57cec5SDimitry Andric   }
4430b57cec5SDimitry Andric 
444480093f4SDimitry Andric   // FIXME: Not really legal. Placeholder for custom lowering.
445480093f4SDimitry Andric   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
446*5ffd83dbSDimitry Andric     .customFor({S32, S64})
447480093f4SDimitry Andric     .clampScalar(0, S32, S64)
448480093f4SDimitry Andric     .widenScalarToNextPow2(0, 32)
449480093f4SDimitry Andric     .scalarize(0);
450480093f4SDimitry Andric 
4510b57cec5SDimitry Andric   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
4520b57cec5SDimitry Andric     .legalFor({S32})
4530b57cec5SDimitry Andric     .clampScalar(0, S32, S32)
4540b57cec5SDimitry Andric     .scalarize(0);
4550b57cec5SDimitry Andric 
4560b57cec5SDimitry Andric   // Report legal for any types we can handle anywhere. For the cases only legal
4570b57cec5SDimitry Andric   // on the SALU, RegBankSelect will be able to re-legalize.
4580b57cec5SDimitry Andric   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
4590b57cec5SDimitry Andric     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
4600b57cec5SDimitry Andric     .clampScalar(0, S32, S64)
4610b57cec5SDimitry Andric     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
4628bcb0991SDimitry Andric     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
4630b57cec5SDimitry Andric     .widenScalarToNextPow2(0)
4640b57cec5SDimitry Andric     .scalarize(0);
4650b57cec5SDimitry Andric 
4668bcb0991SDimitry Andric   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
4670b57cec5SDimitry Andric                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
468480093f4SDimitry Andric     .legalFor({{S32, S1}, {S32, S32}})
469*5ffd83dbSDimitry Andric     .minScalar(0, S32)
470*5ffd83dbSDimitry Andric     // TODO: .scalarize(0)
4718bcb0991SDimitry Andric     .lower();
4720b57cec5SDimitry Andric 
4730b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_BITCAST)
4740b57cec5SDimitry Andric     // Don't worry about the size constraint.
4758bcb0991SDimitry Andric     .legalIf(all(isRegisterType(0), isRegisterType(1)))
476*5ffd83dbSDimitry Andric     .lower();
4770b57cec5SDimitry Andric 
4780b57cec5SDimitry Andric 
4790b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_CONSTANT)
4808bcb0991SDimitry Andric     .legalFor({S1, S32, S64, S16, GlobalPtr,
4810b57cec5SDimitry Andric                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
4820b57cec5SDimitry Andric     .clampScalar(0, S32, S64)
4830b57cec5SDimitry Andric     .widenScalarToNextPow2(0)
4840b57cec5SDimitry Andric     .legalIf(isPointer(0));
4850b57cec5SDimitry Andric 
486*5ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_FCONSTANT)
487*5ffd83dbSDimitry Andric     .legalFor({S32, S64, S16})
488*5ffd83dbSDimitry Andric     .clampScalar(0, S16, S64);
4898bcb0991SDimitry Andric 
490*5ffd83dbSDimitry Andric   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
491*5ffd83dbSDimitry Andric       .legalIf(isRegisterType(0))
492*5ffd83dbSDimitry Andric       // s1 and s16 are special cases because they have legal operations on
493*5ffd83dbSDimitry Andric       // them, but don't really occupy registers in the normal way.
494*5ffd83dbSDimitry Andric       .legalFor({S1, S16})
495*5ffd83dbSDimitry Andric       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
496*5ffd83dbSDimitry Andric       .clampScalarOrElt(0, S32, MaxScalar)
497*5ffd83dbSDimitry Andric       .widenScalarToNextPow2(0, 32)
498*5ffd83dbSDimitry Andric       .clampMaxNumElements(0, S32, 16);
499*5ffd83dbSDimitry Andric 
500*5ffd83dbSDimitry Andric   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
501*5ffd83dbSDimitry Andric 
502*5ffd83dbSDimitry Andric   // If the amount is divergent, we have to do a wave reduction to get the
503*5ffd83dbSDimitry Andric   // maximum value, so this is expanded during RegBankSelect.
504*5ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
505*5ffd83dbSDimitry Andric     .legalFor({{PrivatePtr, S32}});
506*5ffd83dbSDimitry Andric 
507*5ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
508*5ffd83dbSDimitry Andric     .unsupportedFor({PrivatePtr})
509*5ffd83dbSDimitry Andric     .custom();
510*5ffd83dbSDimitry Andric   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
5110b57cec5SDimitry Andric 
5120b57cec5SDimitry Andric   auto &FPOpActions = getActionDefinitionsBuilder(
5138bcb0991SDimitry Andric     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
5140b57cec5SDimitry Andric     .legalFor({S32, S64});
5158bcb0991SDimitry Andric   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
5168bcb0991SDimitry Andric     .customFor({S32, S64});
5178bcb0991SDimitry Andric   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
5188bcb0991SDimitry Andric     .customFor({S32, S64});
5190b57cec5SDimitry Andric 
5200b57cec5SDimitry Andric   if (ST.has16BitInsts()) {
5210b57cec5SDimitry Andric     if (ST.hasVOP3PInsts())
5220b57cec5SDimitry Andric       FPOpActions.legalFor({S16, V2S16});
5230b57cec5SDimitry Andric     else
5240b57cec5SDimitry Andric       FPOpActions.legalFor({S16});
5258bcb0991SDimitry Andric 
5268bcb0991SDimitry Andric     TrigActions.customFor({S16});
5278bcb0991SDimitry Andric     FDIVActions.customFor({S16});
5280b57cec5SDimitry Andric   }
5290b57cec5SDimitry Andric 
5300b57cec5SDimitry Andric   auto &MinNumMaxNum = getActionDefinitionsBuilder({
5310b57cec5SDimitry Andric       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
5320b57cec5SDimitry Andric 
5330b57cec5SDimitry Andric   if (ST.hasVOP3PInsts()) {
5340b57cec5SDimitry Andric     MinNumMaxNum.customFor(FPTypesPK16)
535480093f4SDimitry Andric       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
5360b57cec5SDimitry Andric       .clampMaxNumElements(0, S16, 2)
5370b57cec5SDimitry Andric       .clampScalar(0, S16, S64)
5380b57cec5SDimitry Andric       .scalarize(0);
5390b57cec5SDimitry Andric   } else if (ST.has16BitInsts()) {
5400b57cec5SDimitry Andric     MinNumMaxNum.customFor(FPTypes16)
5410b57cec5SDimitry Andric       .clampScalar(0, S16, S64)
5420b57cec5SDimitry Andric       .scalarize(0);
5430b57cec5SDimitry Andric   } else {
5440b57cec5SDimitry Andric     MinNumMaxNum.customFor(FPTypesBase)
5450b57cec5SDimitry Andric       .clampScalar(0, S32, S64)
5460b57cec5SDimitry Andric       .scalarize(0);
5470b57cec5SDimitry Andric   }
5480b57cec5SDimitry Andric 
5490b57cec5SDimitry Andric   if (ST.hasVOP3PInsts())
5500b57cec5SDimitry Andric     FPOpActions.clampMaxNumElements(0, S16, 2);
5518bcb0991SDimitry Andric 
5520b57cec5SDimitry Andric   FPOpActions
5530b57cec5SDimitry Andric     .scalarize(0)
5540b57cec5SDimitry Andric     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
5550b57cec5SDimitry Andric 
5568bcb0991SDimitry Andric   TrigActions
5578bcb0991SDimitry Andric     .scalarize(0)
5588bcb0991SDimitry Andric     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
5598bcb0991SDimitry Andric 
5608bcb0991SDimitry Andric   FDIVActions
5618bcb0991SDimitry Andric     .scalarize(0)
5628bcb0991SDimitry Andric     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
5638bcb0991SDimitry Andric 
5648bcb0991SDimitry Andric   getActionDefinitionsBuilder({G_FNEG, G_FABS})
5658bcb0991SDimitry Andric     .legalFor(FPTypesPK16)
5668bcb0991SDimitry Andric     .clampMaxNumElements(0, S16, 2)
5678bcb0991SDimitry Andric     .scalarize(0)
5688bcb0991SDimitry Andric     .clampScalar(0, S16, S64);
5698bcb0991SDimitry Andric 
5700b57cec5SDimitry Andric   if (ST.has16BitInsts()) {
5718bcb0991SDimitry Andric     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
5720b57cec5SDimitry Andric       .legalFor({S32, S64, S16})
5730b57cec5SDimitry Andric       .scalarize(0)
5740b57cec5SDimitry Andric       .clampScalar(0, S16, S64);
5750b57cec5SDimitry Andric   } else {
576*5ffd83dbSDimitry Andric     getActionDefinitionsBuilder(G_FSQRT)
577*5ffd83dbSDimitry Andric       .legalFor({S32, S64})
578*5ffd83dbSDimitry Andric       .scalarize(0)
579*5ffd83dbSDimitry Andric       .clampScalar(0, S32, S64);
580*5ffd83dbSDimitry Andric 
581*5ffd83dbSDimitry Andric     if (ST.hasFractBug()) {
582*5ffd83dbSDimitry Andric       getActionDefinitionsBuilder(G_FFLOOR)
583*5ffd83dbSDimitry Andric         .customFor({S64})
584*5ffd83dbSDimitry Andric         .legalFor({S32, S64})
585*5ffd83dbSDimitry Andric         .scalarize(0)
586*5ffd83dbSDimitry Andric         .clampScalar(0, S32, S64);
587*5ffd83dbSDimitry Andric     } else {
588*5ffd83dbSDimitry Andric       getActionDefinitionsBuilder(G_FFLOOR)
5890b57cec5SDimitry Andric         .legalFor({S32, S64})
5900b57cec5SDimitry Andric         .scalarize(0)
5910b57cec5SDimitry Andric         .clampScalar(0, S32, S64);
5920b57cec5SDimitry Andric     }
593*5ffd83dbSDimitry Andric   }
5940b57cec5SDimitry Andric 
5950b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_FPTRUNC)
5960b57cec5SDimitry Andric     .legalFor({{S32, S64}, {S16, S32}})
597*5ffd83dbSDimitry Andric     .scalarize(0)
598*5ffd83dbSDimitry Andric     .lower();
5990b57cec5SDimitry Andric 
6000b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_FPEXT)
6010b57cec5SDimitry Andric     .legalFor({{S64, S32}, {S32, S16}})
6020b57cec5SDimitry Andric     .lowerFor({{S64, S16}}) // FIXME: Implement
6030b57cec5SDimitry Andric     .scalarize(0);
6040b57cec5SDimitry Andric 
6050b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_FSUB)
6060b57cec5SDimitry Andric       // Use actual fsub instruction
6070b57cec5SDimitry Andric       .legalFor({S32})
6080b57cec5SDimitry Andric       // Must use fadd + fneg
6090b57cec5SDimitry Andric       .lowerFor({S64, S16, V2S16})
6100b57cec5SDimitry Andric       .scalarize(0)
6110b57cec5SDimitry Andric       .clampScalar(0, S32, S64);
6120b57cec5SDimitry Andric 
6138bcb0991SDimitry Andric   // Whether this is legal depends on the floating point mode for the function.
6148bcb0991SDimitry Andric   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
615*5ffd83dbSDimitry Andric   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
6168bcb0991SDimitry Andric     FMad.customFor({S32, S16});
617*5ffd83dbSDimitry Andric   else if (ST.hasMadMacF32Insts())
6188bcb0991SDimitry Andric     FMad.customFor({S32});
619*5ffd83dbSDimitry Andric   else if (ST.hasMadF16())
620*5ffd83dbSDimitry Andric     FMad.customFor({S16});
6218bcb0991SDimitry Andric   FMad.scalarize(0)
6228bcb0991SDimitry Andric       .lower();
6238bcb0991SDimitry Andric 
624*5ffd83dbSDimitry Andric   // TODO: Do we need to clamp maximum bitwidth?
625*5ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_TRUNC)
626*5ffd83dbSDimitry Andric     .legalIf(isScalar(0))
627*5ffd83dbSDimitry Andric     .legalFor({{V2S16, V2S32}})
628*5ffd83dbSDimitry Andric     .clampMaxNumElements(0, S16, 2)
629*5ffd83dbSDimitry Andric     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
630*5ffd83dbSDimitry Andric     // situations (like an invalid implicit use), we don't want to infinite loop
631*5ffd83dbSDimitry Andric     // in the legalizer.
632*5ffd83dbSDimitry Andric     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
633*5ffd83dbSDimitry Andric     .alwaysLegal();
634*5ffd83dbSDimitry Andric 
6350b57cec5SDimitry Andric   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
6360b57cec5SDimitry Andric     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
637*5ffd83dbSDimitry Andric                {S32, S1}, {S64, S1}, {S16, S1}})
638480093f4SDimitry Andric     .scalarize(0)
639*5ffd83dbSDimitry Andric     .clampScalar(0, S32, S64)
640*5ffd83dbSDimitry Andric     .widenScalarToNextPow2(1, 32);
6410b57cec5SDimitry Andric 
6428bcb0991SDimitry Andric   // TODO: Split s1->s64 during regbankselect for VALU.
6438bcb0991SDimitry Andric   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
644480093f4SDimitry Andric     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
6450b57cec5SDimitry Andric     .lowerFor({{S32, S64}})
646480093f4SDimitry Andric     .lowerIf(typeIs(1, S1))
6478bcb0991SDimitry Andric     .customFor({{S64, S64}});
6488bcb0991SDimitry Andric   if (ST.has16BitInsts())
6498bcb0991SDimitry Andric     IToFP.legalFor({{S16, S16}});
6508bcb0991SDimitry Andric   IToFP.clampScalar(1, S32, S64)
651*5ffd83dbSDimitry Andric        .scalarize(0)
652*5ffd83dbSDimitry Andric        .widenScalarToNextPow2(1);
6530b57cec5SDimitry Andric 
6548bcb0991SDimitry Andric   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
655*5ffd83dbSDimitry Andric     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
656*5ffd83dbSDimitry Andric     .customFor({{S64, S64}});
6578bcb0991SDimitry Andric   if (ST.has16BitInsts())
6588bcb0991SDimitry Andric     FPToI.legalFor({{S16, S16}});
6598bcb0991SDimitry Andric   else
6608bcb0991SDimitry Andric     FPToI.minScalar(1, S32);
6618bcb0991SDimitry Andric 
6628bcb0991SDimitry Andric   FPToI.minScalar(0, S32)
663*5ffd83dbSDimitry Andric        .scalarize(0)
664*5ffd83dbSDimitry Andric        .lower();
6650b57cec5SDimitry Andric 
6660b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
667480093f4SDimitry Andric     .scalarize(0)
668480093f4SDimitry Andric     .lower();
6690b57cec5SDimitry Andric 
670480093f4SDimitry Andric   if (ST.has16BitInsts()) {
671480093f4SDimitry Andric     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
672480093f4SDimitry Andric       .legalFor({S16, S32, S64})
673480093f4SDimitry Andric       .clampScalar(0, S16, S64)
674480093f4SDimitry Andric       .scalarize(0);
675480093f4SDimitry Andric   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
6760b57cec5SDimitry Andric     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
6770b57cec5SDimitry Andric       .legalFor({S32, S64})
6780b57cec5SDimitry Andric       .clampScalar(0, S32, S64)
6790b57cec5SDimitry Andric       .scalarize(0);
6800b57cec5SDimitry Andric   } else {
6810b57cec5SDimitry Andric     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
6820b57cec5SDimitry Andric       .legalFor({S32})
6830b57cec5SDimitry Andric       .customFor({S64})
6840b57cec5SDimitry Andric       .clampScalar(0, S32, S64)
6850b57cec5SDimitry Andric       .scalarize(0);
6860b57cec5SDimitry Andric   }
6870b57cec5SDimitry Andric 
688*5ffd83dbSDimitry Andric   // FIXME: Clamp offset operand.
689480093f4SDimitry Andric   getActionDefinitionsBuilder(G_PTR_ADD)
690*5ffd83dbSDimitry Andric     .legalIf(isPointer(0))
6910b57cec5SDimitry Andric     .scalarize(0);
6920b57cec5SDimitry Andric 
693*5ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_PTRMASK)
694*5ffd83dbSDimitry Andric     .legalIf(typeInSet(1, {S64, S32}))
695*5ffd83dbSDimitry Andric     .minScalar(1, S32)
696*5ffd83dbSDimitry Andric     .maxScalarIf(sizeIs(0, 32), 1, S32)
697*5ffd83dbSDimitry Andric     .maxScalarIf(sizeIs(0, 64), 1, S64)
698*5ffd83dbSDimitry Andric     .scalarize(0);
6990b57cec5SDimitry Andric 
7000b57cec5SDimitry Andric   auto &CmpBuilder =
7010b57cec5SDimitry Andric     getActionDefinitionsBuilder(G_ICMP)
702480093f4SDimitry Andric     // The compare output type differs based on the register bank of the output,
703480093f4SDimitry Andric     // so make both s1 and s32 legal.
704480093f4SDimitry Andric     //
705480093f4SDimitry Andric     // Scalar compares producing output in scc will be promoted to s32, as that
706480093f4SDimitry Andric     // is the allocatable register type that will be needed for the copy from
707480093f4SDimitry Andric     // scc. This will be promoted during RegBankSelect, and we assume something
708480093f4SDimitry Andric     // before that won't try to use s32 result types.
709480093f4SDimitry Andric     //
710480093f4SDimitry Andric     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
711480093f4SDimitry Andric     // bank.
7120b57cec5SDimitry Andric     .legalForCartesianProduct(
7130b57cec5SDimitry Andric       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
714480093f4SDimitry Andric     .legalForCartesianProduct(
715480093f4SDimitry Andric       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
7160b57cec5SDimitry Andric   if (ST.has16BitInsts()) {
7170b57cec5SDimitry Andric     CmpBuilder.legalFor({{S1, S16}});
7180b57cec5SDimitry Andric   }
7190b57cec5SDimitry Andric 
7200b57cec5SDimitry Andric   CmpBuilder
7210b57cec5SDimitry Andric     .widenScalarToNextPow2(1)
7220b57cec5SDimitry Andric     .clampScalar(1, S32, S64)
7230b57cec5SDimitry Andric     .scalarize(0)
724480093f4SDimitry Andric     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
7250b57cec5SDimitry Andric 
7260b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_FCMP)
7270b57cec5SDimitry Andric     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
7280b57cec5SDimitry Andric     .widenScalarToNextPow2(1)
7290b57cec5SDimitry Andric     .clampScalar(1, S32, S64)
7300b57cec5SDimitry Andric     .scalarize(0);
7310b57cec5SDimitry Andric 
732*5ffd83dbSDimitry Andric   // FIXME: fpow has a selection pattern that should move to custom lowering.
733*5ffd83dbSDimitry Andric   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
734*5ffd83dbSDimitry Andric   if (ST.has16BitInsts())
735*5ffd83dbSDimitry Andric     Exp2Ops.legalFor({S32, S16});
736*5ffd83dbSDimitry Andric   else
737*5ffd83dbSDimitry Andric     Exp2Ops.legalFor({S32});
738*5ffd83dbSDimitry Andric   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
739*5ffd83dbSDimitry Andric   Exp2Ops.scalarize(0);
740*5ffd83dbSDimitry Andric 
741*5ffd83dbSDimitry Andric   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
742*5ffd83dbSDimitry Andric   if (ST.has16BitInsts())
743*5ffd83dbSDimitry Andric     ExpOps.customFor({{S32}, {S16}});
744*5ffd83dbSDimitry Andric   else
745*5ffd83dbSDimitry Andric     ExpOps.customFor({S32});
746*5ffd83dbSDimitry Andric   ExpOps.clampScalar(0, MinScalarFPTy, S32)
7470b57cec5SDimitry Andric         .scalarize(0);
7480b57cec5SDimitry Andric 
7490b57cec5SDimitry Andric   // The 64-bit versions produce 32-bit results, but only on the SALU.
750*5ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_CTPOP)
7510b57cec5SDimitry Andric     .legalFor({{S32, S32}, {S32, S64}})
7520b57cec5SDimitry Andric     .clampScalar(0, S32, S32)
7530b57cec5SDimitry Andric     .clampScalar(1, S32, S64)
7540b57cec5SDimitry Andric     .scalarize(0)
7550b57cec5SDimitry Andric     .widenScalarToNextPow2(0, 32)
7560b57cec5SDimitry Andric     .widenScalarToNextPow2(1, 32);
7570b57cec5SDimitry Andric 
758*5ffd83dbSDimitry Andric   // The hardware instructions return a different result on 0 than the generic
759*5ffd83dbSDimitry Andric   // instructions expect. The hardware produces -1, but these produce the
760*5ffd83dbSDimitry Andric   // bitwidth.
761*5ffd83dbSDimitry Andric   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
762*5ffd83dbSDimitry Andric     .scalarize(0)
763*5ffd83dbSDimitry Andric     .clampScalar(0, S32, S32)
764*5ffd83dbSDimitry Andric     .clampScalar(1, S32, S64)
765*5ffd83dbSDimitry Andric     .widenScalarToNextPow2(0, 32)
766*5ffd83dbSDimitry Andric     .widenScalarToNextPow2(1, 32)
767*5ffd83dbSDimitry Andric     .lower();
768*5ffd83dbSDimitry Andric 
769*5ffd83dbSDimitry Andric   // The 64-bit versions produce 32-bit results, but only on the SALU.
770*5ffd83dbSDimitry Andric   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
771*5ffd83dbSDimitry Andric     .legalFor({{S32, S32}, {S32, S64}})
772*5ffd83dbSDimitry Andric     .clampScalar(0, S32, S32)
773*5ffd83dbSDimitry Andric     .clampScalar(1, S32, S64)
774*5ffd83dbSDimitry Andric     .scalarize(0)
775*5ffd83dbSDimitry Andric     .widenScalarToNextPow2(0, 32)
776*5ffd83dbSDimitry Andric     .widenScalarToNextPow2(1, 32);
777*5ffd83dbSDimitry Andric 
778*5ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_BITREVERSE)
7790b57cec5SDimitry Andric     .legalFor({S32})
7800b57cec5SDimitry Andric     .clampScalar(0, S32, S32)
7810b57cec5SDimitry Andric     .scalarize(0);
7820b57cec5SDimitry Andric 
7830b57cec5SDimitry Andric   if (ST.has16BitInsts()) {
784*5ffd83dbSDimitry Andric     getActionDefinitionsBuilder(G_BSWAP)
785*5ffd83dbSDimitry Andric       .legalFor({S16, S32, V2S16})
786*5ffd83dbSDimitry Andric       .clampMaxNumElements(0, S16, 2)
787*5ffd83dbSDimitry Andric       // FIXME: Fixing non-power-of-2 before clamp is workaround for
788*5ffd83dbSDimitry Andric       // narrowScalar limitation.
789*5ffd83dbSDimitry Andric       .widenScalarToNextPow2(0)
790*5ffd83dbSDimitry Andric       .clampScalar(0, S16, S32)
791*5ffd83dbSDimitry Andric       .scalarize(0);
792*5ffd83dbSDimitry Andric 
7930b57cec5SDimitry Andric     if (ST.hasVOP3PInsts()) {
7940b57cec5SDimitry Andric       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
7950b57cec5SDimitry Andric         .legalFor({S32, S16, V2S16})
7960b57cec5SDimitry Andric         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
7970b57cec5SDimitry Andric         .clampMaxNumElements(0, S16, 2)
798*5ffd83dbSDimitry Andric         .minScalar(0, S16)
7990b57cec5SDimitry Andric         .widenScalarToNextPow2(0)
800*5ffd83dbSDimitry Andric         .scalarize(0)
801*5ffd83dbSDimitry Andric         .lower();
8020b57cec5SDimitry Andric     } else {
8030b57cec5SDimitry Andric       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
8040b57cec5SDimitry Andric         .legalFor({S32, S16})
8050b57cec5SDimitry Andric         .widenScalarToNextPow2(0)
806*5ffd83dbSDimitry Andric         .minScalar(0, S16)
807*5ffd83dbSDimitry Andric         .scalarize(0)
808*5ffd83dbSDimitry Andric         .lower();
8090b57cec5SDimitry Andric     }
8100b57cec5SDimitry Andric   } else {
811*5ffd83dbSDimitry Andric     // TODO: Should have same legality without v_perm_b32
812*5ffd83dbSDimitry Andric     getActionDefinitionsBuilder(G_BSWAP)
813*5ffd83dbSDimitry Andric       .legalFor({S32})
814*5ffd83dbSDimitry Andric       .lowerIf(scalarNarrowerThan(0, 32))
815*5ffd83dbSDimitry Andric       // FIXME: Fixing non-power-of-2 before clamp is workaround for
816*5ffd83dbSDimitry Andric       // narrowScalar limitation.
817*5ffd83dbSDimitry Andric       .widenScalarToNextPow2(0)
818*5ffd83dbSDimitry Andric       .maxScalar(0, S32)
819*5ffd83dbSDimitry Andric       .scalarize(0)
820*5ffd83dbSDimitry Andric       .lower();
821*5ffd83dbSDimitry Andric 
8220b57cec5SDimitry Andric     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
8230b57cec5SDimitry Andric       .legalFor({S32})
824*5ffd83dbSDimitry Andric       .minScalar(0, S32)
8250b57cec5SDimitry Andric       .widenScalarToNextPow2(0)
826*5ffd83dbSDimitry Andric       .scalarize(0)
827*5ffd83dbSDimitry Andric       .lower();
8280b57cec5SDimitry Andric   }
8290b57cec5SDimitry Andric 
8300b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_INTTOPTR)
8310b57cec5SDimitry Andric     // List the common cases
8320b57cec5SDimitry Andric     .legalForCartesianProduct(AddrSpaces64, {S64})
8330b57cec5SDimitry Andric     .legalForCartesianProduct(AddrSpaces32, {S32})
8340b57cec5SDimitry Andric     .scalarize(0)
8350b57cec5SDimitry Andric     // Accept any address space as long as the size matches
8360b57cec5SDimitry Andric     .legalIf(sameSize(0, 1))
8370b57cec5SDimitry Andric     .widenScalarIf(smallerThan(1, 0),
8380b57cec5SDimitry Andric       [](const LegalityQuery &Query) {
8390b57cec5SDimitry Andric         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
8400b57cec5SDimitry Andric       })
841*5ffd83dbSDimitry Andric     .narrowScalarIf(largerThan(1, 0),
8420b57cec5SDimitry Andric       [](const LegalityQuery &Query) {
8430b57cec5SDimitry Andric         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
8440b57cec5SDimitry Andric       });
8450b57cec5SDimitry Andric 
8460b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_PTRTOINT)
8470b57cec5SDimitry Andric     // List the common cases
8480b57cec5SDimitry Andric     .legalForCartesianProduct(AddrSpaces64, {S64})
8490b57cec5SDimitry Andric     .legalForCartesianProduct(AddrSpaces32, {S32})
8500b57cec5SDimitry Andric     .scalarize(0)
8510b57cec5SDimitry Andric     // Accept any address space as long as the size matches
8520b57cec5SDimitry Andric     .legalIf(sameSize(0, 1))
8530b57cec5SDimitry Andric     .widenScalarIf(smallerThan(0, 1),
8540b57cec5SDimitry Andric       [](const LegalityQuery &Query) {
8550b57cec5SDimitry Andric         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
8560b57cec5SDimitry Andric       })
8570b57cec5SDimitry Andric     .narrowScalarIf(
858*5ffd83dbSDimitry Andric       largerThan(0, 1),
8590b57cec5SDimitry Andric       [](const LegalityQuery &Query) {
8600b57cec5SDimitry Andric         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
8610b57cec5SDimitry Andric       });
8620b57cec5SDimitry Andric 
8630b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
8640b57cec5SDimitry Andric     .scalarize(0)
8650b57cec5SDimitry Andric     .custom();
8660b57cec5SDimitry Andric 
867*5ffd83dbSDimitry Andric   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
868*5ffd83dbSDimitry Andric                                     bool IsLoad) -> bool {
8698bcb0991SDimitry Andric     const LLT DstTy = Query.Types[0];
8708bcb0991SDimitry Andric 
8718bcb0991SDimitry Andric     // Split vector extloads.
8728bcb0991SDimitry Andric     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
873480093f4SDimitry Andric     unsigned Align = Query.MMODescrs[0].AlignInBits;
874480093f4SDimitry Andric 
875480093f4SDimitry Andric     if (MemSize < DstTy.getSizeInBits())
876480093f4SDimitry Andric       MemSize = std::max(MemSize, Align);
877480093f4SDimitry Andric 
8788bcb0991SDimitry Andric     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
8798bcb0991SDimitry Andric       return true;
8808bcb0991SDimitry Andric 
8818bcb0991SDimitry Andric     const LLT PtrTy = Query.Types[1];
8828bcb0991SDimitry Andric     unsigned AS = PtrTy.getAddressSpace();
883*5ffd83dbSDimitry Andric     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
8848bcb0991SDimitry Andric       return true;
8858bcb0991SDimitry Andric 
8868bcb0991SDimitry Andric     // Catch weird sized loads that don't evenly divide into the access sizes
8878bcb0991SDimitry Andric     // TODO: May be able to widen depending on alignment etc.
888*5ffd83dbSDimitry Andric     unsigned NumRegs = (MemSize + 31) / 32;
889*5ffd83dbSDimitry Andric     if (NumRegs == 3) {
890*5ffd83dbSDimitry Andric       if (!ST.hasDwordx3LoadStores())
8918bcb0991SDimitry Andric         return true;
892*5ffd83dbSDimitry Andric     } else {
893*5ffd83dbSDimitry Andric       // If the alignment allows, these should have been widened.
894*5ffd83dbSDimitry Andric       if (!isPowerOf2_32(NumRegs))
895*5ffd83dbSDimitry Andric         return true;
896*5ffd83dbSDimitry Andric     }
8978bcb0991SDimitry Andric 
8988bcb0991SDimitry Andric     if (Align < MemSize) {
8998bcb0991SDimitry Andric       const SITargetLowering *TLI = ST.getTargetLowering();
9008bcb0991SDimitry Andric       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
9018bcb0991SDimitry Andric     }
9028bcb0991SDimitry Andric 
9038bcb0991SDimitry Andric     return false;
9048bcb0991SDimitry Andric   };
9058bcb0991SDimitry Andric 
906*5ffd83dbSDimitry Andric   const auto shouldWidenLoadResult = [=](const LegalityQuery &Query,
907*5ffd83dbSDimitry Andric                                          unsigned Opc) -> bool {
908*5ffd83dbSDimitry Andric     unsigned Size = Query.Types[0].getSizeInBits();
909*5ffd83dbSDimitry Andric     if (isPowerOf2_32(Size))
910*5ffd83dbSDimitry Andric       return false;
911*5ffd83dbSDimitry Andric 
912*5ffd83dbSDimitry Andric     if (Size == 96 && ST.hasDwordx3LoadStores())
913*5ffd83dbSDimitry Andric       return false;
914*5ffd83dbSDimitry Andric 
915*5ffd83dbSDimitry Andric     unsigned AddrSpace = Query.Types[1].getAddressSpace();
916*5ffd83dbSDimitry Andric     if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc))
917*5ffd83dbSDimitry Andric       return false;
918*5ffd83dbSDimitry Andric 
919*5ffd83dbSDimitry Andric     unsigned Align = Query.MMODescrs[0].AlignInBits;
920*5ffd83dbSDimitry Andric     unsigned RoundedSize = NextPowerOf2(Size);
921*5ffd83dbSDimitry Andric     return (Align >= RoundedSize);
922*5ffd83dbSDimitry Andric   };
923*5ffd83dbSDimitry Andric 
9248bcb0991SDimitry Andric   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
9258bcb0991SDimitry Andric   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
9268bcb0991SDimitry Andric   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
9278bcb0991SDimitry Andric 
9288bcb0991SDimitry Andric   // TODO: Refine based on subtargets which support unaligned access or 128-bit
9298bcb0991SDimitry Andric   // LDS
9308bcb0991SDimitry Andric   // TODO: Unsupported flat for SI.
9318bcb0991SDimitry Andric 
9328bcb0991SDimitry Andric   for (unsigned Op : {G_LOAD, G_STORE}) {
9338bcb0991SDimitry Andric     const bool IsStore = Op == G_STORE;
9348bcb0991SDimitry Andric 
9358bcb0991SDimitry Andric     auto &Actions = getActionDefinitionsBuilder(Op);
936*5ffd83dbSDimitry Andric     // Explicitly list some common cases.
937*5ffd83dbSDimitry Andric     // TODO: Does this help compile time at all?
9388bcb0991SDimitry Andric     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
9398bcb0991SDimitry Andric                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
9408bcb0991SDimitry Andric                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
9418bcb0991SDimitry Andric                                       {S64, GlobalPtr, 64, GlobalAlign32},
9428bcb0991SDimitry Andric                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
9438bcb0991SDimitry Andric                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
9448bcb0991SDimitry Andric                                       {S32, GlobalPtr, 8, GlobalAlign8},
9458bcb0991SDimitry Andric                                       {S32, GlobalPtr, 16, GlobalAlign16},
9468bcb0991SDimitry Andric 
9478bcb0991SDimitry Andric                                       {S32, LocalPtr, 32, 32},
9488bcb0991SDimitry Andric                                       {S64, LocalPtr, 64, 32},
9498bcb0991SDimitry Andric                                       {V2S32, LocalPtr, 64, 32},
9508bcb0991SDimitry Andric                                       {S32, LocalPtr, 8, 8},
9518bcb0991SDimitry Andric                                       {S32, LocalPtr, 16, 16},
9528bcb0991SDimitry Andric                                       {V2S16, LocalPtr, 32, 32},
9538bcb0991SDimitry Andric 
9548bcb0991SDimitry Andric                                       {S32, PrivatePtr, 32, 32},
9558bcb0991SDimitry Andric                                       {S32, PrivatePtr, 8, 8},
9568bcb0991SDimitry Andric                                       {S32, PrivatePtr, 16, 16},
9578bcb0991SDimitry Andric                                       {V2S16, PrivatePtr, 32, 32},
9588bcb0991SDimitry Andric 
9598bcb0991SDimitry Andric                                       {S32, ConstantPtr, 32, GlobalAlign32},
9608bcb0991SDimitry Andric                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
9618bcb0991SDimitry Andric                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
9628bcb0991SDimitry Andric                                       {S64, ConstantPtr, 64, GlobalAlign32},
9638bcb0991SDimitry Andric                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
964*5ffd83dbSDimitry Andric     Actions.legalIf(
965*5ffd83dbSDimitry Andric       [=](const LegalityQuery &Query) -> bool {
966*5ffd83dbSDimitry Andric         return isLoadStoreLegal(ST, Query, Op);
967*5ffd83dbSDimitry Andric       });
968*5ffd83dbSDimitry Andric 
969*5ffd83dbSDimitry Andric     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
970*5ffd83dbSDimitry Andric     // 64-bits.
971*5ffd83dbSDimitry Andric     //
972*5ffd83dbSDimitry Andric     // TODO: Should generalize bitcast action into coerce, which will also cover
973*5ffd83dbSDimitry Andric     // inserting addrspacecasts.
974*5ffd83dbSDimitry Andric     Actions.customIf(typeIs(1, Constant32Ptr));
975*5ffd83dbSDimitry Andric 
976*5ffd83dbSDimitry Andric     // Turn any illegal element vectors into something easier to deal
977*5ffd83dbSDimitry Andric     // with. These will ultimately produce 32-bit scalar shifts to extract the
978*5ffd83dbSDimitry Andric     // parts anyway.
979*5ffd83dbSDimitry Andric     //
980*5ffd83dbSDimitry Andric     // For odd 16-bit element vectors, prefer to split those into pieces with
981*5ffd83dbSDimitry Andric     // 16-bit vector parts.
982*5ffd83dbSDimitry Andric     Actions.bitcastIf(
983*5ffd83dbSDimitry Andric       [=](const LegalityQuery &Query) -> bool {
984*5ffd83dbSDimitry Andric         const LLT Ty = Query.Types[0];
985*5ffd83dbSDimitry Andric         const unsigned Size = Ty.getSizeInBits();
986*5ffd83dbSDimitry Andric 
987*5ffd83dbSDimitry Andric         if (Size != Query.MMODescrs[0].SizeInBits)
988*5ffd83dbSDimitry Andric           return Size <= 32 && Ty.isVector();
989*5ffd83dbSDimitry Andric 
990*5ffd83dbSDimitry Andric         if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
991*5ffd83dbSDimitry Andric           return true;
992*5ffd83dbSDimitry Andric         return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
993*5ffd83dbSDimitry Andric                !isRegisterVectorElementType(Ty.getElementType());
994*5ffd83dbSDimitry Andric       }, bitcastToRegisterType(0));
995*5ffd83dbSDimitry Andric 
9968bcb0991SDimitry Andric     Actions
9978bcb0991SDimitry Andric         .customIf(typeIs(1, Constant32Ptr))
998*5ffd83dbSDimitry Andric         // Widen suitably aligned loads by loading extra elements.
999*5ffd83dbSDimitry Andric         .moreElementsIf([=](const LegalityQuery &Query) {
1000*5ffd83dbSDimitry Andric             const LLT Ty = Query.Types[0];
1001*5ffd83dbSDimitry Andric             return Op == G_LOAD && Ty.isVector() &&
1002*5ffd83dbSDimitry Andric                    shouldWidenLoadResult(Query, Op);
1003*5ffd83dbSDimitry Andric           }, moreElementsToNextPow2(0))
1004*5ffd83dbSDimitry Andric         .widenScalarIf([=](const LegalityQuery &Query) {
1005*5ffd83dbSDimitry Andric             const LLT Ty = Query.Types[0];
1006*5ffd83dbSDimitry Andric             return Op == G_LOAD && !Ty.isVector() &&
1007*5ffd83dbSDimitry Andric                    shouldWidenLoadResult(Query, Op);
1008*5ffd83dbSDimitry Andric           }, widenScalarOrEltToNextPow2(0))
10098bcb0991SDimitry Andric         .narrowScalarIf(
10108bcb0991SDimitry Andric             [=](const LegalityQuery &Query) -> bool {
1011*5ffd83dbSDimitry Andric               return !Query.Types[0].isVector() &&
1012*5ffd83dbSDimitry Andric                      needToSplitMemOp(Query, Op == G_LOAD);
10138bcb0991SDimitry Andric             },
10148bcb0991SDimitry Andric             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
10158bcb0991SDimitry Andric               const LLT DstTy = Query.Types[0];
10168bcb0991SDimitry Andric               const LLT PtrTy = Query.Types[1];
10178bcb0991SDimitry Andric 
10188bcb0991SDimitry Andric               const unsigned DstSize = DstTy.getSizeInBits();
10198bcb0991SDimitry Andric               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
10208bcb0991SDimitry Andric 
10218bcb0991SDimitry Andric               // Split extloads.
10228bcb0991SDimitry Andric               if (DstSize > MemSize)
10238bcb0991SDimitry Andric                 return std::make_pair(0, LLT::scalar(MemSize));
10248bcb0991SDimitry Andric 
1025*5ffd83dbSDimitry Andric               if (!isPowerOf2_32(DstSize)) {
1026*5ffd83dbSDimitry Andric                 // We're probably decomposing an odd sized store. Try to split
1027*5ffd83dbSDimitry Andric                 // to the widest type. TODO: Account for alignment. As-is it
1028*5ffd83dbSDimitry Andric                 // should be OK, since the new parts will be further legalized.
1029*5ffd83dbSDimitry Andric                 unsigned FloorSize = PowerOf2Floor(DstSize);
1030*5ffd83dbSDimitry Andric                 return std::make_pair(0, LLT::scalar(FloorSize));
1031*5ffd83dbSDimitry Andric               }
1032*5ffd83dbSDimitry Andric 
10338bcb0991SDimitry Andric               if (DstSize > 32 && (DstSize % 32 != 0)) {
10348bcb0991SDimitry Andric                 // FIXME: Need a way to specify non-extload of larger size if
10358bcb0991SDimitry Andric                 // suitably aligned.
10368bcb0991SDimitry Andric                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
10378bcb0991SDimitry Andric               }
10388bcb0991SDimitry Andric 
1039*5ffd83dbSDimitry Andric               unsigned MaxSize = maxSizeForAddrSpace(ST,
1040*5ffd83dbSDimitry Andric                                                      PtrTy.getAddressSpace(),
1041*5ffd83dbSDimitry Andric                                                      Op == G_LOAD);
10428bcb0991SDimitry Andric               if (MemSize > MaxSize)
10438bcb0991SDimitry Andric                 return std::make_pair(0, LLT::scalar(MaxSize));
10448bcb0991SDimitry Andric 
10458bcb0991SDimitry Andric               unsigned Align = Query.MMODescrs[0].AlignInBits;
10468bcb0991SDimitry Andric               return std::make_pair(0, LLT::scalar(Align));
10478bcb0991SDimitry Andric             })
10488bcb0991SDimitry Andric         .fewerElementsIf(
10498bcb0991SDimitry Andric             [=](const LegalityQuery &Query) -> bool {
1050*5ffd83dbSDimitry Andric               return Query.Types[0].isVector() &&
1051*5ffd83dbSDimitry Andric                      needToSplitMemOp(Query, Op == G_LOAD);
10528bcb0991SDimitry Andric             },
10538bcb0991SDimitry Andric             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
10548bcb0991SDimitry Andric               const LLT DstTy = Query.Types[0];
10558bcb0991SDimitry Andric               const LLT PtrTy = Query.Types[1];
10568bcb0991SDimitry Andric 
10578bcb0991SDimitry Andric               LLT EltTy = DstTy.getElementType();
1058*5ffd83dbSDimitry Andric               unsigned MaxSize = maxSizeForAddrSpace(ST,
1059*5ffd83dbSDimitry Andric                                                      PtrTy.getAddressSpace(),
1060*5ffd83dbSDimitry Andric                                                      Op == G_LOAD);
1061*5ffd83dbSDimitry Andric 
1062*5ffd83dbSDimitry Andric               // FIXME: Handle widened to power of 2 results better. This ends
1063*5ffd83dbSDimitry Andric               // up scalarizing.
1064*5ffd83dbSDimitry Andric               // FIXME: 3 element stores scalarized on SI
10658bcb0991SDimitry Andric 
10668bcb0991SDimitry Andric               // Split if it's too large for the address space.
10678bcb0991SDimitry Andric               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
10688bcb0991SDimitry Andric                 unsigned NumElts = DstTy.getNumElements();
1069*5ffd83dbSDimitry Andric                 unsigned EltSize = EltTy.getSizeInBits();
1070*5ffd83dbSDimitry Andric 
1071*5ffd83dbSDimitry Andric                 if (MaxSize % EltSize == 0) {
1072*5ffd83dbSDimitry Andric                   return std::make_pair(
1073*5ffd83dbSDimitry Andric                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
1074*5ffd83dbSDimitry Andric                 }
1075*5ffd83dbSDimitry Andric 
10768bcb0991SDimitry Andric                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
10778bcb0991SDimitry Andric 
10788bcb0991SDimitry Andric                 // FIXME: Refine when odd breakdowns handled
10798bcb0991SDimitry Andric                 // The scalars will need to be re-legalized.
10808bcb0991SDimitry Andric                 if (NumPieces == 1 || NumPieces >= NumElts ||
10818bcb0991SDimitry Andric                     NumElts % NumPieces != 0)
10828bcb0991SDimitry Andric                   return std::make_pair(0, EltTy);
10838bcb0991SDimitry Andric 
10848bcb0991SDimitry Andric                 return std::make_pair(0,
10858bcb0991SDimitry Andric                                       LLT::vector(NumElts / NumPieces, EltTy));
10868bcb0991SDimitry Andric               }
10878bcb0991SDimitry Andric 
1088*5ffd83dbSDimitry Andric               // FIXME: We could probably handle weird extending loads better.
1089*5ffd83dbSDimitry Andric               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
1090*5ffd83dbSDimitry Andric               if (DstTy.getSizeInBits() > MemSize)
1091*5ffd83dbSDimitry Andric                 return std::make_pair(0, EltTy);
1092*5ffd83dbSDimitry Andric 
1093*5ffd83dbSDimitry Andric               unsigned EltSize = EltTy.getSizeInBits();
1094*5ffd83dbSDimitry Andric               unsigned DstSize = DstTy.getSizeInBits();
1095*5ffd83dbSDimitry Andric               if (!isPowerOf2_32(DstSize)) {
1096*5ffd83dbSDimitry Andric                 // We're probably decomposing an odd sized store. Try to split
1097*5ffd83dbSDimitry Andric                 // to the widest type. TODO: Account for alignment. As-is it
1098*5ffd83dbSDimitry Andric                 // should be OK, since the new parts will be further legalized.
1099*5ffd83dbSDimitry Andric                 unsigned FloorSize = PowerOf2Floor(DstSize);
1100*5ffd83dbSDimitry Andric                 return std::make_pair(
1101*5ffd83dbSDimitry Andric                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
1102*5ffd83dbSDimitry Andric               }
1103*5ffd83dbSDimitry Andric 
11048bcb0991SDimitry Andric               // Need to split because of alignment.
11058bcb0991SDimitry Andric               unsigned Align = Query.MMODescrs[0].AlignInBits;
11068bcb0991SDimitry Andric               if (EltSize > Align &&
11078bcb0991SDimitry Andric                   (EltSize / Align < DstTy.getNumElements())) {
11088bcb0991SDimitry Andric                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
11098bcb0991SDimitry Andric               }
11108bcb0991SDimitry Andric 
11118bcb0991SDimitry Andric               // May need relegalization for the scalars.
11128bcb0991SDimitry Andric               return std::make_pair(0, EltTy);
11138bcb0991SDimitry Andric             })
11148bcb0991SDimitry Andric         .minScalar(0, S32);
11158bcb0991SDimitry Andric 
11168bcb0991SDimitry Andric     if (IsStore)
11178bcb0991SDimitry Andric       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
11188bcb0991SDimitry Andric 
11198bcb0991SDimitry Andric     // TODO: Need a bitcast lower option?
11208bcb0991SDimitry Andric     Actions
11218bcb0991SDimitry Andric         .widenScalarToNextPow2(0)
11228bcb0991SDimitry Andric         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
11238bcb0991SDimitry Andric   }
11240b57cec5SDimitry Andric 
11250b57cec5SDimitry Andric   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
11268bcb0991SDimitry Andric                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
11278bcb0991SDimitry Andric                                                   {S32, GlobalPtr, 16, 2 * 8},
11280b57cec5SDimitry Andric                                                   {S32, LocalPtr, 8, 8},
11298bcb0991SDimitry Andric                                                   {S32, LocalPtr, 16, 16},
11300b57cec5SDimitry Andric                                                   {S32, PrivatePtr, 8, 8},
11318bcb0991SDimitry Andric                                                   {S32, PrivatePtr, 16, 16},
11328bcb0991SDimitry Andric                                                   {S32, ConstantPtr, 8, 8},
11338bcb0991SDimitry Andric                                                   {S32, ConstantPtr, 16, 2 * 8}});
11340b57cec5SDimitry Andric   if (ST.hasFlatAddressSpace()) {
11358bcb0991SDimitry Andric     ExtLoads.legalForTypesWithMemDesc(
11368bcb0991SDimitry Andric         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
11370b57cec5SDimitry Andric   }
11380b57cec5SDimitry Andric 
11390b57cec5SDimitry Andric   ExtLoads.clampScalar(0, S32, S32)
11400b57cec5SDimitry Andric           .widenScalarToNextPow2(0)
11410b57cec5SDimitry Andric           .unsupportedIfMemSizeNotPow2()
11420b57cec5SDimitry Andric           .lower();
11430b57cec5SDimitry Andric 
11440b57cec5SDimitry Andric   auto &Atomics = getActionDefinitionsBuilder(
11450b57cec5SDimitry Andric     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
11460b57cec5SDimitry Andric      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
11470b57cec5SDimitry Andric      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1148480093f4SDimitry Andric      G_ATOMICRMW_UMIN})
11490b57cec5SDimitry Andric     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
11500b57cec5SDimitry Andric                {S64, GlobalPtr}, {S64, LocalPtr}});
11510b57cec5SDimitry Andric   if (ST.hasFlatAddressSpace()) {
11520b57cec5SDimitry Andric     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
11530b57cec5SDimitry Andric   }
11540b57cec5SDimitry Andric 
1155*5ffd83dbSDimitry Andric   if (ST.hasLDSFPAtomics()) {
11568bcb0991SDimitry Andric     getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
11578bcb0991SDimitry Andric       .legalFor({{S32, LocalPtr}});
1158*5ffd83dbSDimitry Andric   }
11598bcb0991SDimitry Andric 
1160480093f4SDimitry Andric   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1161480093f4SDimitry Andric   // demarshalling
1162480093f4SDimitry Andric   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1163480093f4SDimitry Andric     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1164480093f4SDimitry Andric                 {S32, FlatPtr}, {S64, FlatPtr}})
1165480093f4SDimitry Andric     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1166480093f4SDimitry Andric                {S32, RegionPtr}, {S64, RegionPtr}});
11670b57cec5SDimitry Andric   // TODO: Pointer types, any 32-bit or 64-bit vector
1168480093f4SDimitry Andric 
1169480093f4SDimitry Andric   // Condition should be s32 for scalar, s1 for vector.
11700b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_SELECT)
11710b57cec5SDimitry Andric     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
11720b57cec5SDimitry Andric           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1173480093f4SDimitry Andric           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
11740b57cec5SDimitry Andric     .clampScalar(0, S16, S64)
1175*5ffd83dbSDimitry Andric     .scalarize(1)
11760b57cec5SDimitry Andric     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
11770b57cec5SDimitry Andric     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
11780b57cec5SDimitry Andric     .clampMaxNumElements(0, S32, 2)
11790b57cec5SDimitry Andric     .clampMaxNumElements(0, LocalPtr, 2)
11800b57cec5SDimitry Andric     .clampMaxNumElements(0, PrivatePtr, 2)
11810b57cec5SDimitry Andric     .scalarize(0)
11820b57cec5SDimitry Andric     .widenScalarToNextPow2(0)
1183480093f4SDimitry Andric     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
11840b57cec5SDimitry Andric 
11850b57cec5SDimitry Andric   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
11860b57cec5SDimitry Andric   // be more flexible with the shift amount type.
11870b57cec5SDimitry Andric   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
11880b57cec5SDimitry Andric     .legalFor({{S32, S32}, {S64, S32}});
11890b57cec5SDimitry Andric   if (ST.has16BitInsts()) {
11900b57cec5SDimitry Andric     if (ST.hasVOP3PInsts()) {
1191*5ffd83dbSDimitry Andric       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
11920b57cec5SDimitry Andric             .clampMaxNumElements(0, S16, 2);
11930b57cec5SDimitry Andric     } else
1194*5ffd83dbSDimitry Andric       Shifts.legalFor({{S16, S16}});
11950b57cec5SDimitry Andric 
1196*5ffd83dbSDimitry Andric     // TODO: Support 16-bit shift amounts for all types
1197*5ffd83dbSDimitry Andric     Shifts.widenScalarIf(
1198*5ffd83dbSDimitry Andric       [=](const LegalityQuery &Query) {
1199*5ffd83dbSDimitry Andric         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1200*5ffd83dbSDimitry Andric         // 32-bit amount.
1201*5ffd83dbSDimitry Andric         const LLT ValTy = Query.Types[0];
1202*5ffd83dbSDimitry Andric         const LLT AmountTy = Query.Types[1];
1203*5ffd83dbSDimitry Andric         return ValTy.getSizeInBits() <= 16 &&
1204*5ffd83dbSDimitry Andric                AmountTy.getSizeInBits() < 16;
1205*5ffd83dbSDimitry Andric       }, changeTo(1, S16));
1206*5ffd83dbSDimitry Andric     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1207480093f4SDimitry Andric     Shifts.clampScalar(1, S32, S32);
12080b57cec5SDimitry Andric     Shifts.clampScalar(0, S16, S64);
12090b57cec5SDimitry Andric     Shifts.widenScalarToNextPow2(0, 16);
12100b57cec5SDimitry Andric   } else {
12110b57cec5SDimitry Andric     // Make sure we legalize the shift amount type first, as the general
12120b57cec5SDimitry Andric     // expansion for the shifted type will produce much worse code if it hasn't
12130b57cec5SDimitry Andric     // been truncated already.
12140b57cec5SDimitry Andric     Shifts.clampScalar(1, S32, S32);
12150b57cec5SDimitry Andric     Shifts.clampScalar(0, S32, S64);
12160b57cec5SDimitry Andric     Shifts.widenScalarToNextPow2(0, 32);
12170b57cec5SDimitry Andric   }
12180b57cec5SDimitry Andric   Shifts.scalarize(0);
12190b57cec5SDimitry Andric 
12200b57cec5SDimitry Andric   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
12210b57cec5SDimitry Andric     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
12220b57cec5SDimitry Andric     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
12230b57cec5SDimitry Andric     unsigned IdxTypeIdx = 2;
12240b57cec5SDimitry Andric 
12250b57cec5SDimitry Andric     getActionDefinitionsBuilder(Op)
12260b57cec5SDimitry Andric       .customIf([=](const LegalityQuery &Query) {
12270b57cec5SDimitry Andric           const LLT EltTy = Query.Types[EltTypeIdx];
12280b57cec5SDimitry Andric           const LLT VecTy = Query.Types[VecTypeIdx];
12290b57cec5SDimitry Andric           const LLT IdxTy = Query.Types[IdxTypeIdx];
12300b57cec5SDimitry Andric           return (EltTy.getSizeInBits() == 16 ||
12310b57cec5SDimitry Andric                   EltTy.getSizeInBits() % 32 == 0) &&
12320b57cec5SDimitry Andric                  VecTy.getSizeInBits() % 32 == 0 &&
1233*5ffd83dbSDimitry Andric                  VecTy.getSizeInBits() <= MaxRegisterSize &&
12340b57cec5SDimitry Andric                  IdxTy.getSizeInBits() == 32;
12350b57cec5SDimitry Andric         })
12360b57cec5SDimitry Andric       .clampScalar(EltTypeIdx, S32, S64)
12370b57cec5SDimitry Andric       .clampScalar(VecTypeIdx, S32, S64)
12380b57cec5SDimitry Andric       .clampScalar(IdxTypeIdx, S32, S32);
12390b57cec5SDimitry Andric   }
12400b57cec5SDimitry Andric 
12410b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
12420b57cec5SDimitry Andric     .unsupportedIf([=](const LegalityQuery &Query) {
12430b57cec5SDimitry Andric         const LLT &EltTy = Query.Types[1].getElementType();
12440b57cec5SDimitry Andric         return Query.Types[0] != EltTy;
12450b57cec5SDimitry Andric       });
12460b57cec5SDimitry Andric 
12470b57cec5SDimitry Andric   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
12480b57cec5SDimitry Andric     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
12490b57cec5SDimitry Andric     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
12500b57cec5SDimitry Andric 
12510b57cec5SDimitry Andric     // FIXME: Doesn't handle extract of illegal sizes.
12520b57cec5SDimitry Andric     getActionDefinitionsBuilder(Op)
12538bcb0991SDimitry Andric       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
12548bcb0991SDimitry Andric       // FIXME: Multiples of 16 should not be legal.
12550b57cec5SDimitry Andric       .legalIf([=](const LegalityQuery &Query) {
12560b57cec5SDimitry Andric           const LLT BigTy = Query.Types[BigTyIdx];
12570b57cec5SDimitry Andric           const LLT LitTy = Query.Types[LitTyIdx];
12580b57cec5SDimitry Andric           return (BigTy.getSizeInBits() % 32 == 0) &&
12590b57cec5SDimitry Andric                  (LitTy.getSizeInBits() % 16 == 0);
12600b57cec5SDimitry Andric         })
12610b57cec5SDimitry Andric       .widenScalarIf(
12620b57cec5SDimitry Andric         [=](const LegalityQuery &Query) {
12630b57cec5SDimitry Andric           const LLT BigTy = Query.Types[BigTyIdx];
12640b57cec5SDimitry Andric           return (BigTy.getScalarSizeInBits() < 16);
12650b57cec5SDimitry Andric         },
12660b57cec5SDimitry Andric         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
12670b57cec5SDimitry Andric       .widenScalarIf(
12680b57cec5SDimitry Andric         [=](const LegalityQuery &Query) {
12690b57cec5SDimitry Andric           const LLT LitTy = Query.Types[LitTyIdx];
12700b57cec5SDimitry Andric           return (LitTy.getScalarSizeInBits() < 16);
12710b57cec5SDimitry Andric         },
12720b57cec5SDimitry Andric         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
12730b57cec5SDimitry Andric       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
12740b57cec5SDimitry Andric       .widenScalarToNextPow2(BigTyIdx, 32);
12750b57cec5SDimitry Andric 
12760b57cec5SDimitry Andric   }
12770b57cec5SDimitry Andric 
12788bcb0991SDimitry Andric   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
12790b57cec5SDimitry Andric     .legalForCartesianProduct(AllS32Vectors, {S32})
12800b57cec5SDimitry Andric     .legalForCartesianProduct(AllS64Vectors, {S64})
12818bcb0991SDimitry Andric     .clampNumElements(0, V16S32, V32S32)
12828bcb0991SDimitry Andric     .clampNumElements(0, V2S64, V16S64)
12838bcb0991SDimitry Andric     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
12848bcb0991SDimitry Andric 
12858bcb0991SDimitry Andric   if (ST.hasScalarPackInsts()) {
1286*5ffd83dbSDimitry Andric     BuildVector
1287*5ffd83dbSDimitry Andric       // FIXME: Should probably widen s1 vectors straight to s32
1288*5ffd83dbSDimitry Andric       .minScalarOrElt(0, S16)
1289*5ffd83dbSDimitry Andric       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1290*5ffd83dbSDimitry Andric       .minScalar(1, S32);
1291*5ffd83dbSDimitry Andric 
12928bcb0991SDimitry Andric     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
12938bcb0991SDimitry Andric       .legalFor({V2S16, S32})
12948bcb0991SDimitry Andric       .lower();
1295*5ffd83dbSDimitry Andric     BuildVector.minScalarOrElt(0, S32);
12968bcb0991SDimitry Andric   } else {
1297*5ffd83dbSDimitry Andric     BuildVector.customFor({V2S16, S16});
1298*5ffd83dbSDimitry Andric     BuildVector.minScalarOrElt(0, S32);
1299*5ffd83dbSDimitry Andric 
13008bcb0991SDimitry Andric     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1301*5ffd83dbSDimitry Andric       .customFor({V2S16, S32})
13028bcb0991SDimitry Andric       .lower();
13038bcb0991SDimitry Andric   }
13048bcb0991SDimitry Andric 
1305*5ffd83dbSDimitry Andric   BuildVector.legalIf(isRegisterType(0));
1306*5ffd83dbSDimitry Andric 
1307*5ffd83dbSDimitry Andric   // FIXME: Clamp maximum size
13080b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
13090b57cec5SDimitry Andric     .legalIf(isRegisterType(0));
13100b57cec5SDimitry Andric 
1311*5ffd83dbSDimitry Andric   // TODO: Don't fully scalarize v2s16 pieces? Or combine out thosse
1312*5ffd83dbSDimitry Andric   // pre-legalize.
1313*5ffd83dbSDimitry Andric   if (ST.hasVOP3PInsts()) {
1314*5ffd83dbSDimitry Andric     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1315*5ffd83dbSDimitry Andric       .customFor({V2S16, V2S16})
1316*5ffd83dbSDimitry Andric       .lower();
1317*5ffd83dbSDimitry Andric   } else
13188bcb0991SDimitry Andric     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
13198bcb0991SDimitry Andric 
13200b57cec5SDimitry Andric   // Merge/Unmerge
13210b57cec5SDimitry Andric   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
13220b57cec5SDimitry Andric     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
13230b57cec5SDimitry Andric     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
13240b57cec5SDimitry Andric 
13250b57cec5SDimitry Andric     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1326*5ffd83dbSDimitry Andric       const LLT Ty = Query.Types[TypeIdx];
13270b57cec5SDimitry Andric       if (Ty.isVector()) {
13280b57cec5SDimitry Andric         const LLT &EltTy = Ty.getElementType();
1329*5ffd83dbSDimitry Andric         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
13300b57cec5SDimitry Andric           return true;
13310b57cec5SDimitry Andric         if (!isPowerOf2_32(EltTy.getSizeInBits()))
13320b57cec5SDimitry Andric           return true;
13330b57cec5SDimitry Andric       }
13340b57cec5SDimitry Andric       return false;
13350b57cec5SDimitry Andric     };
13360b57cec5SDimitry Andric 
13378bcb0991SDimitry Andric     auto &Builder = getActionDefinitionsBuilder(Op)
1338*5ffd83dbSDimitry Andric       .lowerFor({{S16, V2S16}})
1339*5ffd83dbSDimitry Andric       .lowerIf([=](const LegalityQuery &Query) {
1340*5ffd83dbSDimitry Andric           const LLT BigTy = Query.Types[BigTyIdx];
1341*5ffd83dbSDimitry Andric           return BigTy.getSizeInBits() == 32;
1342*5ffd83dbSDimitry Andric         })
1343*5ffd83dbSDimitry Andric       // Try to widen to s16 first for small types.
1344*5ffd83dbSDimitry Andric       // TODO: Only do this on targets with legal s16 shifts
1345*5ffd83dbSDimitry Andric       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
13460b57cec5SDimitry Andric       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
13478bcb0991SDimitry Andric       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
13488bcb0991SDimitry Andric       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
13498bcb0991SDimitry Andric                            elementTypeIs(1, S16)),
13508bcb0991SDimitry Andric                        changeTo(1, V2S16))
1351*5ffd83dbSDimitry Andric       // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
1352*5ffd83dbSDimitry Andric       // worth considering the multiples of 64 since 2*192 and 2*384 are not
1353*5ffd83dbSDimitry Andric       // valid.
1354*5ffd83dbSDimitry Andric       .clampScalar(LitTyIdx, S32, S512)
1355*5ffd83dbSDimitry Andric       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
13560b57cec5SDimitry Andric       // Break up vectors with weird elements into scalars
13570b57cec5SDimitry Andric       .fewerElementsIf(
1358*5ffd83dbSDimitry Andric         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
13590b57cec5SDimitry Andric         scalarize(0))
13600b57cec5SDimitry Andric       .fewerElementsIf(
1361*5ffd83dbSDimitry Andric         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
13620b57cec5SDimitry Andric         scalarize(1))
1363*5ffd83dbSDimitry Andric       .clampScalar(BigTyIdx, S32, MaxScalar);
13648bcb0991SDimitry Andric 
13658bcb0991SDimitry Andric     if (Op == G_MERGE_VALUES) {
13668bcb0991SDimitry Andric       Builder.widenScalarIf(
13678bcb0991SDimitry Andric         // TODO: Use 16-bit shifts if legal for 8-bit values?
13680b57cec5SDimitry Andric         [=](const LegalityQuery &Query) {
13698bcb0991SDimitry Andric           const LLT Ty = Query.Types[LitTyIdx];
13708bcb0991SDimitry Andric           return Ty.getSizeInBits() < 32;
13718bcb0991SDimitry Andric         },
13728bcb0991SDimitry Andric         changeTo(LitTyIdx, S32));
13738bcb0991SDimitry Andric     }
13748bcb0991SDimitry Andric 
13758bcb0991SDimitry Andric     Builder.widenScalarIf(
13768bcb0991SDimitry Andric       [=](const LegalityQuery &Query) {
13778bcb0991SDimitry Andric         const LLT Ty = Query.Types[BigTyIdx];
13780b57cec5SDimitry Andric         return !isPowerOf2_32(Ty.getSizeInBits()) &&
13790b57cec5SDimitry Andric           Ty.getSizeInBits() % 16 != 0;
13800b57cec5SDimitry Andric       },
13810b57cec5SDimitry Andric       [=](const LegalityQuery &Query) {
13820b57cec5SDimitry Andric         // Pick the next power of 2, or a multiple of 64 over 128.
13830b57cec5SDimitry Andric         // Whichever is smaller.
13840b57cec5SDimitry Andric         const LLT &Ty = Query.Types[BigTyIdx];
13850b57cec5SDimitry Andric         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
13860b57cec5SDimitry Andric         if (NewSizeInBits >= 256) {
13870b57cec5SDimitry Andric           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
13880b57cec5SDimitry Andric           if (RoundedTo < NewSizeInBits)
13890b57cec5SDimitry Andric             NewSizeInBits = RoundedTo;
13900b57cec5SDimitry Andric         }
13910b57cec5SDimitry Andric         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
13920b57cec5SDimitry Andric       })
13930b57cec5SDimitry Andric       .legalIf([=](const LegalityQuery &Query) {
13940b57cec5SDimitry Andric           const LLT &BigTy = Query.Types[BigTyIdx];
13950b57cec5SDimitry Andric           const LLT &LitTy = Query.Types[LitTyIdx];
13960b57cec5SDimitry Andric 
13970b57cec5SDimitry Andric           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
13980b57cec5SDimitry Andric             return false;
13990b57cec5SDimitry Andric           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
14000b57cec5SDimitry Andric             return false;
14010b57cec5SDimitry Andric 
14020b57cec5SDimitry Andric           return BigTy.getSizeInBits() % 16 == 0 &&
14030b57cec5SDimitry Andric                  LitTy.getSizeInBits() % 16 == 0 &&
1404*5ffd83dbSDimitry Andric                  BigTy.getSizeInBits() <= MaxRegisterSize;
14050b57cec5SDimitry Andric         })
14060b57cec5SDimitry Andric       // Any vectors left are the wrong size. Scalarize them.
14070b57cec5SDimitry Andric       .scalarize(0)
14080b57cec5SDimitry Andric       .scalarize(1);
14090b57cec5SDimitry Andric   }
14100b57cec5SDimitry Andric 
1411*5ffd83dbSDimitry Andric   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1412*5ffd83dbSDimitry Andric   // RegBankSelect.
1413*5ffd83dbSDimitry Andric   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
1414*5ffd83dbSDimitry Andric     .legalFor({{S32}, {S64}});
14158bcb0991SDimitry Andric 
1416*5ffd83dbSDimitry Andric   if (ST.hasVOP3PInsts()) {
1417*5ffd83dbSDimitry Andric     SextInReg.lowerFor({{V2S16}})
1418*5ffd83dbSDimitry Andric       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
1419*5ffd83dbSDimitry Andric       // get more vector shift opportunities, since we'll get those when
1420*5ffd83dbSDimitry Andric       // expanded.
1421*5ffd83dbSDimitry Andric       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
1422*5ffd83dbSDimitry Andric   } else if (ST.has16BitInsts()) {
1423*5ffd83dbSDimitry Andric     SextInReg.lowerFor({{S32}, {S64}, {S16}});
1424*5ffd83dbSDimitry Andric   } else {
1425*5ffd83dbSDimitry Andric     // Prefer to promote to s32 before lowering if we don't have 16-bit
1426*5ffd83dbSDimitry Andric     // shifts. This avoid a lot of intermediate truncate and extend operations.
1427*5ffd83dbSDimitry Andric     SextInReg.lowerFor({{S32}, {S64}});
1428*5ffd83dbSDimitry Andric   }
1429*5ffd83dbSDimitry Andric 
1430*5ffd83dbSDimitry Andric   // FIXME: Placeholder rule. Really depends on whether the clamp modifier is
1431*5ffd83dbSDimitry Andric   // available, and is selectively legal for s16, s32, v2s16.
1432*5ffd83dbSDimitry Andric   getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT, G_UADDSAT, G_USUBSAT})
1433*5ffd83dbSDimitry Andric     .scalarize(0)
1434*5ffd83dbSDimitry Andric     .clampScalar(0, S16, S32);
1435*5ffd83dbSDimitry Andric 
1436*5ffd83dbSDimitry Andric   SextInReg
1437*5ffd83dbSDimitry Andric     .scalarize(0)
1438*5ffd83dbSDimitry Andric     .clampScalar(0, S32, S64)
1439*5ffd83dbSDimitry Andric     .lower();
1440*5ffd83dbSDimitry Andric 
1441*5ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_FSHR)
1442*5ffd83dbSDimitry Andric     .legalFor({{S32, S32}})
1443*5ffd83dbSDimitry Andric     .scalarize(0)
1444*5ffd83dbSDimitry Andric     .lower();
1445480093f4SDimitry Andric 
1446480093f4SDimitry Andric   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1447480093f4SDimitry Andric     .legalFor({S64});
1448480093f4SDimitry Andric 
1449*5ffd83dbSDimitry Andric   getActionDefinitionsBuilder({
1450*5ffd83dbSDimitry Andric       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1451*5ffd83dbSDimitry Andric       G_FCOPYSIGN,
1452*5ffd83dbSDimitry Andric 
1453*5ffd83dbSDimitry Andric       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1454*5ffd83dbSDimitry Andric       G_READ_REGISTER,
1455*5ffd83dbSDimitry Andric       G_WRITE_REGISTER,
1456*5ffd83dbSDimitry Andric 
1457*5ffd83dbSDimitry Andric       G_SADDO, G_SSUBO,
1458*5ffd83dbSDimitry Andric 
1459*5ffd83dbSDimitry Andric        // TODO: Implement
1460*5ffd83dbSDimitry Andric       G_FMINIMUM, G_FMAXIMUM,
1461*5ffd83dbSDimitry Andric       G_FSHL
1462*5ffd83dbSDimitry Andric     }).lower();
1463*5ffd83dbSDimitry Andric 
1464480093f4SDimitry Andric   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1465*5ffd83dbSDimitry Andric         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1466480093f4SDimitry Andric         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1467480093f4SDimitry Andric     .unsupported();
1468480093f4SDimitry Andric 
14690b57cec5SDimitry Andric   computeTables();
14700b57cec5SDimitry Andric   verify(*ST.getInstrInfo());
14710b57cec5SDimitry Andric }
14720b57cec5SDimitry Andric 
1473*5ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1474*5ffd83dbSDimitry Andric                                          MachineInstr &MI) const {
1475*5ffd83dbSDimitry Andric   MachineIRBuilder &B = Helper.MIRBuilder;
1476*5ffd83dbSDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
1477*5ffd83dbSDimitry Andric   GISelChangeObserver &Observer = Helper.Observer;
1478*5ffd83dbSDimitry Andric 
14790b57cec5SDimitry Andric   switch (MI.getOpcode()) {
14800b57cec5SDimitry Andric   case TargetOpcode::G_ADDRSPACE_CAST:
14818bcb0991SDimitry Andric     return legalizeAddrSpaceCast(MI, MRI, B);
14820b57cec5SDimitry Andric   case TargetOpcode::G_FRINT:
14838bcb0991SDimitry Andric     return legalizeFrint(MI, MRI, B);
14840b57cec5SDimitry Andric   case TargetOpcode::G_FCEIL:
14858bcb0991SDimitry Andric     return legalizeFceil(MI, MRI, B);
14860b57cec5SDimitry Andric   case TargetOpcode::G_INTRINSIC_TRUNC:
14878bcb0991SDimitry Andric     return legalizeIntrinsicTrunc(MI, MRI, B);
14880b57cec5SDimitry Andric   case TargetOpcode::G_SITOFP:
14898bcb0991SDimitry Andric     return legalizeITOFP(MI, MRI, B, true);
14900b57cec5SDimitry Andric   case TargetOpcode::G_UITOFP:
14918bcb0991SDimitry Andric     return legalizeITOFP(MI, MRI, B, false);
1492*5ffd83dbSDimitry Andric   case TargetOpcode::G_FPTOSI:
1493*5ffd83dbSDimitry Andric     return legalizeFPTOI(MI, MRI, B, true);
1494*5ffd83dbSDimitry Andric   case TargetOpcode::G_FPTOUI:
1495*5ffd83dbSDimitry Andric     return legalizeFPTOI(MI, MRI, B, false);
14960b57cec5SDimitry Andric   case TargetOpcode::G_FMINNUM:
14970b57cec5SDimitry Andric   case TargetOpcode::G_FMAXNUM:
14980b57cec5SDimitry Andric   case TargetOpcode::G_FMINNUM_IEEE:
14990b57cec5SDimitry Andric   case TargetOpcode::G_FMAXNUM_IEEE:
1500*5ffd83dbSDimitry Andric     return legalizeMinNumMaxNum(Helper, MI);
15010b57cec5SDimitry Andric   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
15028bcb0991SDimitry Andric     return legalizeExtractVectorElt(MI, MRI, B);
15030b57cec5SDimitry Andric   case TargetOpcode::G_INSERT_VECTOR_ELT:
15048bcb0991SDimitry Andric     return legalizeInsertVectorElt(MI, MRI, B);
1505*5ffd83dbSDimitry Andric   case TargetOpcode::G_SHUFFLE_VECTOR:
1506*5ffd83dbSDimitry Andric     return legalizeShuffleVector(MI, MRI, B);
15078bcb0991SDimitry Andric   case TargetOpcode::G_FSIN:
15088bcb0991SDimitry Andric   case TargetOpcode::G_FCOS:
15098bcb0991SDimitry Andric     return legalizeSinCos(MI, MRI, B);
15108bcb0991SDimitry Andric   case TargetOpcode::G_GLOBAL_VALUE:
15118bcb0991SDimitry Andric     return legalizeGlobalValue(MI, MRI, B);
15128bcb0991SDimitry Andric   case TargetOpcode::G_LOAD:
15138bcb0991SDimitry Andric     return legalizeLoad(MI, MRI, B, Observer);
15148bcb0991SDimitry Andric   case TargetOpcode::G_FMAD:
15158bcb0991SDimitry Andric     return legalizeFMad(MI, MRI, B);
15168bcb0991SDimitry Andric   case TargetOpcode::G_FDIV:
15178bcb0991SDimitry Andric     return legalizeFDIV(MI, MRI, B);
1518*5ffd83dbSDimitry Andric   case TargetOpcode::G_UDIV:
1519*5ffd83dbSDimitry Andric   case TargetOpcode::G_UREM:
1520*5ffd83dbSDimitry Andric     return legalizeUDIV_UREM(MI, MRI, B);
1521*5ffd83dbSDimitry Andric   case TargetOpcode::G_SDIV:
1522*5ffd83dbSDimitry Andric   case TargetOpcode::G_SREM:
1523*5ffd83dbSDimitry Andric     return legalizeSDIV_SREM(MI, MRI, B);
1524480093f4SDimitry Andric   case TargetOpcode::G_ATOMIC_CMPXCHG:
1525480093f4SDimitry Andric     return legalizeAtomicCmpXChg(MI, MRI, B);
1526*5ffd83dbSDimitry Andric   case TargetOpcode::G_FLOG:
1527*5ffd83dbSDimitry Andric     return legalizeFlog(MI, B, numbers::ln2f);
1528*5ffd83dbSDimitry Andric   case TargetOpcode::G_FLOG10:
1529*5ffd83dbSDimitry Andric     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
1530*5ffd83dbSDimitry Andric   case TargetOpcode::G_FEXP:
1531*5ffd83dbSDimitry Andric     return legalizeFExp(MI, B);
1532*5ffd83dbSDimitry Andric   case TargetOpcode::G_FPOW:
1533*5ffd83dbSDimitry Andric     return legalizeFPow(MI, B);
1534*5ffd83dbSDimitry Andric   case TargetOpcode::G_FFLOOR:
1535*5ffd83dbSDimitry Andric     return legalizeFFloor(MI, MRI, B);
1536*5ffd83dbSDimitry Andric   case TargetOpcode::G_BUILD_VECTOR:
1537*5ffd83dbSDimitry Andric     return legalizeBuildVector(MI, MRI, B);
15380b57cec5SDimitry Andric   default:
15390b57cec5SDimitry Andric     return false;
15400b57cec5SDimitry Andric   }
15410b57cec5SDimitry Andric 
15420b57cec5SDimitry Andric   llvm_unreachable("expected switch to return");
15430b57cec5SDimitry Andric }
15440b57cec5SDimitry Andric 
15450b57cec5SDimitry Andric Register AMDGPULegalizerInfo::getSegmentAperture(
15460b57cec5SDimitry Andric   unsigned AS,
15470b57cec5SDimitry Andric   MachineRegisterInfo &MRI,
15488bcb0991SDimitry Andric   MachineIRBuilder &B) const {
15498bcb0991SDimitry Andric   MachineFunction &MF = B.getMF();
15500b57cec5SDimitry Andric   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
15510b57cec5SDimitry Andric   const LLT S32 = LLT::scalar(32);
15520b57cec5SDimitry Andric 
15538bcb0991SDimitry Andric   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
15548bcb0991SDimitry Andric 
15550b57cec5SDimitry Andric   if (ST.hasApertureRegs()) {
15560b57cec5SDimitry Andric     // FIXME: Use inline constants (src_{shared, private}_base) instead of
15570b57cec5SDimitry Andric     // getreg.
15580b57cec5SDimitry Andric     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
15590b57cec5SDimitry Andric         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
15600b57cec5SDimitry Andric         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
15610b57cec5SDimitry Andric     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
15620b57cec5SDimitry Andric         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
15630b57cec5SDimitry Andric         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
15640b57cec5SDimitry Andric     unsigned Encoding =
15650b57cec5SDimitry Andric         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
15660b57cec5SDimitry Andric         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
15670b57cec5SDimitry Andric         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
15680b57cec5SDimitry Andric 
15690b57cec5SDimitry Andric     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
15700b57cec5SDimitry Andric 
15718bcb0991SDimitry Andric     B.buildInstr(AMDGPU::S_GETREG_B32)
15720b57cec5SDimitry Andric       .addDef(GetReg)
15730b57cec5SDimitry Andric       .addImm(Encoding);
15740b57cec5SDimitry Andric     MRI.setType(GetReg, S32);
15750b57cec5SDimitry Andric 
15768bcb0991SDimitry Andric     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
1577*5ffd83dbSDimitry Andric     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
15780b57cec5SDimitry Andric   }
15790b57cec5SDimitry Andric 
15800b57cec5SDimitry Andric   Register QueuePtr = MRI.createGenericVirtualRegister(
15810b57cec5SDimitry Andric     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
15820b57cec5SDimitry Andric 
15838bcb0991SDimitry Andric   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
15848bcb0991SDimitry Andric   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
15858bcb0991SDimitry Andric     return Register();
15860b57cec5SDimitry Andric 
15870b57cec5SDimitry Andric   // Offset into amd_queue_t for group_segment_aperture_base_hi /
15880b57cec5SDimitry Andric   // private_segment_aperture_base_hi.
15890b57cec5SDimitry Andric   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
15900b57cec5SDimitry Andric 
1591480093f4SDimitry Andric   // TODO: can we be smarter about machine pointer info?
1592480093f4SDimitry Andric   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
15930b57cec5SDimitry Andric   MachineMemOperand *MMO = MF.getMachineMemOperand(
15940b57cec5SDimitry Andric       PtrInfo,
1595*5ffd83dbSDimitry Andric       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
15960b57cec5SDimitry Andric           MachineMemOperand::MOInvariant,
1597*5ffd83dbSDimitry Andric       4, commonAlignment(Align(64), StructOffset));
15980b57cec5SDimitry Andric 
15990b57cec5SDimitry Andric   Register LoadAddr;
16000b57cec5SDimitry Andric 
1601480093f4SDimitry Andric   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
1602*5ffd83dbSDimitry Andric   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
16030b57cec5SDimitry Andric }
16040b57cec5SDimitry Andric 
16050b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
16060b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
16078bcb0991SDimitry Andric   MachineIRBuilder &B) const {
16088bcb0991SDimitry Andric   MachineFunction &MF = B.getMF();
16090b57cec5SDimitry Andric 
16108bcb0991SDimitry Andric   const LLT S32 = LLT::scalar(32);
16110b57cec5SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
16120b57cec5SDimitry Andric   Register Src = MI.getOperand(1).getReg();
16130b57cec5SDimitry Andric 
16140b57cec5SDimitry Andric   LLT DstTy = MRI.getType(Dst);
16150b57cec5SDimitry Andric   LLT SrcTy = MRI.getType(Src);
16160b57cec5SDimitry Andric   unsigned DestAS = DstTy.getAddressSpace();
16170b57cec5SDimitry Andric   unsigned SrcAS = SrcTy.getAddressSpace();
16180b57cec5SDimitry Andric 
16190b57cec5SDimitry Andric   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
16200b57cec5SDimitry Andric   // vector element.
16210b57cec5SDimitry Andric   assert(!DstTy.isVector());
16220b57cec5SDimitry Andric 
16230b57cec5SDimitry Andric   const AMDGPUTargetMachine &TM
16240b57cec5SDimitry Andric     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
16250b57cec5SDimitry Andric 
16260b57cec5SDimitry Andric   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
16270b57cec5SDimitry Andric   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
16288bcb0991SDimitry Andric     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
16298bcb0991SDimitry Andric     return true;
16308bcb0991SDimitry Andric   }
16318bcb0991SDimitry Andric 
16328bcb0991SDimitry Andric   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
16338bcb0991SDimitry Andric     // Truncate.
16348bcb0991SDimitry Andric     B.buildExtract(Dst, Src, 0);
16358bcb0991SDimitry Andric     MI.eraseFromParent();
16368bcb0991SDimitry Andric     return true;
16378bcb0991SDimitry Andric   }
16388bcb0991SDimitry Andric 
16398bcb0991SDimitry Andric   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
16408bcb0991SDimitry Andric     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
16418bcb0991SDimitry Andric     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
16428bcb0991SDimitry Andric 
16438bcb0991SDimitry Andric     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
16448bcb0991SDimitry Andric     // another. Merge operands are required to be the same type, but creating an
16458bcb0991SDimitry Andric     // extra ptrtoint would be kind of pointless.
16468bcb0991SDimitry Andric     auto HighAddr = B.buildConstant(
16478bcb0991SDimitry Andric       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
1648*5ffd83dbSDimitry Andric     B.buildMerge(Dst, {Src, HighAddr});
16498bcb0991SDimitry Andric     MI.eraseFromParent();
16500b57cec5SDimitry Andric     return true;
16510b57cec5SDimitry Andric   }
16520b57cec5SDimitry Andric 
16530b57cec5SDimitry Andric   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
16540b57cec5SDimitry Andric     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
16550b57cec5SDimitry Andric            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
16560b57cec5SDimitry Andric     unsigned NullVal = TM.getNullPointerValue(DestAS);
16570b57cec5SDimitry Andric 
16588bcb0991SDimitry Andric     auto SegmentNull = B.buildConstant(DstTy, NullVal);
16598bcb0991SDimitry Andric     auto FlatNull = B.buildConstant(SrcTy, 0);
16600b57cec5SDimitry Andric 
16610b57cec5SDimitry Andric     // Extract low 32-bits of the pointer.
1662*5ffd83dbSDimitry Andric     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
16630b57cec5SDimitry Andric 
1664*5ffd83dbSDimitry Andric     auto CmpRes =
1665*5ffd83dbSDimitry Andric         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
16668bcb0991SDimitry Andric     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
16670b57cec5SDimitry Andric 
16680b57cec5SDimitry Andric     MI.eraseFromParent();
16690b57cec5SDimitry Andric     return true;
16700b57cec5SDimitry Andric   }
16710b57cec5SDimitry Andric 
16728bcb0991SDimitry Andric   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
16738bcb0991SDimitry Andric     return false;
16748bcb0991SDimitry Andric 
16758bcb0991SDimitry Andric   if (!ST.hasFlatAddressSpace())
16768bcb0991SDimitry Andric     return false;
16770b57cec5SDimitry Andric 
16780b57cec5SDimitry Andric   auto SegmentNull =
16798bcb0991SDimitry Andric       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
16800b57cec5SDimitry Andric   auto FlatNull =
16818bcb0991SDimitry Andric       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
16820b57cec5SDimitry Andric 
16838bcb0991SDimitry Andric   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
16848bcb0991SDimitry Andric   if (!ApertureReg.isValid())
16858bcb0991SDimitry Andric     return false;
16860b57cec5SDimitry Andric 
1687*5ffd83dbSDimitry Andric   auto CmpRes =
1688*5ffd83dbSDimitry Andric       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
16890b57cec5SDimitry Andric 
16900b57cec5SDimitry Andric   // Coerce the type of the low half of the result so we can use merge_values.
1691*5ffd83dbSDimitry Andric   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
16920b57cec5SDimitry Andric 
16930b57cec5SDimitry Andric   // TODO: Should we allow mismatched types but matching sizes in merges to
16940b57cec5SDimitry Andric   // avoid the ptrtoint?
1695*5ffd83dbSDimitry Andric   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1696*5ffd83dbSDimitry Andric   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
16970b57cec5SDimitry Andric 
16980b57cec5SDimitry Andric   MI.eraseFromParent();
16990b57cec5SDimitry Andric   return true;
17000b57cec5SDimitry Andric }
17010b57cec5SDimitry Andric 
17020b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeFrint(
17030b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
17048bcb0991SDimitry Andric   MachineIRBuilder &B) const {
17050b57cec5SDimitry Andric   Register Src = MI.getOperand(1).getReg();
17060b57cec5SDimitry Andric   LLT Ty = MRI.getType(Src);
17070b57cec5SDimitry Andric   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
17080b57cec5SDimitry Andric 
17090b57cec5SDimitry Andric   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
17100b57cec5SDimitry Andric   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
17110b57cec5SDimitry Andric 
17128bcb0991SDimitry Andric   auto C1 = B.buildFConstant(Ty, C1Val);
17138bcb0991SDimitry Andric   auto CopySign = B.buildFCopysign(Ty, C1, Src);
17140b57cec5SDimitry Andric 
17150b57cec5SDimitry Andric   // TODO: Should this propagate fast-math-flags?
17168bcb0991SDimitry Andric   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
17178bcb0991SDimitry Andric   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
17180b57cec5SDimitry Andric 
17198bcb0991SDimitry Andric   auto C2 = B.buildFConstant(Ty, C2Val);
17208bcb0991SDimitry Andric   auto Fabs = B.buildFAbs(Ty, Src);
17210b57cec5SDimitry Andric 
17228bcb0991SDimitry Andric   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
17238bcb0991SDimitry Andric   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
17240b57cec5SDimitry Andric   return true;
17250b57cec5SDimitry Andric }
17260b57cec5SDimitry Andric 
17270b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeFceil(
17280b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
17290b57cec5SDimitry Andric   MachineIRBuilder &B) const {
17300b57cec5SDimitry Andric 
17310b57cec5SDimitry Andric   const LLT S1 = LLT::scalar(1);
17320b57cec5SDimitry Andric   const LLT S64 = LLT::scalar(64);
17330b57cec5SDimitry Andric 
17340b57cec5SDimitry Andric   Register Src = MI.getOperand(1).getReg();
17350b57cec5SDimitry Andric   assert(MRI.getType(Src) == S64);
17360b57cec5SDimitry Andric 
17370b57cec5SDimitry Andric   // result = trunc(src)
17380b57cec5SDimitry Andric   // if (src > 0.0 && src != result)
17390b57cec5SDimitry Andric   //   result += 1.0
17400b57cec5SDimitry Andric 
1741*5ffd83dbSDimitry Andric   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
17420b57cec5SDimitry Andric 
17430b57cec5SDimitry Andric   const auto Zero = B.buildFConstant(S64, 0.0);
17440b57cec5SDimitry Andric   const auto One = B.buildFConstant(S64, 1.0);
17450b57cec5SDimitry Andric   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
17460b57cec5SDimitry Andric   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
17470b57cec5SDimitry Andric   auto And = B.buildAnd(S1, Lt0, NeTrunc);
17480b57cec5SDimitry Andric   auto Add = B.buildSelect(S64, And, One, Zero);
17490b57cec5SDimitry Andric 
17500b57cec5SDimitry Andric   // TODO: Should this propagate fast-math-flags?
17510b57cec5SDimitry Andric   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
17520b57cec5SDimitry Andric   return true;
17530b57cec5SDimitry Andric }
17540b57cec5SDimitry Andric 
17550b57cec5SDimitry Andric static MachineInstrBuilder extractF64Exponent(unsigned Hi,
17560b57cec5SDimitry Andric                                               MachineIRBuilder &B) {
17570b57cec5SDimitry Andric   const unsigned FractBits = 52;
17580b57cec5SDimitry Andric   const unsigned ExpBits = 11;
17590b57cec5SDimitry Andric   LLT S32 = LLT::scalar(32);
17600b57cec5SDimitry Andric 
17610b57cec5SDimitry Andric   auto Const0 = B.buildConstant(S32, FractBits - 32);
17620b57cec5SDimitry Andric   auto Const1 = B.buildConstant(S32, ExpBits);
17630b57cec5SDimitry Andric 
17640b57cec5SDimitry Andric   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
17650b57cec5SDimitry Andric     .addUse(Const0.getReg(0))
17660b57cec5SDimitry Andric     .addUse(Const1.getReg(0));
17670b57cec5SDimitry Andric 
17680b57cec5SDimitry Andric   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
17690b57cec5SDimitry Andric }
17700b57cec5SDimitry Andric 
17710b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
17720b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
17730b57cec5SDimitry Andric   MachineIRBuilder &B) const {
17740b57cec5SDimitry Andric   const LLT S1 = LLT::scalar(1);
17750b57cec5SDimitry Andric   const LLT S32 = LLT::scalar(32);
17760b57cec5SDimitry Andric   const LLT S64 = LLT::scalar(64);
17770b57cec5SDimitry Andric 
17780b57cec5SDimitry Andric   Register Src = MI.getOperand(1).getReg();
17790b57cec5SDimitry Andric   assert(MRI.getType(Src) == S64);
17800b57cec5SDimitry Andric 
17810b57cec5SDimitry Andric   // TODO: Should this use extract since the low half is unused?
17820b57cec5SDimitry Andric   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
17830b57cec5SDimitry Andric   Register Hi = Unmerge.getReg(1);
17840b57cec5SDimitry Andric 
17850b57cec5SDimitry Andric   // Extract the upper half, since this is where we will find the sign and
17860b57cec5SDimitry Andric   // exponent.
17870b57cec5SDimitry Andric   auto Exp = extractF64Exponent(Hi, B);
17880b57cec5SDimitry Andric 
17890b57cec5SDimitry Andric   const unsigned FractBits = 52;
17900b57cec5SDimitry Andric 
17910b57cec5SDimitry Andric   // Extract the sign bit.
17920b57cec5SDimitry Andric   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
17930b57cec5SDimitry Andric   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
17940b57cec5SDimitry Andric 
17950b57cec5SDimitry Andric   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
17960b57cec5SDimitry Andric 
17970b57cec5SDimitry Andric   const auto Zero32 = B.buildConstant(S32, 0);
17980b57cec5SDimitry Andric 
17990b57cec5SDimitry Andric   // Extend back to 64-bits.
1800*5ffd83dbSDimitry Andric   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
18010b57cec5SDimitry Andric 
18020b57cec5SDimitry Andric   auto Shr = B.buildAShr(S64, FractMask, Exp);
18030b57cec5SDimitry Andric   auto Not = B.buildNot(S64, Shr);
18040b57cec5SDimitry Andric   auto Tmp0 = B.buildAnd(S64, Src, Not);
18050b57cec5SDimitry Andric   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
18060b57cec5SDimitry Andric 
18070b57cec5SDimitry Andric   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
18080b57cec5SDimitry Andric   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
18090b57cec5SDimitry Andric 
18100b57cec5SDimitry Andric   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
18110b57cec5SDimitry Andric   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
18120b57cec5SDimitry Andric   return true;
18130b57cec5SDimitry Andric }
18140b57cec5SDimitry Andric 
18150b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeITOFP(
18160b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
18170b57cec5SDimitry Andric   MachineIRBuilder &B, bool Signed) const {
18180b57cec5SDimitry Andric 
18190b57cec5SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
18200b57cec5SDimitry Andric   Register Src = MI.getOperand(1).getReg();
18210b57cec5SDimitry Andric 
18220b57cec5SDimitry Andric   const LLT S64 = LLT::scalar(64);
18230b57cec5SDimitry Andric   const LLT S32 = LLT::scalar(32);
18240b57cec5SDimitry Andric 
18250b57cec5SDimitry Andric   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
18260b57cec5SDimitry Andric 
18270b57cec5SDimitry Andric   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
18280b57cec5SDimitry Andric 
18290b57cec5SDimitry Andric   auto CvtHi = Signed ?
18300b57cec5SDimitry Andric     B.buildSITOFP(S64, Unmerge.getReg(1)) :
18310b57cec5SDimitry Andric     B.buildUITOFP(S64, Unmerge.getReg(1));
18320b57cec5SDimitry Andric 
18330b57cec5SDimitry Andric   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
18340b57cec5SDimitry Andric 
18350b57cec5SDimitry Andric   auto ThirtyTwo = B.buildConstant(S32, 32);
18360b57cec5SDimitry Andric   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
18370b57cec5SDimitry Andric     .addUse(CvtHi.getReg(0))
18380b57cec5SDimitry Andric     .addUse(ThirtyTwo.getReg(0));
18390b57cec5SDimitry Andric 
18400b57cec5SDimitry Andric   // TODO: Should this propagate fast-math-flags?
18410b57cec5SDimitry Andric   B.buildFAdd(Dst, LdExp, CvtLo);
18420b57cec5SDimitry Andric   MI.eraseFromParent();
18430b57cec5SDimitry Andric   return true;
18440b57cec5SDimitry Andric }
18450b57cec5SDimitry Andric 
1846*5ffd83dbSDimitry Andric // TODO: Copied from DAG implementation. Verify logic and document how this
1847*5ffd83dbSDimitry Andric // actually works.
1848*5ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeFPTOI(
18490b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
1850*5ffd83dbSDimitry Andric   MachineIRBuilder &B, bool Signed) const {
1851*5ffd83dbSDimitry Andric 
1852*5ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
1853*5ffd83dbSDimitry Andric   Register Src = MI.getOperand(1).getReg();
1854*5ffd83dbSDimitry Andric 
1855*5ffd83dbSDimitry Andric   const LLT S64 = LLT::scalar(64);
1856*5ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
1857*5ffd83dbSDimitry Andric 
1858*5ffd83dbSDimitry Andric   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
1859*5ffd83dbSDimitry Andric 
1860*5ffd83dbSDimitry Andric   unsigned Flags = MI.getFlags();
1861*5ffd83dbSDimitry Andric 
1862*5ffd83dbSDimitry Andric   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
1863*5ffd83dbSDimitry Andric   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
1864*5ffd83dbSDimitry Andric   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
1865*5ffd83dbSDimitry Andric 
1866*5ffd83dbSDimitry Andric   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
1867*5ffd83dbSDimitry Andric   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
1868*5ffd83dbSDimitry Andric   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
1869*5ffd83dbSDimitry Andric 
1870*5ffd83dbSDimitry Andric   auto Hi = Signed ?
1871*5ffd83dbSDimitry Andric     B.buildFPTOSI(S32, FloorMul) :
1872*5ffd83dbSDimitry Andric     B.buildFPTOUI(S32, FloorMul);
1873*5ffd83dbSDimitry Andric   auto Lo = B.buildFPTOUI(S32, Fma);
1874*5ffd83dbSDimitry Andric 
1875*5ffd83dbSDimitry Andric   B.buildMerge(Dst, { Lo, Hi });
1876*5ffd83dbSDimitry Andric   MI.eraseFromParent();
1877*5ffd83dbSDimitry Andric 
1878*5ffd83dbSDimitry Andric   return true;
1879*5ffd83dbSDimitry Andric }
1880*5ffd83dbSDimitry Andric 
1881*5ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
1882*5ffd83dbSDimitry Andric                                                MachineInstr &MI) const {
1883*5ffd83dbSDimitry Andric   MachineFunction &MF = Helper.MIRBuilder.getMF();
18840b57cec5SDimitry Andric   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
18850b57cec5SDimitry Andric 
18860b57cec5SDimitry Andric   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
18870b57cec5SDimitry Andric                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
18880b57cec5SDimitry Andric 
18890b57cec5SDimitry Andric   // With ieee_mode disabled, the instructions have the correct behavior
18900b57cec5SDimitry Andric   // already for G_FMINNUM/G_FMAXNUM
18910b57cec5SDimitry Andric   if (!MFI->getMode().IEEE)
18920b57cec5SDimitry Andric     return !IsIEEEOp;
18930b57cec5SDimitry Andric 
18940b57cec5SDimitry Andric   if (IsIEEEOp)
18950b57cec5SDimitry Andric     return true;
18960b57cec5SDimitry Andric 
18970b57cec5SDimitry Andric   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
18980b57cec5SDimitry Andric }
18990b57cec5SDimitry Andric 
19000b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
19010b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
19020b57cec5SDimitry Andric   MachineIRBuilder &B) const {
19030b57cec5SDimitry Andric   // TODO: Should move some of this into LegalizerHelper.
19040b57cec5SDimitry Andric 
19050b57cec5SDimitry Andric   // TODO: Promote dynamic indexing of s16 to s32
1906*5ffd83dbSDimitry Andric 
1907*5ffd83dbSDimitry Andric   // FIXME: Artifact combiner probably should have replaced the truncated
1908*5ffd83dbSDimitry Andric   // constant before this, so we shouldn't need
1909*5ffd83dbSDimitry Andric   // getConstantVRegValWithLookThrough.
1910*5ffd83dbSDimitry Andric   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1911*5ffd83dbSDimitry Andric     MI.getOperand(2).getReg(), MRI);
19120b57cec5SDimitry Andric   if (!IdxVal) // Dynamic case will be selected to register indexing.
19130b57cec5SDimitry Andric     return true;
19140b57cec5SDimitry Andric 
19150b57cec5SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
19160b57cec5SDimitry Andric   Register Vec = MI.getOperand(1).getReg();
19170b57cec5SDimitry Andric 
19180b57cec5SDimitry Andric   LLT VecTy = MRI.getType(Vec);
19190b57cec5SDimitry Andric   LLT EltTy = VecTy.getElementType();
19200b57cec5SDimitry Andric   assert(EltTy == MRI.getType(Dst));
19210b57cec5SDimitry Andric 
1922*5ffd83dbSDimitry Andric   if (IdxVal->Value < VecTy.getNumElements())
1923*5ffd83dbSDimitry Andric     B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
19240b57cec5SDimitry Andric   else
19250b57cec5SDimitry Andric     B.buildUndef(Dst);
19260b57cec5SDimitry Andric 
19270b57cec5SDimitry Andric   MI.eraseFromParent();
19280b57cec5SDimitry Andric   return true;
19290b57cec5SDimitry Andric }
19300b57cec5SDimitry Andric 
19310b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
19320b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
19330b57cec5SDimitry Andric   MachineIRBuilder &B) const {
19340b57cec5SDimitry Andric   // TODO: Should move some of this into LegalizerHelper.
19350b57cec5SDimitry Andric 
19360b57cec5SDimitry Andric   // TODO: Promote dynamic indexing of s16 to s32
1937*5ffd83dbSDimitry Andric 
1938*5ffd83dbSDimitry Andric   // FIXME: Artifact combiner probably should have replaced the truncated
1939*5ffd83dbSDimitry Andric   // constant before this, so we shouldn't need
1940*5ffd83dbSDimitry Andric   // getConstantVRegValWithLookThrough.
1941*5ffd83dbSDimitry Andric   Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
1942*5ffd83dbSDimitry Andric     MI.getOperand(3).getReg(), MRI);
19430b57cec5SDimitry Andric   if (!IdxVal) // Dynamic case will be selected to register indexing.
19440b57cec5SDimitry Andric     return true;
19450b57cec5SDimitry Andric 
19460b57cec5SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
19470b57cec5SDimitry Andric   Register Vec = MI.getOperand(1).getReg();
19480b57cec5SDimitry Andric   Register Ins = MI.getOperand(2).getReg();
19490b57cec5SDimitry Andric 
19500b57cec5SDimitry Andric   LLT VecTy = MRI.getType(Vec);
19510b57cec5SDimitry Andric   LLT EltTy = VecTy.getElementType();
19520b57cec5SDimitry Andric   assert(EltTy == MRI.getType(Ins));
19530b57cec5SDimitry Andric 
1954*5ffd83dbSDimitry Andric   if (IdxVal->Value < VecTy.getNumElements())
1955*5ffd83dbSDimitry Andric     B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
19560b57cec5SDimitry Andric   else
19570b57cec5SDimitry Andric     B.buildUndef(Dst);
19580b57cec5SDimitry Andric 
19590b57cec5SDimitry Andric   MI.eraseFromParent();
19600b57cec5SDimitry Andric   return true;
19610b57cec5SDimitry Andric }
19620b57cec5SDimitry Andric 
1963*5ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeShuffleVector(
1964*5ffd83dbSDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
1965*5ffd83dbSDimitry Andric   MachineIRBuilder &B) const {
1966*5ffd83dbSDimitry Andric   const LLT V2S16 = LLT::vector(2, 16);
1967*5ffd83dbSDimitry Andric 
1968*5ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
1969*5ffd83dbSDimitry Andric   Register Src0 = MI.getOperand(1).getReg();
1970*5ffd83dbSDimitry Andric   LLT DstTy = MRI.getType(Dst);
1971*5ffd83dbSDimitry Andric   LLT SrcTy = MRI.getType(Src0);
1972*5ffd83dbSDimitry Andric 
1973*5ffd83dbSDimitry Andric   if (SrcTy == V2S16 && DstTy == V2S16 &&
1974*5ffd83dbSDimitry Andric       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
1975*5ffd83dbSDimitry Andric     return true;
1976*5ffd83dbSDimitry Andric 
1977*5ffd83dbSDimitry Andric   MachineIRBuilder HelperBuilder(MI);
1978*5ffd83dbSDimitry Andric   GISelObserverWrapper DummyObserver;
1979*5ffd83dbSDimitry Andric   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
1980*5ffd83dbSDimitry Andric   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
1981*5ffd83dbSDimitry Andric }
1982*5ffd83dbSDimitry Andric 
19838bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeSinCos(
19848bcb0991SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
19858bcb0991SDimitry Andric   MachineIRBuilder &B) const {
19868bcb0991SDimitry Andric 
19878bcb0991SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
19888bcb0991SDimitry Andric   Register SrcReg = MI.getOperand(1).getReg();
19898bcb0991SDimitry Andric   LLT Ty = MRI.getType(DstReg);
19908bcb0991SDimitry Andric   unsigned Flags = MI.getFlags();
19918bcb0991SDimitry Andric 
19928bcb0991SDimitry Andric   Register TrigVal;
1993*5ffd83dbSDimitry Andric   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
19948bcb0991SDimitry Andric   if (ST.hasTrigReducedRange()) {
19958bcb0991SDimitry Andric     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
19968bcb0991SDimitry Andric     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
19978bcb0991SDimitry Andric       .addUse(MulVal.getReg(0))
19988bcb0991SDimitry Andric       .setMIFlags(Flags).getReg(0);
19998bcb0991SDimitry Andric   } else
20008bcb0991SDimitry Andric     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
20018bcb0991SDimitry Andric 
20028bcb0991SDimitry Andric   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
20038bcb0991SDimitry Andric     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
20048bcb0991SDimitry Andric   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
20058bcb0991SDimitry Andric     .addUse(TrigVal)
20068bcb0991SDimitry Andric     .setMIFlags(Flags);
20078bcb0991SDimitry Andric   MI.eraseFromParent();
20088bcb0991SDimitry Andric   return true;
20098bcb0991SDimitry Andric }
20108bcb0991SDimitry Andric 
2011*5ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
2012*5ffd83dbSDimitry Andric                                                   MachineIRBuilder &B,
2013*5ffd83dbSDimitry Andric                                                   const GlobalValue *GV,
2014*5ffd83dbSDimitry Andric                                                   int64_t Offset,
2015*5ffd83dbSDimitry Andric                                                   unsigned GAFlags) const {
2016*5ffd83dbSDimitry Andric   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
20178bcb0991SDimitry Andric   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
20188bcb0991SDimitry Andric   // to the following code sequence:
20198bcb0991SDimitry Andric   //
20208bcb0991SDimitry Andric   // For constant address space:
20218bcb0991SDimitry Andric   //   s_getpc_b64 s[0:1]
20228bcb0991SDimitry Andric   //   s_add_u32 s0, s0, $symbol
20238bcb0991SDimitry Andric   //   s_addc_u32 s1, s1, 0
20248bcb0991SDimitry Andric   //
20258bcb0991SDimitry Andric   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
20268bcb0991SDimitry Andric   //   a fixup or relocation is emitted to replace $symbol with a literal
20278bcb0991SDimitry Andric   //   constant, which is a pc-relative offset from the encoding of the $symbol
20288bcb0991SDimitry Andric   //   operand to the global variable.
20298bcb0991SDimitry Andric   //
20308bcb0991SDimitry Andric   // For global address space:
20318bcb0991SDimitry Andric   //   s_getpc_b64 s[0:1]
20328bcb0991SDimitry Andric   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
20338bcb0991SDimitry Andric   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
20348bcb0991SDimitry Andric   //
20358bcb0991SDimitry Andric   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
20368bcb0991SDimitry Andric   //   fixups or relocations are emitted to replace $symbol@*@lo and
20378bcb0991SDimitry Andric   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
20388bcb0991SDimitry Andric   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
20398bcb0991SDimitry Andric   //   operand to the global variable.
20408bcb0991SDimitry Andric   //
20418bcb0991SDimitry Andric   // What we want here is an offset from the value returned by s_getpc
20428bcb0991SDimitry Andric   // (which is the address of the s_add_u32 instruction) to the global
20438bcb0991SDimitry Andric   // variable, but since the encoding of $symbol starts 4 bytes after the start
20448bcb0991SDimitry Andric   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
20458bcb0991SDimitry Andric   // small. This requires us to add 4 to the global variable offset in order to
20468bcb0991SDimitry Andric   // compute the correct address.
20478bcb0991SDimitry Andric 
20488bcb0991SDimitry Andric   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
20498bcb0991SDimitry Andric 
20508bcb0991SDimitry Andric   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
20518bcb0991SDimitry Andric     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
20528bcb0991SDimitry Andric 
20538bcb0991SDimitry Andric   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
20548bcb0991SDimitry Andric     .addDef(PCReg);
20558bcb0991SDimitry Andric 
20568bcb0991SDimitry Andric   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
20578bcb0991SDimitry Andric   if (GAFlags == SIInstrInfo::MO_NONE)
20588bcb0991SDimitry Andric     MIB.addImm(0);
20598bcb0991SDimitry Andric   else
20608bcb0991SDimitry Andric     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
20618bcb0991SDimitry Andric 
20628bcb0991SDimitry Andric   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
20638bcb0991SDimitry Andric 
20648bcb0991SDimitry Andric   if (PtrTy.getSizeInBits() == 32)
20658bcb0991SDimitry Andric     B.buildExtract(DstReg, PCReg, 0);
20668bcb0991SDimitry Andric   return true;
20678bcb0991SDimitry Andric  }
20688bcb0991SDimitry Andric 
20698bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeGlobalValue(
20708bcb0991SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
20718bcb0991SDimitry Andric   MachineIRBuilder &B) const {
20728bcb0991SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
20738bcb0991SDimitry Andric   LLT Ty = MRI.getType(DstReg);
20748bcb0991SDimitry Andric   unsigned AS = Ty.getAddressSpace();
20758bcb0991SDimitry Andric 
20768bcb0991SDimitry Andric   const GlobalValue *GV = MI.getOperand(1).getGlobal();
20778bcb0991SDimitry Andric   MachineFunction &MF = B.getMF();
20788bcb0991SDimitry Andric   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
20798bcb0991SDimitry Andric 
20808bcb0991SDimitry Andric   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
20818bcb0991SDimitry Andric     if (!MFI->isEntryFunction()) {
20828bcb0991SDimitry Andric       const Function &Fn = MF.getFunction();
20838bcb0991SDimitry Andric       DiagnosticInfoUnsupported BadLDSDecl(
2084*5ffd83dbSDimitry Andric         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
2085*5ffd83dbSDimitry Andric         DS_Warning);
20868bcb0991SDimitry Andric       Fn.getContext().diagnose(BadLDSDecl);
2087*5ffd83dbSDimitry Andric 
2088*5ffd83dbSDimitry Andric       // We currently don't have a way to correctly allocate LDS objects that
2089*5ffd83dbSDimitry Andric       // aren't directly associated with a kernel. We do force inlining of
2090*5ffd83dbSDimitry Andric       // functions that use local objects. However, if these dead functions are
2091*5ffd83dbSDimitry Andric       // not eliminated, we don't want a compile time error. Just emit a warning
2092*5ffd83dbSDimitry Andric       // and a trap, since there should be no callable path here.
2093*5ffd83dbSDimitry Andric       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
2094*5ffd83dbSDimitry Andric       B.buildUndef(DstReg);
2095*5ffd83dbSDimitry Andric       MI.eraseFromParent();
2096*5ffd83dbSDimitry Andric       return true;
20978bcb0991SDimitry Andric     }
20988bcb0991SDimitry Andric 
20998bcb0991SDimitry Andric     // TODO: We could emit code to handle the initialization somewhere.
21008bcb0991SDimitry Andric     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
2101*5ffd83dbSDimitry Andric       const SITargetLowering *TLI = ST.getTargetLowering();
2102*5ffd83dbSDimitry Andric       if (!TLI->shouldUseLDSConstAddress(GV)) {
2103*5ffd83dbSDimitry Andric         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
2104*5ffd83dbSDimitry Andric         return true; // Leave in place;
2105*5ffd83dbSDimitry Andric       }
2106*5ffd83dbSDimitry Andric 
2107*5ffd83dbSDimitry Andric       B.buildConstant(
2108*5ffd83dbSDimitry Andric           DstReg,
2109*5ffd83dbSDimitry Andric           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
21108bcb0991SDimitry Andric       MI.eraseFromParent();
21118bcb0991SDimitry Andric       return true;
21128bcb0991SDimitry Andric     }
21138bcb0991SDimitry Andric 
21148bcb0991SDimitry Andric     const Function &Fn = MF.getFunction();
21158bcb0991SDimitry Andric     DiagnosticInfoUnsupported BadInit(
21168bcb0991SDimitry Andric       Fn, "unsupported initializer for address space", MI.getDebugLoc());
21178bcb0991SDimitry Andric     Fn.getContext().diagnose(BadInit);
21188bcb0991SDimitry Andric     return true;
21198bcb0991SDimitry Andric   }
21208bcb0991SDimitry Andric 
21218bcb0991SDimitry Andric   const SITargetLowering *TLI = ST.getTargetLowering();
21228bcb0991SDimitry Andric 
21238bcb0991SDimitry Andric   if (TLI->shouldEmitFixup(GV)) {
21248bcb0991SDimitry Andric     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
21258bcb0991SDimitry Andric     MI.eraseFromParent();
21268bcb0991SDimitry Andric     return true;
21278bcb0991SDimitry Andric   }
21288bcb0991SDimitry Andric 
21298bcb0991SDimitry Andric   if (TLI->shouldEmitPCReloc(GV)) {
21308bcb0991SDimitry Andric     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
21318bcb0991SDimitry Andric     MI.eraseFromParent();
21328bcb0991SDimitry Andric     return true;
21338bcb0991SDimitry Andric   }
21348bcb0991SDimitry Andric 
21358bcb0991SDimitry Andric   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
21368bcb0991SDimitry Andric   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
21378bcb0991SDimitry Andric 
21388bcb0991SDimitry Andric   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
21398bcb0991SDimitry Andric       MachinePointerInfo::getGOT(MF),
21408bcb0991SDimitry Andric       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
21418bcb0991SDimitry Andric           MachineMemOperand::MOInvariant,
2142*5ffd83dbSDimitry Andric       8 /*Size*/, Align(8));
21438bcb0991SDimitry Andric 
21448bcb0991SDimitry Andric   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
21458bcb0991SDimitry Andric 
21468bcb0991SDimitry Andric   if (Ty.getSizeInBits() == 32) {
21478bcb0991SDimitry Andric     // Truncate if this is a 32-bit constant adrdess.
21488bcb0991SDimitry Andric     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
21498bcb0991SDimitry Andric     B.buildExtract(DstReg, Load, 0);
21508bcb0991SDimitry Andric   } else
21518bcb0991SDimitry Andric     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
21528bcb0991SDimitry Andric 
21538bcb0991SDimitry Andric   MI.eraseFromParent();
21548bcb0991SDimitry Andric   return true;
21558bcb0991SDimitry Andric }
21568bcb0991SDimitry Andric 
21578bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeLoad(
21588bcb0991SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
21598bcb0991SDimitry Andric   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
21608bcb0991SDimitry Andric   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
21618bcb0991SDimitry Andric   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
21628bcb0991SDimitry Andric   Observer.changingInstr(MI);
21638bcb0991SDimitry Andric   MI.getOperand(1).setReg(Cast.getReg(0));
21648bcb0991SDimitry Andric   Observer.changedInstr(MI);
21658bcb0991SDimitry Andric   return true;
21668bcb0991SDimitry Andric }
21678bcb0991SDimitry Andric 
21688bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFMad(
21698bcb0991SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
21708bcb0991SDimitry Andric   MachineIRBuilder &B) const {
21718bcb0991SDimitry Andric   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
21728bcb0991SDimitry Andric   assert(Ty.isScalar());
21738bcb0991SDimitry Andric 
2174480093f4SDimitry Andric   MachineFunction &MF = B.getMF();
2175480093f4SDimitry Andric   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2176480093f4SDimitry Andric 
21778bcb0991SDimitry Andric   // TODO: Always legal with future ftz flag.
2178*5ffd83dbSDimitry Andric   // FIXME: Do we need just output?
2179*5ffd83dbSDimitry Andric   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
21808bcb0991SDimitry Andric     return true;
2181*5ffd83dbSDimitry Andric   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
21828bcb0991SDimitry Andric     return true;
21838bcb0991SDimitry Andric 
21848bcb0991SDimitry Andric   MachineIRBuilder HelperBuilder(MI);
21858bcb0991SDimitry Andric   GISelObserverWrapper DummyObserver;
21868bcb0991SDimitry Andric   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
21878bcb0991SDimitry Andric   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
21888bcb0991SDimitry Andric }
21898bcb0991SDimitry Andric 
2190480093f4SDimitry Andric bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2191480093f4SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2192480093f4SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
2193480093f4SDimitry Andric   Register PtrReg = MI.getOperand(1).getReg();
2194480093f4SDimitry Andric   Register CmpVal = MI.getOperand(2).getReg();
2195480093f4SDimitry Andric   Register NewVal = MI.getOperand(3).getReg();
2196480093f4SDimitry Andric 
2197480093f4SDimitry Andric   assert(SITargetLowering::isFlatGlobalAddrSpace(
2198480093f4SDimitry Andric            MRI.getType(PtrReg).getAddressSpace()) &&
2199480093f4SDimitry Andric          "this should not have been custom lowered");
2200480093f4SDimitry Andric 
2201480093f4SDimitry Andric   LLT ValTy = MRI.getType(CmpVal);
2202480093f4SDimitry Andric   LLT VecTy = LLT::vector(2, ValTy);
2203480093f4SDimitry Andric 
2204480093f4SDimitry Andric   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2205480093f4SDimitry Andric 
2206480093f4SDimitry Andric   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2207480093f4SDimitry Andric     .addDef(DstReg)
2208480093f4SDimitry Andric     .addUse(PtrReg)
2209480093f4SDimitry Andric     .addUse(PackedVal)
2210480093f4SDimitry Andric     .setMemRefs(MI.memoperands());
2211480093f4SDimitry Andric 
2212480093f4SDimitry Andric   MI.eraseFromParent();
2213480093f4SDimitry Andric   return true;
2214480093f4SDimitry Andric }
2215480093f4SDimitry Andric 
2216*5ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeFlog(
2217*5ffd83dbSDimitry Andric   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
2218*5ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
2219*5ffd83dbSDimitry Andric   Register Src = MI.getOperand(1).getReg();
2220*5ffd83dbSDimitry Andric   LLT Ty = B.getMRI()->getType(Dst);
2221*5ffd83dbSDimitry Andric   unsigned Flags = MI.getFlags();
2222*5ffd83dbSDimitry Andric 
2223*5ffd83dbSDimitry Andric   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
2224*5ffd83dbSDimitry Andric   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
2225*5ffd83dbSDimitry Andric 
2226*5ffd83dbSDimitry Andric   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
2227*5ffd83dbSDimitry Andric   MI.eraseFromParent();
2228*5ffd83dbSDimitry Andric   return true;
2229*5ffd83dbSDimitry Andric }
2230*5ffd83dbSDimitry Andric 
2231*5ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
2232*5ffd83dbSDimitry Andric                                        MachineIRBuilder &B) const {
2233*5ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
2234*5ffd83dbSDimitry Andric   Register Src = MI.getOperand(1).getReg();
2235*5ffd83dbSDimitry Andric   unsigned Flags = MI.getFlags();
2236*5ffd83dbSDimitry Andric   LLT Ty = B.getMRI()->getType(Dst);
2237*5ffd83dbSDimitry Andric 
2238*5ffd83dbSDimitry Andric   auto K = B.buildFConstant(Ty, numbers::log2e);
2239*5ffd83dbSDimitry Andric   auto Mul = B.buildFMul(Ty, Src, K, Flags);
2240*5ffd83dbSDimitry Andric   B.buildFExp2(Dst, Mul, Flags);
2241*5ffd83dbSDimitry Andric   MI.eraseFromParent();
2242*5ffd83dbSDimitry Andric   return true;
2243*5ffd83dbSDimitry Andric }
2244*5ffd83dbSDimitry Andric 
2245*5ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
2246*5ffd83dbSDimitry Andric                                        MachineIRBuilder &B) const {
2247*5ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
2248*5ffd83dbSDimitry Andric   Register Src0 = MI.getOperand(1).getReg();
2249*5ffd83dbSDimitry Andric   Register Src1 = MI.getOperand(2).getReg();
2250*5ffd83dbSDimitry Andric   unsigned Flags = MI.getFlags();
2251*5ffd83dbSDimitry Andric   LLT Ty = B.getMRI()->getType(Dst);
2252*5ffd83dbSDimitry Andric   const LLT S16 = LLT::scalar(16);
2253*5ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
2254*5ffd83dbSDimitry Andric 
2255*5ffd83dbSDimitry Andric   if (Ty == S32) {
2256*5ffd83dbSDimitry Andric     auto Log = B.buildFLog2(S32, Src0, Flags);
2257*5ffd83dbSDimitry Andric     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2258*5ffd83dbSDimitry Andric       .addUse(Log.getReg(0))
2259*5ffd83dbSDimitry Andric       .addUse(Src1)
2260*5ffd83dbSDimitry Andric       .setMIFlags(Flags);
2261*5ffd83dbSDimitry Andric     B.buildFExp2(Dst, Mul, Flags);
2262*5ffd83dbSDimitry Andric   } else if (Ty == S16) {
2263*5ffd83dbSDimitry Andric     // There's no f16 fmul_legacy, so we need to convert for it.
2264*5ffd83dbSDimitry Andric     auto Log = B.buildFLog2(S16, Src0, Flags);
2265*5ffd83dbSDimitry Andric     auto Ext0 = B.buildFPExt(S32, Log, Flags);
2266*5ffd83dbSDimitry Andric     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
2267*5ffd83dbSDimitry Andric     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
2268*5ffd83dbSDimitry Andric       .addUse(Ext0.getReg(0))
2269*5ffd83dbSDimitry Andric       .addUse(Ext1.getReg(0))
2270*5ffd83dbSDimitry Andric       .setMIFlags(Flags);
2271*5ffd83dbSDimitry Andric 
2272*5ffd83dbSDimitry Andric     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
2273*5ffd83dbSDimitry Andric   } else
2274*5ffd83dbSDimitry Andric     return false;
2275*5ffd83dbSDimitry Andric 
2276*5ffd83dbSDimitry Andric   MI.eraseFromParent();
2277*5ffd83dbSDimitry Andric   return true;
2278*5ffd83dbSDimitry Andric }
2279*5ffd83dbSDimitry Andric 
2280*5ffd83dbSDimitry Andric // Find a source register, ignoring any possible source modifiers.
2281*5ffd83dbSDimitry Andric static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
2282*5ffd83dbSDimitry Andric   Register ModSrc = OrigSrc;
2283*5ffd83dbSDimitry Andric   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
2284*5ffd83dbSDimitry Andric     ModSrc = SrcFNeg->getOperand(1).getReg();
2285*5ffd83dbSDimitry Andric     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2286*5ffd83dbSDimitry Andric       ModSrc = SrcFAbs->getOperand(1).getReg();
2287*5ffd83dbSDimitry Andric   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
2288*5ffd83dbSDimitry Andric     ModSrc = SrcFAbs->getOperand(1).getReg();
2289*5ffd83dbSDimitry Andric   return ModSrc;
2290*5ffd83dbSDimitry Andric }
2291*5ffd83dbSDimitry Andric 
2292*5ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
2293*5ffd83dbSDimitry Andric                                          MachineRegisterInfo &MRI,
2294*5ffd83dbSDimitry Andric                                          MachineIRBuilder &B) const {
2295*5ffd83dbSDimitry Andric 
2296*5ffd83dbSDimitry Andric   const LLT S1 = LLT::scalar(1);
2297*5ffd83dbSDimitry Andric   const LLT S64 = LLT::scalar(64);
2298*5ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
2299*5ffd83dbSDimitry Andric   Register OrigSrc = MI.getOperand(1).getReg();
2300*5ffd83dbSDimitry Andric   unsigned Flags = MI.getFlags();
2301*5ffd83dbSDimitry Andric   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
2302*5ffd83dbSDimitry Andric          "this should not have been custom lowered");
2303*5ffd83dbSDimitry Andric 
2304*5ffd83dbSDimitry Andric   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
2305*5ffd83dbSDimitry Andric   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
2306*5ffd83dbSDimitry Andric   // efficient way to implement it is using V_FRACT_F64. The workaround for the
2307*5ffd83dbSDimitry Andric   // V_FRACT bug is:
2308*5ffd83dbSDimitry Andric   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
2309*5ffd83dbSDimitry Andric   //
2310*5ffd83dbSDimitry Andric   // Convert floor(x) to (x - fract(x))
2311*5ffd83dbSDimitry Andric 
2312*5ffd83dbSDimitry Andric   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
2313*5ffd83dbSDimitry Andric     .addUse(OrigSrc)
2314*5ffd83dbSDimitry Andric     .setMIFlags(Flags);
2315*5ffd83dbSDimitry Andric 
2316*5ffd83dbSDimitry Andric   // Give source modifier matching some assistance before obscuring a foldable
2317*5ffd83dbSDimitry Andric   // pattern.
2318*5ffd83dbSDimitry Andric 
2319*5ffd83dbSDimitry Andric   // TODO: We can avoid the neg on the fract? The input sign to fract
2320*5ffd83dbSDimitry Andric   // shouldn't matter?
2321*5ffd83dbSDimitry Andric   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
2322*5ffd83dbSDimitry Andric 
2323*5ffd83dbSDimitry Andric   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
2324*5ffd83dbSDimitry Andric 
2325*5ffd83dbSDimitry Andric   Register Min = MRI.createGenericVirtualRegister(S64);
2326*5ffd83dbSDimitry Andric 
2327*5ffd83dbSDimitry Andric   // We don't need to concern ourselves with the snan handling difference, so
2328*5ffd83dbSDimitry Andric   // use the one which will directly select.
2329*5ffd83dbSDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2330*5ffd83dbSDimitry Andric   if (MFI->getMode().IEEE)
2331*5ffd83dbSDimitry Andric     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
2332*5ffd83dbSDimitry Andric   else
2333*5ffd83dbSDimitry Andric     B.buildFMinNum(Min, Fract, Const, Flags);
2334*5ffd83dbSDimitry Andric 
2335*5ffd83dbSDimitry Andric   Register CorrectedFract = Min;
2336*5ffd83dbSDimitry Andric   if (!MI.getFlag(MachineInstr::FmNoNans)) {
2337*5ffd83dbSDimitry Andric     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
2338*5ffd83dbSDimitry Andric     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
2339*5ffd83dbSDimitry Andric   }
2340*5ffd83dbSDimitry Andric 
2341*5ffd83dbSDimitry Andric   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
2342*5ffd83dbSDimitry Andric   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
2343*5ffd83dbSDimitry Andric 
2344*5ffd83dbSDimitry Andric   MI.eraseFromParent();
2345*5ffd83dbSDimitry Andric   return true;
2346*5ffd83dbSDimitry Andric }
2347*5ffd83dbSDimitry Andric 
2348*5ffd83dbSDimitry Andric // Turn an illegal packed v2s16 build vector into bit operations.
2349*5ffd83dbSDimitry Andric // TODO: This should probably be a bitcast action in LegalizerHelper.
2350*5ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeBuildVector(
2351*5ffd83dbSDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2352*5ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
2353*5ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
2354*5ffd83dbSDimitry Andric   assert(MRI.getType(Dst) == LLT::vector(2, 16));
2355*5ffd83dbSDimitry Andric 
2356*5ffd83dbSDimitry Andric   Register Src0 = MI.getOperand(1).getReg();
2357*5ffd83dbSDimitry Andric   Register Src1 = MI.getOperand(2).getReg();
2358*5ffd83dbSDimitry Andric   assert(MRI.getType(Src0) == LLT::scalar(16));
2359*5ffd83dbSDimitry Andric 
2360*5ffd83dbSDimitry Andric   auto Merge = B.buildMerge(S32, {Src0, Src1});
2361*5ffd83dbSDimitry Andric   B.buildBitcast(Dst, Merge);
2362*5ffd83dbSDimitry Andric 
2363*5ffd83dbSDimitry Andric   MI.eraseFromParent();
2364*5ffd83dbSDimitry Andric   return true;
2365*5ffd83dbSDimitry Andric }
2366*5ffd83dbSDimitry Andric 
23670b57cec5SDimitry Andric // Return the use branch instruction, otherwise null if the usage is invalid.
23680b57cec5SDimitry Andric static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
2369480093f4SDimitry Andric                                        MachineRegisterInfo &MRI,
2370*5ffd83dbSDimitry Andric                                        MachineInstr *&Br,
2371*5ffd83dbSDimitry Andric                                        MachineBasicBlock *&UncondBrTarget) {
23720b57cec5SDimitry Andric   Register CondDef = MI.getOperand(0).getReg();
23730b57cec5SDimitry Andric   if (!MRI.hasOneNonDBGUse(CondDef))
23740b57cec5SDimitry Andric     return nullptr;
23750b57cec5SDimitry Andric 
2376*5ffd83dbSDimitry Andric   MachineBasicBlock *Parent = MI.getParent();
23770b57cec5SDimitry Andric   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
2378*5ffd83dbSDimitry Andric   if (UseMI.getParent() != Parent ||
2379480093f4SDimitry Andric       UseMI.getOpcode() != AMDGPU::G_BRCOND)
2380480093f4SDimitry Andric     return nullptr;
2381480093f4SDimitry Andric 
2382*5ffd83dbSDimitry Andric   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2383480093f4SDimitry Andric   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
2384*5ffd83dbSDimitry Andric   if (Next == Parent->end()) {
2385*5ffd83dbSDimitry Andric     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
2386*5ffd83dbSDimitry Andric     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
2387*5ffd83dbSDimitry Andric       return nullptr;
2388*5ffd83dbSDimitry Andric     UncondBrTarget = &*NextMBB;
2389*5ffd83dbSDimitry Andric   } else {
2390480093f4SDimitry Andric     if (Next->getOpcode() != AMDGPU::G_BR)
2391480093f4SDimitry Andric       return nullptr;
2392480093f4SDimitry Andric     Br = &*Next;
2393*5ffd83dbSDimitry Andric     UncondBrTarget = Br->getOperand(0).getMBB();
2394480093f4SDimitry Andric   }
2395480093f4SDimitry Andric 
2396480093f4SDimitry Andric   return &UseMI;
23970b57cec5SDimitry Andric }
23980b57cec5SDimitry Andric 
2399*5ffd83dbSDimitry Andric Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B,
2400*5ffd83dbSDimitry Andric                                                MachineRegisterInfo &MRI,
2401*5ffd83dbSDimitry Andric                                                Register LiveIn,
2402*5ffd83dbSDimitry Andric                                                Register PhyReg) const {
2403*5ffd83dbSDimitry Andric   assert(PhyReg.isPhysical() && "Physical register expected");
2404*5ffd83dbSDimitry Andric 
2405*5ffd83dbSDimitry Andric   // Insert the live-in copy, if required, by defining destination virtual
2406*5ffd83dbSDimitry Andric   // register.
2407*5ffd83dbSDimitry Andric   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
2408*5ffd83dbSDimitry Andric   if (!MRI.getVRegDef(LiveIn)) {
2409*5ffd83dbSDimitry Andric     // FIXME: Should have scoped insert pt
2410*5ffd83dbSDimitry Andric     MachineBasicBlock &OrigInsBB = B.getMBB();
2411*5ffd83dbSDimitry Andric     auto OrigInsPt = B.getInsertPt();
2412*5ffd83dbSDimitry Andric 
2413*5ffd83dbSDimitry Andric     MachineBasicBlock &EntryMBB = B.getMF().front();
2414*5ffd83dbSDimitry Andric     EntryMBB.addLiveIn(PhyReg);
2415*5ffd83dbSDimitry Andric     B.setInsertPt(EntryMBB, EntryMBB.begin());
2416*5ffd83dbSDimitry Andric     B.buildCopy(LiveIn, PhyReg);
2417*5ffd83dbSDimitry Andric 
2418*5ffd83dbSDimitry Andric     B.setInsertPt(OrigInsBB, OrigInsPt);
2419*5ffd83dbSDimitry Andric   }
2420*5ffd83dbSDimitry Andric 
2421*5ffd83dbSDimitry Andric   return LiveIn;
2422*5ffd83dbSDimitry Andric }
2423*5ffd83dbSDimitry Andric 
2424*5ffd83dbSDimitry Andric Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
2425*5ffd83dbSDimitry Andric                                                 MachineRegisterInfo &MRI,
2426*5ffd83dbSDimitry Andric                                                 Register PhyReg, LLT Ty,
2427*5ffd83dbSDimitry Andric                                                 bool InsertLiveInCopy) const {
2428*5ffd83dbSDimitry Andric   assert(PhyReg.isPhysical() && "Physical register expected");
2429*5ffd83dbSDimitry Andric 
2430*5ffd83dbSDimitry Andric   // Get or create virtual live-in regester
2431*5ffd83dbSDimitry Andric   Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
2432*5ffd83dbSDimitry Andric   if (!LiveIn) {
2433*5ffd83dbSDimitry Andric     LiveIn = MRI.createGenericVirtualRegister(Ty);
2434*5ffd83dbSDimitry Andric     MRI.addLiveIn(PhyReg, LiveIn);
2435*5ffd83dbSDimitry Andric   }
2436*5ffd83dbSDimitry Andric 
2437*5ffd83dbSDimitry Andric   // When the actual true copy required is from virtual register to physical
2438*5ffd83dbSDimitry Andric   // register (to be inserted later), live-in copy insertion from physical
2439*5ffd83dbSDimitry Andric   // to register virtual register is not required
2440*5ffd83dbSDimitry Andric   if (!InsertLiveInCopy)
24410b57cec5SDimitry Andric     return LiveIn;
24420b57cec5SDimitry Andric 
2443*5ffd83dbSDimitry Andric   return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
2444*5ffd83dbSDimitry Andric }
2445*5ffd83dbSDimitry Andric 
2446*5ffd83dbSDimitry Andric const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
2447*5ffd83dbSDimitry Andric     MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2448*5ffd83dbSDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2449*5ffd83dbSDimitry Andric   const ArgDescriptor *Arg;
2450*5ffd83dbSDimitry Andric   const TargetRegisterClass *RC;
2451*5ffd83dbSDimitry Andric   LLT ArgTy;
2452*5ffd83dbSDimitry Andric   std::tie(Arg, RC, ArgTy) = MFI->getPreloadedValue(ArgType);
2453*5ffd83dbSDimitry Andric   if (!Arg) {
2454*5ffd83dbSDimitry Andric     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
2455*5ffd83dbSDimitry Andric     return nullptr;
2456*5ffd83dbSDimitry Andric   }
2457*5ffd83dbSDimitry Andric   return Arg;
24580b57cec5SDimitry Andric }
24590b57cec5SDimitry Andric 
24600b57cec5SDimitry Andric bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
24610b57cec5SDimitry Andric                                          const ArgDescriptor *Arg) const {
24628bcb0991SDimitry Andric   if (!Arg->isRegister() || !Arg->getRegister().isValid())
24630b57cec5SDimitry Andric     return false; // TODO: Handle these
24640b57cec5SDimitry Andric 
2465*5ffd83dbSDimitry Andric   Register SrcReg = Arg->getRegister();
2466*5ffd83dbSDimitry Andric   assert(SrcReg.isPhysical() && "Physical register expected");
2467*5ffd83dbSDimitry Andric   assert(DstReg.isVirtual() && "Virtual register expected");
24680b57cec5SDimitry Andric 
24690b57cec5SDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
24700b57cec5SDimitry Andric 
24710b57cec5SDimitry Andric   LLT Ty = MRI.getType(DstReg);
2472*5ffd83dbSDimitry Andric   Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);
24730b57cec5SDimitry Andric 
24740b57cec5SDimitry Andric   if (Arg->isMasked()) {
24750b57cec5SDimitry Andric     // TODO: Should we try to emit this once in the entry block?
24760b57cec5SDimitry Andric     const LLT S32 = LLT::scalar(32);
24770b57cec5SDimitry Andric     const unsigned Mask = Arg->getMask();
24780b57cec5SDimitry Andric     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
24790b57cec5SDimitry Andric 
24808bcb0991SDimitry Andric     Register AndMaskSrc = LiveIn;
24818bcb0991SDimitry Andric 
24828bcb0991SDimitry Andric     if (Shift != 0) {
24830b57cec5SDimitry Andric       auto ShiftAmt = B.buildConstant(S32, Shift);
24848bcb0991SDimitry Andric       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
24858bcb0991SDimitry Andric     }
24868bcb0991SDimitry Andric 
24878bcb0991SDimitry Andric     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
2488*5ffd83dbSDimitry Andric   } else {
24890b57cec5SDimitry Andric     B.buildCopy(DstReg, LiveIn);
24900b57cec5SDimitry Andric   }
24910b57cec5SDimitry Andric 
24920b57cec5SDimitry Andric   return true;
24930b57cec5SDimitry Andric }
24940b57cec5SDimitry Andric 
24950b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
2496*5ffd83dbSDimitry Andric     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
24970b57cec5SDimitry Andric     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
24980b57cec5SDimitry Andric 
2499*5ffd83dbSDimitry Andric   const ArgDescriptor *Arg = getArgDescriptor(B, ArgType);
2500*5ffd83dbSDimitry Andric   if (!Arg)
25010b57cec5SDimitry Andric     return false;
25020b57cec5SDimitry Andric 
2503*5ffd83dbSDimitry Andric   if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg))
2504*5ffd83dbSDimitry Andric     return false;
2505*5ffd83dbSDimitry Andric 
25060b57cec5SDimitry Andric   MI.eraseFromParent();
25070b57cec5SDimitry Andric   return true;
25080b57cec5SDimitry Andric }
25090b57cec5SDimitry Andric 
25108bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
25118bcb0991SDimitry Andric                                        MachineRegisterInfo &MRI,
25128bcb0991SDimitry Andric                                        MachineIRBuilder &B) const {
2513480093f4SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
2514480093f4SDimitry Andric   LLT DstTy = MRI.getType(Dst);
2515480093f4SDimitry Andric   LLT S16 = LLT::scalar(16);
2516480093f4SDimitry Andric   LLT S32 = LLT::scalar(32);
2517480093f4SDimitry Andric   LLT S64 = LLT::scalar(64);
25188bcb0991SDimitry Andric 
25198bcb0991SDimitry Andric   if (legalizeFastUnsafeFDIV(MI, MRI, B))
25208bcb0991SDimitry Andric     return true;
25218bcb0991SDimitry Andric 
2522480093f4SDimitry Andric   if (DstTy == S16)
2523480093f4SDimitry Andric     return legalizeFDIV16(MI, MRI, B);
2524480093f4SDimitry Andric   if (DstTy == S32)
2525480093f4SDimitry Andric     return legalizeFDIV32(MI, MRI, B);
2526480093f4SDimitry Andric   if (DstTy == S64)
2527480093f4SDimitry Andric     return legalizeFDIV64(MI, MRI, B);
2528480093f4SDimitry Andric 
25298bcb0991SDimitry Andric   return false;
25308bcb0991SDimitry Andric }
25318bcb0991SDimitry Andric 
2532*5ffd83dbSDimitry Andric void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
2533*5ffd83dbSDimitry Andric                                                   Register DstReg,
2534*5ffd83dbSDimitry Andric                                                   Register X,
2535*5ffd83dbSDimitry Andric                                                   Register Y,
2536*5ffd83dbSDimitry Andric                                                   bool IsDiv) const {
2537*5ffd83dbSDimitry Andric   const LLT S1 = LLT::scalar(1);
2538*5ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
2539*5ffd83dbSDimitry Andric 
2540*5ffd83dbSDimitry Andric   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2541*5ffd83dbSDimitry Andric   // algorithm used here.
2542*5ffd83dbSDimitry Andric 
2543*5ffd83dbSDimitry Andric   // Initial estimate of inv(y).
2544*5ffd83dbSDimitry Andric   auto FloatY = B.buildUITOFP(S32, Y);
2545*5ffd83dbSDimitry Andric   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
2546*5ffd83dbSDimitry Andric   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
2547*5ffd83dbSDimitry Andric   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
2548*5ffd83dbSDimitry Andric   auto Z = B.buildFPTOUI(S32, ScaledY);
2549*5ffd83dbSDimitry Andric 
2550*5ffd83dbSDimitry Andric   // One round of UNR.
2551*5ffd83dbSDimitry Andric   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
2552*5ffd83dbSDimitry Andric   auto NegYZ = B.buildMul(S32, NegY, Z);
2553*5ffd83dbSDimitry Andric   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
2554*5ffd83dbSDimitry Andric 
2555*5ffd83dbSDimitry Andric   // Quotient/remainder estimate.
2556*5ffd83dbSDimitry Andric   auto Q = B.buildUMulH(S32, X, Z);
2557*5ffd83dbSDimitry Andric   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
2558*5ffd83dbSDimitry Andric 
2559*5ffd83dbSDimitry Andric   // First quotient/remainder refinement.
2560*5ffd83dbSDimitry Andric   auto One = B.buildConstant(S32, 1);
2561*5ffd83dbSDimitry Andric   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2562*5ffd83dbSDimitry Andric   if (IsDiv)
2563*5ffd83dbSDimitry Andric     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
2564*5ffd83dbSDimitry Andric   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
2565*5ffd83dbSDimitry Andric 
2566*5ffd83dbSDimitry Andric   // Second quotient/remainder refinement.
2567*5ffd83dbSDimitry Andric   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2568*5ffd83dbSDimitry Andric   if (IsDiv)
2569*5ffd83dbSDimitry Andric     B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
2570*5ffd83dbSDimitry Andric   else
2571*5ffd83dbSDimitry Andric     B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
2572*5ffd83dbSDimitry Andric }
2573*5ffd83dbSDimitry Andric 
2574*5ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
2575*5ffd83dbSDimitry Andric                                               MachineRegisterInfo &MRI,
2576*5ffd83dbSDimitry Andric                                               MachineIRBuilder &B) const {
2577*5ffd83dbSDimitry Andric   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2578*5ffd83dbSDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
2579*5ffd83dbSDimitry Andric   Register Num = MI.getOperand(1).getReg();
2580*5ffd83dbSDimitry Andric   Register Den = MI.getOperand(2).getReg();
2581*5ffd83dbSDimitry Andric   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2582*5ffd83dbSDimitry Andric   MI.eraseFromParent();
2583*5ffd83dbSDimitry Andric   return true;
2584*5ffd83dbSDimitry Andric }
2585*5ffd83dbSDimitry Andric 
2586*5ffd83dbSDimitry Andric // Build integer reciprocal sequence arounud V_RCP_IFLAG_F32
2587*5ffd83dbSDimitry Andric //
2588*5ffd83dbSDimitry Andric // Return lo, hi of result
2589*5ffd83dbSDimitry Andric //
2590*5ffd83dbSDimitry Andric // %cvt.lo = G_UITOFP Val.lo
2591*5ffd83dbSDimitry Andric // %cvt.hi = G_UITOFP Val.hi
2592*5ffd83dbSDimitry Andric // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
2593*5ffd83dbSDimitry Andric // %rcp = G_AMDGPU_RCP_IFLAG %mad
2594*5ffd83dbSDimitry Andric // %mul1 = G_FMUL %rcp, 0x5f7ffffc
2595*5ffd83dbSDimitry Andric // %mul2 = G_FMUL %mul1, 2**(-32)
2596*5ffd83dbSDimitry Andric // %trunc = G_INTRINSIC_TRUNC %mul2
2597*5ffd83dbSDimitry Andric // %mad2 = G_FMAD %trunc, -(2**32), %mul1
2598*5ffd83dbSDimitry Andric // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
2599*5ffd83dbSDimitry Andric static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
2600*5ffd83dbSDimitry Andric                                                        Register Val) {
2601*5ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
2602*5ffd83dbSDimitry Andric   auto Unmerge = B.buildUnmerge(S32, Val);
2603*5ffd83dbSDimitry Andric 
2604*5ffd83dbSDimitry Andric   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
2605*5ffd83dbSDimitry Andric   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
2606*5ffd83dbSDimitry Andric 
2607*5ffd83dbSDimitry Andric   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
2608*5ffd83dbSDimitry Andric                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
2609*5ffd83dbSDimitry Andric 
2610*5ffd83dbSDimitry Andric   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
2611*5ffd83dbSDimitry Andric   auto Mul1 =
2612*5ffd83dbSDimitry Andric       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
2613*5ffd83dbSDimitry Andric 
2614*5ffd83dbSDimitry Andric   // 2**(-32)
2615*5ffd83dbSDimitry Andric   auto Mul2 =
2616*5ffd83dbSDimitry Andric       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
2617*5ffd83dbSDimitry Andric   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
2618*5ffd83dbSDimitry Andric 
2619*5ffd83dbSDimitry Andric   // -(2**32)
2620*5ffd83dbSDimitry Andric   auto Mad2 = B.buildFMAD(S32, Trunc,
2621*5ffd83dbSDimitry Andric                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
2622*5ffd83dbSDimitry Andric 
2623*5ffd83dbSDimitry Andric   auto ResultLo = B.buildFPTOUI(S32, Mad2);
2624*5ffd83dbSDimitry Andric   auto ResultHi = B.buildFPTOUI(S32, Trunc);
2625*5ffd83dbSDimitry Andric 
2626*5ffd83dbSDimitry Andric   return {ResultLo.getReg(0), ResultHi.getReg(0)};
2627*5ffd83dbSDimitry Andric }
2628*5ffd83dbSDimitry Andric 
2629*5ffd83dbSDimitry Andric void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
2630*5ffd83dbSDimitry Andric                                                   Register DstReg,
2631*5ffd83dbSDimitry Andric                                                   Register Numer,
2632*5ffd83dbSDimitry Andric                                                   Register Denom,
2633*5ffd83dbSDimitry Andric                                                   bool IsDiv) const {
2634*5ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
2635*5ffd83dbSDimitry Andric   const LLT S64 = LLT::scalar(64);
2636*5ffd83dbSDimitry Andric   const LLT S1 = LLT::scalar(1);
2637*5ffd83dbSDimitry Andric   Register RcpLo, RcpHi;
2638*5ffd83dbSDimitry Andric 
2639*5ffd83dbSDimitry Andric   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
2640*5ffd83dbSDimitry Andric 
2641*5ffd83dbSDimitry Andric   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
2642*5ffd83dbSDimitry Andric 
2643*5ffd83dbSDimitry Andric   auto Zero64 = B.buildConstant(S64, 0);
2644*5ffd83dbSDimitry Andric   auto NegDenom = B.buildSub(S64, Zero64, Denom);
2645*5ffd83dbSDimitry Andric 
2646*5ffd83dbSDimitry Andric   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
2647*5ffd83dbSDimitry Andric   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
2648*5ffd83dbSDimitry Andric 
2649*5ffd83dbSDimitry Andric   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
2650*5ffd83dbSDimitry Andric   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
2651*5ffd83dbSDimitry Andric   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
2652*5ffd83dbSDimitry Andric 
2653*5ffd83dbSDimitry Andric   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
2654*5ffd83dbSDimitry Andric   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
2655*5ffd83dbSDimitry Andric   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
2656*5ffd83dbSDimitry Andric   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
2657*5ffd83dbSDimitry Andric 
2658*5ffd83dbSDimitry Andric   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
2659*5ffd83dbSDimitry Andric   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
2660*5ffd83dbSDimitry Andric   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
2661*5ffd83dbSDimitry Andric   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
2662*5ffd83dbSDimitry Andric   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
2663*5ffd83dbSDimitry Andric 
2664*5ffd83dbSDimitry Andric   auto Zero32 = B.buildConstant(S32, 0);
2665*5ffd83dbSDimitry Andric   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
2666*5ffd83dbSDimitry Andric   auto Add2_HiC =
2667*5ffd83dbSDimitry Andric       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
2668*5ffd83dbSDimitry Andric   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
2669*5ffd83dbSDimitry Andric   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
2670*5ffd83dbSDimitry Andric 
2671*5ffd83dbSDimitry Andric   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
2672*5ffd83dbSDimitry Andric   Register NumerLo = UnmergeNumer.getReg(0);
2673*5ffd83dbSDimitry Andric   Register NumerHi = UnmergeNumer.getReg(1);
2674*5ffd83dbSDimitry Andric 
2675*5ffd83dbSDimitry Andric   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
2676*5ffd83dbSDimitry Andric   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
2677*5ffd83dbSDimitry Andric   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
2678*5ffd83dbSDimitry Andric   Register Mul3_Lo = UnmergeMul3.getReg(0);
2679*5ffd83dbSDimitry Andric   Register Mul3_Hi = UnmergeMul3.getReg(1);
2680*5ffd83dbSDimitry Andric   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
2681*5ffd83dbSDimitry Andric   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
2682*5ffd83dbSDimitry Andric   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
2683*5ffd83dbSDimitry Andric   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
2684*5ffd83dbSDimitry Andric 
2685*5ffd83dbSDimitry Andric   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
2686*5ffd83dbSDimitry Andric   Register DenomLo = UnmergeDenom.getReg(0);
2687*5ffd83dbSDimitry Andric   Register DenomHi = UnmergeDenom.getReg(1);
2688*5ffd83dbSDimitry Andric 
2689*5ffd83dbSDimitry Andric   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
2690*5ffd83dbSDimitry Andric   auto C1 = B.buildSExt(S32, CmpHi);
2691*5ffd83dbSDimitry Andric 
2692*5ffd83dbSDimitry Andric   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
2693*5ffd83dbSDimitry Andric   auto C2 = B.buildSExt(S32, CmpLo);
2694*5ffd83dbSDimitry Andric 
2695*5ffd83dbSDimitry Andric   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
2696*5ffd83dbSDimitry Andric   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
2697*5ffd83dbSDimitry Andric 
2698*5ffd83dbSDimitry Andric   // TODO: Here and below portions of the code can be enclosed into if/endif.
2699*5ffd83dbSDimitry Andric   // Currently control flow is unconditional and we have 4 selects after
2700*5ffd83dbSDimitry Andric   // potential endif to substitute PHIs.
2701*5ffd83dbSDimitry Andric 
2702*5ffd83dbSDimitry Andric   // if C3 != 0 ...
2703*5ffd83dbSDimitry Andric   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
2704*5ffd83dbSDimitry Andric   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
2705*5ffd83dbSDimitry Andric   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
2706*5ffd83dbSDimitry Andric   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
2707*5ffd83dbSDimitry Andric 
2708*5ffd83dbSDimitry Andric   auto One64 = B.buildConstant(S64, 1);
2709*5ffd83dbSDimitry Andric   auto Add3 = B.buildAdd(S64, MulHi3, One64);
2710*5ffd83dbSDimitry Andric 
2711*5ffd83dbSDimitry Andric   auto C4 =
2712*5ffd83dbSDimitry Andric       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
2713*5ffd83dbSDimitry Andric   auto C5 =
2714*5ffd83dbSDimitry Andric       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
2715*5ffd83dbSDimitry Andric   auto C6 = B.buildSelect(
2716*5ffd83dbSDimitry Andric       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
2717*5ffd83dbSDimitry Andric 
2718*5ffd83dbSDimitry Andric   // if (C6 != 0)
2719*5ffd83dbSDimitry Andric   auto Add4 = B.buildAdd(S64, Add3, One64);
2720*5ffd83dbSDimitry Andric   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
2721*5ffd83dbSDimitry Andric 
2722*5ffd83dbSDimitry Andric   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
2723*5ffd83dbSDimitry Andric   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
2724*5ffd83dbSDimitry Andric   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
2725*5ffd83dbSDimitry Andric 
2726*5ffd83dbSDimitry Andric   // endif C6
2727*5ffd83dbSDimitry Andric   // endif C3
2728*5ffd83dbSDimitry Andric 
2729*5ffd83dbSDimitry Andric   if (IsDiv) {
2730*5ffd83dbSDimitry Andric     auto Sel1 = B.buildSelect(
2731*5ffd83dbSDimitry Andric         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
2732*5ffd83dbSDimitry Andric     B.buildSelect(DstReg,
2733*5ffd83dbSDimitry Andric                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
2734*5ffd83dbSDimitry Andric   } else {
2735*5ffd83dbSDimitry Andric     auto Sel2 = B.buildSelect(
2736*5ffd83dbSDimitry Andric         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
2737*5ffd83dbSDimitry Andric     B.buildSelect(DstReg,
2738*5ffd83dbSDimitry Andric                   B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
2739*5ffd83dbSDimitry Andric   }
2740*5ffd83dbSDimitry Andric }
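// For readability, a scalar sketch of the expansion above (illustrative
// pseudocode only, not code from this file; names refer to the values built
// above):
//
//   uint64_t Q = umulh(Numer, Add2);          // MulHi3: quotient estimate
//   uint64_t R = Numer - Q * Denom;           // Sub1:   remainder estimate
//   if (R >= Denom) { Q += 1; R -= Denom; }   // C3 path: Add3 / Sub2
//   if (R >= Denom) { Q += 1; R -= Denom; }   // C6 path: Add4 / Sub3
//   return IsDiv ? Q : R;
//
// The reciprocal estimate is accurate enough that at most two correction
// steps are needed, and each 64-bit "R >= Denom" compare is implemented via
// the 32-bit hi/lo compare-and-select chains (C1/C2/C3 and C4/C5/C6) above.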
2741*5ffd83dbSDimitry Andric 
2742*5ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
2743*5ffd83dbSDimitry Andric                                             MachineRegisterInfo &MRI,
2744*5ffd83dbSDimitry Andric                                             MachineIRBuilder &B) const {
2745*5ffd83dbSDimitry Andric   const LLT S64 = LLT::scalar(64);
2746*5ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
2747*5ffd83dbSDimitry Andric   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
2748*5ffd83dbSDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
2749*5ffd83dbSDimitry Andric   Register Num = MI.getOperand(1).getReg();
2750*5ffd83dbSDimitry Andric   Register Den = MI.getOperand(2).getReg();
2751*5ffd83dbSDimitry Andric   LLT Ty = MRI.getType(DstReg);
2752*5ffd83dbSDimitry Andric 
2753*5ffd83dbSDimitry Andric   if (Ty == S32)
2754*5ffd83dbSDimitry Andric     legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
2755*5ffd83dbSDimitry Andric   else if (Ty == S64)
2756*5ffd83dbSDimitry Andric     legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
2757*5ffd83dbSDimitry Andric   else
2758*5ffd83dbSDimitry Andric     return false;
2759*5ffd83dbSDimitry Andric 
2760*5ffd83dbSDimitry Andric   MI.eraseFromParent();
2761*5ffd83dbSDimitry Andric   return true;
2763*5ffd83dbSDimitry Andric }
2764*5ffd83dbSDimitry Andric 
2765*5ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
2766*5ffd83dbSDimitry Andric                                             MachineRegisterInfo &MRI,
2767*5ffd83dbSDimitry Andric                                             MachineIRBuilder &B) const {
2768*5ffd83dbSDimitry Andric   const LLT S64 = LLT::scalar(64);
2769*5ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
2770*5ffd83dbSDimitry Andric 
2771*5ffd83dbSDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
2772*5ffd83dbSDimitry Andric   const LLT Ty = MRI.getType(DstReg);
2773*5ffd83dbSDimitry Andric   if (Ty != S32 && Ty != S64)
2774*5ffd83dbSDimitry Andric     return false;
2775*5ffd83dbSDimitry Andric 
2776*5ffd83dbSDimitry Andric   const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
2777*5ffd83dbSDimitry Andric 
2778*5ffd83dbSDimitry Andric   Register LHS = MI.getOperand(1).getReg();
2779*5ffd83dbSDimitry Andric   Register RHS = MI.getOperand(2).getReg();
2780*5ffd83dbSDimitry Andric 
2781*5ffd83dbSDimitry Andric   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
2782*5ffd83dbSDimitry Andric   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
2783*5ffd83dbSDimitry Andric   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
2784*5ffd83dbSDimitry Andric 
2785*5ffd83dbSDimitry Andric   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
2786*5ffd83dbSDimitry Andric   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
2787*5ffd83dbSDimitry Andric 
2788*5ffd83dbSDimitry Andric   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
2789*5ffd83dbSDimitry Andric   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
2790*5ffd83dbSDimitry Andric 
2791*5ffd83dbSDimitry Andric   Register UDivRem = MRI.createGenericVirtualRegister(Ty);
2792*5ffd83dbSDimitry Andric   if (Ty == S32)
2793*5ffd83dbSDimitry Andric     legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
2794*5ffd83dbSDimitry Andric   else
2795*5ffd83dbSDimitry Andric     legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
2796*5ffd83dbSDimitry Andric 
2797*5ffd83dbSDimitry Andric   Register Sign;
2798*5ffd83dbSDimitry Andric   if (IsDiv)
2799*5ffd83dbSDimitry Andric     Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
2800*5ffd83dbSDimitry Andric   else
2801*5ffd83dbSDimitry Andric     Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
2802*5ffd83dbSDimitry Andric 
2803*5ffd83dbSDimitry Andric   UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
2804*5ffd83dbSDimitry Andric   B.buildSub(DstReg, UDivRem, Sign);
2805*5ffd83dbSDimitry Andric 
2806*5ffd83dbSDimitry Andric   MI.eraseFromParent();
2807*5ffd83dbSDimitry Andric   return true;
2808*5ffd83dbSDimitry Andric }
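// A sketch of the signed-to-unsigned reduction used above (illustrative; the
// identities are standard two's-complement tricks):
//
//   S      = X >> (BitWidth - 1);   // LHSign/RHSign: 0 or -1 (all ones)
//   abs(X) = (X + S) ^ S;           // the buildAdd + buildXor pairs above
//
// After the unsigned divide/remainder, the sign is restored the same way: for
// division the result sign is sign(LHS) ^ sign(RHS); for remainder it is
// sign(LHS). With Sign being 0 or -1, (UDivRem ^ Sign) - Sign negates the
// unsigned result exactly when Sign is -1 and is a no-op otherwise.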
2809*5ffd83dbSDimitry Andric 
28108bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
28118bcb0991SDimitry Andric                                                  MachineRegisterInfo &MRI,
28128bcb0991SDimitry Andric                                                  MachineIRBuilder &B) const {
28138bcb0991SDimitry Andric   Register Res = MI.getOperand(0).getReg();
28148bcb0991SDimitry Andric   Register LHS = MI.getOperand(1).getReg();
28158bcb0991SDimitry Andric   Register RHS = MI.getOperand(2).getReg();
28168bcb0991SDimitry Andric 
28178bcb0991SDimitry Andric   uint16_t Flags = MI.getFlags();
28188bcb0991SDimitry Andric 
28198bcb0991SDimitry Andric   LLT ResTy = MRI.getType(Res);
28208bcb0991SDimitry Andric   LLT S32 = LLT::scalar(32);
28218bcb0991SDimitry Andric   LLT S64 = LLT::scalar(64);
28228bcb0991SDimitry Andric 
28238bcb0991SDimitry Andric   const MachineFunction &MF = B.getMF();
28248bcb0991SDimitry Andric   bool Unsafe =
28258bcb0991SDimitry Andric     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
28268bcb0991SDimitry Andric 
28278bcb0991SDimitry Andric   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
28288bcb0991SDimitry Andric     return false;
28298bcb0991SDimitry Andric 
2830480093f4SDimitry Andric   if (!Unsafe && ResTy == S32 &&
2831*5ffd83dbSDimitry Andric       MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
28328bcb0991SDimitry Andric     return false;
28338bcb0991SDimitry Andric 
28348bcb0991SDimitry Andric   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
28358bcb0991SDimitry Andric     // 1 / x -> RCP(x)
28368bcb0991SDimitry Andric     if (CLHS->isExactlyValue(1.0)) {
28378bcb0991SDimitry Andric       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
28388bcb0991SDimitry Andric         .addUse(RHS)
28398bcb0991SDimitry Andric         .setMIFlags(Flags);
28408bcb0991SDimitry Andric 
28418bcb0991SDimitry Andric       MI.eraseFromParent();
28428bcb0991SDimitry Andric       return true;
28438bcb0991SDimitry Andric     }
28448bcb0991SDimitry Andric 
28458bcb0991SDimitry Andric     // -1 / x -> RCP( FNEG(x) )
28468bcb0991SDimitry Andric     if (CLHS->isExactlyValue(-1.0)) {
28478bcb0991SDimitry Andric       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
28488bcb0991SDimitry Andric       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
28498bcb0991SDimitry Andric         .addUse(FNeg.getReg(0))
28508bcb0991SDimitry Andric         .setMIFlags(Flags);
28518bcb0991SDimitry Andric 
28528bcb0991SDimitry Andric       MI.eraseFromParent();
28538bcb0991SDimitry Andric       return true;
28548bcb0991SDimitry Andric     }
28558bcb0991SDimitry Andric   }
28568bcb0991SDimitry Andric 
28578bcb0991SDimitry Andric   // x / y -> x * (1.0 / y)
28588bcb0991SDimitry Andric   if (Unsafe) {
28598bcb0991SDimitry Andric     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
28608bcb0991SDimitry Andric       .addUse(RHS)
28618bcb0991SDimitry Andric       .setMIFlags(Flags);
28628bcb0991SDimitry Andric     B.buildFMul(Res, LHS, RCP, Flags);
28638bcb0991SDimitry Andric 
28648bcb0991SDimitry Andric     MI.eraseFromParent();
28658bcb0991SDimitry Andric     return true;
28668bcb0991SDimitry Andric   }
28678bcb0991SDimitry Andric 
28688bcb0991SDimitry Andric   return false;
28698bcb0991SDimitry Andric }
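// In effect, when fast math permits it, the division above is rewritten using
// the hardware reciprocal (a sketch of the identities involved):
//
//    1.0 / x  ->  rcp(x)
//   -1.0 / x  ->  rcp(fneg(x))
//      x / y  ->  x * rcp(y)
//
// rcp is only approximately correct (about 1 ulp for f32), which is why this
// transform requires UnsafeFPMath or the arcp flag, and why the function
// refuses the f32 case above when FP32 denormals are enabled and fast math is
// off.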
28708bcb0991SDimitry Andric 
2871480093f4SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
2872480093f4SDimitry Andric                                          MachineRegisterInfo &MRI,
2873480093f4SDimitry Andric                                          MachineIRBuilder &B) const {
2874480093f4SDimitry Andric   Register Res = MI.getOperand(0).getReg();
2875480093f4SDimitry Andric   Register LHS = MI.getOperand(1).getReg();
2876480093f4SDimitry Andric   Register RHS = MI.getOperand(2).getReg();
2877480093f4SDimitry Andric 
2878480093f4SDimitry Andric   uint16_t Flags = MI.getFlags();
2879480093f4SDimitry Andric 
2880480093f4SDimitry Andric   LLT S16 = LLT::scalar(16);
2881480093f4SDimitry Andric   LLT S32 = LLT::scalar(32);
2882480093f4SDimitry Andric 
2883480093f4SDimitry Andric   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2884480093f4SDimitry Andric   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2885480093f4SDimitry Andric 
2886480093f4SDimitry Andric   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2887480093f4SDimitry Andric     .addUse(RHSExt.getReg(0))
2888480093f4SDimitry Andric     .setMIFlags(Flags);
2889480093f4SDimitry Andric 
2890480093f4SDimitry Andric   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2891480093f4SDimitry Andric   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2892480093f4SDimitry Andric 
2893480093f4SDimitry Andric   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2894480093f4SDimitry Andric     .addUse(RDst.getReg(0))
2895480093f4SDimitry Andric     .addUse(RHS)
2896480093f4SDimitry Andric     .addUse(LHS)
2897480093f4SDimitry Andric     .setMIFlags(Flags);
2898480093f4SDimitry Andric 
2899480093f4SDimitry Andric   MI.eraseFromParent();
2900480093f4SDimitry Andric   return true;
2901480093f4SDimitry Andric }
2902480093f4SDimitry Andric 
2903480093f4SDimitry Andric // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
2904480093f4SDimitry Andric // to enable denorm mode; otherwise restore the function's default FP32 mode.
2905480093f4SDimitry Andric static void toggleSPDenormMode(bool Enable,
2906480093f4SDimitry Andric                                MachineIRBuilder &B,
2907480093f4SDimitry Andric                                const GCNSubtarget &ST,
2908480093f4SDimitry Andric                                AMDGPU::SIModeRegisterDefaults Mode) {
2909480093f4SDimitry Andric   // Set SP denorm mode to this value.
2910480093f4SDimitry Andric   unsigned SPDenormMode =
2911*5ffd83dbSDimitry Andric     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
2912480093f4SDimitry Andric 
2913480093f4SDimitry Andric   if (ST.hasDenormModeInst()) {
2914480093f4SDimitry Andric     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2915*5ffd83dbSDimitry Andric     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
2916480093f4SDimitry Andric 
2917*5ffd83dbSDimitry Andric     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2918480093f4SDimitry Andric     B.buildInstr(AMDGPU::S_DENORM_MODE)
2919480093f4SDimitry Andric       .addImm(NewDenormModeValue);
2921480093f4SDimitry Andric   } else {
2922480093f4SDimitry Andric     // Select FP32 bit field in mode register.
2923480093f4SDimitry Andric     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2924480093f4SDimitry Andric                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2925480093f4SDimitry Andric                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2926480093f4SDimitry Andric 
2927480093f4SDimitry Andric     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2928480093f4SDimitry Andric       .addImm(SPDenormMode)
2929480093f4SDimitry Andric       .addImm(SPDenormModeBitField);
2930480093f4SDimitry Andric   }
2931480093f4SDimitry Andric }
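// A worked example of the encodings above (a sketch; this assumes the usual
// MODE-register layout where FP32 denorm control occupies bits [5:4] and
// FP64/FP16 control bits [7:6]):
//
//   Enabling with S_DENORM_MODE: SPDenormMode = FP_DENORM_FLUSH_NONE, and the
//   immediate becomes SPDenormMode | (DPDenormModeDefault << 2), so the
//   FP64/FP16 field is rewritten with its current default rather than
//   clobbered.
//
//   Without S_DENORM_MODE, the S_SETREG form writes only a 2-bit field
//   (WIDTH_M1 = 1) at OFFSET = 4 of the MODE hwreg, so the FP64/FP16 bits are
//   untouched because they fall outside the selected field.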
2932480093f4SDimitry Andric 
2933480093f4SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2934480093f4SDimitry Andric                                          MachineRegisterInfo &MRI,
2935480093f4SDimitry Andric                                          MachineIRBuilder &B) const {
2936480093f4SDimitry Andric   Register Res = MI.getOperand(0).getReg();
2937480093f4SDimitry Andric   Register LHS = MI.getOperand(1).getReg();
2938480093f4SDimitry Andric   Register RHS = MI.getOperand(2).getReg();
2939480093f4SDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2940480093f4SDimitry Andric   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2941480093f4SDimitry Andric 
2942480093f4SDimitry Andric   uint16_t Flags = MI.getFlags();
2943480093f4SDimitry Andric 
2944480093f4SDimitry Andric   LLT S32 = LLT::scalar(32);
2945480093f4SDimitry Andric   LLT S1 = LLT::scalar(1);
2946480093f4SDimitry Andric 
2947480093f4SDimitry Andric   auto One = B.buildFConstant(S32, 1.0f);
2948480093f4SDimitry Andric 
2949480093f4SDimitry Andric   auto DenominatorScaled =
2950480093f4SDimitry Andric     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2951480093f4SDimitry Andric       .addUse(LHS)
2952*5ffd83dbSDimitry Andric       .addUse(RHS)
2953*5ffd83dbSDimitry Andric       .addImm(0)
2954480093f4SDimitry Andric       .setMIFlags(Flags);
2955480093f4SDimitry Andric   auto NumeratorScaled =
2956480093f4SDimitry Andric     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2957480093f4SDimitry Andric       .addUse(LHS)
2958480093f4SDimitry Andric       .addUse(RHS)
2959*5ffd83dbSDimitry Andric       .addImm(1)
2960480093f4SDimitry Andric       .setMIFlags(Flags);
2961480093f4SDimitry Andric 
2962480093f4SDimitry Andric   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2963480093f4SDimitry Andric     .addUse(DenominatorScaled.getReg(0))
2964480093f4SDimitry Andric     .setMIFlags(Flags);
2965480093f4SDimitry Andric   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2966480093f4SDimitry Andric 
2967480093f4SDimitry Andric   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2968480093f4SDimitry Andric   // aren't modeled as reading it.
2969*5ffd83dbSDimitry Andric   if (!Mode.allFP32Denormals())
2970480093f4SDimitry Andric     toggleSPDenormMode(true, B, ST, Mode);
2971480093f4SDimitry Andric 
2972480093f4SDimitry Andric   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2973480093f4SDimitry Andric   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2974480093f4SDimitry Andric   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2975480093f4SDimitry Andric   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2976480093f4SDimitry Andric   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2977480093f4SDimitry Andric   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2978480093f4SDimitry Andric 
2979*5ffd83dbSDimitry Andric   if (!Mode.allFP32Denormals())
2980480093f4SDimitry Andric     toggleSPDenormMode(false, B, ST, Mode);
2981480093f4SDimitry Andric 
2982480093f4SDimitry Andric   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2983480093f4SDimitry Andric     .addUse(Fma4.getReg(0))
2984480093f4SDimitry Andric     .addUse(Fma1.getReg(0))
2985480093f4SDimitry Andric     .addUse(Fma3.getReg(0))
2986480093f4SDimitry Andric     .addUse(NumeratorScaled.getReg(1))
2987480093f4SDimitry Andric     .setMIFlags(Flags);
2988480093f4SDimitry Andric 
2989480093f4SDimitry Andric   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2990480093f4SDimitry Andric     .addUse(Fmas.getReg(0))
2991480093f4SDimitry Andric     .addUse(RHS)
2992480093f4SDimitry Andric     .addUse(LHS)
2993480093f4SDimitry Andric     .setMIFlags(Flags);
2994480093f4SDimitry Andric 
2995480093f4SDimitry Andric   MI.eraseFromParent();
2996480093f4SDimitry Andric   return true;
2997480093f4SDimitry Andric }
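// The FMA chain above is a Newton-Raphson reciprocal refinement followed by a
// quotient correction. In math terms (a sketch, writing n and d for the
// scaled numerator and denominator):
//
//   r0 = rcp(d)              // ApproxRcp
//   e0 = 1 - d * r0          // Fma0: residual error of the estimate
//   r1 = r0 + e0 * r0        // Fma1: refined reciprocal
//   q0 = n * r1              // Mul
//   e1 = n - d * q0          // Fma2: remainder of the first quotient
//   q1 = q0 + e1 * r1        // Fma3: refined quotient
//   e2 = n - d * q1          // Fma4: final residual, consumed by div_fmas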
2998480093f4SDimitry Andric 
2999480093f4SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3000480093f4SDimitry Andric                                          MachineRegisterInfo &MRI,
3001480093f4SDimitry Andric                                          MachineIRBuilder &B) const {
3002480093f4SDimitry Andric   Register Res = MI.getOperand(0).getReg();
3003480093f4SDimitry Andric   Register LHS = MI.getOperand(1).getReg();
3004480093f4SDimitry Andric   Register RHS = MI.getOperand(2).getReg();
3005480093f4SDimitry Andric 
3006480093f4SDimitry Andric   uint16_t Flags = MI.getFlags();
3007480093f4SDimitry Andric 
3008480093f4SDimitry Andric   LLT S64 = LLT::scalar(64);
3009480093f4SDimitry Andric   LLT S1 = LLT::scalar(1);
3010480093f4SDimitry Andric 
3011480093f4SDimitry Andric   auto One = B.buildFConstant(S64, 1.0);
3012480093f4SDimitry Andric 
3013480093f4SDimitry Andric   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3014480093f4SDimitry Andric     .addUse(LHS)
3015480093f4SDimitry Andric     .addUse(RHS)
3016*5ffd83dbSDimitry Andric     .addImm(0)
3017480093f4SDimitry Andric     .setMIFlags(Flags);
3018480093f4SDimitry Andric 
3019480093f4SDimitry Andric   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3020480093f4SDimitry Andric 
3021480093f4SDimitry Andric   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3022480093f4SDimitry Andric     .addUse(DivScale0.getReg(0))
3023480093f4SDimitry Andric     .setMIFlags(Flags);
3024480093f4SDimitry Andric 
3025480093f4SDimitry Andric   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3026480093f4SDimitry Andric   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3027480093f4SDimitry Andric   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3028480093f4SDimitry Andric 
3029480093f4SDimitry Andric   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3030480093f4SDimitry Andric     .addUse(LHS)
3031480093f4SDimitry Andric     .addUse(RHS)
3032*5ffd83dbSDimitry Andric     .addImm(1)
3033480093f4SDimitry Andric     .setMIFlags(Flags);
3034480093f4SDimitry Andric 
3035480093f4SDimitry Andric   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3036*5ffd83dbSDimitry Andric   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3037480093f4SDimitry Andric   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3038480093f4SDimitry Andric 
3039480093f4SDimitry Andric   Register Scale;
3040480093f4SDimitry Andric   if (!ST.hasUsableDivScaleConditionOutput()) {
3041480093f4SDimitry Andric     // Work around a hardware bug on SI where the condition output from
3042480093f4SDimitry Andric     // div_scale is not usable.
3043480093f4SDimitry Andric 
3044480093f4SDimitry Andric     LLT S32 = LLT::scalar(32);
3045480093f4SDimitry Andric 
3046480093f4SDimitry Andric     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3047480093f4SDimitry Andric     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3048480093f4SDimitry Andric     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3049480093f4SDimitry Andric     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3050480093f4SDimitry Andric 
3051480093f4SDimitry Andric     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3052480093f4SDimitry Andric                               Scale1Unmerge.getReg(1));
3053480093f4SDimitry Andric     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3054480093f4SDimitry Andric                               Scale0Unmerge.getReg(1));
3055*5ffd83dbSDimitry Andric     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3056480093f4SDimitry Andric   } else {
3057480093f4SDimitry Andric     Scale = DivScale1.getReg(1);
3058480093f4SDimitry Andric   }
3059480093f4SDimitry Andric 
3060480093f4SDimitry Andric   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3061480093f4SDimitry Andric     .addUse(Fma4.getReg(0))
3062480093f4SDimitry Andric     .addUse(Fma3.getReg(0))
3063480093f4SDimitry Andric     .addUse(Mul.getReg(0))
3064480093f4SDimitry Andric     .addUse(Scale)
3065480093f4SDimitry Andric     .setMIFlags(Flags);
3066480093f4SDimitry Andric 
3067480093f4SDimitry Andric   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3068480093f4SDimitry Andric     .addUse(Fmas.getReg(0))
3069480093f4SDimitry Andric     .addUse(RHS)
3070480093f4SDimitry Andric     .addUse(LHS)
3071480093f4SDimitry Andric     .setMIFlags(Flags);
3072480093f4SDimitry Andric 
3073480093f4SDimitry Andric   MI.eraseFromParent();
3074480093f4SDimitry Andric   return true;
3075480093f4SDimitry Andric }
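// Note on the SI workaround above: div_fmas needs to know which operand
// div_scale actually scaled, normally reported in div_scale's condition
// output. On SI that output is unreliable, so the code reconstructs the flag
// by comparing the high words of the original operands against the two
// div_scale results and XORing the compares (a behavioral description of the
// instructions emitted above, not a claim about the exact hardware encoding).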
3076480093f4SDimitry Andric 
30778bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
30788bcb0991SDimitry Andric                                                  MachineRegisterInfo &MRI,
30798bcb0991SDimitry Andric                                                  MachineIRBuilder &B) const {
30808bcb0991SDimitry Andric   Register Res = MI.getOperand(0).getReg();
30818bcb0991SDimitry Andric   Register LHS = MI.getOperand(2).getReg();
30828bcb0991SDimitry Andric   Register RHS = MI.getOperand(3).getReg();
30838bcb0991SDimitry Andric   uint16_t Flags = MI.getFlags();
30848bcb0991SDimitry Andric 
30858bcb0991SDimitry Andric   LLT S32 = LLT::scalar(32);
30868bcb0991SDimitry Andric   LLT S1 = LLT::scalar(1);
30878bcb0991SDimitry Andric 
30888bcb0991SDimitry Andric   auto Abs = B.buildFAbs(S32, RHS, Flags);
30898bcb0991SDimitry Andric   const APFloat C0Val(1.0f);
30908bcb0991SDimitry Andric 
30918bcb0991SDimitry Andric   auto C0 = B.buildConstant(S32, 0x6f800000);
30928bcb0991SDimitry Andric   auto C1 = B.buildConstant(S32, 0x2f800000);
30938bcb0991SDimitry Andric   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
30948bcb0991SDimitry Andric 
30958bcb0991SDimitry Andric   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
30968bcb0991SDimitry Andric   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
30978bcb0991SDimitry Andric 
30988bcb0991SDimitry Andric   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
30998bcb0991SDimitry Andric 
31008bcb0991SDimitry Andric   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
31018bcb0991SDimitry Andric     .addUse(Mul0.getReg(0))
31028bcb0991SDimitry Andric     .setMIFlags(Flags);
31038bcb0991SDimitry Andric 
31048bcb0991SDimitry Andric   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
31058bcb0991SDimitry Andric 
31068bcb0991SDimitry Andric   B.buildFMul(Res, Sel, Mul1, Flags);
31078bcb0991SDimitry Andric 
31088bcb0991SDimitry Andric   MI.eraseFromParent();
31098bcb0991SDimitry Andric   return true;
31108bcb0991SDimitry Andric }
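// A worked example of the scaling trick above (the constants are f32 bit
// patterns for powers of two):
//
//   0x6f800000 = 2^96 (overflow threshold), 0x2f800000 = 2^-32 (scale)
//
// If |RHS| > 2^96, rcp(RHS) would underflow, so RHS is pre-scaled:
//
//   Sel  = 2^-32 (else 1.0)
//   Mul0 = RHS * Sel                   // brought back into range
//   Res  = Sel * (LHS * rcp(Mul0))     // Sel cancels the pre-scale, since
//                                      // rcp(RHS * 2^-32) == 2^32 * rcp(RHS)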
31118bcb0991SDimitry Andric 
31120b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
31130b57cec5SDimitry Andric                                                  MachineRegisterInfo &MRI,
31140b57cec5SDimitry Andric                                                  MachineIRBuilder &B) const {
31150b57cec5SDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
31160b57cec5SDimitry Andric   if (!MFI->isEntryFunction()) {
31170b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
31180b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
31190b57cec5SDimitry Andric   }
31200b57cec5SDimitry Andric 
31210b57cec5SDimitry Andric   uint64_t Offset =
31220b57cec5SDimitry Andric     ST.getTargetLowering()->getImplicitParameterOffset(
31230b57cec5SDimitry Andric       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
31240b57cec5SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
31250b57cec5SDimitry Andric   LLT DstTy = MRI.getType(DstReg);
31260b57cec5SDimitry Andric   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
31270b57cec5SDimitry Andric 
31280b57cec5SDimitry Andric   const ArgDescriptor *Arg;
31290b57cec5SDimitry Andric   const TargetRegisterClass *RC;
3130*5ffd83dbSDimitry Andric   LLT ArgTy;
3131*5ffd83dbSDimitry Andric   std::tie(Arg, RC, ArgTy) =
3132*5ffd83dbSDimitry Andric       MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
31330b57cec5SDimitry Andric   if (!Arg)
31340b57cec5SDimitry Andric     return false;
31350b57cec5SDimitry Andric 
31360b57cec5SDimitry Andric   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
31370b57cec5SDimitry Andric   if (!loadInputValue(KernargPtrReg, B, Arg))
31380b57cec5SDimitry Andric     return false;
31390b57cec5SDimitry Andric 
3140480093f4SDimitry Andric   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
31410b57cec5SDimitry Andric   MI.eraseFromParent();
31420b57cec5SDimitry Andric   return true;
31430b57cec5SDimitry Andric }
31440b57cec5SDimitry Andric 
31458bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
31468bcb0991SDimitry Andric                                               MachineRegisterInfo &MRI,
31478bcb0991SDimitry Andric                                               MachineIRBuilder &B,
31488bcb0991SDimitry Andric                                               unsigned AddrSpace) const {
31498bcb0991SDimitry Andric   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
31508bcb0991SDimitry Andric   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
31518bcb0991SDimitry Andric   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
31528bcb0991SDimitry Andric   MI.eraseFromParent();
31538bcb0991SDimitry Andric   return true;
31548bcb0991SDimitry Andric }
31558bcb0991SDimitry Andric 
3156*5ffd83dbSDimitry Andric // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3157*5ffd83dbSDimitry Andric // offset (the offset that is included in bounds checking and swizzling, to be
3158*5ffd83dbSDimitry Andric // split between the instruction's voffset and immoffset fields) and soffset
3159*5ffd83dbSDimitry Andric // (the offset that is excluded from bounds checking and swizzling, to go in
3160*5ffd83dbSDimitry Andric // the instruction's soffset field).  This function takes the first kind of
3161*5ffd83dbSDimitry Andric // offset and figures out how to split it between voffset and immoffset.
3162*5ffd83dbSDimitry Andric std::tuple<Register, unsigned, unsigned>
3163*5ffd83dbSDimitry Andric AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3164*5ffd83dbSDimitry Andric                                         Register OrigOffset) const {
3165*5ffd83dbSDimitry Andric   const unsigned MaxImm = 4095;
3166*5ffd83dbSDimitry Andric   Register BaseReg;
3167*5ffd83dbSDimitry Andric   unsigned TotalConstOffset;
3168*5ffd83dbSDimitry Andric   MachineInstr *OffsetDef;
3169*5ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
3170*5ffd83dbSDimitry Andric 
3171*5ffd83dbSDimitry Andric   std::tie(BaseReg, TotalConstOffset, OffsetDef)
3172*5ffd83dbSDimitry Andric     = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
3173*5ffd83dbSDimitry Andric 
3174*5ffd83dbSDimitry Andric   unsigned ImmOffset = TotalConstOffset;
3175*5ffd83dbSDimitry Andric 
3176*5ffd83dbSDimitry Andric   // If the immediate value is too big for the immoffset field, keep only the
3177*5ffd83dbSDimitry Andric   // low bits (value & 4095) in the immoffset field, so that the value that is
3178*5ffd83dbSDimitry Andric   // copied/added for the voffset field is a multiple of 4096 and stands more
3179*5ffd83dbSDimitry Andric   // chance of being CSEd with the copy/add for another similar load/store.
3180*5ffd83dbSDimitry Andric   // However, do not do that rounding down to a multiple of 4096 if that is a
3181*5ffd83dbSDimitry Andric   // negative number, as it appears to be illegal to have a negative offset
3182*5ffd83dbSDimitry Andric   // in the vgpr, even if adding the immediate offset makes it positive.
3183*5ffd83dbSDimitry Andric   unsigned Overflow = ImmOffset & ~MaxImm;
3184*5ffd83dbSDimitry Andric   ImmOffset -= Overflow;
3185*5ffd83dbSDimitry Andric   if ((int32_t)Overflow < 0) {
3186*5ffd83dbSDimitry Andric     Overflow += ImmOffset;
3187*5ffd83dbSDimitry Andric     ImmOffset = 0;
3188*5ffd83dbSDimitry Andric   }
3189*5ffd83dbSDimitry Andric 
3190*5ffd83dbSDimitry Andric   if (Overflow != 0) {
3191*5ffd83dbSDimitry Andric     if (!BaseReg) {
3192*5ffd83dbSDimitry Andric       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3193*5ffd83dbSDimitry Andric     } else {
3194*5ffd83dbSDimitry Andric       auto OverflowVal = B.buildConstant(S32, Overflow);
3195*5ffd83dbSDimitry Andric       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3196*5ffd83dbSDimitry Andric     }
3197*5ffd83dbSDimitry Andric   }
3198*5ffd83dbSDimitry Andric 
3199*5ffd83dbSDimitry Andric   if (!BaseReg)
3200*5ffd83dbSDimitry Andric     BaseReg = B.buildConstant(S32, 0).getReg(0);
3201*5ffd83dbSDimitry Andric 
3202*5ffd83dbSDimitry Andric   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3203*5ffd83dbSDimitry Andric }
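// Worked examples of the split above (illustrative values):
//
//   TotalConstOffset = 100   -> Overflow = 0,    ImmOffset = 100
//                               (fits entirely in the immoffset field)
//   TotalConstOffset = 5000  -> Overflow = 4096, ImmOffset = 904
//                               (4096 is added to the voffset register)
//   Overflow >= 2^31 (negative as int32_t)
//                            -> ImmOffset = 0 and the whole constant goes to
//                               the voffset register, per the comment above.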
3204*5ffd83dbSDimitry Andric 
32058bcb0991SDimitry Andric /// Handle register layout difference for f16 images for some subtargets.
32068bcb0991SDimitry Andric Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
32078bcb0991SDimitry Andric                                              MachineRegisterInfo &MRI,
32088bcb0991SDimitry Andric                                              Register Reg) const {
32098bcb0991SDimitry Andric   if (!ST.hasUnpackedD16VMem())
32108bcb0991SDimitry Andric     return Reg;
32118bcb0991SDimitry Andric 
32128bcb0991SDimitry Andric   const LLT S16 = LLT::scalar(16);
32138bcb0991SDimitry Andric   const LLT S32 = LLT::scalar(32);
32148bcb0991SDimitry Andric   LLT StoreVT = MRI.getType(Reg);
32158bcb0991SDimitry Andric   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
32168bcb0991SDimitry Andric 
32178bcb0991SDimitry Andric   auto Unmerge = B.buildUnmerge(S16, Reg);
32188bcb0991SDimitry Andric 
32198bcb0991SDimitry Andric   SmallVector<Register, 4> WideRegs;
32208bcb0991SDimitry Andric   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
32218bcb0991SDimitry Andric     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
32228bcb0991SDimitry Andric 
32238bcb0991SDimitry Andric   int NumElts = StoreVT.getNumElements();
32248bcb0991SDimitry Andric 
32258bcb0991SDimitry Andric   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
32268bcb0991SDimitry Andric }
32278bcb0991SDimitry Andric 
3228*5ffd83dbSDimitry Andric Register AMDGPULegalizerInfo::fixStoreSourceType(
3229*5ffd83dbSDimitry Andric   MachineIRBuilder &B, Register VData, bool IsFormat) const {
3230*5ffd83dbSDimitry Andric   MachineRegisterInfo *MRI = B.getMRI();
3231*5ffd83dbSDimitry Andric   LLT Ty = MRI->getType(VData);
32328bcb0991SDimitry Andric 
32338bcb0991SDimitry Andric   const LLT S16 = LLT::scalar(16);
32348bcb0991SDimitry Andric 
32358bcb0991SDimitry Andric   // Fix up illegal register types for i8/i16 stores.
32368bcb0991SDimitry Andric   if (Ty == LLT::scalar(8) || Ty == S16) {
32378bcb0991SDimitry Andric     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3238*5ffd83dbSDimitry Andric     return AnyExt;
32398bcb0991SDimitry Andric   }
32408bcb0991SDimitry Andric 
32418bcb0991SDimitry Andric   if (Ty.isVector()) {
32428bcb0991SDimitry Andric     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
32438bcb0991SDimitry Andric       if (IsFormat)
3244*5ffd83dbSDimitry Andric         return handleD16VData(B, *MRI, VData);
3245*5ffd83dbSDimitry Andric     }
3246*5ffd83dbSDimitry Andric   }
3247*5ffd83dbSDimitry Andric 
3248*5ffd83dbSDimitry Andric   return VData;
3249*5ffd83dbSDimitry Andric }
3250*5ffd83dbSDimitry Andric 
3251*5ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3252*5ffd83dbSDimitry Andric                                               MachineRegisterInfo &MRI,
3253*5ffd83dbSDimitry Andric                                               MachineIRBuilder &B,
3254*5ffd83dbSDimitry Andric                                               bool IsTyped,
3255*5ffd83dbSDimitry Andric                                               bool IsFormat) const {
3256*5ffd83dbSDimitry Andric   Register VData = MI.getOperand(1).getReg();
3257*5ffd83dbSDimitry Andric   LLT Ty = MRI.getType(VData);
3258*5ffd83dbSDimitry Andric   LLT EltTy = Ty.getScalarType();
3259*5ffd83dbSDimitry Andric   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3260*5ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
3261*5ffd83dbSDimitry Andric 
3262*5ffd83dbSDimitry Andric   VData = fixStoreSourceType(B, VData, IsFormat);
3263*5ffd83dbSDimitry Andric   Register RSrc = MI.getOperand(2).getReg();
3264*5ffd83dbSDimitry Andric 
3265*5ffd83dbSDimitry Andric   MachineMemOperand *MMO = *MI.memoperands_begin();
3266*5ffd83dbSDimitry Andric   const int MemSize = MMO->getSize();
3267*5ffd83dbSDimitry Andric 
3268*5ffd83dbSDimitry Andric   unsigned ImmOffset;
3269*5ffd83dbSDimitry Andric   unsigned TotalOffset;
3270*5ffd83dbSDimitry Andric 
3271*5ffd83dbSDimitry Andric   // The typed intrinsics add an immediate after the registers.
3272*5ffd83dbSDimitry Andric   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3273*5ffd83dbSDimitry Andric 
3274*5ffd83dbSDimitry Andric   // The struct intrinsic variants add one additional operand over raw.
3275*5ffd83dbSDimitry Andric   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3276*5ffd83dbSDimitry Andric   Register VIndex;
3277*5ffd83dbSDimitry Andric   int OpOffset = 0;
3278*5ffd83dbSDimitry Andric   if (HasVIndex) {
3279*5ffd83dbSDimitry Andric     VIndex = MI.getOperand(3).getReg();
3280*5ffd83dbSDimitry Andric     OpOffset = 1;
3281*5ffd83dbSDimitry Andric   }
3282*5ffd83dbSDimitry Andric 
3283*5ffd83dbSDimitry Andric   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3284*5ffd83dbSDimitry Andric   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3285*5ffd83dbSDimitry Andric 
3286*5ffd83dbSDimitry Andric   unsigned Format = 0;
3287*5ffd83dbSDimitry Andric   if (IsTyped) {
3288*5ffd83dbSDimitry Andric     Format = MI.getOperand(5 + OpOffset).getImm();
3289*5ffd83dbSDimitry Andric     ++OpOffset;
3290*5ffd83dbSDimitry Andric   }
3291*5ffd83dbSDimitry Andric 
3292*5ffd83dbSDimitry Andric   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3293*5ffd83dbSDimitry Andric 
3294*5ffd83dbSDimitry Andric   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3295*5ffd83dbSDimitry Andric   if (TotalOffset != 0)
3296*5ffd83dbSDimitry Andric     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3297*5ffd83dbSDimitry Andric 
3298*5ffd83dbSDimitry Andric   unsigned Opc;
3299*5ffd83dbSDimitry Andric   if (IsTyped) {
3300*5ffd83dbSDimitry Andric     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3301*5ffd83dbSDimitry Andric                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3302*5ffd83dbSDimitry Andric   } else if (IsFormat) {
3303*5ffd83dbSDimitry Andric     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3304*5ffd83dbSDimitry Andric                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3305*5ffd83dbSDimitry Andric   } else {
3306*5ffd83dbSDimitry Andric     switch (MemSize) {
3307*5ffd83dbSDimitry Andric     case 1:
3308*5ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3309*5ffd83dbSDimitry Andric       break;
3310*5ffd83dbSDimitry Andric     case 2:
3311*5ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3312*5ffd83dbSDimitry Andric       break;
3313*5ffd83dbSDimitry Andric     default:
3314*5ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3315*5ffd83dbSDimitry Andric       break;
3316*5ffd83dbSDimitry Andric     }
3317*5ffd83dbSDimitry Andric   }
3318*5ffd83dbSDimitry Andric 
3319*5ffd83dbSDimitry Andric   if (!VIndex)
3320*5ffd83dbSDimitry Andric     VIndex = B.buildConstant(S32, 0).getReg(0);
3321*5ffd83dbSDimitry Andric 
3322*5ffd83dbSDimitry Andric   auto MIB = B.buildInstr(Opc)
3323*5ffd83dbSDimitry Andric     .addUse(VData)              // vdata
3324*5ffd83dbSDimitry Andric     .addUse(RSrc)               // rsrc
3325*5ffd83dbSDimitry Andric     .addUse(VIndex)             // vindex
3326*5ffd83dbSDimitry Andric     .addUse(VOffset)            // voffset
3327*5ffd83dbSDimitry Andric     .addUse(SOffset)            // soffset
3328*5ffd83dbSDimitry Andric     .addImm(ImmOffset);         // offset(imm)
3329*5ffd83dbSDimitry Andric 
3330*5ffd83dbSDimitry Andric   if (IsTyped)
3331*5ffd83dbSDimitry Andric     MIB.addImm(Format);
3332*5ffd83dbSDimitry Andric 
3333*5ffd83dbSDimitry Andric   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3334*5ffd83dbSDimitry Andric      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3335*5ffd83dbSDimitry Andric      .addMemOperand(MMO);
3336*5ffd83dbSDimitry Andric 
3337*5ffd83dbSDimitry Andric   MI.eraseFromParent();
33388bcb0991SDimitry Andric   return true;
33398bcb0991SDimitry Andric }
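// For reference, the G_INTRINSIC operand layout this code assumes (a sketch;
// bracketed operands are present only for some variants):
//
//   op0: intrinsic id
//   op1: vdata
//   op2: rsrc
//   [op3: vindex]            -- struct variants only
//   next: voffset, soffset, [format -- typed variants only], aux
//
// which is why OpOffset shifts the remaining operand indices by one for each
// optional operand present.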
33408bcb0991SDimitry Andric 
3341*5ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3342*5ffd83dbSDimitry Andric                                              MachineRegisterInfo &MRI,
3343*5ffd83dbSDimitry Andric                                              MachineIRBuilder &B,
3344*5ffd83dbSDimitry Andric                                              bool IsFormat,
3345*5ffd83dbSDimitry Andric                                              bool IsTyped) const {
3346*5ffd83dbSDimitry Andric   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3347*5ffd83dbSDimitry Andric   MachineMemOperand *MMO = *MI.memoperands_begin();
3348*5ffd83dbSDimitry Andric   const int MemSize = MMO->getSize();
3349*5ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
3350*5ffd83dbSDimitry Andric 
3351*5ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
3352*5ffd83dbSDimitry Andric   Register RSrc = MI.getOperand(2).getReg();
3353*5ffd83dbSDimitry Andric 
3354*5ffd83dbSDimitry Andric   // The typed intrinsics add an immediate after the registers.
3355*5ffd83dbSDimitry Andric   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3356*5ffd83dbSDimitry Andric 
3357*5ffd83dbSDimitry Andric   // The struct intrinsic variants add one additional operand over raw.
3358*5ffd83dbSDimitry Andric   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3359*5ffd83dbSDimitry Andric   Register VIndex;
3360*5ffd83dbSDimitry Andric   int OpOffset = 0;
3361*5ffd83dbSDimitry Andric   if (HasVIndex) {
3362*5ffd83dbSDimitry Andric     VIndex = MI.getOperand(3).getReg();
3363*5ffd83dbSDimitry Andric     OpOffset = 1;
33648bcb0991SDimitry Andric   }
33658bcb0991SDimitry Andric 
3366*5ffd83dbSDimitry Andric   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3367*5ffd83dbSDimitry Andric   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3368*5ffd83dbSDimitry Andric 
3369*5ffd83dbSDimitry Andric   unsigned Format = 0;
3370*5ffd83dbSDimitry Andric   if (IsTyped) {
3371*5ffd83dbSDimitry Andric     Format = MI.getOperand(5 + OpOffset).getImm();
3372*5ffd83dbSDimitry Andric     ++OpOffset;
33738bcb0991SDimitry Andric   }
33748bcb0991SDimitry Andric 
3375*5ffd83dbSDimitry Andric   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3376*5ffd83dbSDimitry Andric   unsigned ImmOffset;
3377*5ffd83dbSDimitry Andric   unsigned TotalOffset;
3378*5ffd83dbSDimitry Andric 
3379*5ffd83dbSDimitry Andric   LLT Ty = MRI.getType(Dst);
3380*5ffd83dbSDimitry Andric   LLT EltTy = Ty.getScalarType();
3381*5ffd83dbSDimitry Andric   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3382*5ffd83dbSDimitry Andric   const bool Unpacked = ST.hasUnpackedD16VMem();
3383*5ffd83dbSDimitry Andric 
3384*5ffd83dbSDimitry Andric   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3385*5ffd83dbSDimitry Andric   if (TotalOffset != 0)
3386*5ffd83dbSDimitry Andric     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3387*5ffd83dbSDimitry Andric 
3388*5ffd83dbSDimitry Andric   unsigned Opc;
3389*5ffd83dbSDimitry Andric 
3390*5ffd83dbSDimitry Andric   if (IsTyped) {
3391*5ffd83dbSDimitry Andric     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3392*5ffd83dbSDimitry Andric                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3393*5ffd83dbSDimitry Andric   } else if (IsFormat) {
3394*5ffd83dbSDimitry Andric     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3395*5ffd83dbSDimitry Andric                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3396*5ffd83dbSDimitry Andric   } else {
3397*5ffd83dbSDimitry Andric     switch (MemSize) {
3398*5ffd83dbSDimitry Andric     case 1:
3399*5ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3400*5ffd83dbSDimitry Andric       break;
3401*5ffd83dbSDimitry Andric     case 2:
3402*5ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3403*5ffd83dbSDimitry Andric       break;
3404*5ffd83dbSDimitry Andric     default:
3405*5ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3406*5ffd83dbSDimitry Andric       break;
3407*5ffd83dbSDimitry Andric     }
3408*5ffd83dbSDimitry Andric   }
3409*5ffd83dbSDimitry Andric 
3410*5ffd83dbSDimitry Andric   Register LoadDstReg;
3411*5ffd83dbSDimitry Andric 
3412*5ffd83dbSDimitry Andric   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3413*5ffd83dbSDimitry Andric   LLT UnpackedTy = Ty.changeElementSize(32);
3414*5ffd83dbSDimitry Andric 
3415*5ffd83dbSDimitry Andric   if (IsExtLoad)
3416*5ffd83dbSDimitry Andric     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3417*5ffd83dbSDimitry Andric   else if (Unpacked && IsD16 && Ty.isVector())
3418*5ffd83dbSDimitry Andric     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3419*5ffd83dbSDimitry Andric   else
3420*5ffd83dbSDimitry Andric     LoadDstReg = Dst;
3421*5ffd83dbSDimitry Andric 
3422*5ffd83dbSDimitry Andric   if (!VIndex)
3423*5ffd83dbSDimitry Andric     VIndex = B.buildConstant(S32, 0).getReg(0);
3424*5ffd83dbSDimitry Andric 
3425*5ffd83dbSDimitry Andric   auto MIB = B.buildInstr(Opc)
3426*5ffd83dbSDimitry Andric     .addDef(LoadDstReg)         // vdata
3427*5ffd83dbSDimitry Andric     .addUse(RSrc)               // rsrc
3428*5ffd83dbSDimitry Andric     .addUse(VIndex)             // vindex
3429*5ffd83dbSDimitry Andric     .addUse(VOffset)            // voffset
3430*5ffd83dbSDimitry Andric     .addUse(SOffset)            // soffset
3431*5ffd83dbSDimitry Andric     .addImm(ImmOffset);         // offset(imm)
3432*5ffd83dbSDimitry Andric 
3433*5ffd83dbSDimitry Andric   if (IsTyped)
3434*5ffd83dbSDimitry Andric     MIB.addImm(Format);
3435*5ffd83dbSDimitry Andric 
3436*5ffd83dbSDimitry Andric   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3437*5ffd83dbSDimitry Andric      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3438*5ffd83dbSDimitry Andric      .addMemOperand(MMO);
3439*5ffd83dbSDimitry Andric 
3440*5ffd83dbSDimitry Andric   if (LoadDstReg != Dst) {
3441*5ffd83dbSDimitry Andric     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3442*5ffd83dbSDimitry Andric 
3443*5ffd83dbSDimitry Andric     // The result was widened for extending loads; narrow it back down.
3444*5ffd83dbSDimitry Andric     if (IsExtLoad)
3445*5ffd83dbSDimitry Andric       B.buildTrunc(Dst, LoadDstReg);
3446*5ffd83dbSDimitry Andric     else {
3447*5ffd83dbSDimitry Andric       // Repack to original 16-bit vector result
3448*5ffd83dbSDimitry Andric       // FIXME: G_TRUNC should work, but legalization currently fails
3449*5ffd83dbSDimitry Andric       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3450*5ffd83dbSDimitry Andric       SmallVector<Register, 4> Repack;
3451*5ffd83dbSDimitry Andric       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3452*5ffd83dbSDimitry Andric         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3453*5ffd83dbSDimitry Andric       B.buildMerge(Dst, Repack);
3454*5ffd83dbSDimitry Andric     }
3455*5ffd83dbSDimitry Andric   }
3456*5ffd83dbSDimitry Andric 
3457*5ffd83dbSDimitry Andric   MI.eraseFromParent();
3458*5ffd83dbSDimitry Andric   return true;
3459*5ffd83dbSDimitry Andric }
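// The LoadDstReg widening above mirrors the hardware return types (a sketch
// of the two cases): sub-dword results come back in a full 32-bit register,
// so s8/s16 destinations are loaded into a temporary s32 and truncated
// afterwards; on subtargets with unpacked d16, an N-element s16 vector result
// occupies N 32-bit registers, so it is loaded as <N x s32> and repacked
// element by element.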
3460*5ffd83dbSDimitry Andric 
3461*5ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3462*5ffd83dbSDimitry Andric                                                MachineIRBuilder &B,
3463*5ffd83dbSDimitry Andric                                                bool IsInc) const {
3464*5ffd83dbSDimitry Andric   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3465*5ffd83dbSDimitry Andric                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3466*5ffd83dbSDimitry Andric   B.buildInstr(Opc)
3467*5ffd83dbSDimitry Andric     .addDef(MI.getOperand(0).getReg())
3468*5ffd83dbSDimitry Andric     .addUse(MI.getOperand(2).getReg())
3469*5ffd83dbSDimitry Andric     .addUse(MI.getOperand(3).getReg())
3470*5ffd83dbSDimitry Andric     .cloneMemRefs(MI);
3471*5ffd83dbSDimitry Andric   MI.eraseFromParent();
3472*5ffd83dbSDimitry Andric   return true;
3473*5ffd83dbSDimitry Andric }
3474*5ffd83dbSDimitry Andric 
3475*5ffd83dbSDimitry Andric static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3476*5ffd83dbSDimitry Andric   switch (IntrID) {
3477*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3478*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3479*5ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3480*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3481*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3482*5ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3483*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3484*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3485*5ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3486*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3487*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3488*5ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3489*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3490*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3491*5ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3492*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3493*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3494*5ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3495*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3496*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3497*5ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3498*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3499*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3500*5ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3501*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3502*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3503*5ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3504*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3505*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3506*5ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3507*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3508*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3509*5ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3510*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3511*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3512*5ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3513*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3514*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3515*5ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3516*5ffd83dbSDimitry Andric   default:
3517*5ffd83dbSDimitry Andric     llvm_unreachable("unhandled atomic opcode");
3518*5ffd83dbSDimitry Andric   }
3519*5ffd83dbSDimitry Andric }
3520*5ffd83dbSDimitry Andric 
3521*5ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3522*5ffd83dbSDimitry Andric                                                MachineIRBuilder &B,
3523*5ffd83dbSDimitry Andric                                                Intrinsic::ID IID) const {
3524*5ffd83dbSDimitry Andric   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3525*5ffd83dbSDimitry Andric                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3526*5ffd83dbSDimitry Andric 
3527*5ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
3528*5ffd83dbSDimitry Andric   Register VData = MI.getOperand(2).getReg();
3529*5ffd83dbSDimitry Andric 
3530*5ffd83dbSDimitry Andric   Register CmpVal;
3531*5ffd83dbSDimitry Andric   int OpOffset = 0;
3532*5ffd83dbSDimitry Andric 
3533*5ffd83dbSDimitry Andric   if (IsCmpSwap) {
3534*5ffd83dbSDimitry Andric     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3535*5ffd83dbSDimitry Andric     ++OpOffset;
3536*5ffd83dbSDimitry Andric   }
3537*5ffd83dbSDimitry Andric 
3538*5ffd83dbSDimitry Andric   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3539*5ffd83dbSDimitry Andric   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
3540*5ffd83dbSDimitry Andric 
3541*5ffd83dbSDimitry Andric   // The struct intrinsic variants add one additional operand over raw.
3542*5ffd83dbSDimitry Andric   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3543*5ffd83dbSDimitry Andric   Register VIndex;
3544*5ffd83dbSDimitry Andric   if (HasVIndex) {
3545*5ffd83dbSDimitry Andric     VIndex = MI.getOperand(4 + OpOffset).getReg();
3546*5ffd83dbSDimitry Andric     ++OpOffset;
3547*5ffd83dbSDimitry Andric   }
3548*5ffd83dbSDimitry Andric 
3549*5ffd83dbSDimitry Andric   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3550*5ffd83dbSDimitry Andric   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3551*5ffd83dbSDimitry Andric   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3552*5ffd83dbSDimitry Andric 
3553*5ffd83dbSDimitry Andric   MachineMemOperand *MMO = *MI.memoperands_begin();
3554*5ffd83dbSDimitry Andric 
3555*5ffd83dbSDimitry Andric   unsigned ImmOffset;
3556*5ffd83dbSDimitry Andric   unsigned TotalOffset;
3557*5ffd83dbSDimitry Andric   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3558*5ffd83dbSDimitry Andric   if (TotalOffset != 0)
3559*5ffd83dbSDimitry Andric     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3560*5ffd83dbSDimitry Andric 
3561*5ffd83dbSDimitry Andric   if (!VIndex)
3562*5ffd83dbSDimitry Andric     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3563*5ffd83dbSDimitry Andric 
3564*5ffd83dbSDimitry Andric   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
3565*5ffd83dbSDimitry Andric     .addDef(Dst)
3566*5ffd83dbSDimitry Andric     .addUse(VData); // vdata
3567*5ffd83dbSDimitry Andric 
3568*5ffd83dbSDimitry Andric   if (IsCmpSwap)
3569*5ffd83dbSDimitry Andric     MIB.addReg(CmpVal);
3570*5ffd83dbSDimitry Andric 
3571*5ffd83dbSDimitry Andric   MIB.addUse(RSrc)               // rsrc
3572*5ffd83dbSDimitry Andric      .addUse(VIndex)             // vindex
3573*5ffd83dbSDimitry Andric      .addUse(VOffset)            // voffset
3574*5ffd83dbSDimitry Andric      .addUse(SOffset)            // soffset
3575*5ffd83dbSDimitry Andric      .addImm(ImmOffset)          // offset(imm)
3576*5ffd83dbSDimitry Andric      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3577*5ffd83dbSDimitry Andric      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3578*5ffd83dbSDimitry Andric      .addMemOperand(MMO);
3579*5ffd83dbSDimitry Andric 
3580*5ffd83dbSDimitry Andric   MI.eraseFromParent();
3581*5ffd83dbSDimitry Andric   return true;
3582*5ffd83dbSDimitry Andric }
3583*5ffd83dbSDimitry Andric 
3584*5ffd83dbSDimitry Andric /// Pack the s16 typed address operands of \p MI into dword-sized registers,
3585*5ffd83dbSDimitry Andric /// collected in \p PackedAddrs as vectors with s16 typed elements.
3586*5ffd83dbSDimitry Andric static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
3587*5ffd83dbSDimitry Andric                                         SmallVectorImpl<Register> &PackedAddrs,
3588*5ffd83dbSDimitry Andric                                         int AddrIdx, int DimIdx, int EndIdx,
3589*5ffd83dbSDimitry Andric                                         int NumGradients) {
3590*5ffd83dbSDimitry Andric   const LLT S16 = LLT::scalar(16);
3591*5ffd83dbSDimitry Andric   const LLT V2S16 = LLT::vector(2, 16);
3592*5ffd83dbSDimitry Andric 
3593*5ffd83dbSDimitry Andric   for (int I = AddrIdx; I < EndIdx; ++I) {
3594*5ffd83dbSDimitry Andric     MachineOperand &SrcOp = MI.getOperand(I);
3595*5ffd83dbSDimitry Andric     if (!SrcOp.isReg())
3596*5ffd83dbSDimitry Andric       continue; // _L to _LZ may have eliminated this.
3597*5ffd83dbSDimitry Andric 
3598*5ffd83dbSDimitry Andric     Register AddrReg = SrcOp.getReg();
3599*5ffd83dbSDimitry Andric 
3600*5ffd83dbSDimitry Andric     if (I < DimIdx) {
3601*5ffd83dbSDimitry Andric       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
3602*5ffd83dbSDimitry Andric       PackedAddrs.push_back(AddrReg);
3603*5ffd83dbSDimitry Andric     } else {
3604*5ffd83dbSDimitry Andric       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
3605*5ffd83dbSDimitry Andric       // derivatives dx/dh and dx/dv are packed with undef.
3606*5ffd83dbSDimitry Andric       if (((I + 1) >= EndIdx) ||
3607*5ffd83dbSDimitry Andric           ((NumGradients / 2) % 2 == 1 &&
3608*5ffd83dbSDimitry Andric            (I == DimIdx + (NumGradients / 2) - 1 ||
3609*5ffd83dbSDimitry Andric             I == DimIdx + NumGradients - 1)) ||
3610*5ffd83dbSDimitry Andric           // Check for _L to _LZ optimization
3611*5ffd83dbSDimitry Andric           !MI.getOperand(I + 1).isReg()) {
3612*5ffd83dbSDimitry Andric         PackedAddrs.push_back(
3613*5ffd83dbSDimitry Andric             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3614*5ffd83dbSDimitry Andric                 .getReg(0));
3615*5ffd83dbSDimitry Andric       } else {
3616*5ffd83dbSDimitry Andric         PackedAddrs.push_back(
3617*5ffd83dbSDimitry Andric             B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
3618*5ffd83dbSDimitry Andric                 .getReg(0));
3619*5ffd83dbSDimitry Andric         ++I;
3620*5ffd83dbSDimitry Andric       }
3621*5ffd83dbSDimitry Andric     }
3622*5ffd83dbSDimitry Andric   }
3623*5ffd83dbSDimitry Andric }
3624*5ffd83dbSDimitry Andric 
3625*5ffd83dbSDimitry Andric /// Convert from separate vaddr components to a single vector address register,
3626*5ffd83dbSDimitry Andric /// and replace the remaining operands with $noreg.
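///
/// A minimal sketch of the effect, assuming three s32 components %x, %y, %z:
///   %vaddr:_(<3 x s32>) = G_BUILD_VECTOR %x, %y, %z
/// The first vaddr operand is rewritten to %vaddr and the remaining two are
/// set to $noreg.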
3627*5ffd83dbSDimitry Andric static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3628*5ffd83dbSDimitry Andric                                      int DimIdx, int NumVAddrs) {
3629*5ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
3630*5ffd83dbSDimitry Andric 
3631*5ffd83dbSDimitry Andric   SmallVector<Register, 8> AddrRegs;
3632*5ffd83dbSDimitry Andric   for (int I = 0; I != NumVAddrs; ++I) {
3633*5ffd83dbSDimitry Andric     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3634*5ffd83dbSDimitry Andric     if (SrcOp.isReg()) {
3635*5ffd83dbSDimitry Andric       AddrRegs.push_back(SrcOp.getReg());
3636*5ffd83dbSDimitry Andric       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3637*5ffd83dbSDimitry Andric     }
3638*5ffd83dbSDimitry Andric   }
3639*5ffd83dbSDimitry Andric 
3640*5ffd83dbSDimitry Andric   int NumAddrRegs = AddrRegs.size();
3641*5ffd83dbSDimitry Andric   if (NumAddrRegs != 1) {
3642*5ffd83dbSDimitry Andric     // Round up to 8 elements for v5-v7
3643*5ffd83dbSDimitry Andric     // FIXME: Missing intermediate sized register classes and instructions.
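    // e.g. 5 address registers are padded with 3 undefs so the build_vector
    // below produces a legal <8 x s32> operand.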
3644*5ffd83dbSDimitry Andric     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3645*5ffd83dbSDimitry Andric       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3646*5ffd83dbSDimitry Andric       auto Undef = B.buildUndef(S32);
3647*5ffd83dbSDimitry Andric       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3648*5ffd83dbSDimitry Andric       NumAddrRegs = RoundedNumRegs;
3649*5ffd83dbSDimitry Andric     }
3650*5ffd83dbSDimitry Andric 
3651*5ffd83dbSDimitry Andric     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3652*5ffd83dbSDimitry Andric     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3653*5ffd83dbSDimitry Andric   }
3654*5ffd83dbSDimitry Andric 
3655*5ffd83dbSDimitry Andric   for (int I = 1; I != NumVAddrs; ++I) {
3656*5ffd83dbSDimitry Andric     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3657*5ffd83dbSDimitry Andric     if (SrcOp.isReg())
3658*5ffd83dbSDimitry Andric       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3659*5ffd83dbSDimitry Andric   }
3660*5ffd83dbSDimitry Andric }
3661*5ffd83dbSDimitry Andric 
3662*5ffd83dbSDimitry Andric /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3663*5ffd83dbSDimitry Andric ///
3664*5ffd83dbSDimitry Andric /// Depending on the subtarget, loads and stores of 16-bit element data need to be
3665*5ffd83dbSDimitry Andric /// rewritten to use the low half of 32-bit registers, or directly use a packed
3666*5ffd83dbSDimitry Andric /// layout. 16-bit addresses should also sometimes be packed into 32-bit
3667*5ffd83dbSDimitry Andric /// registers.
3668*5ffd83dbSDimitry Andric ///
3669*5ffd83dbSDimitry Andric /// We don't want to directly select image instructions just yet, but also want
3670*5ffd83dbSDimitry Andric /// to expose all register repacking to the legalizer/combiners. We also don't
3671*5ffd83dbSDimitry Andric /// want a selected instruction entering RegBankSelect. In order to avoid
3672*5ffd83dbSDimitry Andric /// defining a multitude of intermediate image instructions, directly hack on
3673*5ffd83dbSDimitry Andric /// the intrinsic's arguments. In cases like a16 addresses, this requires
3674*5ffd83dbSDimitry Andric /// padding the now-unnecessary arguments with $noreg.
3675*5ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3676*5ffd83dbSDimitry Andric     MachineInstr &MI, MachineIRBuilder &B,
3677*5ffd83dbSDimitry Andric     GISelChangeObserver &Observer,
3678*5ffd83dbSDimitry Andric     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3679*5ffd83dbSDimitry Andric 
3680*5ffd83dbSDimitry Andric   const int NumDefs = MI.getNumExplicitDefs();
3681*5ffd83dbSDimitry Andric   bool IsTFE = NumDefs == 2;
3682*5ffd83dbSDimitry Andric   // We are only processing the operands of d16 image operations on subtargets
3683*5ffd83dbSDimitry Andric   // that use the unpacked register layout, or need to repack the TFE result.
3684*5ffd83dbSDimitry Andric 
3685*5ffd83dbSDimitry Andric   // TODO: Do we need to guard against already legalized intrinsics?
3686*5ffd83dbSDimitry Andric   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3687*5ffd83dbSDimitry Andric     AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3688*5ffd83dbSDimitry Andric 
3689*5ffd83dbSDimitry Andric   MachineRegisterInfo *MRI = B.getMRI();
3690*5ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
3691*5ffd83dbSDimitry Andric   const LLT S16 = LLT::scalar(16);
3692*5ffd83dbSDimitry Andric   const LLT V2S16 = LLT::vector(2, 16);
3693*5ffd83dbSDimitry Andric 
3694*5ffd83dbSDimitry Andric   // Index of first address argument
3695*5ffd83dbSDimitry Andric   const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
3696*5ffd83dbSDimitry Andric 
3697*5ffd83dbSDimitry Andric   int NumVAddrs, NumGradients;
3698*5ffd83dbSDimitry Andric   std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
3699*5ffd83dbSDimitry Andric   const int DMaskIdx = BaseOpcode->Atomic ? -1 :
3700*5ffd83dbSDimitry Andric     getDMaskIdx(BaseOpcode, NumDefs);
3701*5ffd83dbSDimitry Andric   unsigned DMask = 0;
3702*5ffd83dbSDimitry Andric 
3703*5ffd83dbSDimitry Andric   // Check for 16-bit addresses and pack them if so.
3704*5ffd83dbSDimitry Andric   int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
3705*5ffd83dbSDimitry Andric   LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg());
3706*5ffd83dbSDimitry Andric   LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg());
3707*5ffd83dbSDimitry Andric   const bool IsG16 = GradTy == S16;
3708*5ffd83dbSDimitry Andric   const bool IsA16 = AddrTy == S16;
3709*5ffd83dbSDimitry Andric 
3710*5ffd83dbSDimitry Andric   int DMaskLanes = 0;
3711*5ffd83dbSDimitry Andric   if (!BaseOpcode->Atomic) {
3712*5ffd83dbSDimitry Andric     DMask = MI.getOperand(DMaskIdx).getImm();
3713*5ffd83dbSDimitry Andric     if (BaseOpcode->Gather4) {
3714*5ffd83dbSDimitry Andric       DMaskLanes = 4;
3715*5ffd83dbSDimitry Andric     } else if (DMask != 0) {
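      // e.g. a dmask of 0b1011 enables three result lanes.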
3716*5ffd83dbSDimitry Andric       DMaskLanes = countPopulation(DMask);
3717*5ffd83dbSDimitry Andric     } else if (!IsTFE && !BaseOpcode->Store) {
3718*5ffd83dbSDimitry Andric       // If dmask is 0, this is a no-op load. This can be eliminated.
3719*5ffd83dbSDimitry Andric       B.buildUndef(MI.getOperand(0));
3720*5ffd83dbSDimitry Andric       MI.eraseFromParent();
3721*5ffd83dbSDimitry Andric       return true;
3722*5ffd83dbSDimitry Andric     }
3723*5ffd83dbSDimitry Andric   }
3724*5ffd83dbSDimitry Andric 
3725*5ffd83dbSDimitry Andric   Observer.changingInstr(MI);
3726*5ffd83dbSDimitry Andric   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
3727*5ffd83dbSDimitry Andric 
3728*5ffd83dbSDimitry Andric   unsigned NewOpcode = NumDefs == 0 ?
3729*5ffd83dbSDimitry Andric     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
3730*5ffd83dbSDimitry Andric 
3731*5ffd83dbSDimitry Andric   // Track that we legalized this
3732*5ffd83dbSDimitry Andric   MI.setDesc(B.getTII().get(NewOpcode));
3733*5ffd83dbSDimitry Andric 
3734*5ffd83dbSDimitry Andric   // We expect to get an error flag since TFC is on and dmask is 0. Force
3735*5ffd83dbSDimitry Andric   // dmask to be at least 1, otherwise the instruction will fail.
3736*5ffd83dbSDimitry Andric   if (IsTFE && DMask == 0) {
3737*5ffd83dbSDimitry Andric     DMask = 0x1;
3738*5ffd83dbSDimitry Andric     DMaskLanes = 1;
3739*5ffd83dbSDimitry Andric     MI.getOperand(DMaskIdx).setImm(DMask);
3740*5ffd83dbSDimitry Andric   }
3741*5ffd83dbSDimitry Andric 
3742*5ffd83dbSDimitry Andric   if (BaseOpcode->Atomic) {
3743*5ffd83dbSDimitry Andric     Register VData0 = MI.getOperand(2).getReg();
3744*5ffd83dbSDimitry Andric     LLT Ty = MRI->getType(VData0);
3745*5ffd83dbSDimitry Andric 
3746*5ffd83dbSDimitry Andric     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
3747*5ffd83dbSDimitry Andric     if (Ty.isVector())
3748*5ffd83dbSDimitry Andric       return false;
3749*5ffd83dbSDimitry Andric 
3750*5ffd83dbSDimitry Andric     if (BaseOpcode->AtomicX2) {
3751*5ffd83dbSDimitry Andric       Register VData1 = MI.getOperand(3).getReg();
3752*5ffd83dbSDimitry Andric       // The two values are packed in one register.
3753*5ffd83dbSDimitry Andric       LLT PackedTy = LLT::vector(2, Ty);
3754*5ffd83dbSDimitry Andric       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
3755*5ffd83dbSDimitry Andric       MI.getOperand(2).setReg(Concat.getReg(0));
3756*5ffd83dbSDimitry Andric       MI.getOperand(3).setReg(AMDGPU::NoRegister);
3757*5ffd83dbSDimitry Andric     }
3758*5ffd83dbSDimitry Andric   }
3759*5ffd83dbSDimitry Andric 
3760*5ffd83dbSDimitry Andric   int CorrectedNumVAddrs = NumVAddrs;
3761*5ffd83dbSDimitry Andric 
3762*5ffd83dbSDimitry Andric   // Optimize _L to _LZ when the LOD is a constant zero (or negative)
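  // For instance (illustrative), llvm.amdgcn.image.sample.l.2d with lod == 0
  // becomes llvm.amdgcn.image.sample.lz.2d, and the lod operand is removed.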
3763*5ffd83dbSDimitry Andric   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
3764*5ffd83dbSDimitry Andric         AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
3765*5ffd83dbSDimitry Andric     const ConstantFP *ConstantLod;
3766*5ffd83dbSDimitry Andric     const int LodIdx = AddrIdx + NumVAddrs - 1;
3767*5ffd83dbSDimitry Andric 
3768*5ffd83dbSDimitry Andric     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
3769*5ffd83dbSDimitry Andric       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
3770*5ffd83dbSDimitry Andric         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
3771*5ffd83dbSDimitry Andric         ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
3772*5ffd83dbSDimitry Andric           LZMappingInfo->LZ, ImageDimIntr->Dim);
3773*5ffd83dbSDimitry Andric 
3774*5ffd83dbSDimitry Andric         // The starting indexes should remain in the same place.
3775*5ffd83dbSDimitry Andric         --NumVAddrs;
3776*5ffd83dbSDimitry Andric         --CorrectedNumVAddrs;
3777*5ffd83dbSDimitry Andric 
3778*5ffd83dbSDimitry Andric         MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
3779*5ffd83dbSDimitry Andric           static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
3780*5ffd83dbSDimitry Andric         MI.RemoveOperand(LodIdx);
3781*5ffd83dbSDimitry Andric       }
3782*5ffd83dbSDimitry Andric     }
3783*5ffd83dbSDimitry Andric   }
3784*5ffd83dbSDimitry Andric 
3785*5ffd83dbSDimitry Andric   // Optimize _mip away, when 'lod' is zero
3786*5ffd83dbSDimitry Andric   if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
3787*5ffd83dbSDimitry Andric     int64_t ConstantLod;
3788*5ffd83dbSDimitry Andric     const int LodIdx = AddrIdx + NumVAddrs - 1;
3789*5ffd83dbSDimitry Andric 
3790*5ffd83dbSDimitry Andric     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
3791*5ffd83dbSDimitry Andric       if (ConstantLod == 0) {
3792*5ffd83dbSDimitry Andric         // TODO: Change the intrinsic opcode and remove the operand instead of
3793*5ffd83dbSDimitry Andric         // replacing it with 0, as the _L to _LZ handling above does.
3794*5ffd83dbSDimitry Andric         MI.getOperand(LodIdx).ChangeToImmediate(0);
3795*5ffd83dbSDimitry Andric         --CorrectedNumVAddrs;
3796*5ffd83dbSDimitry Andric       }
3797*5ffd83dbSDimitry Andric     }
3798*5ffd83dbSDimitry Andric   }
3799*5ffd83dbSDimitry Andric 
3800*5ffd83dbSDimitry Andric   // Rewrite the addressing register layout before doing anything else.
3801*5ffd83dbSDimitry Andric   if (IsA16 || IsG16) {
3802*5ffd83dbSDimitry Andric     if (IsA16) {
3803*5ffd83dbSDimitry Andric       // Target must support the feature and gradients need to be 16 bit too
3804*5ffd83dbSDimitry Andric       if (!ST.hasA16() || !IsG16)
3805*5ffd83dbSDimitry Andric         return false;
3806*5ffd83dbSDimitry Andric     } else if (!ST.hasG16())
3807*5ffd83dbSDimitry Andric       return false;
3808*5ffd83dbSDimitry Andric 
3809*5ffd83dbSDimitry Andric     if (NumVAddrs > 1) {
3810*5ffd83dbSDimitry Andric       SmallVector<Register, 4> PackedRegs;
3811*5ffd83dbSDimitry Andric       // Don't compress addresses for G16
3812*5ffd83dbSDimitry Andric       const int PackEndIdx =
3813*5ffd83dbSDimitry Andric           IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients);
3814*5ffd83dbSDimitry Andric       packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx,
3815*5ffd83dbSDimitry Andric                                   PackEndIdx, NumGradients);
3816*5ffd83dbSDimitry Andric 
3817*5ffd83dbSDimitry Andric       if (!IsA16) {
3818*5ffd83dbSDimitry Andric         // Add uncompressed address
3819*5ffd83dbSDimitry Andric         for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) {
3820*5ffd83dbSDimitry Andric           Register AddrReg = MI.getOperand(I).getReg();
3821*5ffd83dbSDimitry Andric           assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
3822*5ffd83dbSDimitry Andric           PackedRegs.push_back(AddrReg);
3823*5ffd83dbSDimitry Andric         }
3824*5ffd83dbSDimitry Andric       }
3825*5ffd83dbSDimitry Andric 
3826*5ffd83dbSDimitry Andric       // See also below in the non-a16 branch
3827*5ffd83dbSDimitry Andric       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
3828*5ffd83dbSDimitry Andric 
3829*5ffd83dbSDimitry Andric       if (!UseNSA && PackedRegs.size() > 1) {
3830*5ffd83dbSDimitry Andric         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
3831*5ffd83dbSDimitry Andric         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
3832*5ffd83dbSDimitry Andric         PackedRegs[0] = Concat.getReg(0);
3833*5ffd83dbSDimitry Andric         PackedRegs.resize(1);
3834*5ffd83dbSDimitry Andric       }
3835*5ffd83dbSDimitry Andric 
3836*5ffd83dbSDimitry Andric       const int NumPacked = PackedRegs.size();
3837*5ffd83dbSDimitry Andric       for (int I = 0; I != NumVAddrs; ++I) {
3838*5ffd83dbSDimitry Andric         MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
3839*5ffd83dbSDimitry Andric         if (!SrcOp.isReg()) {
3840*5ffd83dbSDimitry Andric           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
3841*5ffd83dbSDimitry Andric           continue;
3842*5ffd83dbSDimitry Andric         }
3843*5ffd83dbSDimitry Andric 
3844*5ffd83dbSDimitry Andric         assert(SrcOp.getReg() != AMDGPU::NoRegister);
3845*5ffd83dbSDimitry Andric 
3846*5ffd83dbSDimitry Andric         if (I < NumPacked)
3847*5ffd83dbSDimitry Andric           SrcOp.setReg(PackedRegs[I]);
3848*5ffd83dbSDimitry Andric         else
3849*5ffd83dbSDimitry Andric           SrcOp.setReg(AMDGPU::NoRegister);
3850*5ffd83dbSDimitry Andric       }
3851*5ffd83dbSDimitry Andric     }
3852*5ffd83dbSDimitry Andric   } else {
3853*5ffd83dbSDimitry Andric     // If the register allocator cannot place the address registers contiguously
3854*5ffd83dbSDimitry Andric     // without introducing moves, then using the non-sequential address encoding
3855*5ffd83dbSDimitry Andric     // is always preferable, since it saves VALU instructions and is usually
3856*5ffd83dbSDimitry Andric     // neutral or better in terms of code size.
3857*5ffd83dbSDimitry Andric     //
3858*5ffd83dbSDimitry Andric     // However, we currently have no way of hinting to the register allocator
3859*5ffd83dbSDimitry Andric     // that MIMG addresses should be placed contiguously when it is possible to
3860*5ffd83dbSDimitry Andric     // do so, so force non-NSA for the common 2-address case as a heuristic.
3861*5ffd83dbSDimitry Andric     //
3862*5ffd83dbSDimitry Andric     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
3863*5ffd83dbSDimitry Andric     // allocation when possible.
3864*5ffd83dbSDimitry Andric     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
3865*5ffd83dbSDimitry Andric 
3866*5ffd83dbSDimitry Andric     if (!UseNSA && NumVAddrs > 1)
3867*5ffd83dbSDimitry Andric       convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
3868*5ffd83dbSDimitry Andric   }
3869*5ffd83dbSDimitry Andric 
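  // Append an immediate recording the shrunken operand forms: bit 0 means the
  // addresses are a16, bit 1 means the gradients are g16.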
3870*5ffd83dbSDimitry Andric   int Flags = 0;
3871*5ffd83dbSDimitry Andric   if (IsA16)
3872*5ffd83dbSDimitry Andric     Flags |= 1;
3873*5ffd83dbSDimitry Andric   if (IsG16)
3874*5ffd83dbSDimitry Andric     Flags |= 2;
3875*5ffd83dbSDimitry Andric   MI.addOperand(MachineOperand::CreateImm(Flags));
3876*5ffd83dbSDimitry Andric 
3877*5ffd83dbSDimitry Andric   if (BaseOpcode->Store) { // No TFE for stores?
3878*5ffd83dbSDimitry Andric     // TODO: Handle dmask trim
3879*5ffd83dbSDimitry Andric     Register VData = MI.getOperand(1).getReg();
3880*5ffd83dbSDimitry Andric     LLT Ty = MRI->getType(VData);
3881*5ffd83dbSDimitry Andric     if (!Ty.isVector() || Ty.getElementType() != S16)
3882*5ffd83dbSDimitry Andric       return true;
3883*5ffd83dbSDimitry Andric 
3884*5ffd83dbSDimitry Andric     Register RepackedReg = handleD16VData(B, *MRI, VData);
3885*5ffd83dbSDimitry Andric     if (RepackedReg != VData) {
3886*5ffd83dbSDimitry Andric       MI.getOperand(1).setReg(RepackedReg);
3887*5ffd83dbSDimitry Andric     }
3888*5ffd83dbSDimitry Andric 
3889*5ffd83dbSDimitry Andric     return true;
3890*5ffd83dbSDimitry Andric   }
3891*5ffd83dbSDimitry Andric 
3892*5ffd83dbSDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
3893*5ffd83dbSDimitry Andric   LLT Ty = MRI->getType(DstReg);
3894*5ffd83dbSDimitry Andric   const LLT EltTy = Ty.getScalarType();
3895*5ffd83dbSDimitry Andric   const bool IsD16 = Ty.getScalarType() == S16;
3896*5ffd83dbSDimitry Andric   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
3897*5ffd83dbSDimitry Andric 
3898*5ffd83dbSDimitry Andric   // Confirm that the return type is large enough for the dmask specified
3899*5ffd83dbSDimitry Andric   if (NumElts < DMaskLanes)
3900*5ffd83dbSDimitry Andric     return false;
3901*5ffd83dbSDimitry Andric 
3902*5ffd83dbSDimitry Andric   if (NumElts > 4 || DMaskLanes > 4)
3903*5ffd83dbSDimitry Andric     return false;
3904*5ffd83dbSDimitry Andric 
3905*5ffd83dbSDimitry Andric   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
3906*5ffd83dbSDimitry Andric   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
3907*5ffd83dbSDimitry Andric 
3908*5ffd83dbSDimitry Andric   // The raw dword-aligned data component of the load. The only legal cases
3909*5ffd83dbSDimitry Andric   // where this matters should be when using the packed D16 format, for
3910*5ffd83dbSDimitry Andric   // s16 -> <2 x s16> and <3 x s16> -> <4 x s16>.
3911*5ffd83dbSDimitry Andric   LLT RoundedTy;
3912*5ffd83dbSDimitry Andric 
3913*5ffd83dbSDimitry Andric   // S32 vector to cover all data, plus the TFE result element.
3914*5ffd83dbSDimitry Andric   LLT TFETy;
3915*5ffd83dbSDimitry Andric 
3916*5ffd83dbSDimitry Andric   // Register type to use for each loaded component. Will be S32 or V2S16.
3917*5ffd83dbSDimitry Andric   LLT RegTy;
3918*5ffd83dbSDimitry Andric 
3919*5ffd83dbSDimitry Andric   if (IsD16 && ST.hasUnpackedD16VMem()) {
3920*5ffd83dbSDimitry Andric     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
3921*5ffd83dbSDimitry Andric     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
3922*5ffd83dbSDimitry Andric     RegTy = S32;
3923*5ffd83dbSDimitry Andric   } else {
3924*5ffd83dbSDimitry Andric     unsigned EltSize = EltTy.getSizeInBits();
3925*5ffd83dbSDimitry Andric     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
3926*5ffd83dbSDimitry Andric     unsigned RoundedSize = 32 * RoundedElts;
3927*5ffd83dbSDimitry Andric     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3928*5ffd83dbSDimitry Andric     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3929*5ffd83dbSDimitry Andric     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
3930*5ffd83dbSDimitry Andric   }
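  // A worked example for the packed-d16 path, assuming a <3 x s16> result and
  // three dmask lanes: AdjustedTy is <3 x s16> (48 bits), so RoundedTy is
  // <4 x s16>, TFETy is <3 x s32>, and RegTy is <2 x s16> when TFE is off.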
3931*5ffd83dbSDimitry Andric 
3932*5ffd83dbSDimitry Andric   // The return type does not need adjustment.
3933*5ffd83dbSDimitry Andric   // TODO: Should we change s16 case to s32 or <2 x s16>?
3934*5ffd83dbSDimitry Andric   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
3935*5ffd83dbSDimitry Andric     return true;
3936*5ffd83dbSDimitry Andric 
3937*5ffd83dbSDimitry Andric   Register Dst1Reg;
3938*5ffd83dbSDimitry Andric 
3939*5ffd83dbSDimitry Andric   // Insert after the instruction.
3940*5ffd83dbSDimitry Andric   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3941*5ffd83dbSDimitry Andric 
3942*5ffd83dbSDimitry Andric   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
3943*5ffd83dbSDimitry Andric   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
3944*5ffd83dbSDimitry Andric   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
3945*5ffd83dbSDimitry Andric   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
3946*5ffd83dbSDimitry Andric 
3947*5ffd83dbSDimitry Andric   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
3948*5ffd83dbSDimitry Andric 
3949*5ffd83dbSDimitry Andric   MI.getOperand(0).setReg(NewResultReg);
3950*5ffd83dbSDimitry Andric 
3951*5ffd83dbSDimitry Andric   // In the IR, TFE is supposed to be used with a 2 element struct return
3952*5ffd83dbSDimitry Andric   // type. The instruction really returns these two values in one contiguous
3953*5ffd83dbSDimitry Andric   // register, with one additional dword beyond the loaded data. Rewrite the
3954*5ffd83dbSDimitry Andric   // return type to use a single register result.
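  //
  // For example (illustrative, not from the original source), a TFE load of
  // <4 x s32> data is rewritten to produce a single <5 x s32> result whose
  // last element is the error/status dword.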
3955*5ffd83dbSDimitry Andric 
3956*5ffd83dbSDimitry Andric   if (IsTFE) {
3957*5ffd83dbSDimitry Andric     Dst1Reg = MI.getOperand(1).getReg();
3958*5ffd83dbSDimitry Andric     if (MRI->getType(Dst1Reg) != S32)
3959*5ffd83dbSDimitry Andric       return false;
3960*5ffd83dbSDimitry Andric 
3961*5ffd83dbSDimitry Andric     // TODO: Make sure the TFE operand bit is set.
3962*5ffd83dbSDimitry Andric     MI.RemoveOperand(1);
3963*5ffd83dbSDimitry Andric 
3964*5ffd83dbSDimitry Andric     // Handle the easy case that requires no repack instructions.
3965*5ffd83dbSDimitry Andric     if (Ty == S32) {
3966*5ffd83dbSDimitry Andric       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
3967*5ffd83dbSDimitry Andric       return true;
3968*5ffd83dbSDimitry Andric     }
3969*5ffd83dbSDimitry Andric   }
3970*5ffd83dbSDimitry Andric 
3971*5ffd83dbSDimitry Andric   // Now figure out how to copy the new result register back into the old
3972*5ffd83dbSDimitry Andric   // result.
3973*5ffd83dbSDimitry Andric   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
3974*5ffd83dbSDimitry Andric 
3975*5ffd83dbSDimitry Andric   const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
3976*5ffd83dbSDimitry Andric 
3977*5ffd83dbSDimitry Andric   if (ResultNumRegs == 1) {
3978*5ffd83dbSDimitry Andric     assert(!IsTFE);
3979*5ffd83dbSDimitry Andric     ResultRegs[0] = NewResultReg;
3980*5ffd83dbSDimitry Andric   } else {
3981*5ffd83dbSDimitry Andric     // We have to repack into a new vector of some kind.
3982*5ffd83dbSDimitry Andric     for (int I = 0; I != NumDataRegs; ++I)
3983*5ffd83dbSDimitry Andric       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
3984*5ffd83dbSDimitry Andric     B.buildUnmerge(ResultRegs, NewResultReg);
3985*5ffd83dbSDimitry Andric 
3986*5ffd83dbSDimitry Andric     // Drop the final TFE element to get the data part. The TFE result is
3987*5ffd83dbSDimitry Andric     // directly written to the right place already.
3988*5ffd83dbSDimitry Andric     if (IsTFE)
3989*5ffd83dbSDimitry Andric       ResultRegs.resize(NumDataRegs);
3990*5ffd83dbSDimitry Andric   }
3991*5ffd83dbSDimitry Andric 
3992*5ffd83dbSDimitry Andric   // For an s16 scalar result, we form an s32 result with a truncate regardless
3993*5ffd83dbSDimitry Andric   // of packed vs. unpacked.
3994*5ffd83dbSDimitry Andric   if (IsD16 && !Ty.isVector()) {
3995*5ffd83dbSDimitry Andric     B.buildTrunc(DstReg, ResultRegs[0]);
3996*5ffd83dbSDimitry Andric     return true;
3997*5ffd83dbSDimitry Andric   }
3998*5ffd83dbSDimitry Andric 
3999*5ffd83dbSDimitry Andric   // Avoid a build/concat_vector of 1 entry.
4000*5ffd83dbSDimitry Andric   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4001*5ffd83dbSDimitry Andric     B.buildBitcast(DstReg, ResultRegs[0]);
4002*5ffd83dbSDimitry Andric     return true;
4003*5ffd83dbSDimitry Andric   }
4004*5ffd83dbSDimitry Andric 
4005*5ffd83dbSDimitry Andric   assert(Ty.isVector());
4006*5ffd83dbSDimitry Andric 
4007*5ffd83dbSDimitry Andric   if (IsD16) {
4008*5ffd83dbSDimitry Andric     // For packed D16 results with TFE enabled, all the data components are
4009*5ffd83dbSDimitry Andric     // S32. Cast back to the expected type.
4010*5ffd83dbSDimitry Andric     //
4011*5ffd83dbSDimitry Andric     // TODO: We don't really need to load s32 elements. We would only need one
4012*5ffd83dbSDimitry Andric     // cast for the TFE result if a multiple of v2s16 was used.
4013*5ffd83dbSDimitry Andric     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4014*5ffd83dbSDimitry Andric       for (Register &Reg : ResultRegs)
4015*5ffd83dbSDimitry Andric         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4016*5ffd83dbSDimitry Andric     } else if (ST.hasUnpackedD16VMem()) {
4017*5ffd83dbSDimitry Andric       for (Register &Reg : ResultRegs)
4018*5ffd83dbSDimitry Andric         Reg = B.buildTrunc(S16, Reg).getReg(0);
4019*5ffd83dbSDimitry Andric     }
4020*5ffd83dbSDimitry Andric   }
4021*5ffd83dbSDimitry Andric 
4022*5ffd83dbSDimitry Andric   auto padWithUndef = [&](LLT Ty, int NumElts) {
4023*5ffd83dbSDimitry Andric     if (NumElts == 0)
4024*5ffd83dbSDimitry Andric       return;
4025*5ffd83dbSDimitry Andric     Register Undef = B.buildUndef(Ty).getReg(0);
4026*5ffd83dbSDimitry Andric     for (int I = 0; I != NumElts; ++I)
4027*5ffd83dbSDimitry Andric       ResultRegs.push_back(Undef);
4028*5ffd83dbSDimitry Andric   };
4029*5ffd83dbSDimitry Andric 
4030*5ffd83dbSDimitry Andric   // Pad out any elements eliminated due to the dmask.
4031*5ffd83dbSDimitry Andric   LLT ResTy = MRI->getType(ResultRegs[0]);
4032*5ffd83dbSDimitry Andric   if (!ResTy.isVector()) {
4033*5ffd83dbSDimitry Andric     padWithUndef(ResTy, NumElts - ResultRegs.size());
4034*5ffd83dbSDimitry Andric     B.buildBuildVector(DstReg, ResultRegs);
4035*5ffd83dbSDimitry Andric     return true;
4036*5ffd83dbSDimitry Andric   }
4037*5ffd83dbSDimitry Andric 
4038*5ffd83dbSDimitry Andric   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4039*5ffd83dbSDimitry Andric   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4040*5ffd83dbSDimitry Andric 
4041*5ffd83dbSDimitry Andric   // Deal with the one annoying legal case.
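  // <3 x s16> cannot be assembled directly from <2 x s16> pieces, so build a
  // <6 x s16> concat and unmerge it, discarding the unused upper half.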
4042*5ffd83dbSDimitry Andric   const LLT V3S16 = LLT::vector(3, 16);
4043*5ffd83dbSDimitry Andric   if (Ty == V3S16) {
4044*5ffd83dbSDimitry Andric     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4045*5ffd83dbSDimitry Andric     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4046*5ffd83dbSDimitry Andric     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4047*5ffd83dbSDimitry Andric     return true;
4048*5ffd83dbSDimitry Andric   }
4049*5ffd83dbSDimitry Andric 
4050*5ffd83dbSDimitry Andric   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4051*5ffd83dbSDimitry Andric   B.buildConcatVectors(DstReg, ResultRegs);
4052*5ffd83dbSDimitry Andric   return true;
4053*5ffd83dbSDimitry Andric }
4054*5ffd83dbSDimitry Andric 
4055*5ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4056*5ffd83dbSDimitry Andric   MachineInstr &MI, MachineIRBuilder &B,
4057*5ffd83dbSDimitry Andric   GISelChangeObserver &Observer) const {
4058*5ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
4059*5ffd83dbSDimitry Andric   LLT Ty = B.getMRI()->getType(Dst);
4060*5ffd83dbSDimitry Andric   unsigned Size = Ty.getSizeInBits();
4061*5ffd83dbSDimitry Andric   MachineFunction &MF = B.getMF();
4062*5ffd83dbSDimitry Andric 
4063*5ffd83dbSDimitry Andric   Observer.changingInstr(MI);
4064*5ffd83dbSDimitry Andric 
4065*5ffd83dbSDimitry Andric   // FIXME: We don't really need this intermediate instruction. The intrinsic
4066*5ffd83dbSDimitry Andric   // should be fixed to have a memory operand. Since it's readnone, we're not
4067*5ffd83dbSDimitry Andric   // allowed to add one.
4068*5ffd83dbSDimitry Andric   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4069*5ffd83dbSDimitry Andric   MI.RemoveOperand(1); // Remove intrinsic ID
4070*5ffd83dbSDimitry Andric 
4071*5ffd83dbSDimitry Andric   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4072*5ffd83dbSDimitry Andric   // TODO: Should this use datalayout alignment?
4073*5ffd83dbSDimitry Andric   const unsigned MemSize = (Size + 7) / 8;
4074*5ffd83dbSDimitry Andric   const Align MemAlign(4);
4075*5ffd83dbSDimitry Andric   MachineMemOperand *MMO = MF.getMachineMemOperand(
4076*5ffd83dbSDimitry Andric       MachinePointerInfo(),
4077*5ffd83dbSDimitry Andric       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4078*5ffd83dbSDimitry Andric           MachineMemOperand::MOInvariant,
4079*5ffd83dbSDimitry Andric       MemSize, MemAlign);
4080*5ffd83dbSDimitry Andric   MI.addMemOperand(MF, MMO);
4081*5ffd83dbSDimitry Andric 
4082*5ffd83dbSDimitry Andric   // There are no 96-bit result scalar loads, but widening to 128-bit should
4083*5ffd83dbSDimitry Andric   // always be legal. We may need to restore this to a 96-bit result if it turns
4084*5ffd83dbSDimitry Andric   // out this needs to be converted to a vector load during RegBankSelect.
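  // e.g. an s96 result widens to s128, and a <3 x s32> result gains a fourth
  // element.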
4085*5ffd83dbSDimitry Andric   if (!isPowerOf2_32(Size)) {
4086*5ffd83dbSDimitry Andric     LegalizerHelper Helper(MF, *this, Observer, B);
4087*5ffd83dbSDimitry Andric 
4088*5ffd83dbSDimitry Andric     if (Ty.isVector())
4089*5ffd83dbSDimitry Andric       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4090*5ffd83dbSDimitry Andric     else
4091*5ffd83dbSDimitry Andric       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4092*5ffd83dbSDimitry Andric   }
4093*5ffd83dbSDimitry Andric 
4094*5ffd83dbSDimitry Andric   Observer.changedInstr(MI);
4095*5ffd83dbSDimitry Andric   return true;
4096*5ffd83dbSDimitry Andric }
4097*5ffd83dbSDimitry Andric 
4098*5ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
40990b57cec5SDimitry Andric                                                 MachineRegisterInfo &MRI,
41000b57cec5SDimitry Andric                                                 MachineIRBuilder &B) const {
4101*5ffd83dbSDimitry Andric   // On non-HSA paths, or when the trap handler is disabled, insert s_endpgm.
4102*5ffd83dbSDimitry Andric   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4103*5ffd83dbSDimitry Andric       !ST.isTrapHandlerEnabled()) {
4104*5ffd83dbSDimitry Andric     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4105*5ffd83dbSDimitry Andric   } else {
4106*5ffd83dbSDimitry Andric     // Pass queue pointer to trap handler as input, and insert trap instruction
4107*5ffd83dbSDimitry Andric     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4108*5ffd83dbSDimitry Andric     const ArgDescriptor *Arg =
4109*5ffd83dbSDimitry Andric         getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
4110*5ffd83dbSDimitry Andric     if (!Arg)
4111*5ffd83dbSDimitry Andric       return false;
4112*5ffd83dbSDimitry Andric     MachineRegisterInfo &MRI = *B.getMRI();
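    // The HSA trap handler ABI (see the reference above) expects the queue
    // pointer to be live in SGPR0/SGPR1 when s_trap executes.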
4113*5ffd83dbSDimitry Andric     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4114*5ffd83dbSDimitry Andric     Register LiveIn = getLiveInRegister(
4115*5ffd83dbSDimitry Andric         B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
4116*5ffd83dbSDimitry Andric         /*InsertLiveInCopy=*/false);
4117*5ffd83dbSDimitry Andric     if (!loadInputValue(LiveIn, B, Arg))
4118*5ffd83dbSDimitry Andric       return false;
4119*5ffd83dbSDimitry Andric     B.buildCopy(SGPR01, LiveIn);
4120*5ffd83dbSDimitry Andric     B.buildInstr(AMDGPU::S_TRAP)
4121*5ffd83dbSDimitry Andric         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4122*5ffd83dbSDimitry Andric         .addReg(SGPR01, RegState::Implicit);
4123*5ffd83dbSDimitry Andric   }
4124*5ffd83dbSDimitry Andric 
4125*5ffd83dbSDimitry Andric   MI.eraseFromParent();
4126*5ffd83dbSDimitry Andric   return true;
4127*5ffd83dbSDimitry Andric }
4128*5ffd83dbSDimitry Andric 
4129*5ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4130*5ffd83dbSDimitry Andric     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4131*5ffd83dbSDimitry Andric   // On non-HSA paths, or when the trap handler is disabled, report a warning
4132*5ffd83dbSDimitry Andric   // accordingly.
4133*5ffd83dbSDimitry Andric   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4134*5ffd83dbSDimitry Andric       !ST.isTrapHandlerEnabled()) {
4135*5ffd83dbSDimitry Andric     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4136*5ffd83dbSDimitry Andric                                      "debugtrap handler not supported",
4137*5ffd83dbSDimitry Andric                                      MI.getDebugLoc(), DS_Warning);
4138*5ffd83dbSDimitry Andric     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4139*5ffd83dbSDimitry Andric     Ctx.diagnose(NoTrap);
4140*5ffd83dbSDimitry Andric   } else {
4141*5ffd83dbSDimitry Andric     // Insert debug-trap instruction
4142*5ffd83dbSDimitry Andric     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4143*5ffd83dbSDimitry Andric   }
4144*5ffd83dbSDimitry Andric 
4145*5ffd83dbSDimitry Andric   MI.eraseFromParent();
4146*5ffd83dbSDimitry Andric   return true;
4147*5ffd83dbSDimitry Andric }
4148*5ffd83dbSDimitry Andric 
4149*5ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4150*5ffd83dbSDimitry Andric                                             MachineInstr &MI) const {
4151*5ffd83dbSDimitry Andric   MachineIRBuilder &B = Helper.MIRBuilder;
4152*5ffd83dbSDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
4153*5ffd83dbSDimitry Andric 
41540b57cec5SDimitry Andric   // Replace the use of G_BRCOND with exec manipulation and branch pseudos.
4155480093f4SDimitry Andric   auto IntrID = MI.getIntrinsicID();
4156480093f4SDimitry Andric   switch (IntrID) {
4157480093f4SDimitry Andric   case Intrinsic::amdgcn_if:
4158480093f4SDimitry Andric   case Intrinsic::amdgcn_else: {
4159480093f4SDimitry Andric     MachineInstr *Br = nullptr;
4160*5ffd83dbSDimitry Andric     MachineBasicBlock *UncondBrTarget = nullptr;
4161*5ffd83dbSDimitry Andric     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
41620b57cec5SDimitry Andric       const SIRegisterInfo *TRI
41630b57cec5SDimitry Andric         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
41640b57cec5SDimitry Andric 
41650b57cec5SDimitry Andric       Register Def = MI.getOperand(1).getReg();
41660b57cec5SDimitry Andric       Register Use = MI.getOperand(3).getReg();
4167480093f4SDimitry Andric 
4168*5ffd83dbSDimitry Andric       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4169*5ffd83dbSDimitry Andric       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4170480093f4SDimitry Andric       if (IntrID == Intrinsic::amdgcn_if) {
41710b57cec5SDimitry Andric         B.buildInstr(AMDGPU::SI_IF)
41720b57cec5SDimitry Andric           .addDef(Def)
41730b57cec5SDimitry Andric           .addUse(Use)
4174*5ffd83dbSDimitry Andric           .addMBB(UncondBrTarget);
4175480093f4SDimitry Andric       } else {
4176480093f4SDimitry Andric         B.buildInstr(AMDGPU::SI_ELSE)
4177480093f4SDimitry Andric           .addDef(Def)
4178480093f4SDimitry Andric           .addUse(Use)
4179*5ffd83dbSDimitry Andric           .addMBB(UncondBrTarget)
4180480093f4SDimitry Andric           .addImm(0);
4181480093f4SDimitry Andric       }
4182480093f4SDimitry Andric 
4183*5ffd83dbSDimitry Andric       if (Br) {
4184*5ffd83dbSDimitry Andric         Br->getOperand(0).setMBB(CondBrTarget);
4185*5ffd83dbSDimitry Andric       } else {
4186*5ffd83dbSDimitry Andric         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4187*5ffd83dbSDimitry Andric         // since we're swapping branch targets it needs to be reinserted.
4188*5ffd83dbSDimitry Andric         // FIXME: IRTranslator should probably not do this
4189*5ffd83dbSDimitry Andric         B.buildBr(*CondBrTarget);
4190*5ffd83dbSDimitry Andric       }
41910b57cec5SDimitry Andric 
41920b57cec5SDimitry Andric       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
41930b57cec5SDimitry Andric       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
41940b57cec5SDimitry Andric       MI.eraseFromParent();
41950b57cec5SDimitry Andric       BrCond->eraseFromParent();
41960b57cec5SDimitry Andric       return true;
41970b57cec5SDimitry Andric     }
41980b57cec5SDimitry Andric 
41990b57cec5SDimitry Andric     return false;
42000b57cec5SDimitry Andric   }
42010b57cec5SDimitry Andric   case Intrinsic::amdgcn_loop: {
4202480093f4SDimitry Andric     MachineInstr *Br = nullptr;
4203*5ffd83dbSDimitry Andric     MachineBasicBlock *UncondBrTarget = nullptr;
4204*5ffd83dbSDimitry Andric     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
42050b57cec5SDimitry Andric       const SIRegisterInfo *TRI
42060b57cec5SDimitry Andric         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
42070b57cec5SDimitry Andric 
4208*5ffd83dbSDimitry Andric       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
42090b57cec5SDimitry Andric       Register Reg = MI.getOperand(2).getReg();
4210*5ffd83dbSDimitry Andric 
4211*5ffd83dbSDimitry Andric       B.setInsertPt(B.getMBB(), BrCond->getIterator());
42120b57cec5SDimitry Andric       B.buildInstr(AMDGPU::SI_LOOP)
42130b57cec5SDimitry Andric         .addUse(Reg)
4214*5ffd83dbSDimitry Andric         .addMBB(UncondBrTarget);
4215*5ffd83dbSDimitry Andric 
4216*5ffd83dbSDimitry Andric       if (Br)
4217*5ffd83dbSDimitry Andric         Br->getOperand(0).setMBB(CondBrTarget);
4218*5ffd83dbSDimitry Andric       else
4219*5ffd83dbSDimitry Andric         B.buildBr(*CondBrTarget);
4220*5ffd83dbSDimitry Andric 
42210b57cec5SDimitry Andric       MI.eraseFromParent();
42220b57cec5SDimitry Andric       BrCond->eraseFromParent();
42230b57cec5SDimitry Andric       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
42240b57cec5SDimitry Andric       return true;
42250b57cec5SDimitry Andric     }
42260b57cec5SDimitry Andric 
42270b57cec5SDimitry Andric     return false;
42280b57cec5SDimitry Andric   }
42290b57cec5SDimitry Andric   case Intrinsic::amdgcn_kernarg_segment_ptr:
4230*5ffd83dbSDimitry Andric     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4231*5ffd83dbSDimitry Andric       // This only makes sense to call in a kernel, so just lower to null.
4232*5ffd83dbSDimitry Andric       B.buildConstant(MI.getOperand(0).getReg(), 0);
4233*5ffd83dbSDimitry Andric       MI.eraseFromParent();
4234*5ffd83dbSDimitry Andric       return true;
4235*5ffd83dbSDimitry Andric     }
4236*5ffd83dbSDimitry Andric 
42370b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(
42380b57cec5SDimitry Andric       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
42390b57cec5SDimitry Andric   case Intrinsic::amdgcn_implicitarg_ptr:
42400b57cec5SDimitry Andric     return legalizeImplicitArgPtr(MI, MRI, B);
42410b57cec5SDimitry Andric   case Intrinsic::amdgcn_workitem_id_x:
42420b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
42430b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
42440b57cec5SDimitry Andric   case Intrinsic::amdgcn_workitem_id_y:
42450b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
42460b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
42470b57cec5SDimitry Andric   case Intrinsic::amdgcn_workitem_id_z:
42480b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
42490b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
42500b57cec5SDimitry Andric   case Intrinsic::amdgcn_workgroup_id_x:
42510b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
42520b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
42530b57cec5SDimitry Andric   case Intrinsic::amdgcn_workgroup_id_y:
42540b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
42550b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
42560b57cec5SDimitry Andric   case Intrinsic::amdgcn_workgroup_id_z:
42570b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
42580b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
42590b57cec5SDimitry Andric   case Intrinsic::amdgcn_dispatch_ptr:
42600b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
42610b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
42620b57cec5SDimitry Andric   case Intrinsic::amdgcn_queue_ptr:
42630b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
42640b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
42650b57cec5SDimitry Andric   case Intrinsic::amdgcn_implicit_buffer_ptr:
42660b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(
42670b57cec5SDimitry Andric       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
42680b57cec5SDimitry Andric   case Intrinsic::amdgcn_dispatch_id:
42690b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
42700b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
42718bcb0991SDimitry Andric   case Intrinsic::amdgcn_fdiv_fast:
42728bcb0991SDimitry Andric     return legalizeFDIVFastIntrin(MI, MRI, B);
42738bcb0991SDimitry Andric   case Intrinsic::amdgcn_is_shared:
42748bcb0991SDimitry Andric     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
42758bcb0991SDimitry Andric   case Intrinsic::amdgcn_is_private:
42768bcb0991SDimitry Andric     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
42778bcb0991SDimitry Andric   case Intrinsic::amdgcn_wavefrontsize: {
42788bcb0991SDimitry Andric     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
42798bcb0991SDimitry Andric     MI.eraseFromParent();
42808bcb0991SDimitry Andric     return true;
42818bcb0991SDimitry Andric   }
4282*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_s_buffer_load:
4283*5ffd83dbSDimitry Andric     return legalizeSBufferLoad(MI, B, Helper.Observer);
42848bcb0991SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_store:
4285*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_store:
4286*5ffd83dbSDimitry Andric     return legalizeBufferStore(MI, MRI, B, false, false);
42878bcb0991SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_store_format:
4288*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_store_format:
4289*5ffd83dbSDimitry Andric     return legalizeBufferStore(MI, MRI, B, false, true);
4290*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_tbuffer_store:
4291*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_tbuffer_store:
4292*5ffd83dbSDimitry Andric     return legalizeBufferStore(MI, MRI, B, true, true);
4293*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_load:
4294*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_load:
4295*5ffd83dbSDimitry Andric     return legalizeBufferLoad(MI, MRI, B, false, false);
4296*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_load_format:
4297*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_load_format:
4298*5ffd83dbSDimitry Andric     return legalizeBufferLoad(MI, MRI, B, true, false);
4299*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_tbuffer_load:
4300*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_tbuffer_load:
4301*5ffd83dbSDimitry Andric     return legalizeBufferLoad(MI, MRI, B, true, true);
4302*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4303*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4304*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4305*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4306*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4307*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4308*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4309*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4310*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4311*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4312*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4313*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4314*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4315*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4316*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4317*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4318*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4319*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4320*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4321*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4322*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4323*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4324*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4325*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4326*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4327*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4328*5ffd83dbSDimitry Andric     return legalizeBufferAtomic(MI, B, IntrID);
4329*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_atomic_inc:
4330*5ffd83dbSDimitry Andric     return legalizeAtomicIncDec(MI, B, true);
4331*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_atomic_dec:
4332*5ffd83dbSDimitry Andric     return legalizeAtomicIncDec(MI, B, false);
4333*5ffd83dbSDimitry Andric   case Intrinsic::trap:
4334*5ffd83dbSDimitry Andric     return legalizeTrapIntrinsic(MI, MRI, B);
4335*5ffd83dbSDimitry Andric   case Intrinsic::debugtrap:
4336*5ffd83dbSDimitry Andric     return legalizeDebugTrapIntrinsic(MI, MRI, B);
4337*5ffd83dbSDimitry Andric   default: {
4338*5ffd83dbSDimitry Andric     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
4339*5ffd83dbSDimitry Andric             AMDGPU::getImageDimIntrinsicInfo(IntrID))
4340*5ffd83dbSDimitry Andric       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
43410b57cec5SDimitry Andric     return true;
43420b57cec5SDimitry Andric   }
4343*5ffd83dbSDimitry Andric   }
43440b57cec5SDimitry Andric 
43450b57cec5SDimitry Andric   return true;
43460b57cec5SDimitry Andric }