xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (revision 480093f4440d54b30b3025afeac24b48f2ba7a2e)
10b57cec5SDimitry Andric //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
20b57cec5SDimitry Andric //
30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
60b57cec5SDimitry Andric //
70b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
80b57cec5SDimitry Andric /// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
110b57cec5SDimitry Andric /// \todo This should be generated by TableGen.
120b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
130b57cec5SDimitry Andric 
148bcb0991SDimitry Andric #if defined(_MSC_VER) || defined(__MINGW32__)
158bcb0991SDimitry Andric // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
168bcb0991SDimitry Andric // from the Visual C++ cmath / math.h headers:
178bcb0991SDimitry Andric // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
188bcb0991SDimitry Andric #define _USE_MATH_DEFINES
198bcb0991SDimitry Andric #endif
208bcb0991SDimitry Andric 
210b57cec5SDimitry Andric #include "AMDGPU.h"
220b57cec5SDimitry Andric #include "AMDGPULegalizerInfo.h"
230b57cec5SDimitry Andric #include "AMDGPUTargetMachine.h"
240b57cec5SDimitry Andric #include "SIMachineFunctionInfo.h"
250b57cec5SDimitry Andric #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
260b57cec5SDimitry Andric #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
270b57cec5SDimitry Andric #include "llvm/CodeGen/TargetOpcodes.h"
280b57cec5SDimitry Andric #include "llvm/CodeGen/ValueTypes.h"
290b57cec5SDimitry Andric #include "llvm/IR/DerivedTypes.h"
308bcb0991SDimitry Andric #include "llvm/IR/DiagnosticInfo.h"
310b57cec5SDimitry Andric #include "llvm/IR/Type.h"
320b57cec5SDimitry Andric #include "llvm/Support/Debug.h"
330b57cec5SDimitry Andric 
340b57cec5SDimitry Andric #define DEBUG_TYPE "amdgpu-legalinfo"
350b57cec5SDimitry Andric 
360b57cec5SDimitry Andric using namespace llvm;
370b57cec5SDimitry Andric using namespace LegalizeActions;
380b57cec5SDimitry Andric using namespace LegalizeMutations;
390b57cec5SDimitry Andric using namespace LegalityPredicates;
400b57cec5SDimitry Andric 
410b57cec5SDimitry Andric 
420b57cec5SDimitry Andric static LegalityPredicate isMultiple32(unsigned TypeIdx,
438bcb0991SDimitry Andric                                       unsigned MaxSize = 1024) {
440b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
450b57cec5SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
460b57cec5SDimitry Andric     const LLT EltTy = Ty.getScalarType();
470b57cec5SDimitry Andric     return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
480b57cec5SDimitry Andric   };
490b57cec5SDimitry Andric }
500b57cec5SDimitry Andric 
518bcb0991SDimitry Andric static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
528bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
538bcb0991SDimitry Andric     return Query.Types[TypeIdx].getSizeInBits() == Size;
548bcb0991SDimitry Andric   };
558bcb0991SDimitry Andric }
568bcb0991SDimitry Andric 
570b57cec5SDimitry Andric static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
580b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
590b57cec5SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
600b57cec5SDimitry Andric     return Ty.isVector() &&
610b57cec5SDimitry Andric            Ty.getNumElements() % 2 != 0 &&
628bcb0991SDimitry Andric            Ty.getElementType().getSizeInBits() < 32 &&
638bcb0991SDimitry Andric            Ty.getSizeInBits() % 32 != 0;
648bcb0991SDimitry Andric   };
658bcb0991SDimitry Andric }
668bcb0991SDimitry Andric 
678bcb0991SDimitry Andric static LegalityPredicate isWideVec16(unsigned TypeIdx) {
688bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
698bcb0991SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
708bcb0991SDimitry Andric     const LLT EltTy = Ty.getScalarType();
718bcb0991SDimitry Andric     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
720b57cec5SDimitry Andric   };
730b57cec5SDimitry Andric }
740b57cec5SDimitry Andric 
750b57cec5SDimitry Andric static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
760b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
770b57cec5SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
780b57cec5SDimitry Andric     const LLT EltTy = Ty.getElementType();
790b57cec5SDimitry Andric     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
800b57cec5SDimitry Andric   };
810b57cec5SDimitry Andric }
820b57cec5SDimitry Andric 
830b57cec5SDimitry Andric static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
840b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
850b57cec5SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
860b57cec5SDimitry Andric     const LLT EltTy = Ty.getElementType();
870b57cec5SDimitry Andric     unsigned Size = Ty.getSizeInBits();
880b57cec5SDimitry Andric     unsigned Pieces = (Size + 63) / 64;
890b57cec5SDimitry Andric     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
900b57cec5SDimitry Andric     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
910b57cec5SDimitry Andric   };
920b57cec5SDimitry Andric }
930b57cec5SDimitry Andric 
948bcb0991SDimitry Andric // Increase the number of vector elements to reach the next multiple of 32-bit
958bcb0991SDimitry Andric // type.
968bcb0991SDimitry Andric static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
978bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
988bcb0991SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
998bcb0991SDimitry Andric 
1008bcb0991SDimitry Andric     const LLT EltTy = Ty.getElementType();
1018bcb0991SDimitry Andric     const int Size = Ty.getSizeInBits();
1028bcb0991SDimitry Andric     const int EltSize = EltTy.getSizeInBits();
1038bcb0991SDimitry Andric     const int NextMul32 = (Size + 31) / 32;
1048bcb0991SDimitry Andric 
1058bcb0991SDimitry Andric     assert(EltSize < 32);
1068bcb0991SDimitry Andric 
1078bcb0991SDimitry Andric     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
1088bcb0991SDimitry Andric     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
1098bcb0991SDimitry Andric   };
1108bcb0991SDimitry Andric }
1118bcb0991SDimitry Andric 
1128bcb0991SDimitry Andric static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
1138bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
1148bcb0991SDimitry Andric     const LLT QueryTy = Query.Types[TypeIdx];
1158bcb0991SDimitry Andric     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
1168bcb0991SDimitry Andric   };
1178bcb0991SDimitry Andric }
1188bcb0991SDimitry Andric 
1190b57cec5SDimitry Andric static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
1200b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
1210b57cec5SDimitry Andric     const LLT QueryTy = Query.Types[TypeIdx];
1220b57cec5SDimitry Andric     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
1230b57cec5SDimitry Andric   };
1240b57cec5SDimitry Andric }
1250b57cec5SDimitry Andric 
1260b57cec5SDimitry Andric static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
1270b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
1280b57cec5SDimitry Andric     const LLT QueryTy = Query.Types[TypeIdx];
1290b57cec5SDimitry Andric     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
1300b57cec5SDimitry Andric   };
1310b57cec5SDimitry Andric }
1320b57cec5SDimitry Andric 
1338bcb0991SDimitry Andric // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
1340b57cec5SDimitry Andric // v2s16.
1350b57cec5SDimitry Andric static LegalityPredicate isRegisterType(unsigned TypeIdx) {
1360b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
1370b57cec5SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
1380b57cec5SDimitry Andric     if (Ty.isVector()) {
1390b57cec5SDimitry Andric       const int EltSize = Ty.getElementType().getSizeInBits();
1400b57cec5SDimitry Andric       return EltSize == 32 || EltSize == 64 ||
1410b57cec5SDimitry Andric             (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
1420b57cec5SDimitry Andric              EltSize == 128 || EltSize == 256;
1430b57cec5SDimitry Andric     }
1440b57cec5SDimitry Andric 
1458bcb0991SDimitry Andric     return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
1468bcb0991SDimitry Andric   };
1478bcb0991SDimitry Andric }
1488bcb0991SDimitry Andric 
1498bcb0991SDimitry Andric static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
1508bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
1518bcb0991SDimitry Andric     return Query.Types[TypeIdx].getElementType() == Type;
1528bcb0991SDimitry Andric   };
1538bcb0991SDimitry Andric }
1548bcb0991SDimitry Andric 
1558bcb0991SDimitry Andric static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
1568bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
1578bcb0991SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
1588bcb0991SDimitry Andric     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
1598bcb0991SDimitry Andric            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
1600b57cec5SDimitry Andric   };
1610b57cec5SDimitry Andric }
1620b57cec5SDimitry Andric 
1630b57cec5SDimitry Andric AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
1640b57cec5SDimitry Andric                                          const GCNTargetMachine &TM)
1650b57cec5SDimitry Andric   :  ST(ST_) {
1660b57cec5SDimitry Andric   using namespace TargetOpcode;
1670b57cec5SDimitry Andric 
1680b57cec5SDimitry Andric   auto GetAddrSpacePtr = [&TM](unsigned AS) {
1690b57cec5SDimitry Andric     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
1700b57cec5SDimitry Andric   };
1710b57cec5SDimitry Andric 
1720b57cec5SDimitry Andric   const LLT S1 = LLT::scalar(1);
1730b57cec5SDimitry Andric   const LLT S8 = LLT::scalar(8);
1740b57cec5SDimitry Andric   const LLT S16 = LLT::scalar(16);
1750b57cec5SDimitry Andric   const LLT S32 = LLT::scalar(32);
1760b57cec5SDimitry Andric   const LLT S64 = LLT::scalar(64);
1778bcb0991SDimitry Andric   const LLT S96 = LLT::scalar(96);
1780b57cec5SDimitry Andric   const LLT S128 = LLT::scalar(128);
1790b57cec5SDimitry Andric   const LLT S256 = LLT::scalar(256);
1808bcb0991SDimitry Andric   const LLT S1024 = LLT::scalar(1024);
1810b57cec5SDimitry Andric 
1820b57cec5SDimitry Andric   const LLT V2S16 = LLT::vector(2, 16);
1830b57cec5SDimitry Andric   const LLT V4S16 = LLT::vector(4, 16);
1840b57cec5SDimitry Andric 
1850b57cec5SDimitry Andric   const LLT V2S32 = LLT::vector(2, 32);
1860b57cec5SDimitry Andric   const LLT V3S32 = LLT::vector(3, 32);
1870b57cec5SDimitry Andric   const LLT V4S32 = LLT::vector(4, 32);
1880b57cec5SDimitry Andric   const LLT V5S32 = LLT::vector(5, 32);
1890b57cec5SDimitry Andric   const LLT V6S32 = LLT::vector(6, 32);
1900b57cec5SDimitry Andric   const LLT V7S32 = LLT::vector(7, 32);
1910b57cec5SDimitry Andric   const LLT V8S32 = LLT::vector(8, 32);
1920b57cec5SDimitry Andric   const LLT V9S32 = LLT::vector(9, 32);
1930b57cec5SDimitry Andric   const LLT V10S32 = LLT::vector(10, 32);
1940b57cec5SDimitry Andric   const LLT V11S32 = LLT::vector(11, 32);
1950b57cec5SDimitry Andric   const LLT V12S32 = LLT::vector(12, 32);
1960b57cec5SDimitry Andric   const LLT V13S32 = LLT::vector(13, 32);
1970b57cec5SDimitry Andric   const LLT V14S32 = LLT::vector(14, 32);
1980b57cec5SDimitry Andric   const LLT V15S32 = LLT::vector(15, 32);
1990b57cec5SDimitry Andric   const LLT V16S32 = LLT::vector(16, 32);
2008bcb0991SDimitry Andric   const LLT V32S32 = LLT::vector(32, 32);
2010b57cec5SDimitry Andric 
2020b57cec5SDimitry Andric   const LLT V2S64 = LLT::vector(2, 64);
2030b57cec5SDimitry Andric   const LLT V3S64 = LLT::vector(3, 64);
2040b57cec5SDimitry Andric   const LLT V4S64 = LLT::vector(4, 64);
2050b57cec5SDimitry Andric   const LLT V5S64 = LLT::vector(5, 64);
2060b57cec5SDimitry Andric   const LLT V6S64 = LLT::vector(6, 64);
2070b57cec5SDimitry Andric   const LLT V7S64 = LLT::vector(7, 64);
2080b57cec5SDimitry Andric   const LLT V8S64 = LLT::vector(8, 64);
2098bcb0991SDimitry Andric   const LLT V16S64 = LLT::vector(16, 64);
2100b57cec5SDimitry Andric 
2110b57cec5SDimitry Andric   std::initializer_list<LLT> AllS32Vectors =
2120b57cec5SDimitry Andric     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
2138bcb0991SDimitry Andric      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
2140b57cec5SDimitry Andric   std::initializer_list<LLT> AllS64Vectors =
2158bcb0991SDimitry Andric     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
2160b57cec5SDimitry Andric 
2170b57cec5SDimitry Andric   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
2180b57cec5SDimitry Andric   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
2198bcb0991SDimitry Andric   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
2200b57cec5SDimitry Andric   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
2218bcb0991SDimitry Andric   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
2220b57cec5SDimitry Andric   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
2230b57cec5SDimitry Andric   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
2240b57cec5SDimitry Andric 
2250b57cec5SDimitry Andric   const LLT CodePtr = FlatPtr;
2260b57cec5SDimitry Andric 
2270b57cec5SDimitry Andric   const std::initializer_list<LLT> AddrSpaces64 = {
2280b57cec5SDimitry Andric     GlobalPtr, ConstantPtr, FlatPtr
2290b57cec5SDimitry Andric   };
2300b57cec5SDimitry Andric 
2310b57cec5SDimitry Andric   const std::initializer_list<LLT> AddrSpaces32 = {
2328bcb0991SDimitry Andric     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
2330b57cec5SDimitry Andric   };
2340b57cec5SDimitry Andric 
2350b57cec5SDimitry Andric   const std::initializer_list<LLT> FPTypesBase = {
2360b57cec5SDimitry Andric     S32, S64
2370b57cec5SDimitry Andric   };
2380b57cec5SDimitry Andric 
2390b57cec5SDimitry Andric   const std::initializer_list<LLT> FPTypes16 = {
2400b57cec5SDimitry Andric     S32, S64, S16
2410b57cec5SDimitry Andric   };
2420b57cec5SDimitry Andric 
2430b57cec5SDimitry Andric   const std::initializer_list<LLT> FPTypesPK16 = {
2440b57cec5SDimitry Andric     S32, S64, S16, V2S16
2450b57cec5SDimitry Andric   };
2460b57cec5SDimitry Andric 
247*480093f4SDimitry Andric   setAction({G_BRCOND, S1}, Legal); // VCC branches
248*480093f4SDimitry Andric   setAction({G_BRCOND, S32}, Legal); // SCC branches
2490b57cec5SDimitry Andric 
2500b57cec5SDimitry Andric   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
2510b57cec5SDimitry Andric   // elements for v3s16
2520b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_PHI)
2530b57cec5SDimitry Andric     .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
2540b57cec5SDimitry Andric     .legalFor(AllS32Vectors)
2550b57cec5SDimitry Andric     .legalFor(AllS64Vectors)
2560b57cec5SDimitry Andric     .legalFor(AddrSpaces64)
2570b57cec5SDimitry Andric     .legalFor(AddrSpaces32)
2580b57cec5SDimitry Andric     .clampScalar(0, S32, S256)
2590b57cec5SDimitry Andric     .widenScalarToNextPow2(0, 32)
2600b57cec5SDimitry Andric     .clampMaxNumElements(0, S32, 16)
2610b57cec5SDimitry Andric     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
2620b57cec5SDimitry Andric     .legalIf(isPointer(0));
2630b57cec5SDimitry Andric 
2640b57cec5SDimitry Andric   if (ST.has16BitInsts()) {
2650b57cec5SDimitry Andric     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
2660b57cec5SDimitry Andric       .legalFor({S32, S16})
2670b57cec5SDimitry Andric       .clampScalar(0, S16, S32)
2680b57cec5SDimitry Andric       .scalarize(0);
2690b57cec5SDimitry Andric   } else {
2700b57cec5SDimitry Andric     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
2710b57cec5SDimitry Andric       .legalFor({S32})
2720b57cec5SDimitry Andric       .clampScalar(0, S32, S32)
2730b57cec5SDimitry Andric       .scalarize(0);
2740b57cec5SDimitry Andric   }
2750b57cec5SDimitry Andric 
276*480093f4SDimitry Andric   // FIXME: Not really legal. Placeholder for custom lowering.
277*480093f4SDimitry Andric   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
278*480093f4SDimitry Andric     .legalFor({S32, S64})
279*480093f4SDimitry Andric     .clampScalar(0, S32, S64)
280*480093f4SDimitry Andric     .widenScalarToNextPow2(0, 32)
281*480093f4SDimitry Andric     .scalarize(0);
282*480093f4SDimitry Andric 
2830b57cec5SDimitry Andric   getActionDefinitionsBuilder({G_UMULH, G_SMULH})
2840b57cec5SDimitry Andric     .legalFor({S32})
2850b57cec5SDimitry Andric     .clampScalar(0, S32, S32)
2860b57cec5SDimitry Andric     .scalarize(0);
2870b57cec5SDimitry Andric 
2880b57cec5SDimitry Andric   // Report legal for any types we can handle anywhere. For the cases only legal
2890b57cec5SDimitry Andric   // on the SALU, RegBankSelect will be able to re-legalize.
2900b57cec5SDimitry Andric   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
2910b57cec5SDimitry Andric     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
2920b57cec5SDimitry Andric     .clampScalar(0, S32, S64)
2930b57cec5SDimitry Andric     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
2948bcb0991SDimitry Andric     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
2950b57cec5SDimitry Andric     .widenScalarToNextPow2(0)
2960b57cec5SDimitry Andric     .scalarize(0);
2970b57cec5SDimitry Andric 
2988bcb0991SDimitry Andric   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
2990b57cec5SDimitry Andric                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
300*480093f4SDimitry Andric     .legalFor({{S32, S1}, {S32, S32}})
3018bcb0991SDimitry Andric     .clampScalar(0, S32, S32)
3028bcb0991SDimitry Andric     .scalarize(0); // TODO: Implement.
3038bcb0991SDimitry Andric 
3048bcb0991SDimitry Andric   getActionDefinitionsBuilder({G_SADDO, G_SSUBO})
3058bcb0991SDimitry Andric     .lower();
3060b57cec5SDimitry Andric 
3070b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_BITCAST)
3080b57cec5SDimitry Andric     // Don't worry about the size constraint.
3098bcb0991SDimitry Andric     .legalIf(all(isRegisterType(0), isRegisterType(1)))
3108bcb0991SDimitry Andric     // FIXME: Testing hack
3118bcb0991SDimitry Andric     .legalForCartesianProduct({S16, LLT::vector(2, 8), });
3120b57cec5SDimitry Andric 
3130b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_FCONSTANT)
3140b57cec5SDimitry Andric     .legalFor({S32, S64, S16})
3150b57cec5SDimitry Andric     .clampScalar(0, S16, S64);
3160b57cec5SDimitry Andric 
3170b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_IMPLICIT_DEF)
3188bcb0991SDimitry Andric     .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
3190b57cec5SDimitry Andric                ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
3200b57cec5SDimitry Andric     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
3218bcb0991SDimitry Andric     .clampScalarOrElt(0, S32, S1024)
3220b57cec5SDimitry Andric     .legalIf(isMultiple32(0))
3230b57cec5SDimitry Andric     .widenScalarToNextPow2(0, 32)
3240b57cec5SDimitry Andric     .clampMaxNumElements(0, S32, 16);
3250b57cec5SDimitry Andric 
3260b57cec5SDimitry Andric 
3270b57cec5SDimitry Andric   // FIXME: i1 operands to intrinsics should always be legal, but other i1
3280b57cec5SDimitry Andric   // values may not be legal.  We need to figure out how to distinguish
3290b57cec5SDimitry Andric   // between these two scenarios.
3300b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_CONSTANT)
3318bcb0991SDimitry Andric     .legalFor({S1, S32, S64, S16, GlobalPtr,
3320b57cec5SDimitry Andric                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
3330b57cec5SDimitry Andric     .clampScalar(0, S32, S64)
3340b57cec5SDimitry Andric     .widenScalarToNextPow2(0)
3350b57cec5SDimitry Andric     .legalIf(isPointer(0));
3360b57cec5SDimitry Andric 
3370b57cec5SDimitry Andric   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
3388bcb0991SDimitry Andric   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
3398bcb0991SDimitry Andric     .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});
3408bcb0991SDimitry Andric 
3410b57cec5SDimitry Andric 
3420b57cec5SDimitry Andric   auto &FPOpActions = getActionDefinitionsBuilder(
3438bcb0991SDimitry Andric     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
3440b57cec5SDimitry Andric     .legalFor({S32, S64});
3458bcb0991SDimitry Andric   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
3468bcb0991SDimitry Andric     .customFor({S32, S64});
3478bcb0991SDimitry Andric   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
3488bcb0991SDimitry Andric     .customFor({S32, S64});
3490b57cec5SDimitry Andric 
3500b57cec5SDimitry Andric   if (ST.has16BitInsts()) {
3510b57cec5SDimitry Andric     if (ST.hasVOP3PInsts())
3520b57cec5SDimitry Andric       FPOpActions.legalFor({S16, V2S16});
3530b57cec5SDimitry Andric     else
3540b57cec5SDimitry Andric       FPOpActions.legalFor({S16});
3558bcb0991SDimitry Andric 
3568bcb0991SDimitry Andric     TrigActions.customFor({S16});
3578bcb0991SDimitry Andric     FDIVActions.customFor({S16});
3580b57cec5SDimitry Andric   }
3590b57cec5SDimitry Andric 
3600b57cec5SDimitry Andric   auto &MinNumMaxNum = getActionDefinitionsBuilder({
3610b57cec5SDimitry Andric       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
3620b57cec5SDimitry Andric 
3630b57cec5SDimitry Andric   if (ST.hasVOP3PInsts()) {
3640b57cec5SDimitry Andric     MinNumMaxNum.customFor(FPTypesPK16)
365*480093f4SDimitry Andric       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
3660b57cec5SDimitry Andric       .clampMaxNumElements(0, S16, 2)
3670b57cec5SDimitry Andric       .clampScalar(0, S16, S64)
3680b57cec5SDimitry Andric       .scalarize(0);
3690b57cec5SDimitry Andric   } else if (ST.has16BitInsts()) {
3700b57cec5SDimitry Andric     MinNumMaxNum.customFor(FPTypes16)
3710b57cec5SDimitry Andric       .clampScalar(0, S16, S64)
3720b57cec5SDimitry Andric       .scalarize(0);
3730b57cec5SDimitry Andric   } else {
3740b57cec5SDimitry Andric     MinNumMaxNum.customFor(FPTypesBase)
3750b57cec5SDimitry Andric       .clampScalar(0, S32, S64)
3760b57cec5SDimitry Andric       .scalarize(0);
3770b57cec5SDimitry Andric   }
3780b57cec5SDimitry Andric 
3790b57cec5SDimitry Andric   if (ST.hasVOP3PInsts())
3800b57cec5SDimitry Andric     FPOpActions.clampMaxNumElements(0, S16, 2);
3818bcb0991SDimitry Andric 
3820b57cec5SDimitry Andric   FPOpActions
3830b57cec5SDimitry Andric     .scalarize(0)
3840b57cec5SDimitry Andric     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
3850b57cec5SDimitry Andric 
3868bcb0991SDimitry Andric   TrigActions
3878bcb0991SDimitry Andric     .scalarize(0)
3888bcb0991SDimitry Andric     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
3898bcb0991SDimitry Andric 
3908bcb0991SDimitry Andric   FDIVActions
3918bcb0991SDimitry Andric     .scalarize(0)
3928bcb0991SDimitry Andric     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
3938bcb0991SDimitry Andric 
3948bcb0991SDimitry Andric   getActionDefinitionsBuilder({G_FNEG, G_FABS})
3958bcb0991SDimitry Andric     .legalFor(FPTypesPK16)
3968bcb0991SDimitry Andric     .clampMaxNumElements(0, S16, 2)
3978bcb0991SDimitry Andric     .scalarize(0)
3988bcb0991SDimitry Andric     .clampScalar(0, S16, S64);
3998bcb0991SDimitry Andric 
4008bcb0991SDimitry Andric   // TODO: Implement
4018bcb0991SDimitry Andric   getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
4028bcb0991SDimitry Andric 
4030b57cec5SDimitry Andric   if (ST.has16BitInsts()) {
4048bcb0991SDimitry Andric     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
4050b57cec5SDimitry Andric       .legalFor({S32, S64, S16})
4060b57cec5SDimitry Andric       .scalarize(0)
4070b57cec5SDimitry Andric       .clampScalar(0, S16, S64);
4080b57cec5SDimitry Andric   } else {
4098bcb0991SDimitry Andric     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
4100b57cec5SDimitry Andric       .legalFor({S32, S64})
4110b57cec5SDimitry Andric       .scalarize(0)
4120b57cec5SDimitry Andric       .clampScalar(0, S32, S64);
4130b57cec5SDimitry Andric   }
4140b57cec5SDimitry Andric 
4150b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_FPTRUNC)
4160b57cec5SDimitry Andric     .legalFor({{S32, S64}, {S16, S32}})
4170b57cec5SDimitry Andric     .scalarize(0);
4180b57cec5SDimitry Andric 
4190b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_FPEXT)
4200b57cec5SDimitry Andric     .legalFor({{S64, S32}, {S32, S16}})
4210b57cec5SDimitry Andric     .lowerFor({{S64, S16}}) // FIXME: Implement
4220b57cec5SDimitry Andric     .scalarize(0);
4230b57cec5SDimitry Andric 
4240b57cec5SDimitry Andric   // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
4250b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_FCOPYSIGN).lower();
4260b57cec5SDimitry Andric 
4270b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_FSUB)
4280b57cec5SDimitry Andric       // Use actual fsub instruction
4290b57cec5SDimitry Andric       .legalFor({S32})
4300b57cec5SDimitry Andric       // Must use fadd + fneg
4310b57cec5SDimitry Andric       .lowerFor({S64, S16, V2S16})
4320b57cec5SDimitry Andric       .scalarize(0)
4330b57cec5SDimitry Andric       .clampScalar(0, S32, S64);
4340b57cec5SDimitry Andric 
4358bcb0991SDimitry Andric   // Whether this is legal depends on the floating point mode for the function.
4368bcb0991SDimitry Andric   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
4378bcb0991SDimitry Andric   if (ST.hasMadF16())
4388bcb0991SDimitry Andric     FMad.customFor({S32, S16});
4398bcb0991SDimitry Andric   else
4408bcb0991SDimitry Andric     FMad.customFor({S32});
4418bcb0991SDimitry Andric   FMad.scalarize(0)
4428bcb0991SDimitry Andric       .lower();
4438bcb0991SDimitry Andric 
4440b57cec5SDimitry Andric   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
4450b57cec5SDimitry Andric     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
4460b57cec5SDimitry Andric                {S32, S1}, {S64, S1}, {S16, S1},
4478bcb0991SDimitry Andric                {S96, S32},
4480b57cec5SDimitry Andric                // FIXME: Hack
4490b57cec5SDimitry Andric                {S64, LLT::scalar(33)},
450*480093f4SDimitry Andric                {S32, S8}, {S32, LLT::scalar(24)}})
451*480093f4SDimitry Andric     .scalarize(0)
452*480093f4SDimitry Andric     .clampScalar(0, S32, S64);
4530b57cec5SDimitry Andric 
4548bcb0991SDimitry Andric   // TODO: Split s1->s64 during regbankselect for VALU.
4558bcb0991SDimitry Andric   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
456*480093f4SDimitry Andric     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
4570b57cec5SDimitry Andric     .lowerFor({{S32, S64}})
458*480093f4SDimitry Andric     .lowerIf(typeIs(1, S1))
4598bcb0991SDimitry Andric     .customFor({{S64, S64}});
4608bcb0991SDimitry Andric   if (ST.has16BitInsts())
4618bcb0991SDimitry Andric     IToFP.legalFor({{S16, S16}});
4628bcb0991SDimitry Andric   IToFP.clampScalar(1, S32, S64)
4630b57cec5SDimitry Andric        .scalarize(0);
4640b57cec5SDimitry Andric 
4658bcb0991SDimitry Andric   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
4668bcb0991SDimitry Andric     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}});
4678bcb0991SDimitry Andric   if (ST.has16BitInsts())
4688bcb0991SDimitry Andric     FPToI.legalFor({{S16, S16}});
4698bcb0991SDimitry Andric   else
4708bcb0991SDimitry Andric     FPToI.minScalar(1, S32);
4718bcb0991SDimitry Andric 
4728bcb0991SDimitry Andric   FPToI.minScalar(0, S32)
4730b57cec5SDimitry Andric        .scalarize(0);
4740b57cec5SDimitry Andric 
4750b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
476*480093f4SDimitry Andric     .scalarize(0)
477*480093f4SDimitry Andric     .lower();
4780b57cec5SDimitry Andric 
479*480093f4SDimitry Andric   if (ST.has16BitInsts()) {
480*480093f4SDimitry Andric     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
481*480093f4SDimitry Andric       .legalFor({S16, S32, S64})
482*480093f4SDimitry Andric       .clampScalar(0, S16, S64)
483*480093f4SDimitry Andric       .scalarize(0);
484*480093f4SDimitry Andric   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
4850b57cec5SDimitry Andric     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
4860b57cec5SDimitry Andric       .legalFor({S32, S64})
4870b57cec5SDimitry Andric       .clampScalar(0, S32, S64)
4880b57cec5SDimitry Andric       .scalarize(0);
4890b57cec5SDimitry Andric   } else {
4900b57cec5SDimitry Andric     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
4910b57cec5SDimitry Andric       .legalFor({S32})
4920b57cec5SDimitry Andric       .customFor({S64})
4930b57cec5SDimitry Andric       .clampScalar(0, S32, S64)
4940b57cec5SDimitry Andric       .scalarize(0);
4950b57cec5SDimitry Andric   }
4960b57cec5SDimitry Andric 
497*480093f4SDimitry Andric   getActionDefinitionsBuilder(G_PTR_ADD)
4980b57cec5SDimitry Andric     .legalForCartesianProduct(AddrSpaces64, {S64})
4990b57cec5SDimitry Andric     .legalForCartesianProduct(AddrSpaces32, {S32})
5000b57cec5SDimitry Andric     .scalarize(0);
5010b57cec5SDimitry Andric 
5028bcb0991SDimitry Andric   getActionDefinitionsBuilder(G_PTR_MASK)
5038bcb0991SDimitry Andric     .scalarize(0)
5048bcb0991SDimitry Andric     .alwaysLegal();
5058bcb0991SDimitry Andric 
5060b57cec5SDimitry Andric   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
5070b57cec5SDimitry Andric 
5080b57cec5SDimitry Andric   auto &CmpBuilder =
5090b57cec5SDimitry Andric     getActionDefinitionsBuilder(G_ICMP)
510*480093f4SDimitry Andric     // The compare output type differs based on the register bank of the output,
511*480093f4SDimitry Andric     // so make both s1 and s32 legal.
512*480093f4SDimitry Andric     //
513*480093f4SDimitry Andric     // Scalar compares producing output in scc will be promoted to s32, as that
514*480093f4SDimitry Andric     // is the allocatable register type that will be needed for the copy from
515*480093f4SDimitry Andric     // scc. This will be promoted during RegBankSelect, and we assume something
516*480093f4SDimitry Andric     // before that won't try to use s32 result types.
517*480093f4SDimitry Andric     //
518*480093f4SDimitry Andric     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
519*480093f4SDimitry Andric     // bank.
5200b57cec5SDimitry Andric     .legalForCartesianProduct(
5210b57cec5SDimitry Andric       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
522*480093f4SDimitry Andric     .legalForCartesianProduct(
523*480093f4SDimitry Andric       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
5240b57cec5SDimitry Andric   if (ST.has16BitInsts()) {
5250b57cec5SDimitry Andric     CmpBuilder.legalFor({{S1, S16}});
5260b57cec5SDimitry Andric   }
5270b57cec5SDimitry Andric 
5280b57cec5SDimitry Andric   CmpBuilder
5290b57cec5SDimitry Andric     .widenScalarToNextPow2(1)
5300b57cec5SDimitry Andric     .clampScalar(1, S32, S64)
5310b57cec5SDimitry Andric     .scalarize(0)
532*480093f4SDimitry Andric     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
5330b57cec5SDimitry Andric 
5340b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_FCMP)
    // Only an s1 result is listed as legal. The operand type list depends on
    // the subtarget: FPTypes16 with 16-bit instructions, FPTypesBase without.
    // Other operand types are widened to a power of two and clamped to
    // [s32, s64]; vector results are scalarized.
5350b57cec5SDimitry Andric     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
5360b57cec5SDimitry Andric     .widenScalarToNextPow2(1)
5370b57cec5SDimitry Andric     .clampScalar(1, S32, S64)
5380b57cec5SDimitry Andric     .scalarize(0);
5390b57cec5SDimitry Andric 
5400b57cec5SDimitry Andric   // FIXME: fexp, flog2, flog10 needs to be custom lowered.
  // For now all six opcodes share one rule set: s32 is legal and vectors are
  // scalarized. No rule is given here for other scalar widths.
5410b57cec5SDimitry Andric   getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
5420b57cec5SDimitry Andric                                G_FLOG, G_FLOG2, G_FLOG10})
5430b57cec5SDimitry Andric     .legalFor({S32})
5440b57cec5SDimitry Andric     .scalarize(0);
5450b57cec5SDimitry Andric 
5460b57cec5SDimitry Andric   // The 64-bit versions produce 32-bit results, but only on the SALU.
  // Legal pairs are {result, source}: an s32 result from an s32 or s64 source.
  // The result type (index 0) is pinned to s32 and the source type (index 1)
  // is clamped to [s32, s64] before vectors are scalarized.
5470b57cec5SDimitry Andric   getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
5480b57cec5SDimitry Andric                                G_CTTZ, G_CTTZ_ZERO_UNDEF,
5490b57cec5SDimitry Andric                                G_CTPOP})
5500b57cec5SDimitry Andric     .legalFor({{S32, S32}, {S32, S64}})
5510b57cec5SDimitry Andric     .clampScalar(0, S32, S32)
5520b57cec5SDimitry Andric     .clampScalar(1, S32, S64)
5530b57cec5SDimitry Andric     .scalarize(0)
5540b57cec5SDimitry Andric     .widenScalarToNextPow2(0, 32)
5550b57cec5SDimitry Andric     .widenScalarToNextPow2(1, 32);
5560b57cec5SDimitry Andric 
5570b57cec5SDimitry Andric   // TODO: Expand for > s32
  // Only s32 is legal; every other scalar is forced to s32 by the clamp and
  // vectors are scalarized.
5588bcb0991SDimitry Andric   getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
5590b57cec5SDimitry Andric     .legalFor({S32})
5600b57cec5SDimitry Andric     .clampScalar(0, S32, S32)
5610b57cec5SDimitry Andric     .scalarize(0);
5620b57cec5SDimitry Andric 
5630b57cec5SDimitry Andric   if (ST.has16BitInsts()) {
    // Integer min/max rules come in three variants selected by subtarget
    // features: 16-bit + VOP3P (adds V2S16), 16-bit only, and 32-bit only.
    // NOTE(review): the relative order of clampScalar and
    // widenScalarToNextPow2 differs between the three variants; confirm the
    // difference is intentional.
5640b57cec5SDimitry Andric     if (ST.hasVOP3PInsts()) {
5650b57cec5SDimitry Andric       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
5660b57cec5SDimitry Andric         .legalFor({S32, S16, V2S16})
5670b57cec5SDimitry Andric         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
5680b57cec5SDimitry Andric         .clampMaxNumElements(0, S16, 2)
5690b57cec5SDimitry Andric         .clampScalar(0, S16, S32)
5700b57cec5SDimitry Andric         .widenScalarToNextPow2(0)
5710b57cec5SDimitry Andric         .scalarize(0);
5720b57cec5SDimitry Andric     } else {
5730b57cec5SDimitry Andric       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
5740b57cec5SDimitry Andric         .legalFor({S32, S16})
5750b57cec5SDimitry Andric         .widenScalarToNextPow2(0)
5760b57cec5SDimitry Andric         .clampScalar(0, S16, S32)
5770b57cec5SDimitry Andric         .scalarize(0);
5780b57cec5SDimitry Andric     }
5790b57cec5SDimitry Andric   } else {
5800b57cec5SDimitry Andric     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
5810b57cec5SDimitry Andric       .legalFor({S32})
5820b57cec5SDimitry Andric       .clampScalar(0, S32, S32)
5830b57cec5SDimitry Andric       .widenScalarToNextPow2(0)
5840b57cec5SDimitry Andric       .scalarize(0);
5850b57cec5SDimitry Andric   }
5860b57cec5SDimitry Andric 
5870b57cec5SDimitry Andric   auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    // Predicate factory: the returned callable is true when the type at
    // index TypeIdx0 of the query has strictly fewer bits than the type at
    // index TypeIdx1. Both indices are captured by value.
5880b57cec5SDimitry Andric     return [=](const LegalityQuery &Query) {
5890b57cec5SDimitry Andric       return Query.Types[TypeIdx0].getSizeInBits() <
5900b57cec5SDimitry Andric              Query.Types[TypeIdx1].getSizeInBits();
5910b57cec5SDimitry Andric     };
5920b57cec5SDimitry Andric   };
5930b57cec5SDimitry Andric 
5940b57cec5SDimitry Andric   auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
    // Predicate factory: the returned callable is true when the type at
    // index TypeIdx0 of the query has strictly more bits than the type at
    // index TypeIdx1. Both indices are captured by value.
5950b57cec5SDimitry Andric     return [=](const LegalityQuery &Query) {
5960b57cec5SDimitry Andric       return Query.Types[TypeIdx0].getSizeInBits() >
5970b57cec5SDimitry Andric              Query.Types[TypeIdx1].getSizeInBits();
5980b57cec5SDimitry Andric     };
5990b57cec5SDimitry Andric   };
6000b57cec5SDimitry Andric 
6010b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_INTTOPTR)
    // Type index 0 is the pointer result, type index 1 the integer source.
6020b57cec5SDimitry Andric     // List the common cases
6030b57cec5SDimitry Andric     .legalForCartesianProduct(AddrSpaces64, {S64})
6040b57cec5SDimitry Andric     .legalForCartesianProduct(AddrSpaces32, {S32})
6050b57cec5SDimitry Andric     .scalarize(0)
6060b57cec5SDimitry Andric     // Accept any address space as long as the size matches
6070b57cec5SDimitry Andric     .legalIf(sameSize(0, 1))
    // Otherwise resize the integer source (index 1) to a scalar with exactly
    // the pointer's bit width: widen when it is narrower, narrow when wider.
6080b57cec5SDimitry Andric     .widenScalarIf(smallerThan(1, 0),
6090b57cec5SDimitry Andric       [](const LegalityQuery &Query) {
6100b57cec5SDimitry Andric         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
6110b57cec5SDimitry Andric       })
6120b57cec5SDimitry Andric     .narrowScalarIf(greaterThan(1, 0),
6130b57cec5SDimitry Andric       [](const LegalityQuery &Query) {
6140b57cec5SDimitry Andric         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
6150b57cec5SDimitry Andric       });
6160b57cec5SDimitry Andric 
6170b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_PTRTOINT)
    // The widen/narrow rules at the end resize type index 0 to a scalar of
    // type index 1's width, i.e. they treat index 0 as the integer result
    // and index 1 as the pointer source.
    // NOTE(review): the two products below put the address-space pointer
    // lists at index 0 and the integer at index 1, which looks transposed
    // for this opcode; if so they never match and the sameSize rule is what
    // actually accepts the common cases. Confirm before relying on them.
6180b57cec5SDimitry Andric     // List the common cases
6190b57cec5SDimitry Andric     .legalForCartesianProduct(AddrSpaces64, {S64})
6200b57cec5SDimitry Andric     .legalForCartesianProduct(AddrSpaces32, {S32})
6210b57cec5SDimitry Andric     .scalarize(0)
6220b57cec5SDimitry Andric     // Accept any address space as long as the size matches
6230b57cec5SDimitry Andric     .legalIf(sameSize(0, 1))
6240b57cec5SDimitry Andric     .widenScalarIf(smallerThan(0, 1),
6250b57cec5SDimitry Andric       [](const LegalityQuery &Query) {
6260b57cec5SDimitry Andric         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
6270b57cec5SDimitry Andric       })
6280b57cec5SDimitry Andric     .narrowScalarIf(
6290b57cec5SDimitry Andric       greaterThan(0, 1),
6300b57cec5SDimitry Andric       [](const LegalityQuery &Query) {
6310b57cec5SDimitry Andric         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
6320b57cec5SDimitry Andric       });
6330b57cec5SDimitry Andric 
6340b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    // Vector casts are scalarized; every scalar cast is routed to the
    // target's custom legalization hook (not visible in this part of the
    // file).
6350b57cec5SDimitry Andric     .scalarize(0)
6360b57cec5SDimitry Andric     .custom();
6370b57cec5SDimitry Andric 
6380b57cec5SDimitry Andric   // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
6390b57cec5SDimitry Andric   // handle some operations by just promoting the register during
6400b57cec5SDimitry Andric   // selection. There are also d16 loads on GFX9+ which preserve the high bits.
  //
  // maxSizeForAddrSpace(AS): largest memory access size, in bits, that the
  // load/store rules below leave unsplit for address space AS. The value is
  // compared against MMODescrs[0].SizeInBits by its callers. Captures `this`
  // because the LOCAL_ADDRESS case queries the subtarget (ST).
6418bcb0991SDimitry Andric   auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned {
6428bcb0991SDimitry Andric     switch (AS) {
6438bcb0991SDimitry Andric     // FIXME: Private element size.
6448bcb0991SDimitry Andric     case AMDGPUAS::PRIVATE_ADDRESS:
6458bcb0991SDimitry Andric       return 32;
6468bcb0991SDimitry Andric     // FIXME: Check subtarget
    // (The only subtarget query made here is useDS128(): 128 bits when it is
    // set, 64 bits otherwise.)
6478bcb0991SDimitry Andric     case AMDGPUAS::LOCAL_ADDRESS:
6488bcb0991SDimitry Andric       return ST.useDS128() ? 128 : 64;
6490b57cec5SDimitry Andric 
6508bcb0991SDimitry Andric     // Treat constant and global as identical. SMRD loads are sometimes usable
6518bcb0991SDimitry Andric     // for global loads (ideally constant address space should be eliminated)
6528bcb0991SDimitry Andric     // depending on the context. Legality cannot be context dependent, but
6538bcb0991SDimitry Andric     // RegBankSelect can split the load as necessary depending on the pointer
6548bcb0991SDimitry Andric     // register bank/uniformity and if the memory is invariant or not written in
6558bcb0991SDimitry Andric     // a kernel.
6568bcb0991SDimitry Andric     case AMDGPUAS::CONSTANT_ADDRESS:
6578bcb0991SDimitry Andric     case AMDGPUAS::GLOBAL_ADDRESS:
6588bcb0991SDimitry Andric       return 512;
    // Every other address space (including flat, which has no case of its
    // own) gets 128 bits.
6598bcb0991SDimitry Andric     default:
6608bcb0991SDimitry Andric       return 128;
6618bcb0991SDimitry Andric     }
6628bcb0991SDimitry Andric   };
6638bcb0991SDimitry Andric 
6648bcb0991SDimitry Andric   const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool {
    // Returns true when the queried memory access has to be broken into
    // smaller pieces. Despite the name, the rule list that uses it is
    // installed for both G_LOAD and G_STORE. Type index 0 is the value type,
    // type index 1 the pointer type; sizes and alignment are in bits.
6658bcb0991SDimitry Andric     const LLT DstTy = Query.Types[0];
6668bcb0991SDimitry Andric 
6678bcb0991SDimitry Andric     // Split vector extloads.
6688bcb0991SDimitry Andric     unsigned MemSize = Query.MMODescrs[0].SizeInBits;
669*480093f4SDimitry Andric     unsigned Align = Query.MMODescrs[0].AlignInBits;
670*480093f4SDimitry Andric 
    // When the access is narrower than the value type, treat it as at least
    // as wide as its alignment for all of the checks below.
671*480093f4SDimitry Andric     if (MemSize < DstTy.getSizeInBits())
672*480093f4SDimitry Andric       MemSize = std::max(MemSize, Align);
673*480093f4SDimitry Andric 
6748bcb0991SDimitry Andric     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
6758bcb0991SDimitry Andric       return true;
6768bcb0991SDimitry Andric 
    // Too wide for the pointer's address space.
6778bcb0991SDimitry Andric     const LLT PtrTy = Query.Types[1];
6788bcb0991SDimitry Andric     unsigned AS = PtrTy.getAddressSpace();
6798bcb0991SDimitry Andric     if (MemSize > maxSizeForAddrSpace(AS))
6808bcb0991SDimitry Andric       return true;
6818bcb0991SDimitry Andric 
6828bcb0991SDimitry Andric     // Catch weird sized loads that don't evenly divide into the access sizes
6838bcb0991SDimitry Andric     // TODO: May be able to widen depending on alignment etc.
    // Integer division: any MemSize in [96, 127] counts as three registers.
6848bcb0991SDimitry Andric     unsigned NumRegs = MemSize / 32;
6858bcb0991SDimitry Andric     if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
6868bcb0991SDimitry Andric       return true;
6878bcb0991SDimitry Andric 
    // Under-aligned access: defer to the target lowering's misaligned-access
    // query. Its last argument is Align / 8, i.e. the alignment converted
    // from bits to bytes.
6888bcb0991SDimitry Andric     if (Align < MemSize) {
6898bcb0991SDimitry Andric       const SITargetLowering *TLI = ST.getTargetLowering();
6908bcb0991SDimitry Andric       return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
6918bcb0991SDimitry Andric     }
6928bcb0991SDimitry Andric 
6938bcb0991SDimitry Andric     return false;
6948bcb0991SDimitry Andric   };
6958bcb0991SDimitry Andric 
6968bcb0991SDimitry Andric   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
  // These three values fill the last field of the global/flat/constant
  // entries in the load/store legality table below. Each is 0 when the
  // subtarget reports unaligned buffer access, otherwise 32, 16 or 8.
  // Presumably the field is a minimum alignment in bits; confirm against
  // legalForTypesWithMemDesc.
6978bcb0991SDimitry Andric   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
6988bcb0991SDimitry Andric   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
6998bcb0991SDimitry Andric 
7008bcb0991SDimitry Andric   // TODO: Refine based on subtargets which support unaligned access or 128-bit
7018bcb0991SDimitry Andric   // LDS
7028bcb0991SDimitry Andric   // TODO: Unsupported flat for SI.
7038bcb0991SDimitry Andric 
  // G_LOAD and G_STORE receive the same rule list; the only store-specific
  // rule is the truncating-store narrowing guarded by IsStore further down.
7048bcb0991SDimitry Andric   for (unsigned Op : {G_LOAD, G_STORE}) {
7058bcb0991SDimitry Andric     const bool IsStore = Op == G_STORE;
7068bcb0991SDimitry Andric 
7078bcb0991SDimitry Andric     auto &Actions = getActionDefinitionsBuilder(Op);
7088bcb0991SDimitry Andric     // Whitelist the common cases.
7098bcb0991SDimitry Andric     // TODO: Pointer loads
7108bcb0991SDimitry Andric     // TODO: Wide constant loads
7118bcb0991SDimitry Andric     // TODO: Only CI+ has 3x loads
7128bcb0991SDimitry Andric     // TODO: Loads to s16 on gfx9
    // Each entry is {value type, pointer type, memory size in bits, alignment
    // field}; the entries are grouped by address space.
7138bcb0991SDimitry Andric     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
7148bcb0991SDimitry Andric                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
7158bcb0991SDimitry Andric                                       {V3S32, GlobalPtr, 96, GlobalAlign32},
7168bcb0991SDimitry Andric                                       {S96, GlobalPtr, 96, GlobalAlign32},
7178bcb0991SDimitry Andric                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
7188bcb0991SDimitry Andric                                       {S128, GlobalPtr, 128, GlobalAlign32},
7198bcb0991SDimitry Andric                                       {S64, GlobalPtr, 64, GlobalAlign32},
7208bcb0991SDimitry Andric                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
7218bcb0991SDimitry Andric                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
7228bcb0991SDimitry Andric                                       {S32, GlobalPtr, 8, GlobalAlign8},
7238bcb0991SDimitry Andric                                       {S32, GlobalPtr, 16, GlobalAlign16},
7248bcb0991SDimitry Andric 
7258bcb0991SDimitry Andric                                       {S32, LocalPtr, 32, 32},
7268bcb0991SDimitry Andric                                       {S64, LocalPtr, 64, 32},
7278bcb0991SDimitry Andric                                       {V2S32, LocalPtr, 64, 32},
7288bcb0991SDimitry Andric                                       {S32, LocalPtr, 8, 8},
7298bcb0991SDimitry Andric                                       {S32, LocalPtr, 16, 16},
7308bcb0991SDimitry Andric                                       {V2S16, LocalPtr, 32, 32},
7318bcb0991SDimitry Andric 
7328bcb0991SDimitry Andric                                       {S32, PrivatePtr, 32, 32},
7338bcb0991SDimitry Andric                                       {S32, PrivatePtr, 8, 8},
7348bcb0991SDimitry Andric                                       {S32, PrivatePtr, 16, 16},
7358bcb0991SDimitry Andric                                       {V2S16, PrivatePtr, 32, 32},
7368bcb0991SDimitry Andric 
7378bcb0991SDimitry Andric                                       {S32, FlatPtr, 32, GlobalAlign32},
7388bcb0991SDimitry Andric                                       {S32, FlatPtr, 16, GlobalAlign16},
7398bcb0991SDimitry Andric                                       {S32, FlatPtr, 8, GlobalAlign8},
7408bcb0991SDimitry Andric                                       {V2S16, FlatPtr, 32, GlobalAlign32},
7418bcb0991SDimitry Andric 
7428bcb0991SDimitry Andric                                       {S32, ConstantPtr, 32, GlobalAlign32},
7438bcb0991SDimitry Andric                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
7448bcb0991SDimitry Andric                                       {V3S32, ConstantPtr, 96, GlobalAlign32},
7458bcb0991SDimitry Andric                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
7468bcb0991SDimitry Andric                                       {S64, ConstantPtr, 64, GlobalAlign32},
7478bcb0991SDimitry Andric                                       {S128, ConstantPtr, 128, GlobalAlign32},
                                      // NOTE(review): this is the only vector
                                      // entry whose memory size (32) differs
                                      // from its type size (V2S32); the other
                                      // groups have a V2S16/32 entry in this
                                      // position. Possibly meant to be V2S16;
                                      // confirm.
7488bcb0991SDimitry Andric                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
    // Everything not whitelisted above falls through to the rules below,
    // tried in order.
7498bcb0991SDimitry Andric     Actions
7508bcb0991SDimitry Andric         .customIf(typeIs(1, Constant32Ptr))
        // Scalar values that need splitting: choose a narrower scalar type.
7518bcb0991SDimitry Andric         .narrowScalarIf(
7528bcb0991SDimitry Andric             [=](const LegalityQuery &Query) -> bool {
7538bcb0991SDimitry Andric               return !Query.Types[0].isVector() && needToSplitLoad(Query);
7548bcb0991SDimitry Andric             },
7558bcb0991SDimitry Andric             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
7568bcb0991SDimitry Andric               const LLT DstTy = Query.Types[0];
7578bcb0991SDimitry Andric               const LLT PtrTy = Query.Types[1];
7588bcb0991SDimitry Andric 
7598bcb0991SDimitry Andric               const unsigned DstSize = DstTy.getSizeInBits();
7608bcb0991SDimitry Andric               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
7618bcb0991SDimitry Andric 
7628bcb0991SDimitry Andric               // Split extloads.
7638bcb0991SDimitry Andric               if (DstSize > MemSize)
7648bcb0991SDimitry Andric                 return std::make_pair(0, LLT::scalar(MemSize));
7658bcb0991SDimitry Andric 
              // Sizes above 32 bits that are not a multiple of 32 are first
              // rounded down to the nearest multiple of 32.
7668bcb0991SDimitry Andric               if (DstSize > 32 && (DstSize % 32 != 0)) {
7678bcb0991SDimitry Andric                 // FIXME: Need a way to specify non-extload of larger size if
7688bcb0991SDimitry Andric                 // suitably aligned.
7698bcb0991SDimitry Andric                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
7708bcb0991SDimitry Andric               }
7718bcb0991SDimitry Andric 
7728bcb0991SDimitry Andric               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
7738bcb0991SDimitry Andric               if (MemSize > MaxSize)
7748bcb0991SDimitry Andric                 return std::make_pair(0, LLT::scalar(MaxSize));
7758bcb0991SDimitry Andric 
              // Remaining reason to split: narrow to the alignment width.
7768bcb0991SDimitry Andric               unsigned Align = Query.MMODescrs[0].AlignInBits;
7778bcb0991SDimitry Andric               return std::make_pair(0, LLT::scalar(Align));
7788bcb0991SDimitry Andric             })
        // Vector values that need splitting: choose a shorter vector or fall
        // back to the element type.
7798bcb0991SDimitry Andric         .fewerElementsIf(
7808bcb0991SDimitry Andric             [=](const LegalityQuery &Query) -> bool {
7818bcb0991SDimitry Andric               return Query.Types[0].isVector() && needToSplitLoad(Query);
7828bcb0991SDimitry Andric             },
7838bcb0991SDimitry Andric             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
7848bcb0991SDimitry Andric               const LLT DstTy = Query.Types[0];
7858bcb0991SDimitry Andric               const LLT PtrTy = Query.Types[1];
7868bcb0991SDimitry Andric 
7878bcb0991SDimitry Andric               LLT EltTy = DstTy.getElementType();
7888bcb0991SDimitry Andric               unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
7898bcb0991SDimitry Andric 
7908bcb0991SDimitry Andric               // Split if it's too large for the address space.
7918bcb0991SDimitry Andric               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
7928bcb0991SDimitry Andric                 unsigned NumElts = DstTy.getNumElements();
7938bcb0991SDimitry Andric                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
7948bcb0991SDimitry Andric 
7958bcb0991SDimitry Andric                 // FIXME: Refine when odd breakdowns handled
7968bcb0991SDimitry Andric                 // The scalars will need to be re-legalized.
7978bcb0991SDimitry Andric                 if (NumPieces == 1 || NumPieces >= NumElts ||
7988bcb0991SDimitry Andric                     NumElts % NumPieces != 0)
7998bcb0991SDimitry Andric                   return std::make_pair(0, EltTy);
8008bcb0991SDimitry Andric 
8018bcb0991SDimitry Andric                 return std::make_pair(0,
8028bcb0991SDimitry Andric                                       LLT::vector(NumElts / NumPieces, EltTy));
8038bcb0991SDimitry Andric               }
8048bcb0991SDimitry Andric 
8058bcb0991SDimitry Andric               // Need to split because of alignment.
              // NOTE(review): the new element count is EltSize / Align. When
              // that quotient is 1 this asks for a one-element vector;
              // confirm LLT::vector accepts that, and that this count is the
              // intended one.
8068bcb0991SDimitry Andric               unsigned Align = Query.MMODescrs[0].AlignInBits;
8078bcb0991SDimitry Andric               unsigned EltSize = EltTy.getSizeInBits();
8088bcb0991SDimitry Andric               if (EltSize > Align &&
8098bcb0991SDimitry Andric                   (EltSize / Align < DstTy.getNumElements())) {
8108bcb0991SDimitry Andric                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
8118bcb0991SDimitry Andric               }
8128bcb0991SDimitry Andric 
8138bcb0991SDimitry Andric               // May need relegalization for the scalars.
8148bcb0991SDimitry Andric               return std::make_pair(0, EltTy);
8158bcb0991SDimitry Andric             })
8168bcb0991SDimitry Andric         .minScalar(0, S32);
8178bcb0991SDimitry Andric 
    // Stores only: wide scalars matched by isWideScalarTruncStore are
    // narrowed to s32.
8188bcb0991SDimitry Andric     if (IsStore)
8198bcb0991SDimitry Andric       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
8208bcb0991SDimitry Andric 
8218bcb0991SDimitry Andric     // TODO: Need a bitcast lower option?
    // Final catch-all: decide legality purely from the (adjusted) memory
    // size, then widen remaining scalars and pad small vectors.
8228bcb0991SDimitry Andric     Actions
8238bcb0991SDimitry Andric         .legalIf([=](const LegalityQuery &Query) {
8248bcb0991SDimitry Andric           const LLT Ty0 = Query.Types[0];
8250b57cec5SDimitry Andric           unsigned Size = Ty0.getSizeInBits();
8260b57cec5SDimitry Andric           unsigned MemSize = Query.MMODescrs[0].SizeInBits;
8278bcb0991SDimitry Andric           unsigned Align = Query.MMODescrs[0].AlignInBits;
8288bcb0991SDimitry Andric 
8298bcb0991SDimitry Andric           // FIXME: Widening store from alignment not valid.
8308bcb0991SDimitry Andric           if (MemSize < Size)
8318bcb0991SDimitry Andric             MemSize = std::max(MemSize, Align);
8320b57cec5SDimitry Andric 
833*480093f4SDimitry Andric           // No extending vector loads.
834*480093f4SDimitry Andric           if (Size > MemSize && Ty0.isVector())
835*480093f4SDimitry Andric             return false;
836*480093f4SDimitry Andric 
          // 8- and 16-bit accesses are legal only with a 32-bit value type;
          // 96-bit accesses depend on the subtarget; sizes not listed are
          // rejected.
8370b57cec5SDimitry Andric           switch (MemSize) {
8380b57cec5SDimitry Andric           case 8:
8390b57cec5SDimitry Andric           case 16:
8400b57cec5SDimitry Andric             return Size == 32;
8410b57cec5SDimitry Andric           case 32:
8420b57cec5SDimitry Andric           case 64:
8430b57cec5SDimitry Andric           case 128:
8440b57cec5SDimitry Andric             return true;
8450b57cec5SDimitry Andric           case 96:
8460b57cec5SDimitry Andric             return ST.hasDwordx3LoadStores();
8470b57cec5SDimitry Andric           case 256:
8480b57cec5SDimitry Andric           case 512:
8498bcb0991SDimitry Andric             return true;
8500b57cec5SDimitry Andric           default:
8510b57cec5SDimitry Andric             return false;
8520b57cec5SDimitry Andric           }
8530b57cec5SDimitry Andric         })
8548bcb0991SDimitry Andric         .widenScalarToNextPow2(0)
8558bcb0991SDimitry Andric         // TODO: v3s32->v4s32 with alignment
8568bcb0991SDimitry Andric         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
8578bcb0991SDimitry Andric   }
8580b57cec5SDimitry Andric 
8590b57cec5SDimitry Andric   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       // Sign/zero-extending loads: only an s32 result from
                       // an 8- or 16-bit memory access is whitelisted. Each
                       // entry is {result type, pointer type, memory size in
                       // bits, alignment field}; "2 * 8" is simply 16.
8608bcb0991SDimitry Andric                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
8618bcb0991SDimitry Andric                                                   {S32, GlobalPtr, 16, 2 * 8},
8620b57cec5SDimitry Andric                                                   {S32, LocalPtr, 8, 8},
8638bcb0991SDimitry Andric                                                   {S32, LocalPtr, 16, 16},
8640b57cec5SDimitry Andric                                                   {S32, PrivatePtr, 8, 8},
8658bcb0991SDimitry Andric                                                   {S32, PrivatePtr, 16, 16},
8668bcb0991SDimitry Andric                                                   {S32, ConstantPtr, 8, 8},
8678bcb0991SDimitry Andric                                                   {S32, ConstantPtr, 16, 2 * 8}});
  // Flat-pointer entries are added only when the subtarget has a flat
  // address space.
8680b57cec5SDimitry Andric   if (ST.hasFlatAddressSpace()) {
8698bcb0991SDimitry Andric     ExtLoads.legalForTypesWithMemDesc(
8708bcb0991SDimitry Andric         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
8710b57cec5SDimitry Andric   }
8720b57cec5SDimitry Andric 
  // Anything else: force the result to s32, reject memory sizes that are not
  // a power of two, and lower whatever remains.
8730b57cec5SDimitry Andric   ExtLoads.clampScalar(0, S32, S32)
8740b57cec5SDimitry Andric           .widenScalarToNextPow2(0)
8750b57cec5SDimitry Andric           .unsupportedIfMemSizeNotPow2()
8760b57cec5SDimitry Andric           .lower();
8770b57cec5SDimitry Andric 
8780b57cec5SDimitry Andric   auto &Atomics = getActionDefinitionsBuilder(
    // Integer atomic read-modify-write opcodes share one rule set: s32 and
    // s64 values through global or local pointers, plus flat pointers when
    // the subtarget has a flat address space. No fallback rule is added here.
8790b57cec5SDimitry Andric     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
8800b57cec5SDimitry Andric      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
8810b57cec5SDimitry Andric      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
882*480093f4SDimitry Andric      G_ATOMICRMW_UMIN})
8830b57cec5SDimitry Andric     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
8840b57cec5SDimitry Andric                {S64, GlobalPtr}, {S64, LocalPtr}});
8850b57cec5SDimitry Andric   if (ST.hasFlatAddressSpace()) {
8860b57cec5SDimitry Andric     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
8870b57cec5SDimitry Andric   }
8880b57cec5SDimitry Andric 
  // The floating-point add variant is only listed for s32 through a local
  // pointer.
8898bcb0991SDimitry Andric   getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
8908bcb0991SDimitry Andric     .legalFor({{S32, LocalPtr}});
8918bcb0991SDimitry Andric 
892*480093f4SDimitry Andric   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
893*480093f4SDimitry Andric   // demarshalling
  // Hence global and flat pointers take the custom path, while local and
  // region pointers are directly legal; both cover s32 and s64.
894*480093f4SDimitry Andric   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
895*480093f4SDimitry Andric     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
896*480093f4SDimitry Andric                 {S32, FlatPtr}, {S64, FlatPtr}})
897*480093f4SDimitry Andric     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
898*480093f4SDimitry Andric                {S32, RegionPtr}, {S64, RegionPtr}});
899*480093f4SDimitry Andric 
  // The with-success form is never legal as-is; it is always lowered.
9008bcb0991SDimitry Andric   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
9018bcb0991SDimitry Andric     .lower();
9028bcb0991SDimitry Andric 
9030b57cec5SDimitry Andric   // TODO: Pointer types, any 32-bit or 64-bit vector
904*480093f4SDimitry Andric 
905*480093f4SDimitry Andric   // Condition should be s32 for scalar, s1 for vector.
  // Type index 0 is the selected value, type index 1 the condition. Both s1
  // and s32 conditions are listed for every value type. The trailing legalIf
  // accepts any remaining pointer value type with an s1 or s32 condition.
9060b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_SELECT)
9070b57cec5SDimitry Andric     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
9080b57cec5SDimitry Andric           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
909*480093f4SDimitry Andric           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
9100b57cec5SDimitry Andric     .clampScalar(0, S16, S64)
9110b57cec5SDimitry Andric     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
9120b57cec5SDimitry Andric     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
9130b57cec5SDimitry Andric     .scalarize(1)
9140b57cec5SDimitry Andric     .clampMaxNumElements(0, S32, 2)
9150b57cec5SDimitry Andric     .clampMaxNumElements(0, LocalPtr, 2)
9160b57cec5SDimitry Andric     .clampMaxNumElements(0, PrivatePtr, 2)
9170b57cec5SDimitry Andric     .scalarize(0)
9180b57cec5SDimitry Andric     .widenScalarToNextPow2(0)
919*480093f4SDimitry Andric     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
9200b57cec5SDimitry Andric 
9210b57cec5SDimitry Andric   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
9220b57cec5SDimitry Andric   // be more flexible with the shift amount type.
  // Legal pairs are {shifted value type, shift amount type}. The base set is
  // s32 and s64 values with an s32 amount; 16-bit subtargets add s16 values,
  // and VOP3P subtargets additionally add V2S16.
9230b57cec5SDimitry Andric   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
9240b57cec5SDimitry Andric     .legalFor({{S32, S32}, {S64, S32}});
9250b57cec5SDimitry Andric   if (ST.has16BitInsts()) {
9260b57cec5SDimitry Andric     if (ST.hasVOP3PInsts()) {
9270b57cec5SDimitry Andric       Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
9280b57cec5SDimitry Andric             .clampMaxNumElements(0, S16, 2);
9290b57cec5SDimitry Andric     } else
9300b57cec5SDimitry Andric       Shifts.legalFor({{S16, S32}, {S16, S16}});
9310b57cec5SDimitry Andric 
932*480093f4SDimitry Andric     // TODO: Support 16-bit shift amounts
933*480093f4SDimitry Andric     Shifts.clampScalar(1, S32, S32);
9340b57cec5SDimitry Andric     Shifts.clampScalar(0, S16, S64);
9350b57cec5SDimitry Andric     Shifts.widenScalarToNextPow2(0, 16);
9360b57cec5SDimitry Andric   } else {
9370b57cec5SDimitry Andric     // Make sure we legalize the shift amount type first, as the general
9380b57cec5SDimitry Andric     // expansion for the shifted type will produce much worse code if it hasn't
9390b57cec5SDimitry Andric     // been truncated already.
9400b57cec5SDimitry Andric     Shifts.clampScalar(1, S32, S32);
9410b57cec5SDimitry Andric     Shifts.clampScalar(0, S32, S64);
9420b57cec5SDimitry Andric     Shifts.widenScalarToNextPow2(0, 32);
9430b57cec5SDimitry Andric   }
  // Whatever vector forms remain are scalarized in both configurations.
9440b57cec5SDimitry Andric   Shifts.scalarize(0);
9450b57cec5SDimitry Andric 
9460b57cec5SDimitry Andric   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    // The two opcodes carry the vector and element types at swapped type
    // indices (extract: element 0 / vector 1; insert: vector 0 / element 1);
    // the index operand is type index 2 for both.
9470b57cec5SDimitry Andric     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
9480b57cec5SDimitry Andric     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
9490b57cec5SDimitry Andric     unsigned IdxTypeIdx = 2;
9500b57cec5SDimitry Andric 
9510b57cec5SDimitry Andric     getActionDefinitionsBuilder(Op)
      // Custom-legalize when the element is 16 bits or a multiple of 32 bits,
      // the vector is a multiple of 32 bits and at most 1024 bits, and the
      // index is 32 bits wide.
9520b57cec5SDimitry Andric       .customIf([=](const LegalityQuery &Query) {
9530b57cec5SDimitry Andric           const LLT EltTy = Query.Types[EltTypeIdx];
9540b57cec5SDimitry Andric           const LLT VecTy = Query.Types[VecTypeIdx];
9550b57cec5SDimitry Andric           const LLT IdxTy = Query.Types[IdxTypeIdx];
9560b57cec5SDimitry Andric           return (EltTy.getSizeInBits() == 16 ||
9570b57cec5SDimitry Andric                   EltTy.getSizeInBits() % 32 == 0) &&
9580b57cec5SDimitry Andric                  VecTy.getSizeInBits() % 32 == 0 &&
9598bcb0991SDimitry Andric                  VecTy.getSizeInBits() <= 1024 &&
9600b57cec5SDimitry Andric                  IdxTy.getSizeInBits() == 32;
9610b57cec5SDimitry Andric         })
9620b57cec5SDimitry Andric       .clampScalar(EltTypeIdx, S32, S64)
9630b57cec5SDimitry Andric       .clampScalar(VecTypeIdx, S32, S64)
9640b57cec5SDimitry Andric       .clampScalar(IdxTypeIdx, S32, S32);
9650b57cec5SDimitry Andric   }
9660b57cec5SDimitry Andric 
  // Extra rule appended for extracts only: a result type that differs from
  // the vector's element type is unsupported. It is added after the rules
  // installed by the loop above.
9670b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
9680b57cec5SDimitry Andric     .unsupportedIf([=](const LegalityQuery &Query) {
9690b57cec5SDimitry Andric         const LLT &EltTy = Query.Types[1].getElementType();
9700b57cec5SDimitry Andric         return Query.Types[0] != EltTy;
9710b57cec5SDimitry Andric       });
9720b57cec5SDimitry Andric 
9730b57cec5SDimitry Andric   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    // "Big" is the container type and "Lit" the piece being extracted or
    // inserted; their type indices are swapped between the two opcodes.
9740b57cec5SDimitry Andric     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
9750b57cec5SDimitry Andric     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
9760b57cec5SDimitry Andric 
9770b57cec5SDimitry Andric     // FIXME: Doesn't handle extract of illegal sizes.
9780b57cec5SDimitry Andric     getActionDefinitionsBuilder(Op)
      // An s16 piece of a 32-bit container is lowered rather than kept.
9798bcb0991SDimitry Andric       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
9808bcb0991SDimitry Andric       // FIXME: Multiples of 16 should not be legal.
9810b57cec5SDimitry Andric       .legalIf([=](const LegalityQuery &Query) {
9820b57cec5SDimitry Andric           const LLT BigTy = Query.Types[BigTyIdx];
9830b57cec5SDimitry Andric           const LLT LitTy = Query.Types[LitTyIdx];
9840b57cec5SDimitry Andric           return (BigTy.getSizeInBits() % 32 == 0) &&
9850b57cec5SDimitry Andric                  (LitTy.getSizeInBits() % 16 == 0);
9860b57cec5SDimitry Andric         })
      // Scalars or elements narrower than 16 bits are widened on either side.
9870b57cec5SDimitry Andric       .widenScalarIf(
9880b57cec5SDimitry Andric         [=](const LegalityQuery &Query) {
9890b57cec5SDimitry Andric           const LLT BigTy = Query.Types[BigTyIdx];
9900b57cec5SDimitry Andric           return (BigTy.getScalarSizeInBits() < 16);
9910b57cec5SDimitry Andric         },
9920b57cec5SDimitry Andric         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
9930b57cec5SDimitry Andric       .widenScalarIf(
9940b57cec5SDimitry Andric         [=](const LegalityQuery &Query) {
9950b57cec5SDimitry Andric           const LLT LitTy = Query.Types[LitTyIdx];
9960b57cec5SDimitry Andric           return (LitTy.getScalarSizeInBits() < 16);
9970b57cec5SDimitry Andric         },
9980b57cec5SDimitry Andric         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
9990b57cec5SDimitry Andric       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
10000b57cec5SDimitry Andric       .widenScalarToNextPow2(BigTyIdx, 32);
10010b57cec5SDimitry Andric 
10020b57cec5SDimitry Andric   }
10030b57cec5SDimitry Andric 
10048bcb0991SDimitry Andric   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    // Type index 0 is the vector result, type index 1 the scalar sources.
    // Vectors of s32 built from s32 and vectors of s64 built from s64 are
    // legal, using the AllS32Vectors / AllS64Vectors lists defined outside
    // this part of the file.
10050b57cec5SDimitry Andric     .legalForCartesianProduct(AllS32Vectors, {S32})
10060b57cec5SDimitry Andric     .legalForCartesianProduct(AllS64Vectors, {S64})
10078bcb0991SDimitry Andric     .clampNumElements(0, V16S32, V32S32)
10088bcb0991SDimitry Andric     .clampNumElements(0, V2S64, V16S64)
10098bcb0991SDimitry Andric     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
10108bcb0991SDimitry Andric 
  // NOTE(review): other {type0, type1} pairs in this file are written with
  // double braces (e.g. legalFor({{S1, S16}})), while single braces list
  // alternatives for type index 0 (e.g. legalFor({S32, S16, V2S16})). As
  // written this may mean "type 0 is V2S16 or S32" rather than the pair
  // (V2S16, S32); confirm which overload is intended.
10118bcb0991SDimitry Andric   if (ST.hasScalarPackInsts())
10128bcb0991SDimitry Andric     BuildVector.legalFor({V2S16, S32});
10138bcb0991SDimitry Andric 
10148bcb0991SDimitry Andric   BuildVector
10150b57cec5SDimitry Andric     .minScalarSameAs(1, 0)
10160b57cec5SDimitry Andric     .legalIf(isRegisterType(0))
10170b57cec5SDimitry Andric     .minScalarOrElt(0, S32);
10180b57cec5SDimitry Andric 
10198bcb0991SDimitry Andric   if (ST.hasScalarPackInsts()) {
10208bcb0991SDimitry Andric     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
10218bcb0991SDimitry Andric       .legalFor({V2S16, S32})
10228bcb0991SDimitry Andric       .lower();
10238bcb0991SDimitry Andric   } else {
10248bcb0991SDimitry Andric     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
10258bcb0991SDimitry Andric       .lower();
10268bcb0991SDimitry Andric   }
10278bcb0991SDimitry Andric 
10280b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
10290b57cec5SDimitry Andric     .legalIf(isRegisterType(0));
10300b57cec5SDimitry Andric 
10318bcb0991SDimitry Andric   // TODO: Don't fully scalarize v2s16 pieces
10328bcb0991SDimitry Andric   getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
10338bcb0991SDimitry Andric 
10340b57cec5SDimitry Andric   // Merge/Unmerge
10350b57cec5SDimitry Andric   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
10360b57cec5SDimitry Andric     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
10370b57cec5SDimitry Andric     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
10380b57cec5SDimitry Andric 
10390b57cec5SDimitry Andric     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
10400b57cec5SDimitry Andric       const LLT &Ty = Query.Types[TypeIdx];
10410b57cec5SDimitry Andric       if (Ty.isVector()) {
10420b57cec5SDimitry Andric         const LLT &EltTy = Ty.getElementType();
10430b57cec5SDimitry Andric         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
10440b57cec5SDimitry Andric           return true;
10450b57cec5SDimitry Andric         if (!isPowerOf2_32(EltTy.getSizeInBits()))
10460b57cec5SDimitry Andric           return true;
10470b57cec5SDimitry Andric       }
10480b57cec5SDimitry Andric       return false;
10490b57cec5SDimitry Andric     };
10500b57cec5SDimitry Andric 
10518bcb0991SDimitry Andric     auto &Builder = getActionDefinitionsBuilder(Op)
10520b57cec5SDimitry Andric       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
10530b57cec5SDimitry Andric       // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
10540b57cec5SDimitry Andric       // worth considering the multiples of 64 since 2*192 and 2*384 are not
10550b57cec5SDimitry Andric       // valid.
10560b57cec5SDimitry Andric       .clampScalar(LitTyIdx, S16, S256)
10570b57cec5SDimitry Andric       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
10588bcb0991SDimitry Andric       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
10598bcb0991SDimitry Andric       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
10608bcb0991SDimitry Andric                            elementTypeIs(1, S16)),
10618bcb0991SDimitry Andric                        changeTo(1, V2S16))
10620b57cec5SDimitry Andric       // Break up vectors with weird elements into scalars
10630b57cec5SDimitry Andric       .fewerElementsIf(
10640b57cec5SDimitry Andric         [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
10650b57cec5SDimitry Andric         scalarize(0))
10660b57cec5SDimitry Andric       .fewerElementsIf(
10670b57cec5SDimitry Andric         [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
10680b57cec5SDimitry Andric         scalarize(1))
10698bcb0991SDimitry Andric       .clampScalar(BigTyIdx, S32, S1024)
10708bcb0991SDimitry Andric       .lowerFor({{S16, V2S16}});
10718bcb0991SDimitry Andric 
10728bcb0991SDimitry Andric     if (Op == G_MERGE_VALUES) {
10738bcb0991SDimitry Andric       Builder.widenScalarIf(
10748bcb0991SDimitry Andric         // TODO: Use 16-bit shifts if legal for 8-bit values?
10750b57cec5SDimitry Andric         [=](const LegalityQuery &Query) {
10768bcb0991SDimitry Andric           const LLT Ty = Query.Types[LitTyIdx];
10778bcb0991SDimitry Andric           return Ty.getSizeInBits() < 32;
10788bcb0991SDimitry Andric         },
10798bcb0991SDimitry Andric         changeTo(LitTyIdx, S32));
10808bcb0991SDimitry Andric     }
10818bcb0991SDimitry Andric 
10828bcb0991SDimitry Andric     Builder.widenScalarIf(
10838bcb0991SDimitry Andric       [=](const LegalityQuery &Query) {
10848bcb0991SDimitry Andric         const LLT Ty = Query.Types[BigTyIdx];
10850b57cec5SDimitry Andric         return !isPowerOf2_32(Ty.getSizeInBits()) &&
10860b57cec5SDimitry Andric           Ty.getSizeInBits() % 16 != 0;
10870b57cec5SDimitry Andric       },
10880b57cec5SDimitry Andric       [=](const LegalityQuery &Query) {
10890b57cec5SDimitry Andric         // Pick the next power of 2, or a multiple of 64 over 128.
10900b57cec5SDimitry Andric         // Whichever is smaller.
10910b57cec5SDimitry Andric         const LLT &Ty = Query.Types[BigTyIdx];
10920b57cec5SDimitry Andric         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
10930b57cec5SDimitry Andric         if (NewSizeInBits >= 256) {
10940b57cec5SDimitry Andric           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
10950b57cec5SDimitry Andric           if (RoundedTo < NewSizeInBits)
10960b57cec5SDimitry Andric             NewSizeInBits = RoundedTo;
10970b57cec5SDimitry Andric         }
10980b57cec5SDimitry Andric         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
10990b57cec5SDimitry Andric       })
11000b57cec5SDimitry Andric       .legalIf([=](const LegalityQuery &Query) {
11010b57cec5SDimitry Andric           const LLT &BigTy = Query.Types[BigTyIdx];
11020b57cec5SDimitry Andric           const LLT &LitTy = Query.Types[LitTyIdx];
11030b57cec5SDimitry Andric 
11040b57cec5SDimitry Andric           if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
11050b57cec5SDimitry Andric             return false;
11060b57cec5SDimitry Andric           if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
11070b57cec5SDimitry Andric             return false;
11080b57cec5SDimitry Andric 
11090b57cec5SDimitry Andric           return BigTy.getSizeInBits() % 16 == 0 &&
11100b57cec5SDimitry Andric                  LitTy.getSizeInBits() % 16 == 0 &&
11118bcb0991SDimitry Andric                  BigTy.getSizeInBits() <= 1024;
11120b57cec5SDimitry Andric         })
11130b57cec5SDimitry Andric       // Any vectors left are the wrong size. Scalarize them.
11140b57cec5SDimitry Andric       .scalarize(0)
11150b57cec5SDimitry Andric       .scalarize(1);
11160b57cec5SDimitry Andric   }
11170b57cec5SDimitry Andric 
11188bcb0991SDimitry Andric   getActionDefinitionsBuilder(G_SEXT_INREG).lower();
11198bcb0991SDimitry Andric 
1120*480093f4SDimitry Andric   getActionDefinitionsBuilder({G_READ_REGISTER, G_WRITE_REGISTER}).lower();
1121*480093f4SDimitry Andric 
1122*480093f4SDimitry Andric   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1123*480093f4SDimitry Andric     .legalFor({S64});
1124*480093f4SDimitry Andric 
1125*480093f4SDimitry Andric   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1126*480093f4SDimitry Andric         G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1127*480093f4SDimitry Andric         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1128*480093f4SDimitry Andric     .unsupported();
1129*480093f4SDimitry Andric 
11300b57cec5SDimitry Andric   computeTables();
11310b57cec5SDimitry Andric   verify(*ST.getInstrInfo());
11320b57cec5SDimitry Andric }
11330b57cec5SDimitry Andric 
11340b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
11350b57cec5SDimitry Andric                                          MachineRegisterInfo &MRI,
11368bcb0991SDimitry Andric                                          MachineIRBuilder &B,
11370b57cec5SDimitry Andric                                          GISelChangeObserver &Observer) const {
11380b57cec5SDimitry Andric   switch (MI.getOpcode()) {
11390b57cec5SDimitry Andric   case TargetOpcode::G_ADDRSPACE_CAST:
11408bcb0991SDimitry Andric     return legalizeAddrSpaceCast(MI, MRI, B);
11410b57cec5SDimitry Andric   case TargetOpcode::G_FRINT:
11428bcb0991SDimitry Andric     return legalizeFrint(MI, MRI, B);
11430b57cec5SDimitry Andric   case TargetOpcode::G_FCEIL:
11448bcb0991SDimitry Andric     return legalizeFceil(MI, MRI, B);
11450b57cec5SDimitry Andric   case TargetOpcode::G_INTRINSIC_TRUNC:
11468bcb0991SDimitry Andric     return legalizeIntrinsicTrunc(MI, MRI, B);
11470b57cec5SDimitry Andric   case TargetOpcode::G_SITOFP:
11488bcb0991SDimitry Andric     return legalizeITOFP(MI, MRI, B, true);
11490b57cec5SDimitry Andric   case TargetOpcode::G_UITOFP:
11508bcb0991SDimitry Andric     return legalizeITOFP(MI, MRI, B, false);
11510b57cec5SDimitry Andric   case TargetOpcode::G_FMINNUM:
11520b57cec5SDimitry Andric   case TargetOpcode::G_FMAXNUM:
11530b57cec5SDimitry Andric   case TargetOpcode::G_FMINNUM_IEEE:
11540b57cec5SDimitry Andric   case TargetOpcode::G_FMAXNUM_IEEE:
11558bcb0991SDimitry Andric     return legalizeMinNumMaxNum(MI, MRI, B);
11560b57cec5SDimitry Andric   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
11578bcb0991SDimitry Andric     return legalizeExtractVectorElt(MI, MRI, B);
11580b57cec5SDimitry Andric   case TargetOpcode::G_INSERT_VECTOR_ELT:
11598bcb0991SDimitry Andric     return legalizeInsertVectorElt(MI, MRI, B);
11608bcb0991SDimitry Andric   case TargetOpcode::G_FSIN:
11618bcb0991SDimitry Andric   case TargetOpcode::G_FCOS:
11628bcb0991SDimitry Andric     return legalizeSinCos(MI, MRI, B);
11638bcb0991SDimitry Andric   case TargetOpcode::G_GLOBAL_VALUE:
11648bcb0991SDimitry Andric     return legalizeGlobalValue(MI, MRI, B);
11658bcb0991SDimitry Andric   case TargetOpcode::G_LOAD:
11668bcb0991SDimitry Andric     return legalizeLoad(MI, MRI, B, Observer);
11678bcb0991SDimitry Andric   case TargetOpcode::G_FMAD:
11688bcb0991SDimitry Andric     return legalizeFMad(MI, MRI, B);
11698bcb0991SDimitry Andric   case TargetOpcode::G_FDIV:
11708bcb0991SDimitry Andric     return legalizeFDIV(MI, MRI, B);
1171*480093f4SDimitry Andric   case TargetOpcode::G_ATOMIC_CMPXCHG:
1172*480093f4SDimitry Andric     return legalizeAtomicCmpXChg(MI, MRI, B);
11730b57cec5SDimitry Andric   default:
11740b57cec5SDimitry Andric     return false;
11750b57cec5SDimitry Andric   }
11760b57cec5SDimitry Andric 
11770b57cec5SDimitry Andric   llvm_unreachable("expected switch to return");
11780b57cec5SDimitry Andric }
11790b57cec5SDimitry Andric 
11800b57cec5SDimitry Andric Register AMDGPULegalizerInfo::getSegmentAperture(
11810b57cec5SDimitry Andric   unsigned AS,
11820b57cec5SDimitry Andric   MachineRegisterInfo &MRI,
11838bcb0991SDimitry Andric   MachineIRBuilder &B) const {
11848bcb0991SDimitry Andric   MachineFunction &MF = B.getMF();
11850b57cec5SDimitry Andric   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
11860b57cec5SDimitry Andric   const LLT S32 = LLT::scalar(32);
11870b57cec5SDimitry Andric 
11888bcb0991SDimitry Andric   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
11898bcb0991SDimitry Andric 
11900b57cec5SDimitry Andric   if (ST.hasApertureRegs()) {
11910b57cec5SDimitry Andric     // FIXME: Use inline constants (src_{shared, private}_base) instead of
11920b57cec5SDimitry Andric     // getreg.
11930b57cec5SDimitry Andric     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
11940b57cec5SDimitry Andric         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
11950b57cec5SDimitry Andric         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
11960b57cec5SDimitry Andric     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
11970b57cec5SDimitry Andric         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
11980b57cec5SDimitry Andric         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
11990b57cec5SDimitry Andric     unsigned Encoding =
12000b57cec5SDimitry Andric         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
12010b57cec5SDimitry Andric         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
12020b57cec5SDimitry Andric         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
12030b57cec5SDimitry Andric 
12040b57cec5SDimitry Andric     Register ApertureReg = MRI.createGenericVirtualRegister(S32);
12050b57cec5SDimitry Andric     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
12060b57cec5SDimitry Andric 
12078bcb0991SDimitry Andric     B.buildInstr(AMDGPU::S_GETREG_B32)
12080b57cec5SDimitry Andric       .addDef(GetReg)
12090b57cec5SDimitry Andric       .addImm(Encoding);
12100b57cec5SDimitry Andric     MRI.setType(GetReg, S32);
12110b57cec5SDimitry Andric 
12128bcb0991SDimitry Andric     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
12138bcb0991SDimitry Andric     B.buildInstr(TargetOpcode::G_SHL)
12140b57cec5SDimitry Andric       .addDef(ApertureReg)
12150b57cec5SDimitry Andric       .addUse(GetReg)
12160b57cec5SDimitry Andric       .addUse(ShiftAmt.getReg(0));
12170b57cec5SDimitry Andric 
12180b57cec5SDimitry Andric     return ApertureReg;
12190b57cec5SDimitry Andric   }
12200b57cec5SDimitry Andric 
12210b57cec5SDimitry Andric   Register QueuePtr = MRI.createGenericVirtualRegister(
12220b57cec5SDimitry Andric     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
12230b57cec5SDimitry Andric 
12248bcb0991SDimitry Andric   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
12258bcb0991SDimitry Andric   if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
12268bcb0991SDimitry Andric     return Register();
12270b57cec5SDimitry Andric 
12280b57cec5SDimitry Andric   // Offset into amd_queue_t for group_segment_aperture_base_hi /
12290b57cec5SDimitry Andric   // private_segment_aperture_base_hi.
12300b57cec5SDimitry Andric   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
12310b57cec5SDimitry Andric 
1232*480093f4SDimitry Andric   // TODO: can we be smarter about machine pointer info?
1233*480093f4SDimitry Andric   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
12340b57cec5SDimitry Andric   MachineMemOperand *MMO = MF.getMachineMemOperand(
12350b57cec5SDimitry Andric     PtrInfo,
12360b57cec5SDimitry Andric     MachineMemOperand::MOLoad |
12370b57cec5SDimitry Andric     MachineMemOperand::MODereferenceable |
12380b57cec5SDimitry Andric     MachineMemOperand::MOInvariant,
12390b57cec5SDimitry Andric     4,
12400b57cec5SDimitry Andric     MinAlign(64, StructOffset));
12410b57cec5SDimitry Andric 
12420b57cec5SDimitry Andric   Register LoadResult = MRI.createGenericVirtualRegister(S32);
12430b57cec5SDimitry Andric   Register LoadAddr;
12440b57cec5SDimitry Andric 
1245*480093f4SDimitry Andric   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
12468bcb0991SDimitry Andric   B.buildLoad(LoadResult, LoadAddr, *MMO);
12470b57cec5SDimitry Andric   return LoadResult;
12480b57cec5SDimitry Andric }
12490b57cec5SDimitry Andric 
12500b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
12510b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
12528bcb0991SDimitry Andric   MachineIRBuilder &B) const {
12538bcb0991SDimitry Andric   MachineFunction &MF = B.getMF();
12540b57cec5SDimitry Andric 
12558bcb0991SDimitry Andric   B.setInstr(MI);
12560b57cec5SDimitry Andric 
12578bcb0991SDimitry Andric   const LLT S32 = LLT::scalar(32);
12580b57cec5SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
12590b57cec5SDimitry Andric   Register Src = MI.getOperand(1).getReg();
12600b57cec5SDimitry Andric 
12610b57cec5SDimitry Andric   LLT DstTy = MRI.getType(Dst);
12620b57cec5SDimitry Andric   LLT SrcTy = MRI.getType(Src);
12630b57cec5SDimitry Andric   unsigned DestAS = DstTy.getAddressSpace();
12640b57cec5SDimitry Andric   unsigned SrcAS = SrcTy.getAddressSpace();
12650b57cec5SDimitry Andric 
12660b57cec5SDimitry Andric   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
12670b57cec5SDimitry Andric   // vector element.
12680b57cec5SDimitry Andric   assert(!DstTy.isVector());
12690b57cec5SDimitry Andric 
12700b57cec5SDimitry Andric   const AMDGPUTargetMachine &TM
12710b57cec5SDimitry Andric     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
12720b57cec5SDimitry Andric 
12730b57cec5SDimitry Andric   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
12740b57cec5SDimitry Andric   if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
12758bcb0991SDimitry Andric     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
12768bcb0991SDimitry Andric     return true;
12778bcb0991SDimitry Andric   }
12788bcb0991SDimitry Andric 
12798bcb0991SDimitry Andric   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
12808bcb0991SDimitry Andric     // Truncate.
12818bcb0991SDimitry Andric     B.buildExtract(Dst, Src, 0);
12828bcb0991SDimitry Andric     MI.eraseFromParent();
12838bcb0991SDimitry Andric     return true;
12848bcb0991SDimitry Andric   }
12858bcb0991SDimitry Andric 
12868bcb0991SDimitry Andric   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
12878bcb0991SDimitry Andric     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
12888bcb0991SDimitry Andric     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
12898bcb0991SDimitry Andric 
12908bcb0991SDimitry Andric     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
12918bcb0991SDimitry Andric     // another. Merge operands are required to be the same type, but creating an
12928bcb0991SDimitry Andric     // extra ptrtoint would be kind of pointless.
12938bcb0991SDimitry Andric     auto HighAddr = B.buildConstant(
12948bcb0991SDimitry Andric       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
12958bcb0991SDimitry Andric     B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
12968bcb0991SDimitry Andric     MI.eraseFromParent();
12970b57cec5SDimitry Andric     return true;
12980b57cec5SDimitry Andric   }
12990b57cec5SDimitry Andric 
13000b57cec5SDimitry Andric   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
13010b57cec5SDimitry Andric     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
13020b57cec5SDimitry Andric            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
13030b57cec5SDimitry Andric     unsigned NullVal = TM.getNullPointerValue(DestAS);
13040b57cec5SDimitry Andric 
13058bcb0991SDimitry Andric     auto SegmentNull = B.buildConstant(DstTy, NullVal);
13068bcb0991SDimitry Andric     auto FlatNull = B.buildConstant(SrcTy, 0);
13070b57cec5SDimitry Andric 
13080b57cec5SDimitry Andric     Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);
13090b57cec5SDimitry Andric 
13100b57cec5SDimitry Andric     // Extract low 32-bits of the pointer.
13118bcb0991SDimitry Andric     B.buildExtract(PtrLo32, Src, 0);
13120b57cec5SDimitry Andric 
13130b57cec5SDimitry Andric     Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
13148bcb0991SDimitry Andric     B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
13158bcb0991SDimitry Andric     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
13160b57cec5SDimitry Andric 
13170b57cec5SDimitry Andric     MI.eraseFromParent();
13180b57cec5SDimitry Andric     return true;
13190b57cec5SDimitry Andric   }
13200b57cec5SDimitry Andric 
13218bcb0991SDimitry Andric   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
13228bcb0991SDimitry Andric     return false;
13238bcb0991SDimitry Andric 
13248bcb0991SDimitry Andric   if (!ST.hasFlatAddressSpace())
13258bcb0991SDimitry Andric     return false;
13260b57cec5SDimitry Andric 
13270b57cec5SDimitry Andric   auto SegmentNull =
13288bcb0991SDimitry Andric       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
13290b57cec5SDimitry Andric   auto FlatNull =
13308bcb0991SDimitry Andric       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
13310b57cec5SDimitry Andric 
13328bcb0991SDimitry Andric   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
13338bcb0991SDimitry Andric   if (!ApertureReg.isValid())
13348bcb0991SDimitry Andric     return false;
13350b57cec5SDimitry Andric 
13360b57cec5SDimitry Andric   Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
13378bcb0991SDimitry Andric   B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));
13380b57cec5SDimitry Andric 
13390b57cec5SDimitry Andric   Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);
13400b57cec5SDimitry Andric 
13410b57cec5SDimitry Andric   // Coerce the type of the low half of the result so we can use merge_values.
13428bcb0991SDimitry Andric   Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
13438bcb0991SDimitry Andric   B.buildInstr(TargetOpcode::G_PTRTOINT)
13440b57cec5SDimitry Andric     .addDef(SrcAsInt)
13450b57cec5SDimitry Andric     .addUse(Src);
13460b57cec5SDimitry Andric 
13470b57cec5SDimitry Andric   // TODO: Should we allow mismatched types but matching sizes in merges to
13480b57cec5SDimitry Andric   // avoid the ptrtoint?
13498bcb0991SDimitry Andric   B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
13508bcb0991SDimitry Andric   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));
13510b57cec5SDimitry Andric 
13520b57cec5SDimitry Andric   MI.eraseFromParent();
13530b57cec5SDimitry Andric   return true;
13540b57cec5SDimitry Andric }
13550b57cec5SDimitry Andric 
13560b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeFrint(
13570b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
13588bcb0991SDimitry Andric   MachineIRBuilder &B) const {
13598bcb0991SDimitry Andric   B.setInstr(MI);
13600b57cec5SDimitry Andric 
13610b57cec5SDimitry Andric   Register Src = MI.getOperand(1).getReg();
13620b57cec5SDimitry Andric   LLT Ty = MRI.getType(Src);
13630b57cec5SDimitry Andric   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
13640b57cec5SDimitry Andric 
13650b57cec5SDimitry Andric   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
13660b57cec5SDimitry Andric   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
13670b57cec5SDimitry Andric 
13688bcb0991SDimitry Andric   auto C1 = B.buildFConstant(Ty, C1Val);
13698bcb0991SDimitry Andric   auto CopySign = B.buildFCopysign(Ty, C1, Src);
13700b57cec5SDimitry Andric 
13710b57cec5SDimitry Andric   // TODO: Should this propagate fast-math-flags?
13728bcb0991SDimitry Andric   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
13738bcb0991SDimitry Andric   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
13740b57cec5SDimitry Andric 
13758bcb0991SDimitry Andric   auto C2 = B.buildFConstant(Ty, C2Val);
13768bcb0991SDimitry Andric   auto Fabs = B.buildFAbs(Ty, Src);
13770b57cec5SDimitry Andric 
13788bcb0991SDimitry Andric   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
13798bcb0991SDimitry Andric   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
13800b57cec5SDimitry Andric   return true;
13810b57cec5SDimitry Andric }
13820b57cec5SDimitry Andric 
13830b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeFceil(
13840b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
13850b57cec5SDimitry Andric   MachineIRBuilder &B) const {
13860b57cec5SDimitry Andric   B.setInstr(MI);
13870b57cec5SDimitry Andric 
13880b57cec5SDimitry Andric   const LLT S1 = LLT::scalar(1);
13890b57cec5SDimitry Andric   const LLT S64 = LLT::scalar(64);
13900b57cec5SDimitry Andric 
13910b57cec5SDimitry Andric   Register Src = MI.getOperand(1).getReg();
13920b57cec5SDimitry Andric   assert(MRI.getType(Src) == S64);
13930b57cec5SDimitry Andric 
13940b57cec5SDimitry Andric   // result = trunc(src)
13950b57cec5SDimitry Andric   // if (src > 0.0 && src != result)
13960b57cec5SDimitry Andric   //   result += 1.0
13970b57cec5SDimitry Andric 
13980b57cec5SDimitry Andric   auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});
13990b57cec5SDimitry Andric 
14000b57cec5SDimitry Andric   const auto Zero = B.buildFConstant(S64, 0.0);
14010b57cec5SDimitry Andric   const auto One = B.buildFConstant(S64, 1.0);
14020b57cec5SDimitry Andric   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
14030b57cec5SDimitry Andric   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
14040b57cec5SDimitry Andric   auto And = B.buildAnd(S1, Lt0, NeTrunc);
14050b57cec5SDimitry Andric   auto Add = B.buildSelect(S64, And, One, Zero);
14060b57cec5SDimitry Andric 
14070b57cec5SDimitry Andric   // TODO: Should this propagate fast-math-flags?
14080b57cec5SDimitry Andric   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
14090b57cec5SDimitry Andric   return true;
14100b57cec5SDimitry Andric }
14110b57cec5SDimitry Andric 
14120b57cec5SDimitry Andric static MachineInstrBuilder extractF64Exponent(unsigned Hi,
14130b57cec5SDimitry Andric                                               MachineIRBuilder &B) {
14140b57cec5SDimitry Andric   const unsigned FractBits = 52;
14150b57cec5SDimitry Andric   const unsigned ExpBits = 11;
14160b57cec5SDimitry Andric   LLT S32 = LLT::scalar(32);
14170b57cec5SDimitry Andric 
14180b57cec5SDimitry Andric   auto Const0 = B.buildConstant(S32, FractBits - 32);
14190b57cec5SDimitry Andric   auto Const1 = B.buildConstant(S32, ExpBits);
14200b57cec5SDimitry Andric 
14210b57cec5SDimitry Andric   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
14220b57cec5SDimitry Andric     .addUse(Const0.getReg(0))
14230b57cec5SDimitry Andric     .addUse(Const1.getReg(0));
14240b57cec5SDimitry Andric 
14250b57cec5SDimitry Andric   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
14260b57cec5SDimitry Andric }
14270b57cec5SDimitry Andric 
14280b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
14290b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
14300b57cec5SDimitry Andric   MachineIRBuilder &B) const {
14310b57cec5SDimitry Andric   B.setInstr(MI);
14320b57cec5SDimitry Andric 
14330b57cec5SDimitry Andric   const LLT S1 = LLT::scalar(1);
14340b57cec5SDimitry Andric   const LLT S32 = LLT::scalar(32);
14350b57cec5SDimitry Andric   const LLT S64 = LLT::scalar(64);
14360b57cec5SDimitry Andric 
14370b57cec5SDimitry Andric   Register Src = MI.getOperand(1).getReg();
14380b57cec5SDimitry Andric   assert(MRI.getType(Src) == S64);
14390b57cec5SDimitry Andric 
14400b57cec5SDimitry Andric   // TODO: Should this use extract since the low half is unused?
14410b57cec5SDimitry Andric   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
14420b57cec5SDimitry Andric   Register Hi = Unmerge.getReg(1);
14430b57cec5SDimitry Andric 
14440b57cec5SDimitry Andric   // Extract the upper half, since this is where we will find the sign and
14450b57cec5SDimitry Andric   // exponent.
14460b57cec5SDimitry Andric   auto Exp = extractF64Exponent(Hi, B);
14470b57cec5SDimitry Andric 
14480b57cec5SDimitry Andric   const unsigned FractBits = 52;
14490b57cec5SDimitry Andric 
14500b57cec5SDimitry Andric   // Extract the sign bit.
14510b57cec5SDimitry Andric   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
14520b57cec5SDimitry Andric   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
14530b57cec5SDimitry Andric 
14540b57cec5SDimitry Andric   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
14550b57cec5SDimitry Andric 
14560b57cec5SDimitry Andric   const auto Zero32 = B.buildConstant(S32, 0);
14570b57cec5SDimitry Andric 
14580b57cec5SDimitry Andric   // Extend back to 64-bits.
14590b57cec5SDimitry Andric   auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});
14600b57cec5SDimitry Andric 
14610b57cec5SDimitry Andric   auto Shr = B.buildAShr(S64, FractMask, Exp);
14620b57cec5SDimitry Andric   auto Not = B.buildNot(S64, Shr);
14630b57cec5SDimitry Andric   auto Tmp0 = B.buildAnd(S64, Src, Not);
14640b57cec5SDimitry Andric   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
14650b57cec5SDimitry Andric 
14660b57cec5SDimitry Andric   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
14670b57cec5SDimitry Andric   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
14680b57cec5SDimitry Andric 
14690b57cec5SDimitry Andric   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
14700b57cec5SDimitry Andric   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
14710b57cec5SDimitry Andric   return true;
14720b57cec5SDimitry Andric }
14730b57cec5SDimitry Andric 
14740b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeITOFP(
14750b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
14760b57cec5SDimitry Andric   MachineIRBuilder &B, bool Signed) const {
14770b57cec5SDimitry Andric   B.setInstr(MI);
14780b57cec5SDimitry Andric 
14790b57cec5SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
14800b57cec5SDimitry Andric   Register Src = MI.getOperand(1).getReg();
14810b57cec5SDimitry Andric 
14820b57cec5SDimitry Andric   const LLT S64 = LLT::scalar(64);
14830b57cec5SDimitry Andric   const LLT S32 = LLT::scalar(32);
14840b57cec5SDimitry Andric 
14850b57cec5SDimitry Andric   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
14860b57cec5SDimitry Andric 
14870b57cec5SDimitry Andric   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
14880b57cec5SDimitry Andric 
14890b57cec5SDimitry Andric   auto CvtHi = Signed ?
14900b57cec5SDimitry Andric     B.buildSITOFP(S64, Unmerge.getReg(1)) :
14910b57cec5SDimitry Andric     B.buildUITOFP(S64, Unmerge.getReg(1));
14920b57cec5SDimitry Andric 
14930b57cec5SDimitry Andric   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
14940b57cec5SDimitry Andric 
14950b57cec5SDimitry Andric   auto ThirtyTwo = B.buildConstant(S32, 32);
14960b57cec5SDimitry Andric   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
14970b57cec5SDimitry Andric     .addUse(CvtHi.getReg(0))
14980b57cec5SDimitry Andric     .addUse(ThirtyTwo.getReg(0));
14990b57cec5SDimitry Andric 
15000b57cec5SDimitry Andric   // TODO: Should this propagate fast-math-flags?
15010b57cec5SDimitry Andric   B.buildFAdd(Dst, LdExp, CvtLo);
15020b57cec5SDimitry Andric   MI.eraseFromParent();
15030b57cec5SDimitry Andric   return true;
15040b57cec5SDimitry Andric }
15050b57cec5SDimitry Andric 
15060b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
15070b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
15080b57cec5SDimitry Andric   MachineIRBuilder &B) const {
15090b57cec5SDimitry Andric   MachineFunction &MF = B.getMF();
15100b57cec5SDimitry Andric   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
15110b57cec5SDimitry Andric 
15120b57cec5SDimitry Andric   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
15130b57cec5SDimitry Andric                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
15140b57cec5SDimitry Andric 
15150b57cec5SDimitry Andric   // With ieee_mode disabled, the instructions have the correct behavior
15160b57cec5SDimitry Andric   // already for G_FMINNUM/G_FMAXNUM
15170b57cec5SDimitry Andric   if (!MFI->getMode().IEEE)
15180b57cec5SDimitry Andric     return !IsIEEEOp;
15190b57cec5SDimitry Andric 
15200b57cec5SDimitry Andric   if (IsIEEEOp)
15210b57cec5SDimitry Andric     return true;
15220b57cec5SDimitry Andric 
15230b57cec5SDimitry Andric   MachineIRBuilder HelperBuilder(MI);
15240b57cec5SDimitry Andric   GISelObserverWrapper DummyObserver;
15250b57cec5SDimitry Andric   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
15268bcb0991SDimitry Andric   HelperBuilder.setInstr(MI);
15270b57cec5SDimitry Andric   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
15280b57cec5SDimitry Andric }
15290b57cec5SDimitry Andric 
15300b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
15310b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
15320b57cec5SDimitry Andric   MachineIRBuilder &B) const {
15330b57cec5SDimitry Andric   // TODO: Should move some of this into LegalizerHelper.
15340b57cec5SDimitry Andric 
15350b57cec5SDimitry Andric   // TODO: Promote dynamic indexing of s16 to s32
15360b57cec5SDimitry Andric   // TODO: Dynamic s64 indexing is only legal for SGPR.
15370b57cec5SDimitry Andric   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
15380b57cec5SDimitry Andric   if (!IdxVal) // Dynamic case will be selected to register indexing.
15390b57cec5SDimitry Andric     return true;
15400b57cec5SDimitry Andric 
15410b57cec5SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
15420b57cec5SDimitry Andric   Register Vec = MI.getOperand(1).getReg();
15430b57cec5SDimitry Andric 
15440b57cec5SDimitry Andric   LLT VecTy = MRI.getType(Vec);
15450b57cec5SDimitry Andric   LLT EltTy = VecTy.getElementType();
15460b57cec5SDimitry Andric   assert(EltTy == MRI.getType(Dst));
15470b57cec5SDimitry Andric 
15480b57cec5SDimitry Andric   B.setInstr(MI);
15490b57cec5SDimitry Andric 
15500b57cec5SDimitry Andric   if (IdxVal.getValue() < VecTy.getNumElements())
15510b57cec5SDimitry Andric     B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
15520b57cec5SDimitry Andric   else
15530b57cec5SDimitry Andric     B.buildUndef(Dst);
15540b57cec5SDimitry Andric 
15550b57cec5SDimitry Andric   MI.eraseFromParent();
15560b57cec5SDimitry Andric   return true;
15570b57cec5SDimitry Andric }
15580b57cec5SDimitry Andric 
15590b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
15600b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
15610b57cec5SDimitry Andric   MachineIRBuilder &B) const {
15620b57cec5SDimitry Andric   // TODO: Should move some of this into LegalizerHelper.
15630b57cec5SDimitry Andric 
15640b57cec5SDimitry Andric   // TODO: Promote dynamic indexing of s16 to s32
15650b57cec5SDimitry Andric   // TODO: Dynamic s64 indexing is only legal for SGPR.
15660b57cec5SDimitry Andric   Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
15670b57cec5SDimitry Andric   if (!IdxVal) // Dynamic case will be selected to register indexing.
15680b57cec5SDimitry Andric     return true;
15690b57cec5SDimitry Andric 
15700b57cec5SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
15710b57cec5SDimitry Andric   Register Vec = MI.getOperand(1).getReg();
15720b57cec5SDimitry Andric   Register Ins = MI.getOperand(2).getReg();
15730b57cec5SDimitry Andric 
15740b57cec5SDimitry Andric   LLT VecTy = MRI.getType(Vec);
15750b57cec5SDimitry Andric   LLT EltTy = VecTy.getElementType();
15760b57cec5SDimitry Andric   assert(EltTy == MRI.getType(Ins));
15770b57cec5SDimitry Andric 
15780b57cec5SDimitry Andric   B.setInstr(MI);
15790b57cec5SDimitry Andric 
15800b57cec5SDimitry Andric   if (IdxVal.getValue() < VecTy.getNumElements())
15810b57cec5SDimitry Andric     B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
15820b57cec5SDimitry Andric   else
15830b57cec5SDimitry Andric     B.buildUndef(Dst);
15840b57cec5SDimitry Andric 
15850b57cec5SDimitry Andric   MI.eraseFromParent();
15860b57cec5SDimitry Andric   return true;
15870b57cec5SDimitry Andric }
15880b57cec5SDimitry Andric 
15898bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeSinCos(
15908bcb0991SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
15918bcb0991SDimitry Andric   MachineIRBuilder &B) const {
15928bcb0991SDimitry Andric   B.setInstr(MI);
15938bcb0991SDimitry Andric 
15948bcb0991SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
15958bcb0991SDimitry Andric   Register SrcReg = MI.getOperand(1).getReg();
15968bcb0991SDimitry Andric   LLT Ty = MRI.getType(DstReg);
15978bcb0991SDimitry Andric   unsigned Flags = MI.getFlags();
15988bcb0991SDimitry Andric 
15998bcb0991SDimitry Andric   Register TrigVal;
16008bcb0991SDimitry Andric   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
16018bcb0991SDimitry Andric   if (ST.hasTrigReducedRange()) {
16028bcb0991SDimitry Andric     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
16038bcb0991SDimitry Andric     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
16048bcb0991SDimitry Andric       .addUse(MulVal.getReg(0))
16058bcb0991SDimitry Andric       .setMIFlags(Flags).getReg(0);
16068bcb0991SDimitry Andric   } else
16078bcb0991SDimitry Andric     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
16088bcb0991SDimitry Andric 
16098bcb0991SDimitry Andric   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
16108bcb0991SDimitry Andric     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
16118bcb0991SDimitry Andric   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
16128bcb0991SDimitry Andric     .addUse(TrigVal)
16138bcb0991SDimitry Andric     .setMIFlags(Flags);
16148bcb0991SDimitry Andric   MI.eraseFromParent();
16158bcb0991SDimitry Andric   return true;
16168bcb0991SDimitry Andric }
16178bcb0991SDimitry Andric 
16188bcb0991SDimitry Andric bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
16198bcb0991SDimitry Andric   Register DstReg, LLT PtrTy,
16208bcb0991SDimitry Andric   MachineIRBuilder &B, const GlobalValue *GV,
16218bcb0991SDimitry Andric   unsigned Offset, unsigned GAFlags) const {
16228bcb0991SDimitry Andric   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
16238bcb0991SDimitry Andric   // to the following code sequence:
16248bcb0991SDimitry Andric   //
16258bcb0991SDimitry Andric   // For constant address space:
16268bcb0991SDimitry Andric   //   s_getpc_b64 s[0:1]
16278bcb0991SDimitry Andric   //   s_add_u32 s0, s0, $symbol
16288bcb0991SDimitry Andric   //   s_addc_u32 s1, s1, 0
16298bcb0991SDimitry Andric   //
16308bcb0991SDimitry Andric   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
16318bcb0991SDimitry Andric   //   a fixup or relocation is emitted to replace $symbol with a literal
16328bcb0991SDimitry Andric   //   constant, which is a pc-relative offset from the encoding of the $symbol
16338bcb0991SDimitry Andric   //   operand to the global variable.
16348bcb0991SDimitry Andric   //
16358bcb0991SDimitry Andric   // For global address space:
16368bcb0991SDimitry Andric   //   s_getpc_b64 s[0:1]
16378bcb0991SDimitry Andric   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
16388bcb0991SDimitry Andric   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
16398bcb0991SDimitry Andric   //
16408bcb0991SDimitry Andric   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
16418bcb0991SDimitry Andric   //   fixups or relocations are emitted to replace $symbol@*@lo and
16428bcb0991SDimitry Andric   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
16438bcb0991SDimitry Andric   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
16448bcb0991SDimitry Andric   //   operand to the global variable.
16458bcb0991SDimitry Andric   //
16468bcb0991SDimitry Andric   // What we want here is an offset from the value returned by s_getpc
16478bcb0991SDimitry Andric   // (which is the address of the s_add_u32 instruction) to the global
16488bcb0991SDimitry Andric   // variable, but since the encoding of $symbol starts 4 bytes after the start
16498bcb0991SDimitry Andric   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
16508bcb0991SDimitry Andric   // small. This requires us to add 4 to the global variable offset in order to
16518bcb0991SDimitry Andric   // compute the correct address.
16528bcb0991SDimitry Andric 
16538bcb0991SDimitry Andric   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
16548bcb0991SDimitry Andric 
16558bcb0991SDimitry Andric   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
16568bcb0991SDimitry Andric     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
16578bcb0991SDimitry Andric 
16588bcb0991SDimitry Andric   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
16598bcb0991SDimitry Andric     .addDef(PCReg);
16608bcb0991SDimitry Andric 
16618bcb0991SDimitry Andric   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
16628bcb0991SDimitry Andric   if (GAFlags == SIInstrInfo::MO_NONE)
16638bcb0991SDimitry Andric     MIB.addImm(0);
16648bcb0991SDimitry Andric   else
16658bcb0991SDimitry Andric     MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
16668bcb0991SDimitry Andric 
16678bcb0991SDimitry Andric   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
16688bcb0991SDimitry Andric 
16698bcb0991SDimitry Andric   if (PtrTy.getSizeInBits() == 32)
16708bcb0991SDimitry Andric     B.buildExtract(DstReg, PCReg, 0);
16718bcb0991SDimitry Andric   return true;
16728bcb0991SDimitry Andric  }
16738bcb0991SDimitry Andric 
16748bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeGlobalValue(
16758bcb0991SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
16768bcb0991SDimitry Andric   MachineIRBuilder &B) const {
16778bcb0991SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
16788bcb0991SDimitry Andric   LLT Ty = MRI.getType(DstReg);
16798bcb0991SDimitry Andric   unsigned AS = Ty.getAddressSpace();
16808bcb0991SDimitry Andric 
16818bcb0991SDimitry Andric   const GlobalValue *GV = MI.getOperand(1).getGlobal();
16828bcb0991SDimitry Andric   MachineFunction &MF = B.getMF();
16838bcb0991SDimitry Andric   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
16848bcb0991SDimitry Andric   B.setInstr(MI);
16858bcb0991SDimitry Andric 
16868bcb0991SDimitry Andric   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
16878bcb0991SDimitry Andric     if (!MFI->isEntryFunction()) {
16888bcb0991SDimitry Andric       const Function &Fn = MF.getFunction();
16898bcb0991SDimitry Andric       DiagnosticInfoUnsupported BadLDSDecl(
16908bcb0991SDimitry Andric         Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
16918bcb0991SDimitry Andric       Fn.getContext().diagnose(BadLDSDecl);
16928bcb0991SDimitry Andric     }
16938bcb0991SDimitry Andric 
16948bcb0991SDimitry Andric     // TODO: We could emit code to handle the initialization somewhere.
16958bcb0991SDimitry Andric     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
16968bcb0991SDimitry Andric       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
16978bcb0991SDimitry Andric       MI.eraseFromParent();
16988bcb0991SDimitry Andric       return true;
16998bcb0991SDimitry Andric     }
17008bcb0991SDimitry Andric 
17018bcb0991SDimitry Andric     const Function &Fn = MF.getFunction();
17028bcb0991SDimitry Andric     DiagnosticInfoUnsupported BadInit(
17038bcb0991SDimitry Andric       Fn, "unsupported initializer for address space", MI.getDebugLoc());
17048bcb0991SDimitry Andric     Fn.getContext().diagnose(BadInit);
17058bcb0991SDimitry Andric     return true;
17068bcb0991SDimitry Andric   }
17078bcb0991SDimitry Andric 
17088bcb0991SDimitry Andric   const SITargetLowering *TLI = ST.getTargetLowering();
17098bcb0991SDimitry Andric 
17108bcb0991SDimitry Andric   if (TLI->shouldEmitFixup(GV)) {
17118bcb0991SDimitry Andric     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
17128bcb0991SDimitry Andric     MI.eraseFromParent();
17138bcb0991SDimitry Andric     return true;
17148bcb0991SDimitry Andric   }
17158bcb0991SDimitry Andric 
17168bcb0991SDimitry Andric   if (TLI->shouldEmitPCReloc(GV)) {
17178bcb0991SDimitry Andric     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
17188bcb0991SDimitry Andric     MI.eraseFromParent();
17198bcb0991SDimitry Andric     return true;
17208bcb0991SDimitry Andric   }
17218bcb0991SDimitry Andric 
17228bcb0991SDimitry Andric   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
17238bcb0991SDimitry Andric   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
17248bcb0991SDimitry Andric 
17258bcb0991SDimitry Andric   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
17268bcb0991SDimitry Andric     MachinePointerInfo::getGOT(MF),
17278bcb0991SDimitry Andric     MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
17288bcb0991SDimitry Andric     MachineMemOperand::MOInvariant,
17298bcb0991SDimitry Andric     8 /*Size*/, 8 /*Align*/);
17308bcb0991SDimitry Andric 
17318bcb0991SDimitry Andric   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
17328bcb0991SDimitry Andric 
17338bcb0991SDimitry Andric   if (Ty.getSizeInBits() == 32) {
17348bcb0991SDimitry Andric     // Truncate if this is a 32-bit constant adrdess.
17358bcb0991SDimitry Andric     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
17368bcb0991SDimitry Andric     B.buildExtract(DstReg, Load, 0);
17378bcb0991SDimitry Andric   } else
17388bcb0991SDimitry Andric     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
17398bcb0991SDimitry Andric 
17408bcb0991SDimitry Andric   MI.eraseFromParent();
17418bcb0991SDimitry Andric   return true;
17428bcb0991SDimitry Andric }
17438bcb0991SDimitry Andric 
17448bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeLoad(
17458bcb0991SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
17468bcb0991SDimitry Andric   MachineIRBuilder &B, GISelChangeObserver &Observer) const {
17478bcb0991SDimitry Andric   B.setInstr(MI);
17488bcb0991SDimitry Andric   LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
17498bcb0991SDimitry Andric   auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
17508bcb0991SDimitry Andric   Observer.changingInstr(MI);
17518bcb0991SDimitry Andric   MI.getOperand(1).setReg(Cast.getReg(0));
17528bcb0991SDimitry Andric   Observer.changedInstr(MI);
17538bcb0991SDimitry Andric   return true;
17548bcb0991SDimitry Andric }
17558bcb0991SDimitry Andric 
17568bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFMad(
17578bcb0991SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
17588bcb0991SDimitry Andric   MachineIRBuilder &B) const {
17598bcb0991SDimitry Andric   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
17608bcb0991SDimitry Andric   assert(Ty.isScalar());
17618bcb0991SDimitry Andric 
1762*480093f4SDimitry Andric   MachineFunction &MF = B.getMF();
1763*480093f4SDimitry Andric   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1764*480093f4SDimitry Andric 
17658bcb0991SDimitry Andric   // TODO: Always legal with future ftz flag.
1766*480093f4SDimitry Andric   if (Ty == LLT::scalar(32) && !MFI->getMode().FP32Denormals)
17678bcb0991SDimitry Andric     return true;
1768*480093f4SDimitry Andric   if (Ty == LLT::scalar(16) && !MFI->getMode().FP64FP16Denormals)
17698bcb0991SDimitry Andric     return true;
17708bcb0991SDimitry Andric 
17718bcb0991SDimitry Andric 
17728bcb0991SDimitry Andric   MachineIRBuilder HelperBuilder(MI);
17738bcb0991SDimitry Andric   GISelObserverWrapper DummyObserver;
17748bcb0991SDimitry Andric   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
17758bcb0991SDimitry Andric   HelperBuilder.setMBB(*MI.getParent());
17768bcb0991SDimitry Andric   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
17778bcb0991SDimitry Andric }
17788bcb0991SDimitry Andric 
1779*480093f4SDimitry Andric bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
1780*480093f4SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
1781*480093f4SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
1782*480093f4SDimitry Andric   Register PtrReg = MI.getOperand(1).getReg();
1783*480093f4SDimitry Andric   Register CmpVal = MI.getOperand(2).getReg();
1784*480093f4SDimitry Andric   Register NewVal = MI.getOperand(3).getReg();
1785*480093f4SDimitry Andric 
1786*480093f4SDimitry Andric   assert(SITargetLowering::isFlatGlobalAddrSpace(
1787*480093f4SDimitry Andric            MRI.getType(PtrReg).getAddressSpace()) &&
1788*480093f4SDimitry Andric          "this should not have been custom lowered");
1789*480093f4SDimitry Andric 
1790*480093f4SDimitry Andric   LLT ValTy = MRI.getType(CmpVal);
1791*480093f4SDimitry Andric   LLT VecTy = LLT::vector(2, ValTy);
1792*480093f4SDimitry Andric 
1793*480093f4SDimitry Andric   B.setInstr(MI);
1794*480093f4SDimitry Andric   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
1795*480093f4SDimitry Andric 
1796*480093f4SDimitry Andric   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
1797*480093f4SDimitry Andric     .addDef(DstReg)
1798*480093f4SDimitry Andric     .addUse(PtrReg)
1799*480093f4SDimitry Andric     .addUse(PackedVal)
1800*480093f4SDimitry Andric     .setMemRefs(MI.memoperands());
1801*480093f4SDimitry Andric 
1802*480093f4SDimitry Andric   MI.eraseFromParent();
1803*480093f4SDimitry Andric   return true;
1804*480093f4SDimitry Andric }
1805*480093f4SDimitry Andric 
18060b57cec5SDimitry Andric // Return the use branch instruction, otherwise null if the usage is invalid.
18070b57cec5SDimitry Andric static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
1808*480093f4SDimitry Andric                                        MachineRegisterInfo &MRI,
1809*480093f4SDimitry Andric                                        MachineInstr *&Br) {
18100b57cec5SDimitry Andric   Register CondDef = MI.getOperand(0).getReg();
18110b57cec5SDimitry Andric   if (!MRI.hasOneNonDBGUse(CondDef))
18120b57cec5SDimitry Andric     return nullptr;
18130b57cec5SDimitry Andric 
18140b57cec5SDimitry Andric   MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
1815*480093f4SDimitry Andric   if (UseMI.getParent() != MI.getParent() ||
1816*480093f4SDimitry Andric       UseMI.getOpcode() != AMDGPU::G_BRCOND)
1817*480093f4SDimitry Andric     return nullptr;
1818*480093f4SDimitry Andric 
1819*480093f4SDimitry Andric   // Make sure the cond br is followed by a G_BR
1820*480093f4SDimitry Andric   MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
1821*480093f4SDimitry Andric   if (Next != MI.getParent()->end()) {
1822*480093f4SDimitry Andric     if (Next->getOpcode() != AMDGPU::G_BR)
1823*480093f4SDimitry Andric       return nullptr;
1824*480093f4SDimitry Andric     Br = &*Next;
1825*480093f4SDimitry Andric   }
1826*480093f4SDimitry Andric 
1827*480093f4SDimitry Andric   return &UseMI;
18280b57cec5SDimitry Andric }
18290b57cec5SDimitry Andric 
18300b57cec5SDimitry Andric Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
18310b57cec5SDimitry Andric                                                 Register Reg, LLT Ty) const {
18320b57cec5SDimitry Andric   Register LiveIn = MRI.getLiveInVirtReg(Reg);
18330b57cec5SDimitry Andric   if (LiveIn)
18340b57cec5SDimitry Andric     return LiveIn;
18350b57cec5SDimitry Andric 
18360b57cec5SDimitry Andric   Register NewReg = MRI.createGenericVirtualRegister(Ty);
18370b57cec5SDimitry Andric   MRI.addLiveIn(Reg, NewReg);
18380b57cec5SDimitry Andric   return NewReg;
18390b57cec5SDimitry Andric }
18400b57cec5SDimitry Andric 
18410b57cec5SDimitry Andric bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
18420b57cec5SDimitry Andric                                          const ArgDescriptor *Arg) const {
18438bcb0991SDimitry Andric   if (!Arg->isRegister() || !Arg->getRegister().isValid())
18440b57cec5SDimitry Andric     return false; // TODO: Handle these
18450b57cec5SDimitry Andric 
18460b57cec5SDimitry Andric   assert(Arg->getRegister().isPhysical());
18470b57cec5SDimitry Andric 
18480b57cec5SDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
18490b57cec5SDimitry Andric 
18500b57cec5SDimitry Andric   LLT Ty = MRI.getType(DstReg);
18510b57cec5SDimitry Andric   Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
18520b57cec5SDimitry Andric 
18530b57cec5SDimitry Andric   if (Arg->isMasked()) {
18540b57cec5SDimitry Andric     // TODO: Should we try to emit this once in the entry block?
18550b57cec5SDimitry Andric     const LLT S32 = LLT::scalar(32);
18560b57cec5SDimitry Andric     const unsigned Mask = Arg->getMask();
18570b57cec5SDimitry Andric     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
18580b57cec5SDimitry Andric 
18598bcb0991SDimitry Andric     Register AndMaskSrc = LiveIn;
18608bcb0991SDimitry Andric 
18618bcb0991SDimitry Andric     if (Shift != 0) {
18620b57cec5SDimitry Andric       auto ShiftAmt = B.buildConstant(S32, Shift);
18638bcb0991SDimitry Andric       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
18648bcb0991SDimitry Andric     }
18658bcb0991SDimitry Andric 
18668bcb0991SDimitry Andric     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
18670b57cec5SDimitry Andric   } else
18680b57cec5SDimitry Andric     B.buildCopy(DstReg, LiveIn);
18690b57cec5SDimitry Andric 
18700b57cec5SDimitry Andric   // Insert the argument copy if it doens't already exist.
18710b57cec5SDimitry Andric   // FIXME: It seems EmitLiveInCopies isn't called anywhere?
18720b57cec5SDimitry Andric   if (!MRI.getVRegDef(LiveIn)) {
18738bcb0991SDimitry Andric     // FIXME: Should have scoped insert pt
18748bcb0991SDimitry Andric     MachineBasicBlock &OrigInsBB = B.getMBB();
18758bcb0991SDimitry Andric     auto OrigInsPt = B.getInsertPt();
18768bcb0991SDimitry Andric 
18770b57cec5SDimitry Andric     MachineBasicBlock &EntryMBB = B.getMF().front();
18780b57cec5SDimitry Andric     EntryMBB.addLiveIn(Arg->getRegister());
18790b57cec5SDimitry Andric     B.setInsertPt(EntryMBB, EntryMBB.begin());
18800b57cec5SDimitry Andric     B.buildCopy(LiveIn, Arg->getRegister());
18818bcb0991SDimitry Andric 
18828bcb0991SDimitry Andric     B.setInsertPt(OrigInsBB, OrigInsPt);
18830b57cec5SDimitry Andric   }
18840b57cec5SDimitry Andric 
18850b57cec5SDimitry Andric   return true;
18860b57cec5SDimitry Andric }
18870b57cec5SDimitry Andric 
18880b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
18890b57cec5SDimitry Andric   MachineInstr &MI,
18900b57cec5SDimitry Andric   MachineRegisterInfo &MRI,
18910b57cec5SDimitry Andric   MachineIRBuilder &B,
18920b57cec5SDimitry Andric   AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
18930b57cec5SDimitry Andric   B.setInstr(MI);
18940b57cec5SDimitry Andric 
18950b57cec5SDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
18960b57cec5SDimitry Andric 
18970b57cec5SDimitry Andric   const ArgDescriptor *Arg;
18980b57cec5SDimitry Andric   const TargetRegisterClass *RC;
18990b57cec5SDimitry Andric   std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
19000b57cec5SDimitry Andric   if (!Arg) {
19010b57cec5SDimitry Andric     LLVM_DEBUG(dbgs() << "Required arg register missing\n");
19020b57cec5SDimitry Andric     return false;
19030b57cec5SDimitry Andric   }
19040b57cec5SDimitry Andric 
19050b57cec5SDimitry Andric   if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
19060b57cec5SDimitry Andric     MI.eraseFromParent();
19070b57cec5SDimitry Andric     return true;
19080b57cec5SDimitry Andric   }
19090b57cec5SDimitry Andric 
19100b57cec5SDimitry Andric   return false;
19110b57cec5SDimitry Andric }
19120b57cec5SDimitry Andric 
19138bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
19148bcb0991SDimitry Andric                                        MachineRegisterInfo &MRI,
19158bcb0991SDimitry Andric                                        MachineIRBuilder &B) const {
19168bcb0991SDimitry Andric   B.setInstr(MI);
1917*480093f4SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
1918*480093f4SDimitry Andric   LLT DstTy = MRI.getType(Dst);
1919*480093f4SDimitry Andric   LLT S16 = LLT::scalar(16);
1920*480093f4SDimitry Andric   LLT S32 = LLT::scalar(32);
1921*480093f4SDimitry Andric   LLT S64 = LLT::scalar(64);
19228bcb0991SDimitry Andric 
19238bcb0991SDimitry Andric   if (legalizeFastUnsafeFDIV(MI, MRI, B))
19248bcb0991SDimitry Andric     return true;
19258bcb0991SDimitry Andric 
1926*480093f4SDimitry Andric   if (DstTy == S16)
1927*480093f4SDimitry Andric     return legalizeFDIV16(MI, MRI, B);
1928*480093f4SDimitry Andric   if (DstTy == S32)
1929*480093f4SDimitry Andric     return legalizeFDIV32(MI, MRI, B);
1930*480093f4SDimitry Andric   if (DstTy == S64)
1931*480093f4SDimitry Andric     return legalizeFDIV64(MI, MRI, B);
1932*480093f4SDimitry Andric 
19338bcb0991SDimitry Andric   return false;
19348bcb0991SDimitry Andric }
19358bcb0991SDimitry Andric 
19368bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
19378bcb0991SDimitry Andric                                                  MachineRegisterInfo &MRI,
19388bcb0991SDimitry Andric                                                  MachineIRBuilder &B) const {
19398bcb0991SDimitry Andric   Register Res = MI.getOperand(0).getReg();
19408bcb0991SDimitry Andric   Register LHS = MI.getOperand(1).getReg();
19418bcb0991SDimitry Andric   Register RHS = MI.getOperand(2).getReg();
19428bcb0991SDimitry Andric 
19438bcb0991SDimitry Andric   uint16_t Flags = MI.getFlags();
19448bcb0991SDimitry Andric 
19458bcb0991SDimitry Andric   LLT ResTy = MRI.getType(Res);
19468bcb0991SDimitry Andric   LLT S32 = LLT::scalar(32);
19478bcb0991SDimitry Andric   LLT S64 = LLT::scalar(64);
19488bcb0991SDimitry Andric 
19498bcb0991SDimitry Andric   const MachineFunction &MF = B.getMF();
19508bcb0991SDimitry Andric   bool Unsafe =
19518bcb0991SDimitry Andric     MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
19528bcb0991SDimitry Andric 
19538bcb0991SDimitry Andric   if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
19548bcb0991SDimitry Andric     return false;
19558bcb0991SDimitry Andric 
1956*480093f4SDimitry Andric   if (!Unsafe && ResTy == S32 &&
1957*480093f4SDimitry Andric       MF.getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals)
19588bcb0991SDimitry Andric     return false;
19598bcb0991SDimitry Andric 
19608bcb0991SDimitry Andric   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
19618bcb0991SDimitry Andric     // 1 / x -> RCP(x)
19628bcb0991SDimitry Andric     if (CLHS->isExactlyValue(1.0)) {
19638bcb0991SDimitry Andric       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
19648bcb0991SDimitry Andric         .addUse(RHS)
19658bcb0991SDimitry Andric         .setMIFlags(Flags);
19668bcb0991SDimitry Andric 
19678bcb0991SDimitry Andric       MI.eraseFromParent();
19688bcb0991SDimitry Andric       return true;
19698bcb0991SDimitry Andric     }
19708bcb0991SDimitry Andric 
19718bcb0991SDimitry Andric     // -1 / x -> RCP( FNEG(x) )
19728bcb0991SDimitry Andric     if (CLHS->isExactlyValue(-1.0)) {
19738bcb0991SDimitry Andric       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
19748bcb0991SDimitry Andric       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
19758bcb0991SDimitry Andric         .addUse(FNeg.getReg(0))
19768bcb0991SDimitry Andric         .setMIFlags(Flags);
19778bcb0991SDimitry Andric 
19788bcb0991SDimitry Andric       MI.eraseFromParent();
19798bcb0991SDimitry Andric       return true;
19808bcb0991SDimitry Andric     }
19818bcb0991SDimitry Andric   }
19828bcb0991SDimitry Andric 
19838bcb0991SDimitry Andric   // x / y -> x * (1.0 / y)
19848bcb0991SDimitry Andric   if (Unsafe) {
19858bcb0991SDimitry Andric     auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
19868bcb0991SDimitry Andric       .addUse(RHS)
19878bcb0991SDimitry Andric       .setMIFlags(Flags);
19888bcb0991SDimitry Andric     B.buildFMul(Res, LHS, RCP, Flags);
19898bcb0991SDimitry Andric 
19908bcb0991SDimitry Andric     MI.eraseFromParent();
19918bcb0991SDimitry Andric     return true;
19928bcb0991SDimitry Andric   }
19938bcb0991SDimitry Andric 
19948bcb0991SDimitry Andric   return false;
19958bcb0991SDimitry Andric }
19968bcb0991SDimitry Andric 
1997*480093f4SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
1998*480093f4SDimitry Andric                                          MachineRegisterInfo &MRI,
1999*480093f4SDimitry Andric                                          MachineIRBuilder &B) const {
2000*480093f4SDimitry Andric   B.setInstr(MI);
2001*480093f4SDimitry Andric   Register Res = MI.getOperand(0).getReg();
2002*480093f4SDimitry Andric   Register LHS = MI.getOperand(1).getReg();
2003*480093f4SDimitry Andric   Register RHS = MI.getOperand(2).getReg();
2004*480093f4SDimitry Andric 
2005*480093f4SDimitry Andric   uint16_t Flags = MI.getFlags();
2006*480093f4SDimitry Andric 
2007*480093f4SDimitry Andric   LLT S16 = LLT::scalar(16);
2008*480093f4SDimitry Andric   LLT S32 = LLT::scalar(32);
2009*480093f4SDimitry Andric 
2010*480093f4SDimitry Andric   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
2011*480093f4SDimitry Andric   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
2012*480093f4SDimitry Andric 
2013*480093f4SDimitry Andric   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2014*480093f4SDimitry Andric     .addUse(RHSExt.getReg(0))
2015*480093f4SDimitry Andric     .setMIFlags(Flags);
2016*480093f4SDimitry Andric 
2017*480093f4SDimitry Andric   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
2018*480093f4SDimitry Andric   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
2019*480093f4SDimitry Andric 
2020*480093f4SDimitry Andric   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2021*480093f4SDimitry Andric     .addUse(RDst.getReg(0))
2022*480093f4SDimitry Andric     .addUse(RHS)
2023*480093f4SDimitry Andric     .addUse(LHS)
2024*480093f4SDimitry Andric     .setMIFlags(Flags);
2025*480093f4SDimitry Andric 
2026*480093f4SDimitry Andric   MI.eraseFromParent();
2027*480093f4SDimitry Andric   return true;
2028*480093f4SDimitry Andric }
2029*480093f4SDimitry Andric 
2030*480093f4SDimitry Andric // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
2031*480093f4SDimitry Andric // to enable denorm mode. When 'Enable' is false, disable denorm mode.
2032*480093f4SDimitry Andric static void toggleSPDenormMode(bool Enable,
2033*480093f4SDimitry Andric                                MachineIRBuilder &B,
2034*480093f4SDimitry Andric                                const GCNSubtarget &ST,
2035*480093f4SDimitry Andric                                AMDGPU::SIModeRegisterDefaults Mode) {
2036*480093f4SDimitry Andric   // Set SP denorm mode to this value.
2037*480093f4SDimitry Andric   unsigned SPDenormMode =
2038*480093f4SDimitry Andric     Enable ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
2039*480093f4SDimitry Andric 
2040*480093f4SDimitry Andric   if (ST.hasDenormModeInst()) {
2041*480093f4SDimitry Andric     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
2042*480093f4SDimitry Andric     unsigned DPDenormModeDefault = Mode.FP64FP16Denormals
2043*480093f4SDimitry Andric                                    ? FP_DENORM_FLUSH_NONE
2044*480093f4SDimitry Andric                                    : FP_DENORM_FLUSH_IN_FLUSH_OUT;
2045*480093f4SDimitry Andric 
2046*480093f4SDimitry Andric     unsigned NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
2047*480093f4SDimitry Andric     B.buildInstr(AMDGPU::S_DENORM_MODE)
2048*480093f4SDimitry Andric       .addImm(NewDenormModeValue);
2049*480093f4SDimitry Andric 
2050*480093f4SDimitry Andric   } else {
2051*480093f4SDimitry Andric     // Select FP32 bit field in mode register.
2052*480093f4SDimitry Andric     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
2053*480093f4SDimitry Andric                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
2054*480093f4SDimitry Andric                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
2055*480093f4SDimitry Andric 
2056*480093f4SDimitry Andric     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
2057*480093f4SDimitry Andric       .addImm(SPDenormMode)
2058*480093f4SDimitry Andric       .addImm(SPDenormModeBitField);
2059*480093f4SDimitry Andric   }
2060*480093f4SDimitry Andric }
2061*480093f4SDimitry Andric 
2062*480093f4SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
2063*480093f4SDimitry Andric                                          MachineRegisterInfo &MRI,
2064*480093f4SDimitry Andric                                          MachineIRBuilder &B) const {
2065*480093f4SDimitry Andric   B.setInstr(MI);
2066*480093f4SDimitry Andric   Register Res = MI.getOperand(0).getReg();
2067*480093f4SDimitry Andric   Register LHS = MI.getOperand(1).getReg();
2068*480093f4SDimitry Andric   Register RHS = MI.getOperand(2).getReg();
2069*480093f4SDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2070*480093f4SDimitry Andric   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
2071*480093f4SDimitry Andric 
2072*480093f4SDimitry Andric   uint16_t Flags = MI.getFlags();
2073*480093f4SDimitry Andric 
2074*480093f4SDimitry Andric   LLT S32 = LLT::scalar(32);
2075*480093f4SDimitry Andric   LLT S1 = LLT::scalar(1);
2076*480093f4SDimitry Andric 
2077*480093f4SDimitry Andric   auto One = B.buildFConstant(S32, 1.0f);
2078*480093f4SDimitry Andric 
2079*480093f4SDimitry Andric   auto DenominatorScaled =
2080*480093f4SDimitry Andric     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2081*480093f4SDimitry Andric       .addUse(RHS)
2082*480093f4SDimitry Andric       .addUse(LHS)
2083*480093f4SDimitry Andric       .addImm(1)
2084*480093f4SDimitry Andric       .setMIFlags(Flags);
2085*480093f4SDimitry Andric   auto NumeratorScaled =
2086*480093f4SDimitry Andric     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
2087*480093f4SDimitry Andric       .addUse(LHS)
2088*480093f4SDimitry Andric       .addUse(RHS)
2089*480093f4SDimitry Andric       .addImm(0)
2090*480093f4SDimitry Andric       .setMIFlags(Flags);
2091*480093f4SDimitry Andric 
2092*480093f4SDimitry Andric   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
2093*480093f4SDimitry Andric     .addUse(DenominatorScaled.getReg(0))
2094*480093f4SDimitry Andric     .setMIFlags(Flags);
2095*480093f4SDimitry Andric   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
2096*480093f4SDimitry Andric 
2097*480093f4SDimitry Andric   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
2098*480093f4SDimitry Andric   // aren't modeled as reading it.
2099*480093f4SDimitry Andric   if (!Mode.FP32Denormals)
2100*480093f4SDimitry Andric     toggleSPDenormMode(true, B, ST, Mode);
2101*480093f4SDimitry Andric 
2102*480093f4SDimitry Andric   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
2103*480093f4SDimitry Andric   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
2104*480093f4SDimitry Andric   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
2105*480093f4SDimitry Andric   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
2106*480093f4SDimitry Andric   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
2107*480093f4SDimitry Andric   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
2108*480093f4SDimitry Andric 
2109*480093f4SDimitry Andric   if (!Mode.FP32Denormals)
2110*480093f4SDimitry Andric     toggleSPDenormMode(false, B, ST, Mode);
2111*480093f4SDimitry Andric 
2112*480093f4SDimitry Andric   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
2113*480093f4SDimitry Andric     .addUse(Fma4.getReg(0))
2114*480093f4SDimitry Andric     .addUse(Fma1.getReg(0))
2115*480093f4SDimitry Andric     .addUse(Fma3.getReg(0))
2116*480093f4SDimitry Andric     .addUse(NumeratorScaled.getReg(1))
2117*480093f4SDimitry Andric     .setMIFlags(Flags);
2118*480093f4SDimitry Andric 
2119*480093f4SDimitry Andric   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
2120*480093f4SDimitry Andric     .addUse(Fmas.getReg(0))
2121*480093f4SDimitry Andric     .addUse(RHS)
2122*480093f4SDimitry Andric     .addUse(LHS)
2123*480093f4SDimitry Andric     .setMIFlags(Flags);
2124*480093f4SDimitry Andric 
2125*480093f4SDimitry Andric   MI.eraseFromParent();
2126*480093f4SDimitry Andric   return true;
2127*480093f4SDimitry Andric }
2128*480093f4SDimitry Andric 
2129*480093f4SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
2130*480093f4SDimitry Andric                                          MachineRegisterInfo &MRI,
2131*480093f4SDimitry Andric                                          MachineIRBuilder &B) const {
2132*480093f4SDimitry Andric   B.setInstr(MI);
2133*480093f4SDimitry Andric   Register Res = MI.getOperand(0).getReg();
2134*480093f4SDimitry Andric   Register LHS = MI.getOperand(1).getReg();
2135*480093f4SDimitry Andric   Register RHS = MI.getOperand(2).getReg();
2136*480093f4SDimitry Andric 
2137*480093f4SDimitry Andric   uint16_t Flags = MI.getFlags();
2138*480093f4SDimitry Andric 
2139*480093f4SDimitry Andric   LLT S64 = LLT::scalar(64);
2140*480093f4SDimitry Andric   LLT S1 = LLT::scalar(1);
2141*480093f4SDimitry Andric 
2142*480093f4SDimitry Andric   auto One = B.buildFConstant(S64, 1.0);
2143*480093f4SDimitry Andric 
2144*480093f4SDimitry Andric   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2145*480093f4SDimitry Andric     .addUse(LHS)
2146*480093f4SDimitry Andric     .addUse(RHS)
2147*480093f4SDimitry Andric     .addImm(1)
2148*480093f4SDimitry Andric     .setMIFlags(Flags);
2149*480093f4SDimitry Andric 
2150*480093f4SDimitry Andric   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
2151*480093f4SDimitry Andric 
2152*480093f4SDimitry Andric   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
2153*480093f4SDimitry Andric     .addUse(DivScale0.getReg(0))
2154*480093f4SDimitry Andric     .setMIFlags(Flags);
2155*480093f4SDimitry Andric 
2156*480093f4SDimitry Andric   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
2157*480093f4SDimitry Andric   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
2158*480093f4SDimitry Andric   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
2159*480093f4SDimitry Andric 
2160*480093f4SDimitry Andric   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
2161*480093f4SDimitry Andric     .addUse(LHS)
2162*480093f4SDimitry Andric     .addUse(RHS)
2163*480093f4SDimitry Andric     .addImm(0)
2164*480093f4SDimitry Andric     .setMIFlags(Flags);
2165*480093f4SDimitry Andric 
2166*480093f4SDimitry Andric   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
2167*480093f4SDimitry Andric   auto Mul = B.buildMul(S64, DivScale1.getReg(0), Fma3, Flags);
2168*480093f4SDimitry Andric   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
2169*480093f4SDimitry Andric 
2170*480093f4SDimitry Andric   Register Scale;
2171*480093f4SDimitry Andric   if (!ST.hasUsableDivScaleConditionOutput()) {
2172*480093f4SDimitry Andric     // Workaround a hardware bug on SI where the condition output from div_scale
2173*480093f4SDimitry Andric     // is not usable.
2174*480093f4SDimitry Andric 
2175*480093f4SDimitry Andric     Scale = MRI.createGenericVirtualRegister(S1);
2176*480093f4SDimitry Andric 
2177*480093f4SDimitry Andric     LLT S32 = LLT::scalar(32);
2178*480093f4SDimitry Andric 
2179*480093f4SDimitry Andric     auto NumUnmerge = B.buildUnmerge(S32, LHS);
2180*480093f4SDimitry Andric     auto DenUnmerge = B.buildUnmerge(S32, RHS);
2181*480093f4SDimitry Andric     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
2182*480093f4SDimitry Andric     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
2183*480093f4SDimitry Andric 
2184*480093f4SDimitry Andric     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
2185*480093f4SDimitry Andric                               Scale1Unmerge.getReg(1));
2186*480093f4SDimitry Andric     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
2187*480093f4SDimitry Andric                               Scale0Unmerge.getReg(1));
2188*480093f4SDimitry Andric     B.buildXor(Scale, CmpNum, CmpDen);
2189*480093f4SDimitry Andric   } else {
2190*480093f4SDimitry Andric     Scale = DivScale1.getReg(1);
2191*480093f4SDimitry Andric   }
2192*480093f4SDimitry Andric 
2193*480093f4SDimitry Andric   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
2194*480093f4SDimitry Andric     .addUse(Fma4.getReg(0))
2195*480093f4SDimitry Andric     .addUse(Fma3.getReg(0))
2196*480093f4SDimitry Andric     .addUse(Mul.getReg(0))
2197*480093f4SDimitry Andric     .addUse(Scale)
2198*480093f4SDimitry Andric     .setMIFlags(Flags);
2199*480093f4SDimitry Andric 
2200*480093f4SDimitry Andric   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
2201*480093f4SDimitry Andric     .addUse(Fmas.getReg(0))
2202*480093f4SDimitry Andric     .addUse(RHS)
2203*480093f4SDimitry Andric     .addUse(LHS)
2204*480093f4SDimitry Andric     .setMIFlags(Flags);
2205*480093f4SDimitry Andric 
2206*480093f4SDimitry Andric   MI.eraseFromParent();
2207*480093f4SDimitry Andric   return true;
2208*480093f4SDimitry Andric }
2209*480093f4SDimitry Andric 
22108bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
22118bcb0991SDimitry Andric                                                  MachineRegisterInfo &MRI,
22128bcb0991SDimitry Andric                                                  MachineIRBuilder &B) const {
22138bcb0991SDimitry Andric   B.setInstr(MI);
22148bcb0991SDimitry Andric   Register Res = MI.getOperand(0).getReg();
22158bcb0991SDimitry Andric   Register LHS = MI.getOperand(2).getReg();
22168bcb0991SDimitry Andric   Register RHS = MI.getOperand(3).getReg();
22178bcb0991SDimitry Andric   uint16_t Flags = MI.getFlags();
22188bcb0991SDimitry Andric 
22198bcb0991SDimitry Andric   LLT S32 = LLT::scalar(32);
22208bcb0991SDimitry Andric   LLT S1 = LLT::scalar(1);
22218bcb0991SDimitry Andric 
22228bcb0991SDimitry Andric   auto Abs = B.buildFAbs(S32, RHS, Flags);
22238bcb0991SDimitry Andric   const APFloat C0Val(1.0f);
22248bcb0991SDimitry Andric 
22258bcb0991SDimitry Andric   auto C0 = B.buildConstant(S32, 0x6f800000);
22268bcb0991SDimitry Andric   auto C1 = B.buildConstant(S32, 0x2f800000);
22278bcb0991SDimitry Andric   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
22288bcb0991SDimitry Andric 
22298bcb0991SDimitry Andric   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
22308bcb0991SDimitry Andric   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
22318bcb0991SDimitry Andric 
22328bcb0991SDimitry Andric   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
22338bcb0991SDimitry Andric 
22348bcb0991SDimitry Andric   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
22358bcb0991SDimitry Andric     .addUse(Mul0.getReg(0))
22368bcb0991SDimitry Andric     .setMIFlags(Flags);
22378bcb0991SDimitry Andric 
22388bcb0991SDimitry Andric   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
22398bcb0991SDimitry Andric 
22408bcb0991SDimitry Andric   B.buildFMul(Res, Sel, Mul1, Flags);
22418bcb0991SDimitry Andric 
22428bcb0991SDimitry Andric   MI.eraseFromParent();
22438bcb0991SDimitry Andric   return true;
22448bcb0991SDimitry Andric }
22458bcb0991SDimitry Andric 
22460b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
22470b57cec5SDimitry Andric                                                  MachineRegisterInfo &MRI,
22480b57cec5SDimitry Andric                                                  MachineIRBuilder &B) const {
22490b57cec5SDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
22500b57cec5SDimitry Andric   if (!MFI->isEntryFunction()) {
22510b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
22520b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
22530b57cec5SDimitry Andric   }
22540b57cec5SDimitry Andric 
22550b57cec5SDimitry Andric   B.setInstr(MI);
22560b57cec5SDimitry Andric 
22570b57cec5SDimitry Andric   uint64_t Offset =
22580b57cec5SDimitry Andric     ST.getTargetLowering()->getImplicitParameterOffset(
22590b57cec5SDimitry Andric       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
22600b57cec5SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
22610b57cec5SDimitry Andric   LLT DstTy = MRI.getType(DstReg);
22620b57cec5SDimitry Andric   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
22630b57cec5SDimitry Andric 
22640b57cec5SDimitry Andric   const ArgDescriptor *Arg;
22650b57cec5SDimitry Andric   const TargetRegisterClass *RC;
22660b57cec5SDimitry Andric   std::tie(Arg, RC)
22670b57cec5SDimitry Andric     = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
22680b57cec5SDimitry Andric   if (!Arg)
22690b57cec5SDimitry Andric     return false;
22700b57cec5SDimitry Andric 
22710b57cec5SDimitry Andric   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
22720b57cec5SDimitry Andric   if (!loadInputValue(KernargPtrReg, B, Arg))
22730b57cec5SDimitry Andric     return false;
22740b57cec5SDimitry Andric 
2275*480093f4SDimitry Andric   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
22760b57cec5SDimitry Andric   MI.eraseFromParent();
22770b57cec5SDimitry Andric   return true;
22780b57cec5SDimitry Andric }
22790b57cec5SDimitry Andric 
22808bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
22818bcb0991SDimitry Andric                                               MachineRegisterInfo &MRI,
22828bcb0991SDimitry Andric                                               MachineIRBuilder &B,
22838bcb0991SDimitry Andric                                               unsigned AddrSpace) const {
22848bcb0991SDimitry Andric   B.setInstr(MI);
22858bcb0991SDimitry Andric   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
22868bcb0991SDimitry Andric   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
22878bcb0991SDimitry Andric   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
22888bcb0991SDimitry Andric   MI.eraseFromParent();
22898bcb0991SDimitry Andric   return true;
22908bcb0991SDimitry Andric }
22918bcb0991SDimitry Andric 
22928bcb0991SDimitry Andric /// Handle register layout difference for f16 images for some subtargets.
22938bcb0991SDimitry Andric Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
22948bcb0991SDimitry Andric                                              MachineRegisterInfo &MRI,
22958bcb0991SDimitry Andric                                              Register Reg) const {
22968bcb0991SDimitry Andric   if (!ST.hasUnpackedD16VMem())
22978bcb0991SDimitry Andric     return Reg;
22988bcb0991SDimitry Andric 
22998bcb0991SDimitry Andric   const LLT S16 = LLT::scalar(16);
23008bcb0991SDimitry Andric   const LLT S32 = LLT::scalar(32);
23018bcb0991SDimitry Andric   LLT StoreVT = MRI.getType(Reg);
23028bcb0991SDimitry Andric   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
23038bcb0991SDimitry Andric 
23048bcb0991SDimitry Andric   auto Unmerge = B.buildUnmerge(S16, Reg);
23058bcb0991SDimitry Andric 
23068bcb0991SDimitry Andric   SmallVector<Register, 4> WideRegs;
23078bcb0991SDimitry Andric   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
23088bcb0991SDimitry Andric     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
23098bcb0991SDimitry Andric 
23108bcb0991SDimitry Andric   int NumElts = StoreVT.getNumElements();
23118bcb0991SDimitry Andric 
23128bcb0991SDimitry Andric   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
23138bcb0991SDimitry Andric }
23148bcb0991SDimitry Andric 
23158bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
23168bcb0991SDimitry Andric                                                  MachineRegisterInfo &MRI,
23178bcb0991SDimitry Andric                                                  MachineIRBuilder &B,
23188bcb0991SDimitry Andric                                                  bool IsFormat) const {
23198bcb0991SDimitry Andric   // TODO: Reject f16 format on targets where unsupported.
23208bcb0991SDimitry Andric   Register VData = MI.getOperand(1).getReg();
23218bcb0991SDimitry Andric   LLT Ty = MRI.getType(VData);
23228bcb0991SDimitry Andric 
23238bcb0991SDimitry Andric   B.setInstr(MI);
23248bcb0991SDimitry Andric 
23258bcb0991SDimitry Andric   const LLT S32 = LLT::scalar(32);
23268bcb0991SDimitry Andric   const LLT S16 = LLT::scalar(16);
23278bcb0991SDimitry Andric 
23288bcb0991SDimitry Andric   // Fixup illegal register types for i8 stores.
23298bcb0991SDimitry Andric   if (Ty == LLT::scalar(8) || Ty == S16) {
23308bcb0991SDimitry Andric     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
23318bcb0991SDimitry Andric     MI.getOperand(1).setReg(AnyExt);
23328bcb0991SDimitry Andric     return true;
23338bcb0991SDimitry Andric   }
23348bcb0991SDimitry Andric 
23358bcb0991SDimitry Andric   if (Ty.isVector()) {
23368bcb0991SDimitry Andric     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
23378bcb0991SDimitry Andric       if (IsFormat)
23388bcb0991SDimitry Andric         MI.getOperand(1).setReg(handleD16VData(B, MRI, VData));
23398bcb0991SDimitry Andric       return true;
23408bcb0991SDimitry Andric     }
23418bcb0991SDimitry Andric 
23428bcb0991SDimitry Andric     return Ty.getElementType() == S32 && Ty.getNumElements() <= 4;
23438bcb0991SDimitry Andric   }
23448bcb0991SDimitry Andric 
23458bcb0991SDimitry Andric   return Ty == S32;
23468bcb0991SDimitry Andric }
23478bcb0991SDimitry Andric 
23480b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
23490b57cec5SDimitry Andric                                             MachineRegisterInfo &MRI,
23500b57cec5SDimitry Andric                                             MachineIRBuilder &B) const {
23510b57cec5SDimitry Andric   // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
2352*480093f4SDimitry Andric   auto IntrID = MI.getIntrinsicID();
2353*480093f4SDimitry Andric   switch (IntrID) {
2354*480093f4SDimitry Andric   case Intrinsic::amdgcn_if:
2355*480093f4SDimitry Andric   case Intrinsic::amdgcn_else: {
2356*480093f4SDimitry Andric     MachineInstr *Br = nullptr;
2357*480093f4SDimitry Andric     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
23580b57cec5SDimitry Andric       const SIRegisterInfo *TRI
23590b57cec5SDimitry Andric         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
23600b57cec5SDimitry Andric 
23610b57cec5SDimitry Andric       B.setInstr(*BrCond);
23620b57cec5SDimitry Andric       Register Def = MI.getOperand(1).getReg();
23630b57cec5SDimitry Andric       Register Use = MI.getOperand(3).getReg();
2364*480093f4SDimitry Andric 
2365*480093f4SDimitry Andric       MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
2366*480093f4SDimitry Andric       if (Br)
2367*480093f4SDimitry Andric         BrTarget = Br->getOperand(0).getMBB();
2368*480093f4SDimitry Andric 
2369*480093f4SDimitry Andric       if (IntrID == Intrinsic::amdgcn_if) {
23700b57cec5SDimitry Andric         B.buildInstr(AMDGPU::SI_IF)
23710b57cec5SDimitry Andric           .addDef(Def)
23720b57cec5SDimitry Andric           .addUse(Use)
2373*480093f4SDimitry Andric           .addMBB(BrTarget);
2374*480093f4SDimitry Andric       } else {
2375*480093f4SDimitry Andric         B.buildInstr(AMDGPU::SI_ELSE)
2376*480093f4SDimitry Andric           .addDef(Def)
2377*480093f4SDimitry Andric           .addUse(Use)
2378*480093f4SDimitry Andric           .addMBB(BrTarget)
2379*480093f4SDimitry Andric           .addImm(0);
2380*480093f4SDimitry Andric       }
2381*480093f4SDimitry Andric 
2382*480093f4SDimitry Andric       if (Br)
2383*480093f4SDimitry Andric         Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());
23840b57cec5SDimitry Andric 
23850b57cec5SDimitry Andric       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
23860b57cec5SDimitry Andric       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
23870b57cec5SDimitry Andric       MI.eraseFromParent();
23880b57cec5SDimitry Andric       BrCond->eraseFromParent();
23890b57cec5SDimitry Andric       return true;
23900b57cec5SDimitry Andric     }
23910b57cec5SDimitry Andric 
23920b57cec5SDimitry Andric     return false;
23930b57cec5SDimitry Andric   }
23940b57cec5SDimitry Andric   case Intrinsic::amdgcn_loop: {
2395*480093f4SDimitry Andric     MachineInstr *Br = nullptr;
2396*480093f4SDimitry Andric     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
23970b57cec5SDimitry Andric       const SIRegisterInfo *TRI
23980b57cec5SDimitry Andric         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
23990b57cec5SDimitry Andric 
24000b57cec5SDimitry Andric       B.setInstr(*BrCond);
2401*480093f4SDimitry Andric 
2402*480093f4SDimitry Andric       // FIXME: Need to adjust branch targets based on unconditional branch.
24030b57cec5SDimitry Andric       Register Reg = MI.getOperand(2).getReg();
24040b57cec5SDimitry Andric       B.buildInstr(AMDGPU::SI_LOOP)
24050b57cec5SDimitry Andric         .addUse(Reg)
24060b57cec5SDimitry Andric         .addMBB(BrCond->getOperand(1).getMBB());
24070b57cec5SDimitry Andric       MI.eraseFromParent();
24080b57cec5SDimitry Andric       BrCond->eraseFromParent();
24090b57cec5SDimitry Andric       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
24100b57cec5SDimitry Andric       return true;
24110b57cec5SDimitry Andric     }
24120b57cec5SDimitry Andric 
24130b57cec5SDimitry Andric     return false;
24140b57cec5SDimitry Andric   }
24150b57cec5SDimitry Andric   case Intrinsic::amdgcn_kernarg_segment_ptr:
24160b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(
24170b57cec5SDimitry Andric       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
24180b57cec5SDimitry Andric   case Intrinsic::amdgcn_implicitarg_ptr:
24190b57cec5SDimitry Andric     return legalizeImplicitArgPtr(MI, MRI, B);
24200b57cec5SDimitry Andric   case Intrinsic::amdgcn_workitem_id_x:
24210b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
24220b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
24230b57cec5SDimitry Andric   case Intrinsic::amdgcn_workitem_id_y:
24240b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
24250b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
24260b57cec5SDimitry Andric   case Intrinsic::amdgcn_workitem_id_z:
24270b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
24280b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
24290b57cec5SDimitry Andric   case Intrinsic::amdgcn_workgroup_id_x:
24300b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
24310b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
24320b57cec5SDimitry Andric   case Intrinsic::amdgcn_workgroup_id_y:
24330b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
24340b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
24350b57cec5SDimitry Andric   case Intrinsic::amdgcn_workgroup_id_z:
24360b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
24370b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
24380b57cec5SDimitry Andric   case Intrinsic::amdgcn_dispatch_ptr:
24390b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
24400b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
24410b57cec5SDimitry Andric   case Intrinsic::amdgcn_queue_ptr:
24420b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
24430b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
24440b57cec5SDimitry Andric   case Intrinsic::amdgcn_implicit_buffer_ptr:
24450b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(
24460b57cec5SDimitry Andric       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
24470b57cec5SDimitry Andric   case Intrinsic::amdgcn_dispatch_id:
24480b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
24490b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
24508bcb0991SDimitry Andric   case Intrinsic::amdgcn_fdiv_fast:
24518bcb0991SDimitry Andric     return legalizeFDIVFastIntrin(MI, MRI, B);
24528bcb0991SDimitry Andric   case Intrinsic::amdgcn_is_shared:
24538bcb0991SDimitry Andric     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
24548bcb0991SDimitry Andric   case Intrinsic::amdgcn_is_private:
24558bcb0991SDimitry Andric     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
24568bcb0991SDimitry Andric   case Intrinsic::amdgcn_wavefrontsize: {
24578bcb0991SDimitry Andric     B.setInstr(MI);
24588bcb0991SDimitry Andric     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
24598bcb0991SDimitry Andric     MI.eraseFromParent();
24608bcb0991SDimitry Andric     return true;
24618bcb0991SDimitry Andric   }
24628bcb0991SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_store:
24638bcb0991SDimitry Andric     return legalizeRawBufferStore(MI, MRI, B, false);
24648bcb0991SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_store_format:
24658bcb0991SDimitry Andric     return legalizeRawBufferStore(MI, MRI, B, true);
24660b57cec5SDimitry Andric   default:
24670b57cec5SDimitry Andric     return true;
24680b57cec5SDimitry Andric   }
24690b57cec5SDimitry Andric 
24700b57cec5SDimitry Andric   return true;
24710b57cec5SDimitry Andric }
2472