xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (revision 04eeddc0aa8e0a417a16eaf9d7d095207f4a8623)
10b57cec5SDimitry Andric //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
20b57cec5SDimitry Andric //
30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
60b57cec5SDimitry Andric //
70b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
80b57cec5SDimitry Andric /// \file
90b57cec5SDimitry Andric /// This file implements the targeting of the Machinelegalizer class for
100b57cec5SDimitry Andric /// AMDGPU.
110b57cec5SDimitry Andric /// \todo This should be generated by TableGen.
120b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
130b57cec5SDimitry Andric 
145ffd83dbSDimitry Andric #include "AMDGPULegalizerInfo.h"
158bcb0991SDimitry Andric 
160b57cec5SDimitry Andric #include "AMDGPU.h"
175ffd83dbSDimitry Andric #include "AMDGPUGlobalISelUtils.h"
18e8d8bef9SDimitry Andric #include "AMDGPUInstrInfo.h"
190b57cec5SDimitry Andric #include "AMDGPUTargetMachine.h"
200b57cec5SDimitry Andric #include "SIMachineFunctionInfo.h"
21fe6060f1SDimitry Andric #include "Utils/AMDGPUBaseInfo.h"
225ffd83dbSDimitry Andric #include "llvm/ADT/ScopeExit.h"
23fe6060f1SDimitry Andric #include "llvm/BinaryFormat/ELF.h"
240b57cec5SDimitry Andric #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
255ffd83dbSDimitry Andric #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26e8d8bef9SDimitry Andric #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
278bcb0991SDimitry Andric #include "llvm/IR/DiagnosticInfo.h"
28e8d8bef9SDimitry Andric #include "llvm/IR/IntrinsicsAMDGPU.h"
290b57cec5SDimitry Andric 
300b57cec5SDimitry Andric #define DEBUG_TYPE "amdgpu-legalinfo"
310b57cec5SDimitry Andric 
320b57cec5SDimitry Andric using namespace llvm;
330b57cec5SDimitry Andric using namespace LegalizeActions;
340b57cec5SDimitry Andric using namespace LegalizeMutations;
350b57cec5SDimitry Andric using namespace LegalityPredicates;
365ffd83dbSDimitry Andric using namespace MIPatternMatch;
370b57cec5SDimitry Andric 
385ffd83dbSDimitry Andric // Hack until load/store selection patterns support any tuple of legal types.
395ffd83dbSDimitry Andric static cl::opt<bool> EnableNewLegality(
405ffd83dbSDimitry Andric   "amdgpu-global-isel-new-legality",
415ffd83dbSDimitry Andric   cl::desc("Use GlobalISel desired legality, rather than try to use"
425ffd83dbSDimitry Andric            "rules compatible with selection patterns"),
435ffd83dbSDimitry Andric   cl::init(false),
445ffd83dbSDimitry Andric   cl::ReallyHidden);
450b57cec5SDimitry Andric 
// Widest value, in bits, the legalizer will keep in registers (matches the
// largest vector types declared below, e.g. <32 x s32> / <16 x s64>).
static constexpr unsigned MaxRegisterSize = 1024;
475ffd83dbSDimitry Andric 
485ffd83dbSDimitry Andric // Round the number of elements to the next power of two elements
495ffd83dbSDimitry Andric static LLT getPow2VectorType(LLT Ty) {
505ffd83dbSDimitry Andric   unsigned NElts = Ty.getNumElements();
515ffd83dbSDimitry Andric   unsigned Pow2NElts = 1 <<  Log2_32_Ceil(NElts);
52fe6060f1SDimitry Andric   return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
530b57cec5SDimitry Andric }
540b57cec5SDimitry Andric 
555ffd83dbSDimitry Andric // Round the number of bits to the next power of two bits
565ffd83dbSDimitry Andric static LLT getPow2ScalarType(LLT Ty) {
575ffd83dbSDimitry Andric   unsigned Bits = Ty.getSizeInBits();
585ffd83dbSDimitry Andric   unsigned Pow2Bits = 1 <<  Log2_32_Ceil(Bits);
595ffd83dbSDimitry Andric   return LLT::scalar(Pow2Bits);
608bcb0991SDimitry Andric }
618bcb0991SDimitry Andric 
62349cc55cSDimitry Andric /// \returns true if this is an odd sized vector which should widen by adding an
63e8d8bef9SDimitry Andric /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
64e8d8bef9SDimitry Andric /// excludes s1 vectors, which should always be scalarized.
650b57cec5SDimitry Andric static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
660b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
670b57cec5SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
68e8d8bef9SDimitry Andric     if (!Ty.isVector())
69e8d8bef9SDimitry Andric       return false;
70e8d8bef9SDimitry Andric 
71e8d8bef9SDimitry Andric     const LLT EltTy = Ty.getElementType();
72e8d8bef9SDimitry Andric     const unsigned EltSize = EltTy.getSizeInBits();
73e8d8bef9SDimitry Andric     return Ty.getNumElements() % 2 != 0 &&
74e8d8bef9SDimitry Andric            EltSize > 1 && EltSize < 32 &&
758bcb0991SDimitry Andric            Ty.getSizeInBits() % 32 != 0;
768bcb0991SDimitry Andric   };
778bcb0991SDimitry Andric }
788bcb0991SDimitry Andric 
79e8d8bef9SDimitry Andric static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80e8d8bef9SDimitry Andric   return [=](const LegalityQuery &Query) {
81e8d8bef9SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
82e8d8bef9SDimitry Andric     return Ty.getSizeInBits() % 32 == 0;
83e8d8bef9SDimitry Andric   };
84e8d8bef9SDimitry Andric }
85e8d8bef9SDimitry Andric 
868bcb0991SDimitry Andric static LegalityPredicate isWideVec16(unsigned TypeIdx) {
878bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
888bcb0991SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
898bcb0991SDimitry Andric     const LLT EltTy = Ty.getScalarType();
908bcb0991SDimitry Andric     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
910b57cec5SDimitry Andric   };
920b57cec5SDimitry Andric }
930b57cec5SDimitry Andric 
940b57cec5SDimitry Andric static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
950b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
960b57cec5SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
970b57cec5SDimitry Andric     const LLT EltTy = Ty.getElementType();
98fe6060f1SDimitry Andric     return std::make_pair(TypeIdx,
99fe6060f1SDimitry Andric                           LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
1000b57cec5SDimitry Andric   };
1010b57cec5SDimitry Andric }
1020b57cec5SDimitry Andric 
1030b57cec5SDimitry Andric static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
1040b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
1050b57cec5SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
1060b57cec5SDimitry Andric     const LLT EltTy = Ty.getElementType();
1070b57cec5SDimitry Andric     unsigned Size = Ty.getSizeInBits();
1080b57cec5SDimitry Andric     unsigned Pieces = (Size + 63) / 64;
1090b57cec5SDimitry Andric     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
110fe6060f1SDimitry Andric     return std::make_pair(
111fe6060f1SDimitry Andric         TypeIdx,
112fe6060f1SDimitry Andric         LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
1130b57cec5SDimitry Andric   };
1140b57cec5SDimitry Andric }
1150b57cec5SDimitry Andric 
1168bcb0991SDimitry Andric // Increase the number of vector elements to reach the next multiple of 32-bit
1178bcb0991SDimitry Andric // type.
1188bcb0991SDimitry Andric static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
1198bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
1208bcb0991SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
1218bcb0991SDimitry Andric 
1228bcb0991SDimitry Andric     const LLT EltTy = Ty.getElementType();
1238bcb0991SDimitry Andric     const int Size = Ty.getSizeInBits();
1248bcb0991SDimitry Andric     const int EltSize = EltTy.getSizeInBits();
1258bcb0991SDimitry Andric     const int NextMul32 = (Size + 31) / 32;
1268bcb0991SDimitry Andric 
1278bcb0991SDimitry Andric     assert(EltSize < 32);
1288bcb0991SDimitry Andric 
1298bcb0991SDimitry Andric     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
130fe6060f1SDimitry Andric     return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
1318bcb0991SDimitry Andric   };
1328bcb0991SDimitry Andric }
1338bcb0991SDimitry Andric 
134e8d8bef9SDimitry Andric static LLT getBitcastRegisterType(const LLT Ty) {
135e8d8bef9SDimitry Andric   const unsigned Size = Ty.getSizeInBits();
1365ffd83dbSDimitry Andric 
1375ffd83dbSDimitry Andric   LLT CoercedTy;
1385ffd83dbSDimitry Andric   if (Size <= 32) {
1395ffd83dbSDimitry Andric     // <2 x s8> -> s16
1405ffd83dbSDimitry Andric     // <4 x s8> -> s32
141e8d8bef9SDimitry Andric     return LLT::scalar(Size);
142e8d8bef9SDimitry Andric   }
1435ffd83dbSDimitry Andric 
144fe6060f1SDimitry Andric   return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
145e8d8bef9SDimitry Andric }
146e8d8bef9SDimitry Andric 
147e8d8bef9SDimitry Andric static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
148e8d8bef9SDimitry Andric   return [=](const LegalityQuery &Query) {
149e8d8bef9SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
150e8d8bef9SDimitry Andric     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
151e8d8bef9SDimitry Andric   };
152e8d8bef9SDimitry Andric }
153e8d8bef9SDimitry Andric 
154e8d8bef9SDimitry Andric static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
155e8d8bef9SDimitry Andric   return [=](const LegalityQuery &Query) {
156e8d8bef9SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
157e8d8bef9SDimitry Andric     unsigned Size = Ty.getSizeInBits();
158e8d8bef9SDimitry Andric     assert(Size % 32 == 0);
159fe6060f1SDimitry Andric     return std::make_pair(
160fe6060f1SDimitry Andric         TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
1615ffd83dbSDimitry Andric   };
1625ffd83dbSDimitry Andric }
1635ffd83dbSDimitry Andric 
1648bcb0991SDimitry Andric static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
1658bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
1668bcb0991SDimitry Andric     const LLT QueryTy = Query.Types[TypeIdx];
1678bcb0991SDimitry Andric     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
1688bcb0991SDimitry Andric   };
1698bcb0991SDimitry Andric }
1708bcb0991SDimitry Andric 
1710b57cec5SDimitry Andric static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
1720b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
1730b57cec5SDimitry Andric     const LLT QueryTy = Query.Types[TypeIdx];
1740b57cec5SDimitry Andric     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
1750b57cec5SDimitry Andric   };
1760b57cec5SDimitry Andric }
1770b57cec5SDimitry Andric 
1780b57cec5SDimitry Andric static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
1790b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
1800b57cec5SDimitry Andric     const LLT QueryTy = Query.Types[TypeIdx];
1810b57cec5SDimitry Andric     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
1820b57cec5SDimitry Andric   };
1830b57cec5SDimitry Andric }
1840b57cec5SDimitry Andric 
1855ffd83dbSDimitry Andric static bool isRegisterSize(unsigned Size) {
1865ffd83dbSDimitry Andric   return Size % 32 == 0 && Size <= MaxRegisterSize;
1875ffd83dbSDimitry Andric }
1885ffd83dbSDimitry Andric 
1895ffd83dbSDimitry Andric static bool isRegisterVectorElementType(LLT EltTy) {
1905ffd83dbSDimitry Andric   const int EltSize = EltTy.getSizeInBits();
1915ffd83dbSDimitry Andric   return EltSize == 16 || EltSize % 32 == 0;
1925ffd83dbSDimitry Andric }
1935ffd83dbSDimitry Andric 
1945ffd83dbSDimitry Andric static bool isRegisterVectorType(LLT Ty) {
1950b57cec5SDimitry Andric   const int EltSize = Ty.getElementType().getSizeInBits();
1960b57cec5SDimitry Andric   return EltSize == 32 || EltSize == 64 ||
1970b57cec5SDimitry Andric          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
1980b57cec5SDimitry Andric          EltSize == 128 || EltSize == 256;
1990b57cec5SDimitry Andric }
2000b57cec5SDimitry Andric 
2015ffd83dbSDimitry Andric static bool isRegisterType(LLT Ty) {
2025ffd83dbSDimitry Andric   if (!isRegisterSize(Ty.getSizeInBits()))
2035ffd83dbSDimitry Andric     return false;
2045ffd83dbSDimitry Andric 
2055ffd83dbSDimitry Andric   if (Ty.isVector())
2065ffd83dbSDimitry Andric     return isRegisterVectorType(Ty);
2075ffd83dbSDimitry Andric 
2085ffd83dbSDimitry Andric   return true;
2095ffd83dbSDimitry Andric }
2105ffd83dbSDimitry Andric 
2115ffd83dbSDimitry Andric // Any combination of 32 or 64-bit elements up the maximum register size, and
2125ffd83dbSDimitry Andric // multiples of v2s16.
2135ffd83dbSDimitry Andric static LegalityPredicate isRegisterType(unsigned TypeIdx) {
2145ffd83dbSDimitry Andric   return [=](const LegalityQuery &Query) {
2155ffd83dbSDimitry Andric     return isRegisterType(Query.Types[TypeIdx]);
2168bcb0991SDimitry Andric   };
2178bcb0991SDimitry Andric }
2188bcb0991SDimitry Andric 
2195ffd83dbSDimitry Andric static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
2208bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
2215ffd83dbSDimitry Andric     const LLT QueryTy = Query.Types[TypeIdx];
2225ffd83dbSDimitry Andric     if (!QueryTy.isVector())
2235ffd83dbSDimitry Andric       return false;
2245ffd83dbSDimitry Andric     const LLT EltTy = QueryTy.getElementType();
2255ffd83dbSDimitry Andric     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
2268bcb0991SDimitry Andric   };
2278bcb0991SDimitry Andric }
2288bcb0991SDimitry Andric 
229fe6060f1SDimitry Andric // If we have a truncating store or an extending load with a data size larger
230fe6060f1SDimitry Andric // than 32-bits, we need to reduce to a 32-bit type.
231fe6060f1SDimitry Andric static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
2328bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
2338bcb0991SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
2348bcb0991SDimitry Andric     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
235fe6060f1SDimitry Andric            Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
2360b57cec5SDimitry Andric   };
2370b57cec5SDimitry Andric }
2380b57cec5SDimitry Andric 
// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
//
// Returns the maximum memory access size, in bits, considered legal for a
// load (\p IsLoad true) or store in address space \p AS.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return ST.enableFlatScratch() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    // LDS: 128-bit when the subtarget enables DS128, otherwise 64-bit.
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable for
    // global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written in a
    // kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}
2665ffd83dbSDimitry Andric 
// Return true if a load or store with register type Query.Types[0], pointer
// type Query.Types[1], and memory operand Query.MMODescrs[0] is legal as-is,
// without splitting, widening, or custom lowering.
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
  uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // Do not handle extending vector loads.
  if (Ty.isVector() && MemSize != RegSize)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough, but
  // we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  // Whitelist of memory access sizes handled directly.
  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    // 96-bit (dwordx3) accesses only exist on some subtargets.
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  // Under-aligned accesses are legal only if the target lowering accepts a
  // misaligned access of this size in this address space.
  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}
3325ffd83dbSDimitry Andric 
3335ffd83dbSDimitry Andric // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
3345ffd83dbSDimitry Andric // workaround this. Eventually it should ignore the type for loads and only care
3355ffd83dbSDimitry Andric // about the size. Return true in cases where we will workaround this for now by
3365ffd83dbSDimitry Andric // bitcasting.
3375ffd83dbSDimitry Andric static bool loadStoreBitcastWorkaround(const LLT Ty) {
3385ffd83dbSDimitry Andric   if (EnableNewLegality)
3395ffd83dbSDimitry Andric     return false;
3405ffd83dbSDimitry Andric 
3415ffd83dbSDimitry Andric   const unsigned Size = Ty.getSizeInBits();
3425ffd83dbSDimitry Andric   if (Size <= 64)
3435ffd83dbSDimitry Andric     return false;
3445ffd83dbSDimitry Andric   if (!Ty.isVector())
3455ffd83dbSDimitry Andric     return true;
346e8d8bef9SDimitry Andric 
347e8d8bef9SDimitry Andric   LLT EltTy = Ty.getElementType();
348e8d8bef9SDimitry Andric   if (EltTy.isPointer())
349e8d8bef9SDimitry Andric     return true;
350e8d8bef9SDimitry Andric 
351e8d8bef9SDimitry Andric   unsigned EltSize = EltTy.getSizeInBits();
3525ffd83dbSDimitry Andric   return EltSize != 32 && EltSize != 64;
3535ffd83dbSDimitry Andric }
3545ffd83dbSDimitry Andric 
355fe6060f1SDimitry Andric static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
3565ffd83dbSDimitry Andric   const LLT Ty = Query.Types[0];
357fe6060f1SDimitry Andric   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
3585ffd83dbSDimitry Andric          !loadStoreBitcastWorkaround(Ty);
3595ffd83dbSDimitry Andric }
3605ffd83dbSDimitry Andric 
361e8d8bef9SDimitry Andric /// Return true if a load or store of the type should be lowered with a bitcast
362e8d8bef9SDimitry Andric /// to a different type.
363e8d8bef9SDimitry Andric static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
364fe6060f1SDimitry Andric                                        const LLT MemTy) {
365fe6060f1SDimitry Andric   const unsigned MemSizeInBits = MemTy.getSizeInBits();
366e8d8bef9SDimitry Andric   const unsigned Size = Ty.getSizeInBits();
367e8d8bef9SDimitry Andric   if (Size != MemSizeInBits)
368e8d8bef9SDimitry Andric     return Size <= 32 && Ty.isVector();
369e8d8bef9SDimitry Andric 
370e8d8bef9SDimitry Andric   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
371e8d8bef9SDimitry Andric     return true;
372fe6060f1SDimitry Andric 
373fe6060f1SDimitry Andric   // Don't try to handle bitcasting vector ext loads for now.
374fe6060f1SDimitry Andric   return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
375fe6060f1SDimitry Andric          (Size <= 32 || isRegisterSize(Size)) &&
376e8d8bef9SDimitry Andric          !isRegisterVectorElementType(Ty.getElementType());
377e8d8bef9SDimitry Andric }
378e8d8bef9SDimitry Andric 
379e8d8bef9SDimitry Andric /// Return true if we should legalize a load by widening an odd sized memory
380e8d8bef9SDimitry Andric /// access up to the alignment. Note this case when the memory access itself
381e8d8bef9SDimitry Andric /// changes, not the size of the result register.
382fe6060f1SDimitry Andric static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
383*04eeddc0SDimitry Andric                             uint64_t AlignInBits, unsigned AddrSpace,
384e8d8bef9SDimitry Andric                             unsigned Opcode) {
385fe6060f1SDimitry Andric   unsigned SizeInBits = MemoryTy.getSizeInBits();
386e8d8bef9SDimitry Andric   // We don't want to widen cases that are naturally legal.
387e8d8bef9SDimitry Andric   if (isPowerOf2_32(SizeInBits))
388e8d8bef9SDimitry Andric     return false;
389e8d8bef9SDimitry Andric 
390e8d8bef9SDimitry Andric   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
391e8d8bef9SDimitry Andric   // end up widening these for a scalar load during RegBankSelect, since there
392e8d8bef9SDimitry Andric   // aren't 96-bit scalar loads.
393e8d8bef9SDimitry Andric   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
394e8d8bef9SDimitry Andric     return false;
395e8d8bef9SDimitry Andric 
396e8d8bef9SDimitry Andric   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
397e8d8bef9SDimitry Andric     return false;
398e8d8bef9SDimitry Andric 
399e8d8bef9SDimitry Andric   // A load is known dereferenceable up to the alignment, so it's legal to widen
400e8d8bef9SDimitry Andric   // to it.
401e8d8bef9SDimitry Andric   //
402e8d8bef9SDimitry Andric   // TODO: Could check dereferenceable for less aligned cases.
403e8d8bef9SDimitry Andric   unsigned RoundedSize = NextPowerOf2(SizeInBits);
404e8d8bef9SDimitry Andric   if (AlignInBits < RoundedSize)
405e8d8bef9SDimitry Andric     return false;
406e8d8bef9SDimitry Andric 
407e8d8bef9SDimitry Andric   // Do not widen if it would introduce a slow unaligned load.
408e8d8bef9SDimitry Andric   const SITargetLowering *TLI = ST.getTargetLowering();
409e8d8bef9SDimitry Andric   bool Fast = false;
410e8d8bef9SDimitry Andric   return TLI->allowsMisalignedMemoryAccessesImpl(
411e8d8bef9SDimitry Andric              RoundedSize, AddrSpace, Align(AlignInBits / 8),
412e8d8bef9SDimitry Andric              MachineMemOperand::MOLoad, &Fast) &&
413e8d8bef9SDimitry Andric          Fast;
414e8d8bef9SDimitry Andric }
415e8d8bef9SDimitry Andric 
416e8d8bef9SDimitry Andric static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
417e8d8bef9SDimitry Andric                             unsigned Opcode) {
418e8d8bef9SDimitry Andric   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
419e8d8bef9SDimitry Andric     return false;
420e8d8bef9SDimitry Andric 
421fe6060f1SDimitry Andric   return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
422e8d8bef9SDimitry Andric                          Query.MMODescrs[0].AlignInBits,
423e8d8bef9SDimitry Andric                          Query.Types[1].getAddressSpace(), Opcode);
424e8d8bef9SDimitry Andric }
425e8d8bef9SDimitry Andric 
4260b57cec5SDimitry Andric AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
4270b57cec5SDimitry Andric                                          const GCNTargetMachine &TM)
4280b57cec5SDimitry Andric   :  ST(ST_) {
4290b57cec5SDimitry Andric   using namespace TargetOpcode;
4300b57cec5SDimitry Andric 
4310b57cec5SDimitry Andric   auto GetAddrSpacePtr = [&TM](unsigned AS) {
4320b57cec5SDimitry Andric     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
4330b57cec5SDimitry Andric   };
4340b57cec5SDimitry Andric 
4350b57cec5SDimitry Andric   const LLT S1 = LLT::scalar(1);
436e8d8bef9SDimitry Andric   const LLT S8 = LLT::scalar(8);
4370b57cec5SDimitry Andric   const LLT S16 = LLT::scalar(16);
4380b57cec5SDimitry Andric   const LLT S32 = LLT::scalar(32);
4390b57cec5SDimitry Andric   const LLT S64 = LLT::scalar(64);
4400b57cec5SDimitry Andric   const LLT S128 = LLT::scalar(128);
4410b57cec5SDimitry Andric   const LLT S256 = LLT::scalar(256);
4425ffd83dbSDimitry Andric   const LLT S512 = LLT::scalar(512);
4435ffd83dbSDimitry Andric   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
4440b57cec5SDimitry Andric 
445fe6060f1SDimitry Andric   const LLT V2S8 = LLT::fixed_vector(2, 8);
446fe6060f1SDimitry Andric   const LLT V2S16 = LLT::fixed_vector(2, 16);
447fe6060f1SDimitry Andric   const LLT V4S16 = LLT::fixed_vector(4, 16);
4480b57cec5SDimitry Andric 
449fe6060f1SDimitry Andric   const LLT V2S32 = LLT::fixed_vector(2, 32);
450fe6060f1SDimitry Andric   const LLT V3S32 = LLT::fixed_vector(3, 32);
451fe6060f1SDimitry Andric   const LLT V4S32 = LLT::fixed_vector(4, 32);
452fe6060f1SDimitry Andric   const LLT V5S32 = LLT::fixed_vector(5, 32);
453fe6060f1SDimitry Andric   const LLT V6S32 = LLT::fixed_vector(6, 32);
454fe6060f1SDimitry Andric   const LLT V7S32 = LLT::fixed_vector(7, 32);
455fe6060f1SDimitry Andric   const LLT V8S32 = LLT::fixed_vector(8, 32);
456fe6060f1SDimitry Andric   const LLT V9S32 = LLT::fixed_vector(9, 32);
457fe6060f1SDimitry Andric   const LLT V10S32 = LLT::fixed_vector(10, 32);
458fe6060f1SDimitry Andric   const LLT V11S32 = LLT::fixed_vector(11, 32);
459fe6060f1SDimitry Andric   const LLT V12S32 = LLT::fixed_vector(12, 32);
460fe6060f1SDimitry Andric   const LLT V13S32 = LLT::fixed_vector(13, 32);
461fe6060f1SDimitry Andric   const LLT V14S32 = LLT::fixed_vector(14, 32);
462fe6060f1SDimitry Andric   const LLT V15S32 = LLT::fixed_vector(15, 32);
463fe6060f1SDimitry Andric   const LLT V16S32 = LLT::fixed_vector(16, 32);
464fe6060f1SDimitry Andric   const LLT V32S32 = LLT::fixed_vector(32, 32);
4650b57cec5SDimitry Andric 
466fe6060f1SDimitry Andric   const LLT V2S64 = LLT::fixed_vector(2, 64);
467fe6060f1SDimitry Andric   const LLT V3S64 = LLT::fixed_vector(3, 64);
468fe6060f1SDimitry Andric   const LLT V4S64 = LLT::fixed_vector(4, 64);
469fe6060f1SDimitry Andric   const LLT V5S64 = LLT::fixed_vector(5, 64);
470fe6060f1SDimitry Andric   const LLT V6S64 = LLT::fixed_vector(6, 64);
471fe6060f1SDimitry Andric   const LLT V7S64 = LLT::fixed_vector(7, 64);
472fe6060f1SDimitry Andric   const LLT V8S64 = LLT::fixed_vector(8, 64);
473fe6060f1SDimitry Andric   const LLT V16S64 = LLT::fixed_vector(16, 64);
4740b57cec5SDimitry Andric 
4750b57cec5SDimitry Andric   std::initializer_list<LLT> AllS32Vectors =
4760b57cec5SDimitry Andric     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
4778bcb0991SDimitry Andric      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
4780b57cec5SDimitry Andric   std::initializer_list<LLT> AllS64Vectors =
4798bcb0991SDimitry Andric     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
4800b57cec5SDimitry Andric 
4810b57cec5SDimitry Andric   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
4820b57cec5SDimitry Andric   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
4838bcb0991SDimitry Andric   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
4840b57cec5SDimitry Andric   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
4858bcb0991SDimitry Andric   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
4860b57cec5SDimitry Andric   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
4870b57cec5SDimitry Andric   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
4880b57cec5SDimitry Andric 
4890b57cec5SDimitry Andric   const LLT CodePtr = FlatPtr;
4900b57cec5SDimitry Andric 
4910b57cec5SDimitry Andric   const std::initializer_list<LLT> AddrSpaces64 = {
4920b57cec5SDimitry Andric     GlobalPtr, ConstantPtr, FlatPtr
4930b57cec5SDimitry Andric   };
4940b57cec5SDimitry Andric 
4950b57cec5SDimitry Andric   const std::initializer_list<LLT> AddrSpaces32 = {
4968bcb0991SDimitry Andric     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
4970b57cec5SDimitry Andric   };
4980b57cec5SDimitry Andric 
4990b57cec5SDimitry Andric   const std::initializer_list<LLT> FPTypesBase = {
5000b57cec5SDimitry Andric     S32, S64
5010b57cec5SDimitry Andric   };
5020b57cec5SDimitry Andric 
5030b57cec5SDimitry Andric   const std::initializer_list<LLT> FPTypes16 = {
5040b57cec5SDimitry Andric     S32, S64, S16
5050b57cec5SDimitry Andric   };
5060b57cec5SDimitry Andric 
5070b57cec5SDimitry Andric   const std::initializer_list<LLT> FPTypesPK16 = {
5080b57cec5SDimitry Andric     S32, S64, S16, V2S16
5090b57cec5SDimitry Andric   };
5100b57cec5SDimitry Andric 
5115ffd83dbSDimitry Andric   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
5125ffd83dbSDimitry Andric 
513fe6060f1SDimitry Andric   // s1 for VCC branches, s32 for SCC branches.
514fe6060f1SDimitry Andric   getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
5150b57cec5SDimitry Andric 
5160b57cec5SDimitry Andric   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
5170b57cec5SDimitry Andric   // elements for v3s16
5180b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_PHI)
519e8d8bef9SDimitry Andric     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
5200b57cec5SDimitry Andric     .legalFor(AllS32Vectors)
5210b57cec5SDimitry Andric     .legalFor(AllS64Vectors)
5220b57cec5SDimitry Andric     .legalFor(AddrSpaces64)
5230b57cec5SDimitry Andric     .legalFor(AddrSpaces32)
524e8d8bef9SDimitry Andric     .legalIf(isPointer(0))
525e8d8bef9SDimitry Andric     .clampScalar(0, S16, S256)
5260b57cec5SDimitry Andric     .widenScalarToNextPow2(0, 32)
5270b57cec5SDimitry Andric     .clampMaxNumElements(0, S32, 16)
5280b57cec5SDimitry Andric     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
529e8d8bef9SDimitry Andric     .scalarize(0);
5300b57cec5SDimitry Andric 
531e8d8bef9SDimitry Andric   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
532e8d8bef9SDimitry Andric     // Full set of gfx9 features.
5335ffd83dbSDimitry Andric     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
5345ffd83dbSDimitry Andric       .legalFor({S32, S16, V2S16})
535349cc55cSDimitry Andric       .minScalar(0, S16)
5360eae32dcSDimitry Andric       .clampMaxNumElementsStrict(0, S16, 2)
537349cc55cSDimitry Andric       .widenScalarToNextMultipleOf(0, 32)
538349cc55cSDimitry Andric       .maxScalar(0, S32)
539349cc55cSDimitry Andric       .scalarize(0);
540e8d8bef9SDimitry Andric 
541e8d8bef9SDimitry Andric     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
542e8d8bef9SDimitry Andric       .legalFor({S32, S16, V2S16}) // Clamp modifier
543e8d8bef9SDimitry Andric       .minScalarOrElt(0, S16)
5440eae32dcSDimitry Andric       .clampMaxNumElementsStrict(0, S16, 2)
545e8d8bef9SDimitry Andric       .scalarize(0)
546e8d8bef9SDimitry Andric       .widenScalarToNextPow2(0, 32)
547e8d8bef9SDimitry Andric       .lower();
5485ffd83dbSDimitry Andric   } else if (ST.has16BitInsts()) {
5490b57cec5SDimitry Andric     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
5500b57cec5SDimitry Andric       .legalFor({S32, S16})
551349cc55cSDimitry Andric       .minScalar(0, S16)
552349cc55cSDimitry Andric       .widenScalarToNextMultipleOf(0, 32)
553349cc55cSDimitry Andric       .maxScalar(0, S32)
554349cc55cSDimitry Andric       .scalarize(0);
555e8d8bef9SDimitry Andric 
556e8d8bef9SDimitry Andric     // Technically the saturating operations require clamp bit support, but this
557e8d8bef9SDimitry Andric     // was introduced at the same time as 16-bit operations.
558e8d8bef9SDimitry Andric     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
559e8d8bef9SDimitry Andric       .legalFor({S32, S16}) // Clamp modifier
560e8d8bef9SDimitry Andric       .minScalar(0, S16)
561e8d8bef9SDimitry Andric       .scalarize(0)
562e8d8bef9SDimitry Andric       .widenScalarToNextPow2(0, 16)
563e8d8bef9SDimitry Andric       .lower();
564e8d8bef9SDimitry Andric 
565e8d8bef9SDimitry Andric     // We're just lowering this, but it helps get a better result to try to
566e8d8bef9SDimitry Andric     // coerce to the desired type first.
567e8d8bef9SDimitry Andric     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
568e8d8bef9SDimitry Andric       .minScalar(0, S16)
569e8d8bef9SDimitry Andric       .scalarize(0)
570e8d8bef9SDimitry Andric       .lower();
5710b57cec5SDimitry Andric   } else {
5720b57cec5SDimitry Andric     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
5730b57cec5SDimitry Andric       .legalFor({S32})
574349cc55cSDimitry Andric       .widenScalarToNextMultipleOf(0, 32)
5750b57cec5SDimitry Andric       .clampScalar(0, S32, S32)
5760b57cec5SDimitry Andric       .scalarize(0);
577e8d8bef9SDimitry Andric 
578e8d8bef9SDimitry Andric     if (ST.hasIntClamp()) {
579e8d8bef9SDimitry Andric       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
580e8d8bef9SDimitry Andric         .legalFor({S32}) // Clamp modifier.
581e8d8bef9SDimitry Andric         .scalarize(0)
582e8d8bef9SDimitry Andric         .minScalarOrElt(0, S32)
583e8d8bef9SDimitry Andric         .lower();
584e8d8bef9SDimitry Andric     } else {
585e8d8bef9SDimitry Andric       // Clamp bit support was added in VI, along with 16-bit operations.
586e8d8bef9SDimitry Andric       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
587e8d8bef9SDimitry Andric         .minScalar(0, S32)
588e8d8bef9SDimitry Andric         .scalarize(0)
589e8d8bef9SDimitry Andric         .lower();
5900b57cec5SDimitry Andric     }
5910b57cec5SDimitry Andric 
592e8d8bef9SDimitry Andric     // FIXME: DAG expansion gets better results. The widening uses the smaller
593e8d8bef9SDimitry Andric     // range values and goes for the min/max lowering directly.
594e8d8bef9SDimitry Andric     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
595e8d8bef9SDimitry Andric       .minScalar(0, S32)
596e8d8bef9SDimitry Andric       .scalarize(0)
597e8d8bef9SDimitry Andric       .lower();
598e8d8bef9SDimitry Andric   }
599e8d8bef9SDimitry Andric 
600fe6060f1SDimitry Andric   getActionDefinitionsBuilder(
601fe6060f1SDimitry Andric       {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
6025ffd83dbSDimitry Andric       .customFor({S32, S64})
603480093f4SDimitry Andric       .clampScalar(0, S32, S64)
604480093f4SDimitry Andric       .widenScalarToNextPow2(0, 32)
605480093f4SDimitry Andric       .scalarize(0);
606480093f4SDimitry Andric 
607e8d8bef9SDimitry Andric   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
6080b57cec5SDimitry Andric                    .legalFor({S32})
609349cc55cSDimitry Andric                    .maxScalar(0, S32);
610e8d8bef9SDimitry Andric 
611e8d8bef9SDimitry Andric   if (ST.hasVOP3PInsts()) {
612e8d8bef9SDimitry Andric     Mulh
613e8d8bef9SDimitry Andric       .clampMaxNumElements(0, S8, 2)
614e8d8bef9SDimitry Andric       .lowerFor({V2S8});
615e8d8bef9SDimitry Andric   }
616e8d8bef9SDimitry Andric 
617e8d8bef9SDimitry Andric   Mulh
618e8d8bef9SDimitry Andric     .scalarize(0)
619e8d8bef9SDimitry Andric     .lower();
6200b57cec5SDimitry Andric 
6210b57cec5SDimitry Andric   // Report legal for any types we can handle anywhere. For the cases only legal
6220b57cec5SDimitry Andric   // on the SALU, RegBankSelect will be able to re-legalize.
6230b57cec5SDimitry Andric   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
6240b57cec5SDimitry Andric     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
6250b57cec5SDimitry Andric     .clampScalar(0, S32, S64)
6260b57cec5SDimitry Andric     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
6278bcb0991SDimitry Andric     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
6280b57cec5SDimitry Andric     .widenScalarToNextPow2(0)
6290b57cec5SDimitry Andric     .scalarize(0);
6300b57cec5SDimitry Andric 
6318bcb0991SDimitry Andric   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
6320b57cec5SDimitry Andric                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
633480093f4SDimitry Andric     .legalFor({{S32, S1}, {S32, S32}})
6345ffd83dbSDimitry Andric     .minScalar(0, S32)
6355ffd83dbSDimitry Andric     // TODO: .scalarize(0)
6368bcb0991SDimitry Andric     .lower();
6370b57cec5SDimitry Andric 
6380b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_BITCAST)
6390b57cec5SDimitry Andric     // Don't worry about the size constraint.
6408bcb0991SDimitry Andric     .legalIf(all(isRegisterType(0), isRegisterType(1)))
6415ffd83dbSDimitry Andric     .lower();
6420b57cec5SDimitry Andric 
6430b57cec5SDimitry Andric 
6440b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_CONSTANT)
6458bcb0991SDimitry Andric     .legalFor({S1, S32, S64, S16, GlobalPtr,
6460b57cec5SDimitry Andric                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
647e8d8bef9SDimitry Andric     .legalIf(isPointer(0))
6480b57cec5SDimitry Andric     .clampScalar(0, S32, S64)
649e8d8bef9SDimitry Andric     .widenScalarToNextPow2(0);
6500b57cec5SDimitry Andric 
6515ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_FCONSTANT)
6525ffd83dbSDimitry Andric     .legalFor({S32, S64, S16})
6535ffd83dbSDimitry Andric     .clampScalar(0, S16, S64);
6548bcb0991SDimitry Andric 
6555ffd83dbSDimitry Andric   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
6565ffd83dbSDimitry Andric       .legalIf(isRegisterType(0))
6575ffd83dbSDimitry Andric       // s1 and s16 are special cases because they have legal operations on
6585ffd83dbSDimitry Andric       // them, but don't really occupy registers in the normal way.
6595ffd83dbSDimitry Andric       .legalFor({S1, S16})
6605ffd83dbSDimitry Andric       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
6615ffd83dbSDimitry Andric       .clampScalarOrElt(0, S32, MaxScalar)
6625ffd83dbSDimitry Andric       .widenScalarToNextPow2(0, 32)
6635ffd83dbSDimitry Andric       .clampMaxNumElements(0, S32, 16);
6645ffd83dbSDimitry Andric 
665fe6060f1SDimitry Andric   getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
6665ffd83dbSDimitry Andric 
6675ffd83dbSDimitry Andric   // If the amount is divergent, we have to do a wave reduction to get the
6685ffd83dbSDimitry Andric   // maximum value, so this is expanded during RegBankSelect.
6695ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
6705ffd83dbSDimitry Andric     .legalFor({{PrivatePtr, S32}});
6715ffd83dbSDimitry Andric 
6725ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
673e8d8bef9SDimitry Andric     .customIf(typeIsNot(0, PrivatePtr));
674e8d8bef9SDimitry Andric 
675fe6060f1SDimitry Andric   getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
6760b57cec5SDimitry Andric 
6770b57cec5SDimitry Andric   auto &FPOpActions = getActionDefinitionsBuilder(
6788bcb0991SDimitry Andric     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
6790b57cec5SDimitry Andric     .legalFor({S32, S64});
6808bcb0991SDimitry Andric   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
6818bcb0991SDimitry Andric     .customFor({S32, S64});
6828bcb0991SDimitry Andric   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
6838bcb0991SDimitry Andric     .customFor({S32, S64});
6840b57cec5SDimitry Andric 
6850b57cec5SDimitry Andric   if (ST.has16BitInsts()) {
6860b57cec5SDimitry Andric     if (ST.hasVOP3PInsts())
6870b57cec5SDimitry Andric       FPOpActions.legalFor({S16, V2S16});
6880b57cec5SDimitry Andric     else
6890b57cec5SDimitry Andric       FPOpActions.legalFor({S16});
6908bcb0991SDimitry Andric 
6918bcb0991SDimitry Andric     TrigActions.customFor({S16});
6928bcb0991SDimitry Andric     FDIVActions.customFor({S16});
6930b57cec5SDimitry Andric   }
6940b57cec5SDimitry Andric 
6950b57cec5SDimitry Andric   auto &MinNumMaxNum = getActionDefinitionsBuilder({
6960b57cec5SDimitry Andric       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
6970b57cec5SDimitry Andric 
6980b57cec5SDimitry Andric   if (ST.hasVOP3PInsts()) {
6990b57cec5SDimitry Andric     MinNumMaxNum.customFor(FPTypesPK16)
700480093f4SDimitry Andric       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
7010b57cec5SDimitry Andric       .clampMaxNumElements(0, S16, 2)
7020b57cec5SDimitry Andric       .clampScalar(0, S16, S64)
7030b57cec5SDimitry Andric       .scalarize(0);
7040b57cec5SDimitry Andric   } else if (ST.has16BitInsts()) {
7050b57cec5SDimitry Andric     MinNumMaxNum.customFor(FPTypes16)
7060b57cec5SDimitry Andric       .clampScalar(0, S16, S64)
7070b57cec5SDimitry Andric       .scalarize(0);
7080b57cec5SDimitry Andric   } else {
7090b57cec5SDimitry Andric     MinNumMaxNum.customFor(FPTypesBase)
7100b57cec5SDimitry Andric       .clampScalar(0, S32, S64)
7110b57cec5SDimitry Andric       .scalarize(0);
7120b57cec5SDimitry Andric   }
7130b57cec5SDimitry Andric 
7140b57cec5SDimitry Andric   if (ST.hasVOP3PInsts())
7150eae32dcSDimitry Andric     FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
7168bcb0991SDimitry Andric 
7170b57cec5SDimitry Andric   FPOpActions
7180b57cec5SDimitry Andric     .scalarize(0)
7190b57cec5SDimitry Andric     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
7200b57cec5SDimitry Andric 
7218bcb0991SDimitry Andric   TrigActions
7228bcb0991SDimitry Andric     .scalarize(0)
7238bcb0991SDimitry Andric     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
7248bcb0991SDimitry Andric 
7258bcb0991SDimitry Andric   FDIVActions
7268bcb0991SDimitry Andric     .scalarize(0)
7278bcb0991SDimitry Andric     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
7288bcb0991SDimitry Andric 
7298bcb0991SDimitry Andric   getActionDefinitionsBuilder({G_FNEG, G_FABS})
7308bcb0991SDimitry Andric     .legalFor(FPTypesPK16)
7310eae32dcSDimitry Andric     .clampMaxNumElementsStrict(0, S16, 2)
7328bcb0991SDimitry Andric     .scalarize(0)
7338bcb0991SDimitry Andric     .clampScalar(0, S16, S64);
7348bcb0991SDimitry Andric 
7350b57cec5SDimitry Andric   if (ST.has16BitInsts()) {
7368bcb0991SDimitry Andric     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
7370b57cec5SDimitry Andric       .legalFor({S32, S64, S16})
7380b57cec5SDimitry Andric       .scalarize(0)
7390b57cec5SDimitry Andric       .clampScalar(0, S16, S64);
7400b57cec5SDimitry Andric   } else {
7415ffd83dbSDimitry Andric     getActionDefinitionsBuilder(G_FSQRT)
7425ffd83dbSDimitry Andric       .legalFor({S32, S64})
7435ffd83dbSDimitry Andric       .scalarize(0)
7445ffd83dbSDimitry Andric       .clampScalar(0, S32, S64);
7455ffd83dbSDimitry Andric 
7465ffd83dbSDimitry Andric     if (ST.hasFractBug()) {
7475ffd83dbSDimitry Andric       getActionDefinitionsBuilder(G_FFLOOR)
7485ffd83dbSDimitry Andric         .customFor({S64})
7495ffd83dbSDimitry Andric         .legalFor({S32, S64})
7505ffd83dbSDimitry Andric         .scalarize(0)
7515ffd83dbSDimitry Andric         .clampScalar(0, S32, S64);
7525ffd83dbSDimitry Andric     } else {
7535ffd83dbSDimitry Andric       getActionDefinitionsBuilder(G_FFLOOR)
7540b57cec5SDimitry Andric         .legalFor({S32, S64})
7550b57cec5SDimitry Andric         .scalarize(0)
7560b57cec5SDimitry Andric         .clampScalar(0, S32, S64);
7570b57cec5SDimitry Andric     }
7585ffd83dbSDimitry Andric   }
7590b57cec5SDimitry Andric 
7600b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_FPTRUNC)
7610b57cec5SDimitry Andric     .legalFor({{S32, S64}, {S16, S32}})
7625ffd83dbSDimitry Andric     .scalarize(0)
7635ffd83dbSDimitry Andric     .lower();
7640b57cec5SDimitry Andric 
7650b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_FPEXT)
7660b57cec5SDimitry Andric     .legalFor({{S64, S32}, {S32, S16}})
767e8d8bef9SDimitry Andric     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
7680b57cec5SDimitry Andric     .scalarize(0);
7690b57cec5SDimitry Andric 
7700b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_FSUB)
7710b57cec5SDimitry Andric       // Use actual fsub instruction
7720b57cec5SDimitry Andric       .legalFor({S32})
7730b57cec5SDimitry Andric       // Must use fadd + fneg
7740b57cec5SDimitry Andric       .lowerFor({S64, S16, V2S16})
7750b57cec5SDimitry Andric       .scalarize(0)
7760b57cec5SDimitry Andric       .clampScalar(0, S32, S64);
7770b57cec5SDimitry Andric 
7788bcb0991SDimitry Andric   // Whether this is legal depends on the floating point mode for the function.
7798bcb0991SDimitry Andric   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
7805ffd83dbSDimitry Andric   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
7818bcb0991SDimitry Andric     FMad.customFor({S32, S16});
7825ffd83dbSDimitry Andric   else if (ST.hasMadMacF32Insts())
7838bcb0991SDimitry Andric     FMad.customFor({S32});
7845ffd83dbSDimitry Andric   else if (ST.hasMadF16())
7855ffd83dbSDimitry Andric     FMad.customFor({S16});
7868bcb0991SDimitry Andric   FMad.scalarize(0)
7878bcb0991SDimitry Andric       .lower();
7888bcb0991SDimitry Andric 
789e8d8bef9SDimitry Andric   auto &FRem = getActionDefinitionsBuilder(G_FREM);
790e8d8bef9SDimitry Andric   if (ST.has16BitInsts()) {
791e8d8bef9SDimitry Andric     FRem.customFor({S16, S32, S64});
792e8d8bef9SDimitry Andric   } else {
793e8d8bef9SDimitry Andric     FRem.minScalar(0, S32)
794e8d8bef9SDimitry Andric         .customFor({S32, S64});
795e8d8bef9SDimitry Andric   }
796e8d8bef9SDimitry Andric   FRem.scalarize(0);
797e8d8bef9SDimitry Andric 
7985ffd83dbSDimitry Andric   // TODO: Do we need to clamp maximum bitwidth?
7995ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_TRUNC)
8005ffd83dbSDimitry Andric     .legalIf(isScalar(0))
8015ffd83dbSDimitry Andric     .legalFor({{V2S16, V2S32}})
8025ffd83dbSDimitry Andric     .clampMaxNumElements(0, S16, 2)
8035ffd83dbSDimitry Andric     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
8045ffd83dbSDimitry Andric     // situations (like an invalid implicit use), we don't want to infinite loop
8055ffd83dbSDimitry Andric     // in the legalizer.
8065ffd83dbSDimitry Andric     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
8075ffd83dbSDimitry Andric     .alwaysLegal();
8085ffd83dbSDimitry Andric 
8090b57cec5SDimitry Andric   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
8100b57cec5SDimitry Andric     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
8115ffd83dbSDimitry Andric                {S32, S1}, {S64, S1}, {S16, S1}})
812480093f4SDimitry Andric     .scalarize(0)
8135ffd83dbSDimitry Andric     .clampScalar(0, S32, S64)
8145ffd83dbSDimitry Andric     .widenScalarToNextPow2(1, 32);
8150b57cec5SDimitry Andric 
8168bcb0991SDimitry Andric   // TODO: Split s1->s64 during regbankselect for VALU.
8178bcb0991SDimitry Andric   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
818480093f4SDimitry Andric                     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
819480093f4SDimitry Andric                     .lowerIf(typeIs(1, S1))
820349cc55cSDimitry Andric                     .customFor({{S32, S64}, {S64, S64}});
8218bcb0991SDimitry Andric   if (ST.has16BitInsts())
8228bcb0991SDimitry Andric     IToFP.legalFor({{S16, S16}});
8238bcb0991SDimitry Andric   IToFP.clampScalar(1, S32, S64)
824e8d8bef9SDimitry Andric        .minScalar(0, S32)
8255ffd83dbSDimitry Andric        .scalarize(0)
8265ffd83dbSDimitry Andric        .widenScalarToNextPow2(1);
8270b57cec5SDimitry Andric 
8288bcb0991SDimitry Andric   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
8295ffd83dbSDimitry Andric     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
830fe6060f1SDimitry Andric     .customFor({{S64, S32}, {S64, S64}})
831e8d8bef9SDimitry Andric     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
8328bcb0991SDimitry Andric   if (ST.has16BitInsts())
8338bcb0991SDimitry Andric     FPToI.legalFor({{S16, S16}});
8348bcb0991SDimitry Andric   else
8358bcb0991SDimitry Andric     FPToI.minScalar(1, S32);
8368bcb0991SDimitry Andric 
8378bcb0991SDimitry Andric   FPToI.minScalar(0, S32)
838fe6060f1SDimitry Andric        .widenScalarToNextPow2(0, 32)
8395ffd83dbSDimitry Andric        .scalarize(0)
8405ffd83dbSDimitry Andric        .lower();
8410b57cec5SDimitry Andric 
842e8d8bef9SDimitry Andric   // Lower roundeven into G_FRINT
843e8d8bef9SDimitry Andric   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
844480093f4SDimitry Andric     .scalarize(0)
845480093f4SDimitry Andric     .lower();
8460b57cec5SDimitry Andric 
847480093f4SDimitry Andric   if (ST.has16BitInsts()) {
848480093f4SDimitry Andric     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
849480093f4SDimitry Andric       .legalFor({S16, S32, S64})
850480093f4SDimitry Andric       .clampScalar(0, S16, S64)
851480093f4SDimitry Andric       .scalarize(0);
852480093f4SDimitry Andric   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
8530b57cec5SDimitry Andric     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
8540b57cec5SDimitry Andric       .legalFor({S32, S64})
8550b57cec5SDimitry Andric       .clampScalar(0, S32, S64)
8560b57cec5SDimitry Andric       .scalarize(0);
8570b57cec5SDimitry Andric   } else {
8580b57cec5SDimitry Andric     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
8590b57cec5SDimitry Andric       .legalFor({S32})
8600b57cec5SDimitry Andric       .customFor({S64})
8610b57cec5SDimitry Andric       .clampScalar(0, S32, S64)
8620b57cec5SDimitry Andric       .scalarize(0);
8630b57cec5SDimitry Andric   }
8640b57cec5SDimitry Andric 
865480093f4SDimitry Andric   getActionDefinitionsBuilder(G_PTR_ADD)
866e8d8bef9SDimitry Andric     .legalIf(all(isPointer(0), sameSize(0, 1)))
867e8d8bef9SDimitry Andric     .scalarize(0)
868e8d8bef9SDimitry Andric     .scalarSameSizeAs(1, 0);
8690b57cec5SDimitry Andric 
8705ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_PTRMASK)
871e8d8bef9SDimitry Andric     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
872e8d8bef9SDimitry Andric     .scalarSameSizeAs(1, 0)
8735ffd83dbSDimitry Andric     .scalarize(0);
8740b57cec5SDimitry Andric 
8750b57cec5SDimitry Andric   auto &CmpBuilder =
8760b57cec5SDimitry Andric     getActionDefinitionsBuilder(G_ICMP)
877480093f4SDimitry Andric     // The compare output type differs based on the register bank of the output,
878480093f4SDimitry Andric     // so make both s1 and s32 legal.
879480093f4SDimitry Andric     //
880480093f4SDimitry Andric     // Scalar compares producing output in scc will be promoted to s32, as that
881480093f4SDimitry Andric     // is the allocatable register type that will be needed for the copy from
882480093f4SDimitry Andric     // scc. This will be promoted during RegBankSelect, and we assume something
883480093f4SDimitry Andric     // before that won't try to use s32 result types.
884480093f4SDimitry Andric     //
885480093f4SDimitry Andric     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
886480093f4SDimitry Andric     // bank.
8870b57cec5SDimitry Andric     .legalForCartesianProduct(
8880b57cec5SDimitry Andric       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
889480093f4SDimitry Andric     .legalForCartesianProduct(
890480093f4SDimitry Andric       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
8910b57cec5SDimitry Andric   if (ST.has16BitInsts()) {
8920b57cec5SDimitry Andric     CmpBuilder.legalFor({{S1, S16}});
8930b57cec5SDimitry Andric   }
8940b57cec5SDimitry Andric 
8950b57cec5SDimitry Andric   CmpBuilder
8960b57cec5SDimitry Andric     .widenScalarToNextPow2(1)
8970b57cec5SDimitry Andric     .clampScalar(1, S32, S64)
8980b57cec5SDimitry Andric     .scalarize(0)
899480093f4SDimitry Andric     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
9000b57cec5SDimitry Andric 
9010b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_FCMP)
9020b57cec5SDimitry Andric     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
9030b57cec5SDimitry Andric     .widenScalarToNextPow2(1)
9040b57cec5SDimitry Andric     .clampScalar(1, S32, S64)
9050b57cec5SDimitry Andric     .scalarize(0);
9060b57cec5SDimitry Andric 
9075ffd83dbSDimitry Andric   // FIXME: fpow has a selection pattern that should move to custom lowering.
9085ffd83dbSDimitry Andric   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
9095ffd83dbSDimitry Andric   if (ST.has16BitInsts())
9105ffd83dbSDimitry Andric     Exp2Ops.legalFor({S32, S16});
9115ffd83dbSDimitry Andric   else
9125ffd83dbSDimitry Andric     Exp2Ops.legalFor({S32});
9135ffd83dbSDimitry Andric   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
9145ffd83dbSDimitry Andric   Exp2Ops.scalarize(0);
9155ffd83dbSDimitry Andric 
9165ffd83dbSDimitry Andric   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
9175ffd83dbSDimitry Andric   if (ST.has16BitInsts())
9185ffd83dbSDimitry Andric     ExpOps.customFor({{S32}, {S16}});
9195ffd83dbSDimitry Andric   else
9205ffd83dbSDimitry Andric     ExpOps.customFor({S32});
9215ffd83dbSDimitry Andric   ExpOps.clampScalar(0, MinScalarFPTy, S32)
9220b57cec5SDimitry Andric         .scalarize(0);
9230b57cec5SDimitry Andric 
924e8d8bef9SDimitry Andric   getActionDefinitionsBuilder(G_FPOWI)
925e8d8bef9SDimitry Andric     .clampScalar(0, MinScalarFPTy, S32)
926e8d8bef9SDimitry Andric     .lower();
927e8d8bef9SDimitry Andric 
9280b57cec5SDimitry Andric   // The 64-bit versions produce 32-bit results, but only on the SALU.
9295ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_CTPOP)
9300b57cec5SDimitry Andric     .legalFor({{S32, S32}, {S32, S64}})
9310b57cec5SDimitry Andric     .clampScalar(0, S32, S32)
932*04eeddc0SDimitry Andric     .widenScalarToNextPow2(1, 32)
9330b57cec5SDimitry Andric     .clampScalar(1, S32, S64)
9340b57cec5SDimitry Andric     .scalarize(0)
935*04eeddc0SDimitry Andric     .widenScalarToNextPow2(0, 32);
936*04eeddc0SDimitry Andric 
9370b57cec5SDimitry Andric 
9385ffd83dbSDimitry Andric   // The hardware instructions return a different result on 0 than the generic
9395ffd83dbSDimitry Andric   // instructions expect. The hardware produces -1, but these produce the
9405ffd83dbSDimitry Andric   // bitwidth.
9415ffd83dbSDimitry Andric   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
9425ffd83dbSDimitry Andric     .scalarize(0)
9435ffd83dbSDimitry Andric     .clampScalar(0, S32, S32)
9445ffd83dbSDimitry Andric     .clampScalar(1, S32, S64)
9455ffd83dbSDimitry Andric     .widenScalarToNextPow2(0, 32)
9465ffd83dbSDimitry Andric     .widenScalarToNextPow2(1, 32)
947349cc55cSDimitry Andric     .custom();
9485ffd83dbSDimitry Andric 
9495ffd83dbSDimitry Andric   // The 64-bit versions produce 32-bit results, but only on the SALU.
9505ffd83dbSDimitry Andric   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
9515ffd83dbSDimitry Andric     .legalFor({{S32, S32}, {S32, S64}})
9525ffd83dbSDimitry Andric     .clampScalar(0, S32, S32)
9535ffd83dbSDimitry Andric     .clampScalar(1, S32, S64)
9545ffd83dbSDimitry Andric     .scalarize(0)
9555ffd83dbSDimitry Andric     .widenScalarToNextPow2(0, 32)
9565ffd83dbSDimitry Andric     .widenScalarToNextPow2(1, 32);
9575ffd83dbSDimitry Andric 
958fe6060f1SDimitry Andric   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
959fe6060f1SDimitry Andric   // RegBankSelect.
9605ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_BITREVERSE)
961fe6060f1SDimitry Andric     .legalFor({S32, S64})
962fe6060f1SDimitry Andric     .clampScalar(0, S32, S64)
963fe6060f1SDimitry Andric     .scalarize(0)
964fe6060f1SDimitry Andric     .widenScalarToNextPow2(0);
9650b57cec5SDimitry Andric 
9660b57cec5SDimitry Andric   if (ST.has16BitInsts()) {
9675ffd83dbSDimitry Andric     getActionDefinitionsBuilder(G_BSWAP)
9685ffd83dbSDimitry Andric       .legalFor({S16, S32, V2S16})
9690eae32dcSDimitry Andric       .clampMaxNumElementsStrict(0, S16, 2)
9705ffd83dbSDimitry Andric       // FIXME: Fixing non-power-of-2 before clamp is workaround for
9715ffd83dbSDimitry Andric       // narrowScalar limitation.
9725ffd83dbSDimitry Andric       .widenScalarToNextPow2(0)
9735ffd83dbSDimitry Andric       .clampScalar(0, S16, S32)
9745ffd83dbSDimitry Andric       .scalarize(0);
9755ffd83dbSDimitry Andric 
9760b57cec5SDimitry Andric     if (ST.hasVOP3PInsts()) {
977fe6060f1SDimitry Andric       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
9780b57cec5SDimitry Andric         .legalFor({S32, S16, V2S16})
9790b57cec5SDimitry Andric         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
9800b57cec5SDimitry Andric         .clampMaxNumElements(0, S16, 2)
9815ffd83dbSDimitry Andric         .minScalar(0, S16)
9820b57cec5SDimitry Andric         .widenScalarToNextPow2(0)
9835ffd83dbSDimitry Andric         .scalarize(0)
9845ffd83dbSDimitry Andric         .lower();
9850b57cec5SDimitry Andric     } else {
986fe6060f1SDimitry Andric       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
9870b57cec5SDimitry Andric         .legalFor({S32, S16})
9880b57cec5SDimitry Andric         .widenScalarToNextPow2(0)
9895ffd83dbSDimitry Andric         .minScalar(0, S16)
9905ffd83dbSDimitry Andric         .scalarize(0)
9915ffd83dbSDimitry Andric         .lower();
9920b57cec5SDimitry Andric     }
9930b57cec5SDimitry Andric   } else {
9945ffd83dbSDimitry Andric     // TODO: Should have same legality without v_perm_b32
9955ffd83dbSDimitry Andric     getActionDefinitionsBuilder(G_BSWAP)
9965ffd83dbSDimitry Andric       .legalFor({S32})
9975ffd83dbSDimitry Andric       .lowerIf(scalarNarrowerThan(0, 32))
9985ffd83dbSDimitry Andric       // FIXME: Fixing non-power-of-2 before clamp is workaround for
9995ffd83dbSDimitry Andric       // narrowScalar limitation.
10005ffd83dbSDimitry Andric       .widenScalarToNextPow2(0)
10015ffd83dbSDimitry Andric       .maxScalar(0, S32)
10025ffd83dbSDimitry Andric       .scalarize(0)
10035ffd83dbSDimitry Andric       .lower();
10045ffd83dbSDimitry Andric 
1005fe6060f1SDimitry Andric     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
10060b57cec5SDimitry Andric       .legalFor({S32})
10075ffd83dbSDimitry Andric       .minScalar(0, S32)
10080b57cec5SDimitry Andric       .widenScalarToNextPow2(0)
10095ffd83dbSDimitry Andric       .scalarize(0)
10105ffd83dbSDimitry Andric       .lower();
10110b57cec5SDimitry Andric   }
10120b57cec5SDimitry Andric 
10130b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_INTTOPTR)
10140b57cec5SDimitry Andric     // List the common cases
10150b57cec5SDimitry Andric     .legalForCartesianProduct(AddrSpaces64, {S64})
10160b57cec5SDimitry Andric     .legalForCartesianProduct(AddrSpaces32, {S32})
10170b57cec5SDimitry Andric     .scalarize(0)
10180b57cec5SDimitry Andric     // Accept any address space as long as the size matches
10190b57cec5SDimitry Andric     .legalIf(sameSize(0, 1))
10200b57cec5SDimitry Andric     .widenScalarIf(smallerThan(1, 0),
10210b57cec5SDimitry Andric       [](const LegalityQuery &Query) {
10220b57cec5SDimitry Andric         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
10230b57cec5SDimitry Andric       })
10245ffd83dbSDimitry Andric     .narrowScalarIf(largerThan(1, 0),
10250b57cec5SDimitry Andric       [](const LegalityQuery &Query) {
10260b57cec5SDimitry Andric         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
10270b57cec5SDimitry Andric       });
10280b57cec5SDimitry Andric 
10290b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_PTRTOINT)
10300b57cec5SDimitry Andric     // List the common cases
10310b57cec5SDimitry Andric     .legalForCartesianProduct(AddrSpaces64, {S64})
10320b57cec5SDimitry Andric     .legalForCartesianProduct(AddrSpaces32, {S32})
10330b57cec5SDimitry Andric     .scalarize(0)
10340b57cec5SDimitry Andric     // Accept any address space as long as the size matches
10350b57cec5SDimitry Andric     .legalIf(sameSize(0, 1))
10360b57cec5SDimitry Andric     .widenScalarIf(smallerThan(0, 1),
10370b57cec5SDimitry Andric       [](const LegalityQuery &Query) {
10380b57cec5SDimitry Andric         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
10390b57cec5SDimitry Andric       })
10400b57cec5SDimitry Andric     .narrowScalarIf(
10415ffd83dbSDimitry Andric       largerThan(0, 1),
10420b57cec5SDimitry Andric       [](const LegalityQuery &Query) {
10430b57cec5SDimitry Andric         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
10440b57cec5SDimitry Andric       });
10450b57cec5SDimitry Andric 
10460b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
10470b57cec5SDimitry Andric     .scalarize(0)
10480b57cec5SDimitry Andric     .custom();
10490b57cec5SDimitry Andric 
10505ffd83dbSDimitry Andric   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
10515ffd83dbSDimitry Andric                                     bool IsLoad) -> bool {
10528bcb0991SDimitry Andric     const LLT DstTy = Query.Types[0];
10538bcb0991SDimitry Andric 
10548bcb0991SDimitry Andric     // Split vector extloads.
1055fe6060f1SDimitry Andric     unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1056480093f4SDimitry Andric 
10578bcb0991SDimitry Andric     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
10588bcb0991SDimitry Andric       return true;
10598bcb0991SDimitry Andric 
10608bcb0991SDimitry Andric     const LLT PtrTy = Query.Types[1];
10618bcb0991SDimitry Andric     unsigned AS = PtrTy.getAddressSpace();
10625ffd83dbSDimitry Andric     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
10638bcb0991SDimitry Andric       return true;
10648bcb0991SDimitry Andric 
10658bcb0991SDimitry Andric     // Catch weird sized loads that don't evenly divide into the access sizes
10668bcb0991SDimitry Andric     // TODO: May be able to widen depending on alignment etc.
10675ffd83dbSDimitry Andric     unsigned NumRegs = (MemSize + 31) / 32;
10685ffd83dbSDimitry Andric     if (NumRegs == 3) {
10695ffd83dbSDimitry Andric       if (!ST.hasDwordx3LoadStores())
10708bcb0991SDimitry Andric         return true;
10715ffd83dbSDimitry Andric     } else {
10725ffd83dbSDimitry Andric       // If the alignment allows, these should have been widened.
10735ffd83dbSDimitry Andric       if (!isPowerOf2_32(NumRegs))
10745ffd83dbSDimitry Andric         return true;
10755ffd83dbSDimitry Andric     }
10768bcb0991SDimitry Andric 
10778bcb0991SDimitry Andric     return false;
10788bcb0991SDimitry Andric   };
10798bcb0991SDimitry Andric 
1080e8d8bef9SDimitry Andric   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1081e8d8bef9SDimitry Andric   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1082e8d8bef9SDimitry Andric   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
10838bcb0991SDimitry Andric 
10848bcb0991SDimitry Andric   // TODO: Refine based on subtargets which support unaligned access or 128-bit
10858bcb0991SDimitry Andric   // LDS
10868bcb0991SDimitry Andric   // TODO: Unsupported flat for SI.
10878bcb0991SDimitry Andric 
10888bcb0991SDimitry Andric   for (unsigned Op : {G_LOAD, G_STORE}) {
10898bcb0991SDimitry Andric     const bool IsStore = Op == G_STORE;
10908bcb0991SDimitry Andric 
10918bcb0991SDimitry Andric     auto &Actions = getActionDefinitionsBuilder(Op);
10925ffd83dbSDimitry Andric     // Explicitly list some common cases.
10935ffd83dbSDimitry Andric     // TODO: Does this help compile time at all?
1094fe6060f1SDimitry Andric     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1095fe6060f1SDimitry Andric                                       {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1096fe6060f1SDimitry Andric                                       {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1097fe6060f1SDimitry Andric                                       {S64, GlobalPtr, S64, GlobalAlign32},
1098fe6060f1SDimitry Andric                                       {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1099fe6060f1SDimitry Andric                                       {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1100fe6060f1SDimitry Andric                                       {S32, GlobalPtr, S8, GlobalAlign8},
1101fe6060f1SDimitry Andric                                       {S32, GlobalPtr, S16, GlobalAlign16},
11028bcb0991SDimitry Andric 
1103fe6060f1SDimitry Andric                                       {S32, LocalPtr, S32, 32},
1104fe6060f1SDimitry Andric                                       {S64, LocalPtr, S64, 32},
1105fe6060f1SDimitry Andric                                       {V2S32, LocalPtr, V2S32, 32},
1106fe6060f1SDimitry Andric                                       {S32, LocalPtr, S8, 8},
1107fe6060f1SDimitry Andric                                       {S32, LocalPtr, S16, 16},
1108fe6060f1SDimitry Andric                                       {V2S16, LocalPtr, S32, 32},
11098bcb0991SDimitry Andric 
1110fe6060f1SDimitry Andric                                       {S32, PrivatePtr, S32, 32},
1111fe6060f1SDimitry Andric                                       {S32, PrivatePtr, S8, 8},
1112fe6060f1SDimitry Andric                                       {S32, PrivatePtr, S16, 16},
1113fe6060f1SDimitry Andric                                       {V2S16, PrivatePtr, S32, 32},
11148bcb0991SDimitry Andric 
1115fe6060f1SDimitry Andric                                       {S32, ConstantPtr, S32, GlobalAlign32},
1116fe6060f1SDimitry Andric                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1117fe6060f1SDimitry Andric                                       {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1118fe6060f1SDimitry Andric                                       {S64, ConstantPtr, S64, GlobalAlign32},
1119fe6060f1SDimitry Andric                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
11205ffd83dbSDimitry Andric     Actions.legalIf(
11215ffd83dbSDimitry Andric       [=](const LegalityQuery &Query) -> bool {
1122fe6060f1SDimitry Andric         return isLoadStoreLegal(ST, Query);
11235ffd83dbSDimitry Andric       });
11245ffd83dbSDimitry Andric 
11255ffd83dbSDimitry Andric     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
11265ffd83dbSDimitry Andric     // 64-bits.
11275ffd83dbSDimitry Andric     //
11285ffd83dbSDimitry Andric     // TODO: Should generalize bitcast action into coerce, which will also cover
11295ffd83dbSDimitry Andric     // inserting addrspacecasts.
11305ffd83dbSDimitry Andric     Actions.customIf(typeIs(1, Constant32Ptr));
11315ffd83dbSDimitry Andric 
11325ffd83dbSDimitry Andric     // Turn any illegal element vectors into something easier to deal
11335ffd83dbSDimitry Andric     // with. These will ultimately produce 32-bit scalar shifts to extract the
11345ffd83dbSDimitry Andric     // parts anyway.
11355ffd83dbSDimitry Andric     //
11365ffd83dbSDimitry Andric     // For odd 16-bit element vectors, prefer to split those into pieces with
11375ffd83dbSDimitry Andric     // 16-bit vector parts.
11385ffd83dbSDimitry Andric     Actions.bitcastIf(
11395ffd83dbSDimitry Andric       [=](const LegalityQuery &Query) -> bool {
1140e8d8bef9SDimitry Andric         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1141fe6060f1SDimitry Andric                                           Query.MMODescrs[0].MemoryTy);
11425ffd83dbSDimitry Andric       }, bitcastToRegisterType(0));
11435ffd83dbSDimitry Andric 
1144e8d8bef9SDimitry Andric     if (!IsStore) {
1145e8d8bef9SDimitry Andric       // Widen suitably aligned loads by loading extra bytes. The standard
1146e8d8bef9SDimitry Andric       // legalization actions can't properly express widening memory operands.
1147e8d8bef9SDimitry Andric       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1148e8d8bef9SDimitry Andric         return shouldWidenLoad(ST, Query, G_LOAD);
1149e8d8bef9SDimitry Andric       });
1150e8d8bef9SDimitry Andric     }
1151e8d8bef9SDimitry Andric 
1152e8d8bef9SDimitry Andric     // FIXME: load/store narrowing should be moved to lower action
11538bcb0991SDimitry Andric     Actions
11548bcb0991SDimitry Andric         .narrowScalarIf(
11558bcb0991SDimitry Andric             [=](const LegalityQuery &Query) -> bool {
11565ffd83dbSDimitry Andric               return !Query.Types[0].isVector() &&
11575ffd83dbSDimitry Andric                      needToSplitMemOp(Query, Op == G_LOAD);
11588bcb0991SDimitry Andric             },
11598bcb0991SDimitry Andric             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
11608bcb0991SDimitry Andric               const LLT DstTy = Query.Types[0];
11618bcb0991SDimitry Andric               const LLT PtrTy = Query.Types[1];
11628bcb0991SDimitry Andric 
11638bcb0991SDimitry Andric               const unsigned DstSize = DstTy.getSizeInBits();
1164fe6060f1SDimitry Andric               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
11658bcb0991SDimitry Andric 
11668bcb0991SDimitry Andric               // Split extloads.
11678bcb0991SDimitry Andric               if (DstSize > MemSize)
11688bcb0991SDimitry Andric                 return std::make_pair(0, LLT::scalar(MemSize));
11698bcb0991SDimitry Andric 
11705ffd83dbSDimitry Andric               unsigned MaxSize = maxSizeForAddrSpace(ST,
11715ffd83dbSDimitry Andric                                                      PtrTy.getAddressSpace(),
11725ffd83dbSDimitry Andric                                                      Op == G_LOAD);
11738bcb0991SDimitry Andric               if (MemSize > MaxSize)
11748bcb0991SDimitry Andric                 return std::make_pair(0, LLT::scalar(MaxSize));
11758bcb0991SDimitry Andric 
1176*04eeddc0SDimitry Andric               uint64_t Align = Query.MMODescrs[0].AlignInBits;
11778bcb0991SDimitry Andric               return std::make_pair(0, LLT::scalar(Align));
11788bcb0991SDimitry Andric             })
11798bcb0991SDimitry Andric         .fewerElementsIf(
11808bcb0991SDimitry Andric             [=](const LegalityQuery &Query) -> bool {
11815ffd83dbSDimitry Andric               return Query.Types[0].isVector() &&
11825ffd83dbSDimitry Andric                      needToSplitMemOp(Query, Op == G_LOAD);
11838bcb0991SDimitry Andric             },
11848bcb0991SDimitry Andric             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
11858bcb0991SDimitry Andric               const LLT DstTy = Query.Types[0];
11868bcb0991SDimitry Andric               const LLT PtrTy = Query.Types[1];
11878bcb0991SDimitry Andric 
11888bcb0991SDimitry Andric               LLT EltTy = DstTy.getElementType();
11895ffd83dbSDimitry Andric               unsigned MaxSize = maxSizeForAddrSpace(ST,
11905ffd83dbSDimitry Andric                                                      PtrTy.getAddressSpace(),
11915ffd83dbSDimitry Andric                                                      Op == G_LOAD);
11925ffd83dbSDimitry Andric 
11935ffd83dbSDimitry Andric               // FIXME: Handle widened to power of 2 results better. This ends
11945ffd83dbSDimitry Andric               // up scalarizing.
11955ffd83dbSDimitry Andric               // FIXME: 3 element stores scalarized on SI
11968bcb0991SDimitry Andric 
11978bcb0991SDimitry Andric               // Split if it's too large for the address space.
1198fe6060f1SDimitry Andric               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1199fe6060f1SDimitry Andric               if (MemSize > MaxSize) {
12008bcb0991SDimitry Andric                 unsigned NumElts = DstTy.getNumElements();
12015ffd83dbSDimitry Andric                 unsigned EltSize = EltTy.getSizeInBits();
12025ffd83dbSDimitry Andric 
12035ffd83dbSDimitry Andric                 if (MaxSize % EltSize == 0) {
12045ffd83dbSDimitry Andric                   return std::make_pair(
1205fe6060f1SDimitry Andric                       0, LLT::scalarOrVector(
1206fe6060f1SDimitry Andric                              ElementCount::getFixed(MaxSize / EltSize), EltTy));
12075ffd83dbSDimitry Andric                 }
12085ffd83dbSDimitry Andric 
1209fe6060f1SDimitry Andric                 unsigned NumPieces = MemSize / MaxSize;
12108bcb0991SDimitry Andric 
12118bcb0991SDimitry Andric                 // FIXME: Refine when odd breakdowns handled
12128bcb0991SDimitry Andric                 // The scalars will need to be re-legalized.
12138bcb0991SDimitry Andric                 if (NumPieces == 1 || NumPieces >= NumElts ||
12148bcb0991SDimitry Andric                     NumElts % NumPieces != 0)
12158bcb0991SDimitry Andric                   return std::make_pair(0, EltTy);
12168bcb0991SDimitry Andric 
1217fe6060f1SDimitry Andric                 return std::make_pair(
1218fe6060f1SDimitry Andric                     0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
12198bcb0991SDimitry Andric               }
12208bcb0991SDimitry Andric 
12215ffd83dbSDimitry Andric               // FIXME: We could probably handle weird extending loads better.
12225ffd83dbSDimitry Andric               if (DstTy.getSizeInBits() > MemSize)
12235ffd83dbSDimitry Andric                 return std::make_pair(0, EltTy);
12245ffd83dbSDimitry Andric 
12255ffd83dbSDimitry Andric               unsigned EltSize = EltTy.getSizeInBits();
12265ffd83dbSDimitry Andric               unsigned DstSize = DstTy.getSizeInBits();
12275ffd83dbSDimitry Andric               if (!isPowerOf2_32(DstSize)) {
12285ffd83dbSDimitry Andric                 // We're probably decomposing an odd sized store. Try to split
12295ffd83dbSDimitry Andric                 // to the widest type. TODO: Account for alignment. As-is it
12305ffd83dbSDimitry Andric                 // should be OK, since the new parts will be further legalized.
12315ffd83dbSDimitry Andric                 unsigned FloorSize = PowerOf2Floor(DstSize);
12325ffd83dbSDimitry Andric                 return std::make_pair(
1233fe6060f1SDimitry Andric                     0, LLT::scalarOrVector(
1234fe6060f1SDimitry Andric                            ElementCount::getFixed(FloorSize / EltSize), EltTy));
12355ffd83dbSDimitry Andric               }
12365ffd83dbSDimitry Andric 
12378bcb0991SDimitry Andric               // May need relegalization for the scalars.
12388bcb0991SDimitry Andric               return std::make_pair(0, EltTy);
12398bcb0991SDimitry Andric             })
1240fe6060f1SDimitry Andric     .minScalar(0, S32)
1241fe6060f1SDimitry Andric     .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
12428bcb0991SDimitry Andric     .widenScalarToNextPow2(0)
1243e8d8bef9SDimitry Andric     .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1244e8d8bef9SDimitry Andric     .lower();
12458bcb0991SDimitry Andric   }
12460b57cec5SDimitry Andric 
1247fe6060f1SDimitry Andric   // FIXME: Unaligned accesses not lowered.
12480b57cec5SDimitry Andric   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1249fe6060f1SDimitry Andric                        .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1250fe6060f1SDimitry Andric                                                   {S32, GlobalPtr, S16, 2 * 8},
1251fe6060f1SDimitry Andric                                                   {S32, LocalPtr, S8, 8},
1252fe6060f1SDimitry Andric                                                   {S32, LocalPtr, S16, 16},
1253fe6060f1SDimitry Andric                                                   {S32, PrivatePtr, S8, 8},
1254fe6060f1SDimitry Andric                                                   {S32, PrivatePtr, S16, 16},
1255fe6060f1SDimitry Andric                                                   {S32, ConstantPtr, S8, 8},
1256fe6060f1SDimitry Andric                                                   {S32, ConstantPtr, S16, 2 * 8}})
1257fe6060f1SDimitry Andric                        .legalIf(
1258fe6060f1SDimitry Andric                          [=](const LegalityQuery &Query) -> bool {
1259fe6060f1SDimitry Andric                            return isLoadStoreLegal(ST, Query);
1260fe6060f1SDimitry Andric                          });
1261fe6060f1SDimitry Andric 
12620b57cec5SDimitry Andric   if (ST.hasFlatAddressSpace()) {
12638bcb0991SDimitry Andric     ExtLoads.legalForTypesWithMemDesc(
1264fe6060f1SDimitry Andric         {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
12650b57cec5SDimitry Andric   }
12660b57cec5SDimitry Andric 
1267fe6060f1SDimitry Andric   // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1268fe6060f1SDimitry Andric   // 64-bits.
1269fe6060f1SDimitry Andric   //
1270fe6060f1SDimitry Andric   // TODO: Should generalize bitcast action into coerce, which will also cover
1271fe6060f1SDimitry Andric   // inserting addrspacecasts.
1272fe6060f1SDimitry Andric   ExtLoads.customIf(typeIs(1, Constant32Ptr));
1273fe6060f1SDimitry Andric 
12740b57cec5SDimitry Andric   ExtLoads.clampScalar(0, S32, S32)
12750b57cec5SDimitry Andric           .widenScalarToNextPow2(0)
12760b57cec5SDimitry Andric           .lower();
12770b57cec5SDimitry Andric 
12780b57cec5SDimitry Andric   auto &Atomics = getActionDefinitionsBuilder(
12790b57cec5SDimitry Andric     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
12800b57cec5SDimitry Andric      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
12810b57cec5SDimitry Andric      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1282480093f4SDimitry Andric      G_ATOMICRMW_UMIN})
12830b57cec5SDimitry Andric     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1284e8d8bef9SDimitry Andric                {S64, GlobalPtr}, {S64, LocalPtr},
1285e8d8bef9SDimitry Andric                {S32, RegionPtr}, {S64, RegionPtr}});
12860b57cec5SDimitry Andric   if (ST.hasFlatAddressSpace()) {
12870b57cec5SDimitry Andric     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
12880b57cec5SDimitry Andric   }
12890b57cec5SDimitry Andric 
1290fe6060f1SDimitry Andric   auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1291349cc55cSDimitry Andric   if (ST.hasLDSFPAtomicAdd()) {
1292fe6060f1SDimitry Andric     Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1293fe6060f1SDimitry Andric     if (ST.hasGFX90AInsts())
1294fe6060f1SDimitry Andric       Atomic.legalFor({{S64, LocalPtr}});
12955ffd83dbSDimitry Andric   }
1296fe6060f1SDimitry Andric   if (ST.hasAtomicFaddInsts())
1297fe6060f1SDimitry Andric     Atomic.legalFor({{S32, GlobalPtr}});
12988bcb0991SDimitry Andric 
1299*04eeddc0SDimitry Andric   if (ST.hasGFX90AInsts()) {
1300*04eeddc0SDimitry Andric     // These are legal with some caveats, and should have undergone expansion in
1301*04eeddc0SDimitry Andric     // the IR in most situations
1302*04eeddc0SDimitry Andric     // TODO: Move atomic expansion into legalizer
1303*04eeddc0SDimitry Andric     // TODO: Also supports <2 x f16>
1304*04eeddc0SDimitry Andric     Atomic.legalFor({
1305*04eeddc0SDimitry Andric         {S32, GlobalPtr},
1306*04eeddc0SDimitry Andric         {S64, GlobalPtr},
1307*04eeddc0SDimitry Andric         {S64, FlatPtr}
1308*04eeddc0SDimitry Andric       });
1309*04eeddc0SDimitry Andric   }
1310*04eeddc0SDimitry Andric 
1311480093f4SDimitry Andric   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1312480093f4SDimitry Andric   // demarshalling
1313480093f4SDimitry Andric   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1314480093f4SDimitry Andric     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1315480093f4SDimitry Andric                 {S32, FlatPtr}, {S64, FlatPtr}})
1316480093f4SDimitry Andric     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1317480093f4SDimitry Andric                {S32, RegionPtr}, {S64, RegionPtr}});
13180b57cec5SDimitry Andric   // TODO: Pointer types, any 32-bit or 64-bit vector
1319480093f4SDimitry Andric 
1320480093f4SDimitry Andric   // Condition should be s32 for scalar, s1 for vector.
13210b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_SELECT)
1322fe6060f1SDimitry Andric       .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1323fe6060f1SDimitry Andric                                  LocalPtr, FlatPtr, PrivatePtr,
1324fe6060f1SDimitry Andric                                  LLT::fixed_vector(2, LocalPtr),
1325fe6060f1SDimitry Andric                                  LLT::fixed_vector(2, PrivatePtr)},
1326fe6060f1SDimitry Andric                                 {S1, S32})
13270b57cec5SDimitry Andric       .clampScalar(0, S16, S64)
13285ffd83dbSDimitry Andric       .scalarize(1)
13290b57cec5SDimitry Andric       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
13300b57cec5SDimitry Andric       .fewerElementsIf(numElementsNotEven(0), scalarize(0))
13310b57cec5SDimitry Andric       .clampMaxNumElements(0, S32, 2)
13320b57cec5SDimitry Andric       .clampMaxNumElements(0, LocalPtr, 2)
13330b57cec5SDimitry Andric       .clampMaxNumElements(0, PrivatePtr, 2)
13340b57cec5SDimitry Andric       .scalarize(0)
13350b57cec5SDimitry Andric       .widenScalarToNextPow2(0)
1336480093f4SDimitry Andric       .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
13370b57cec5SDimitry Andric 
13380b57cec5SDimitry Andric   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
13390b57cec5SDimitry Andric   // be more flexible with the shift amount type.
13400b57cec5SDimitry Andric   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
13410b57cec5SDimitry Andric     .legalFor({{S32, S32}, {S64, S32}});
13420b57cec5SDimitry Andric   if (ST.has16BitInsts()) {
13430b57cec5SDimitry Andric     if (ST.hasVOP3PInsts()) {
13445ffd83dbSDimitry Andric       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
13450b57cec5SDimitry Andric             .clampMaxNumElements(0, S16, 2);
13460b57cec5SDimitry Andric     } else
13475ffd83dbSDimitry Andric       Shifts.legalFor({{S16, S16}});
13480b57cec5SDimitry Andric 
13495ffd83dbSDimitry Andric     // TODO: Support 16-bit shift amounts for all types
13505ffd83dbSDimitry Andric     Shifts.widenScalarIf(
13515ffd83dbSDimitry Andric       [=](const LegalityQuery &Query) {
13525ffd83dbSDimitry Andric         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
13535ffd83dbSDimitry Andric         // 32-bit amount.
13545ffd83dbSDimitry Andric         const LLT ValTy = Query.Types[0];
13555ffd83dbSDimitry Andric         const LLT AmountTy = Query.Types[1];
13565ffd83dbSDimitry Andric         return ValTy.getSizeInBits() <= 16 &&
13575ffd83dbSDimitry Andric                AmountTy.getSizeInBits() < 16;
13585ffd83dbSDimitry Andric       }, changeTo(1, S16));
13595ffd83dbSDimitry Andric     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1360480093f4SDimitry Andric     Shifts.clampScalar(1, S32, S32);
13610b57cec5SDimitry Andric     Shifts.widenScalarToNextPow2(0, 16);
1362*04eeddc0SDimitry Andric     Shifts.clampScalar(0, S16, S64);
1363e8d8bef9SDimitry Andric 
1364e8d8bef9SDimitry Andric     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1365e8d8bef9SDimitry Andric       .minScalar(0, S16)
1366e8d8bef9SDimitry Andric       .scalarize(0)
1367e8d8bef9SDimitry Andric       .lower();
13680b57cec5SDimitry Andric   } else {
13690b57cec5SDimitry Andric     // Make sure we legalize the shift amount type first, as the general
13700b57cec5SDimitry Andric     // expansion for the shifted type will produce much worse code if it hasn't
13710b57cec5SDimitry Andric     // been truncated already.
13720b57cec5SDimitry Andric     Shifts.clampScalar(1, S32, S32);
13730b57cec5SDimitry Andric     Shifts.widenScalarToNextPow2(0, 32);
1374*04eeddc0SDimitry Andric     Shifts.clampScalar(0, S32, S64);
1375e8d8bef9SDimitry Andric 
1376e8d8bef9SDimitry Andric     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1377e8d8bef9SDimitry Andric       .minScalar(0, S32)
1378e8d8bef9SDimitry Andric       .scalarize(0)
1379e8d8bef9SDimitry Andric       .lower();
13800b57cec5SDimitry Andric   }
13810b57cec5SDimitry Andric   Shifts.scalarize(0);
13820b57cec5SDimitry Andric 
13830b57cec5SDimitry Andric   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
13840b57cec5SDimitry Andric     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
13850b57cec5SDimitry Andric     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
13860b57cec5SDimitry Andric     unsigned IdxTypeIdx = 2;
13870b57cec5SDimitry Andric 
13880b57cec5SDimitry Andric     getActionDefinitionsBuilder(Op)
13890b57cec5SDimitry Andric       .customIf([=](const LegalityQuery &Query) {
13900b57cec5SDimitry Andric           const LLT EltTy = Query.Types[EltTypeIdx];
13910b57cec5SDimitry Andric           const LLT VecTy = Query.Types[VecTypeIdx];
13920b57cec5SDimitry Andric           const LLT IdxTy = Query.Types[IdxTypeIdx];
1393e8d8bef9SDimitry Andric           const unsigned EltSize = EltTy.getSizeInBits();
1394e8d8bef9SDimitry Andric           return (EltSize == 32 || EltSize == 64) &&
13950b57cec5SDimitry Andric                   VecTy.getSizeInBits() % 32 == 0 &&
13965ffd83dbSDimitry Andric                   VecTy.getSizeInBits() <= MaxRegisterSize &&
13970b57cec5SDimitry Andric                   IdxTy.getSizeInBits() == 32;
13980b57cec5SDimitry Andric         })
1399e8d8bef9SDimitry Andric       .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1400e8d8bef9SDimitry Andric                  bitcastToVectorElement32(VecTypeIdx))
1401e8d8bef9SDimitry Andric       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1402e8d8bef9SDimitry Andric       .bitcastIf(
1403e8d8bef9SDimitry Andric         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1404e8d8bef9SDimitry Andric         [=](const LegalityQuery &Query) {
1405e8d8bef9SDimitry Andric           // For > 64-bit element types, try to turn this into a 64-bit
1406e8d8bef9SDimitry Andric           // element vector since we may be able to do better indexing
1407e8d8bef9SDimitry Andric           // if this is scalar. If not, fall back to 32.
1408e8d8bef9SDimitry Andric           const LLT EltTy = Query.Types[EltTypeIdx];
1409e8d8bef9SDimitry Andric           const LLT VecTy = Query.Types[VecTypeIdx];
1410e8d8bef9SDimitry Andric           const unsigned DstEltSize = EltTy.getSizeInBits();
1411e8d8bef9SDimitry Andric           const unsigned VecSize = VecTy.getSizeInBits();
1412e8d8bef9SDimitry Andric 
1413e8d8bef9SDimitry Andric           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1414e8d8bef9SDimitry Andric           return std::make_pair(
1415fe6060f1SDimitry Andric               VecTypeIdx,
1416fe6060f1SDimitry Andric               LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1417e8d8bef9SDimitry Andric         })
14180b57cec5SDimitry Andric       .clampScalar(EltTypeIdx, S32, S64)
14190b57cec5SDimitry Andric       .clampScalar(VecTypeIdx, S32, S64)
1420e8d8bef9SDimitry Andric       .clampScalar(IdxTypeIdx, S32, S32)
1421e8d8bef9SDimitry Andric       .clampMaxNumElements(VecTypeIdx, S32, 32)
1422e8d8bef9SDimitry Andric       // TODO: Clamp elements for 64-bit vectors?
1423e8d8bef9SDimitry Andric       // It should only be necessary with variable indexes.
1424e8d8bef9SDimitry Andric       // As a last resort, lower to the stack
1425e8d8bef9SDimitry Andric       .lower();
14260b57cec5SDimitry Andric   }
14270b57cec5SDimitry Andric 
14280b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
14290b57cec5SDimitry Andric     .unsupportedIf([=](const LegalityQuery &Query) {
14300b57cec5SDimitry Andric         const LLT &EltTy = Query.Types[1].getElementType();
14310b57cec5SDimitry Andric         return Query.Types[0] != EltTy;
14320b57cec5SDimitry Andric       });
14330b57cec5SDimitry Andric 
14340b57cec5SDimitry Andric   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
14350b57cec5SDimitry Andric     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
14360b57cec5SDimitry Andric     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
14370b57cec5SDimitry Andric 
14380b57cec5SDimitry Andric     // FIXME: Doesn't handle extract of illegal sizes.
14390b57cec5SDimitry Andric     getActionDefinitionsBuilder(Op)
14408bcb0991SDimitry Andric       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
14410eae32dcSDimitry Andric       .lowerIf([=](const LegalityQuery &Query) {
14420eae32dcSDimitry Andric           // Sub-vector(or single element) insert and extract.
14430eae32dcSDimitry Andric           // TODO: verify immediate offset here since lower only works with
14440eae32dcSDimitry Andric           // whole elements.
14450eae32dcSDimitry Andric           const LLT BigTy = Query.Types[BigTyIdx];
14460eae32dcSDimitry Andric           return BigTy.isVector();
14470eae32dcSDimitry Andric         })
14488bcb0991SDimitry Andric       // FIXME: Multiples of 16 should not be legal.
14490b57cec5SDimitry Andric       .legalIf([=](const LegalityQuery &Query) {
14500b57cec5SDimitry Andric           const LLT BigTy = Query.Types[BigTyIdx];
14510b57cec5SDimitry Andric           const LLT LitTy = Query.Types[LitTyIdx];
14520b57cec5SDimitry Andric           return (BigTy.getSizeInBits() % 32 == 0) &&
14530b57cec5SDimitry Andric                  (LitTy.getSizeInBits() % 16 == 0);
14540b57cec5SDimitry Andric         })
14550b57cec5SDimitry Andric       .widenScalarIf(
14560b57cec5SDimitry Andric         [=](const LegalityQuery &Query) {
14570b57cec5SDimitry Andric           const LLT BigTy = Query.Types[BigTyIdx];
14580b57cec5SDimitry Andric           return (BigTy.getScalarSizeInBits() < 16);
14590b57cec5SDimitry Andric         },
14600b57cec5SDimitry Andric         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
14610b57cec5SDimitry Andric       .widenScalarIf(
14620b57cec5SDimitry Andric         [=](const LegalityQuery &Query) {
14630b57cec5SDimitry Andric           const LLT LitTy = Query.Types[LitTyIdx];
14640b57cec5SDimitry Andric           return (LitTy.getScalarSizeInBits() < 16);
14650b57cec5SDimitry Andric         },
14660b57cec5SDimitry Andric         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
14670b57cec5SDimitry Andric       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
14680b57cec5SDimitry Andric       .widenScalarToNextPow2(BigTyIdx, 32);
14690b57cec5SDimitry Andric 
14700b57cec5SDimitry Andric   }
14710b57cec5SDimitry Andric 
14728bcb0991SDimitry Andric   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
14730b57cec5SDimitry Andric     .legalForCartesianProduct(AllS32Vectors, {S32})
14740b57cec5SDimitry Andric     .legalForCartesianProduct(AllS64Vectors, {S64})
14758bcb0991SDimitry Andric     .clampNumElements(0, V16S32, V32S32)
14768bcb0991SDimitry Andric     .clampNumElements(0, V2S64, V16S64)
14778bcb0991SDimitry Andric     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
14788bcb0991SDimitry Andric 
14798bcb0991SDimitry Andric   if (ST.hasScalarPackInsts()) {
14805ffd83dbSDimitry Andric     BuildVector
14815ffd83dbSDimitry Andric       // FIXME: Should probably widen s1 vectors straight to s32
14825ffd83dbSDimitry Andric       .minScalarOrElt(0, S16)
14835ffd83dbSDimitry Andric       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
14845ffd83dbSDimitry Andric       .minScalar(1, S32);
14855ffd83dbSDimitry Andric 
14868bcb0991SDimitry Andric     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
14878bcb0991SDimitry Andric       .legalFor({V2S16, S32})
14888bcb0991SDimitry Andric       .lower();
14895ffd83dbSDimitry Andric     BuildVector.minScalarOrElt(0, S32);
14908bcb0991SDimitry Andric   } else {
14915ffd83dbSDimitry Andric     BuildVector.customFor({V2S16, S16});
14925ffd83dbSDimitry Andric     BuildVector.minScalarOrElt(0, S32);
14935ffd83dbSDimitry Andric 
14948bcb0991SDimitry Andric     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
14955ffd83dbSDimitry Andric       .customFor({V2S16, S32})
14968bcb0991SDimitry Andric       .lower();
14978bcb0991SDimitry Andric   }
14988bcb0991SDimitry Andric 
14995ffd83dbSDimitry Andric   BuildVector.legalIf(isRegisterType(0));
15005ffd83dbSDimitry Andric 
15015ffd83dbSDimitry Andric   // FIXME: Clamp maximum size
15020b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1503e8d8bef9SDimitry Andric     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1504e8d8bef9SDimitry Andric     .clampMaxNumElements(0, S32, 32)
1505e8d8bef9SDimitry Andric     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1506e8d8bef9SDimitry Andric     .clampMaxNumElements(0, S16, 64);
15070b57cec5SDimitry Andric 
15085ffd83dbSDimitry Andric   // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
15095ffd83dbSDimitry Andric   // pre-legalize.
15105ffd83dbSDimitry Andric   if (ST.hasVOP3PInsts()) {
15115ffd83dbSDimitry Andric     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
15125ffd83dbSDimitry Andric       .customFor({V2S16, V2S16})
15135ffd83dbSDimitry Andric       .lower();
15145ffd83dbSDimitry Andric   } else
15158bcb0991SDimitry Andric     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
15168bcb0991SDimitry Andric 
15170b57cec5SDimitry Andric   // Merge/Unmerge
15180b57cec5SDimitry Andric   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
15190b57cec5SDimitry Andric     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
15200b57cec5SDimitry Andric     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
15210b57cec5SDimitry Andric 
15220b57cec5SDimitry Andric     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
15235ffd83dbSDimitry Andric       const LLT Ty = Query.Types[TypeIdx];
15240b57cec5SDimitry Andric       if (Ty.isVector()) {
15250b57cec5SDimitry Andric         const LLT &EltTy = Ty.getElementType();
15265ffd83dbSDimitry Andric         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
15270b57cec5SDimitry Andric           return true;
15280b57cec5SDimitry Andric         if (!isPowerOf2_32(EltTy.getSizeInBits()))
15290b57cec5SDimitry Andric           return true;
15300b57cec5SDimitry Andric       }
15310b57cec5SDimitry Andric       return false;
15320b57cec5SDimitry Andric     };
15330b57cec5SDimitry Andric 
15348bcb0991SDimitry Andric     auto &Builder = getActionDefinitionsBuilder(Op)
1535e8d8bef9SDimitry Andric       .legalIf(all(isRegisterType(0), isRegisterType(1)))
15365ffd83dbSDimitry Andric       .lowerFor({{S16, V2S16}})
15375ffd83dbSDimitry Andric       .lowerIf([=](const LegalityQuery &Query) {
15385ffd83dbSDimitry Andric           const LLT BigTy = Query.Types[BigTyIdx];
15395ffd83dbSDimitry Andric           return BigTy.getSizeInBits() == 32;
15405ffd83dbSDimitry Andric         })
15415ffd83dbSDimitry Andric       // Try to widen to s16 first for small types.
15425ffd83dbSDimitry Andric       // TODO: Only do this on targets with legal s16 shifts
15435ffd83dbSDimitry Andric       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
15440b57cec5SDimitry Andric       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
15458bcb0991SDimitry Andric       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
15468bcb0991SDimitry Andric       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
15478bcb0991SDimitry Andric                            elementTypeIs(1, S16)),
15488bcb0991SDimitry Andric                        changeTo(1, V2S16))
15495ffd83dbSDimitry Andric       // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
15505ffd83dbSDimitry Andric       // worth considering the multiples of 64 since 2*192 and 2*384 are not
15515ffd83dbSDimitry Andric       // valid.
15525ffd83dbSDimitry Andric       .clampScalar(LitTyIdx, S32, S512)
15535ffd83dbSDimitry Andric       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
15540b57cec5SDimitry Andric       // Break up vectors with weird elements into scalars
15550b57cec5SDimitry Andric       .fewerElementsIf(
15565ffd83dbSDimitry Andric         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
15570b57cec5SDimitry Andric         scalarize(0))
15580b57cec5SDimitry Andric       .fewerElementsIf(
15595ffd83dbSDimitry Andric         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
15600b57cec5SDimitry Andric         scalarize(1))
15615ffd83dbSDimitry Andric       .clampScalar(BigTyIdx, S32, MaxScalar);
15628bcb0991SDimitry Andric 
15638bcb0991SDimitry Andric     if (Op == G_MERGE_VALUES) {
15648bcb0991SDimitry Andric       Builder.widenScalarIf(
15658bcb0991SDimitry Andric         // TODO: Use 16-bit shifts if legal for 8-bit values?
15660b57cec5SDimitry Andric         [=](const LegalityQuery &Query) {
15678bcb0991SDimitry Andric           const LLT Ty = Query.Types[LitTyIdx];
15688bcb0991SDimitry Andric           return Ty.getSizeInBits() < 32;
15698bcb0991SDimitry Andric         },
15708bcb0991SDimitry Andric         changeTo(LitTyIdx, S32));
15718bcb0991SDimitry Andric     }
15728bcb0991SDimitry Andric 
15738bcb0991SDimitry Andric     Builder.widenScalarIf(
15748bcb0991SDimitry Andric       [=](const LegalityQuery &Query) {
15758bcb0991SDimitry Andric         const LLT Ty = Query.Types[BigTyIdx];
15760b57cec5SDimitry Andric         return !isPowerOf2_32(Ty.getSizeInBits()) &&
15770b57cec5SDimitry Andric           Ty.getSizeInBits() % 16 != 0;
15780b57cec5SDimitry Andric       },
15790b57cec5SDimitry Andric       [=](const LegalityQuery &Query) {
15800b57cec5SDimitry Andric         // Pick the next power of 2, or a multiple of 64 over 128.
15810b57cec5SDimitry Andric         // Whichever is smaller.
15820b57cec5SDimitry Andric         const LLT &Ty = Query.Types[BigTyIdx];
15830b57cec5SDimitry Andric         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
15840b57cec5SDimitry Andric         if (NewSizeInBits >= 256) {
15850b57cec5SDimitry Andric           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
15860b57cec5SDimitry Andric           if (RoundedTo < NewSizeInBits)
15870b57cec5SDimitry Andric             NewSizeInBits = RoundedTo;
15880b57cec5SDimitry Andric         }
15890b57cec5SDimitry Andric         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
15900b57cec5SDimitry Andric       })
15910b57cec5SDimitry Andric       // Any vectors left are the wrong size. Scalarize them.
15920b57cec5SDimitry Andric       .scalarize(0)
15930b57cec5SDimitry Andric       .scalarize(1);
15940b57cec5SDimitry Andric   }
15950b57cec5SDimitry Andric 
15965ffd83dbSDimitry Andric   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
15975ffd83dbSDimitry Andric   // RegBankSelect.
15985ffd83dbSDimitry Andric   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
15995ffd83dbSDimitry Andric     .legalFor({{S32}, {S64}});
16008bcb0991SDimitry Andric 
16015ffd83dbSDimitry Andric   if (ST.hasVOP3PInsts()) {
16025ffd83dbSDimitry Andric     SextInReg.lowerFor({{V2S16}})
16035ffd83dbSDimitry Andric       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
16045ffd83dbSDimitry Andric       // get more vector shift opportunities, since we'll get those when
16055ffd83dbSDimitry Andric       // expanded.
16060eae32dcSDimitry Andric       .clampMaxNumElementsStrict(0, S16, 2);
16075ffd83dbSDimitry Andric   } else if (ST.has16BitInsts()) {
16085ffd83dbSDimitry Andric     SextInReg.lowerFor({{S32}, {S64}, {S16}});
16095ffd83dbSDimitry Andric   } else {
16105ffd83dbSDimitry Andric     // Prefer to promote to s32 before lowering if we don't have 16-bit
16115ffd83dbSDimitry Andric     // shifts. This avoid a lot of intermediate truncate and extend operations.
16125ffd83dbSDimitry Andric     SextInReg.lowerFor({{S32}, {S64}});
16135ffd83dbSDimitry Andric   }
16145ffd83dbSDimitry Andric 
16155ffd83dbSDimitry Andric   SextInReg
16165ffd83dbSDimitry Andric     .scalarize(0)
16175ffd83dbSDimitry Andric     .clampScalar(0, S32, S64)
16185ffd83dbSDimitry Andric     .lower();
16195ffd83dbSDimitry Andric 
1620349cc55cSDimitry Andric   getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1621349cc55cSDimitry Andric     .scalarize(0)
1622349cc55cSDimitry Andric     .lower();
1623349cc55cSDimitry Andric 
  // TODO: Only try to form v2s16 with legal packed instructions.
16255ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_FSHR)
16265ffd83dbSDimitry Andric     .legalFor({{S32, S32}})
1627fe6060f1SDimitry Andric     .lowerFor({{V2S16, V2S16}})
16280eae32dcSDimitry Andric     .clampMaxNumElementsStrict(0, S16, 2)
16295ffd83dbSDimitry Andric     .scalarize(0)
16305ffd83dbSDimitry Andric     .lower();
1631480093f4SDimitry Andric 
1632fe6060f1SDimitry Andric   if (ST.hasVOP3PInsts()) {
1633fe6060f1SDimitry Andric     getActionDefinitionsBuilder(G_FSHL)
1634fe6060f1SDimitry Andric       .lowerFor({{V2S16, V2S16}})
16350eae32dcSDimitry Andric       .clampMaxNumElementsStrict(0, S16, 2)
1636fe6060f1SDimitry Andric       .scalarize(0)
1637fe6060f1SDimitry Andric       .lower();
1638fe6060f1SDimitry Andric   } else {
1639fe6060f1SDimitry Andric     getActionDefinitionsBuilder(G_FSHL)
1640fe6060f1SDimitry Andric       .scalarize(0)
1641fe6060f1SDimitry Andric       .lower();
1642fe6060f1SDimitry Andric   }
1643fe6060f1SDimitry Andric 
1644480093f4SDimitry Andric   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1645480093f4SDimitry Andric     .legalFor({S64});
1646480093f4SDimitry Andric 
1647e8d8bef9SDimitry Andric   getActionDefinitionsBuilder(G_FENCE)
1648e8d8bef9SDimitry Andric     .alwaysLegal();
1649e8d8bef9SDimitry Andric 
1650fe6060f1SDimitry Andric   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1651fe6060f1SDimitry Andric       .scalarize(0)
1652fe6060f1SDimitry Andric       .minScalar(0, S32)
1653fe6060f1SDimitry Andric       .lower();
1654fe6060f1SDimitry Andric 
1655fe6060f1SDimitry Andric   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1656fe6060f1SDimitry Andric       .legalFor({{S32, S32}, {S64, S32}})
1657fe6060f1SDimitry Andric       .clampScalar(1, S32, S32)
1658fe6060f1SDimitry Andric       .clampScalar(0, S32, S64)
1659fe6060f1SDimitry Andric       .widenScalarToNextPow2(0)
1660fe6060f1SDimitry Andric       .scalarize(0);
1661fe6060f1SDimitry Andric 
16625ffd83dbSDimitry Andric   getActionDefinitionsBuilder({
16635ffd83dbSDimitry Andric       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
16645ffd83dbSDimitry Andric       G_FCOPYSIGN,
16655ffd83dbSDimitry Andric 
16665ffd83dbSDimitry Andric       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1667e8d8bef9SDimitry Andric       G_ATOMICRMW_NAND,
1668e8d8bef9SDimitry Andric       G_ATOMICRMW_FSUB,
16695ffd83dbSDimitry Andric       G_READ_REGISTER,
16705ffd83dbSDimitry Andric       G_WRITE_REGISTER,
16715ffd83dbSDimitry Andric 
16725ffd83dbSDimitry Andric       G_SADDO, G_SSUBO,
16735ffd83dbSDimitry Andric 
16745ffd83dbSDimitry Andric        // TODO: Implement
1675fe6060f1SDimitry Andric       G_FMINIMUM, G_FMAXIMUM}).lower();
16765ffd83dbSDimitry Andric 
1677349cc55cSDimitry Andric   getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
1678349cc55cSDimitry Andric       .lower();
1679349cc55cSDimitry Andric 
1680480093f4SDimitry Andric   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
16815ffd83dbSDimitry Andric         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1682480093f4SDimitry Andric         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1683480093f4SDimitry Andric     .unsupported();
1684480093f4SDimitry Andric 
1685fe6060f1SDimitry Andric   getLegacyLegalizerInfo().computeTables();
16860b57cec5SDimitry Andric   verify(*ST.getInstrInfo());
16870b57cec5SDimitry Andric }
16880b57cec5SDimitry Andric 
16895ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
16905ffd83dbSDimitry Andric                                          MachineInstr &MI) const {
16915ffd83dbSDimitry Andric   MachineIRBuilder &B = Helper.MIRBuilder;
16925ffd83dbSDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
16935ffd83dbSDimitry Andric 
16940b57cec5SDimitry Andric   switch (MI.getOpcode()) {
16950b57cec5SDimitry Andric   case TargetOpcode::G_ADDRSPACE_CAST:
16968bcb0991SDimitry Andric     return legalizeAddrSpaceCast(MI, MRI, B);
16970b57cec5SDimitry Andric   case TargetOpcode::G_FRINT:
16988bcb0991SDimitry Andric     return legalizeFrint(MI, MRI, B);
16990b57cec5SDimitry Andric   case TargetOpcode::G_FCEIL:
17008bcb0991SDimitry Andric     return legalizeFceil(MI, MRI, B);
1701e8d8bef9SDimitry Andric   case TargetOpcode::G_FREM:
1702e8d8bef9SDimitry Andric     return legalizeFrem(MI, MRI, B);
17030b57cec5SDimitry Andric   case TargetOpcode::G_INTRINSIC_TRUNC:
17048bcb0991SDimitry Andric     return legalizeIntrinsicTrunc(MI, MRI, B);
17050b57cec5SDimitry Andric   case TargetOpcode::G_SITOFP:
17068bcb0991SDimitry Andric     return legalizeITOFP(MI, MRI, B, true);
17070b57cec5SDimitry Andric   case TargetOpcode::G_UITOFP:
17088bcb0991SDimitry Andric     return legalizeITOFP(MI, MRI, B, false);
17095ffd83dbSDimitry Andric   case TargetOpcode::G_FPTOSI:
17105ffd83dbSDimitry Andric     return legalizeFPTOI(MI, MRI, B, true);
17115ffd83dbSDimitry Andric   case TargetOpcode::G_FPTOUI:
17125ffd83dbSDimitry Andric     return legalizeFPTOI(MI, MRI, B, false);
17130b57cec5SDimitry Andric   case TargetOpcode::G_FMINNUM:
17140b57cec5SDimitry Andric   case TargetOpcode::G_FMAXNUM:
17150b57cec5SDimitry Andric   case TargetOpcode::G_FMINNUM_IEEE:
17160b57cec5SDimitry Andric   case TargetOpcode::G_FMAXNUM_IEEE:
17175ffd83dbSDimitry Andric     return legalizeMinNumMaxNum(Helper, MI);
17180b57cec5SDimitry Andric   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
17198bcb0991SDimitry Andric     return legalizeExtractVectorElt(MI, MRI, B);
17200b57cec5SDimitry Andric   case TargetOpcode::G_INSERT_VECTOR_ELT:
17218bcb0991SDimitry Andric     return legalizeInsertVectorElt(MI, MRI, B);
17225ffd83dbSDimitry Andric   case TargetOpcode::G_SHUFFLE_VECTOR:
17235ffd83dbSDimitry Andric     return legalizeShuffleVector(MI, MRI, B);
17248bcb0991SDimitry Andric   case TargetOpcode::G_FSIN:
17258bcb0991SDimitry Andric   case TargetOpcode::G_FCOS:
17268bcb0991SDimitry Andric     return legalizeSinCos(MI, MRI, B);
17278bcb0991SDimitry Andric   case TargetOpcode::G_GLOBAL_VALUE:
17288bcb0991SDimitry Andric     return legalizeGlobalValue(MI, MRI, B);
17298bcb0991SDimitry Andric   case TargetOpcode::G_LOAD:
1730fe6060f1SDimitry Andric   case TargetOpcode::G_SEXTLOAD:
1731fe6060f1SDimitry Andric   case TargetOpcode::G_ZEXTLOAD:
1732e8d8bef9SDimitry Andric     return legalizeLoad(Helper, MI);
17338bcb0991SDimitry Andric   case TargetOpcode::G_FMAD:
17348bcb0991SDimitry Andric     return legalizeFMad(MI, MRI, B);
17358bcb0991SDimitry Andric   case TargetOpcode::G_FDIV:
17368bcb0991SDimitry Andric     return legalizeFDIV(MI, MRI, B);
17375ffd83dbSDimitry Andric   case TargetOpcode::G_UDIV:
17385ffd83dbSDimitry Andric   case TargetOpcode::G_UREM:
1739fe6060f1SDimitry Andric   case TargetOpcode::G_UDIVREM:
1740fe6060f1SDimitry Andric     return legalizeUnsignedDIV_REM(MI, MRI, B);
17415ffd83dbSDimitry Andric   case TargetOpcode::G_SDIV:
17425ffd83dbSDimitry Andric   case TargetOpcode::G_SREM:
1743fe6060f1SDimitry Andric   case TargetOpcode::G_SDIVREM:
1744fe6060f1SDimitry Andric     return legalizeSignedDIV_REM(MI, MRI, B);
1745480093f4SDimitry Andric   case TargetOpcode::G_ATOMIC_CMPXCHG:
1746480093f4SDimitry Andric     return legalizeAtomicCmpXChg(MI, MRI, B);
17475ffd83dbSDimitry Andric   case TargetOpcode::G_FLOG:
17485ffd83dbSDimitry Andric     return legalizeFlog(MI, B, numbers::ln2f);
17495ffd83dbSDimitry Andric   case TargetOpcode::G_FLOG10:
17505ffd83dbSDimitry Andric     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
17515ffd83dbSDimitry Andric   case TargetOpcode::G_FEXP:
17525ffd83dbSDimitry Andric     return legalizeFExp(MI, B);
17535ffd83dbSDimitry Andric   case TargetOpcode::G_FPOW:
17545ffd83dbSDimitry Andric     return legalizeFPow(MI, B);
17555ffd83dbSDimitry Andric   case TargetOpcode::G_FFLOOR:
17565ffd83dbSDimitry Andric     return legalizeFFloor(MI, MRI, B);
17575ffd83dbSDimitry Andric   case TargetOpcode::G_BUILD_VECTOR:
17585ffd83dbSDimitry Andric     return legalizeBuildVector(MI, MRI, B);
1759349cc55cSDimitry Andric   case TargetOpcode::G_CTLZ:
1760349cc55cSDimitry Andric   case TargetOpcode::G_CTTZ:
1761349cc55cSDimitry Andric     return legalizeCTLZ_CTTZ(MI, MRI, B);
17620b57cec5SDimitry Andric   default:
17630b57cec5SDimitry Andric     return false;
17640b57cec5SDimitry Andric   }
17650b57cec5SDimitry Andric 
17660b57cec5SDimitry Andric   llvm_unreachable("expected switch to return");
17670b57cec5SDimitry Andric }
17680b57cec5SDimitry Andric 
17690b57cec5SDimitry Andric Register AMDGPULegalizerInfo::getSegmentAperture(
17700b57cec5SDimitry Andric   unsigned AS,
17710b57cec5SDimitry Andric   MachineRegisterInfo &MRI,
17728bcb0991SDimitry Andric   MachineIRBuilder &B) const {
17738bcb0991SDimitry Andric   MachineFunction &MF = B.getMF();
17740b57cec5SDimitry Andric   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
17750b57cec5SDimitry Andric   const LLT S32 = LLT::scalar(32);
17760b57cec5SDimitry Andric 
17778bcb0991SDimitry Andric   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
17788bcb0991SDimitry Andric 
17790b57cec5SDimitry Andric   if (ST.hasApertureRegs()) {
17800b57cec5SDimitry Andric     // FIXME: Use inline constants (src_{shared, private}_base) instead of
17810b57cec5SDimitry Andric     // getreg.
17820b57cec5SDimitry Andric     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
17830b57cec5SDimitry Andric         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
17840b57cec5SDimitry Andric         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
17850b57cec5SDimitry Andric     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
17860b57cec5SDimitry Andric         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
17870b57cec5SDimitry Andric         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
17880b57cec5SDimitry Andric     unsigned Encoding =
17890b57cec5SDimitry Andric         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
17900b57cec5SDimitry Andric         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
17910b57cec5SDimitry Andric         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
17920b57cec5SDimitry Andric 
17930b57cec5SDimitry Andric     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
17940b57cec5SDimitry Andric 
17958bcb0991SDimitry Andric     B.buildInstr(AMDGPU::S_GETREG_B32)
17960b57cec5SDimitry Andric       .addDef(GetReg)
17970b57cec5SDimitry Andric       .addImm(Encoding);
17980b57cec5SDimitry Andric     MRI.setType(GetReg, S32);
17990b57cec5SDimitry Andric 
18008bcb0991SDimitry Andric     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
18015ffd83dbSDimitry Andric     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
18020b57cec5SDimitry Andric   }
18030b57cec5SDimitry Andric 
18040b57cec5SDimitry Andric   Register QueuePtr = MRI.createGenericVirtualRegister(
18050b57cec5SDimitry Andric     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
18060b57cec5SDimitry Andric 
1807e8d8bef9SDimitry Andric   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
18088bcb0991SDimitry Andric     return Register();
18090b57cec5SDimitry Andric 
18100b57cec5SDimitry Andric   // Offset into amd_queue_t for group_segment_aperture_base_hi /
18110b57cec5SDimitry Andric   // private_segment_aperture_base_hi.
18120b57cec5SDimitry Andric   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
18130b57cec5SDimitry Andric 
1814480093f4SDimitry Andric   // TODO: can we be smarter about machine pointer info?
1815480093f4SDimitry Andric   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
18160b57cec5SDimitry Andric   MachineMemOperand *MMO = MF.getMachineMemOperand(
18170b57cec5SDimitry Andric       PtrInfo,
18185ffd83dbSDimitry Andric       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
18190b57cec5SDimitry Andric           MachineMemOperand::MOInvariant,
1820fe6060f1SDimitry Andric       LLT::scalar(32), commonAlignment(Align(64), StructOffset));
18210b57cec5SDimitry Andric 
18220b57cec5SDimitry Andric   Register LoadAddr;
18230b57cec5SDimitry Andric 
1824480093f4SDimitry Andric   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
18255ffd83dbSDimitry Andric   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
18260b57cec5SDimitry Andric }
18270b57cec5SDimitry Andric 
1828*04eeddc0SDimitry Andric /// Return true if the value is a known valid address, such that a null check is
1829*04eeddc0SDimitry Andric /// not necessary.
1830*04eeddc0SDimitry Andric static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
1831*04eeddc0SDimitry Andric                            const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
1832*04eeddc0SDimitry Andric   MachineInstr *Def = MRI.getVRegDef(Val);
1833*04eeddc0SDimitry Andric   switch (Def->getOpcode()) {
1834*04eeddc0SDimitry Andric   case AMDGPU::G_FRAME_INDEX:
1835*04eeddc0SDimitry Andric   case AMDGPU::G_GLOBAL_VALUE:
1836*04eeddc0SDimitry Andric   case AMDGPU::G_BLOCK_ADDR:
1837*04eeddc0SDimitry Andric     return true;
1838*04eeddc0SDimitry Andric   case AMDGPU::G_CONSTANT: {
1839*04eeddc0SDimitry Andric     const ConstantInt *CI = Def->getOperand(1).getCImm();
1840*04eeddc0SDimitry Andric     return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
1841*04eeddc0SDimitry Andric   }
1842*04eeddc0SDimitry Andric   default:
1843*04eeddc0SDimitry Andric     return false;
1844*04eeddc0SDimitry Andric   }
1845*04eeddc0SDimitry Andric 
1846*04eeddc0SDimitry Andric   return false;
1847*04eeddc0SDimitry Andric }
1848*04eeddc0SDimitry Andric 
18490b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
18500b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
18518bcb0991SDimitry Andric   MachineIRBuilder &B) const {
18528bcb0991SDimitry Andric   MachineFunction &MF = B.getMF();
18530b57cec5SDimitry Andric 
18548bcb0991SDimitry Andric   const LLT S32 = LLT::scalar(32);
18550b57cec5SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
18560b57cec5SDimitry Andric   Register Src = MI.getOperand(1).getReg();
18570b57cec5SDimitry Andric 
18580b57cec5SDimitry Andric   LLT DstTy = MRI.getType(Dst);
18590b57cec5SDimitry Andric   LLT SrcTy = MRI.getType(Src);
18600b57cec5SDimitry Andric   unsigned DestAS = DstTy.getAddressSpace();
18610b57cec5SDimitry Andric   unsigned SrcAS = SrcTy.getAddressSpace();
18620b57cec5SDimitry Andric 
18630b57cec5SDimitry Andric   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
18640b57cec5SDimitry Andric   // vector element.
18650b57cec5SDimitry Andric   assert(!DstTy.isVector());
18660b57cec5SDimitry Andric 
18670b57cec5SDimitry Andric   const AMDGPUTargetMachine &TM
18680b57cec5SDimitry Andric     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
18690b57cec5SDimitry Andric 
1870e8d8bef9SDimitry Andric   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
18718bcb0991SDimitry Andric     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
18728bcb0991SDimitry Andric     return true;
18738bcb0991SDimitry Andric   }
18748bcb0991SDimitry Andric 
18758bcb0991SDimitry Andric   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
18768bcb0991SDimitry Andric     // Truncate.
18778bcb0991SDimitry Andric     B.buildExtract(Dst, Src, 0);
18788bcb0991SDimitry Andric     MI.eraseFromParent();
18798bcb0991SDimitry Andric     return true;
18808bcb0991SDimitry Andric   }
18818bcb0991SDimitry Andric 
18828bcb0991SDimitry Andric   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
18838bcb0991SDimitry Andric     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
18848bcb0991SDimitry Andric     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
18858bcb0991SDimitry Andric 
18868bcb0991SDimitry Andric     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
18878bcb0991SDimitry Andric     // another. Merge operands are required to be the same type, but creating an
18888bcb0991SDimitry Andric     // extra ptrtoint would be kind of pointless.
18898bcb0991SDimitry Andric     auto HighAddr = B.buildConstant(
18908bcb0991SDimitry Andric       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
18915ffd83dbSDimitry Andric     B.buildMerge(Dst, {Src, HighAddr});
18928bcb0991SDimitry Andric     MI.eraseFromParent();
18930b57cec5SDimitry Andric     return true;
18940b57cec5SDimitry Andric   }
18950b57cec5SDimitry Andric 
18960b57cec5SDimitry Andric   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
18970b57cec5SDimitry Andric     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
18980b57cec5SDimitry Andric            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
1899*04eeddc0SDimitry Andric 
1900*04eeddc0SDimitry Andric     if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
1901*04eeddc0SDimitry Andric       // Extract low 32-bits of the pointer.
1902*04eeddc0SDimitry Andric       B.buildExtract(Dst, Src, 0);
1903*04eeddc0SDimitry Andric       MI.eraseFromParent();
1904*04eeddc0SDimitry Andric       return true;
1905*04eeddc0SDimitry Andric     }
1906*04eeddc0SDimitry Andric 
19070b57cec5SDimitry Andric     unsigned NullVal = TM.getNullPointerValue(DestAS);
19080b57cec5SDimitry Andric 
19098bcb0991SDimitry Andric     auto SegmentNull = B.buildConstant(DstTy, NullVal);
19108bcb0991SDimitry Andric     auto FlatNull = B.buildConstant(SrcTy, 0);
19110b57cec5SDimitry Andric 
19120b57cec5SDimitry Andric     // Extract low 32-bits of the pointer.
19135ffd83dbSDimitry Andric     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
19140b57cec5SDimitry Andric 
19155ffd83dbSDimitry Andric     auto CmpRes =
19165ffd83dbSDimitry Andric         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
19178bcb0991SDimitry Andric     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
19180b57cec5SDimitry Andric 
19190b57cec5SDimitry Andric     MI.eraseFromParent();
19200b57cec5SDimitry Andric     return true;
19210b57cec5SDimitry Andric   }
19220b57cec5SDimitry Andric 
19238bcb0991SDimitry Andric   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
19248bcb0991SDimitry Andric     return false;
19258bcb0991SDimitry Andric 
19268bcb0991SDimitry Andric   if (!ST.hasFlatAddressSpace())
19278bcb0991SDimitry Andric     return false;
19280b57cec5SDimitry Andric 
19298bcb0991SDimitry Andric   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
19308bcb0991SDimitry Andric   if (!ApertureReg.isValid())
19318bcb0991SDimitry Andric     return false;
19320b57cec5SDimitry Andric 
19330b57cec5SDimitry Andric   // Coerce the type of the low half of the result so we can use merge_values.
19345ffd83dbSDimitry Andric   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
19350b57cec5SDimitry Andric 
19360b57cec5SDimitry Andric   // TODO: Should we allow mismatched types but matching sizes in merges to
19370b57cec5SDimitry Andric   // avoid the ptrtoint?
19385ffd83dbSDimitry Andric   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
1939*04eeddc0SDimitry Andric 
1940*04eeddc0SDimitry Andric   if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
1941*04eeddc0SDimitry Andric     B.buildCopy(Dst, BuildPtr);
1942*04eeddc0SDimitry Andric     MI.eraseFromParent();
1943*04eeddc0SDimitry Andric     return true;
1944*04eeddc0SDimitry Andric   }
1945*04eeddc0SDimitry Andric 
1946*04eeddc0SDimitry Andric   auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
1947*04eeddc0SDimitry Andric   auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
1948*04eeddc0SDimitry Andric 
1949*04eeddc0SDimitry Andric   auto CmpRes =
1950*04eeddc0SDimitry Andric       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
1951*04eeddc0SDimitry Andric 
19525ffd83dbSDimitry Andric   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
19530b57cec5SDimitry Andric 
19540b57cec5SDimitry Andric   MI.eraseFromParent();
19550b57cec5SDimitry Andric   return true;
19560b57cec5SDimitry Andric }
19570b57cec5SDimitry Andric 
19580b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeFrint(
19590b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
19608bcb0991SDimitry Andric   MachineIRBuilder &B) const {
19610b57cec5SDimitry Andric   Register Src = MI.getOperand(1).getReg();
19620b57cec5SDimitry Andric   LLT Ty = MRI.getType(Src);
19630b57cec5SDimitry Andric   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
19640b57cec5SDimitry Andric 
19650b57cec5SDimitry Andric   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
19660b57cec5SDimitry Andric   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
19670b57cec5SDimitry Andric 
19688bcb0991SDimitry Andric   auto C1 = B.buildFConstant(Ty, C1Val);
19698bcb0991SDimitry Andric   auto CopySign = B.buildFCopysign(Ty, C1, Src);
19700b57cec5SDimitry Andric 
19710b57cec5SDimitry Andric   // TODO: Should this propagate fast-math-flags?
19728bcb0991SDimitry Andric   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
19738bcb0991SDimitry Andric   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
19740b57cec5SDimitry Andric 
19758bcb0991SDimitry Andric   auto C2 = B.buildFConstant(Ty, C2Val);
19768bcb0991SDimitry Andric   auto Fabs = B.buildFAbs(Ty, Src);
19770b57cec5SDimitry Andric 
19788bcb0991SDimitry Andric   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
19798bcb0991SDimitry Andric   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1980e8d8bef9SDimitry Andric   MI.eraseFromParent();
19810b57cec5SDimitry Andric   return true;
19820b57cec5SDimitry Andric }
19830b57cec5SDimitry Andric 
19840b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeFceil(
19850b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
19860b57cec5SDimitry Andric   MachineIRBuilder &B) const {
19870b57cec5SDimitry Andric 
19880b57cec5SDimitry Andric   const LLT S1 = LLT::scalar(1);
19890b57cec5SDimitry Andric   const LLT S64 = LLT::scalar(64);
19900b57cec5SDimitry Andric 
19910b57cec5SDimitry Andric   Register Src = MI.getOperand(1).getReg();
19920b57cec5SDimitry Andric   assert(MRI.getType(Src) == S64);
19930b57cec5SDimitry Andric 
19940b57cec5SDimitry Andric   // result = trunc(src)
19950b57cec5SDimitry Andric   // if (src > 0.0 && src != result)
19960b57cec5SDimitry Andric   //   result += 1.0
19970b57cec5SDimitry Andric 
19985ffd83dbSDimitry Andric   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
19990b57cec5SDimitry Andric 
20000b57cec5SDimitry Andric   const auto Zero = B.buildFConstant(S64, 0.0);
20010b57cec5SDimitry Andric   const auto One = B.buildFConstant(S64, 1.0);
20020b57cec5SDimitry Andric   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
20030b57cec5SDimitry Andric   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
20040b57cec5SDimitry Andric   auto And = B.buildAnd(S1, Lt0, NeTrunc);
20050b57cec5SDimitry Andric   auto Add = B.buildSelect(S64, And, One, Zero);
20060b57cec5SDimitry Andric 
20070b57cec5SDimitry Andric   // TODO: Should this propagate fast-math-flags?
20080b57cec5SDimitry Andric   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2009*04eeddc0SDimitry Andric   MI.eraseFromParent();
20100b57cec5SDimitry Andric   return true;
20110b57cec5SDimitry Andric }
20120b57cec5SDimitry Andric 
2013e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeFrem(
2014e8d8bef9SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
2015e8d8bef9SDimitry Andric   MachineIRBuilder &B) const {
2016e8d8bef9SDimitry Andric     Register DstReg = MI.getOperand(0).getReg();
2017e8d8bef9SDimitry Andric     Register Src0Reg = MI.getOperand(1).getReg();
2018e8d8bef9SDimitry Andric     Register Src1Reg = MI.getOperand(2).getReg();
2019e8d8bef9SDimitry Andric     auto Flags = MI.getFlags();
2020e8d8bef9SDimitry Andric     LLT Ty = MRI.getType(DstReg);
2021e8d8bef9SDimitry Andric 
2022e8d8bef9SDimitry Andric     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2023e8d8bef9SDimitry Andric     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2024e8d8bef9SDimitry Andric     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2025e8d8bef9SDimitry Andric     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2026e8d8bef9SDimitry Andric     MI.eraseFromParent();
2027e8d8bef9SDimitry Andric     return true;
2028e8d8bef9SDimitry Andric }
2029e8d8bef9SDimitry Andric 
2030e8d8bef9SDimitry Andric static MachineInstrBuilder extractF64Exponent(Register Hi,
20310b57cec5SDimitry Andric                                               MachineIRBuilder &B) {
20320b57cec5SDimitry Andric   const unsigned FractBits = 52;
20330b57cec5SDimitry Andric   const unsigned ExpBits = 11;
20340b57cec5SDimitry Andric   LLT S32 = LLT::scalar(32);
20350b57cec5SDimitry Andric 
20360b57cec5SDimitry Andric   auto Const0 = B.buildConstant(S32, FractBits - 32);
20370b57cec5SDimitry Andric   auto Const1 = B.buildConstant(S32, ExpBits);
20380b57cec5SDimitry Andric 
20390b57cec5SDimitry Andric   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
2040e8d8bef9SDimitry Andric     .addUse(Hi)
20410b57cec5SDimitry Andric     .addUse(Const0.getReg(0))
20420b57cec5SDimitry Andric     .addUse(Const1.getReg(0));
20430b57cec5SDimitry Andric 
20440b57cec5SDimitry Andric   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
20450b57cec5SDimitry Andric }
20460b57cec5SDimitry Andric 
20470b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
20480b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
20490b57cec5SDimitry Andric   MachineIRBuilder &B) const {
20500b57cec5SDimitry Andric   const LLT S1 = LLT::scalar(1);
20510b57cec5SDimitry Andric   const LLT S32 = LLT::scalar(32);
20520b57cec5SDimitry Andric   const LLT S64 = LLT::scalar(64);
20530b57cec5SDimitry Andric 
20540b57cec5SDimitry Andric   Register Src = MI.getOperand(1).getReg();
20550b57cec5SDimitry Andric   assert(MRI.getType(Src) == S64);
20560b57cec5SDimitry Andric 
20570b57cec5SDimitry Andric   // TODO: Should this use extract since the low half is unused?
20580b57cec5SDimitry Andric   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
20590b57cec5SDimitry Andric   Register Hi = Unmerge.getReg(1);
20600b57cec5SDimitry Andric 
20610b57cec5SDimitry Andric   // Extract the upper half, since this is where we will find the sign and
20620b57cec5SDimitry Andric   // exponent.
20630b57cec5SDimitry Andric   auto Exp = extractF64Exponent(Hi, B);
20640b57cec5SDimitry Andric 
20650b57cec5SDimitry Andric   const unsigned FractBits = 52;
20660b57cec5SDimitry Andric 
20670b57cec5SDimitry Andric   // Extract the sign bit.
20680b57cec5SDimitry Andric   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
20690b57cec5SDimitry Andric   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
20700b57cec5SDimitry Andric 
20710b57cec5SDimitry Andric   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
20720b57cec5SDimitry Andric 
20730b57cec5SDimitry Andric   const auto Zero32 = B.buildConstant(S32, 0);
20740b57cec5SDimitry Andric 
20750b57cec5SDimitry Andric   // Extend back to 64-bits.
20765ffd83dbSDimitry Andric   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
20770b57cec5SDimitry Andric 
20780b57cec5SDimitry Andric   auto Shr = B.buildAShr(S64, FractMask, Exp);
20790b57cec5SDimitry Andric   auto Not = B.buildNot(S64, Shr);
20800b57cec5SDimitry Andric   auto Tmp0 = B.buildAnd(S64, Src, Not);
20810b57cec5SDimitry Andric   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
20820b57cec5SDimitry Andric 
20830b57cec5SDimitry Andric   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
20840b57cec5SDimitry Andric   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
20850b57cec5SDimitry Andric 
20860b57cec5SDimitry Andric   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
20870b57cec5SDimitry Andric   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2088e8d8bef9SDimitry Andric   MI.eraseFromParent();
20890b57cec5SDimitry Andric   return true;
20900b57cec5SDimitry Andric }
20910b57cec5SDimitry Andric 
20920b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeITOFP(
20930b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
20940b57cec5SDimitry Andric   MachineIRBuilder &B, bool Signed) const {
20950b57cec5SDimitry Andric 
20960b57cec5SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
20970b57cec5SDimitry Andric   Register Src = MI.getOperand(1).getReg();
20980b57cec5SDimitry Andric 
20990b57cec5SDimitry Andric   const LLT S64 = LLT::scalar(64);
21000b57cec5SDimitry Andric   const LLT S32 = LLT::scalar(32);
21010b57cec5SDimitry Andric 
2102349cc55cSDimitry Andric   assert(MRI.getType(Src) == S64);
21030b57cec5SDimitry Andric 
21040b57cec5SDimitry Andric   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2105349cc55cSDimitry Andric   auto ThirtyTwo = B.buildConstant(S32, 32);
21060b57cec5SDimitry Andric 
2107349cc55cSDimitry Andric   if (MRI.getType(Dst) == S64) {
2108349cc55cSDimitry Andric     auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2109349cc55cSDimitry Andric                         : B.buildUITOFP(S64, Unmerge.getReg(1));
21100b57cec5SDimitry Andric 
21110b57cec5SDimitry Andric     auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
21120b57cec5SDimitry Andric     auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
21130b57cec5SDimitry Andric                      .addUse(CvtHi.getReg(0))
21140b57cec5SDimitry Andric                      .addUse(ThirtyTwo.getReg(0));
21150b57cec5SDimitry Andric 
21160b57cec5SDimitry Andric     // TODO: Should this propagate fast-math-flags?
21170b57cec5SDimitry Andric     B.buildFAdd(Dst, LdExp, CvtLo);
21180b57cec5SDimitry Andric     MI.eraseFromParent();
21190b57cec5SDimitry Andric     return true;
21200b57cec5SDimitry Andric   }
21210b57cec5SDimitry Andric 
2122349cc55cSDimitry Andric   assert(MRI.getType(Dst) == S32);
2123349cc55cSDimitry Andric 
2124349cc55cSDimitry Andric   auto One = B.buildConstant(S32, 1);
2125349cc55cSDimitry Andric 
2126349cc55cSDimitry Andric   MachineInstrBuilder ShAmt;
2127349cc55cSDimitry Andric   if (Signed) {
2128349cc55cSDimitry Andric     auto ThirtyOne = B.buildConstant(S32, 31);
2129349cc55cSDimitry Andric     auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2130349cc55cSDimitry Andric     auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2131349cc55cSDimitry Andric     auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2132349cc55cSDimitry Andric     auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
2133349cc55cSDimitry Andric                                /*HasSideEffects=*/false)
2134349cc55cSDimitry Andric                   .addUse(Unmerge.getReg(1));
2135349cc55cSDimitry Andric     auto LS2 = B.buildSub(S32, LS, One);
2136349cc55cSDimitry Andric     ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2137349cc55cSDimitry Andric   } else
2138349cc55cSDimitry Andric     ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2139349cc55cSDimitry Andric   auto Norm = B.buildShl(S64, Src, ShAmt);
2140349cc55cSDimitry Andric   auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2141349cc55cSDimitry Andric   auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2142349cc55cSDimitry Andric   auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2143349cc55cSDimitry Andric   auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2144349cc55cSDimitry Andric   auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2145349cc55cSDimitry Andric   B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
2146349cc55cSDimitry Andric                    /*HasSideEffects=*/false)
2147349cc55cSDimitry Andric       .addUse(FVal.getReg(0))
2148349cc55cSDimitry Andric       .addUse(Scale.getReg(0));
2149349cc55cSDimitry Andric   MI.eraseFromParent();
2150349cc55cSDimitry Andric   return true;
2151349cc55cSDimitry Andric }
2152349cc55cSDimitry Andric 
21535ffd83dbSDimitry Andric // TODO: Copied from DAG implementation. Verify logic and document how this
21545ffd83dbSDimitry Andric // actually works.
2155fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2156fe6060f1SDimitry Andric                                         MachineRegisterInfo &MRI,
2157fe6060f1SDimitry Andric                                         MachineIRBuilder &B,
2158fe6060f1SDimitry Andric                                         bool Signed) const {
21595ffd83dbSDimitry Andric 
21605ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
21615ffd83dbSDimitry Andric   Register Src = MI.getOperand(1).getReg();
21625ffd83dbSDimitry Andric 
21635ffd83dbSDimitry Andric   const LLT S64 = LLT::scalar(64);
21645ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
21655ffd83dbSDimitry Andric 
2166fe6060f1SDimitry Andric   const LLT SrcLT = MRI.getType(Src);
2167fe6060f1SDimitry Andric   assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
21685ffd83dbSDimitry Andric 
21695ffd83dbSDimitry Andric   unsigned Flags = MI.getFlags();
21705ffd83dbSDimitry Andric 
2171fe6060f1SDimitry Andric   // The basic idea of converting a floating point number into a pair of 32-bit
2172fe6060f1SDimitry Andric   // integers is illustrated as follows:
2173fe6060f1SDimitry Andric   //
2174fe6060f1SDimitry Andric   //     tf := trunc(val);
2175fe6060f1SDimitry Andric   //    hif := floor(tf * 2^-32);
2176fe6060f1SDimitry Andric   //    lof := tf - hif * 2^32; // lof is always positive due to floor.
2177fe6060f1SDimitry Andric   //     hi := fptoi(hif);
2178fe6060f1SDimitry Andric   //     lo := fptoi(lof);
2179fe6060f1SDimitry Andric   //
2180fe6060f1SDimitry Andric   auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2181fe6060f1SDimitry Andric   MachineInstrBuilder Sign;
2182fe6060f1SDimitry Andric   if (Signed && SrcLT == S32) {
2183fe6060f1SDimitry Andric     // However, a 32-bit floating point number has only 23 bits mantissa and
2184fe6060f1SDimitry Andric     // it's not enough to hold all the significant bits of `lof` if val is
2185fe6060f1SDimitry Andric     // negative. To avoid the loss of precision, We need to take the absolute
2186fe6060f1SDimitry Andric     // value after truncating and flip the result back based on the original
2187fe6060f1SDimitry Andric     // signedness.
2188fe6060f1SDimitry Andric     Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2189fe6060f1SDimitry Andric     Trunc = B.buildFAbs(S32, Trunc, Flags);
2190fe6060f1SDimitry Andric   }
2191fe6060f1SDimitry Andric   MachineInstrBuilder K0, K1;
2192fe6060f1SDimitry Andric   if (SrcLT == S64) {
2193fe6060f1SDimitry Andric     K0 = B.buildFConstant(S64,
2194fe6060f1SDimitry Andric                           BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2195fe6060f1SDimitry Andric     K1 = B.buildFConstant(S64,
2196fe6060f1SDimitry Andric                           BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2197fe6060f1SDimitry Andric   } else {
2198fe6060f1SDimitry Andric     K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
2199fe6060f1SDimitry Andric     K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
2200fe6060f1SDimitry Andric   }
22015ffd83dbSDimitry Andric 
2202fe6060f1SDimitry Andric   auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2203fe6060f1SDimitry Andric   auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2204fe6060f1SDimitry Andric   auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
22055ffd83dbSDimitry Andric 
2206fe6060f1SDimitry Andric   auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2207fe6060f1SDimitry Andric                                      : B.buildFPTOUI(S32, FloorMul);
22085ffd83dbSDimitry Andric   auto Lo = B.buildFPTOUI(S32, Fma);
22095ffd83dbSDimitry Andric 
2210fe6060f1SDimitry Andric   if (Signed && SrcLT == S32) {
2211fe6060f1SDimitry Andric     // Flip the result based on the signedness, which is either all 0s or 1s.
2212fe6060f1SDimitry Andric     Sign = B.buildMerge(S64, {Sign, Sign});
2213fe6060f1SDimitry Andric     // r := xor({lo, hi}, sign) - sign;
2214fe6060f1SDimitry Andric     B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
2215fe6060f1SDimitry Andric   } else
22165ffd83dbSDimitry Andric     B.buildMerge(Dst, {Lo, Hi});
22175ffd83dbSDimitry Andric   MI.eraseFromParent();
22185ffd83dbSDimitry Andric 
22195ffd83dbSDimitry Andric   return true;
22205ffd83dbSDimitry Andric }
22215ffd83dbSDimitry Andric 
22225ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
22235ffd83dbSDimitry Andric                                                MachineInstr &MI) const {
22245ffd83dbSDimitry Andric   MachineFunction &MF = Helper.MIRBuilder.getMF();
22250b57cec5SDimitry Andric   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
22260b57cec5SDimitry Andric 
22270b57cec5SDimitry Andric   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
22280b57cec5SDimitry Andric                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
22290b57cec5SDimitry Andric 
22300b57cec5SDimitry Andric   // With ieee_mode disabled, the instructions have the correct behavior
22310b57cec5SDimitry Andric   // already for G_FMINNUM/G_FMAXNUM
22320b57cec5SDimitry Andric   if (!MFI->getMode().IEEE)
22330b57cec5SDimitry Andric     return !IsIEEEOp;
22340b57cec5SDimitry Andric 
22350b57cec5SDimitry Andric   if (IsIEEEOp)
22360b57cec5SDimitry Andric     return true;
22370b57cec5SDimitry Andric 
22380b57cec5SDimitry Andric   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
22390b57cec5SDimitry Andric }
22400b57cec5SDimitry Andric 
22410b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
22420b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
22430b57cec5SDimitry Andric   MachineIRBuilder &B) const {
22440b57cec5SDimitry Andric   // TODO: Should move some of this into LegalizerHelper.
22450b57cec5SDimitry Andric 
22460b57cec5SDimitry Andric   // TODO: Promote dynamic indexing of s16 to s32
22475ffd83dbSDimitry Andric 
22485ffd83dbSDimitry Andric   // FIXME: Artifact combiner probably should have replaced the truncated
22495ffd83dbSDimitry Andric   // constant before this, so we shouldn't need
2250349cc55cSDimitry Andric   // getIConstantVRegValWithLookThrough.
2251e8d8bef9SDimitry Andric   Optional<ValueAndVReg> MaybeIdxVal =
2252349cc55cSDimitry Andric       getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2253e8d8bef9SDimitry Andric   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
22540b57cec5SDimitry Andric     return true;
2255e8d8bef9SDimitry Andric   const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
22560b57cec5SDimitry Andric 
22570b57cec5SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
22580b57cec5SDimitry Andric   Register Vec = MI.getOperand(1).getReg();
22590b57cec5SDimitry Andric 
22600b57cec5SDimitry Andric   LLT VecTy = MRI.getType(Vec);
22610b57cec5SDimitry Andric   LLT EltTy = VecTy.getElementType();
22620b57cec5SDimitry Andric   assert(EltTy == MRI.getType(Dst));
22630b57cec5SDimitry Andric 
2264*04eeddc0SDimitry Andric   if (IdxVal < VecTy.getNumElements()) {
2265*04eeddc0SDimitry Andric     auto Unmerge = B.buildUnmerge(EltTy, Vec);
2266*04eeddc0SDimitry Andric     B.buildCopy(Dst, Unmerge.getReg(IdxVal));
2267*04eeddc0SDimitry Andric   } else {
22680b57cec5SDimitry Andric     B.buildUndef(Dst);
2269*04eeddc0SDimitry Andric   }
22700b57cec5SDimitry Andric 
22710b57cec5SDimitry Andric   MI.eraseFromParent();
22720b57cec5SDimitry Andric   return true;
22730b57cec5SDimitry Andric }
22740b57cec5SDimitry Andric 
22750b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
22760b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
22770b57cec5SDimitry Andric   MachineIRBuilder &B) const {
22780b57cec5SDimitry Andric   // TODO: Should move some of this into LegalizerHelper.
22790b57cec5SDimitry Andric 
22800b57cec5SDimitry Andric   // TODO: Promote dynamic indexing of s16 to s32
22815ffd83dbSDimitry Andric 
22825ffd83dbSDimitry Andric   // FIXME: Artifact combiner probably should have replaced the truncated
22835ffd83dbSDimitry Andric   // constant before this, so we shouldn't need
2284349cc55cSDimitry Andric   // getIConstantVRegValWithLookThrough.
2285e8d8bef9SDimitry Andric   Optional<ValueAndVReg> MaybeIdxVal =
2286349cc55cSDimitry Andric       getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2287e8d8bef9SDimitry Andric   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
22880b57cec5SDimitry Andric     return true;
22890b57cec5SDimitry Andric 
2290e8d8bef9SDimitry Andric   int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
22910b57cec5SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
22920b57cec5SDimitry Andric   Register Vec = MI.getOperand(1).getReg();
22930b57cec5SDimitry Andric   Register Ins = MI.getOperand(2).getReg();
22940b57cec5SDimitry Andric 
22950b57cec5SDimitry Andric   LLT VecTy = MRI.getType(Vec);
22960b57cec5SDimitry Andric   LLT EltTy = VecTy.getElementType();
22970b57cec5SDimitry Andric   assert(EltTy == MRI.getType(Ins));
2298*04eeddc0SDimitry Andric   (void)Ins;
22990b57cec5SDimitry Andric 
2300*04eeddc0SDimitry Andric   unsigned NumElts = VecTy.getNumElements();
2301*04eeddc0SDimitry Andric   if (IdxVal < NumElts) {
2302*04eeddc0SDimitry Andric     SmallVector<Register, 8> SrcRegs;
2303*04eeddc0SDimitry Andric     for (unsigned i = 0; i < NumElts; ++i)
2304*04eeddc0SDimitry Andric       SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
2305*04eeddc0SDimitry Andric     B.buildUnmerge(SrcRegs, Vec);
2306*04eeddc0SDimitry Andric 
2307*04eeddc0SDimitry Andric     SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2308*04eeddc0SDimitry Andric     B.buildMerge(Dst, SrcRegs);
2309*04eeddc0SDimitry Andric   } else {
23100b57cec5SDimitry Andric     B.buildUndef(Dst);
2311*04eeddc0SDimitry Andric   }
23120b57cec5SDimitry Andric 
23130b57cec5SDimitry Andric   MI.eraseFromParent();
23140b57cec5SDimitry Andric   return true;
23150b57cec5SDimitry Andric }
23160b57cec5SDimitry Andric 
23175ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeShuffleVector(
23185ffd83dbSDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
23195ffd83dbSDimitry Andric   MachineIRBuilder &B) const {
2320fe6060f1SDimitry Andric   const LLT V2S16 = LLT::fixed_vector(2, 16);
23215ffd83dbSDimitry Andric 
23225ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
23235ffd83dbSDimitry Andric   Register Src0 = MI.getOperand(1).getReg();
23245ffd83dbSDimitry Andric   LLT DstTy = MRI.getType(Dst);
23255ffd83dbSDimitry Andric   LLT SrcTy = MRI.getType(Src0);
23265ffd83dbSDimitry Andric 
23275ffd83dbSDimitry Andric   if (SrcTy == V2S16 && DstTy == V2S16 &&
23285ffd83dbSDimitry Andric       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
23295ffd83dbSDimitry Andric     return true;
23305ffd83dbSDimitry Andric 
23315ffd83dbSDimitry Andric   MachineIRBuilder HelperBuilder(MI);
23325ffd83dbSDimitry Andric   GISelObserverWrapper DummyObserver;
23335ffd83dbSDimitry Andric   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
23345ffd83dbSDimitry Andric   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
23355ffd83dbSDimitry Andric }
23365ffd83dbSDimitry Andric 
23378bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeSinCos(
23388bcb0991SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
23398bcb0991SDimitry Andric   MachineIRBuilder &B) const {
23408bcb0991SDimitry Andric 
23418bcb0991SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
23428bcb0991SDimitry Andric   Register SrcReg = MI.getOperand(1).getReg();
23438bcb0991SDimitry Andric   LLT Ty = MRI.getType(DstReg);
23448bcb0991SDimitry Andric   unsigned Flags = MI.getFlags();
23458bcb0991SDimitry Andric 
23468bcb0991SDimitry Andric   Register TrigVal;
23475ffd83dbSDimitry Andric   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
23488bcb0991SDimitry Andric   if (ST.hasTrigReducedRange()) {
23498bcb0991SDimitry Andric     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
23508bcb0991SDimitry Andric     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
23518bcb0991SDimitry Andric       .addUse(MulVal.getReg(0))
23528bcb0991SDimitry Andric       .setMIFlags(Flags).getReg(0);
23538bcb0991SDimitry Andric   } else
23548bcb0991SDimitry Andric     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
23558bcb0991SDimitry Andric 
23568bcb0991SDimitry Andric   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
23578bcb0991SDimitry Andric     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
23588bcb0991SDimitry Andric   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
23598bcb0991SDimitry Andric     .addUse(TrigVal)
23608bcb0991SDimitry Andric     .setMIFlags(Flags);
23618bcb0991SDimitry Andric   MI.eraseFromParent();
23628bcb0991SDimitry Andric   return true;
23638bcb0991SDimitry Andric }
23648bcb0991SDimitry Andric 
23655ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
23665ffd83dbSDimitry Andric                                                   MachineIRBuilder &B,
23675ffd83dbSDimitry Andric                                                   const GlobalValue *GV,
23685ffd83dbSDimitry Andric                                                   int64_t Offset,
23695ffd83dbSDimitry Andric                                                   unsigned GAFlags) const {
23705ffd83dbSDimitry Andric   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
23718bcb0991SDimitry Andric   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
23728bcb0991SDimitry Andric   // to the following code sequence:
23738bcb0991SDimitry Andric   //
23748bcb0991SDimitry Andric   // For constant address space:
23758bcb0991SDimitry Andric   //   s_getpc_b64 s[0:1]
23768bcb0991SDimitry Andric   //   s_add_u32 s0, s0, $symbol
23778bcb0991SDimitry Andric   //   s_addc_u32 s1, s1, 0
23788bcb0991SDimitry Andric   //
23798bcb0991SDimitry Andric   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
23808bcb0991SDimitry Andric   //   a fixup or relocation is emitted to replace $symbol with a literal
23818bcb0991SDimitry Andric   //   constant, which is a pc-relative offset from the encoding of the $symbol
23828bcb0991SDimitry Andric   //   operand to the global variable.
23838bcb0991SDimitry Andric   //
23848bcb0991SDimitry Andric   // For global address space:
23858bcb0991SDimitry Andric   //   s_getpc_b64 s[0:1]
23868bcb0991SDimitry Andric   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
23878bcb0991SDimitry Andric   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
23888bcb0991SDimitry Andric   //
23898bcb0991SDimitry Andric   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
23908bcb0991SDimitry Andric   //   fixups or relocations are emitted to replace $symbol@*@lo and
23918bcb0991SDimitry Andric   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
23928bcb0991SDimitry Andric   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
23938bcb0991SDimitry Andric   //   operand to the global variable.
23948bcb0991SDimitry Andric   //
23958bcb0991SDimitry Andric   // What we want here is an offset from the value returned by s_getpc
23968bcb0991SDimitry Andric   // (which is the address of the s_add_u32 instruction) to the global
23978bcb0991SDimitry Andric   // variable, but since the encoding of $symbol starts 4 bytes after the start
23988bcb0991SDimitry Andric   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
23998bcb0991SDimitry Andric   // small. This requires us to add 4 to the global variable offset in order to
2400e8d8bef9SDimitry Andric   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2401e8d8bef9SDimitry Andric   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2402e8d8bef9SDimitry Andric   // instruction.
24038bcb0991SDimitry Andric 
24048bcb0991SDimitry Andric   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
24058bcb0991SDimitry Andric 
24068bcb0991SDimitry Andric   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
24078bcb0991SDimitry Andric     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
24088bcb0991SDimitry Andric 
24098bcb0991SDimitry Andric   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
24108bcb0991SDimitry Andric     .addDef(PCReg);
24118bcb0991SDimitry Andric 
24128bcb0991SDimitry Andric   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
24138bcb0991SDimitry Andric   if (GAFlags == SIInstrInfo::MO_NONE)
24148bcb0991SDimitry Andric     MIB.addImm(0);
24158bcb0991SDimitry Andric   else
2416e8d8bef9SDimitry Andric     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
24178bcb0991SDimitry Andric 
24188bcb0991SDimitry Andric   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
24198bcb0991SDimitry Andric 
24208bcb0991SDimitry Andric   if (PtrTy.getSizeInBits() == 32)
24218bcb0991SDimitry Andric     B.buildExtract(DstReg, PCReg, 0);
24228bcb0991SDimitry Andric   return true;
24238bcb0991SDimitry Andric  }
24248bcb0991SDimitry Andric 
24258bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeGlobalValue(
24268bcb0991SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
24278bcb0991SDimitry Andric   MachineIRBuilder &B) const {
24288bcb0991SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
24298bcb0991SDimitry Andric   LLT Ty = MRI.getType(DstReg);
24308bcb0991SDimitry Andric   unsigned AS = Ty.getAddressSpace();
24318bcb0991SDimitry Andric 
24328bcb0991SDimitry Andric   const GlobalValue *GV = MI.getOperand(1).getGlobal();
24338bcb0991SDimitry Andric   MachineFunction &MF = B.getMF();
24348bcb0991SDimitry Andric   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
24358bcb0991SDimitry Andric 
24368bcb0991SDimitry Andric   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2437fe6060f1SDimitry Andric     if (!MFI->isModuleEntryFunction() &&
2438fe6060f1SDimitry Andric         !GV->getName().equals("llvm.amdgcn.module.lds")) {
24398bcb0991SDimitry Andric       const Function &Fn = MF.getFunction();
24408bcb0991SDimitry Andric       DiagnosticInfoUnsupported BadLDSDecl(
24415ffd83dbSDimitry Andric         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
24425ffd83dbSDimitry Andric         DS_Warning);
24438bcb0991SDimitry Andric       Fn.getContext().diagnose(BadLDSDecl);
24445ffd83dbSDimitry Andric 
24455ffd83dbSDimitry Andric       // We currently don't have a way to correctly allocate LDS objects that
24465ffd83dbSDimitry Andric       // aren't directly associated with a kernel. We do force inlining of
24475ffd83dbSDimitry Andric       // functions that use local objects. However, if these dead functions are
24485ffd83dbSDimitry Andric       // not eliminated, we don't want a compile time error. Just emit a warning
24495ffd83dbSDimitry Andric       // and a trap, since there should be no callable path here.
24505ffd83dbSDimitry Andric       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
24515ffd83dbSDimitry Andric       B.buildUndef(DstReg);
24525ffd83dbSDimitry Andric       MI.eraseFromParent();
24535ffd83dbSDimitry Andric       return true;
24548bcb0991SDimitry Andric     }
24558bcb0991SDimitry Andric 
24568bcb0991SDimitry Andric     // TODO: We could emit code to handle the initialization somewhere.
2457349cc55cSDimitry Andric     // We ignore the initializer for now and legalize it to allow selection.
2458349cc55cSDimitry Andric     // The initializer will anyway get errored out during assembly emission.
24595ffd83dbSDimitry Andric     const SITargetLowering *TLI = ST.getTargetLowering();
24605ffd83dbSDimitry Andric     if (!TLI->shouldUseLDSConstAddress(GV)) {
24615ffd83dbSDimitry Andric       MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
24625ffd83dbSDimitry Andric       return true; // Leave in place;
24635ffd83dbSDimitry Andric     }
24645ffd83dbSDimitry Andric 
2465e8d8bef9SDimitry Andric     if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2466e8d8bef9SDimitry Andric       Type *Ty = GV->getValueType();
2467e8d8bef9SDimitry Andric       // HIP uses an unsized array `extern __shared__ T s[]` or similar
2468e8d8bef9SDimitry Andric       // zero-sized type in other languages to declare the dynamic shared
2469e8d8bef9SDimitry Andric       // memory which size is not known at the compile time. They will be
2470e8d8bef9SDimitry Andric       // allocated by the runtime and placed directly after the static
2471e8d8bef9SDimitry Andric       // allocated ones. They all share the same offset.
2472e8d8bef9SDimitry Andric       if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2473e8d8bef9SDimitry Andric         // Adjust alignment for that dynamic shared memory array.
2474e8d8bef9SDimitry Andric         MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2475e8d8bef9SDimitry Andric         LLT S32 = LLT::scalar(32);
2476e8d8bef9SDimitry Andric         auto Sz =
2477e8d8bef9SDimitry Andric             B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2478e8d8bef9SDimitry Andric         B.buildIntToPtr(DstReg, Sz);
2479e8d8bef9SDimitry Andric         MI.eraseFromParent();
2480e8d8bef9SDimitry Andric         return true;
2481e8d8bef9SDimitry Andric       }
2482e8d8bef9SDimitry Andric     }
2483e8d8bef9SDimitry Andric 
2484349cc55cSDimitry Andric     B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
2485349cc55cSDimitry Andric                                                    *cast<GlobalVariable>(GV)));
24868bcb0991SDimitry Andric     MI.eraseFromParent();
24878bcb0991SDimitry Andric     return true;
24888bcb0991SDimitry Andric   }
24898bcb0991SDimitry Andric 
24908bcb0991SDimitry Andric   const SITargetLowering *TLI = ST.getTargetLowering();
24918bcb0991SDimitry Andric 
24928bcb0991SDimitry Andric   if (TLI->shouldEmitFixup(GV)) {
24938bcb0991SDimitry Andric     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
24948bcb0991SDimitry Andric     MI.eraseFromParent();
24958bcb0991SDimitry Andric     return true;
24968bcb0991SDimitry Andric   }
24978bcb0991SDimitry Andric 
24988bcb0991SDimitry Andric   if (TLI->shouldEmitPCReloc(GV)) {
24998bcb0991SDimitry Andric     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
25008bcb0991SDimitry Andric     MI.eraseFromParent();
25018bcb0991SDimitry Andric     return true;
25028bcb0991SDimitry Andric   }
25038bcb0991SDimitry Andric 
25048bcb0991SDimitry Andric   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
25058bcb0991SDimitry Andric   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
25068bcb0991SDimitry Andric 
2507fe6060f1SDimitry Andric   LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
25088bcb0991SDimitry Andric   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
25098bcb0991SDimitry Andric       MachinePointerInfo::getGOT(MF),
25108bcb0991SDimitry Andric       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
25118bcb0991SDimitry Andric           MachineMemOperand::MOInvariant,
2512fe6060f1SDimitry Andric       LoadTy, Align(8));
25138bcb0991SDimitry Andric 
25148bcb0991SDimitry Andric   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
25158bcb0991SDimitry Andric 
25168bcb0991SDimitry Andric   if (Ty.getSizeInBits() == 32) {
2517349cc55cSDimitry Andric     // Truncate if this is a 32-bit constant address.
25188bcb0991SDimitry Andric     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
25198bcb0991SDimitry Andric     B.buildExtract(DstReg, Load, 0);
25208bcb0991SDimitry Andric   } else
25218bcb0991SDimitry Andric     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
25228bcb0991SDimitry Andric 
25238bcb0991SDimitry Andric   MI.eraseFromParent();
25248bcb0991SDimitry Andric   return true;
25258bcb0991SDimitry Andric }
25268bcb0991SDimitry Andric 
2527e8d8bef9SDimitry Andric static LLT widenToNextPowerOf2(LLT Ty) {
2528e8d8bef9SDimitry Andric   if (Ty.isVector())
2529fe6060f1SDimitry Andric     return Ty.changeElementCount(
2530fe6060f1SDimitry Andric         ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2531e8d8bef9SDimitry Andric   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2532e8d8bef9SDimitry Andric }
2533e8d8bef9SDimitry Andric 
2534e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2535e8d8bef9SDimitry Andric                                        MachineInstr &MI) const {
2536e8d8bef9SDimitry Andric   MachineIRBuilder &B = Helper.MIRBuilder;
2537e8d8bef9SDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
2538e8d8bef9SDimitry Andric   GISelChangeObserver &Observer = Helper.Observer;
2539e8d8bef9SDimitry Andric 
2540e8d8bef9SDimitry Andric   Register PtrReg = MI.getOperand(1).getReg();
2541e8d8bef9SDimitry Andric   LLT PtrTy = MRI.getType(PtrReg);
2542e8d8bef9SDimitry Andric   unsigned AddrSpace = PtrTy.getAddressSpace();
2543e8d8bef9SDimitry Andric 
2544e8d8bef9SDimitry Andric   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
25458bcb0991SDimitry Andric     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2546e8d8bef9SDimitry Andric     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
25478bcb0991SDimitry Andric     Observer.changingInstr(MI);
25488bcb0991SDimitry Andric     MI.getOperand(1).setReg(Cast.getReg(0));
25498bcb0991SDimitry Andric     Observer.changedInstr(MI);
25508bcb0991SDimitry Andric     return true;
25518bcb0991SDimitry Andric   }
25528bcb0991SDimitry Andric 
2553fe6060f1SDimitry Andric   if (MI.getOpcode() != AMDGPU::G_LOAD)
2554fe6060f1SDimitry Andric     return false;
2555fe6060f1SDimitry Andric 
2556e8d8bef9SDimitry Andric   Register ValReg = MI.getOperand(0).getReg();
2557e8d8bef9SDimitry Andric   LLT ValTy = MRI.getType(ValReg);
2558e8d8bef9SDimitry Andric 
2559e8d8bef9SDimitry Andric   MachineMemOperand *MMO = *MI.memoperands_begin();
2560e8d8bef9SDimitry Andric   const unsigned ValSize = ValTy.getSizeInBits();
2561fe6060f1SDimitry Andric   const LLT MemTy = MMO->getMemoryType();
2562e8d8bef9SDimitry Andric   const Align MemAlign = MMO->getAlign();
2563fe6060f1SDimitry Andric   const unsigned MemSize = MemTy.getSizeInBits();
2564*04eeddc0SDimitry Andric   const uint64_t AlignInBits = 8 * MemAlign.value();
2565e8d8bef9SDimitry Andric 
2566e8d8bef9SDimitry Andric   // Widen non-power-of-2 loads to the alignment if needed
2567fe6060f1SDimitry Andric   if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
2568e8d8bef9SDimitry Andric     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2569e8d8bef9SDimitry Andric 
2570e8d8bef9SDimitry Andric     // This was already the correct extending load result type, so just adjust
2571e8d8bef9SDimitry Andric     // the memory type.
2572e8d8bef9SDimitry Andric     if (WideMemSize == ValSize) {
2573e8d8bef9SDimitry Andric       MachineFunction &MF = B.getMF();
2574e8d8bef9SDimitry Andric 
2575e8d8bef9SDimitry Andric       MachineMemOperand *WideMMO =
2576e8d8bef9SDimitry Andric           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2577e8d8bef9SDimitry Andric       Observer.changingInstr(MI);
2578e8d8bef9SDimitry Andric       MI.setMemRefs(MF, {WideMMO});
2579e8d8bef9SDimitry Andric       Observer.changedInstr(MI);
2580e8d8bef9SDimitry Andric       return true;
2581e8d8bef9SDimitry Andric     }
2582e8d8bef9SDimitry Andric 
2583e8d8bef9SDimitry Andric     // Don't bother handling edge case that should probably never be produced.
2584e8d8bef9SDimitry Andric     if (ValSize > WideMemSize)
2585e8d8bef9SDimitry Andric       return false;
2586e8d8bef9SDimitry Andric 
2587e8d8bef9SDimitry Andric     LLT WideTy = widenToNextPowerOf2(ValTy);
2588e8d8bef9SDimitry Andric 
2589e8d8bef9SDimitry Andric     Register WideLoad;
2590e8d8bef9SDimitry Andric     if (!WideTy.isVector()) {
2591e8d8bef9SDimitry Andric       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2592e8d8bef9SDimitry Andric       B.buildTrunc(ValReg, WideLoad).getReg(0);
2593e8d8bef9SDimitry Andric     } else {
2594e8d8bef9SDimitry Andric       // Extract the subvector.
2595e8d8bef9SDimitry Andric 
2596e8d8bef9SDimitry Andric       if (isRegisterType(ValTy)) {
2597e8d8bef9SDimitry Andric         // If this a case where G_EXTRACT is legal, use it.
2598e8d8bef9SDimitry Andric         // (e.g. <3 x s32> -> <4 x s32>)
2599e8d8bef9SDimitry Andric         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2600e8d8bef9SDimitry Andric         B.buildExtract(ValReg, WideLoad, 0);
2601e8d8bef9SDimitry Andric       } else {
2602e8d8bef9SDimitry Andric         // For cases where the widened type isn't a nice register value, unmerge
2603e8d8bef9SDimitry Andric         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
26040eae32dcSDimitry Andric         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
26050eae32dcSDimitry Andric         B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
2606e8d8bef9SDimitry Andric       }
2607e8d8bef9SDimitry Andric     }
2608e8d8bef9SDimitry Andric 
2609e8d8bef9SDimitry Andric     MI.eraseFromParent();
2610e8d8bef9SDimitry Andric     return true;
2611e8d8bef9SDimitry Andric   }
2612e8d8bef9SDimitry Andric 
2613e8d8bef9SDimitry Andric   return false;
2614e8d8bef9SDimitry Andric }
2615e8d8bef9SDimitry Andric 
26168bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFMad(
26178bcb0991SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
26188bcb0991SDimitry Andric   MachineIRBuilder &B) const {
26198bcb0991SDimitry Andric   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
26208bcb0991SDimitry Andric   assert(Ty.isScalar());
26218bcb0991SDimitry Andric 
2622480093f4SDimitry Andric   MachineFunction &MF = B.getMF();
2623480093f4SDimitry Andric   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2624480093f4SDimitry Andric 
26258bcb0991SDimitry Andric   // TODO: Always legal with future ftz flag.
26265ffd83dbSDimitry Andric   // FIXME: Do we need just output?
26275ffd83dbSDimitry Andric   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
26288bcb0991SDimitry Andric     return true;
26295ffd83dbSDimitry Andric   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
26308bcb0991SDimitry Andric     return true;
26318bcb0991SDimitry Andric 
26328bcb0991SDimitry Andric   MachineIRBuilder HelperBuilder(MI);
26338bcb0991SDimitry Andric   GISelObserverWrapper DummyObserver;
26348bcb0991SDimitry Andric   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
26358bcb0991SDimitry Andric   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
26368bcb0991SDimitry Andric }
26378bcb0991SDimitry Andric 
2638480093f4SDimitry Andric bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2639480093f4SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2640480093f4SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
2641480093f4SDimitry Andric   Register PtrReg = MI.getOperand(1).getReg();
2642480093f4SDimitry Andric   Register CmpVal = MI.getOperand(2).getReg();
2643480093f4SDimitry Andric   Register NewVal = MI.getOperand(3).getReg();
2644480093f4SDimitry Andric 
2645e8d8bef9SDimitry Andric   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2646480093f4SDimitry Andric          "this should not have been custom lowered");
2647480093f4SDimitry Andric 
2648480093f4SDimitry Andric   LLT ValTy = MRI.getType(CmpVal);
2649fe6060f1SDimitry Andric   LLT VecTy = LLT::fixed_vector(2, ValTy);
2650480093f4SDimitry Andric 
2651480093f4SDimitry Andric   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2652480093f4SDimitry Andric 
2653480093f4SDimitry Andric   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2654480093f4SDimitry Andric     .addDef(DstReg)
2655480093f4SDimitry Andric     .addUse(PtrReg)
2656480093f4SDimitry Andric     .addUse(PackedVal)
2657480093f4SDimitry Andric     .setMemRefs(MI.memoperands());
2658480093f4SDimitry Andric 
2659480093f4SDimitry Andric   MI.eraseFromParent();
2660480093f4SDimitry Andric   return true;
2661480093f4SDimitry Andric }
2662480093f4SDimitry Andric 
26635ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeFlog(
26645ffd83dbSDimitry Andric   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
26655ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
26665ffd83dbSDimitry Andric   Register Src = MI.getOperand(1).getReg();
26675ffd83dbSDimitry Andric   LLT Ty = B.getMRI()->getType(Dst);
26685ffd83dbSDimitry Andric   unsigned Flags = MI.getFlags();
26695ffd83dbSDimitry Andric 
26705ffd83dbSDimitry Andric   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
26715ffd83dbSDimitry Andric   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
26725ffd83dbSDimitry Andric 
26735ffd83dbSDimitry Andric   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
26745ffd83dbSDimitry Andric   MI.eraseFromParent();
26755ffd83dbSDimitry Andric   return true;
26765ffd83dbSDimitry Andric }
26775ffd83dbSDimitry Andric 
26785ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
26795ffd83dbSDimitry Andric                                        MachineIRBuilder &B) const {
26805ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
26815ffd83dbSDimitry Andric   Register Src = MI.getOperand(1).getReg();
26825ffd83dbSDimitry Andric   unsigned Flags = MI.getFlags();
26835ffd83dbSDimitry Andric   LLT Ty = B.getMRI()->getType(Dst);
26845ffd83dbSDimitry Andric 
26855ffd83dbSDimitry Andric   auto K = B.buildFConstant(Ty, numbers::log2e);
26865ffd83dbSDimitry Andric   auto Mul = B.buildFMul(Ty, Src, K, Flags);
26875ffd83dbSDimitry Andric   B.buildFExp2(Dst, Mul, Flags);
26885ffd83dbSDimitry Andric   MI.eraseFromParent();
26895ffd83dbSDimitry Andric   return true;
26905ffd83dbSDimitry Andric }
26915ffd83dbSDimitry Andric 
26925ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
26935ffd83dbSDimitry Andric                                        MachineIRBuilder &B) const {
26945ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
26955ffd83dbSDimitry Andric   Register Src0 = MI.getOperand(1).getReg();
26965ffd83dbSDimitry Andric   Register Src1 = MI.getOperand(2).getReg();
26975ffd83dbSDimitry Andric   unsigned Flags = MI.getFlags();
26985ffd83dbSDimitry Andric   LLT Ty = B.getMRI()->getType(Dst);
26995ffd83dbSDimitry Andric   const LLT S16 = LLT::scalar(16);
27005ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
27015ffd83dbSDimitry Andric 
27025ffd83dbSDimitry Andric   if (Ty == S32) {
27035ffd83dbSDimitry Andric     auto Log = B.buildFLog2(S32, Src0, Flags);
27045ffd83dbSDimitry Andric     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
27055ffd83dbSDimitry Andric       .addUse(Log.getReg(0))
27065ffd83dbSDimitry Andric       .addUse(Src1)
27075ffd83dbSDimitry Andric       .setMIFlags(Flags);
27085ffd83dbSDimitry Andric     B.buildFExp2(Dst, Mul, Flags);
27095ffd83dbSDimitry Andric   } else if (Ty == S16) {
27105ffd83dbSDimitry Andric     // There's no f16 fmul_legacy, so we need to convert for it.
27115ffd83dbSDimitry Andric     auto Log = B.buildFLog2(S16, Src0, Flags);
27125ffd83dbSDimitry Andric     auto Ext0 = B.buildFPExt(S32, Log, Flags);
27135ffd83dbSDimitry Andric     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
27145ffd83dbSDimitry Andric     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
27155ffd83dbSDimitry Andric       .addUse(Ext0.getReg(0))
27165ffd83dbSDimitry Andric       .addUse(Ext1.getReg(0))
27175ffd83dbSDimitry Andric       .setMIFlags(Flags);
27185ffd83dbSDimitry Andric 
27195ffd83dbSDimitry Andric     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
27205ffd83dbSDimitry Andric   } else
27215ffd83dbSDimitry Andric     return false;
27225ffd83dbSDimitry Andric 
27235ffd83dbSDimitry Andric   MI.eraseFromParent();
27245ffd83dbSDimitry Andric   return true;
27255ffd83dbSDimitry Andric }
27265ffd83dbSDimitry Andric 
27275ffd83dbSDimitry Andric // Find a source register, ignoring any possible source modifiers.
27285ffd83dbSDimitry Andric static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
27295ffd83dbSDimitry Andric   Register ModSrc = OrigSrc;
27305ffd83dbSDimitry Andric   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
27315ffd83dbSDimitry Andric     ModSrc = SrcFNeg->getOperand(1).getReg();
27325ffd83dbSDimitry Andric     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
27335ffd83dbSDimitry Andric       ModSrc = SrcFAbs->getOperand(1).getReg();
27345ffd83dbSDimitry Andric   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
27355ffd83dbSDimitry Andric     ModSrc = SrcFAbs->getOperand(1).getReg();
27365ffd83dbSDimitry Andric   return ModSrc;
27375ffd83dbSDimitry Andric }
27385ffd83dbSDimitry Andric 
27395ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
27405ffd83dbSDimitry Andric                                          MachineRegisterInfo &MRI,
27415ffd83dbSDimitry Andric                                          MachineIRBuilder &B) const {
27425ffd83dbSDimitry Andric 
27435ffd83dbSDimitry Andric   const LLT S1 = LLT::scalar(1);
27445ffd83dbSDimitry Andric   const LLT S64 = LLT::scalar(64);
27455ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
27465ffd83dbSDimitry Andric   Register OrigSrc = MI.getOperand(1).getReg();
27475ffd83dbSDimitry Andric   unsigned Flags = MI.getFlags();
27485ffd83dbSDimitry Andric   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
27495ffd83dbSDimitry Andric          "this should not have been custom lowered");
27505ffd83dbSDimitry Andric 
27515ffd83dbSDimitry Andric   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
27525ffd83dbSDimitry Andric   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
27535ffd83dbSDimitry Andric   // efficient way to implement it is using V_FRACT_F64. The workaround for the
27545ffd83dbSDimitry Andric   // V_FRACT bug is:
27555ffd83dbSDimitry Andric   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
27565ffd83dbSDimitry Andric   //
27575ffd83dbSDimitry Andric   // Convert floor(x) to (x - fract(x))
27585ffd83dbSDimitry Andric 
27595ffd83dbSDimitry Andric   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
27605ffd83dbSDimitry Andric     .addUse(OrigSrc)
27615ffd83dbSDimitry Andric     .setMIFlags(Flags);
27625ffd83dbSDimitry Andric 
27635ffd83dbSDimitry Andric   // Give source modifier matching some assistance before obscuring a foldable
27645ffd83dbSDimitry Andric   // pattern.
27655ffd83dbSDimitry Andric 
27665ffd83dbSDimitry Andric   // TODO: We can avoid the neg on the fract? The input sign to fract
27675ffd83dbSDimitry Andric   // shouldn't matter?
27685ffd83dbSDimitry Andric   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
27695ffd83dbSDimitry Andric 
27705ffd83dbSDimitry Andric   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
27715ffd83dbSDimitry Andric 
27725ffd83dbSDimitry Andric   Register Min = MRI.createGenericVirtualRegister(S64);
27735ffd83dbSDimitry Andric 
27745ffd83dbSDimitry Andric   // We don't need to concern ourselves with the snan handling difference, so
27755ffd83dbSDimitry Andric   // use the one which will directly select.
27765ffd83dbSDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
27775ffd83dbSDimitry Andric   if (MFI->getMode().IEEE)
27785ffd83dbSDimitry Andric     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
27795ffd83dbSDimitry Andric   else
27805ffd83dbSDimitry Andric     B.buildFMinNum(Min, Fract, Const, Flags);
27815ffd83dbSDimitry Andric 
27825ffd83dbSDimitry Andric   Register CorrectedFract = Min;
27835ffd83dbSDimitry Andric   if (!MI.getFlag(MachineInstr::FmNoNans)) {
27845ffd83dbSDimitry Andric     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
27855ffd83dbSDimitry Andric     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
27865ffd83dbSDimitry Andric   }
27875ffd83dbSDimitry Andric 
27885ffd83dbSDimitry Andric   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
27895ffd83dbSDimitry Andric   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
27905ffd83dbSDimitry Andric 
27915ffd83dbSDimitry Andric   MI.eraseFromParent();
27925ffd83dbSDimitry Andric   return true;
27935ffd83dbSDimitry Andric }
27945ffd83dbSDimitry Andric 
27955ffd83dbSDimitry Andric // Turn an illegal packed v2s16 build vector into bit operations.
27965ffd83dbSDimitry Andric // TODO: This should probably be a bitcast action in LegalizerHelper.
27975ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeBuildVector(
27985ffd83dbSDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
27995ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
28005ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
2801fe6060f1SDimitry Andric   assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
28025ffd83dbSDimitry Andric 
28035ffd83dbSDimitry Andric   Register Src0 = MI.getOperand(1).getReg();
28045ffd83dbSDimitry Andric   Register Src1 = MI.getOperand(2).getReg();
28055ffd83dbSDimitry Andric   assert(MRI.getType(Src0) == LLT::scalar(16));
28065ffd83dbSDimitry Andric 
28075ffd83dbSDimitry Andric   auto Merge = B.buildMerge(S32, {Src0, Src1});
28085ffd83dbSDimitry Andric   B.buildBitcast(Dst, Merge);
28095ffd83dbSDimitry Andric 
28105ffd83dbSDimitry Andric   MI.eraseFromParent();
28115ffd83dbSDimitry Andric   return true;
28125ffd83dbSDimitry Andric }
28135ffd83dbSDimitry Andric 
2814349cc55cSDimitry Andric // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
2815349cc55cSDimitry Andric // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
2816349cc55cSDimitry Andric // case with a single min instruction instead of a compare+select.
2817349cc55cSDimitry Andric bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
2818349cc55cSDimitry Andric                                             MachineRegisterInfo &MRI,
2819349cc55cSDimitry Andric                                             MachineIRBuilder &B) const {
2820349cc55cSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
2821349cc55cSDimitry Andric   Register Src = MI.getOperand(1).getReg();
2822349cc55cSDimitry Andric   LLT DstTy = MRI.getType(Dst);
2823349cc55cSDimitry Andric   LLT SrcTy = MRI.getType(Src);
2824349cc55cSDimitry Andric 
2825349cc55cSDimitry Andric   unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
2826349cc55cSDimitry Andric                         ? AMDGPU::G_AMDGPU_FFBH_U32
2827349cc55cSDimitry Andric                         : AMDGPU::G_AMDGPU_FFBL_B32;
2828349cc55cSDimitry Andric   auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
2829349cc55cSDimitry Andric   B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
2830349cc55cSDimitry Andric 
2831349cc55cSDimitry Andric   MI.eraseFromParent();
2832349cc55cSDimitry Andric   return true;
2833349cc55cSDimitry Andric }
2834349cc55cSDimitry Andric 
2835e8d8bef9SDimitry Andric // Check that this is a G_XOR x, -1
2836e8d8bef9SDimitry Andric static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2837e8d8bef9SDimitry Andric   if (MI.getOpcode() != TargetOpcode::G_XOR)
2838e8d8bef9SDimitry Andric     return false;
2839349cc55cSDimitry Andric   auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2840e8d8bef9SDimitry Andric   return ConstVal && *ConstVal == -1;
2841e8d8bef9SDimitry Andric }
2842e8d8bef9SDimitry Andric 
28430b57cec5SDimitry Andric // Return the use branch instruction, otherwise null if the usage is invalid.
2844e8d8bef9SDimitry Andric static MachineInstr *
2845e8d8bef9SDimitry Andric verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2846e8d8bef9SDimitry Andric                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
28470b57cec5SDimitry Andric   Register CondDef = MI.getOperand(0).getReg();
28480b57cec5SDimitry Andric   if (!MRI.hasOneNonDBGUse(CondDef))
28490b57cec5SDimitry Andric     return nullptr;
28500b57cec5SDimitry Andric 
28515ffd83dbSDimitry Andric   MachineBasicBlock *Parent = MI.getParent();
2852e8d8bef9SDimitry Andric   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2853e8d8bef9SDimitry Andric 
2854e8d8bef9SDimitry Andric   if (isNot(MRI, *UseMI)) {
2855e8d8bef9SDimitry Andric     Register NegatedCond = UseMI->getOperand(0).getReg();
2856e8d8bef9SDimitry Andric     if (!MRI.hasOneNonDBGUse(NegatedCond))
2857e8d8bef9SDimitry Andric       return nullptr;
2858e8d8bef9SDimitry Andric 
2859e8d8bef9SDimitry Andric     // We're deleting the def of this value, so we need to remove it.
2860349cc55cSDimitry Andric     eraseInstr(*UseMI, MRI);
2861e8d8bef9SDimitry Andric 
2862e8d8bef9SDimitry Andric     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2863e8d8bef9SDimitry Andric     Negated = true;
2864e8d8bef9SDimitry Andric   }
2865e8d8bef9SDimitry Andric 
2866e8d8bef9SDimitry Andric   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2867480093f4SDimitry Andric     return nullptr;
2868480093f4SDimitry Andric 
28695ffd83dbSDimitry Andric   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2870e8d8bef9SDimitry Andric   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
28715ffd83dbSDimitry Andric   if (Next == Parent->end()) {
28725ffd83dbSDimitry Andric     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
28735ffd83dbSDimitry Andric     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
28745ffd83dbSDimitry Andric       return nullptr;
28755ffd83dbSDimitry Andric     UncondBrTarget = &*NextMBB;
28765ffd83dbSDimitry Andric   } else {
2877480093f4SDimitry Andric     if (Next->getOpcode() != AMDGPU::G_BR)
2878480093f4SDimitry Andric       return nullptr;
2879480093f4SDimitry Andric     Br = &*Next;
28805ffd83dbSDimitry Andric     UncondBrTarget = Br->getOperand(0).getMBB();
2881480093f4SDimitry Andric   }
2882480093f4SDimitry Andric 
2883e8d8bef9SDimitry Andric   return UseMI;
28840b57cec5SDimitry Andric }
28850b57cec5SDimitry Andric 
28860b57cec5SDimitry Andric bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2887e8d8bef9SDimitry Andric                                          const ArgDescriptor *Arg,
2888e8d8bef9SDimitry Andric                                          const TargetRegisterClass *ArgRC,
2889e8d8bef9SDimitry Andric                                          LLT ArgTy) const {
2890e8d8bef9SDimitry Andric   MCRegister SrcReg = Arg->getRegister();
2891e8d8bef9SDimitry Andric   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
28925ffd83dbSDimitry Andric   assert(DstReg.isVirtual() && "Virtual register expected");
28930b57cec5SDimitry Andric 
2894*04eeddc0SDimitry Andric   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
2895*04eeddc0SDimitry Andric                                              *ArgRC, B.getDebugLoc(), ArgTy);
28960b57cec5SDimitry Andric   if (Arg->isMasked()) {
28970b57cec5SDimitry Andric     // TODO: Should we try to emit this once in the entry block?
28980b57cec5SDimitry Andric     const LLT S32 = LLT::scalar(32);
28990b57cec5SDimitry Andric     const unsigned Mask = Arg->getMask();
29000b57cec5SDimitry Andric     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
29010b57cec5SDimitry Andric 
29028bcb0991SDimitry Andric     Register AndMaskSrc = LiveIn;
29038bcb0991SDimitry Andric 
2904*04eeddc0SDimitry Andric     // TODO: Avoid clearing the high bits if we know workitem id y/z are always
2905*04eeddc0SDimitry Andric     // 0.
29068bcb0991SDimitry Andric     if (Shift != 0) {
29070b57cec5SDimitry Andric       auto ShiftAmt = B.buildConstant(S32, Shift);
29088bcb0991SDimitry Andric       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
29098bcb0991SDimitry Andric     }
29108bcb0991SDimitry Andric 
29118bcb0991SDimitry Andric     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
29125ffd83dbSDimitry Andric   } else {
29130b57cec5SDimitry Andric     B.buildCopy(DstReg, LiveIn);
29140b57cec5SDimitry Andric   }
29150b57cec5SDimitry Andric 
29160b57cec5SDimitry Andric   return true;
29170b57cec5SDimitry Andric }
29180b57cec5SDimitry Andric 
2919e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::loadInputValue(
2920e8d8bef9SDimitry Andric     Register DstReg, MachineIRBuilder &B,
2921e8d8bef9SDimitry Andric     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2922e8d8bef9SDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2923e8d8bef9SDimitry Andric   const ArgDescriptor *Arg;
2924e8d8bef9SDimitry Andric   const TargetRegisterClass *ArgRC;
2925e8d8bef9SDimitry Andric   LLT ArgTy;
2926e8d8bef9SDimitry Andric   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2927e8d8bef9SDimitry Andric 
2928349cc55cSDimitry Andric   if (!Arg) {
2929349cc55cSDimitry Andric     if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
2930349cc55cSDimitry Andric       // The intrinsic may appear when we have a 0 sized kernarg segment, in which
2931349cc55cSDimitry Andric       // case the pointer argument may be missing and we use null.
2932349cc55cSDimitry Andric       B.buildConstant(DstReg, 0);
2933349cc55cSDimitry Andric       return true;
2934349cc55cSDimitry Andric     }
2935349cc55cSDimitry Andric 
2936349cc55cSDimitry Andric     // It's undefined behavior if a function marked with the amdgpu-no-*
2937349cc55cSDimitry Andric     // attributes uses the corresponding intrinsic.
2938349cc55cSDimitry Andric     B.buildUndef(DstReg);
2939349cc55cSDimitry Andric     return true;
2940349cc55cSDimitry Andric   }
2941349cc55cSDimitry Andric 
2942e8d8bef9SDimitry Andric   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2943e8d8bef9SDimitry Andric     return false; // TODO: Handle these
2944e8d8bef9SDimitry Andric   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2945e8d8bef9SDimitry Andric }
2946e8d8bef9SDimitry Andric 
29470b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
29485ffd83dbSDimitry Andric     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
29490b57cec5SDimitry Andric     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2950e8d8bef9SDimitry Andric   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
29515ffd83dbSDimitry Andric     return false;
29525ffd83dbSDimitry Andric 
29530b57cec5SDimitry Andric   MI.eraseFromParent();
29540b57cec5SDimitry Andric   return true;
29550b57cec5SDimitry Andric }
29560b57cec5SDimitry Andric 
29578bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
29588bcb0991SDimitry Andric                                        MachineRegisterInfo &MRI,
29598bcb0991SDimitry Andric                                        MachineIRBuilder &B) const {
2960480093f4SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
2961480093f4SDimitry Andric   LLT DstTy = MRI.getType(Dst);
2962480093f4SDimitry Andric   LLT S16 = LLT::scalar(16);
2963480093f4SDimitry Andric   LLT S32 = LLT::scalar(32);
2964480093f4SDimitry Andric   LLT S64 = LLT::scalar(64);
29658bcb0991SDimitry Andric 
2966480093f4SDimitry Andric   if (DstTy == S16)
2967480093f4SDimitry Andric     return legalizeFDIV16(MI, MRI, B);
2968480093f4SDimitry Andric   if (DstTy == S32)
2969480093f4SDimitry Andric     return legalizeFDIV32(MI, MRI, B);
2970480093f4SDimitry Andric   if (DstTy == S64)
2971480093f4SDimitry Andric     return legalizeFDIV64(MI, MRI, B);
2972480093f4SDimitry Andric 
29738bcb0991SDimitry Andric   return false;
29748bcb0991SDimitry Andric }
29758bcb0991SDimitry Andric 
2976fe6060f1SDimitry Andric void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
2977fe6060f1SDimitry Andric                                                         Register DstDivReg,
2978fe6060f1SDimitry Andric                                                         Register DstRemReg,
29795ffd83dbSDimitry Andric                                                         Register X,
2980fe6060f1SDimitry Andric                                                         Register Y) const {
29815ffd83dbSDimitry Andric   const LLT S1 = LLT::scalar(1);
29825ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
29835ffd83dbSDimitry Andric 
29845ffd83dbSDimitry Andric   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
29855ffd83dbSDimitry Andric   // algorithm used here.
29865ffd83dbSDimitry Andric 
29875ffd83dbSDimitry Andric   // Initial estimate of inv(y).
29885ffd83dbSDimitry Andric   auto FloatY = B.buildUITOFP(S32, Y);
29895ffd83dbSDimitry Andric   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
29905ffd83dbSDimitry Andric   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
29915ffd83dbSDimitry Andric   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
29925ffd83dbSDimitry Andric   auto Z = B.buildFPTOUI(S32, ScaledY);
29935ffd83dbSDimitry Andric 
29945ffd83dbSDimitry Andric   // One round of UNR.
29955ffd83dbSDimitry Andric   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
29965ffd83dbSDimitry Andric   auto NegYZ = B.buildMul(S32, NegY, Z);
29975ffd83dbSDimitry Andric   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
29985ffd83dbSDimitry Andric 
29995ffd83dbSDimitry Andric   // Quotient/remainder estimate.
30005ffd83dbSDimitry Andric   auto Q = B.buildUMulH(S32, X, Z);
30015ffd83dbSDimitry Andric   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
30025ffd83dbSDimitry Andric 
30035ffd83dbSDimitry Andric   // First quotient/remainder refinement.
30045ffd83dbSDimitry Andric   auto One = B.buildConstant(S32, 1);
30055ffd83dbSDimitry Andric   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
3006fe6060f1SDimitry Andric   if (DstDivReg)
30075ffd83dbSDimitry Andric     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
30085ffd83dbSDimitry Andric   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
30095ffd83dbSDimitry Andric 
30105ffd83dbSDimitry Andric   // Second quotient/remainder refinement.
30115ffd83dbSDimitry Andric   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
3012fe6060f1SDimitry Andric   if (DstDivReg)
3013fe6060f1SDimitry Andric     B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
30145ffd83dbSDimitry Andric 
3015fe6060f1SDimitry Andric   if (DstRemReg)
3016fe6060f1SDimitry Andric     B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
30175ffd83dbSDimitry Andric }
30185ffd83dbSDimitry Andric 
3019349cc55cSDimitry Andric // Build integer reciprocal sequence around V_RCP_IFLAG_F32
30205ffd83dbSDimitry Andric //
30215ffd83dbSDimitry Andric // Return lo, hi of result
30225ffd83dbSDimitry Andric //
30235ffd83dbSDimitry Andric // %cvt.lo = G_UITOFP Val.lo
30245ffd83dbSDimitry Andric // %cvt.hi = G_UITOFP Val.hi
30255ffd83dbSDimitry Andric // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
30265ffd83dbSDimitry Andric // %rcp = G_AMDGPU_RCP_IFLAG %mad
30275ffd83dbSDimitry Andric // %mul1 = G_FMUL %rcp, 0x5f7ffffc
30285ffd83dbSDimitry Andric // %mul2 = G_FMUL %mul1, 2**(-32)
30295ffd83dbSDimitry Andric // %trunc = G_INTRINSIC_TRUNC %mul2
30305ffd83dbSDimitry Andric // %mad2 = G_FMAD %trunc, -(2**32), %mul1
30315ffd83dbSDimitry Andric // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
30325ffd83dbSDimitry Andric static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
30335ffd83dbSDimitry Andric                                                        Register Val) {
30345ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
30355ffd83dbSDimitry Andric   auto Unmerge = B.buildUnmerge(S32, Val);
30365ffd83dbSDimitry Andric 
30375ffd83dbSDimitry Andric   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
30385ffd83dbSDimitry Andric   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
30395ffd83dbSDimitry Andric 
30405ffd83dbSDimitry Andric   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
30415ffd83dbSDimitry Andric                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
30425ffd83dbSDimitry Andric 
30435ffd83dbSDimitry Andric   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
30445ffd83dbSDimitry Andric   auto Mul1 =
30455ffd83dbSDimitry Andric       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
30465ffd83dbSDimitry Andric 
30475ffd83dbSDimitry Andric   // 2**(-32)
30485ffd83dbSDimitry Andric   auto Mul2 =
30495ffd83dbSDimitry Andric       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
30505ffd83dbSDimitry Andric   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
30515ffd83dbSDimitry Andric 
30525ffd83dbSDimitry Andric   // -(2**32)
30535ffd83dbSDimitry Andric   auto Mad2 = B.buildFMAD(S32, Trunc,
30545ffd83dbSDimitry Andric                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
30555ffd83dbSDimitry Andric 
30565ffd83dbSDimitry Andric   auto ResultLo = B.buildFPTOUI(S32, Mad2);
30575ffd83dbSDimitry Andric   auto ResultHi = B.buildFPTOUI(S32, Trunc);
30585ffd83dbSDimitry Andric 
30595ffd83dbSDimitry Andric   return {ResultLo.getReg(0), ResultHi.getReg(0)};
30605ffd83dbSDimitry Andric }
30615ffd83dbSDimitry Andric 
3062fe6060f1SDimitry Andric void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
3063fe6060f1SDimitry Andric                                                         Register DstDivReg,
3064fe6060f1SDimitry Andric                                                         Register DstRemReg,
30655ffd83dbSDimitry Andric                                                         Register Numer,
3066fe6060f1SDimitry Andric                                                         Register Denom) const {
30675ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
30685ffd83dbSDimitry Andric   const LLT S64 = LLT::scalar(64);
30695ffd83dbSDimitry Andric   const LLT S1 = LLT::scalar(1);
30705ffd83dbSDimitry Andric   Register RcpLo, RcpHi;
30715ffd83dbSDimitry Andric 
30725ffd83dbSDimitry Andric   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
30735ffd83dbSDimitry Andric 
30745ffd83dbSDimitry Andric   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
30755ffd83dbSDimitry Andric 
30765ffd83dbSDimitry Andric   auto Zero64 = B.buildConstant(S64, 0);
30775ffd83dbSDimitry Andric   auto NegDenom = B.buildSub(S64, Zero64, Denom);
30785ffd83dbSDimitry Andric 
30795ffd83dbSDimitry Andric   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
30805ffd83dbSDimitry Andric   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
30815ffd83dbSDimitry Andric 
30825ffd83dbSDimitry Andric   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
30835ffd83dbSDimitry Andric   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
30845ffd83dbSDimitry Andric   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
30855ffd83dbSDimitry Andric 
30865ffd83dbSDimitry Andric   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
30875ffd83dbSDimitry Andric   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
30885ffd83dbSDimitry Andric   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
30895ffd83dbSDimitry Andric 
30905ffd83dbSDimitry Andric   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
30915ffd83dbSDimitry Andric   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
30925ffd83dbSDimitry Andric   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
30935ffd83dbSDimitry Andric   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
30945ffd83dbSDimitry Andric   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
30955ffd83dbSDimitry Andric 
30965ffd83dbSDimitry Andric   auto Zero32 = B.buildConstant(S32, 0);
30975ffd83dbSDimitry Andric   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
3098349cc55cSDimitry Andric   auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
30995ffd83dbSDimitry Andric   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
31005ffd83dbSDimitry Andric 
31015ffd83dbSDimitry Andric   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
31025ffd83dbSDimitry Andric   Register NumerLo = UnmergeNumer.getReg(0);
31035ffd83dbSDimitry Andric   Register NumerHi = UnmergeNumer.getReg(1);
31045ffd83dbSDimitry Andric 
31055ffd83dbSDimitry Andric   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
31065ffd83dbSDimitry Andric   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
31075ffd83dbSDimitry Andric   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
31085ffd83dbSDimitry Andric   Register Mul3_Lo = UnmergeMul3.getReg(0);
31095ffd83dbSDimitry Andric   Register Mul3_Hi = UnmergeMul3.getReg(1);
31105ffd83dbSDimitry Andric   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
31115ffd83dbSDimitry Andric   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
31125ffd83dbSDimitry Andric   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
31135ffd83dbSDimitry Andric   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
31145ffd83dbSDimitry Andric 
31155ffd83dbSDimitry Andric   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
31165ffd83dbSDimitry Andric   Register DenomLo = UnmergeDenom.getReg(0);
31175ffd83dbSDimitry Andric   Register DenomHi = UnmergeDenom.getReg(1);
31185ffd83dbSDimitry Andric 
31195ffd83dbSDimitry Andric   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
31205ffd83dbSDimitry Andric   auto C1 = B.buildSExt(S32, CmpHi);
31215ffd83dbSDimitry Andric 
31225ffd83dbSDimitry Andric   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
31235ffd83dbSDimitry Andric   auto C2 = B.buildSExt(S32, CmpLo);
31245ffd83dbSDimitry Andric 
31255ffd83dbSDimitry Andric   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
31265ffd83dbSDimitry Andric   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
31275ffd83dbSDimitry Andric 
31285ffd83dbSDimitry Andric   // TODO: Here and below portions of the code can be enclosed into if/endif.
31295ffd83dbSDimitry Andric   // Currently control flow is unconditional and we have 4 selects after
31305ffd83dbSDimitry Andric   // potential endif to substitute PHIs.
31315ffd83dbSDimitry Andric 
31325ffd83dbSDimitry Andric   // if C3 != 0 ...
31335ffd83dbSDimitry Andric   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
31345ffd83dbSDimitry Andric   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
31355ffd83dbSDimitry Andric   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
31365ffd83dbSDimitry Andric   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
31375ffd83dbSDimitry Andric 
31385ffd83dbSDimitry Andric   auto One64 = B.buildConstant(S64, 1);
31395ffd83dbSDimitry Andric   auto Add3 = B.buildAdd(S64, MulHi3, One64);
31405ffd83dbSDimitry Andric 
31415ffd83dbSDimitry Andric   auto C4 =
31425ffd83dbSDimitry Andric       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
31435ffd83dbSDimitry Andric   auto C5 =
31445ffd83dbSDimitry Andric       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
31455ffd83dbSDimitry Andric   auto C6 = B.buildSelect(
31465ffd83dbSDimitry Andric       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
31475ffd83dbSDimitry Andric 
31485ffd83dbSDimitry Andric   // if (C6 != 0)
31495ffd83dbSDimitry Andric   auto Add4 = B.buildAdd(S64, Add3, One64);
31505ffd83dbSDimitry Andric   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
31515ffd83dbSDimitry Andric 
31525ffd83dbSDimitry Andric   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
31535ffd83dbSDimitry Andric   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
31545ffd83dbSDimitry Andric   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
31555ffd83dbSDimitry Andric 
31565ffd83dbSDimitry Andric   // endif C6
31575ffd83dbSDimitry Andric   // endif C3
31585ffd83dbSDimitry Andric 
3159fe6060f1SDimitry Andric   if (DstDivReg) {
31605ffd83dbSDimitry Andric     auto Sel1 = B.buildSelect(
31615ffd83dbSDimitry Andric         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3162fe6060f1SDimitry Andric     B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3163fe6060f1SDimitry Andric                   Sel1, MulHi3);
3164fe6060f1SDimitry Andric   }
3165fe6060f1SDimitry Andric 
3166fe6060f1SDimitry Andric   if (DstRemReg) {
31675ffd83dbSDimitry Andric     auto Sel2 = B.buildSelect(
31685ffd83dbSDimitry Andric         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3169fe6060f1SDimitry Andric     B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3170fe6060f1SDimitry Andric                   Sel2, Sub1);
31715ffd83dbSDimitry Andric   }
31725ffd83dbSDimitry Andric }
31735ffd83dbSDimitry Andric 
3174fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
31755ffd83dbSDimitry Andric                                                   MachineRegisterInfo &MRI,
31765ffd83dbSDimitry Andric                                                   MachineIRBuilder &B) const {
3177fe6060f1SDimitry Andric   Register DstDivReg, DstRemReg;
3178fe6060f1SDimitry Andric   switch (MI.getOpcode()) {
3179fe6060f1SDimitry Andric   default:
3180fe6060f1SDimitry Andric     llvm_unreachable("Unexpected opcode!");
3181fe6060f1SDimitry Andric   case AMDGPU::G_UDIV: {
3182fe6060f1SDimitry Andric     DstDivReg = MI.getOperand(0).getReg();
3183fe6060f1SDimitry Andric     break;
3184fe6060f1SDimitry Andric   }
3185fe6060f1SDimitry Andric   case AMDGPU::G_UREM: {
3186fe6060f1SDimitry Andric     DstRemReg = MI.getOperand(0).getReg();
3187fe6060f1SDimitry Andric     break;
3188fe6060f1SDimitry Andric   }
3189fe6060f1SDimitry Andric   case AMDGPU::G_UDIVREM: {
3190fe6060f1SDimitry Andric     DstDivReg = MI.getOperand(0).getReg();
3191fe6060f1SDimitry Andric     DstRemReg = MI.getOperand(1).getReg();
3192fe6060f1SDimitry Andric     break;
3193fe6060f1SDimitry Andric   }
3194fe6060f1SDimitry Andric   }
3195fe6060f1SDimitry Andric 
31965ffd83dbSDimitry Andric   const LLT S64 = LLT::scalar(64);
31975ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
3198fe6060f1SDimitry Andric   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3199fe6060f1SDimitry Andric   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3200fe6060f1SDimitry Andric   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3201fe6060f1SDimitry Andric   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
32025ffd83dbSDimitry Andric 
32035ffd83dbSDimitry Andric   if (Ty == S32)
3204fe6060f1SDimitry Andric     legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
32055ffd83dbSDimitry Andric   else if (Ty == S64)
3206fe6060f1SDimitry Andric     legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
32075ffd83dbSDimitry Andric   else
32085ffd83dbSDimitry Andric     return false;
32095ffd83dbSDimitry Andric 
32105ffd83dbSDimitry Andric   MI.eraseFromParent();
32115ffd83dbSDimitry Andric   return true;
32125ffd83dbSDimitry Andric }
32135ffd83dbSDimitry Andric 
3214fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
32155ffd83dbSDimitry Andric                                                 MachineRegisterInfo &MRI,
32165ffd83dbSDimitry Andric                                                 MachineIRBuilder &B) const {
32175ffd83dbSDimitry Andric   const LLT S64 = LLT::scalar(64);
32185ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
32195ffd83dbSDimitry Andric 
3220fe6060f1SDimitry Andric   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
32215ffd83dbSDimitry Andric   if (Ty != S32 && Ty != S64)
32225ffd83dbSDimitry Andric     return false;
32235ffd83dbSDimitry Andric 
3224fe6060f1SDimitry Andric   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3225fe6060f1SDimitry Andric   Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3226fe6060f1SDimitry Andric   Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
32275ffd83dbSDimitry Andric 
32285ffd83dbSDimitry Andric   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
32295ffd83dbSDimitry Andric   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
32305ffd83dbSDimitry Andric   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
32315ffd83dbSDimitry Andric 
32325ffd83dbSDimitry Andric   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
32335ffd83dbSDimitry Andric   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
32345ffd83dbSDimitry Andric 
32355ffd83dbSDimitry Andric   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
32365ffd83dbSDimitry Andric   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
32375ffd83dbSDimitry Andric 
3238fe6060f1SDimitry Andric   Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3239fe6060f1SDimitry Andric   switch (MI.getOpcode()) {
3240fe6060f1SDimitry Andric   default:
3241fe6060f1SDimitry Andric     llvm_unreachable("Unexpected opcode!");
3242fe6060f1SDimitry Andric   case AMDGPU::G_SDIV: {
3243fe6060f1SDimitry Andric     DstDivReg = MI.getOperand(0).getReg();
3244fe6060f1SDimitry Andric     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3245fe6060f1SDimitry Andric     break;
3246fe6060f1SDimitry Andric   }
3247fe6060f1SDimitry Andric   case AMDGPU::G_SREM: {
3248fe6060f1SDimitry Andric     DstRemReg = MI.getOperand(0).getReg();
3249fe6060f1SDimitry Andric     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3250fe6060f1SDimitry Andric     break;
3251fe6060f1SDimitry Andric   }
3252fe6060f1SDimitry Andric   case AMDGPU::G_SDIVREM: {
3253fe6060f1SDimitry Andric     DstDivReg = MI.getOperand(0).getReg();
3254fe6060f1SDimitry Andric     DstRemReg = MI.getOperand(1).getReg();
3255fe6060f1SDimitry Andric     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3256fe6060f1SDimitry Andric     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3257fe6060f1SDimitry Andric     break;
3258fe6060f1SDimitry Andric   }
3259fe6060f1SDimitry Andric   }
3260fe6060f1SDimitry Andric 
32615ffd83dbSDimitry Andric   if (Ty == S32)
3262fe6060f1SDimitry Andric     legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
32635ffd83dbSDimitry Andric   else
3264fe6060f1SDimitry Andric     legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
32655ffd83dbSDimitry Andric 
3266fe6060f1SDimitry Andric   if (DstDivReg) {
3267fe6060f1SDimitry Andric     auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3268fe6060f1SDimitry Andric     auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3269fe6060f1SDimitry Andric     B.buildSub(DstDivReg, SignXor, Sign);
3270fe6060f1SDimitry Andric   }
32715ffd83dbSDimitry Andric 
3272fe6060f1SDimitry Andric   if (DstRemReg) {
3273fe6060f1SDimitry Andric     auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3274fe6060f1SDimitry Andric     auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3275fe6060f1SDimitry Andric     B.buildSub(DstRemReg, SignXor, Sign);
3276fe6060f1SDimitry Andric   }
32775ffd83dbSDimitry Andric 
32785ffd83dbSDimitry Andric   MI.eraseFromParent();
32795ffd83dbSDimitry Andric   return true;
32805ffd83dbSDimitry Andric }
32815ffd83dbSDimitry Andric 
32828bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
32838bcb0991SDimitry Andric                                                  MachineRegisterInfo &MRI,
32848bcb0991SDimitry Andric                                                  MachineIRBuilder &B) const {
32858bcb0991SDimitry Andric   Register Res = MI.getOperand(0).getReg();
32868bcb0991SDimitry Andric   Register LHS = MI.getOperand(1).getReg();
32878bcb0991SDimitry Andric   Register RHS = MI.getOperand(2).getReg();
32888bcb0991SDimitry Andric   uint16_t Flags = MI.getFlags();
32898bcb0991SDimitry Andric   LLT ResTy = MRI.getType(Res);
32908bcb0991SDimitry Andric 
32918bcb0991SDimitry Andric   const MachineFunction &MF = B.getMF();
3292e8d8bef9SDimitry Andric   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3293e8d8bef9SDimitry Andric                             MI.getFlag(MachineInstr::FmAfn);
32948bcb0991SDimitry Andric 
3295e8d8bef9SDimitry Andric   if (!AllowInaccurateRcp)
32968bcb0991SDimitry Andric     return false;
32978bcb0991SDimitry Andric 
32988bcb0991SDimitry Andric   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
32998bcb0991SDimitry Andric     // 1 / x -> RCP(x)
33008bcb0991SDimitry Andric     if (CLHS->isExactlyValue(1.0)) {
33018bcb0991SDimitry Andric       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
33028bcb0991SDimitry Andric         .addUse(RHS)
33038bcb0991SDimitry Andric         .setMIFlags(Flags);
33048bcb0991SDimitry Andric 
33058bcb0991SDimitry Andric       MI.eraseFromParent();
33068bcb0991SDimitry Andric       return true;
33078bcb0991SDimitry Andric     }
33088bcb0991SDimitry Andric 
33098bcb0991SDimitry Andric     // -1 / x -> RCP( FNEG(x) )
33108bcb0991SDimitry Andric     if (CLHS->isExactlyValue(-1.0)) {
33118bcb0991SDimitry Andric       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
33128bcb0991SDimitry Andric       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
33138bcb0991SDimitry Andric         .addUse(FNeg.getReg(0))
33148bcb0991SDimitry Andric         .setMIFlags(Flags);
33158bcb0991SDimitry Andric 
33168bcb0991SDimitry Andric       MI.eraseFromParent();
33178bcb0991SDimitry Andric       return true;
33188bcb0991SDimitry Andric     }
33198bcb0991SDimitry Andric   }
33208bcb0991SDimitry Andric 
33218bcb0991SDimitry Andric   // x / y -> x * (1.0 / y)
33228bcb0991SDimitry Andric   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
33238bcb0991SDimitry Andric     .addUse(RHS)
33248bcb0991SDimitry Andric     .setMIFlags(Flags);
33258bcb0991SDimitry Andric   B.buildFMul(Res, LHS, RCP, Flags);
33268bcb0991SDimitry Andric 
33278bcb0991SDimitry Andric   MI.eraseFromParent();
33288bcb0991SDimitry Andric   return true;
33298bcb0991SDimitry Andric }
33308bcb0991SDimitry Andric 
3331e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3332e8d8bef9SDimitry Andric                                                    MachineRegisterInfo &MRI,
3333e8d8bef9SDimitry Andric                                                    MachineIRBuilder &B) const {
3334e8d8bef9SDimitry Andric   Register Res = MI.getOperand(0).getReg();
3335e8d8bef9SDimitry Andric   Register X = MI.getOperand(1).getReg();
3336e8d8bef9SDimitry Andric   Register Y = MI.getOperand(2).getReg();
3337e8d8bef9SDimitry Andric   uint16_t Flags = MI.getFlags();
3338e8d8bef9SDimitry Andric   LLT ResTy = MRI.getType(Res);
3339e8d8bef9SDimitry Andric 
3340e8d8bef9SDimitry Andric   const MachineFunction &MF = B.getMF();
3341e8d8bef9SDimitry Andric   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3342e8d8bef9SDimitry Andric                             MI.getFlag(MachineInstr::FmAfn);
3343e8d8bef9SDimitry Andric 
3344e8d8bef9SDimitry Andric   if (!AllowInaccurateRcp)
33458bcb0991SDimitry Andric     return false;
3346e8d8bef9SDimitry Andric 
3347e8d8bef9SDimitry Andric   auto NegY = B.buildFNeg(ResTy, Y);
3348e8d8bef9SDimitry Andric   auto One = B.buildFConstant(ResTy, 1.0);
3349e8d8bef9SDimitry Andric 
3350e8d8bef9SDimitry Andric   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3351e8d8bef9SDimitry Andric     .addUse(Y)
3352e8d8bef9SDimitry Andric     .setMIFlags(Flags);
3353e8d8bef9SDimitry Andric 
3354e8d8bef9SDimitry Andric   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3355e8d8bef9SDimitry Andric   R = B.buildFMA(ResTy, Tmp0, R, R);
3356e8d8bef9SDimitry Andric 
3357e8d8bef9SDimitry Andric   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3358e8d8bef9SDimitry Andric   R = B.buildFMA(ResTy, Tmp1, R, R);
3359e8d8bef9SDimitry Andric 
3360e8d8bef9SDimitry Andric   auto Ret = B.buildFMul(ResTy, X, R);
3361e8d8bef9SDimitry Andric   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3362e8d8bef9SDimitry Andric 
3363e8d8bef9SDimitry Andric   B.buildFMA(Res, Tmp2, R, Ret);
3364e8d8bef9SDimitry Andric   MI.eraseFromParent();
3365e8d8bef9SDimitry Andric   return true;
33668bcb0991SDimitry Andric }
33678bcb0991SDimitry Andric 
3368480093f4SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3369480093f4SDimitry Andric                                          MachineRegisterInfo &MRI,
3370480093f4SDimitry Andric                                          MachineIRBuilder &B) const {
3371e8d8bef9SDimitry Andric   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3372e8d8bef9SDimitry Andric     return true;
3373e8d8bef9SDimitry Andric 
3374480093f4SDimitry Andric   Register Res = MI.getOperand(0).getReg();
3375480093f4SDimitry Andric   Register LHS = MI.getOperand(1).getReg();
3376480093f4SDimitry Andric   Register RHS = MI.getOperand(2).getReg();
3377480093f4SDimitry Andric 
3378480093f4SDimitry Andric   uint16_t Flags = MI.getFlags();
3379480093f4SDimitry Andric 
3380480093f4SDimitry Andric   LLT S16 = LLT::scalar(16);
3381480093f4SDimitry Andric   LLT S32 = LLT::scalar(32);
3382480093f4SDimitry Andric 
3383480093f4SDimitry Andric   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3384480093f4SDimitry Andric   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3385480093f4SDimitry Andric 
3386480093f4SDimitry Andric   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3387480093f4SDimitry Andric     .addUse(RHSExt.getReg(0))
3388480093f4SDimitry Andric     .setMIFlags(Flags);
3389480093f4SDimitry Andric 
3390480093f4SDimitry Andric   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3391480093f4SDimitry Andric   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3392480093f4SDimitry Andric 
3393480093f4SDimitry Andric   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3394480093f4SDimitry Andric     .addUse(RDst.getReg(0))
3395480093f4SDimitry Andric     .addUse(RHS)
3396480093f4SDimitry Andric     .addUse(LHS)
3397480093f4SDimitry Andric     .setMIFlags(Flags);
3398480093f4SDimitry Andric 
3399480093f4SDimitry Andric   MI.eraseFromParent();
3400480093f4SDimitry Andric   return true;
3401480093f4SDimitry Andric }
3402480093f4SDimitry Andric 
3403480093f4SDimitry Andric // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3404480093f4SDimitry Andric // to enable denorm mode. When 'Enable' is false, disable denorm mode.
3405480093f4SDimitry Andric static void toggleSPDenormMode(bool Enable,
3406480093f4SDimitry Andric                                MachineIRBuilder &B,
3407480093f4SDimitry Andric                                const GCNSubtarget &ST,
3408480093f4SDimitry Andric                                AMDGPU::SIModeRegisterDefaults Mode) {
3409480093f4SDimitry Andric   // Set SP denorm mode to this value.
3410480093f4SDimitry Andric   unsigned SPDenormMode =
34115ffd83dbSDimitry Andric     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3412480093f4SDimitry Andric 
3413480093f4SDimitry Andric   if (ST.hasDenormModeInst()) {
3414480093f4SDimitry Andric     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
34155ffd83dbSDimitry Andric     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3416480093f4SDimitry Andric 
34175ffd83dbSDimitry Andric     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3418480093f4SDimitry Andric     B.buildInstr(AMDGPU::S_DENORM_MODE)
3419480093f4SDimitry Andric       .addImm(NewDenormModeValue);
3420480093f4SDimitry Andric 
3421480093f4SDimitry Andric   } else {
3422480093f4SDimitry Andric     // Select FP32 bit field in mode register.
3423480093f4SDimitry Andric     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3424480093f4SDimitry Andric                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3425480093f4SDimitry Andric                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3426480093f4SDimitry Andric 
3427480093f4SDimitry Andric     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3428480093f4SDimitry Andric       .addImm(SPDenormMode)
3429480093f4SDimitry Andric       .addImm(SPDenormModeBitField);
3430480093f4SDimitry Andric   }
3431480093f4SDimitry Andric }
3432480093f4SDimitry Andric 
3433480093f4SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3434480093f4SDimitry Andric                                          MachineRegisterInfo &MRI,
3435480093f4SDimitry Andric                                          MachineIRBuilder &B) const {
3436e8d8bef9SDimitry Andric   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3437e8d8bef9SDimitry Andric     return true;
3438e8d8bef9SDimitry Andric 
3439480093f4SDimitry Andric   Register Res = MI.getOperand(0).getReg();
3440480093f4SDimitry Andric   Register LHS = MI.getOperand(1).getReg();
3441480093f4SDimitry Andric   Register RHS = MI.getOperand(2).getReg();
3442480093f4SDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3443480093f4SDimitry Andric   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3444480093f4SDimitry Andric 
3445480093f4SDimitry Andric   uint16_t Flags = MI.getFlags();
3446480093f4SDimitry Andric 
3447480093f4SDimitry Andric   LLT S32 = LLT::scalar(32);
3448480093f4SDimitry Andric   LLT S1 = LLT::scalar(1);
3449480093f4SDimitry Andric 
3450480093f4SDimitry Andric   auto One = B.buildFConstant(S32, 1.0f);
3451480093f4SDimitry Andric 
3452480093f4SDimitry Andric   auto DenominatorScaled =
3453480093f4SDimitry Andric     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3454480093f4SDimitry Andric       .addUse(LHS)
34555ffd83dbSDimitry Andric       .addUse(RHS)
34565ffd83dbSDimitry Andric       .addImm(0)
3457480093f4SDimitry Andric       .setMIFlags(Flags);
3458480093f4SDimitry Andric   auto NumeratorScaled =
3459480093f4SDimitry Andric     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3460480093f4SDimitry Andric       .addUse(LHS)
3461480093f4SDimitry Andric       .addUse(RHS)
34625ffd83dbSDimitry Andric       .addImm(1)
3463480093f4SDimitry Andric       .setMIFlags(Flags);
3464480093f4SDimitry Andric 
3465480093f4SDimitry Andric   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3466480093f4SDimitry Andric     .addUse(DenominatorScaled.getReg(0))
3467480093f4SDimitry Andric     .setMIFlags(Flags);
3468480093f4SDimitry Andric   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3469480093f4SDimitry Andric 
3470480093f4SDimitry Andric   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3471480093f4SDimitry Andric   // aren't modeled as reading it.
34725ffd83dbSDimitry Andric   if (!Mode.allFP32Denormals())
3473480093f4SDimitry Andric     toggleSPDenormMode(true, B, ST, Mode);
3474480093f4SDimitry Andric 
3475480093f4SDimitry Andric   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3476480093f4SDimitry Andric   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3477480093f4SDimitry Andric   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3478480093f4SDimitry Andric   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3479480093f4SDimitry Andric   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3480480093f4SDimitry Andric   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3481480093f4SDimitry Andric 
34825ffd83dbSDimitry Andric   if (!Mode.allFP32Denormals())
3483480093f4SDimitry Andric     toggleSPDenormMode(false, B, ST, Mode);
3484480093f4SDimitry Andric 
3485480093f4SDimitry Andric   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3486480093f4SDimitry Andric     .addUse(Fma4.getReg(0))
3487480093f4SDimitry Andric     .addUse(Fma1.getReg(0))
3488480093f4SDimitry Andric     .addUse(Fma3.getReg(0))
3489480093f4SDimitry Andric     .addUse(NumeratorScaled.getReg(1))
3490480093f4SDimitry Andric     .setMIFlags(Flags);
3491480093f4SDimitry Andric 
3492480093f4SDimitry Andric   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3493480093f4SDimitry Andric     .addUse(Fmas.getReg(0))
3494480093f4SDimitry Andric     .addUse(RHS)
3495480093f4SDimitry Andric     .addUse(LHS)
3496480093f4SDimitry Andric     .setMIFlags(Flags);
3497480093f4SDimitry Andric 
3498480093f4SDimitry Andric   MI.eraseFromParent();
3499480093f4SDimitry Andric   return true;
3500480093f4SDimitry Andric }
3501480093f4SDimitry Andric 
3502480093f4SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3503480093f4SDimitry Andric                                          MachineRegisterInfo &MRI,
3504480093f4SDimitry Andric                                          MachineIRBuilder &B) const {
3505e8d8bef9SDimitry Andric   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3506e8d8bef9SDimitry Andric     return true;
3507e8d8bef9SDimitry Andric 
3508480093f4SDimitry Andric   Register Res = MI.getOperand(0).getReg();
3509480093f4SDimitry Andric   Register LHS = MI.getOperand(1).getReg();
3510480093f4SDimitry Andric   Register RHS = MI.getOperand(2).getReg();
3511480093f4SDimitry Andric 
3512480093f4SDimitry Andric   uint16_t Flags = MI.getFlags();
3513480093f4SDimitry Andric 
3514480093f4SDimitry Andric   LLT S64 = LLT::scalar(64);
3515480093f4SDimitry Andric   LLT S1 = LLT::scalar(1);
3516480093f4SDimitry Andric 
3517480093f4SDimitry Andric   auto One = B.buildFConstant(S64, 1.0);
3518480093f4SDimitry Andric 
3519480093f4SDimitry Andric   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3520480093f4SDimitry Andric     .addUse(LHS)
3521480093f4SDimitry Andric     .addUse(RHS)
35225ffd83dbSDimitry Andric     .addImm(0)
3523480093f4SDimitry Andric     .setMIFlags(Flags);
3524480093f4SDimitry Andric 
3525480093f4SDimitry Andric   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3526480093f4SDimitry Andric 
3527480093f4SDimitry Andric   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3528480093f4SDimitry Andric     .addUse(DivScale0.getReg(0))
3529480093f4SDimitry Andric     .setMIFlags(Flags);
3530480093f4SDimitry Andric 
3531480093f4SDimitry Andric   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3532480093f4SDimitry Andric   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3533480093f4SDimitry Andric   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3534480093f4SDimitry Andric 
3535480093f4SDimitry Andric   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3536480093f4SDimitry Andric     .addUse(LHS)
3537480093f4SDimitry Andric     .addUse(RHS)
35385ffd83dbSDimitry Andric     .addImm(1)
3539480093f4SDimitry Andric     .setMIFlags(Flags);
3540480093f4SDimitry Andric 
3541480093f4SDimitry Andric   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
35425ffd83dbSDimitry Andric   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3543480093f4SDimitry Andric   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3544480093f4SDimitry Andric 
3545480093f4SDimitry Andric   Register Scale;
3546480093f4SDimitry Andric   if (!ST.hasUsableDivScaleConditionOutput()) {
3547480093f4SDimitry Andric     // Workaround a hardware bug on SI where the condition output from div_scale
3548480093f4SDimitry Andric     // is not usable.
3549480093f4SDimitry Andric 
3550480093f4SDimitry Andric     LLT S32 = LLT::scalar(32);
3551480093f4SDimitry Andric 
3552480093f4SDimitry Andric     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3553480093f4SDimitry Andric     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3554480093f4SDimitry Andric     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3555480093f4SDimitry Andric     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3556480093f4SDimitry Andric 
3557480093f4SDimitry Andric     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3558480093f4SDimitry Andric                               Scale1Unmerge.getReg(1));
3559480093f4SDimitry Andric     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3560480093f4SDimitry Andric                               Scale0Unmerge.getReg(1));
35615ffd83dbSDimitry Andric     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3562480093f4SDimitry Andric   } else {
3563480093f4SDimitry Andric     Scale = DivScale1.getReg(1);
3564480093f4SDimitry Andric   }
3565480093f4SDimitry Andric 
3566480093f4SDimitry Andric   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3567480093f4SDimitry Andric     .addUse(Fma4.getReg(0))
3568480093f4SDimitry Andric     .addUse(Fma3.getReg(0))
3569480093f4SDimitry Andric     .addUse(Mul.getReg(0))
3570480093f4SDimitry Andric     .addUse(Scale)
3571480093f4SDimitry Andric     .setMIFlags(Flags);
3572480093f4SDimitry Andric 
3573480093f4SDimitry Andric   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3574480093f4SDimitry Andric     .addUse(Fmas.getReg(0))
3575480093f4SDimitry Andric     .addUse(RHS)
3576480093f4SDimitry Andric     .addUse(LHS)
3577480093f4SDimitry Andric     .setMIFlags(Flags);
3578480093f4SDimitry Andric 
3579480093f4SDimitry Andric   MI.eraseFromParent();
3580480093f4SDimitry Andric   return true;
3581480093f4SDimitry Andric }
3582480093f4SDimitry Andric 
35838bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
35848bcb0991SDimitry Andric                                                  MachineRegisterInfo &MRI,
35858bcb0991SDimitry Andric                                                  MachineIRBuilder &B) const {
35868bcb0991SDimitry Andric   Register Res = MI.getOperand(0).getReg();
35878bcb0991SDimitry Andric   Register LHS = MI.getOperand(2).getReg();
35888bcb0991SDimitry Andric   Register RHS = MI.getOperand(3).getReg();
35898bcb0991SDimitry Andric   uint16_t Flags = MI.getFlags();
35908bcb0991SDimitry Andric 
35918bcb0991SDimitry Andric   LLT S32 = LLT::scalar(32);
35928bcb0991SDimitry Andric   LLT S1 = LLT::scalar(1);
35938bcb0991SDimitry Andric 
35948bcb0991SDimitry Andric   auto Abs = B.buildFAbs(S32, RHS, Flags);
35958bcb0991SDimitry Andric   const APFloat C0Val(1.0f);
35968bcb0991SDimitry Andric 
35978bcb0991SDimitry Andric   auto C0 = B.buildConstant(S32, 0x6f800000);
35988bcb0991SDimitry Andric   auto C1 = B.buildConstant(S32, 0x2f800000);
35998bcb0991SDimitry Andric   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
36008bcb0991SDimitry Andric 
36018bcb0991SDimitry Andric   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
36028bcb0991SDimitry Andric   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
36038bcb0991SDimitry Andric 
36048bcb0991SDimitry Andric   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
36058bcb0991SDimitry Andric 
36068bcb0991SDimitry Andric   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
36078bcb0991SDimitry Andric     .addUse(Mul0.getReg(0))
36088bcb0991SDimitry Andric     .setMIFlags(Flags);
36098bcb0991SDimitry Andric 
36108bcb0991SDimitry Andric   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
36118bcb0991SDimitry Andric 
36128bcb0991SDimitry Andric   B.buildFMul(Res, Sel, Mul1, Flags);
36138bcb0991SDimitry Andric 
36148bcb0991SDimitry Andric   MI.eraseFromParent();
36158bcb0991SDimitry Andric   return true;
36168bcb0991SDimitry Andric }
36178bcb0991SDimitry Andric 
3618e8d8bef9SDimitry Andric // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3619e8d8bef9SDimitry Andric // FIXME: Why do we handle this one but not other removed instructions?
3620e8d8bef9SDimitry Andric //
3621e8d8bef9SDimitry Andric // Reciprocal square root.  The clamp prevents infinite results, clamping
3622e8d8bef9SDimitry Andric // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3623e8d8bef9SDimitry Andric // +-max_float.
3624e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3625e8d8bef9SDimitry Andric                                                     MachineRegisterInfo &MRI,
3626e8d8bef9SDimitry Andric                                                     MachineIRBuilder &B) const {
3627e8d8bef9SDimitry Andric   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3628e8d8bef9SDimitry Andric     return true;
3629e8d8bef9SDimitry Andric 
3630e8d8bef9SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
3631e8d8bef9SDimitry Andric   Register Src = MI.getOperand(2).getReg();
3632e8d8bef9SDimitry Andric   auto Flags = MI.getFlags();
3633e8d8bef9SDimitry Andric 
3634e8d8bef9SDimitry Andric   LLT Ty = MRI.getType(Dst);
3635e8d8bef9SDimitry Andric 
3636e8d8bef9SDimitry Andric   const fltSemantics *FltSemantics;
3637e8d8bef9SDimitry Andric   if (Ty == LLT::scalar(32))
3638e8d8bef9SDimitry Andric     FltSemantics = &APFloat::IEEEsingle();
3639e8d8bef9SDimitry Andric   else if (Ty == LLT::scalar(64))
3640e8d8bef9SDimitry Andric     FltSemantics = &APFloat::IEEEdouble();
3641e8d8bef9SDimitry Andric   else
3642e8d8bef9SDimitry Andric     return false;
3643e8d8bef9SDimitry Andric 
3644e8d8bef9SDimitry Andric   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3645e8d8bef9SDimitry Andric     .addUse(Src)
3646e8d8bef9SDimitry Andric     .setMIFlags(Flags);
3647e8d8bef9SDimitry Andric 
3648e8d8bef9SDimitry Andric   // We don't need to concern ourselves with the snan handling difference, since
3649e8d8bef9SDimitry Andric   // the rsq quieted (or not) so use the one which will directly select.
3650e8d8bef9SDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3651e8d8bef9SDimitry Andric   const bool UseIEEE = MFI->getMode().IEEE;
3652e8d8bef9SDimitry Andric 
3653e8d8bef9SDimitry Andric   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3654e8d8bef9SDimitry Andric   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3655e8d8bef9SDimitry Andric                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3656e8d8bef9SDimitry Andric 
3657e8d8bef9SDimitry Andric   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3658e8d8bef9SDimitry Andric 
3659e8d8bef9SDimitry Andric   if (UseIEEE)
3660e8d8bef9SDimitry Andric     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3661e8d8bef9SDimitry Andric   else
3662e8d8bef9SDimitry Andric     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3663e8d8bef9SDimitry Andric   MI.eraseFromParent();
3664e8d8bef9SDimitry Andric   return true;
3665e8d8bef9SDimitry Andric }
3666e8d8bef9SDimitry Andric 
3667e8d8bef9SDimitry Andric static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3668e8d8bef9SDimitry Andric   switch (IID) {
3669e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_ds_fadd:
3670e8d8bef9SDimitry Andric     return AMDGPU::G_ATOMICRMW_FADD;
3671e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_ds_fmin:
3672e8d8bef9SDimitry Andric     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3673e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_ds_fmax:
3674e8d8bef9SDimitry Andric     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3675e8d8bef9SDimitry Andric   default:
3676e8d8bef9SDimitry Andric     llvm_unreachable("not a DS FP intrinsic");
3677e8d8bef9SDimitry Andric   }
3678e8d8bef9SDimitry Andric }
3679e8d8bef9SDimitry Andric 
3680e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3681e8d8bef9SDimitry Andric                                                       MachineInstr &MI,
3682e8d8bef9SDimitry Andric                                                       Intrinsic::ID IID) const {
3683e8d8bef9SDimitry Andric   GISelChangeObserver &Observer = Helper.Observer;
3684e8d8bef9SDimitry Andric   Observer.changingInstr(MI);
3685e8d8bef9SDimitry Andric 
3686e8d8bef9SDimitry Andric   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3687e8d8bef9SDimitry Andric 
3688e8d8bef9SDimitry Andric   // The remaining operands were used to set fields in the MemOperand on
3689e8d8bef9SDimitry Andric   // construction.
3690e8d8bef9SDimitry Andric   for (int I = 6; I > 3; --I)
3691e8d8bef9SDimitry Andric     MI.RemoveOperand(I);
3692e8d8bef9SDimitry Andric 
3693e8d8bef9SDimitry Andric   MI.RemoveOperand(1); // Remove the intrinsic ID.
3694e8d8bef9SDimitry Andric   Observer.changedInstr(MI);
3695e8d8bef9SDimitry Andric   return true;
3696e8d8bef9SDimitry Andric }
3697e8d8bef9SDimitry Andric 
3698e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3699e8d8bef9SDimitry Andric                                             MachineRegisterInfo &MRI,
3700e8d8bef9SDimitry Andric                                             MachineIRBuilder &B) const {
3701e8d8bef9SDimitry Andric   uint64_t Offset =
3702e8d8bef9SDimitry Andric     ST.getTargetLowering()->getImplicitParameterOffset(
3703e8d8bef9SDimitry Andric       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3704e8d8bef9SDimitry Andric   LLT DstTy = MRI.getType(DstReg);
3705e8d8bef9SDimitry Andric   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3706e8d8bef9SDimitry Andric 
3707e8d8bef9SDimitry Andric   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3708e8d8bef9SDimitry Andric   if (!loadInputValue(KernargPtrReg, B,
3709e8d8bef9SDimitry Andric                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3710e8d8bef9SDimitry Andric     return false;
3711e8d8bef9SDimitry Andric 
3712e8d8bef9SDimitry Andric   // FIXME: This should be nuw
3713e8d8bef9SDimitry Andric   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3714e8d8bef9SDimitry Andric   return true;
3715e8d8bef9SDimitry Andric }
3716e8d8bef9SDimitry Andric 
37170b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
37180b57cec5SDimitry Andric                                                  MachineRegisterInfo &MRI,
37190b57cec5SDimitry Andric                                                  MachineIRBuilder &B) const {
37200b57cec5SDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
37210b57cec5SDimitry Andric   if (!MFI->isEntryFunction()) {
37220b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
37230b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
37240b57cec5SDimitry Andric   }
37250b57cec5SDimitry Andric 
37260b57cec5SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
3727e8d8bef9SDimitry Andric   if (!getImplicitArgPtr(DstReg, MRI, B))
37280b57cec5SDimitry Andric     return false;
37290b57cec5SDimitry Andric 
37300b57cec5SDimitry Andric   MI.eraseFromParent();
37310b57cec5SDimitry Andric   return true;
37320b57cec5SDimitry Andric }
37330b57cec5SDimitry Andric 
37348bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
37358bcb0991SDimitry Andric                                               MachineRegisterInfo &MRI,
37368bcb0991SDimitry Andric                                               MachineIRBuilder &B,
37378bcb0991SDimitry Andric                                               unsigned AddrSpace) const {
37388bcb0991SDimitry Andric   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3739e8d8bef9SDimitry Andric   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3740e8d8bef9SDimitry Andric   Register Hi32 = Unmerge.getReg(1);
3741e8d8bef9SDimitry Andric 
37428bcb0991SDimitry Andric   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
37438bcb0991SDimitry Andric   MI.eraseFromParent();
37448bcb0991SDimitry Andric   return true;
37458bcb0991SDimitry Andric }
37468bcb0991SDimitry Andric 
37475ffd83dbSDimitry Andric // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
37485ffd83dbSDimitry Andric // offset (the offset that is included in bounds checking and swizzling, to be
37495ffd83dbSDimitry Andric // split between the instruction's voffset and immoffset fields) and soffset
37505ffd83dbSDimitry Andric // (the offset that is excluded from bounds checking and swizzling, to go in
37515ffd83dbSDimitry Andric // the instruction's soffset field).  This function takes the first kind of
37525ffd83dbSDimitry Andric // offset and figures out how to split it between voffset and immoffset.
3753fe6060f1SDimitry Andric std::pair<Register, unsigned>
37545ffd83dbSDimitry Andric AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
37555ffd83dbSDimitry Andric                                         Register OrigOffset) const {
37565ffd83dbSDimitry Andric   const unsigned MaxImm = 4095;
37575ffd83dbSDimitry Andric   Register BaseReg;
3758fe6060f1SDimitry Andric   unsigned ImmOffset;
37595ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
3760fe6060f1SDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
37615ffd83dbSDimitry Andric 
3762fe6060f1SDimitry Andric   std::tie(BaseReg, ImmOffset) =
3763fe6060f1SDimitry Andric       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
37645ffd83dbSDimitry Andric 
3765fe6060f1SDimitry Andric   // If BaseReg is a pointer, convert it to int.
3766fe6060f1SDimitry Andric   if (MRI.getType(BaseReg).isPointer())
3767fe6060f1SDimitry Andric     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
37685ffd83dbSDimitry Andric 
37695ffd83dbSDimitry Andric   // If the immediate value is too big for the immoffset field, put the value
37705ffd83dbSDimitry Andric   // and -4096 into the immoffset field so that the value that is copied/added
37715ffd83dbSDimitry Andric   // for the voffset field is a multiple of 4096, and it stands more chance
37725ffd83dbSDimitry Andric   // of being CSEd with the copy/add for another similar load/store.
37735ffd83dbSDimitry Andric   // However, do not do that rounding down to a multiple of 4096 if that is a
37745ffd83dbSDimitry Andric   // negative number, as it appears to be illegal to have a negative offset
37755ffd83dbSDimitry Andric   // in the vgpr, even if adding the immediate offset makes it positive.
37765ffd83dbSDimitry Andric   unsigned Overflow = ImmOffset & ~MaxImm;
37775ffd83dbSDimitry Andric   ImmOffset -= Overflow;
37785ffd83dbSDimitry Andric   if ((int32_t)Overflow < 0) {
37795ffd83dbSDimitry Andric     Overflow += ImmOffset;
37805ffd83dbSDimitry Andric     ImmOffset = 0;
37815ffd83dbSDimitry Andric   }
37825ffd83dbSDimitry Andric 
37835ffd83dbSDimitry Andric   if (Overflow != 0) {
37845ffd83dbSDimitry Andric     if (!BaseReg) {
37855ffd83dbSDimitry Andric       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
37865ffd83dbSDimitry Andric     } else {
37875ffd83dbSDimitry Andric       auto OverflowVal = B.buildConstant(S32, Overflow);
37885ffd83dbSDimitry Andric       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
37895ffd83dbSDimitry Andric     }
37905ffd83dbSDimitry Andric   }
37915ffd83dbSDimitry Andric 
37925ffd83dbSDimitry Andric   if (!BaseReg)
37935ffd83dbSDimitry Andric     BaseReg = B.buildConstant(S32, 0).getReg(0);
37945ffd83dbSDimitry Andric 
3795fe6060f1SDimitry Andric   return std::make_pair(BaseReg, ImmOffset);
3796fe6060f1SDimitry Andric }
3797fe6060f1SDimitry Andric 
3798fe6060f1SDimitry Andric /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
3799fe6060f1SDimitry Andric void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
3800fe6060f1SDimitry Andric                                           Register VOffset, Register SOffset,
3801fe6060f1SDimitry Andric                                           unsigned ImmOffset, Register VIndex,
3802fe6060f1SDimitry Andric                                           MachineRegisterInfo &MRI) const {
3803fe6060f1SDimitry Andric   Optional<ValueAndVReg> MaybeVOffsetVal =
3804349cc55cSDimitry Andric       getIConstantVRegValWithLookThrough(VOffset, MRI);
3805fe6060f1SDimitry Andric   Optional<ValueAndVReg> MaybeSOffsetVal =
3806349cc55cSDimitry Andric       getIConstantVRegValWithLookThrough(SOffset, MRI);
3807fe6060f1SDimitry Andric   Optional<ValueAndVReg> MaybeVIndexVal =
3808349cc55cSDimitry Andric       getIConstantVRegValWithLookThrough(VIndex, MRI);
3809fe6060f1SDimitry Andric   // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
3810fe6060f1SDimitry Andric   // update the MMO with that offset. The stride is unknown so we can only do
3811fe6060f1SDimitry Andric   // this if VIndex is constant 0.
3812fe6060f1SDimitry Andric   if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
3813fe6060f1SDimitry Andric       MaybeVIndexVal->Value == 0) {
3814fe6060f1SDimitry Andric     uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
3815fe6060f1SDimitry Andric                            MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
3816fe6060f1SDimitry Andric     MMO->setOffset(TotalOffset);
3817fe6060f1SDimitry Andric   } else {
3818fe6060f1SDimitry Andric     // We don't have a constant combined offset to use in the MMO. Give up.
3819fe6060f1SDimitry Andric     MMO->setValue((Value *)nullptr);
3820fe6060f1SDimitry Andric   }
38215ffd83dbSDimitry Andric }
38225ffd83dbSDimitry Andric 
38238bcb0991SDimitry Andric /// Handle register layout difference for f16 images for some subtargets.
38248bcb0991SDimitry Andric Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
38258bcb0991SDimitry Andric                                              MachineRegisterInfo &MRI,
3826e8d8bef9SDimitry Andric                                              Register Reg,
3827e8d8bef9SDimitry Andric                                              bool ImageStore) const {
38288bcb0991SDimitry Andric   const LLT S16 = LLT::scalar(16);
38298bcb0991SDimitry Andric   const LLT S32 = LLT::scalar(32);
38308bcb0991SDimitry Andric   LLT StoreVT = MRI.getType(Reg);
38318bcb0991SDimitry Andric   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
38328bcb0991SDimitry Andric 
3833e8d8bef9SDimitry Andric   if (ST.hasUnpackedD16VMem()) {
38348bcb0991SDimitry Andric     auto Unmerge = B.buildUnmerge(S16, Reg);
38358bcb0991SDimitry Andric 
38368bcb0991SDimitry Andric     SmallVector<Register, 4> WideRegs;
38378bcb0991SDimitry Andric     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
38388bcb0991SDimitry Andric       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
38398bcb0991SDimitry Andric 
38408bcb0991SDimitry Andric     int NumElts = StoreVT.getNumElements();
38418bcb0991SDimitry Andric 
3842fe6060f1SDimitry Andric     return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
3843fe6060f1SDimitry Andric         .getReg(0);
38448bcb0991SDimitry Andric   }
38458bcb0991SDimitry Andric 
3846e8d8bef9SDimitry Andric   if (ImageStore && ST.hasImageStoreD16Bug()) {
3847e8d8bef9SDimitry Andric     if (StoreVT.getNumElements() == 2) {
3848e8d8bef9SDimitry Andric       SmallVector<Register, 4> PackedRegs;
3849e8d8bef9SDimitry Andric       Reg = B.buildBitcast(S32, Reg).getReg(0);
3850e8d8bef9SDimitry Andric       PackedRegs.push_back(Reg);
3851e8d8bef9SDimitry Andric       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3852fe6060f1SDimitry Andric       return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
3853fe6060f1SDimitry Andric           .getReg(0);
3854e8d8bef9SDimitry Andric     }
3855e8d8bef9SDimitry Andric 
3856e8d8bef9SDimitry Andric     if (StoreVT.getNumElements() == 3) {
3857e8d8bef9SDimitry Andric       SmallVector<Register, 4> PackedRegs;
3858e8d8bef9SDimitry Andric       auto Unmerge = B.buildUnmerge(S16, Reg);
3859e8d8bef9SDimitry Andric       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3860e8d8bef9SDimitry Andric         PackedRegs.push_back(Unmerge.getReg(I));
3861e8d8bef9SDimitry Andric       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3862fe6060f1SDimitry Andric       Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
3863fe6060f1SDimitry Andric       return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
3864e8d8bef9SDimitry Andric     }
3865e8d8bef9SDimitry Andric 
3866e8d8bef9SDimitry Andric     if (StoreVT.getNumElements() == 4) {
3867e8d8bef9SDimitry Andric       SmallVector<Register, 4> PackedRegs;
3868fe6060f1SDimitry Andric       Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
3869e8d8bef9SDimitry Andric       auto Unmerge = B.buildUnmerge(S32, Reg);
3870e8d8bef9SDimitry Andric       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3871e8d8bef9SDimitry Andric         PackedRegs.push_back(Unmerge.getReg(I));
3872e8d8bef9SDimitry Andric       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3873fe6060f1SDimitry Andric       return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
3874fe6060f1SDimitry Andric           .getReg(0);
3875e8d8bef9SDimitry Andric     }
3876e8d8bef9SDimitry Andric 
3877e8d8bef9SDimitry Andric     llvm_unreachable("invalid data type");
3878e8d8bef9SDimitry Andric   }
3879e8d8bef9SDimitry Andric 
38800eae32dcSDimitry Andric   if (StoreVT == LLT::fixed_vector(3, S16)) {
38810eae32dcSDimitry Andric     Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
38820eae32dcSDimitry Andric               .getReg(0);
38830eae32dcSDimitry Andric   }
3884e8d8bef9SDimitry Andric   return Reg;
3885e8d8bef9SDimitry Andric }
3886e8d8bef9SDimitry Andric 
38875ffd83dbSDimitry Andric Register AMDGPULegalizerInfo::fixStoreSourceType(
38885ffd83dbSDimitry Andric   MachineIRBuilder &B, Register VData, bool IsFormat) const {
38895ffd83dbSDimitry Andric   MachineRegisterInfo *MRI = B.getMRI();
38905ffd83dbSDimitry Andric   LLT Ty = MRI->getType(VData);
38918bcb0991SDimitry Andric 
38928bcb0991SDimitry Andric   const LLT S16 = LLT::scalar(16);
38938bcb0991SDimitry Andric 
38948bcb0991SDimitry Andric   // Fixup illegal register types for i8 stores.
38958bcb0991SDimitry Andric   if (Ty == LLT::scalar(8) || Ty == S16) {
38968bcb0991SDimitry Andric     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
38975ffd83dbSDimitry Andric     return AnyExt;
38988bcb0991SDimitry Andric   }
38998bcb0991SDimitry Andric 
39008bcb0991SDimitry Andric   if (Ty.isVector()) {
39018bcb0991SDimitry Andric     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
39028bcb0991SDimitry Andric       if (IsFormat)
39035ffd83dbSDimitry Andric         return handleD16VData(B, *MRI, VData);
39045ffd83dbSDimitry Andric     }
39055ffd83dbSDimitry Andric   }
39065ffd83dbSDimitry Andric 
39075ffd83dbSDimitry Andric   return VData;
39085ffd83dbSDimitry Andric }
39095ffd83dbSDimitry Andric 
39105ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
39115ffd83dbSDimitry Andric                                               MachineRegisterInfo &MRI,
39125ffd83dbSDimitry Andric                                               MachineIRBuilder &B,
39135ffd83dbSDimitry Andric                                               bool IsTyped,
39145ffd83dbSDimitry Andric                                               bool IsFormat) const {
39155ffd83dbSDimitry Andric   Register VData = MI.getOperand(1).getReg();
39165ffd83dbSDimitry Andric   LLT Ty = MRI.getType(VData);
39175ffd83dbSDimitry Andric   LLT EltTy = Ty.getScalarType();
39185ffd83dbSDimitry Andric   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
39195ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
39205ffd83dbSDimitry Andric 
39215ffd83dbSDimitry Andric   VData = fixStoreSourceType(B, VData, IsFormat);
39225ffd83dbSDimitry Andric   Register RSrc = MI.getOperand(2).getReg();
39235ffd83dbSDimitry Andric 
39245ffd83dbSDimitry Andric   MachineMemOperand *MMO = *MI.memoperands_begin();
39255ffd83dbSDimitry Andric   const int MemSize = MMO->getSize();
39265ffd83dbSDimitry Andric 
39275ffd83dbSDimitry Andric   unsigned ImmOffset;
39285ffd83dbSDimitry Andric 
39295ffd83dbSDimitry Andric   // The typed intrinsics add an immediate after the registers.
39305ffd83dbSDimitry Andric   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
39315ffd83dbSDimitry Andric 
39325ffd83dbSDimitry Andric   // The struct intrinsic variants add one additional operand over raw.
39335ffd83dbSDimitry Andric   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
39345ffd83dbSDimitry Andric   Register VIndex;
39355ffd83dbSDimitry Andric   int OpOffset = 0;
39365ffd83dbSDimitry Andric   if (HasVIndex) {
39375ffd83dbSDimitry Andric     VIndex = MI.getOperand(3).getReg();
39385ffd83dbSDimitry Andric     OpOffset = 1;
3939fe6060f1SDimitry Andric   } else {
3940fe6060f1SDimitry Andric     VIndex = B.buildConstant(S32, 0).getReg(0);
39415ffd83dbSDimitry Andric   }
39425ffd83dbSDimitry Andric 
39435ffd83dbSDimitry Andric   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
39445ffd83dbSDimitry Andric   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
39455ffd83dbSDimitry Andric 
39465ffd83dbSDimitry Andric   unsigned Format = 0;
39475ffd83dbSDimitry Andric   if (IsTyped) {
39485ffd83dbSDimitry Andric     Format = MI.getOperand(5 + OpOffset).getImm();
39495ffd83dbSDimitry Andric     ++OpOffset;
39505ffd83dbSDimitry Andric   }
39515ffd83dbSDimitry Andric 
39525ffd83dbSDimitry Andric   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
39535ffd83dbSDimitry Andric 
3954fe6060f1SDimitry Andric   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
3955fe6060f1SDimitry Andric   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
39565ffd83dbSDimitry Andric 
39575ffd83dbSDimitry Andric   unsigned Opc;
39585ffd83dbSDimitry Andric   if (IsTyped) {
39595ffd83dbSDimitry Andric     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
39605ffd83dbSDimitry Andric                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
39615ffd83dbSDimitry Andric   } else if (IsFormat) {
39625ffd83dbSDimitry Andric     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
39635ffd83dbSDimitry Andric                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
39645ffd83dbSDimitry Andric   } else {
39655ffd83dbSDimitry Andric     switch (MemSize) {
39665ffd83dbSDimitry Andric     case 1:
39675ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
39685ffd83dbSDimitry Andric       break;
39695ffd83dbSDimitry Andric     case 2:
39705ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
39715ffd83dbSDimitry Andric       break;
39725ffd83dbSDimitry Andric     default:
39735ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
39745ffd83dbSDimitry Andric       break;
39755ffd83dbSDimitry Andric     }
39765ffd83dbSDimitry Andric   }
39775ffd83dbSDimitry Andric 
39785ffd83dbSDimitry Andric   auto MIB = B.buildInstr(Opc)
39795ffd83dbSDimitry Andric     .addUse(VData)              // vdata
39805ffd83dbSDimitry Andric     .addUse(RSrc)               // rsrc
39815ffd83dbSDimitry Andric     .addUse(VIndex)             // vindex
39825ffd83dbSDimitry Andric     .addUse(VOffset)            // voffset
39835ffd83dbSDimitry Andric     .addUse(SOffset)            // soffset
39845ffd83dbSDimitry Andric     .addImm(ImmOffset);         // offset(imm)
39855ffd83dbSDimitry Andric 
39865ffd83dbSDimitry Andric   if (IsTyped)
39875ffd83dbSDimitry Andric     MIB.addImm(Format);
39885ffd83dbSDimitry Andric 
39895ffd83dbSDimitry Andric   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
39905ffd83dbSDimitry Andric      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
39915ffd83dbSDimitry Andric      .addMemOperand(MMO);
39925ffd83dbSDimitry Andric 
39935ffd83dbSDimitry Andric   MI.eraseFromParent();
39948bcb0991SDimitry Andric   return true;
39958bcb0991SDimitry Andric }
39968bcb0991SDimitry Andric 
39975ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
39985ffd83dbSDimitry Andric                                              MachineRegisterInfo &MRI,
39995ffd83dbSDimitry Andric                                              MachineIRBuilder &B,
40005ffd83dbSDimitry Andric                                              bool IsFormat,
40015ffd83dbSDimitry Andric                                              bool IsTyped) const {
40025ffd83dbSDimitry Andric   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
40035ffd83dbSDimitry Andric   MachineMemOperand *MMO = *MI.memoperands_begin();
4004fe6060f1SDimitry Andric   const LLT MemTy = MMO->getMemoryType();
40055ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
40065ffd83dbSDimitry Andric 
40075ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
40085ffd83dbSDimitry Andric   Register RSrc = MI.getOperand(2).getReg();
40095ffd83dbSDimitry Andric 
40105ffd83dbSDimitry Andric   // The typed intrinsics add an immediate after the registers.
40115ffd83dbSDimitry Andric   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
40125ffd83dbSDimitry Andric 
40135ffd83dbSDimitry Andric   // The struct intrinsic variants add one additional operand over raw.
40145ffd83dbSDimitry Andric   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
40155ffd83dbSDimitry Andric   Register VIndex;
40165ffd83dbSDimitry Andric   int OpOffset = 0;
40175ffd83dbSDimitry Andric   if (HasVIndex) {
40185ffd83dbSDimitry Andric     VIndex = MI.getOperand(3).getReg();
40195ffd83dbSDimitry Andric     OpOffset = 1;
4020fe6060f1SDimitry Andric   } else {
4021fe6060f1SDimitry Andric     VIndex = B.buildConstant(S32, 0).getReg(0);
40228bcb0991SDimitry Andric   }
40238bcb0991SDimitry Andric 
40245ffd83dbSDimitry Andric   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
40255ffd83dbSDimitry Andric   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
40265ffd83dbSDimitry Andric 
40275ffd83dbSDimitry Andric   unsigned Format = 0;
40285ffd83dbSDimitry Andric   if (IsTyped) {
40295ffd83dbSDimitry Andric     Format = MI.getOperand(5 + OpOffset).getImm();
40305ffd83dbSDimitry Andric     ++OpOffset;
40318bcb0991SDimitry Andric   }
40328bcb0991SDimitry Andric 
40335ffd83dbSDimitry Andric   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
40345ffd83dbSDimitry Andric   unsigned ImmOffset;
40355ffd83dbSDimitry Andric 
40365ffd83dbSDimitry Andric   LLT Ty = MRI.getType(Dst);
40375ffd83dbSDimitry Andric   LLT EltTy = Ty.getScalarType();
40385ffd83dbSDimitry Andric   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
40395ffd83dbSDimitry Andric   const bool Unpacked = ST.hasUnpackedD16VMem();
40405ffd83dbSDimitry Andric 
4041fe6060f1SDimitry Andric   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4042fe6060f1SDimitry Andric   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
40435ffd83dbSDimitry Andric 
40445ffd83dbSDimitry Andric   unsigned Opc;
40455ffd83dbSDimitry Andric 
40465ffd83dbSDimitry Andric   if (IsTyped) {
40475ffd83dbSDimitry Andric     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
40485ffd83dbSDimitry Andric                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
40495ffd83dbSDimitry Andric   } else if (IsFormat) {
40505ffd83dbSDimitry Andric     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
40515ffd83dbSDimitry Andric                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
40525ffd83dbSDimitry Andric   } else {
4053fe6060f1SDimitry Andric     switch (MemTy.getSizeInBits()) {
4054fe6060f1SDimitry Andric     case 8:
40555ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
40565ffd83dbSDimitry Andric       break;
4057fe6060f1SDimitry Andric     case 16:
40585ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
40595ffd83dbSDimitry Andric       break;
40605ffd83dbSDimitry Andric     default:
40615ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
40625ffd83dbSDimitry Andric       break;
40635ffd83dbSDimitry Andric     }
40645ffd83dbSDimitry Andric   }
40655ffd83dbSDimitry Andric 
40665ffd83dbSDimitry Andric   Register LoadDstReg;
40675ffd83dbSDimitry Andric 
4068fe6060f1SDimitry Andric   bool IsExtLoad =
4069fe6060f1SDimitry Andric       (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
40705ffd83dbSDimitry Andric   LLT UnpackedTy = Ty.changeElementSize(32);
40715ffd83dbSDimitry Andric 
40725ffd83dbSDimitry Andric   if (IsExtLoad)
40735ffd83dbSDimitry Andric     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
40745ffd83dbSDimitry Andric   else if (Unpacked && IsD16 && Ty.isVector())
40755ffd83dbSDimitry Andric     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
40765ffd83dbSDimitry Andric   else
40775ffd83dbSDimitry Andric     LoadDstReg = Dst;
40785ffd83dbSDimitry Andric 
40795ffd83dbSDimitry Andric   auto MIB = B.buildInstr(Opc)
40805ffd83dbSDimitry Andric     .addDef(LoadDstReg)         // vdata
40815ffd83dbSDimitry Andric     .addUse(RSrc)               // rsrc
40825ffd83dbSDimitry Andric     .addUse(VIndex)             // vindex
40835ffd83dbSDimitry Andric     .addUse(VOffset)            // voffset
40845ffd83dbSDimitry Andric     .addUse(SOffset)            // soffset
40855ffd83dbSDimitry Andric     .addImm(ImmOffset);         // offset(imm)
40865ffd83dbSDimitry Andric 
40875ffd83dbSDimitry Andric   if (IsTyped)
40885ffd83dbSDimitry Andric     MIB.addImm(Format);
40895ffd83dbSDimitry Andric 
40905ffd83dbSDimitry Andric   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
40915ffd83dbSDimitry Andric      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
40925ffd83dbSDimitry Andric      .addMemOperand(MMO);
40935ffd83dbSDimitry Andric 
40945ffd83dbSDimitry Andric   if (LoadDstReg != Dst) {
40955ffd83dbSDimitry Andric     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
40965ffd83dbSDimitry Andric 
40975ffd83dbSDimitry Andric     // Widen result for extending loads was widened.
40985ffd83dbSDimitry Andric     if (IsExtLoad)
40995ffd83dbSDimitry Andric       B.buildTrunc(Dst, LoadDstReg);
41005ffd83dbSDimitry Andric     else {
41015ffd83dbSDimitry Andric       // Repack to original 16-bit vector result
41025ffd83dbSDimitry Andric       // FIXME: G_TRUNC should work, but legalization currently fails
41035ffd83dbSDimitry Andric       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
41045ffd83dbSDimitry Andric       SmallVector<Register, 4> Repack;
41055ffd83dbSDimitry Andric       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
41065ffd83dbSDimitry Andric         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
41075ffd83dbSDimitry Andric       B.buildMerge(Dst, Repack);
41085ffd83dbSDimitry Andric     }
41095ffd83dbSDimitry Andric   }
41105ffd83dbSDimitry Andric 
41115ffd83dbSDimitry Andric   MI.eraseFromParent();
41125ffd83dbSDimitry Andric   return true;
41135ffd83dbSDimitry Andric }
41145ffd83dbSDimitry Andric 
41155ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
41165ffd83dbSDimitry Andric                                                MachineIRBuilder &B,
41175ffd83dbSDimitry Andric                                                bool IsInc) const {
41185ffd83dbSDimitry Andric   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
41195ffd83dbSDimitry Andric                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
41205ffd83dbSDimitry Andric   B.buildInstr(Opc)
41215ffd83dbSDimitry Andric     .addDef(MI.getOperand(0).getReg())
41225ffd83dbSDimitry Andric     .addUse(MI.getOperand(2).getReg())
41235ffd83dbSDimitry Andric     .addUse(MI.getOperand(3).getReg())
41245ffd83dbSDimitry Andric     .cloneMemRefs(MI);
41255ffd83dbSDimitry Andric   MI.eraseFromParent();
41265ffd83dbSDimitry Andric   return true;
41275ffd83dbSDimitry Andric }
41285ffd83dbSDimitry Andric 
41295ffd83dbSDimitry Andric static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
41305ffd83dbSDimitry Andric   switch (IntrID) {
41315ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
41325ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
41335ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
41345ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_add:
41355ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_add:
41365ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
41375ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
41385ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
41395ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
41405ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
41415ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
41425ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
41435ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
41445ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
41455ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
41465ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
41475ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
41485ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
41495ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
41505ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
41515ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
41525ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_and:
41535ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_and:
41545ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
41555ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_or:
41565ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_or:
41575ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
41585ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
41595ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
41605ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
41615ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
41625ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
41635ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
41645ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
41655ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
41665ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
41675ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
41685ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
41695ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4170e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4171e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4172e8d8bef9SDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4173fe6060f1SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4174fe6060f1SDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4175fe6060f1SDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4176fe6060f1SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4177fe6060f1SDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4178fe6060f1SDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
41795ffd83dbSDimitry Andric   default:
41805ffd83dbSDimitry Andric     llvm_unreachable("unhandled atomic opcode");
41815ffd83dbSDimitry Andric   }
41825ffd83dbSDimitry Andric }
41835ffd83dbSDimitry Andric 
41845ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
41855ffd83dbSDimitry Andric                                                MachineIRBuilder &B,
41865ffd83dbSDimitry Andric                                                Intrinsic::ID IID) const {
41875ffd83dbSDimitry Andric   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
41885ffd83dbSDimitry Andric                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4189e8d8bef9SDimitry Andric   const bool HasReturn = MI.getNumExplicitDefs() != 0;
41905ffd83dbSDimitry Andric 
4191e8d8bef9SDimitry Andric   Register Dst;
41925ffd83dbSDimitry Andric 
41935ffd83dbSDimitry Andric   int OpOffset = 0;
4194e8d8bef9SDimitry Andric   if (HasReturn) {
4195e8d8bef9SDimitry Andric     // A few FP atomics do not support return values.
4196e8d8bef9SDimitry Andric     Dst = MI.getOperand(0).getReg();
4197e8d8bef9SDimitry Andric   } else {
4198e8d8bef9SDimitry Andric     OpOffset = -1;
4199e8d8bef9SDimitry Andric   }
4200e8d8bef9SDimitry Andric 
4201e8d8bef9SDimitry Andric   Register VData = MI.getOperand(2 + OpOffset).getReg();
4202e8d8bef9SDimitry Andric   Register CmpVal;
42035ffd83dbSDimitry Andric 
42045ffd83dbSDimitry Andric   if (IsCmpSwap) {
42055ffd83dbSDimitry Andric     CmpVal = MI.getOperand(3 + OpOffset).getReg();
42065ffd83dbSDimitry Andric     ++OpOffset;
42075ffd83dbSDimitry Andric   }
42085ffd83dbSDimitry Andric 
42095ffd83dbSDimitry Andric   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4210e8d8bef9SDimitry Andric   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
42115ffd83dbSDimitry Andric 
42125ffd83dbSDimitry Andric   // The struct intrinsic variants add one additional operand over raw.
42135ffd83dbSDimitry Andric   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
42145ffd83dbSDimitry Andric   Register VIndex;
42155ffd83dbSDimitry Andric   if (HasVIndex) {
42165ffd83dbSDimitry Andric     VIndex = MI.getOperand(4 + OpOffset).getReg();
42175ffd83dbSDimitry Andric     ++OpOffset;
4218fe6060f1SDimitry Andric   } else {
4219fe6060f1SDimitry Andric     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
42205ffd83dbSDimitry Andric   }
42215ffd83dbSDimitry Andric 
42225ffd83dbSDimitry Andric   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
42235ffd83dbSDimitry Andric   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
42245ffd83dbSDimitry Andric   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
42255ffd83dbSDimitry Andric 
42265ffd83dbSDimitry Andric   MachineMemOperand *MMO = *MI.memoperands_begin();
42275ffd83dbSDimitry Andric 
42285ffd83dbSDimitry Andric   unsigned ImmOffset;
4229fe6060f1SDimitry Andric   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4230fe6060f1SDimitry Andric   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());
42315ffd83dbSDimitry Andric 
4232e8d8bef9SDimitry Andric   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4233e8d8bef9SDimitry Andric 
4234e8d8bef9SDimitry Andric   if (HasReturn)
4235e8d8bef9SDimitry Andric     MIB.addDef(Dst);
4236e8d8bef9SDimitry Andric 
4237e8d8bef9SDimitry Andric   MIB.addUse(VData); // vdata
42385ffd83dbSDimitry Andric 
42395ffd83dbSDimitry Andric   if (IsCmpSwap)
42405ffd83dbSDimitry Andric     MIB.addReg(CmpVal);
42415ffd83dbSDimitry Andric 
42425ffd83dbSDimitry Andric   MIB.addUse(RSrc)               // rsrc
42435ffd83dbSDimitry Andric      .addUse(VIndex)             // vindex
42445ffd83dbSDimitry Andric      .addUse(VOffset)            // voffset
42455ffd83dbSDimitry Andric      .addUse(SOffset)            // soffset
42465ffd83dbSDimitry Andric      .addImm(ImmOffset)          // offset(imm)
42475ffd83dbSDimitry Andric      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
42485ffd83dbSDimitry Andric      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
42495ffd83dbSDimitry Andric      .addMemOperand(MMO);
42505ffd83dbSDimitry Andric 
42515ffd83dbSDimitry Andric   MI.eraseFromParent();
42525ffd83dbSDimitry Andric   return true;
42535ffd83dbSDimitry Andric }
42545ffd83dbSDimitry Andric 
4255fe6060f1SDimitry Andric /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
42565ffd83dbSDimitry Andric /// vector with s16 typed elements.
4257fe6060f1SDimitry Andric static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4258fe6060f1SDimitry Andric                                       SmallVectorImpl<Register> &PackedAddrs,
4259fe6060f1SDimitry Andric                                       unsigned ArgOffset,
4260fe6060f1SDimitry Andric                                       const AMDGPU::ImageDimIntrinsicInfo *Intr,
4261fe6060f1SDimitry Andric                                       bool IsA16, bool IsG16) {
42625ffd83dbSDimitry Andric   const LLT S16 = LLT::scalar(16);
4263fe6060f1SDimitry Andric   const LLT V2S16 = LLT::fixed_vector(2, 16);
4264fe6060f1SDimitry Andric   auto EndIdx = Intr->VAddrEnd;
42655ffd83dbSDimitry Andric 
4266e8d8bef9SDimitry Andric   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4267e8d8bef9SDimitry Andric     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
42685ffd83dbSDimitry Andric     if (!SrcOp.isReg())
42695ffd83dbSDimitry Andric       continue; // _L to _LZ may have eliminated this.
42705ffd83dbSDimitry Andric 
42715ffd83dbSDimitry Andric     Register AddrReg = SrcOp.getReg();
42725ffd83dbSDimitry Andric 
4273fe6060f1SDimitry Andric     if ((I < Intr->GradientStart) ||
4274fe6060f1SDimitry Andric         (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4275fe6060f1SDimitry Andric         (I >= Intr->CoordStart && !IsA16)) {
42760eae32dcSDimitry Andric       if ((I < Intr->GradientStart) && IsA16 &&
42770eae32dcSDimitry Andric           (B.getMRI()->getType(AddrReg) == S16)) {
4278*04eeddc0SDimitry Andric         assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
42790eae32dcSDimitry Andric         // Special handling of bias when A16 is on. Bias is of type half but
42800eae32dcSDimitry Andric         // occupies full 32-bit.
42810eae32dcSDimitry Andric         PackedAddrs.push_back(
42820eae32dcSDimitry Andric             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
42830eae32dcSDimitry Andric                 .getReg(0));
42840eae32dcSDimitry Andric       } else {
4285*04eeddc0SDimitry Andric         assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
4286*04eeddc0SDimitry Andric                "Bias needs to be converted to 16 bit in A16 mode");
4287*04eeddc0SDimitry Andric         // Handle any gradient or coordinate operands that should not be packed
42885ffd83dbSDimitry Andric         AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
42895ffd83dbSDimitry Andric         PackedAddrs.push_back(AddrReg);
42900eae32dcSDimitry Andric       }
42915ffd83dbSDimitry Andric     } else {
42925ffd83dbSDimitry Andric       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
42935ffd83dbSDimitry Andric       // derivatives dx/dh and dx/dv are packed with undef.
42945ffd83dbSDimitry Andric       if (((I + 1) >= EndIdx) ||
4295e8d8bef9SDimitry Andric           ((Intr->NumGradients / 2) % 2 == 1 &&
4296e8d8bef9SDimitry Andric            (I == static_cast<unsigned>(Intr->GradientStart +
4297e8d8bef9SDimitry Andric                                        (Intr->NumGradients / 2) - 1) ||
4298e8d8bef9SDimitry Andric             I == static_cast<unsigned>(Intr->GradientStart +
4299e8d8bef9SDimitry Andric                                        Intr->NumGradients - 1))) ||
43005ffd83dbSDimitry Andric           // Check for _L to _LZ optimization
4301e8d8bef9SDimitry Andric           !MI.getOperand(ArgOffset + I + 1).isReg()) {
43025ffd83dbSDimitry Andric         PackedAddrs.push_back(
43035ffd83dbSDimitry Andric             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
43045ffd83dbSDimitry Andric                 .getReg(0));
43055ffd83dbSDimitry Andric       } else {
43065ffd83dbSDimitry Andric         PackedAddrs.push_back(
4307e8d8bef9SDimitry Andric             B.buildBuildVector(
4308e8d8bef9SDimitry Andric                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
43095ffd83dbSDimitry Andric                 .getReg(0));
43105ffd83dbSDimitry Andric         ++I;
43115ffd83dbSDimitry Andric       }
43125ffd83dbSDimitry Andric     }
43135ffd83dbSDimitry Andric   }
43145ffd83dbSDimitry Andric }
43155ffd83dbSDimitry Andric 
43165ffd83dbSDimitry Andric /// Convert from separate vaddr components to a single vector address register,
43175ffd83dbSDimitry Andric /// and replace the remaining operands with $noreg.
43185ffd83dbSDimitry Andric static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
43195ffd83dbSDimitry Andric                                      int DimIdx, int NumVAddrs) {
43205ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
43215ffd83dbSDimitry Andric 
43225ffd83dbSDimitry Andric   SmallVector<Register, 8> AddrRegs;
43235ffd83dbSDimitry Andric   for (int I = 0; I != NumVAddrs; ++I) {
43245ffd83dbSDimitry Andric     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
43255ffd83dbSDimitry Andric     if (SrcOp.isReg()) {
43265ffd83dbSDimitry Andric       AddrRegs.push_back(SrcOp.getReg());
43275ffd83dbSDimitry Andric       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
43285ffd83dbSDimitry Andric     }
43295ffd83dbSDimitry Andric   }
43305ffd83dbSDimitry Andric 
43315ffd83dbSDimitry Andric   int NumAddrRegs = AddrRegs.size();
43325ffd83dbSDimitry Andric   if (NumAddrRegs != 1) {
4333fe6060f1SDimitry Andric     // Above 8 elements round up to next power of 2 (i.e. 16).
4334fe6060f1SDimitry Andric     if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
43355ffd83dbSDimitry Andric       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
43365ffd83dbSDimitry Andric       auto Undef = B.buildUndef(S32);
43375ffd83dbSDimitry Andric       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
43385ffd83dbSDimitry Andric       NumAddrRegs = RoundedNumRegs;
43395ffd83dbSDimitry Andric     }
43405ffd83dbSDimitry Andric 
4341fe6060f1SDimitry Andric     auto VAddr =
4342fe6060f1SDimitry Andric         B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
43435ffd83dbSDimitry Andric     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
43445ffd83dbSDimitry Andric   }
43455ffd83dbSDimitry Andric 
43465ffd83dbSDimitry Andric   for (int I = 1; I != NumVAddrs; ++I) {
43475ffd83dbSDimitry Andric     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
43485ffd83dbSDimitry Andric     if (SrcOp.isReg())
43495ffd83dbSDimitry Andric       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
43505ffd83dbSDimitry Andric   }
43515ffd83dbSDimitry Andric }
43525ffd83dbSDimitry Andric 
43535ffd83dbSDimitry Andric /// Rewrite image intrinsics to use register layouts expected by the subtarget.
43545ffd83dbSDimitry Andric ///
43555ffd83dbSDimitry Andric /// Depending on the subtarget, load/store with 16-bit element data need to be
43565ffd83dbSDimitry Andric /// rewritten to use the low half of 32-bit registers, or directly use a packed
43575ffd83dbSDimitry Andric /// layout. 16-bit addresses should also sometimes be packed into 32-bit
43585ffd83dbSDimitry Andric /// registers.
43595ffd83dbSDimitry Andric ///
43605ffd83dbSDimitry Andric /// We don't want to directly select image instructions just yet, but also want
43615ffd83dbSDimitry Andric /// to exposes all register repacking to the legalizer/combiners. We also don't
43625ffd83dbSDimitry Andric /// want a selected instrution entering RegBankSelect. In order to avoid
43635ffd83dbSDimitry Andric /// defining a multitude of intermediate image instructions, directly hack on
4364349cc55cSDimitry Andric /// the intrinsic's arguments. In cases like a16 addresses, this requires
4365349cc55cSDimitry Andric /// padding now unnecessary arguments with $noreg.
43665ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4367e8d8bef9SDimitry Andric     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4368e8d8bef9SDimitry Andric     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
43695ffd83dbSDimitry Andric 
4370e8d8bef9SDimitry Andric   const unsigned NumDefs = MI.getNumExplicitDefs();
4371e8d8bef9SDimitry Andric   const unsigned ArgOffset = NumDefs + 1;
43725ffd83dbSDimitry Andric   bool IsTFE = NumDefs == 2;
43735ffd83dbSDimitry Andric   // We are only processing the operands of d16 image operations on subtargets
43745ffd83dbSDimitry Andric   // that use the unpacked register layout, or need to repack the TFE result.
43755ffd83dbSDimitry Andric 
43765ffd83dbSDimitry Andric   // TODO: Do we need to guard against already legalized intrinsics?
43775ffd83dbSDimitry Andric   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4378e8d8bef9SDimitry Andric       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
43795ffd83dbSDimitry Andric 
43805ffd83dbSDimitry Andric   MachineRegisterInfo *MRI = B.getMRI();
43815ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
43825ffd83dbSDimitry Andric   const LLT S16 = LLT::scalar(16);
4383fe6060f1SDimitry Andric   const LLT V2S16 = LLT::fixed_vector(2, 16);
43845ffd83dbSDimitry Andric 
43855ffd83dbSDimitry Andric   unsigned DMask = 0;
4386*04eeddc0SDimitry Andric   Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
4387*04eeddc0SDimitry Andric   LLT Ty = MRI->getType(VData);
43885ffd83dbSDimitry Andric 
43895ffd83dbSDimitry Andric   // Check for 16 bit addresses and pack if true.
4390e8d8bef9SDimitry Andric   LLT GradTy =
4391e8d8bef9SDimitry Andric       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4392e8d8bef9SDimitry Andric   LLT AddrTy =
4393e8d8bef9SDimitry Andric       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
43945ffd83dbSDimitry Andric   const bool IsG16 = GradTy == S16;
43955ffd83dbSDimitry Andric   const bool IsA16 = AddrTy == S16;
4396*04eeddc0SDimitry Andric   const bool IsD16 = Ty.getScalarType() == S16;
43975ffd83dbSDimitry Andric 
43985ffd83dbSDimitry Andric   int DMaskLanes = 0;
43995ffd83dbSDimitry Andric   if (!BaseOpcode->Atomic) {
4400e8d8bef9SDimitry Andric     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
44015ffd83dbSDimitry Andric     if (BaseOpcode->Gather4) {
44025ffd83dbSDimitry Andric       DMaskLanes = 4;
44035ffd83dbSDimitry Andric     } else if (DMask != 0) {
44045ffd83dbSDimitry Andric       DMaskLanes = countPopulation(DMask);
44055ffd83dbSDimitry Andric     } else if (!IsTFE && !BaseOpcode->Store) {
44065ffd83dbSDimitry Andric       // If dmask is 0, this is a no-op load. This can be eliminated.
44075ffd83dbSDimitry Andric       B.buildUndef(MI.getOperand(0));
44085ffd83dbSDimitry Andric       MI.eraseFromParent();
44095ffd83dbSDimitry Andric       return true;
44105ffd83dbSDimitry Andric     }
44115ffd83dbSDimitry Andric   }
44125ffd83dbSDimitry Andric 
44135ffd83dbSDimitry Andric   Observer.changingInstr(MI);
44145ffd83dbSDimitry Andric   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
44155ffd83dbSDimitry Andric 
4416*04eeddc0SDimitry Andric   const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
4417*04eeddc0SDimitry Andric                                      : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
4418*04eeddc0SDimitry Andric   const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
4419*04eeddc0SDimitry Andric                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
4420*04eeddc0SDimitry Andric   unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;
44215ffd83dbSDimitry Andric 
44225ffd83dbSDimitry Andric   // Track that we legalized this
44235ffd83dbSDimitry Andric   MI.setDesc(B.getTII().get(NewOpcode));
44245ffd83dbSDimitry Andric 
44255ffd83dbSDimitry Andric   // Expecting to get an error flag since TFC is on - and dmask is 0 Force
44265ffd83dbSDimitry Andric   // dmask to be at least 1 otherwise the instruction will fail
44275ffd83dbSDimitry Andric   if (IsTFE && DMask == 0) {
44285ffd83dbSDimitry Andric     DMask = 0x1;
44295ffd83dbSDimitry Andric     DMaskLanes = 1;
4430e8d8bef9SDimitry Andric     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
44315ffd83dbSDimitry Andric   }
44325ffd83dbSDimitry Andric 
44335ffd83dbSDimitry Andric   if (BaseOpcode->Atomic) {
44345ffd83dbSDimitry Andric     Register VData0 = MI.getOperand(2).getReg();
44355ffd83dbSDimitry Andric     LLT Ty = MRI->getType(VData0);
44365ffd83dbSDimitry Andric 
44375ffd83dbSDimitry Andric     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
44385ffd83dbSDimitry Andric     if (Ty.isVector())
44395ffd83dbSDimitry Andric       return false;
44405ffd83dbSDimitry Andric 
44415ffd83dbSDimitry Andric     if (BaseOpcode->AtomicX2) {
44425ffd83dbSDimitry Andric       Register VData1 = MI.getOperand(3).getReg();
44435ffd83dbSDimitry Andric       // The two values are packed in one register.
4444fe6060f1SDimitry Andric       LLT PackedTy = LLT::fixed_vector(2, Ty);
44455ffd83dbSDimitry Andric       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
44465ffd83dbSDimitry Andric       MI.getOperand(2).setReg(Concat.getReg(0));
44475ffd83dbSDimitry Andric       MI.getOperand(3).setReg(AMDGPU::NoRegister);
44485ffd83dbSDimitry Andric     }
44495ffd83dbSDimitry Andric   }
44505ffd83dbSDimitry Andric 
4451e8d8bef9SDimitry Andric   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
44525ffd83dbSDimitry Andric 
44535ffd83dbSDimitry Andric   // Rewrite the addressing register layout before doing anything else.
4454fe6060f1SDimitry Andric   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
4455fe6060f1SDimitry Andric     // 16 bit gradients are supported, but are tied to the A16 control
4456fe6060f1SDimitry Andric     // so both gradients and addresses must be 16 bit
44575ffd83dbSDimitry Andric     return false;
4458fe6060f1SDimitry Andric   }
44595ffd83dbSDimitry Andric 
4460fe6060f1SDimitry Andric   if (IsA16 && !ST.hasA16()) {
4461fe6060f1SDimitry Andric     // A16 not supported
4462fe6060f1SDimitry Andric     return false;
4463fe6060f1SDimitry Andric   }
4464fe6060f1SDimitry Andric 
4465fe6060f1SDimitry Andric   if (IsA16 || IsG16) {
4466e8d8bef9SDimitry Andric     if (Intr->NumVAddrs > 1) {
44675ffd83dbSDimitry Andric       SmallVector<Register, 4> PackedRegs;
44685ffd83dbSDimitry Andric 
4469fe6060f1SDimitry Andric       packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4470fe6060f1SDimitry Andric                                 IsG16);
44715ffd83dbSDimitry Andric 
44725ffd83dbSDimitry Andric       // See also below in the non-a16 branch
4473fe6060f1SDimitry Andric       const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
4474fe6060f1SDimitry Andric                           PackedRegs.size() <= ST.getNSAMaxSize();
44755ffd83dbSDimitry Andric 
44765ffd83dbSDimitry Andric       if (!UseNSA && PackedRegs.size() > 1) {
4477fe6060f1SDimitry Andric         LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
44785ffd83dbSDimitry Andric         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
44795ffd83dbSDimitry Andric         PackedRegs[0] = Concat.getReg(0);
44805ffd83dbSDimitry Andric         PackedRegs.resize(1);
44815ffd83dbSDimitry Andric       }
44825ffd83dbSDimitry Andric 
4483e8d8bef9SDimitry Andric       const unsigned NumPacked = PackedRegs.size();
4484e8d8bef9SDimitry Andric       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4485e8d8bef9SDimitry Andric         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
44865ffd83dbSDimitry Andric         if (!SrcOp.isReg()) {
44875ffd83dbSDimitry Andric           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
44885ffd83dbSDimitry Andric           continue;
44895ffd83dbSDimitry Andric         }
44905ffd83dbSDimitry Andric 
44915ffd83dbSDimitry Andric         assert(SrcOp.getReg() != AMDGPU::NoRegister);
44925ffd83dbSDimitry Andric 
4493e8d8bef9SDimitry Andric         if (I - Intr->VAddrStart < NumPacked)
4494e8d8bef9SDimitry Andric           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
44955ffd83dbSDimitry Andric         else
44965ffd83dbSDimitry Andric           SrcOp.setReg(AMDGPU::NoRegister);
44975ffd83dbSDimitry Andric       }
44985ffd83dbSDimitry Andric     }
44995ffd83dbSDimitry Andric   } else {
45005ffd83dbSDimitry Andric     // If the register allocator cannot place the address registers contiguously
45015ffd83dbSDimitry Andric     // without introducing moves, then using the non-sequential address encoding
45025ffd83dbSDimitry Andric     // is always preferable, since it saves VALU instructions and is usually a
45035ffd83dbSDimitry Andric     // wash in terms of code size or even better.
45045ffd83dbSDimitry Andric     //
45055ffd83dbSDimitry Andric     // However, we currently have no way of hinting to the register allocator
45065ffd83dbSDimitry Andric     // that MIMG addresses should be placed contiguously when it is possible to
45075ffd83dbSDimitry Andric     // do so, so force non-NSA for the common 2-address case as a heuristic.
45085ffd83dbSDimitry Andric     //
45095ffd83dbSDimitry Andric     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
45105ffd83dbSDimitry Andric     // allocation when possible.
4511fe6060f1SDimitry Andric     const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
4512fe6060f1SDimitry Andric                         CorrectedNumVAddrs <= ST.getNSAMaxSize();
45135ffd83dbSDimitry Andric 
4514e8d8bef9SDimitry Andric     if (!UseNSA && Intr->NumVAddrs > 1)
4515e8d8bef9SDimitry Andric       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4516e8d8bef9SDimitry Andric                                Intr->NumVAddrs);
45175ffd83dbSDimitry Andric   }
45185ffd83dbSDimitry Andric 
45195ffd83dbSDimitry Andric   int Flags = 0;
45205ffd83dbSDimitry Andric   if (IsA16)
45215ffd83dbSDimitry Andric     Flags |= 1;
45225ffd83dbSDimitry Andric   if (IsG16)
45235ffd83dbSDimitry Andric     Flags |= 2;
45245ffd83dbSDimitry Andric   MI.addOperand(MachineOperand::CreateImm(Flags));
45255ffd83dbSDimitry Andric 
45265ffd83dbSDimitry Andric   if (BaseOpcode->Store) { // No TFE for stores?
45275ffd83dbSDimitry Andric     // TODO: Handle dmask trim
4528*04eeddc0SDimitry Andric     if (!Ty.isVector() || !IsD16)
45295ffd83dbSDimitry Andric       return true;
45305ffd83dbSDimitry Andric 
4531e8d8bef9SDimitry Andric     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
45325ffd83dbSDimitry Andric     if (RepackedReg != VData) {
45335ffd83dbSDimitry Andric       MI.getOperand(1).setReg(RepackedReg);
45345ffd83dbSDimitry Andric     }
45355ffd83dbSDimitry Andric 
45365ffd83dbSDimitry Andric     return true;
45375ffd83dbSDimitry Andric   }
45385ffd83dbSDimitry Andric 
45395ffd83dbSDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
45405ffd83dbSDimitry Andric   const LLT EltTy = Ty.getScalarType();
45415ffd83dbSDimitry Andric   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
45425ffd83dbSDimitry Andric 
45435ffd83dbSDimitry Andric   // Confirm that the return type is large enough for the dmask specified
45445ffd83dbSDimitry Andric   if (NumElts < DMaskLanes)
45455ffd83dbSDimitry Andric     return false;
45465ffd83dbSDimitry Andric 
45475ffd83dbSDimitry Andric   if (NumElts > 4 || DMaskLanes > 4)
45485ffd83dbSDimitry Andric     return false;
45495ffd83dbSDimitry Andric 
45505ffd83dbSDimitry Andric   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4551fe6060f1SDimitry Andric   const LLT AdjustedTy =
4552fe6060f1SDimitry Andric       Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
45535ffd83dbSDimitry Andric 
45545ffd83dbSDimitry Andric   // The raw dword aligned data component of the load. The only legal cases
45555ffd83dbSDimitry Andric   // where this matters should be when using the packed D16 format, for
45565ffd83dbSDimitry Andric   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
45575ffd83dbSDimitry Andric   LLT RoundedTy;
45585ffd83dbSDimitry Andric 
45595ffd83dbSDimitry Andric   // S32 vector to to cover all data, plus TFE result element.
45605ffd83dbSDimitry Andric   LLT TFETy;
45615ffd83dbSDimitry Andric 
45625ffd83dbSDimitry Andric   // Register type to use for each loaded component. Will be S32 or V2S16.
45635ffd83dbSDimitry Andric   LLT RegTy;
45645ffd83dbSDimitry Andric 
45655ffd83dbSDimitry Andric   if (IsD16 && ST.hasUnpackedD16VMem()) {
4566fe6060f1SDimitry Andric     RoundedTy =
4567fe6060f1SDimitry Andric         LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
4568fe6060f1SDimitry Andric     TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
45695ffd83dbSDimitry Andric     RegTy = S32;
45705ffd83dbSDimitry Andric   } else {
45715ffd83dbSDimitry Andric     unsigned EltSize = EltTy.getSizeInBits();
45725ffd83dbSDimitry Andric     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
45735ffd83dbSDimitry Andric     unsigned RoundedSize = 32 * RoundedElts;
4574fe6060f1SDimitry Andric     RoundedTy = LLT::scalarOrVector(
4575fe6060f1SDimitry Andric         ElementCount::getFixed(RoundedSize / EltSize), EltSize);
4576fe6060f1SDimitry Andric     TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
45775ffd83dbSDimitry Andric     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
45785ffd83dbSDimitry Andric   }
45795ffd83dbSDimitry Andric 
45805ffd83dbSDimitry Andric   // The return type does not need adjustment.
45815ffd83dbSDimitry Andric   // TODO: Should we change s16 case to s32 or <2 x s16>?
45825ffd83dbSDimitry Andric   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
45835ffd83dbSDimitry Andric     return true;
45845ffd83dbSDimitry Andric 
45855ffd83dbSDimitry Andric   Register Dst1Reg;
45865ffd83dbSDimitry Andric 
45875ffd83dbSDimitry Andric   // Insert after the instruction.
45885ffd83dbSDimitry Andric   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
45895ffd83dbSDimitry Andric 
45905ffd83dbSDimitry Andric   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
45915ffd83dbSDimitry Andric   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
45925ffd83dbSDimitry Andric   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
45935ffd83dbSDimitry Andric   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
45945ffd83dbSDimitry Andric 
45955ffd83dbSDimitry Andric   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
45965ffd83dbSDimitry Andric 
45975ffd83dbSDimitry Andric   MI.getOperand(0).setReg(NewResultReg);
45985ffd83dbSDimitry Andric 
45995ffd83dbSDimitry Andric   // In the IR, TFE is supposed to be used with a 2 element struct return
4600349cc55cSDimitry Andric   // type. The instruction really returns these two values in one contiguous
46015ffd83dbSDimitry Andric   // register, with one additional dword beyond the loaded data. Rewrite the
46025ffd83dbSDimitry Andric   // return type to use a single register result.
46035ffd83dbSDimitry Andric 
46045ffd83dbSDimitry Andric   if (IsTFE) {
46055ffd83dbSDimitry Andric     Dst1Reg = MI.getOperand(1).getReg();
46065ffd83dbSDimitry Andric     if (MRI->getType(Dst1Reg) != S32)
46075ffd83dbSDimitry Andric       return false;
46085ffd83dbSDimitry Andric 
46095ffd83dbSDimitry Andric     // TODO: Make sure the TFE operand bit is set.
46105ffd83dbSDimitry Andric     MI.RemoveOperand(1);
46115ffd83dbSDimitry Andric 
46125ffd83dbSDimitry Andric     // Handle the easy case that requires no repack instructions.
46135ffd83dbSDimitry Andric     if (Ty == S32) {
46145ffd83dbSDimitry Andric       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
46155ffd83dbSDimitry Andric       return true;
46165ffd83dbSDimitry Andric     }
46175ffd83dbSDimitry Andric   }
46185ffd83dbSDimitry Andric 
46195ffd83dbSDimitry Andric   // Now figure out how to copy the new result register back into the old
46205ffd83dbSDimitry Andric   // result.
46215ffd83dbSDimitry Andric   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
46225ffd83dbSDimitry Andric 
46235ffd83dbSDimitry Andric   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
46245ffd83dbSDimitry Andric 
46255ffd83dbSDimitry Andric   if (ResultNumRegs == 1) {
46265ffd83dbSDimitry Andric     assert(!IsTFE);
46275ffd83dbSDimitry Andric     ResultRegs[0] = NewResultReg;
46285ffd83dbSDimitry Andric   } else {
46295ffd83dbSDimitry Andric     // We have to repack into a new vector of some kind.
46305ffd83dbSDimitry Andric     for (int I = 0; I != NumDataRegs; ++I)
46315ffd83dbSDimitry Andric       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
46325ffd83dbSDimitry Andric     B.buildUnmerge(ResultRegs, NewResultReg);
46335ffd83dbSDimitry Andric 
46345ffd83dbSDimitry Andric     // Drop the final TFE element to get the data part. The TFE result is
46355ffd83dbSDimitry Andric     // directly written to the right place already.
46365ffd83dbSDimitry Andric     if (IsTFE)
46375ffd83dbSDimitry Andric       ResultRegs.resize(NumDataRegs);
46385ffd83dbSDimitry Andric   }
46395ffd83dbSDimitry Andric 
46405ffd83dbSDimitry Andric   // For an s16 scalar result, we form an s32 result with a truncate regardless
46415ffd83dbSDimitry Andric   // of packed vs. unpacked.
46425ffd83dbSDimitry Andric   if (IsD16 && !Ty.isVector()) {
46435ffd83dbSDimitry Andric     B.buildTrunc(DstReg, ResultRegs[0]);
46445ffd83dbSDimitry Andric     return true;
46455ffd83dbSDimitry Andric   }
46465ffd83dbSDimitry Andric 
46475ffd83dbSDimitry Andric   // Avoid a build/concat_vector of 1 entry.
46485ffd83dbSDimitry Andric   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
46495ffd83dbSDimitry Andric     B.buildBitcast(DstReg, ResultRegs[0]);
46505ffd83dbSDimitry Andric     return true;
46515ffd83dbSDimitry Andric   }
46525ffd83dbSDimitry Andric 
46535ffd83dbSDimitry Andric   assert(Ty.isVector());
46545ffd83dbSDimitry Andric 
46555ffd83dbSDimitry Andric   if (IsD16) {
46565ffd83dbSDimitry Andric     // For packed D16 results with TFE enabled, all the data components are
46575ffd83dbSDimitry Andric     // S32. Cast back to the expected type.
46585ffd83dbSDimitry Andric     //
46595ffd83dbSDimitry Andric     // TODO: We don't really need to use load s32 elements. We would only need one
46605ffd83dbSDimitry Andric     // cast for the TFE result if a multiple of v2s16 was used.
46615ffd83dbSDimitry Andric     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
46625ffd83dbSDimitry Andric       for (Register &Reg : ResultRegs)
46635ffd83dbSDimitry Andric         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
46645ffd83dbSDimitry Andric     } else if (ST.hasUnpackedD16VMem()) {
46655ffd83dbSDimitry Andric       for (Register &Reg : ResultRegs)
46665ffd83dbSDimitry Andric         Reg = B.buildTrunc(S16, Reg).getReg(0);
46675ffd83dbSDimitry Andric     }
46685ffd83dbSDimitry Andric   }
46695ffd83dbSDimitry Andric 
46705ffd83dbSDimitry Andric   auto padWithUndef = [&](LLT Ty, int NumElts) {
46715ffd83dbSDimitry Andric     if (NumElts == 0)
46725ffd83dbSDimitry Andric       return;
46735ffd83dbSDimitry Andric     Register Undef = B.buildUndef(Ty).getReg(0);
46745ffd83dbSDimitry Andric     for (int I = 0; I != NumElts; ++I)
46755ffd83dbSDimitry Andric       ResultRegs.push_back(Undef);
46765ffd83dbSDimitry Andric   };
46775ffd83dbSDimitry Andric 
46785ffd83dbSDimitry Andric   // Pad out any elements eliminated due to the dmask.
46795ffd83dbSDimitry Andric   LLT ResTy = MRI->getType(ResultRegs[0]);
46805ffd83dbSDimitry Andric   if (!ResTy.isVector()) {
46815ffd83dbSDimitry Andric     padWithUndef(ResTy, NumElts - ResultRegs.size());
46825ffd83dbSDimitry Andric     B.buildBuildVector(DstReg, ResultRegs);
46835ffd83dbSDimitry Andric     return true;
46845ffd83dbSDimitry Andric   }
46855ffd83dbSDimitry Andric 
46865ffd83dbSDimitry Andric   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
46875ffd83dbSDimitry Andric   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
46885ffd83dbSDimitry Andric 
46895ffd83dbSDimitry Andric   // Deal with the one annoying legal case.
4690fe6060f1SDimitry Andric   const LLT V3S16 = LLT::fixed_vector(3, 16);
46915ffd83dbSDimitry Andric   if (Ty == V3S16) {
46920eae32dcSDimitry Andric     if (IsTFE) {
46930eae32dcSDimitry Andric       if (ResultRegs.size() == 1) {
46940eae32dcSDimitry Andric         NewResultReg = ResultRegs[0];
46950eae32dcSDimitry Andric       } else if (ResultRegs.size() == 2) {
46960eae32dcSDimitry Andric         LLT V4S16 = LLT::fixed_vector(4, 16);
46970eae32dcSDimitry Andric         NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
46980eae32dcSDimitry Andric       } else {
46990eae32dcSDimitry Andric         return false;
47000eae32dcSDimitry Andric       }
47010eae32dcSDimitry Andric     }
47020eae32dcSDimitry Andric 
47030eae32dcSDimitry Andric     if (MRI->getType(DstReg).getNumElements() <
47040eae32dcSDimitry Andric         MRI->getType(NewResultReg).getNumElements()) {
47050eae32dcSDimitry Andric       B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
47060eae32dcSDimitry Andric     } else {
47070eae32dcSDimitry Andric       B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
47080eae32dcSDimitry Andric     }
47095ffd83dbSDimitry Andric     return true;
47105ffd83dbSDimitry Andric   }
47115ffd83dbSDimitry Andric 
47125ffd83dbSDimitry Andric   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
47135ffd83dbSDimitry Andric   B.buildConcatVectors(DstReg, ResultRegs);
47145ffd83dbSDimitry Andric   return true;
47155ffd83dbSDimitry Andric }
47165ffd83dbSDimitry Andric 
47175ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4718e8d8bef9SDimitry Andric   LegalizerHelper &Helper, MachineInstr &MI) const {
4719e8d8bef9SDimitry Andric   MachineIRBuilder &B = Helper.MIRBuilder;
4720e8d8bef9SDimitry Andric   GISelChangeObserver &Observer = Helper.Observer;
4721e8d8bef9SDimitry Andric 
47225ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
47235ffd83dbSDimitry Andric   LLT Ty = B.getMRI()->getType(Dst);
47245ffd83dbSDimitry Andric   unsigned Size = Ty.getSizeInBits();
47255ffd83dbSDimitry Andric   MachineFunction &MF = B.getMF();
47265ffd83dbSDimitry Andric 
47275ffd83dbSDimitry Andric   Observer.changingInstr(MI);
47285ffd83dbSDimitry Andric 
4729fe6060f1SDimitry Andric   if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
4730e8d8bef9SDimitry Andric     Ty = getBitcastRegisterType(Ty);
4731e8d8bef9SDimitry Andric     Helper.bitcastDst(MI, Ty, 0);
4732e8d8bef9SDimitry Andric     Dst = MI.getOperand(0).getReg();
4733e8d8bef9SDimitry Andric     B.setInsertPt(B.getMBB(), MI);
4734e8d8bef9SDimitry Andric   }
4735e8d8bef9SDimitry Andric 
47365ffd83dbSDimitry Andric   // FIXME: We don't really need this intermediate instruction. The intrinsic
47375ffd83dbSDimitry Andric   // should be fixed to have a memory operand. Since it's readnone, we're not
47385ffd83dbSDimitry Andric   // allowed to add one.
47395ffd83dbSDimitry Andric   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
47405ffd83dbSDimitry Andric   MI.RemoveOperand(1); // Remove intrinsic ID
47415ffd83dbSDimitry Andric 
47425ffd83dbSDimitry Andric   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
47435ffd83dbSDimitry Andric   // TODO: Should this use datalayout alignment?
47445ffd83dbSDimitry Andric   const unsigned MemSize = (Size + 7) / 8;
47455ffd83dbSDimitry Andric   const Align MemAlign(4);
47465ffd83dbSDimitry Andric   MachineMemOperand *MMO = MF.getMachineMemOperand(
47475ffd83dbSDimitry Andric       MachinePointerInfo(),
47485ffd83dbSDimitry Andric       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
47495ffd83dbSDimitry Andric           MachineMemOperand::MOInvariant,
47505ffd83dbSDimitry Andric       MemSize, MemAlign);
47515ffd83dbSDimitry Andric   MI.addMemOperand(MF, MMO);
47525ffd83dbSDimitry Andric 
47535ffd83dbSDimitry Andric   // There are no 96-bit result scalar loads, but widening to 128-bit should
47545ffd83dbSDimitry Andric   // always be legal. We may need to restore this to a 96-bit result if it turns
47555ffd83dbSDimitry Andric   // out this needs to be converted to a vector load during RegBankSelect.
47565ffd83dbSDimitry Andric   if (!isPowerOf2_32(Size)) {
47575ffd83dbSDimitry Andric     if (Ty.isVector())
47585ffd83dbSDimitry Andric       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
47595ffd83dbSDimitry Andric     else
47605ffd83dbSDimitry Andric       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
47615ffd83dbSDimitry Andric   }
47625ffd83dbSDimitry Andric 
47635ffd83dbSDimitry Andric   Observer.changedInstr(MI);
47645ffd83dbSDimitry Andric   return true;
47655ffd83dbSDimitry Andric }
47665ffd83dbSDimitry Andric 
4767e8d8bef9SDimitry Andric // TODO: Move to selection
47685ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
47690b57cec5SDimitry Andric                                                 MachineRegisterInfo &MRI,
47700b57cec5SDimitry Andric                                                 MachineIRBuilder &B) const {
4771fe6060f1SDimitry Andric   if (!ST.isTrapHandlerEnabled() ||
4772fe6060f1SDimitry Andric       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4773fe6060f1SDimitry Andric     return legalizeTrapEndpgm(MI, MRI, B);
4774fe6060f1SDimitry Andric 
4775fe6060f1SDimitry Andric   if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4776fe6060f1SDimitry Andric     switch (*HsaAbiVer) {
4777fe6060f1SDimitry Andric     case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4778fe6060f1SDimitry Andric     case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4779fe6060f1SDimitry Andric       return legalizeTrapHsaQueuePtr(MI, MRI, B);
4780fe6060f1SDimitry Andric     case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4781fe6060f1SDimitry Andric       return ST.supportsGetDoorbellID() ?
4782fe6060f1SDimitry Andric           legalizeTrapHsa(MI, MRI, B) :
4783fe6060f1SDimitry Andric           legalizeTrapHsaQueuePtr(MI, MRI, B);
4784fe6060f1SDimitry Andric     }
4785fe6060f1SDimitry Andric   }
4786fe6060f1SDimitry Andric 
4787fe6060f1SDimitry Andric   llvm_unreachable("Unknown trap handler");
4788fe6060f1SDimitry Andric }
4789fe6060f1SDimitry Andric 
4790fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4791fe6060f1SDimitry Andric     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
47925ffd83dbSDimitry Andric   B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4793fe6060f1SDimitry Andric   MI.eraseFromParent();
4794fe6060f1SDimitry Andric   return true;
4795fe6060f1SDimitry Andric }
4796fe6060f1SDimitry Andric 
4797fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4798fe6060f1SDimitry Andric     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
47995ffd83dbSDimitry Andric   // Pass queue pointer to trap handler as input, and insert trap instruction
48005ffd83dbSDimitry Andric   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4801e8d8bef9SDimitry Andric   Register LiveIn =
4802e8d8bef9SDimitry Andric     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4803e8d8bef9SDimitry Andric   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
48045ffd83dbSDimitry Andric     return false;
4805e8d8bef9SDimitry Andric 
4806e8d8bef9SDimitry Andric   Register SGPR01(AMDGPU::SGPR0_SGPR1);
48075ffd83dbSDimitry Andric   B.buildCopy(SGPR01, LiveIn);
48085ffd83dbSDimitry Andric   B.buildInstr(AMDGPU::S_TRAP)
4809fe6060f1SDimitry Andric       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
48105ffd83dbSDimitry Andric       .addReg(SGPR01, RegState::Implicit);
4811fe6060f1SDimitry Andric 
4812fe6060f1SDimitry Andric   MI.eraseFromParent();
4813fe6060f1SDimitry Andric   return true;
48145ffd83dbSDimitry Andric }
48155ffd83dbSDimitry Andric 
4816fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeTrapHsa(
4817fe6060f1SDimitry Andric     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4818fe6060f1SDimitry Andric   B.buildInstr(AMDGPU::S_TRAP)
4819fe6060f1SDimitry Andric       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
48205ffd83dbSDimitry Andric   MI.eraseFromParent();
48215ffd83dbSDimitry Andric   return true;
48225ffd83dbSDimitry Andric }
48235ffd83dbSDimitry Andric 
48245ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
48255ffd83dbSDimitry Andric     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4826349cc55cSDimitry Andric   // Is non-HSA path or trap-handler disabled? Then, report a warning
48275ffd83dbSDimitry Andric   // accordingly
4828fe6060f1SDimitry Andric   if (!ST.isTrapHandlerEnabled() ||
4829fe6060f1SDimitry Andric       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
48305ffd83dbSDimitry Andric     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
48315ffd83dbSDimitry Andric                                      "debugtrap handler not supported",
48325ffd83dbSDimitry Andric                                      MI.getDebugLoc(), DS_Warning);
48335ffd83dbSDimitry Andric     LLVMContext &Ctx = B.getMF().getFunction().getContext();
48345ffd83dbSDimitry Andric     Ctx.diagnose(NoTrap);
48355ffd83dbSDimitry Andric   } else {
48365ffd83dbSDimitry Andric     // Insert debug-trap instruction
4837fe6060f1SDimitry Andric     B.buildInstr(AMDGPU::S_TRAP)
4838fe6060f1SDimitry Andric         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
48395ffd83dbSDimitry Andric   }
48405ffd83dbSDimitry Andric 
48415ffd83dbSDimitry Andric   MI.eraseFromParent();
48425ffd83dbSDimitry Andric   return true;
48435ffd83dbSDimitry Andric }
48445ffd83dbSDimitry Andric 
4845e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4846e8d8bef9SDimitry Andric                                                MachineIRBuilder &B) const {
4847e8d8bef9SDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
4848e8d8bef9SDimitry Andric   const LLT S16 = LLT::scalar(16);
4849e8d8bef9SDimitry Andric   const LLT S32 = LLT::scalar(32);
4850e8d8bef9SDimitry Andric 
4851e8d8bef9SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
4852e8d8bef9SDimitry Andric   Register NodePtr = MI.getOperand(2).getReg();
4853e8d8bef9SDimitry Andric   Register RayExtent = MI.getOperand(3).getReg();
4854e8d8bef9SDimitry Andric   Register RayOrigin = MI.getOperand(4).getReg();
4855e8d8bef9SDimitry Andric   Register RayDir = MI.getOperand(5).getReg();
4856e8d8bef9SDimitry Andric   Register RayInvDir = MI.getOperand(6).getReg();
4857e8d8bef9SDimitry Andric   Register TDescr = MI.getOperand(7).getReg();
4858e8d8bef9SDimitry Andric 
4859fe6060f1SDimitry Andric   if (!ST.hasGFX10_AEncoding()) {
4860fe6060f1SDimitry Andric     DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
4861fe6060f1SDimitry Andric                                         "intrinsic not supported on subtarget",
4862fe6060f1SDimitry Andric                                         MI.getDebugLoc());
4863fe6060f1SDimitry Andric     B.getMF().getFunction().getContext().diagnose(BadIntrin);
4864fe6060f1SDimitry Andric     return false;
4865fe6060f1SDimitry Andric   }
4866fe6060f1SDimitry Andric 
4867349cc55cSDimitry Andric   const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4868349cc55cSDimitry Andric   const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
4869349cc55cSDimitry Andric   const unsigned NumVDataDwords = 4;
4870349cc55cSDimitry Andric   const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
4871349cc55cSDimitry Andric   const bool UseNSA =
4872349cc55cSDimitry Andric       ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize();
4873349cc55cSDimitry Andric   const unsigned BaseOpcodes[2][2] = {
4874349cc55cSDimitry Andric       {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
4875349cc55cSDimitry Andric       {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
4876349cc55cSDimitry Andric        AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
4877349cc55cSDimitry Andric   int Opcode;
4878349cc55cSDimitry Andric   if (UseNSA) {
4879349cc55cSDimitry Andric     Opcode =
4880349cc55cSDimitry Andric         AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA,
4881349cc55cSDimitry Andric                               NumVDataDwords, NumVAddrDwords);
4882349cc55cSDimitry Andric   } else {
4883349cc55cSDimitry Andric     Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
4884349cc55cSDimitry Andric                                    AMDGPU::MIMGEncGfx10Default, NumVDataDwords,
4885349cc55cSDimitry Andric                                    PowerOf2Ceil(NumVAddrDwords));
4886349cc55cSDimitry Andric   }
4887349cc55cSDimitry Andric   assert(Opcode != -1);
4888e8d8bef9SDimitry Andric 
4889e8d8bef9SDimitry Andric   SmallVector<Register, 12> Ops;
4890e8d8bef9SDimitry Andric   if (Is64) {
4891e8d8bef9SDimitry Andric     auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4892e8d8bef9SDimitry Andric     Ops.push_back(Unmerge.getReg(0));
4893e8d8bef9SDimitry Andric     Ops.push_back(Unmerge.getReg(1));
4894e8d8bef9SDimitry Andric   } else {
4895e8d8bef9SDimitry Andric     Ops.push_back(NodePtr);
4896e8d8bef9SDimitry Andric   }
4897e8d8bef9SDimitry Andric   Ops.push_back(RayExtent);
4898e8d8bef9SDimitry Andric 
4899e8d8bef9SDimitry Andric   auto packLanes = [&Ops, &S32, &B](Register Src) {
49000eae32dcSDimitry Andric     auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
4901e8d8bef9SDimitry Andric     Ops.push_back(Unmerge.getReg(0));
4902e8d8bef9SDimitry Andric     Ops.push_back(Unmerge.getReg(1));
4903e8d8bef9SDimitry Andric     Ops.push_back(Unmerge.getReg(2));
4904e8d8bef9SDimitry Andric   };
4905e8d8bef9SDimitry Andric 
4906e8d8bef9SDimitry Andric   packLanes(RayOrigin);
4907e8d8bef9SDimitry Andric   if (IsA16) {
49080eae32dcSDimitry Andric     auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
49090eae32dcSDimitry Andric     auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
4910e8d8bef9SDimitry Andric     Register R1 = MRI.createGenericVirtualRegister(S32);
4911e8d8bef9SDimitry Andric     Register R2 = MRI.createGenericVirtualRegister(S32);
4912e8d8bef9SDimitry Andric     Register R3 = MRI.createGenericVirtualRegister(S32);
4913e8d8bef9SDimitry Andric     B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4914e8d8bef9SDimitry Andric     B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4915e8d8bef9SDimitry Andric     B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4916e8d8bef9SDimitry Andric     Ops.push_back(R1);
4917e8d8bef9SDimitry Andric     Ops.push_back(R2);
4918e8d8bef9SDimitry Andric     Ops.push_back(R3);
4919e8d8bef9SDimitry Andric   } else {
4920e8d8bef9SDimitry Andric     packLanes(RayDir);
4921e8d8bef9SDimitry Andric     packLanes(RayInvDir);
4922e8d8bef9SDimitry Andric   }
4923e8d8bef9SDimitry Andric 
4924349cc55cSDimitry Andric   if (!UseNSA) {
4925349cc55cSDimitry Andric     // Build a single vector containing all the operands so far prepared.
4926349cc55cSDimitry Andric     LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
4927349cc55cSDimitry Andric     Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);
4928349cc55cSDimitry Andric     Ops.clear();
4929349cc55cSDimitry Andric     Ops.push_back(MergedOps);
4930349cc55cSDimitry Andric   }
4931349cc55cSDimitry Andric 
4932e8d8bef9SDimitry Andric   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4933e8d8bef9SDimitry Andric     .addDef(DstReg)
4934e8d8bef9SDimitry Andric     .addImm(Opcode);
4935e8d8bef9SDimitry Andric 
4936e8d8bef9SDimitry Andric   for (Register R : Ops) {
4937e8d8bef9SDimitry Andric     MIB.addUse(R);
4938e8d8bef9SDimitry Andric   }
4939e8d8bef9SDimitry Andric 
4940e8d8bef9SDimitry Andric   MIB.addUse(TDescr)
4941e8d8bef9SDimitry Andric      .addImm(IsA16 ? 1 : 0)
4942e8d8bef9SDimitry Andric      .cloneMemRefs(MI);
4943e8d8bef9SDimitry Andric 
4944e8d8bef9SDimitry Andric   MI.eraseFromParent();
4945e8d8bef9SDimitry Andric   return true;
4946e8d8bef9SDimitry Andric }
4947e8d8bef9SDimitry Andric 
4948*04eeddc0SDimitry Andric static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C) {
4949*04eeddc0SDimitry Andric   B.buildConstant(MI.getOperand(0).getReg(), C);
4950*04eeddc0SDimitry Andric   MI.eraseFromParent();
4951*04eeddc0SDimitry Andric   return true;
4952*04eeddc0SDimitry Andric }
4953*04eeddc0SDimitry Andric 
49545ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
49555ffd83dbSDimitry Andric                                             MachineInstr &MI) const {
49565ffd83dbSDimitry Andric   MachineIRBuilder &B = Helper.MIRBuilder;
49575ffd83dbSDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
49585ffd83dbSDimitry Andric 
49590b57cec5SDimitry Andric   // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
4960480093f4SDimitry Andric   auto IntrID = MI.getIntrinsicID();
4961480093f4SDimitry Andric   switch (IntrID) {
4962480093f4SDimitry Andric   case Intrinsic::amdgcn_if:
4963480093f4SDimitry Andric   case Intrinsic::amdgcn_else: {
4964480093f4SDimitry Andric     MachineInstr *Br = nullptr;
49655ffd83dbSDimitry Andric     MachineBasicBlock *UncondBrTarget = nullptr;
4966e8d8bef9SDimitry Andric     bool Negated = false;
4967e8d8bef9SDimitry Andric     if (MachineInstr *BrCond =
4968e8d8bef9SDimitry Andric             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
49690b57cec5SDimitry Andric       const SIRegisterInfo *TRI
49700b57cec5SDimitry Andric         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
49710b57cec5SDimitry Andric 
49720b57cec5SDimitry Andric       Register Def = MI.getOperand(1).getReg();
49730b57cec5SDimitry Andric       Register Use = MI.getOperand(3).getReg();
4974480093f4SDimitry Andric 
49755ffd83dbSDimitry Andric       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4976e8d8bef9SDimitry Andric 
4977e8d8bef9SDimitry Andric       if (Negated)
4978e8d8bef9SDimitry Andric         std::swap(CondBrTarget, UncondBrTarget);
4979e8d8bef9SDimitry Andric 
49805ffd83dbSDimitry Andric       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4981480093f4SDimitry Andric       if (IntrID == Intrinsic::amdgcn_if) {
49820b57cec5SDimitry Andric         B.buildInstr(AMDGPU::SI_IF)
49830b57cec5SDimitry Andric           .addDef(Def)
49840b57cec5SDimitry Andric           .addUse(Use)
49855ffd83dbSDimitry Andric           .addMBB(UncondBrTarget);
4986480093f4SDimitry Andric       } else {
4987480093f4SDimitry Andric         B.buildInstr(AMDGPU::SI_ELSE)
4988480093f4SDimitry Andric             .addDef(Def)
4989480093f4SDimitry Andric             .addUse(Use)
4990e8d8bef9SDimitry Andric             .addMBB(UncondBrTarget);
4991480093f4SDimitry Andric       }
4992480093f4SDimitry Andric 
49935ffd83dbSDimitry Andric       if (Br) {
49945ffd83dbSDimitry Andric         Br->getOperand(0).setMBB(CondBrTarget);
49955ffd83dbSDimitry Andric       } else {
49965ffd83dbSDimitry Andric         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
49975ffd83dbSDimitry Andric         // since we're swapping branch targets it needs to be reinserted.
49985ffd83dbSDimitry Andric         // FIXME: IRTranslator should probably not do this
49995ffd83dbSDimitry Andric         B.buildBr(*CondBrTarget);
50005ffd83dbSDimitry Andric       }
50010b57cec5SDimitry Andric 
50020b57cec5SDimitry Andric       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
50030b57cec5SDimitry Andric       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
50040b57cec5SDimitry Andric       MI.eraseFromParent();
50050b57cec5SDimitry Andric       BrCond->eraseFromParent();
50060b57cec5SDimitry Andric       return true;
50070b57cec5SDimitry Andric     }
50080b57cec5SDimitry Andric 
50090b57cec5SDimitry Andric     return false;
50100b57cec5SDimitry Andric   }
50110b57cec5SDimitry Andric   case Intrinsic::amdgcn_loop: {
5012480093f4SDimitry Andric     MachineInstr *Br = nullptr;
50135ffd83dbSDimitry Andric     MachineBasicBlock *UncondBrTarget = nullptr;
5014e8d8bef9SDimitry Andric     bool Negated = false;
5015e8d8bef9SDimitry Andric     if (MachineInstr *BrCond =
5016e8d8bef9SDimitry Andric             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
50170b57cec5SDimitry Andric       const SIRegisterInfo *TRI
50180b57cec5SDimitry Andric         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
50190b57cec5SDimitry Andric 
50205ffd83dbSDimitry Andric       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
50210b57cec5SDimitry Andric       Register Reg = MI.getOperand(2).getReg();
50225ffd83dbSDimitry Andric 
5023e8d8bef9SDimitry Andric       if (Negated)
5024e8d8bef9SDimitry Andric         std::swap(CondBrTarget, UncondBrTarget);
5025e8d8bef9SDimitry Andric 
50265ffd83dbSDimitry Andric       B.setInsertPt(B.getMBB(), BrCond->getIterator());
50270b57cec5SDimitry Andric       B.buildInstr(AMDGPU::SI_LOOP)
50280b57cec5SDimitry Andric         .addUse(Reg)
50295ffd83dbSDimitry Andric         .addMBB(UncondBrTarget);
50305ffd83dbSDimitry Andric 
50315ffd83dbSDimitry Andric       if (Br)
50325ffd83dbSDimitry Andric         Br->getOperand(0).setMBB(CondBrTarget);
50335ffd83dbSDimitry Andric       else
50345ffd83dbSDimitry Andric         B.buildBr(*CondBrTarget);
50355ffd83dbSDimitry Andric 
50360b57cec5SDimitry Andric       MI.eraseFromParent();
50370b57cec5SDimitry Andric       BrCond->eraseFromParent();
50380b57cec5SDimitry Andric       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
50390b57cec5SDimitry Andric       return true;
50400b57cec5SDimitry Andric     }
50410b57cec5SDimitry Andric 
50420b57cec5SDimitry Andric     return false;
50430b57cec5SDimitry Andric   }
50440b57cec5SDimitry Andric   case Intrinsic::amdgcn_kernarg_segment_ptr:
50455ffd83dbSDimitry Andric     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
50465ffd83dbSDimitry Andric       // This only makes sense to call in a kernel, so just lower to null.
50475ffd83dbSDimitry Andric       B.buildConstant(MI.getOperand(0).getReg(), 0);
50485ffd83dbSDimitry Andric       MI.eraseFromParent();
50495ffd83dbSDimitry Andric       return true;
50505ffd83dbSDimitry Andric     }
50515ffd83dbSDimitry Andric 
50520b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(
50530b57cec5SDimitry Andric       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
50540b57cec5SDimitry Andric   case Intrinsic::amdgcn_implicitarg_ptr:
50550b57cec5SDimitry Andric     return legalizeImplicitArgPtr(MI, MRI, B);
50560b57cec5SDimitry Andric   case Intrinsic::amdgcn_workitem_id_x:
5057*04eeddc0SDimitry Andric     if (ST.getMaxWorkitemID(B.getMF().getFunction(), 0) == 0)
5058*04eeddc0SDimitry Andric       return replaceWithConstant(B, MI, 0);
50590b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
50600b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
50610b57cec5SDimitry Andric   case Intrinsic::amdgcn_workitem_id_y:
5062*04eeddc0SDimitry Andric     if (ST.getMaxWorkitemID(B.getMF().getFunction(), 1) == 0)
5063*04eeddc0SDimitry Andric       return replaceWithConstant(B, MI, 0);
5064*04eeddc0SDimitry Andric 
50650b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
50660b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
50670b57cec5SDimitry Andric   case Intrinsic::amdgcn_workitem_id_z:
5068*04eeddc0SDimitry Andric     if (ST.getMaxWorkitemID(B.getMF().getFunction(), 2) == 0)
5069*04eeddc0SDimitry Andric       return replaceWithConstant(B, MI, 0);
5070*04eeddc0SDimitry Andric 
50710b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
50720b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
50730b57cec5SDimitry Andric   case Intrinsic::amdgcn_workgroup_id_x:
50740b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
50750b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
50760b57cec5SDimitry Andric   case Intrinsic::amdgcn_workgroup_id_y:
50770b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
50780b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
50790b57cec5SDimitry Andric   case Intrinsic::amdgcn_workgroup_id_z:
50800b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
50810b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
50820b57cec5SDimitry Andric   case Intrinsic::amdgcn_dispatch_ptr:
50830b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
50840b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
50850b57cec5SDimitry Andric   case Intrinsic::amdgcn_queue_ptr:
50860b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
50870b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
50880b57cec5SDimitry Andric   case Intrinsic::amdgcn_implicit_buffer_ptr:
50890b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(
50900b57cec5SDimitry Andric       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
50910b57cec5SDimitry Andric   case Intrinsic::amdgcn_dispatch_id:
50920b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
50930b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
50948bcb0991SDimitry Andric   case Intrinsic::amdgcn_fdiv_fast:
50958bcb0991SDimitry Andric     return legalizeFDIVFastIntrin(MI, MRI, B);
50968bcb0991SDimitry Andric   case Intrinsic::amdgcn_is_shared:
50978bcb0991SDimitry Andric     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
50988bcb0991SDimitry Andric   case Intrinsic::amdgcn_is_private:
50998bcb0991SDimitry Andric     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
51008bcb0991SDimitry Andric   case Intrinsic::amdgcn_wavefrontsize: {
51018bcb0991SDimitry Andric     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
51028bcb0991SDimitry Andric     MI.eraseFromParent();
51038bcb0991SDimitry Andric     return true;
51048bcb0991SDimitry Andric   }
51055ffd83dbSDimitry Andric   case Intrinsic::amdgcn_s_buffer_load:
5106e8d8bef9SDimitry Andric     return legalizeSBufferLoad(Helper, MI);
51078bcb0991SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_store:
51085ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_store:
51095ffd83dbSDimitry Andric     return legalizeBufferStore(MI, MRI, B, false, false);
51108bcb0991SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_store_format:
51115ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_store_format:
51125ffd83dbSDimitry Andric     return legalizeBufferStore(MI, MRI, B, false, true);
51135ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_tbuffer_store:
51145ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_tbuffer_store:
51155ffd83dbSDimitry Andric     return legalizeBufferStore(MI, MRI, B, true, true);
51165ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_load:
51175ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_load:
51185ffd83dbSDimitry Andric     return legalizeBufferLoad(MI, MRI, B, false, false);
51195ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_load_format:
51205ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_load_format:
51215ffd83dbSDimitry Andric     return legalizeBufferLoad(MI, MRI, B, true, false);
51225ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_tbuffer_load:
51235ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_tbuffer_load:
51245ffd83dbSDimitry Andric     return legalizeBufferLoad(MI, MRI, B, true, true);
51255ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
51265ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
51275ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_add:
51285ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_add:
51295ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
51305ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
51315ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
51325ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
51335ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
51345ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
51355ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
51365ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
51375ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
51385ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
51395ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_and:
51405ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_and:
51415ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_or:
51425ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_or:
51435ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
51445ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
51455ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
51465ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
51475ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
51485ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
51495ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
51505ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
5151fe6060f1SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
5152fe6060f1SDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
5153fe6060f1SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
5154fe6060f1SDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
51555ffd83dbSDimitry Andric     return legalizeBufferAtomic(MI, B, IntrID);
5156*04eeddc0SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
5157*04eeddc0SDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_fadd: {
5158*04eeddc0SDimitry Andric     Register DstReg = MI.getOperand(0).getReg();
5159*04eeddc0SDimitry Andric     if (!MRI.use_empty(DstReg) && !ST.hasGFX90AInsts()) {
5160*04eeddc0SDimitry Andric       Function &F = B.getMF().getFunction();
5161*04eeddc0SDimitry Andric       DiagnosticInfoUnsupported NoFpRet(
5162*04eeddc0SDimitry Andric           F, "return versions of fp atomics not supported", B.getDebugLoc(),
5163*04eeddc0SDimitry Andric           DS_Error);
5164*04eeddc0SDimitry Andric       F.getContext().diagnose(NoFpRet);
5165*04eeddc0SDimitry Andric       B.buildUndef(DstReg);
5166*04eeddc0SDimitry Andric       MI.eraseFromParent();
5167*04eeddc0SDimitry Andric       return true;
5168*04eeddc0SDimitry Andric     }
5169*04eeddc0SDimitry Andric 
5170*04eeddc0SDimitry Andric     return legalizeBufferAtomic(MI, B, IntrID);
5171*04eeddc0SDimitry Andric   }
51725ffd83dbSDimitry Andric   case Intrinsic::amdgcn_atomic_inc:
51735ffd83dbSDimitry Andric     return legalizeAtomicIncDec(MI, B, true);
51745ffd83dbSDimitry Andric   case Intrinsic::amdgcn_atomic_dec:
51755ffd83dbSDimitry Andric     return legalizeAtomicIncDec(MI, B, false);
51765ffd83dbSDimitry Andric   case Intrinsic::trap:
51775ffd83dbSDimitry Andric     return legalizeTrapIntrinsic(MI, MRI, B);
51785ffd83dbSDimitry Andric   case Intrinsic::debugtrap:
51795ffd83dbSDimitry Andric     return legalizeDebugTrapIntrinsic(MI, MRI, B);
5180e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_rsq_clamp:
5181e8d8bef9SDimitry Andric     return legalizeRsqClampIntrinsic(MI, MRI, B);
5182e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_ds_fadd:
5183e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_ds_fmin:
5184e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_ds_fmax:
5185e8d8bef9SDimitry Andric     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
5186e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_image_bvh_intersect_ray:
5187e8d8bef9SDimitry Andric     return legalizeBVHIntrinsic(MI, B);
51885ffd83dbSDimitry Andric   default: {
51895ffd83dbSDimitry Andric     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
51905ffd83dbSDimitry Andric             AMDGPU::getImageDimIntrinsicInfo(IntrID))
51915ffd83dbSDimitry Andric       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
51920b57cec5SDimitry Andric     return true;
51930b57cec5SDimitry Andric   }
51945ffd83dbSDimitry Andric   }
51950b57cec5SDimitry Andric 
51960b57cec5SDimitry Andric   return true;
51970b57cec5SDimitry Andric }
5198