xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (revision 349cc55c9796c4596a5b9904cd3281af295f878f)
10b57cec5SDimitry Andric //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
20b57cec5SDimitry Andric //
30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
60b57cec5SDimitry Andric //
70b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
80b57cec5SDimitry Andric /// \file
90b57cec5SDimitry Andric /// This file implements the targeting of the Machinelegalizer class for
100b57cec5SDimitry Andric /// AMDGPU.
110b57cec5SDimitry Andric /// \todo This should be generated by TableGen.
120b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
130b57cec5SDimitry Andric 
145ffd83dbSDimitry Andric #include "AMDGPULegalizerInfo.h"
158bcb0991SDimitry Andric 
160b57cec5SDimitry Andric #include "AMDGPU.h"
175ffd83dbSDimitry Andric #include "AMDGPUGlobalISelUtils.h"
18e8d8bef9SDimitry Andric #include "AMDGPUInstrInfo.h"
190b57cec5SDimitry Andric #include "AMDGPUTargetMachine.h"
200b57cec5SDimitry Andric #include "SIMachineFunctionInfo.h"
21fe6060f1SDimitry Andric #include "Utils/AMDGPUBaseInfo.h"
225ffd83dbSDimitry Andric #include "llvm/ADT/ScopeExit.h"
23fe6060f1SDimitry Andric #include "llvm/BinaryFormat/ELF.h"
240b57cec5SDimitry Andric #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
255ffd83dbSDimitry Andric #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26e8d8bef9SDimitry Andric #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
278bcb0991SDimitry Andric #include "llvm/IR/DiagnosticInfo.h"
28e8d8bef9SDimitry Andric #include "llvm/IR/IntrinsicsAMDGPU.h"
290b57cec5SDimitry Andric 
300b57cec5SDimitry Andric #define DEBUG_TYPE "amdgpu-legalinfo"
310b57cec5SDimitry Andric 
320b57cec5SDimitry Andric using namespace llvm;
330b57cec5SDimitry Andric using namespace LegalizeActions;
340b57cec5SDimitry Andric using namespace LegalizeMutations;
350b57cec5SDimitry Andric using namespace LegalityPredicates;
365ffd83dbSDimitry Andric using namespace MIPatternMatch;
370b57cec5SDimitry Andric 
385ffd83dbSDimitry Andric // Hack until load/store selection patterns support any tuple of legal types.
395ffd83dbSDimitry Andric static cl::opt<bool> EnableNewLegality(
405ffd83dbSDimitry Andric   "amdgpu-global-isel-new-legality",
415ffd83dbSDimitry Andric   cl::desc("Use GlobalISel desired legality, rather than try to use"
425ffd83dbSDimitry Andric            "rules compatible with selection patterns"),
435ffd83dbSDimitry Andric   cl::init(false),
445ffd83dbSDimitry Andric   cl::ReallyHidden);
450b57cec5SDimitry Andric 
// Largest register-sized value in bits the legalizer will produce (32 dwords).
static constexpr unsigned MaxRegisterSize = 1024;
475ffd83dbSDimitry Andric 
485ffd83dbSDimitry Andric // Round the number of elements to the next power of two elements
495ffd83dbSDimitry Andric static LLT getPow2VectorType(LLT Ty) {
505ffd83dbSDimitry Andric   unsigned NElts = Ty.getNumElements();
515ffd83dbSDimitry Andric   unsigned Pow2NElts = 1 <<  Log2_32_Ceil(NElts);
52fe6060f1SDimitry Andric   return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
530b57cec5SDimitry Andric }
540b57cec5SDimitry Andric 
555ffd83dbSDimitry Andric // Round the number of bits to the next power of two bits
565ffd83dbSDimitry Andric static LLT getPow2ScalarType(LLT Ty) {
575ffd83dbSDimitry Andric   unsigned Bits = Ty.getSizeInBits();
585ffd83dbSDimitry Andric   unsigned Pow2Bits = 1 <<  Log2_32_Ceil(Bits);
595ffd83dbSDimitry Andric   return LLT::scalar(Pow2Bits);
608bcb0991SDimitry Andric }
618bcb0991SDimitry Andric 
62*349cc55cSDimitry Andric /// \returns true if this is an odd sized vector which should widen by adding an
63e8d8bef9SDimitry Andric /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
64e8d8bef9SDimitry Andric /// excludes s1 vectors, which should always be scalarized.
650b57cec5SDimitry Andric static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
660b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
670b57cec5SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
68e8d8bef9SDimitry Andric     if (!Ty.isVector())
69e8d8bef9SDimitry Andric       return false;
70e8d8bef9SDimitry Andric 
71e8d8bef9SDimitry Andric     const LLT EltTy = Ty.getElementType();
72e8d8bef9SDimitry Andric     const unsigned EltSize = EltTy.getSizeInBits();
73e8d8bef9SDimitry Andric     return Ty.getNumElements() % 2 != 0 &&
74e8d8bef9SDimitry Andric            EltSize > 1 && EltSize < 32 &&
758bcb0991SDimitry Andric            Ty.getSizeInBits() % 32 != 0;
768bcb0991SDimitry Andric   };
778bcb0991SDimitry Andric }
788bcb0991SDimitry Andric 
79e8d8bef9SDimitry Andric static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80e8d8bef9SDimitry Andric   return [=](const LegalityQuery &Query) {
81e8d8bef9SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
82e8d8bef9SDimitry Andric     return Ty.getSizeInBits() % 32 == 0;
83e8d8bef9SDimitry Andric   };
84e8d8bef9SDimitry Andric }
85e8d8bef9SDimitry Andric 
868bcb0991SDimitry Andric static LegalityPredicate isWideVec16(unsigned TypeIdx) {
878bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
888bcb0991SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
898bcb0991SDimitry Andric     const LLT EltTy = Ty.getScalarType();
908bcb0991SDimitry Andric     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
910b57cec5SDimitry Andric   };
920b57cec5SDimitry Andric }
930b57cec5SDimitry Andric 
940b57cec5SDimitry Andric static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
950b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
960b57cec5SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
970b57cec5SDimitry Andric     const LLT EltTy = Ty.getElementType();
98fe6060f1SDimitry Andric     return std::make_pair(TypeIdx,
99fe6060f1SDimitry Andric                           LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
1000b57cec5SDimitry Andric   };
1010b57cec5SDimitry Andric }
1020b57cec5SDimitry Andric 
// Mutation: reduce the number of elements so each resulting piece is at most
// 64 bits.
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    // Number of 64-bit pieces needed to cover the full vector, rounded up.
    unsigned Pieces = (Size + 63) / 64;
    // Elements per piece; the +1 rounds up when splitting in half.
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    // scalarOrVector: degenerates to a scalar when one element remains.
    return std::make_pair(
        TypeIdx,
        LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
  };
}
1150b57cec5SDimitry Andric 
// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    // Number of dwords needed to cover the current size, rounded up.
    const int NextMul32 = (Size + 31) / 32;

    // Only meaningful for sub-dword element types.
    assert(EltSize < 32);

    // Smallest element count that fills NextMul32 dwords (ceiling division).
    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
  };
}
1338bcb0991SDimitry Andric 
134e8d8bef9SDimitry Andric static LLT getBitcastRegisterType(const LLT Ty) {
135e8d8bef9SDimitry Andric   const unsigned Size = Ty.getSizeInBits();
1365ffd83dbSDimitry Andric 
1375ffd83dbSDimitry Andric   LLT CoercedTy;
1385ffd83dbSDimitry Andric   if (Size <= 32) {
1395ffd83dbSDimitry Andric     // <2 x s8> -> s16
1405ffd83dbSDimitry Andric     // <4 x s8> -> s32
141e8d8bef9SDimitry Andric     return LLT::scalar(Size);
142e8d8bef9SDimitry Andric   }
1435ffd83dbSDimitry Andric 
144fe6060f1SDimitry Andric   return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
145e8d8bef9SDimitry Andric }
146e8d8bef9SDimitry Andric 
147e8d8bef9SDimitry Andric static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
148e8d8bef9SDimitry Andric   return [=](const LegalityQuery &Query) {
149e8d8bef9SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
150e8d8bef9SDimitry Andric     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
151e8d8bef9SDimitry Andric   };
152e8d8bef9SDimitry Andric }
153e8d8bef9SDimitry Andric 
154e8d8bef9SDimitry Andric static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
155e8d8bef9SDimitry Andric   return [=](const LegalityQuery &Query) {
156e8d8bef9SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
157e8d8bef9SDimitry Andric     unsigned Size = Ty.getSizeInBits();
158e8d8bef9SDimitry Andric     assert(Size % 32 == 0);
159fe6060f1SDimitry Andric     return std::make_pair(
160fe6060f1SDimitry Andric         TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
1615ffd83dbSDimitry Andric   };
1625ffd83dbSDimitry Andric }
1635ffd83dbSDimitry Andric 
1648bcb0991SDimitry Andric static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
1658bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
1668bcb0991SDimitry Andric     const LLT QueryTy = Query.Types[TypeIdx];
1678bcb0991SDimitry Andric     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
1688bcb0991SDimitry Andric   };
1698bcb0991SDimitry Andric }
1708bcb0991SDimitry Andric 
1710b57cec5SDimitry Andric static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
1720b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
1730b57cec5SDimitry Andric     const LLT QueryTy = Query.Types[TypeIdx];
1740b57cec5SDimitry Andric     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
1750b57cec5SDimitry Andric   };
1760b57cec5SDimitry Andric }
1770b57cec5SDimitry Andric 
1780b57cec5SDimitry Andric static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
1790b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
1800b57cec5SDimitry Andric     const LLT QueryTy = Query.Types[TypeIdx];
1810b57cec5SDimitry Andric     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
1820b57cec5SDimitry Andric   };
1830b57cec5SDimitry Andric }
1840b57cec5SDimitry Andric 
1855ffd83dbSDimitry Andric static bool isRegisterSize(unsigned Size) {
1865ffd83dbSDimitry Andric   return Size % 32 == 0 && Size <= MaxRegisterSize;
1875ffd83dbSDimitry Andric }
1885ffd83dbSDimitry Andric 
1895ffd83dbSDimitry Andric static bool isRegisterVectorElementType(LLT EltTy) {
1905ffd83dbSDimitry Andric   const int EltSize = EltTy.getSizeInBits();
1915ffd83dbSDimitry Andric   return EltSize == 16 || EltSize % 32 == 0;
1925ffd83dbSDimitry Andric }
1935ffd83dbSDimitry Andric 
1945ffd83dbSDimitry Andric static bool isRegisterVectorType(LLT Ty) {
1950b57cec5SDimitry Andric   const int EltSize = Ty.getElementType().getSizeInBits();
1960b57cec5SDimitry Andric   return EltSize == 32 || EltSize == 64 ||
1970b57cec5SDimitry Andric          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
1980b57cec5SDimitry Andric          EltSize == 128 || EltSize == 256;
1990b57cec5SDimitry Andric }
2000b57cec5SDimitry Andric 
2015ffd83dbSDimitry Andric static bool isRegisterType(LLT Ty) {
2025ffd83dbSDimitry Andric   if (!isRegisterSize(Ty.getSizeInBits()))
2035ffd83dbSDimitry Andric     return false;
2045ffd83dbSDimitry Andric 
2055ffd83dbSDimitry Andric   if (Ty.isVector())
2065ffd83dbSDimitry Andric     return isRegisterVectorType(Ty);
2075ffd83dbSDimitry Andric 
2085ffd83dbSDimitry Andric   return true;
2095ffd83dbSDimitry Andric }
2105ffd83dbSDimitry Andric 
2115ffd83dbSDimitry Andric // Any combination of 32 or 64-bit elements up the maximum register size, and
2125ffd83dbSDimitry Andric // multiples of v2s16.
2135ffd83dbSDimitry Andric static LegalityPredicate isRegisterType(unsigned TypeIdx) {
2145ffd83dbSDimitry Andric   return [=](const LegalityQuery &Query) {
2155ffd83dbSDimitry Andric     return isRegisterType(Query.Types[TypeIdx]);
2168bcb0991SDimitry Andric   };
2178bcb0991SDimitry Andric }
2188bcb0991SDimitry Andric 
2195ffd83dbSDimitry Andric static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
2208bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
2215ffd83dbSDimitry Andric     const LLT QueryTy = Query.Types[TypeIdx];
2225ffd83dbSDimitry Andric     if (!QueryTy.isVector())
2235ffd83dbSDimitry Andric       return false;
2245ffd83dbSDimitry Andric     const LLT EltTy = QueryTy.getElementType();
2255ffd83dbSDimitry Andric     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
2268bcb0991SDimitry Andric   };
2278bcb0991SDimitry Andric }
2288bcb0991SDimitry Andric 
229fe6060f1SDimitry Andric // If we have a truncating store or an extending load with a data size larger
230fe6060f1SDimitry Andric // than 32-bits, we need to reduce to a 32-bit type.
231fe6060f1SDimitry Andric static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
2328bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
2338bcb0991SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
2348bcb0991SDimitry Andric     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
235fe6060f1SDimitry Andric            Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
2360b57cec5SDimitry Andric   };
2370b57cec5SDimitry Andric }
2380b57cec5SDimitry Andric 
// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
//
// \returns the maximum size in bits of a single load or store in address
// space \p AS on subtarget \p ST; \p IsLoad distinguishes loads, which may
// permit wider accesses than stores.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return ST.enableFlatScratch() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    // 128-bit LDS accesses are gated on the DS128 subtarget feature.
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable for
    // global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written in a
    // kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}
2665ffd83dbSDimitry Andric 
// \returns true if a load or store with this register type, memory size, and
// alignment is directly legal on \p ST without splitting, widening, or
// custom lowering.
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
  unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // Do not handle extending vector loads.
  if (Ty.isVector() && MemSize != RegSize)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough, but
  // we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  // Restrict to memory sizes a single hardware access can express.
  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    // Dwordx3 accesses are a subtarget feature.
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  // Underaligned accesses are only legal if the target reports the misaligned
  // access as allowed.
  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}
3325ffd83dbSDimitry Andric 
3335ffd83dbSDimitry Andric // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
3345ffd83dbSDimitry Andric // workaround this. Eventually it should ignore the type for loads and only care
3355ffd83dbSDimitry Andric // about the size. Return true in cases where we will workaround this for now by
3365ffd83dbSDimitry Andric // bitcasting.
3375ffd83dbSDimitry Andric static bool loadStoreBitcastWorkaround(const LLT Ty) {
3385ffd83dbSDimitry Andric   if (EnableNewLegality)
3395ffd83dbSDimitry Andric     return false;
3405ffd83dbSDimitry Andric 
3415ffd83dbSDimitry Andric   const unsigned Size = Ty.getSizeInBits();
3425ffd83dbSDimitry Andric   if (Size <= 64)
3435ffd83dbSDimitry Andric     return false;
3445ffd83dbSDimitry Andric   if (!Ty.isVector())
3455ffd83dbSDimitry Andric     return true;
346e8d8bef9SDimitry Andric 
347e8d8bef9SDimitry Andric   LLT EltTy = Ty.getElementType();
348e8d8bef9SDimitry Andric   if (EltTy.isPointer())
349e8d8bef9SDimitry Andric     return true;
350e8d8bef9SDimitry Andric 
351e8d8bef9SDimitry Andric   unsigned EltSize = EltTy.getSizeInBits();
3525ffd83dbSDimitry Andric   return EltSize != 32 && EltSize != 64;
3535ffd83dbSDimitry Andric }
3545ffd83dbSDimitry Andric 
355fe6060f1SDimitry Andric static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
3565ffd83dbSDimitry Andric   const LLT Ty = Query.Types[0];
357fe6060f1SDimitry Andric   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
3585ffd83dbSDimitry Andric          !loadStoreBitcastWorkaround(Ty);
3595ffd83dbSDimitry Andric }
3605ffd83dbSDimitry Andric 
/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const LLT MemTy) {
  const unsigned MemSizeInBits = MemTy.getSizeInBits();
  const unsigned Size = Ty.getSizeInBits();
  // Mismatched register/memory size (ext load or trunc store): only bitcast
  // small (<= 32-bit) vectors.
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  // Types caught by the selector workaround must be bitcast if they are
  // otherwise register-compatible.
  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;

  // Don't try to handle bitcasting vector ext loads for now.
  return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
         (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}
378e8d8bef9SDimitry Andric 
/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this case when the memory access itself
/// changes, not the size of the result register.
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
                            unsigned AlignInBits, unsigned AddrSpace,
                            unsigned Opcode) {
  unsigned SizeInBits = MemoryTy.getSizeInBits();
  // We don't want to widen cases that are naturally legal.
  if (isPowerOf2_32(SizeInBits))
    return false;

  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
  // end up widening these for a scalar load during RegBankSelect, since there
  // aren't 96-bit scalar loads.
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
    return false;

  // NOTE(review): callers pass the raw opcode as the bool IsLoad argument of
  // maxSizeForAddrSpace, so any nonzero opcode counts as a load — confirm
  // this is intended.
  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
    return false;

  // A load is known dereferenceable up to the alignment, so it's legal to widen
  // to it.
  //
  // TODO: Could check dereferenceable for less aligned cases.
  unsigned RoundedSize = NextPowerOf2(SizeInBits);
  if (AlignInBits < RoundedSize)
    return false;

  // Do not widen if it would introduce a slow unaligned load.
  const SITargetLowering *TLI = ST.getTargetLowering();
  bool Fast = false;
  return TLI->allowsMisalignedMemoryAccessesImpl(
             RoundedSize, AddrSpace, Align(AlignInBits / 8),
             MachineMemOperand::MOLoad, &Fast) &&
         Fast;
}
415e8d8bef9SDimitry Andric 
416e8d8bef9SDimitry Andric static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
417e8d8bef9SDimitry Andric                             unsigned Opcode) {
418e8d8bef9SDimitry Andric   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
419e8d8bef9SDimitry Andric     return false;
420e8d8bef9SDimitry Andric 
421fe6060f1SDimitry Andric   return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
422e8d8bef9SDimitry Andric                          Query.MMODescrs[0].AlignInBits,
423e8d8bef9SDimitry Andric                          Query.Types[1].getAddressSpace(), Opcode);
424e8d8bef9SDimitry Andric }
425e8d8bef9SDimitry Andric 
4260b57cec5SDimitry Andric AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
4270b57cec5SDimitry Andric                                          const GCNTargetMachine &TM)
4280b57cec5SDimitry Andric   :  ST(ST_) {
4290b57cec5SDimitry Andric   using namespace TargetOpcode;
4300b57cec5SDimitry Andric 
4310b57cec5SDimitry Andric   auto GetAddrSpacePtr = [&TM](unsigned AS) {
4320b57cec5SDimitry Andric     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
4330b57cec5SDimitry Andric   };
4340b57cec5SDimitry Andric 
4350b57cec5SDimitry Andric   const LLT S1 = LLT::scalar(1);
436e8d8bef9SDimitry Andric   const LLT S8 = LLT::scalar(8);
4370b57cec5SDimitry Andric   const LLT S16 = LLT::scalar(16);
4380b57cec5SDimitry Andric   const LLT S32 = LLT::scalar(32);
4390b57cec5SDimitry Andric   const LLT S64 = LLT::scalar(64);
4400b57cec5SDimitry Andric   const LLT S128 = LLT::scalar(128);
4410b57cec5SDimitry Andric   const LLT S256 = LLT::scalar(256);
4425ffd83dbSDimitry Andric   const LLT S512 = LLT::scalar(512);
4435ffd83dbSDimitry Andric   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
4440b57cec5SDimitry Andric 
445fe6060f1SDimitry Andric   const LLT V2S8 = LLT::fixed_vector(2, 8);
446fe6060f1SDimitry Andric   const LLT V2S16 = LLT::fixed_vector(2, 16);
447fe6060f1SDimitry Andric   const LLT V4S16 = LLT::fixed_vector(4, 16);
4480b57cec5SDimitry Andric 
449fe6060f1SDimitry Andric   const LLT V2S32 = LLT::fixed_vector(2, 32);
450fe6060f1SDimitry Andric   const LLT V3S32 = LLT::fixed_vector(3, 32);
451fe6060f1SDimitry Andric   const LLT V4S32 = LLT::fixed_vector(4, 32);
452fe6060f1SDimitry Andric   const LLT V5S32 = LLT::fixed_vector(5, 32);
453fe6060f1SDimitry Andric   const LLT V6S32 = LLT::fixed_vector(6, 32);
454fe6060f1SDimitry Andric   const LLT V7S32 = LLT::fixed_vector(7, 32);
455fe6060f1SDimitry Andric   const LLT V8S32 = LLT::fixed_vector(8, 32);
456fe6060f1SDimitry Andric   const LLT V9S32 = LLT::fixed_vector(9, 32);
457fe6060f1SDimitry Andric   const LLT V10S32 = LLT::fixed_vector(10, 32);
458fe6060f1SDimitry Andric   const LLT V11S32 = LLT::fixed_vector(11, 32);
459fe6060f1SDimitry Andric   const LLT V12S32 = LLT::fixed_vector(12, 32);
460fe6060f1SDimitry Andric   const LLT V13S32 = LLT::fixed_vector(13, 32);
461fe6060f1SDimitry Andric   const LLT V14S32 = LLT::fixed_vector(14, 32);
462fe6060f1SDimitry Andric   const LLT V15S32 = LLT::fixed_vector(15, 32);
463fe6060f1SDimitry Andric   const LLT V16S32 = LLT::fixed_vector(16, 32);
464fe6060f1SDimitry Andric   const LLT V32S32 = LLT::fixed_vector(32, 32);
4650b57cec5SDimitry Andric 
466fe6060f1SDimitry Andric   const LLT V2S64 = LLT::fixed_vector(2, 64);
467fe6060f1SDimitry Andric   const LLT V3S64 = LLT::fixed_vector(3, 64);
468fe6060f1SDimitry Andric   const LLT V4S64 = LLT::fixed_vector(4, 64);
469fe6060f1SDimitry Andric   const LLT V5S64 = LLT::fixed_vector(5, 64);
470fe6060f1SDimitry Andric   const LLT V6S64 = LLT::fixed_vector(6, 64);
471fe6060f1SDimitry Andric   const LLT V7S64 = LLT::fixed_vector(7, 64);
472fe6060f1SDimitry Andric   const LLT V8S64 = LLT::fixed_vector(8, 64);
473fe6060f1SDimitry Andric   const LLT V16S64 = LLT::fixed_vector(16, 64);
4740b57cec5SDimitry Andric 
4750b57cec5SDimitry Andric   std::initializer_list<LLT> AllS32Vectors =
4760b57cec5SDimitry Andric     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
4778bcb0991SDimitry Andric      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
4780b57cec5SDimitry Andric   std::initializer_list<LLT> AllS64Vectors =
4798bcb0991SDimitry Andric     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
4800b57cec5SDimitry Andric 
4810b57cec5SDimitry Andric   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
4820b57cec5SDimitry Andric   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
4838bcb0991SDimitry Andric   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
4840b57cec5SDimitry Andric   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
4858bcb0991SDimitry Andric   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
4860b57cec5SDimitry Andric   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
4870b57cec5SDimitry Andric   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
4880b57cec5SDimitry Andric 
4890b57cec5SDimitry Andric   const LLT CodePtr = FlatPtr;
4900b57cec5SDimitry Andric 
4910b57cec5SDimitry Andric   const std::initializer_list<LLT> AddrSpaces64 = {
4920b57cec5SDimitry Andric     GlobalPtr, ConstantPtr, FlatPtr
4930b57cec5SDimitry Andric   };
4940b57cec5SDimitry Andric 
4950b57cec5SDimitry Andric   const std::initializer_list<LLT> AddrSpaces32 = {
4968bcb0991SDimitry Andric     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
4970b57cec5SDimitry Andric   };
4980b57cec5SDimitry Andric 
4990b57cec5SDimitry Andric   const std::initializer_list<LLT> FPTypesBase = {
5000b57cec5SDimitry Andric     S32, S64
5010b57cec5SDimitry Andric   };
5020b57cec5SDimitry Andric 
5030b57cec5SDimitry Andric   const std::initializer_list<LLT> FPTypes16 = {
5040b57cec5SDimitry Andric     S32, S64, S16
5050b57cec5SDimitry Andric   };
5060b57cec5SDimitry Andric 
5070b57cec5SDimitry Andric   const std::initializer_list<LLT> FPTypesPK16 = {
5080b57cec5SDimitry Andric     S32, S64, S16, V2S16
5090b57cec5SDimitry Andric   };
5100b57cec5SDimitry Andric 
5115ffd83dbSDimitry Andric   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
5125ffd83dbSDimitry Andric 
513fe6060f1SDimitry Andric   // s1 for VCC branches, s32 for SCC branches.
514fe6060f1SDimitry Andric   getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
5150b57cec5SDimitry Andric 
  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  //
  // Phis are legal for the common scalar/vector register types and for every
  // pointer type; anything else is clamped, widened, or scalarized into that
  // set.
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .legalIf(isPointer(0))
    .clampScalar(0, S16, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .scalarize(0);
5300b57cec5SDimitry Andric 
  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
    // Full set of gfx9 features.
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16, V2S16})
      .minScalar(0, S16)
      .clampMaxNumElements(0, S16, 2)
      .widenScalarToNextMultipleOf(0, 32)
      .maxScalar(0, S32)
      .scalarize(0);

    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
      .legalFor({S32, S16, V2S16}) // Clamp modifier
      .minScalarOrElt(0, S16)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32)
      .lower();
  } else if (ST.has16BitInsts()) {
    // 16-bit instructions but no packed (VOP3P) operations.
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .minScalar(0, S16)
      .widenScalarToNextMultipleOf(0, 32)
      .maxScalar(0, S32)
      .scalarize(0);

    // Technically the saturating operations require clamp bit support, but this
    // was introduced at the same time as 16-bit operations.
    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
      .legalFor({S32, S16}) // Clamp modifier
      .minScalar(0, S16)
      .scalarize(0)
      .widenScalarToNextPow2(0, 16)
      .lower();

    // We're just lowering this, but it helps get a better result to try to
    // coerce to the desired type first.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    // No 16-bit instructions: everything happens in 32 bits.
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .widenScalarToNextMultipleOf(0, 32)
      .clampScalar(0, S32, S32)
      .scalarize(0);

    if (ST.hasIntClamp()) {
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .legalFor({S32}) // Clamp modifier.
        .scalarize(0)
        .minScalarOrElt(0, S32)
        .lower();
    } else {
      // Clamp bit support was added in VI, along with 16-bit operations.
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .minScalar(0, S32)
        .scalarize(0)
        .lower();
    }

    // FIXME: DAG expansion gets better results. The widening uses the smaller
    // range values and goes for the min/max lowering directly.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }
599e8d8bef9SDimitry Andric 
  // Integer division/remainder is custom-lowered for 32/64-bit scalars;
  // everything else is clamped and widened into that range first.
  getActionDefinitionsBuilder(
      {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
      .customFor({S32, S64})
      .clampScalar(0, S32, S64)
      .widenScalarToNextPow2(0, 32)
      .scalarize(0);

  // High-half multiply is only legal at 32 bits; wider is clamped down, and
  // on VOP3P subtargets v2s8 is lowered directly.
  auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
                   .legalFor({S32})
                   .maxScalar(0, S32);

  if (ST.hasVOP3PInsts()) {
    Mulh
      .clampMaxNumElements(0, S8, 2)
      .lowerFor({V2S8});
  }

  Mulh
    .scalarize(0)
    .lower();
6200b57cec5SDimitry Andric 
  // Report legal for any types we can handle anywhere. For the cases only legal
  // on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  // Carry-producing/consuming add/sub: the carry output may be either s1 or
  // s32; all other forms are lowered.
  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .minScalar(0, S32)
    // TODO: .scalarize(0)
    .lower();
6370b57cec5SDimitry Andric 
  // Bitcasts between any two register-like types are legal, regardless of
  // size; the rest is lowered.
  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();


  // Integer constants: legal for the common scalar widths and for every
  // pointer type.
  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
    .legalIf(isPointer(0))
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
      .legalIf(isRegisterType(0))
      // s1 and s16 are special cases because they have legal operations on
      // them, but don't really occupy registers in the normal way.
      .legalFor({S1, S16})
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampScalarOrElt(0, S32, MaxScalar)
      .widenScalarToNextPow2(0, 32)
      .clampMaxNumElements(0, S32, 16);

  // Frame indexes always produce a private (stack) pointer.
  getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});

  // If the amount is divergent, we have to do a wave reduction to get the
  // maximum value, so this is expanded during RegBankSelect.
  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
    .legalFor({{PrivatePtr, S32}});

  // Global values are custom-lowered for every address space except private.
  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customIf(typeIsNot(0, PrivatePtr));

  getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
6760b57cec5SDimitry Andric 
  // Basic FP arithmetic is legal at 32/64 bits; sin/cos and fdiv always go
  // through custom lowering. 16-bit (and packed v2s16 on VOP3P subtargets)
  // variants are added below.
  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  // min/max are always custom; the type set depends on what the subtarget
  // supports (packed 16-bit > scalar 16-bit > 32/64-bit only).
  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  // fneg/fabs are legal for all FP types, including packed v2s16.
  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    // s64 floor needs custom lowering on subtargets with the fract bug.
    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }
7590b57cec5SDimitry Andric 
  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  // s16 -> s64 extension is done in two steps via s32.
  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
      // Use actual fsub instruction
      .legalFor({S32})
      // Must use fadd + fneg
      .lowerFor({S64, S16, V2S16})
      .scalarize(0)
      .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});
  FMad.scalarize(0)
      .lower();

  // frem is always custom; without 16-bit instructions, s16 is first promoted
  // to s32.
  auto &FRem = getActionDefinitionsBuilder(G_FREM);
  if (ST.has16BitInsts()) {
    FRem.customFor({S16, S32, S64});
  } else {
    FRem.minScalar(0, S32)
        .customFor({S32, S64});
  }
  FRem.scalarize(0);
797e8d8bef9SDimitry Andric 
  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to infinite loop
    // in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  //
  // int-to-FP: s1 sources are lowered; 64-bit integer sources are
  // custom-lowered.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
                    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
                    .lowerIf(typeIs(1, S1))
                    .customFor({{S32, S64}, {S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .minScalar(0, S32)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  // FP-to-int: 64-bit integer results are custom-lowered; an s16 source
  // producing s64 is first narrowed to produce s32.
  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S32}, {S64, S64}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .widenScalarToNextPow2(0, 32)
       .scalarize(0)
       .lower();

  // Lower roundeven into G_FRINT
  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
    .scalarize(0)
    .lower();
8460b57cec5SDimitry Andric 
  // trunc/ceil/rint: with 16-bit instructions everything down to s16 is
  // legal; from Sea Islands on, s32/s64 are legal; older targets must
  // custom-lower the s64 case.
  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  // Pointer arithmetic: the offset operand must match the pointer size.
  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalIf(all(isPointer(0), sameSize(0, 1)))
    .scalarize(0)
    .scalarSameSizeAs(1, 0);

  getActionDefinitionsBuilder(G_PTRMASK)
    .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
    .scalarSameSizeAs(1, 0)
    .scalarize(0);
8740b57cec5SDimitry Andric 
  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the output,
    // so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  // FP compares always produce s1.
  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  // Non-base-2 exp/log and pow always go through custom lowering.
  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  getActionDefinitionsBuilder(G_FPOWI)
    .clampScalar(0, MinScalarFPTy, S32)
    .lower();
927e8d8bef9SDimitry Andric 
  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .custom();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32, S64})
    .clampScalar(0, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0);
9640b57cec5SDimitry Andric 
  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      // Packed 16-bit min/max/abs are available.
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }
10110b57cec5SDimitry Andric 
  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    // Otherwise resize the integer operand to the pointer's width.
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    // Otherwise resize the integer result to the pointer's width.
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      largerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  // Address space casts are always custom-lowered.
  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();
10480b57cec5SDimitry Andric 
  // Predicate: does this load/store have to be split into smaller memory
  // operations? Checks, in order: vector extloads, the per-address-space
  // maximum access size, register counts that no instruction covers, and
  // finally misalignment the target cannot tolerate.
  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
    unsigned AlignBits = Query.MMODescrs[0].AlignInBits;

    // For extending accesses, bump the effective memory size up to the known
    // alignment before deciding whether a split is needed.
    if (MemSize < DstTy.getSizeInBits())
      MemSize = std::max(MemSize, AlignBits);

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    // Split anything wider than the largest access allowed in this address
    // space.
    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    // Under-aligned access: split unless the target explicitly allows this
    // misaligned access at this size/address space.
    if (AlignBits < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                      Align(AlignBits / 8));
    }

    return false;
  };
10888bcb0991SDimitry Andric 
  // Minimum alignment (in bits) demanded of global/constant buffer accesses
  // in the legality tables below; 0 (no requirement) when the subtarget
  // supports unaligned buffer access.
  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.
10968bcb0991SDimitry Andric 
  // Load/store legalization. The rules are order-sensitive: explicitly legal
  // type combinations come first, then custom/bitcast coercions, and finally
  // the narrowing/splitting fallbacks driven by needToSplitMemOp.
  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Explicitly list some common cases.
    // TODO: Does this help compile time at all?
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
                                      {V2S32, GlobalPtr, V2S32, GlobalAlign32},
                                      {V4S32, GlobalPtr, V4S32, GlobalAlign32},
                                      {S64, GlobalPtr, S64, GlobalAlign32},
                                      {V2S64, GlobalPtr, V2S64, GlobalAlign32},
                                      {V2S16, GlobalPtr, V2S16, GlobalAlign32},
                                      {S32, GlobalPtr, S8, GlobalAlign8},
                                      {S32, GlobalPtr, S16, GlobalAlign16},

                                      {S32, LocalPtr, S32, 32},
                                      {S64, LocalPtr, S64, 32},
                                      {V2S32, LocalPtr, V2S32, 32},
                                      {S32, LocalPtr, S8, 8},
                                      {S32, LocalPtr, S16, 16},
                                      {V2S16, LocalPtr, S32, 32},

                                      {S32, PrivatePtr, S32, 32},
                                      {S32, PrivatePtr, S8, 8},
                                      {S32, PrivatePtr, S16, 16},
                                      {V2S16, PrivatePtr, S32, 32},

                                      {S32, ConstantPtr, S32, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32},
                                      {V4S32, ConstantPtr, V4S32, GlobalAlign32},
                                      {S64, ConstantPtr, S64, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
    // Beyond the explicit table, defer to the shared isLoadStoreLegal check
    // for any other directly selectable combination.
    Actions.legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query);
      });

    // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64-bits.
    //
    // TODO: Should generalize bitcast action into coerce, which will also cover
    // inserting addrspacecasts.
    Actions.customIf(typeIs(1, Constant32Ptr));

    // Turn any illegal element vectors into something easier to deal
    // with. These will ultimately produce 32-bit scalar shifts to extract the
    // parts anyway.
    //
    // For odd 16-bit element vectors, prefer to split those into pieces with
    // 16-bit vector parts.
    Actions.bitcastIf(
      [=](const LegalityQuery &Query) -> bool {
        return shouldBitcastLoadStoreType(ST, Query.Types[0],
                                          Query.MMODescrs[0].MemoryTy);
      }, bitcastToRegisterType(0));

    if (!IsStore) {
      // Widen suitably aligned loads by loading extra bytes. The standard
      // legalization actions can't properly express widening memory operands.
      Actions.customIf([=](const LegalityQuery &Query) -> bool {
        return shouldWidenLoad(ST, Query, G_LOAD);
      });
    }

    // FIXME: load/store narrowing should be moved to lower action
    Actions
        // Scalar case: pick a narrower scalar when the access must be split.
        .narrowScalarIf(
            [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              const unsigned DstSize = DstTy.getSizeInBits();
              unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

              // Split extloads.
              if (DstSize > MemSize)
                return std::make_pair(0, LLT::scalar(MemSize));

              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to split
                // to the widest type. TODO: Account for alignment. As-is it
                // should be OK, since the new parts will be further legalized.
                unsigned FloorSize = PowerOf2Floor(DstSize);
                return std::make_pair(0, LLT::scalar(FloorSize));
              }

              if (DstSize > 32 && (DstSize % 32 != 0)) {
                // FIXME: Need a way to specify non-extload of larger size if
                // suitably aligned.
                return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
              }

              // Clamp to the widest access the address space allows.
              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);
              if (MemSize > MaxSize)
                return std::make_pair(0, LLT::scalar(MaxSize));

              // Otherwise the split was forced by alignment: use pieces of
              // the aligned width.
              unsigned Align = Query.MMODescrs[0].AlignInBits;
              return std::make_pair(0, LLT::scalar(Align));
            })
        // Vector case: reduce the element count when the access must be
        // split.
        .fewerElementsIf(
            [=](const LegalityQuery &Query) -> bool {
              return Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              LLT EltTy = DstTy.getElementType();
              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);

              // FIXME: Handle widened to power of 2 results better. This ends
              // up scalarizing.
              // FIXME: 3 element stores scalarized on SI

              // Split if it's too large for the address space.
              unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
              if (MemSize > MaxSize) {
                unsigned NumElts = DstTy.getNumElements();
                unsigned EltSize = EltTy.getSizeInBits();

                if (MaxSize % EltSize == 0) {
                  return std::make_pair(
                      0, LLT::scalarOrVector(
                             ElementCount::getFixed(MaxSize / EltSize), EltTy));
                }

                unsigned NumPieces = MemSize / MaxSize;

                // FIXME: Refine when odd breakdowns handled
                // The scalars will need to be re-legalized.
                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::make_pair(0, EltTy);

                return std::make_pair(
                    0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
              }

              // FIXME: We could probably handle weird extending loads better.
              if (DstTy.getSizeInBits() > MemSize)
                return std::make_pair(0, EltTy);

              unsigned EltSize = EltTy.getSizeInBits();
              unsigned DstSize = DstTy.getSizeInBits();
              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to split
                // to the widest type. TODO: Account for alignment. As-is it
                // should be OK, since the new parts will be further legalized.
                unsigned FloorSize = PowerOf2Floor(DstSize);
                return std::make_pair(
                    0, LLT::scalarOrVector(
                           ElementCount::getFixed(FloorSize / EltSize), EltTy));
              }

              // Need to split because of alignment.
              unsigned Align = Query.MMODescrs[0].AlignInBits;
              if (EltSize > Align &&
                  (EltSize / Align < DstTy.getNumElements())) {
                return std::make_pair(
                    0, LLT::fixed_vector(EltSize / Align, EltTy));
              }

              // May need relegalization for the scalars.
              return std::make_pair(0, EltTy);
            })
    // Final cleanup: clamp to at least 32-bit scalars, break up wide scalar
    // ext/trunc memory ops, and pad small vectors up to a 32-bit multiple.
    .minScalar(0, S32)
    .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
    .widenScalarToNextPow2(0)
    .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
    .lower();
  }
12770b57cec5SDimitry Andric 
  // Extending loads (sext/zext): only 8- and 16-bit memory sources extending
  // into a 32-bit result are listed as legal; the result is clamped to S32
  // and everything else is lowered.
  // FIXME: Unaligned accesses not lowered.
  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
                                                  {S32, GlobalPtr, S16, 2 * 8},
                                                  {S32, LocalPtr, S8, 8},
                                                  {S32, LocalPtr, S16, 16},
                                                  {S32, PrivatePtr, S8, 8},
                                                  {S32, PrivatePtr, S16, 16},
                                                  {S32, ConstantPtr, S8, 8},
                                                  {S32, ConstantPtr, S16, 2 * 8}})
                       .legalIf(
                         [=](const LegalityQuery &Query) -> bool {
                           return isLoadStoreLegal(ST, Query);
                         });

  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
  }

  // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
  // 64-bits.
  //
  // TODO: Should generalize bitcast action into coerce, which will also cover
  // inserting addrspacecasts.
  ExtLoads.customIf(typeIs(1, Constant32Ptr));

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .lower();
13080b57cec5SDimitry Andric 
  // Integer read-modify-write atomics: legal on 32- and 64-bit values in the
  // global, local (LDS) and region address spaces, plus flat when the
  // subtarget has a flat address space.
  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }
13200b57cec5SDimitry Andric 
  // Floating-point atomic add is subtarget-gated: s32 on LDS/region when the
  // LDS FP atomic-add instructions exist (s64 on LDS additionally on gfx90a),
  // and s32 on global when the subtarget has the global FP atomic-add
  // instructions.
  auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
  if (ST.hasLDSFPAtomicAdd()) {
    Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
    if (ST.hasGFX90AInsts())
      Atomic.legalFor({{S64, LocalPtr}});
  }
  if (ST.hasAtomicFaddInsts())
    Atomic.legalFor({{S32, GlobalPtr}});
13298bcb0991SDimitry Andric 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
  // demarshalling
  // Hence global/flat forms take the custom path, while LDS/region forms are
  // directly legal.
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
  // Value types cover common scalars, short vectors and pointers; anything
  // else is clamped/scalarized into range, and any pointer type with an
  // s1/s32 condition remains legal as the final rule.
  getActionDefinitionsBuilder(G_SELECT)
      .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
                                 LocalPtr, FlatPtr, PrivatePtr,
                                 LLT::fixed_vector(2, LocalPtr),
                                 LLT::fixed_vector(2, PrivatePtr)},
                                {S1, S32})
      .clampScalar(0, S16, S64)
      .scalarize(1)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .fewerElementsIf(numElementsNotEven(0), scalarize(0))
      .clampMaxNumElements(0, S32, 2)
      .clampMaxNumElements(0, LocalPtr, 2)
      .clampMaxNumElements(0, PrivatePtr, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0)
      .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
13560b57cec5SDimitry Andric 
  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  // With 16-bit instructions, 16-bit shifts (and packed v2s16 with VOP3P) are
  // also legal; saturating shifts are always lowered after widening to the
  // minimum legal scalar width.
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }
  Shifts.scalarize(0);
14010b57cec5SDimitry Andric 
  // Dynamic vector element access. The vector/element operand positions
  // differ between extract (vector is operand 1) and insert (vector is
  // operand 0), hence the per-op index variables below.
  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      // Custom-lower 32/64-bit elements with a 32-bit index, as long as the
      // whole vector fits in registers.
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          const unsigned EltSize = EltTy.getSizeInBits();
          return (EltSize == 32 || EltSize == 64) &&
                  VecTy.getSizeInBits() % 32 == 0 &&
                  VecTy.getSizeInBits() <= MaxRegisterSize &&
                  IdxTy.getSizeInBits() == 32;
        })
      // Sub-32-bit elements: bitcast to a vector of 32-bit elements.
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
                 bitcastToVectorElement32(VecTypeIdx))
      //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
      .bitcastIf(
        all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
        [=](const LegalityQuery &Query) {
          // For > 64-bit element types, try to turn this into a 64-bit
          // element vector since we may be able to do better indexing
          // if this is scalar. If not, fall back to 32.
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned DstEltSize = EltTy.getSizeInBits();
          const unsigned VecSize = VecTy.getSizeInBits();

          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
          return std::make_pair(
              VecTypeIdx,
              LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32)
      .clampMaxNumElements(VecTypeIdx, S32, 32)
      // TODO: Clamp elements for 64-bit vectors?
      // It should only be necessary with variable indexes.
      // As a last resort, lower to the stack
      .lower();
  }
14460b57cec5SDimitry Andric 
  // A vector element extract must produce exactly the vector's element type;
  // any mismatched result type is rejected outright.
  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });
14520b57cec5SDimitry Andric 
  // Bit-range extract/insert. The "big" type is the container (operand 1 for
  // extract, operand 0 for insert) and the "little" type the piece; both are
  // widened to at least 16-bit scalars/elements before the legality check.
  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }
14830b57cec5SDimitry Andric 
  // G_BUILD_VECTOR / G_BUILD_VECTOR_TRUNC. With scalar pack instructions,
  // v2s16 build_vector_trunc from s32 sources is directly legal; otherwise
  // both the v2s16 build_vector and build_vector_trunc cases go through
  // custom lowering.
  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  // Anything that fits the register-type predicate is ultimately legal.
  BuildVector.legalIf(isRegisterType(0));
15125ffd83dbSDimitry Andric 
  // Vector concatenation: legal whenever both source and destination are
  // register types, after clamping the element counts below.
  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .clampMaxNumElements(0, S32, 32)
    .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
    .clampMaxNumElements(0, S16, 64);
15190b57cec5SDimitry Andric 
15205ffd83dbSDimitry Andric   // TODO: Don't fully scalarize v2s16 pieces? Or combine out thosse
15215ffd83dbSDimitry Andric   // pre-legalize.
15225ffd83dbSDimitry Andric   if (ST.hasVOP3PInsts()) {
15235ffd83dbSDimitry Andric     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
15245ffd83dbSDimitry Andric       .customFor({V2S16, V2S16})
15255ffd83dbSDimitry Andric       .lower();
15265ffd83dbSDimitry Andric   } else
15278bcb0991SDimitry Andric     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
15288bcb0991SDimitry Andric 
15290b57cec5SDimitry Andric   // Merge/Unmerge
15300b57cec5SDimitry Andric   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
15310b57cec5SDimitry Andric     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
15320b57cec5SDimitry Andric     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
15330b57cec5SDimitry Andric 
15340b57cec5SDimitry Andric     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
15355ffd83dbSDimitry Andric       const LLT Ty = Query.Types[TypeIdx];
15360b57cec5SDimitry Andric       if (Ty.isVector()) {
15370b57cec5SDimitry Andric         const LLT &EltTy = Ty.getElementType();
15385ffd83dbSDimitry Andric         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
15390b57cec5SDimitry Andric           return true;
15400b57cec5SDimitry Andric         if (!isPowerOf2_32(EltTy.getSizeInBits()))
15410b57cec5SDimitry Andric           return true;
15420b57cec5SDimitry Andric       }
15430b57cec5SDimitry Andric       return false;
15440b57cec5SDimitry Andric     };
15450b57cec5SDimitry Andric 
15468bcb0991SDimitry Andric     auto &Builder = getActionDefinitionsBuilder(Op)
1547e8d8bef9SDimitry Andric       .legalIf(all(isRegisterType(0), isRegisterType(1)))
15485ffd83dbSDimitry Andric       .lowerFor({{S16, V2S16}})
15495ffd83dbSDimitry Andric       .lowerIf([=](const LegalityQuery &Query) {
15505ffd83dbSDimitry Andric           const LLT BigTy = Query.Types[BigTyIdx];
15515ffd83dbSDimitry Andric           return BigTy.getSizeInBits() == 32;
15525ffd83dbSDimitry Andric         })
15535ffd83dbSDimitry Andric       // Try to widen to s16 first for small types.
15545ffd83dbSDimitry Andric       // TODO: Only do this on targets with legal s16 shifts
15555ffd83dbSDimitry Andric       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
15560b57cec5SDimitry Andric       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
15578bcb0991SDimitry Andric       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
15588bcb0991SDimitry Andric       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
15598bcb0991SDimitry Andric                            elementTypeIs(1, S16)),
15608bcb0991SDimitry Andric                        changeTo(1, V2S16))
15615ffd83dbSDimitry Andric       // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
15625ffd83dbSDimitry Andric       // worth considering the multiples of 64 since 2*192 and 2*384 are not
15635ffd83dbSDimitry Andric       // valid.
15645ffd83dbSDimitry Andric       .clampScalar(LitTyIdx, S32, S512)
15655ffd83dbSDimitry Andric       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
15660b57cec5SDimitry Andric       // Break up vectors with weird elements into scalars
15670b57cec5SDimitry Andric       .fewerElementsIf(
15685ffd83dbSDimitry Andric         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
15690b57cec5SDimitry Andric         scalarize(0))
15700b57cec5SDimitry Andric       .fewerElementsIf(
15715ffd83dbSDimitry Andric         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
15720b57cec5SDimitry Andric         scalarize(1))
15735ffd83dbSDimitry Andric       .clampScalar(BigTyIdx, S32, MaxScalar);
15748bcb0991SDimitry Andric 
15758bcb0991SDimitry Andric     if (Op == G_MERGE_VALUES) {
15768bcb0991SDimitry Andric       Builder.widenScalarIf(
15778bcb0991SDimitry Andric         // TODO: Use 16-bit shifts if legal for 8-bit values?
15780b57cec5SDimitry Andric         [=](const LegalityQuery &Query) {
15798bcb0991SDimitry Andric           const LLT Ty = Query.Types[LitTyIdx];
15808bcb0991SDimitry Andric           return Ty.getSizeInBits() < 32;
15818bcb0991SDimitry Andric         },
15828bcb0991SDimitry Andric         changeTo(LitTyIdx, S32));
15838bcb0991SDimitry Andric     }
15848bcb0991SDimitry Andric 
15858bcb0991SDimitry Andric     Builder.widenScalarIf(
15868bcb0991SDimitry Andric       [=](const LegalityQuery &Query) {
15878bcb0991SDimitry Andric         const LLT Ty = Query.Types[BigTyIdx];
15880b57cec5SDimitry Andric         return !isPowerOf2_32(Ty.getSizeInBits()) &&
15890b57cec5SDimitry Andric           Ty.getSizeInBits() % 16 != 0;
15900b57cec5SDimitry Andric       },
15910b57cec5SDimitry Andric       [=](const LegalityQuery &Query) {
15920b57cec5SDimitry Andric         // Pick the next power of 2, or a multiple of 64 over 128.
15930b57cec5SDimitry Andric         // Whichever is smaller.
15940b57cec5SDimitry Andric         const LLT &Ty = Query.Types[BigTyIdx];
15950b57cec5SDimitry Andric         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
15960b57cec5SDimitry Andric         if (NewSizeInBits >= 256) {
15970b57cec5SDimitry Andric           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
15980b57cec5SDimitry Andric           if (RoundedTo < NewSizeInBits)
15990b57cec5SDimitry Andric             NewSizeInBits = RoundedTo;
16000b57cec5SDimitry Andric         }
16010b57cec5SDimitry Andric         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
16020b57cec5SDimitry Andric       })
16030b57cec5SDimitry Andric       // Any vectors left are the wrong size. Scalarize them.
16040b57cec5SDimitry Andric       .scalarize(0)
16050b57cec5SDimitry Andric       .scalarize(1);
16060b57cec5SDimitry Andric   }
16070b57cec5SDimitry Andric 
16085ffd83dbSDimitry Andric   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
16095ffd83dbSDimitry Andric   // RegBankSelect.
16105ffd83dbSDimitry Andric   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
16115ffd83dbSDimitry Andric     .legalFor({{S32}, {S64}});
16128bcb0991SDimitry Andric 
16135ffd83dbSDimitry Andric   if (ST.hasVOP3PInsts()) {
16145ffd83dbSDimitry Andric     SextInReg.lowerFor({{V2S16}})
16155ffd83dbSDimitry Andric       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
16165ffd83dbSDimitry Andric       // get more vector shift opportunities, since we'll get those when
16175ffd83dbSDimitry Andric       // expanded.
16185ffd83dbSDimitry Andric       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
16195ffd83dbSDimitry Andric   } else if (ST.has16BitInsts()) {
16205ffd83dbSDimitry Andric     SextInReg.lowerFor({{S32}, {S64}, {S16}});
16215ffd83dbSDimitry Andric   } else {
16225ffd83dbSDimitry Andric     // Prefer to promote to s32 before lowering if we don't have 16-bit
16235ffd83dbSDimitry Andric     // shifts. This avoid a lot of intermediate truncate and extend operations.
16245ffd83dbSDimitry Andric     SextInReg.lowerFor({{S32}, {S64}});
16255ffd83dbSDimitry Andric   }
16265ffd83dbSDimitry Andric 
16275ffd83dbSDimitry Andric   SextInReg
16285ffd83dbSDimitry Andric     .scalarize(0)
16295ffd83dbSDimitry Andric     .clampScalar(0, S32, S64)
16305ffd83dbSDimitry Andric     .lower();
16315ffd83dbSDimitry Andric 
1632*349cc55cSDimitry Andric   getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1633*349cc55cSDimitry Andric     .scalarize(0)
1634*349cc55cSDimitry Andric     .lower();
1635*349cc55cSDimitry Andric 
1636fe6060f1SDimitry Andric   // TODO: Only Try to form v2s16 with legal packed instructions.
16375ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_FSHR)
16385ffd83dbSDimitry Andric     .legalFor({{S32, S32}})
1639fe6060f1SDimitry Andric     .lowerFor({{V2S16, V2S16}})
1640fe6060f1SDimitry Andric     .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
16415ffd83dbSDimitry Andric     .scalarize(0)
16425ffd83dbSDimitry Andric     .lower();
1643480093f4SDimitry Andric 
1644fe6060f1SDimitry Andric   if (ST.hasVOP3PInsts()) {
1645fe6060f1SDimitry Andric     getActionDefinitionsBuilder(G_FSHL)
1646fe6060f1SDimitry Andric       .lowerFor({{V2S16, V2S16}})
1647fe6060f1SDimitry Andric       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1648fe6060f1SDimitry Andric       .scalarize(0)
1649fe6060f1SDimitry Andric       .lower();
1650fe6060f1SDimitry Andric   } else {
1651fe6060f1SDimitry Andric     getActionDefinitionsBuilder(G_FSHL)
1652fe6060f1SDimitry Andric       .scalarize(0)
1653fe6060f1SDimitry Andric       .lower();
1654fe6060f1SDimitry Andric   }
1655fe6060f1SDimitry Andric 
1656480093f4SDimitry Andric   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1657480093f4SDimitry Andric     .legalFor({S64});
1658480093f4SDimitry Andric 
1659e8d8bef9SDimitry Andric   getActionDefinitionsBuilder(G_FENCE)
1660e8d8bef9SDimitry Andric     .alwaysLegal();
1661e8d8bef9SDimitry Andric 
1662fe6060f1SDimitry Andric   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1663fe6060f1SDimitry Andric       .scalarize(0)
1664fe6060f1SDimitry Andric       .minScalar(0, S32)
1665fe6060f1SDimitry Andric       .lower();
1666fe6060f1SDimitry Andric 
1667fe6060f1SDimitry Andric   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1668fe6060f1SDimitry Andric       .legalFor({{S32, S32}, {S64, S32}})
1669fe6060f1SDimitry Andric       .clampScalar(1, S32, S32)
1670fe6060f1SDimitry Andric       .clampScalar(0, S32, S64)
1671fe6060f1SDimitry Andric       .widenScalarToNextPow2(0)
1672fe6060f1SDimitry Andric       .scalarize(0);
1673fe6060f1SDimitry Andric 
16745ffd83dbSDimitry Andric   getActionDefinitionsBuilder({
16755ffd83dbSDimitry Andric       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
16765ffd83dbSDimitry Andric       G_FCOPYSIGN,
16775ffd83dbSDimitry Andric 
16785ffd83dbSDimitry Andric       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1679e8d8bef9SDimitry Andric       G_ATOMICRMW_NAND,
1680e8d8bef9SDimitry Andric       G_ATOMICRMW_FSUB,
16815ffd83dbSDimitry Andric       G_READ_REGISTER,
16825ffd83dbSDimitry Andric       G_WRITE_REGISTER,
16835ffd83dbSDimitry Andric 
16845ffd83dbSDimitry Andric       G_SADDO, G_SSUBO,
16855ffd83dbSDimitry Andric 
16865ffd83dbSDimitry Andric        // TODO: Implement
1687fe6060f1SDimitry Andric       G_FMINIMUM, G_FMAXIMUM}).lower();
16885ffd83dbSDimitry Andric 
1689*349cc55cSDimitry Andric   getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
1690*349cc55cSDimitry Andric       .lower();
1691*349cc55cSDimitry Andric 
1692480093f4SDimitry Andric   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
16935ffd83dbSDimitry Andric         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1694480093f4SDimitry Andric         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1695480093f4SDimitry Andric     .unsupported();
1696480093f4SDimitry Andric 
1697fe6060f1SDimitry Andric   getLegacyLegalizerInfo().computeTables();
16980b57cec5SDimitry Andric   verify(*ST.getInstrInfo());
16990b57cec5SDimitry Andric }
17000b57cec5SDimitry Andric 
17015ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
17025ffd83dbSDimitry Andric                                          MachineInstr &MI) const {
17035ffd83dbSDimitry Andric   MachineIRBuilder &B = Helper.MIRBuilder;
17045ffd83dbSDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
17055ffd83dbSDimitry Andric 
17060b57cec5SDimitry Andric   switch (MI.getOpcode()) {
17070b57cec5SDimitry Andric   case TargetOpcode::G_ADDRSPACE_CAST:
17088bcb0991SDimitry Andric     return legalizeAddrSpaceCast(MI, MRI, B);
17090b57cec5SDimitry Andric   case TargetOpcode::G_FRINT:
17108bcb0991SDimitry Andric     return legalizeFrint(MI, MRI, B);
17110b57cec5SDimitry Andric   case TargetOpcode::G_FCEIL:
17128bcb0991SDimitry Andric     return legalizeFceil(MI, MRI, B);
1713e8d8bef9SDimitry Andric   case TargetOpcode::G_FREM:
1714e8d8bef9SDimitry Andric     return legalizeFrem(MI, MRI, B);
17150b57cec5SDimitry Andric   case TargetOpcode::G_INTRINSIC_TRUNC:
17168bcb0991SDimitry Andric     return legalizeIntrinsicTrunc(MI, MRI, B);
17170b57cec5SDimitry Andric   case TargetOpcode::G_SITOFP:
17188bcb0991SDimitry Andric     return legalizeITOFP(MI, MRI, B, true);
17190b57cec5SDimitry Andric   case TargetOpcode::G_UITOFP:
17208bcb0991SDimitry Andric     return legalizeITOFP(MI, MRI, B, false);
17215ffd83dbSDimitry Andric   case TargetOpcode::G_FPTOSI:
17225ffd83dbSDimitry Andric     return legalizeFPTOI(MI, MRI, B, true);
17235ffd83dbSDimitry Andric   case TargetOpcode::G_FPTOUI:
17245ffd83dbSDimitry Andric     return legalizeFPTOI(MI, MRI, B, false);
17250b57cec5SDimitry Andric   case TargetOpcode::G_FMINNUM:
17260b57cec5SDimitry Andric   case TargetOpcode::G_FMAXNUM:
17270b57cec5SDimitry Andric   case TargetOpcode::G_FMINNUM_IEEE:
17280b57cec5SDimitry Andric   case TargetOpcode::G_FMAXNUM_IEEE:
17295ffd83dbSDimitry Andric     return legalizeMinNumMaxNum(Helper, MI);
17300b57cec5SDimitry Andric   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
17318bcb0991SDimitry Andric     return legalizeExtractVectorElt(MI, MRI, B);
17320b57cec5SDimitry Andric   case TargetOpcode::G_INSERT_VECTOR_ELT:
17338bcb0991SDimitry Andric     return legalizeInsertVectorElt(MI, MRI, B);
17345ffd83dbSDimitry Andric   case TargetOpcode::G_SHUFFLE_VECTOR:
17355ffd83dbSDimitry Andric     return legalizeShuffleVector(MI, MRI, B);
17368bcb0991SDimitry Andric   case TargetOpcode::G_FSIN:
17378bcb0991SDimitry Andric   case TargetOpcode::G_FCOS:
17388bcb0991SDimitry Andric     return legalizeSinCos(MI, MRI, B);
17398bcb0991SDimitry Andric   case TargetOpcode::G_GLOBAL_VALUE:
17408bcb0991SDimitry Andric     return legalizeGlobalValue(MI, MRI, B);
17418bcb0991SDimitry Andric   case TargetOpcode::G_LOAD:
1742fe6060f1SDimitry Andric   case TargetOpcode::G_SEXTLOAD:
1743fe6060f1SDimitry Andric   case TargetOpcode::G_ZEXTLOAD:
1744e8d8bef9SDimitry Andric     return legalizeLoad(Helper, MI);
17458bcb0991SDimitry Andric   case TargetOpcode::G_FMAD:
17468bcb0991SDimitry Andric     return legalizeFMad(MI, MRI, B);
17478bcb0991SDimitry Andric   case TargetOpcode::G_FDIV:
17488bcb0991SDimitry Andric     return legalizeFDIV(MI, MRI, B);
17495ffd83dbSDimitry Andric   case TargetOpcode::G_UDIV:
17505ffd83dbSDimitry Andric   case TargetOpcode::G_UREM:
1751fe6060f1SDimitry Andric   case TargetOpcode::G_UDIVREM:
1752fe6060f1SDimitry Andric     return legalizeUnsignedDIV_REM(MI, MRI, B);
17535ffd83dbSDimitry Andric   case TargetOpcode::G_SDIV:
17545ffd83dbSDimitry Andric   case TargetOpcode::G_SREM:
1755fe6060f1SDimitry Andric   case TargetOpcode::G_SDIVREM:
1756fe6060f1SDimitry Andric     return legalizeSignedDIV_REM(MI, MRI, B);
1757480093f4SDimitry Andric   case TargetOpcode::G_ATOMIC_CMPXCHG:
1758480093f4SDimitry Andric     return legalizeAtomicCmpXChg(MI, MRI, B);
17595ffd83dbSDimitry Andric   case TargetOpcode::G_FLOG:
17605ffd83dbSDimitry Andric     return legalizeFlog(MI, B, numbers::ln2f);
17615ffd83dbSDimitry Andric   case TargetOpcode::G_FLOG10:
17625ffd83dbSDimitry Andric     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
17635ffd83dbSDimitry Andric   case TargetOpcode::G_FEXP:
17645ffd83dbSDimitry Andric     return legalizeFExp(MI, B);
17655ffd83dbSDimitry Andric   case TargetOpcode::G_FPOW:
17665ffd83dbSDimitry Andric     return legalizeFPow(MI, B);
17675ffd83dbSDimitry Andric   case TargetOpcode::G_FFLOOR:
17685ffd83dbSDimitry Andric     return legalizeFFloor(MI, MRI, B);
17695ffd83dbSDimitry Andric   case TargetOpcode::G_BUILD_VECTOR:
17705ffd83dbSDimitry Andric     return legalizeBuildVector(MI, MRI, B);
1771*349cc55cSDimitry Andric   case TargetOpcode::G_CTLZ:
1772*349cc55cSDimitry Andric   case TargetOpcode::G_CTTZ:
1773*349cc55cSDimitry Andric     return legalizeCTLZ_CTTZ(MI, MRI, B);
17740b57cec5SDimitry Andric   default:
17750b57cec5SDimitry Andric     return false;
17760b57cec5SDimitry Andric   }
17770b57cec5SDimitry Andric 
17780b57cec5SDimitry Andric   llvm_unreachable("expected switch to return");
17790b57cec5SDimitry Andric }
17800b57cec5SDimitry Andric 
17810b57cec5SDimitry Andric Register AMDGPULegalizerInfo::getSegmentAperture(
17820b57cec5SDimitry Andric   unsigned AS,
17830b57cec5SDimitry Andric   MachineRegisterInfo &MRI,
17848bcb0991SDimitry Andric   MachineIRBuilder &B) const {
17858bcb0991SDimitry Andric   MachineFunction &MF = B.getMF();
17860b57cec5SDimitry Andric   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
17870b57cec5SDimitry Andric   const LLT S32 = LLT::scalar(32);
17880b57cec5SDimitry Andric 
17898bcb0991SDimitry Andric   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
17908bcb0991SDimitry Andric 
17910b57cec5SDimitry Andric   if (ST.hasApertureRegs()) {
17920b57cec5SDimitry Andric     // FIXME: Use inline constants (src_{shared, private}_base) instead of
17930b57cec5SDimitry Andric     // getreg.
17940b57cec5SDimitry Andric     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
17950b57cec5SDimitry Andric         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
17960b57cec5SDimitry Andric         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
17970b57cec5SDimitry Andric     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
17980b57cec5SDimitry Andric         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
17990b57cec5SDimitry Andric         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
18000b57cec5SDimitry Andric     unsigned Encoding =
18010b57cec5SDimitry Andric         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
18020b57cec5SDimitry Andric         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
18030b57cec5SDimitry Andric         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
18040b57cec5SDimitry Andric 
18050b57cec5SDimitry Andric     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
18060b57cec5SDimitry Andric 
18078bcb0991SDimitry Andric     B.buildInstr(AMDGPU::S_GETREG_B32)
18080b57cec5SDimitry Andric       .addDef(GetReg)
18090b57cec5SDimitry Andric       .addImm(Encoding);
18100b57cec5SDimitry Andric     MRI.setType(GetReg, S32);
18110b57cec5SDimitry Andric 
18128bcb0991SDimitry Andric     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
18135ffd83dbSDimitry Andric     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
18140b57cec5SDimitry Andric   }
18150b57cec5SDimitry Andric 
18160b57cec5SDimitry Andric   Register QueuePtr = MRI.createGenericVirtualRegister(
18170b57cec5SDimitry Andric     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
18180b57cec5SDimitry Andric 
1819e8d8bef9SDimitry Andric   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
18208bcb0991SDimitry Andric     return Register();
18210b57cec5SDimitry Andric 
18220b57cec5SDimitry Andric   // Offset into amd_queue_t for group_segment_aperture_base_hi /
18230b57cec5SDimitry Andric   // private_segment_aperture_base_hi.
18240b57cec5SDimitry Andric   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
18250b57cec5SDimitry Andric 
1826480093f4SDimitry Andric   // TODO: can we be smarter about machine pointer info?
1827480093f4SDimitry Andric   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
18280b57cec5SDimitry Andric   MachineMemOperand *MMO = MF.getMachineMemOperand(
18290b57cec5SDimitry Andric       PtrInfo,
18305ffd83dbSDimitry Andric       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
18310b57cec5SDimitry Andric           MachineMemOperand::MOInvariant,
1832fe6060f1SDimitry Andric       LLT::scalar(32), commonAlignment(Align(64), StructOffset));
18330b57cec5SDimitry Andric 
18340b57cec5SDimitry Andric   Register LoadAddr;
18350b57cec5SDimitry Andric 
1836480093f4SDimitry Andric   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
18375ffd83dbSDimitry Andric   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
18380b57cec5SDimitry Andric }
18390b57cec5SDimitry Andric 
18400b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
18410b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
18428bcb0991SDimitry Andric   MachineIRBuilder &B) const {
18438bcb0991SDimitry Andric   MachineFunction &MF = B.getMF();
18440b57cec5SDimitry Andric 
18458bcb0991SDimitry Andric   const LLT S32 = LLT::scalar(32);
18460b57cec5SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
18470b57cec5SDimitry Andric   Register Src = MI.getOperand(1).getReg();
18480b57cec5SDimitry Andric 
18490b57cec5SDimitry Andric   LLT DstTy = MRI.getType(Dst);
18500b57cec5SDimitry Andric   LLT SrcTy = MRI.getType(Src);
18510b57cec5SDimitry Andric   unsigned DestAS = DstTy.getAddressSpace();
18520b57cec5SDimitry Andric   unsigned SrcAS = SrcTy.getAddressSpace();
18530b57cec5SDimitry Andric 
18540b57cec5SDimitry Andric   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
18550b57cec5SDimitry Andric   // vector element.
18560b57cec5SDimitry Andric   assert(!DstTy.isVector());
18570b57cec5SDimitry Andric 
18580b57cec5SDimitry Andric   const AMDGPUTargetMachine &TM
18590b57cec5SDimitry Andric     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
18600b57cec5SDimitry Andric 
1861e8d8bef9SDimitry Andric   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
18628bcb0991SDimitry Andric     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
18638bcb0991SDimitry Andric     return true;
18648bcb0991SDimitry Andric   }
18658bcb0991SDimitry Andric 
18668bcb0991SDimitry Andric   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
18678bcb0991SDimitry Andric     // Truncate.
18688bcb0991SDimitry Andric     B.buildExtract(Dst, Src, 0);
18698bcb0991SDimitry Andric     MI.eraseFromParent();
18708bcb0991SDimitry Andric     return true;
18718bcb0991SDimitry Andric   }
18728bcb0991SDimitry Andric 
18738bcb0991SDimitry Andric   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
18748bcb0991SDimitry Andric     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
18758bcb0991SDimitry Andric     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
18768bcb0991SDimitry Andric 
18778bcb0991SDimitry Andric     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
18788bcb0991SDimitry Andric     // another. Merge operands are required to be the same type, but creating an
18798bcb0991SDimitry Andric     // extra ptrtoint would be kind of pointless.
18808bcb0991SDimitry Andric     auto HighAddr = B.buildConstant(
18818bcb0991SDimitry Andric       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
18825ffd83dbSDimitry Andric     B.buildMerge(Dst, {Src, HighAddr});
18838bcb0991SDimitry Andric     MI.eraseFromParent();
18840b57cec5SDimitry Andric     return true;
18850b57cec5SDimitry Andric   }
18860b57cec5SDimitry Andric 
18870b57cec5SDimitry Andric   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
18880b57cec5SDimitry Andric     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
18890b57cec5SDimitry Andric            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
18900b57cec5SDimitry Andric     unsigned NullVal = TM.getNullPointerValue(DestAS);
18910b57cec5SDimitry Andric 
18928bcb0991SDimitry Andric     auto SegmentNull = B.buildConstant(DstTy, NullVal);
18938bcb0991SDimitry Andric     auto FlatNull = B.buildConstant(SrcTy, 0);
18940b57cec5SDimitry Andric 
18950b57cec5SDimitry Andric     // Extract low 32-bits of the pointer.
18965ffd83dbSDimitry Andric     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
18970b57cec5SDimitry Andric 
18985ffd83dbSDimitry Andric     auto CmpRes =
18995ffd83dbSDimitry Andric         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
19008bcb0991SDimitry Andric     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
19010b57cec5SDimitry Andric 
19020b57cec5SDimitry Andric     MI.eraseFromParent();
19030b57cec5SDimitry Andric     return true;
19040b57cec5SDimitry Andric   }
19050b57cec5SDimitry Andric 
19068bcb0991SDimitry Andric   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
19078bcb0991SDimitry Andric     return false;
19088bcb0991SDimitry Andric 
19098bcb0991SDimitry Andric   if (!ST.hasFlatAddressSpace())
19108bcb0991SDimitry Andric     return false;
19110b57cec5SDimitry Andric 
19120b57cec5SDimitry Andric   auto SegmentNull =
19138bcb0991SDimitry Andric       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
19140b57cec5SDimitry Andric   auto FlatNull =
19158bcb0991SDimitry Andric       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
19160b57cec5SDimitry Andric 
19178bcb0991SDimitry Andric   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
19188bcb0991SDimitry Andric   if (!ApertureReg.isValid())
19198bcb0991SDimitry Andric     return false;
19200b57cec5SDimitry Andric 
19215ffd83dbSDimitry Andric   auto CmpRes =
19225ffd83dbSDimitry Andric       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
19230b57cec5SDimitry Andric 
19240b57cec5SDimitry Andric   // Coerce the type of the low half of the result so we can use merge_values.
19255ffd83dbSDimitry Andric   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
19260b57cec5SDimitry Andric 
19270b57cec5SDimitry Andric   // TODO: Should we allow mismatched types but matching sizes in merges to
19280b57cec5SDimitry Andric   // avoid the ptrtoint?
19295ffd83dbSDimitry Andric   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
19305ffd83dbSDimitry Andric   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
19310b57cec5SDimitry Andric 
19320b57cec5SDimitry Andric   MI.eraseFromParent();
19330b57cec5SDimitry Andric   return true;
19340b57cec5SDimitry Andric }
19350b57cec5SDimitry Andric 
19360b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeFrint(
19370b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
19388bcb0991SDimitry Andric   MachineIRBuilder &B) const {
19390b57cec5SDimitry Andric   Register Src = MI.getOperand(1).getReg();
19400b57cec5SDimitry Andric   LLT Ty = MRI.getType(Src);
19410b57cec5SDimitry Andric   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
19420b57cec5SDimitry Andric 
19430b57cec5SDimitry Andric   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
19440b57cec5SDimitry Andric   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
19450b57cec5SDimitry Andric 
19468bcb0991SDimitry Andric   auto C1 = B.buildFConstant(Ty, C1Val);
19478bcb0991SDimitry Andric   auto CopySign = B.buildFCopysign(Ty, C1, Src);
19480b57cec5SDimitry Andric 
19490b57cec5SDimitry Andric   // TODO: Should this propagate fast-math-flags?
19508bcb0991SDimitry Andric   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
19518bcb0991SDimitry Andric   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
19520b57cec5SDimitry Andric 
19538bcb0991SDimitry Andric   auto C2 = B.buildFConstant(Ty, C2Val);
19548bcb0991SDimitry Andric   auto Fabs = B.buildFAbs(Ty, Src);
19550b57cec5SDimitry Andric 
19568bcb0991SDimitry Andric   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
19578bcb0991SDimitry Andric   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1958e8d8bef9SDimitry Andric   MI.eraseFromParent();
19590b57cec5SDimitry Andric   return true;
19600b57cec5SDimitry Andric }
19610b57cec5SDimitry Andric 
19620b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeFceil(
19630b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
19640b57cec5SDimitry Andric   MachineIRBuilder &B) const {
19650b57cec5SDimitry Andric 
19660b57cec5SDimitry Andric   const LLT S1 = LLT::scalar(1);
19670b57cec5SDimitry Andric   const LLT S64 = LLT::scalar(64);
19680b57cec5SDimitry Andric 
19690b57cec5SDimitry Andric   Register Src = MI.getOperand(1).getReg();
19700b57cec5SDimitry Andric   assert(MRI.getType(Src) == S64);
19710b57cec5SDimitry Andric 
19720b57cec5SDimitry Andric   // result = trunc(src)
19730b57cec5SDimitry Andric   // if (src > 0.0 && src != result)
19740b57cec5SDimitry Andric   //   result += 1.0
19750b57cec5SDimitry Andric 
19765ffd83dbSDimitry Andric   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
19770b57cec5SDimitry Andric 
19780b57cec5SDimitry Andric   const auto Zero = B.buildFConstant(S64, 0.0);
19790b57cec5SDimitry Andric   const auto One = B.buildFConstant(S64, 1.0);
19800b57cec5SDimitry Andric   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
19810b57cec5SDimitry Andric   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
19820b57cec5SDimitry Andric   auto And = B.buildAnd(S1, Lt0, NeTrunc);
19830b57cec5SDimitry Andric   auto Add = B.buildSelect(S64, And, One, Zero);
19840b57cec5SDimitry Andric 
19850b57cec5SDimitry Andric   // TODO: Should this propagate fast-math-flags?
19860b57cec5SDimitry Andric   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
19870b57cec5SDimitry Andric   return true;
19880b57cec5SDimitry Andric }
19890b57cec5SDimitry Andric 
1990e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeFrem(
1991e8d8bef9SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
1992e8d8bef9SDimitry Andric   MachineIRBuilder &B) const {
1993e8d8bef9SDimitry Andric     Register DstReg = MI.getOperand(0).getReg();
1994e8d8bef9SDimitry Andric     Register Src0Reg = MI.getOperand(1).getReg();
1995e8d8bef9SDimitry Andric     Register Src1Reg = MI.getOperand(2).getReg();
1996e8d8bef9SDimitry Andric     auto Flags = MI.getFlags();
1997e8d8bef9SDimitry Andric     LLT Ty = MRI.getType(DstReg);
1998e8d8bef9SDimitry Andric 
1999e8d8bef9SDimitry Andric     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2000e8d8bef9SDimitry Andric     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2001e8d8bef9SDimitry Andric     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2002e8d8bef9SDimitry Andric     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2003e8d8bef9SDimitry Andric     MI.eraseFromParent();
2004e8d8bef9SDimitry Andric     return true;
2005e8d8bef9SDimitry Andric }
2006e8d8bef9SDimitry Andric 
2007e8d8bef9SDimitry Andric static MachineInstrBuilder extractF64Exponent(Register Hi,
20080b57cec5SDimitry Andric                                               MachineIRBuilder &B) {
20090b57cec5SDimitry Andric   const unsigned FractBits = 52;
20100b57cec5SDimitry Andric   const unsigned ExpBits = 11;
20110b57cec5SDimitry Andric   LLT S32 = LLT::scalar(32);
20120b57cec5SDimitry Andric 
20130b57cec5SDimitry Andric   auto Const0 = B.buildConstant(S32, FractBits - 32);
20140b57cec5SDimitry Andric   auto Const1 = B.buildConstant(S32, ExpBits);
20150b57cec5SDimitry Andric 
20160b57cec5SDimitry Andric   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
2017e8d8bef9SDimitry Andric     .addUse(Hi)
20180b57cec5SDimitry Andric     .addUse(Const0.getReg(0))
20190b57cec5SDimitry Andric     .addUse(Const1.getReg(0));
20200b57cec5SDimitry Andric 
20210b57cec5SDimitry Andric   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
20220b57cec5SDimitry Andric }
20230b57cec5SDimitry Andric 
20240b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
20250b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
20260b57cec5SDimitry Andric   MachineIRBuilder &B) const {
20270b57cec5SDimitry Andric   const LLT S1 = LLT::scalar(1);
20280b57cec5SDimitry Andric   const LLT S32 = LLT::scalar(32);
20290b57cec5SDimitry Andric   const LLT S64 = LLT::scalar(64);
20300b57cec5SDimitry Andric 
20310b57cec5SDimitry Andric   Register Src = MI.getOperand(1).getReg();
20320b57cec5SDimitry Andric   assert(MRI.getType(Src) == S64);
20330b57cec5SDimitry Andric 
20340b57cec5SDimitry Andric   // TODO: Should this use extract since the low half is unused?
20350b57cec5SDimitry Andric   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
20360b57cec5SDimitry Andric   Register Hi = Unmerge.getReg(1);
20370b57cec5SDimitry Andric 
20380b57cec5SDimitry Andric   // Extract the upper half, since this is where we will find the sign and
20390b57cec5SDimitry Andric   // exponent.
20400b57cec5SDimitry Andric   auto Exp = extractF64Exponent(Hi, B);
20410b57cec5SDimitry Andric 
20420b57cec5SDimitry Andric   const unsigned FractBits = 52;
20430b57cec5SDimitry Andric 
20440b57cec5SDimitry Andric   // Extract the sign bit.
20450b57cec5SDimitry Andric   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
20460b57cec5SDimitry Andric   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
20470b57cec5SDimitry Andric 
20480b57cec5SDimitry Andric   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
20490b57cec5SDimitry Andric 
20500b57cec5SDimitry Andric   const auto Zero32 = B.buildConstant(S32, 0);
20510b57cec5SDimitry Andric 
20520b57cec5SDimitry Andric   // Extend back to 64-bits.
20535ffd83dbSDimitry Andric   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
20540b57cec5SDimitry Andric 
20550b57cec5SDimitry Andric   auto Shr = B.buildAShr(S64, FractMask, Exp);
20560b57cec5SDimitry Andric   auto Not = B.buildNot(S64, Shr);
20570b57cec5SDimitry Andric   auto Tmp0 = B.buildAnd(S64, Src, Not);
20580b57cec5SDimitry Andric   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
20590b57cec5SDimitry Andric 
20600b57cec5SDimitry Andric   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
20610b57cec5SDimitry Andric   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
20620b57cec5SDimitry Andric 
20630b57cec5SDimitry Andric   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
20640b57cec5SDimitry Andric   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2065e8d8bef9SDimitry Andric   MI.eraseFromParent();
20660b57cec5SDimitry Andric   return true;
20670b57cec5SDimitry Andric }
20680b57cec5SDimitry Andric 
// Expand G_SITOFP/G_UITOFP with an s64 source. For an s64 (f64) result the
// two 32-bit halves are converted separately and combined as
// hi * 2^32 + lo via ldexp + fadd. For an s32 (f32) result the source is
// first normalized so its significant bits fit in the high 32 bits, then
// converted and rescaled with ldexp.
bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  auto ThirtyTwo = B.buildConstant(S32, 32);

  if (MRI.getType(Dst) == S64) {
    // High half carries the sign (if any); low half is always unsigned.
    auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
                        : B.buildUITOFP(S64, Unmerge.getReg(1));

    auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
    // ldexp(hi, 32) == hi * 2^32.
    auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
                     .addUse(CvtHi.getReg(0))
                     .addUse(ThirtyTwo.getReg(0));

    // TODO: Should this propagate fast-math-flags?
    B.buildFAdd(Dst, LdExp, CvtLo);
    MI.eraseFromParent();
    return true;
  }

  assert(MRI.getType(Dst) == S32);

  auto One = B.buildConstant(S32, 1);

  // Compute a left-shift amount that brings the significant bits of the
  // 64-bit source into the high 32 bits.
  MachineInstrBuilder ShAmt;
  if (Signed) {
    // Signed case: use amdgcn_sffbh on the high half to locate the leading
    // significant bit, clamped so the shift never exceeds 32 (adjusted by
    // one when the halves have opposite signs).
    auto ThirtyOne = B.buildConstant(S32, 31);
    auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
    auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
    auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
    auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
                               /*HasSideEffects=*/false)
                  .addUse(Unmerge.getReg(1));
    auto LS2 = B.buildSub(S32, LS, One);
    ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
  } else
    ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
  // Normalize, then OR any nonzero low bits into the LSB of the high half
  // (a sticky bit) so the final conversion rounds correctly; rescale the
  // converted value by 2^(32 - ShAmt).
  auto Norm = B.buildShl(S64, Src, ShAmt);
  auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
  auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
  auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
  auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
  auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
  B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
                   /*HasSideEffects=*/false)
      .addUse(FVal.getReg(0))
      .addUse(Scale.getReg(0));
  MI.eraseFromParent();
  return true;
}
2129*349cc55cSDimitry Andric 
// TODO: Copied from DAG implementation. Verify logic and document how this
// actually works.

// Expand G_FPTOSI/G_FPTOUI producing an s64 result from an s32 or s64
// floating-point source by splitting the value into high and low 32-bit
// integer parts (see the algorithm sketch below) and merging them.
bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
                                        MachineRegisterInfo &MRI,
                                        MachineIRBuilder &B,
                                        bool Signed) const {

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  const LLT SrcLT = MRI.getType(Src);
  assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);

  unsigned Flags = MI.getFlags();

  // The basic idea of converting a floating point number into a pair of 32-bit
  // integers is illustrated as follows:
  //
  //     tf := trunc(val);
  //    hif := floor(tf * 2^-32);
  //    lof := tf - hif * 2^32; // lof is always positive due to floor.
  //     hi := fptoi(hif);
  //     lo := fptoi(lof);
  //
  auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
  MachineInstrBuilder Sign;
  if (Signed && SrcLT == S32) {
    // However, a 32-bit floating point number has only 23 bits mantissa and
    // it's not enough to hold all the significant bits of `lof` if val is
    // negative. To avoid the loss of precision, We need to take the absolute
    // value after truncating and flip the result back based on the original
    // signedness.
    Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
    Trunc = B.buildFAbs(S32, Trunc, Flags);
  }
  // K0 = 2^-32 and K1 = -2^32 in the source's precision.
  MachineInstrBuilder K0, K1;
  if (SrcLT == S64) {
    K0 = B.buildFConstant(S64,
                          BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
    K1 = B.buildFConstant(S64,
                          BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
  } else {
    K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
    K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
  }

  // hif = floor(tf * 2^-32); lof = fma(hif, -2^32, tf) = tf - hif * 2^32.
  auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
  auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
  auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);

  // Only the f64 signed path converts the high part with fptosi; in the f32
  // signed path the sign was already stripped above and is reapplied below.
  auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
                                     : B.buildFPTOUI(S32, FloorMul);
  auto Lo = B.buildFPTOUI(S32, Fma);

  if (Signed && SrcLT == S32) {
    // Flip the result based on the signedness, which is either all 0s or 1s.
    Sign = B.buildMerge(S64, {Sign, Sign});
    // r := xor({lo, hi}, sign) - sign;
    B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
  } else
    B.buildMerge(Dst, {Lo, Hi});
  MI.eraseFromParent();

  return true;
}
21985ffd83dbSDimitry Andric 
21995ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
22005ffd83dbSDimitry Andric                                                MachineInstr &MI) const {
22015ffd83dbSDimitry Andric   MachineFunction &MF = Helper.MIRBuilder.getMF();
22020b57cec5SDimitry Andric   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
22030b57cec5SDimitry Andric 
22040b57cec5SDimitry Andric   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
22050b57cec5SDimitry Andric                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
22060b57cec5SDimitry Andric 
22070b57cec5SDimitry Andric   // With ieee_mode disabled, the instructions have the correct behavior
22080b57cec5SDimitry Andric   // already for G_FMINNUM/G_FMAXNUM
22090b57cec5SDimitry Andric   if (!MFI->getMode().IEEE)
22100b57cec5SDimitry Andric     return !IsIEEEOp;
22110b57cec5SDimitry Andric 
22120b57cec5SDimitry Andric   if (IsIEEEOp)
22130b57cec5SDimitry Andric     return true;
22140b57cec5SDimitry Andric 
22150b57cec5SDimitry Andric   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
22160b57cec5SDimitry Andric }
22170b57cec5SDimitry Andric 
22180b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
22190b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
22200b57cec5SDimitry Andric   MachineIRBuilder &B) const {
22210b57cec5SDimitry Andric   // TODO: Should move some of this into LegalizerHelper.
22220b57cec5SDimitry Andric 
22230b57cec5SDimitry Andric   // TODO: Promote dynamic indexing of s16 to s32
22245ffd83dbSDimitry Andric 
22255ffd83dbSDimitry Andric   // FIXME: Artifact combiner probably should have replaced the truncated
22265ffd83dbSDimitry Andric   // constant before this, so we shouldn't need
2227*349cc55cSDimitry Andric   // getIConstantVRegValWithLookThrough.
2228e8d8bef9SDimitry Andric   Optional<ValueAndVReg> MaybeIdxVal =
2229*349cc55cSDimitry Andric       getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2230e8d8bef9SDimitry Andric   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
22310b57cec5SDimitry Andric     return true;
2232e8d8bef9SDimitry Andric   const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
22330b57cec5SDimitry Andric 
22340b57cec5SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
22350b57cec5SDimitry Andric   Register Vec = MI.getOperand(1).getReg();
22360b57cec5SDimitry Andric 
22370b57cec5SDimitry Andric   LLT VecTy = MRI.getType(Vec);
22380b57cec5SDimitry Andric   LLT EltTy = VecTy.getElementType();
22390b57cec5SDimitry Andric   assert(EltTy == MRI.getType(Dst));
22400b57cec5SDimitry Andric 
2241e8d8bef9SDimitry Andric   if (IdxVal < VecTy.getNumElements())
2242e8d8bef9SDimitry Andric     B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
22430b57cec5SDimitry Andric   else
22440b57cec5SDimitry Andric     B.buildUndef(Dst);
22450b57cec5SDimitry Andric 
22460b57cec5SDimitry Andric   MI.eraseFromParent();
22470b57cec5SDimitry Andric   return true;
22480b57cec5SDimitry Andric }
22490b57cec5SDimitry Andric 
22500b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
22510b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
22520b57cec5SDimitry Andric   MachineIRBuilder &B) const {
22530b57cec5SDimitry Andric   // TODO: Should move some of this into LegalizerHelper.
22540b57cec5SDimitry Andric 
22550b57cec5SDimitry Andric   // TODO: Promote dynamic indexing of s16 to s32
22565ffd83dbSDimitry Andric 
22575ffd83dbSDimitry Andric   // FIXME: Artifact combiner probably should have replaced the truncated
22585ffd83dbSDimitry Andric   // constant before this, so we shouldn't need
2259*349cc55cSDimitry Andric   // getIConstantVRegValWithLookThrough.
2260e8d8bef9SDimitry Andric   Optional<ValueAndVReg> MaybeIdxVal =
2261*349cc55cSDimitry Andric       getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2262e8d8bef9SDimitry Andric   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
22630b57cec5SDimitry Andric     return true;
22640b57cec5SDimitry Andric 
2265e8d8bef9SDimitry Andric   int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
22660b57cec5SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
22670b57cec5SDimitry Andric   Register Vec = MI.getOperand(1).getReg();
22680b57cec5SDimitry Andric   Register Ins = MI.getOperand(2).getReg();
22690b57cec5SDimitry Andric 
22700b57cec5SDimitry Andric   LLT VecTy = MRI.getType(Vec);
22710b57cec5SDimitry Andric   LLT EltTy = VecTy.getElementType();
22720b57cec5SDimitry Andric   assert(EltTy == MRI.getType(Ins));
22730b57cec5SDimitry Andric 
2274e8d8bef9SDimitry Andric   if (IdxVal < VecTy.getNumElements())
2275e8d8bef9SDimitry Andric     B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
22760b57cec5SDimitry Andric   else
22770b57cec5SDimitry Andric     B.buildUndef(Dst);
22780b57cec5SDimitry Andric 
22790b57cec5SDimitry Andric   MI.eraseFromParent();
22800b57cec5SDimitry Andric   return true;
22810b57cec5SDimitry Andric }
22820b57cec5SDimitry Andric 
22835ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeShuffleVector(
22845ffd83dbSDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
22855ffd83dbSDimitry Andric   MachineIRBuilder &B) const {
2286fe6060f1SDimitry Andric   const LLT V2S16 = LLT::fixed_vector(2, 16);
22875ffd83dbSDimitry Andric 
22885ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
22895ffd83dbSDimitry Andric   Register Src0 = MI.getOperand(1).getReg();
22905ffd83dbSDimitry Andric   LLT DstTy = MRI.getType(Dst);
22915ffd83dbSDimitry Andric   LLT SrcTy = MRI.getType(Src0);
22925ffd83dbSDimitry Andric 
22935ffd83dbSDimitry Andric   if (SrcTy == V2S16 && DstTy == V2S16 &&
22945ffd83dbSDimitry Andric       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
22955ffd83dbSDimitry Andric     return true;
22965ffd83dbSDimitry Andric 
22975ffd83dbSDimitry Andric   MachineIRBuilder HelperBuilder(MI);
22985ffd83dbSDimitry Andric   GISelObserverWrapper DummyObserver;
22995ffd83dbSDimitry Andric   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
23005ffd83dbSDimitry Andric   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
23015ffd83dbSDimitry Andric }
23025ffd83dbSDimitry Andric 
23038bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeSinCos(
23048bcb0991SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
23058bcb0991SDimitry Andric   MachineIRBuilder &B) const {
23068bcb0991SDimitry Andric 
23078bcb0991SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
23088bcb0991SDimitry Andric   Register SrcReg = MI.getOperand(1).getReg();
23098bcb0991SDimitry Andric   LLT Ty = MRI.getType(DstReg);
23108bcb0991SDimitry Andric   unsigned Flags = MI.getFlags();
23118bcb0991SDimitry Andric 
23128bcb0991SDimitry Andric   Register TrigVal;
23135ffd83dbSDimitry Andric   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
23148bcb0991SDimitry Andric   if (ST.hasTrigReducedRange()) {
23158bcb0991SDimitry Andric     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
23168bcb0991SDimitry Andric     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
23178bcb0991SDimitry Andric       .addUse(MulVal.getReg(0))
23188bcb0991SDimitry Andric       .setMIFlags(Flags).getReg(0);
23198bcb0991SDimitry Andric   } else
23208bcb0991SDimitry Andric     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
23218bcb0991SDimitry Andric 
23228bcb0991SDimitry Andric   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
23238bcb0991SDimitry Andric     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
23248bcb0991SDimitry Andric   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
23258bcb0991SDimitry Andric     .addUse(TrigVal)
23268bcb0991SDimitry Andric     .setMIFlags(Flags);
23278bcb0991SDimitry Andric   MI.eraseFromParent();
23288bcb0991SDimitry Andric   return true;
23298bcb0991SDimitry Andric }
23308bcb0991SDimitry Andric 
// Materialize a pc-relative address for \p GV (+ \p Offset) into \p DstReg
// of pointer type \p PtrTy using the SI_PC_ADD_REL_OFFSET pseudo. \p GAFlags
// selects the relocation kind; MO_NONE emits a direct constant-address
// fixup with no high-half relocation.
bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
                                                  MachineIRBuilder &B,
                                                  const GlobalValue *GV,
                                                  int64_t Offset,
                                                  unsigned GAFlags) const {
  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
  // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
  // to the following code sequence:
  //
  // For constant address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol
  //   s_addc_u32 s1, s1, 0
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   a fixup or relocation is emitted to replace $symbol with a literal
  //   constant, which is a pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // For global address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   fixups or relocations are emitted to replace $symbol@*@lo and
  //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
  //   which is a 64-bit pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // What we want here is an offset from the value returned by s_getpc
  // (which is the address of the s_add_u32 instruction) to the global
  // variable, but since the encoding of $symbol starts 4 bytes after the start
  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
  // small. This requires us to add 4 to the global variable offset in order to
  // compute the correct address. Similarly for the s_addc_u32 instruction, the
  // encoding of $symbol starts 12 bytes after the start of the s_add_u32
  // instruction.

  LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);

  // The pseudo always computes a 64-bit address. For a 32-bit destination
  // pointer, compute into a fresh 64-bit register and extract the low half
  // at the end.
  Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
    B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

  MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
    .addDef(PCReg);

  // Low-half operand uses Offset + 4; high-half (when relocated) uses
  // Offset + 12, per the encoding distances described above.
  MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
  if (GAFlags == SIInstrInfo::MO_NONE)
    MIB.addImm(0);
  else
    MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);

  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

  if (PtrTy.getSizeInBits() == 32)
    B.buildExtract(DstReg, PCReg, 0);
  return true;
 }
23908bcb0991SDimitry Andric 
// Legalize G_GLOBAL_VALUE. LDS (local/region address space) globals resolve
// to a constant offset allocated by the MachineFunctionInfo (with special
// handling for absolute-symbol addressing, dynamic shared memory, and the
// illegal non-kernel case). All other globals use pc-relative addressing,
// either directly or through a GOT load.
bool AMDGPULegalizerInfo::legalizeGlobalValue(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned AS = Ty.getAddressSpace();

  const GlobalValue *GV = MI.getOperand(1).getGlobal();
  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    // The module-level LDS variable is the one exception allowed outside a
    // module entry function.
    if (!MFI->isModuleEntryFunction() &&
        !GV->getName().equals("llvm.amdgcn.module.lds")) {
      const Function &Fn = MF.getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
        Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
        DS_Warning);
      Fn.getContext().diagnose(BadLDSDecl);

      // We currently don't have a way to correctly allocate LDS objects that
      // aren't directly associated with a kernel. We do force inlining of
      // functions that use local objects. However, if these dead functions are
      // not eliminated, we don't want a compile time error. Just emit a warning
      // and a trap, since there should be no callable path here.
      B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
      B.buildUndef(DstReg);
      MI.eraseFromParent();
      return true;
    }

    // TODO: We could emit code to handle the initialization somewhere.
    // We ignore the initializer for now and legalize it to allow selection.
    // The initializer will anyway get errored out during assembly emission.
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->shouldUseLDSConstAddress(GV)) {
      // Keep the G_GLOBAL_VALUE and tag it for absolute (low 32-bit symbol)
      // addressing instead of folding it to a constant here.
      MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
      return true; // Leave in place;
    }

    if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
      Type *Ty = GV->getValueType();
      // HIP uses an unsized array `extern __shared__ T s[]` or similar
      // zero-sized type in other languages to declare the dynamic shared
      // memory which size is not known at the compile time. They will be
      // allocated by the runtime and placed directly after the static
      // allocated ones. They all share the same offset.
      if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
        // Adjust alignment for that dynamic shared memory array.
        MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
        LLT S32 = LLT::scalar(32);
        // The dynamic region starts right after all static LDS, i.e. at
        // the group static size.
        auto Sz =
            B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
        B.buildIntToPtr(DstReg, Sz);
        MI.eraseFromParent();
        return true;
      }
    }

    // Static LDS: the address is simply the allocated offset.
    B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
                                                   *cast<GlobalVariable>(GV)));
    MI.eraseFromParent();
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  // Direct pc-relative fixup (no relocation flags).
  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  // Direct pc-relative relocation against the symbol itself.
  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  // Otherwise go through the GOT: compute the GOT slot address pc-relative,
  // then load the actual address from it.
  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  // GOT entries are 64-bit, so a 32-bit result still loads a 64-bit pointer.
  LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
      MachinePointerInfo::getGOT(MF),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      LoadTy, Align(8));

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
  } else
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  return true;
}
24928bcb0991SDimitry Andric 
2493e8d8bef9SDimitry Andric static LLT widenToNextPowerOf2(LLT Ty) {
2494e8d8bef9SDimitry Andric   if (Ty.isVector())
2495fe6060f1SDimitry Andric     return Ty.changeElementCount(
2496fe6060f1SDimitry Andric         ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2497e8d8bef9SDimitry Andric   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2498e8d8bef9SDimitry Andric }
2499e8d8bef9SDimitry Andric 
2500e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2501e8d8bef9SDimitry Andric                                        MachineInstr &MI) const {
2502e8d8bef9SDimitry Andric   MachineIRBuilder &B = Helper.MIRBuilder;
2503e8d8bef9SDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
2504e8d8bef9SDimitry Andric   GISelChangeObserver &Observer = Helper.Observer;
2505e8d8bef9SDimitry Andric 
2506e8d8bef9SDimitry Andric   Register PtrReg = MI.getOperand(1).getReg();
2507e8d8bef9SDimitry Andric   LLT PtrTy = MRI.getType(PtrReg);
2508e8d8bef9SDimitry Andric   unsigned AddrSpace = PtrTy.getAddressSpace();
2509e8d8bef9SDimitry Andric 
2510e8d8bef9SDimitry Andric   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
25118bcb0991SDimitry Andric     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2512e8d8bef9SDimitry Andric     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
25138bcb0991SDimitry Andric     Observer.changingInstr(MI);
25148bcb0991SDimitry Andric     MI.getOperand(1).setReg(Cast.getReg(0));
25158bcb0991SDimitry Andric     Observer.changedInstr(MI);
25168bcb0991SDimitry Andric     return true;
25178bcb0991SDimitry Andric   }
25188bcb0991SDimitry Andric 
2519fe6060f1SDimitry Andric   if (MI.getOpcode() != AMDGPU::G_LOAD)
2520fe6060f1SDimitry Andric     return false;
2521fe6060f1SDimitry Andric 
2522e8d8bef9SDimitry Andric   Register ValReg = MI.getOperand(0).getReg();
2523e8d8bef9SDimitry Andric   LLT ValTy = MRI.getType(ValReg);
2524e8d8bef9SDimitry Andric 
2525e8d8bef9SDimitry Andric   MachineMemOperand *MMO = *MI.memoperands_begin();
2526e8d8bef9SDimitry Andric   const unsigned ValSize = ValTy.getSizeInBits();
2527fe6060f1SDimitry Andric   const LLT MemTy = MMO->getMemoryType();
2528e8d8bef9SDimitry Andric   const Align MemAlign = MMO->getAlign();
2529fe6060f1SDimitry Andric   const unsigned MemSize = MemTy.getSizeInBits();
2530e8d8bef9SDimitry Andric   const unsigned AlignInBits = 8 * MemAlign.value();
2531e8d8bef9SDimitry Andric 
2532e8d8bef9SDimitry Andric   // Widen non-power-of-2 loads to the alignment if needed
2533fe6060f1SDimitry Andric   if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
2534e8d8bef9SDimitry Andric     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2535e8d8bef9SDimitry Andric 
2536e8d8bef9SDimitry Andric     // This was already the correct extending load result type, so just adjust
2537e8d8bef9SDimitry Andric     // the memory type.
2538e8d8bef9SDimitry Andric     if (WideMemSize == ValSize) {
2539e8d8bef9SDimitry Andric       MachineFunction &MF = B.getMF();
2540e8d8bef9SDimitry Andric 
2541e8d8bef9SDimitry Andric       MachineMemOperand *WideMMO =
2542e8d8bef9SDimitry Andric           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2543e8d8bef9SDimitry Andric       Observer.changingInstr(MI);
2544e8d8bef9SDimitry Andric       MI.setMemRefs(MF, {WideMMO});
2545e8d8bef9SDimitry Andric       Observer.changedInstr(MI);
2546e8d8bef9SDimitry Andric       return true;
2547e8d8bef9SDimitry Andric     }
2548e8d8bef9SDimitry Andric 
2549e8d8bef9SDimitry Andric     // Don't bother handling edge case that should probably never be produced.
2550e8d8bef9SDimitry Andric     if (ValSize > WideMemSize)
2551e8d8bef9SDimitry Andric       return false;
2552e8d8bef9SDimitry Andric 
2553e8d8bef9SDimitry Andric     LLT WideTy = widenToNextPowerOf2(ValTy);
2554e8d8bef9SDimitry Andric 
2555e8d8bef9SDimitry Andric     Register WideLoad;
2556e8d8bef9SDimitry Andric     if (!WideTy.isVector()) {
2557e8d8bef9SDimitry Andric       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2558e8d8bef9SDimitry Andric       B.buildTrunc(ValReg, WideLoad).getReg(0);
2559e8d8bef9SDimitry Andric     } else {
2560e8d8bef9SDimitry Andric       // Extract the subvector.
2561e8d8bef9SDimitry Andric 
2562e8d8bef9SDimitry Andric       if (isRegisterType(ValTy)) {
2563e8d8bef9SDimitry Andric         // If this a case where G_EXTRACT is legal, use it.
2564e8d8bef9SDimitry Andric         // (e.g. <3 x s32> -> <4 x s32>)
2565e8d8bef9SDimitry Andric         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2566e8d8bef9SDimitry Andric         B.buildExtract(ValReg, WideLoad, 0);
2567e8d8bef9SDimitry Andric       } else {
2568e8d8bef9SDimitry Andric         // For cases where the widened type isn't a nice register value, unmerge
2569e8d8bef9SDimitry Andric         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2570e8d8bef9SDimitry Andric         B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2571e8d8bef9SDimitry Andric         WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2572e8d8bef9SDimitry Andric         B.setInsertPt(B.getMBB(), MI.getIterator());
2573e8d8bef9SDimitry Andric         B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2574e8d8bef9SDimitry Andric       }
2575e8d8bef9SDimitry Andric     }
2576e8d8bef9SDimitry Andric 
2577e8d8bef9SDimitry Andric     MI.eraseFromParent();
2578e8d8bef9SDimitry Andric     return true;
2579e8d8bef9SDimitry Andric   }
2580e8d8bef9SDimitry Andric 
2581e8d8bef9SDimitry Andric   return false;
2582e8d8bef9SDimitry Andric }
2583e8d8bef9SDimitry Andric 
25848bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFMad(
25858bcb0991SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
25868bcb0991SDimitry Andric   MachineIRBuilder &B) const {
25878bcb0991SDimitry Andric   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
25888bcb0991SDimitry Andric   assert(Ty.isScalar());
25898bcb0991SDimitry Andric 
2590480093f4SDimitry Andric   MachineFunction &MF = B.getMF();
2591480093f4SDimitry Andric   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2592480093f4SDimitry Andric 
25938bcb0991SDimitry Andric   // TODO: Always legal with future ftz flag.
25945ffd83dbSDimitry Andric   // FIXME: Do we need just output?
25955ffd83dbSDimitry Andric   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
25968bcb0991SDimitry Andric     return true;
25975ffd83dbSDimitry Andric   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
25988bcb0991SDimitry Andric     return true;
25998bcb0991SDimitry Andric 
26008bcb0991SDimitry Andric   MachineIRBuilder HelperBuilder(MI);
26018bcb0991SDimitry Andric   GISelObserverWrapper DummyObserver;
26028bcb0991SDimitry Andric   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
26038bcb0991SDimitry Andric   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
26048bcb0991SDimitry Andric }
26058bcb0991SDimitry Andric 
2606480093f4SDimitry Andric bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2607480093f4SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2608480093f4SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
2609480093f4SDimitry Andric   Register PtrReg = MI.getOperand(1).getReg();
2610480093f4SDimitry Andric   Register CmpVal = MI.getOperand(2).getReg();
2611480093f4SDimitry Andric   Register NewVal = MI.getOperand(3).getReg();
2612480093f4SDimitry Andric 
2613e8d8bef9SDimitry Andric   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2614480093f4SDimitry Andric          "this should not have been custom lowered");
2615480093f4SDimitry Andric 
2616480093f4SDimitry Andric   LLT ValTy = MRI.getType(CmpVal);
2617fe6060f1SDimitry Andric   LLT VecTy = LLT::fixed_vector(2, ValTy);
2618480093f4SDimitry Andric 
2619480093f4SDimitry Andric   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2620480093f4SDimitry Andric 
2621480093f4SDimitry Andric   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2622480093f4SDimitry Andric     .addDef(DstReg)
2623480093f4SDimitry Andric     .addUse(PtrReg)
2624480093f4SDimitry Andric     .addUse(PackedVal)
2625480093f4SDimitry Andric     .setMemRefs(MI.memoperands());
2626480093f4SDimitry Andric 
2627480093f4SDimitry Andric   MI.eraseFromParent();
2628480093f4SDimitry Andric   return true;
2629480093f4SDimitry Andric }
2630480093f4SDimitry Andric 
26315ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeFlog(
26325ffd83dbSDimitry Andric   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
26335ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
26345ffd83dbSDimitry Andric   Register Src = MI.getOperand(1).getReg();
26355ffd83dbSDimitry Andric   LLT Ty = B.getMRI()->getType(Dst);
26365ffd83dbSDimitry Andric   unsigned Flags = MI.getFlags();
26375ffd83dbSDimitry Andric 
26385ffd83dbSDimitry Andric   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
26395ffd83dbSDimitry Andric   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
26405ffd83dbSDimitry Andric 
26415ffd83dbSDimitry Andric   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
26425ffd83dbSDimitry Andric   MI.eraseFromParent();
26435ffd83dbSDimitry Andric   return true;
26445ffd83dbSDimitry Andric }
26455ffd83dbSDimitry Andric 
26465ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
26475ffd83dbSDimitry Andric                                        MachineIRBuilder &B) const {
26485ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
26495ffd83dbSDimitry Andric   Register Src = MI.getOperand(1).getReg();
26505ffd83dbSDimitry Andric   unsigned Flags = MI.getFlags();
26515ffd83dbSDimitry Andric   LLT Ty = B.getMRI()->getType(Dst);
26525ffd83dbSDimitry Andric 
26535ffd83dbSDimitry Andric   auto K = B.buildFConstant(Ty, numbers::log2e);
26545ffd83dbSDimitry Andric   auto Mul = B.buildFMul(Ty, Src, K, Flags);
26555ffd83dbSDimitry Andric   B.buildFExp2(Dst, Mul, Flags);
26565ffd83dbSDimitry Andric   MI.eraseFromParent();
26575ffd83dbSDimitry Andric   return true;
26585ffd83dbSDimitry Andric }
26595ffd83dbSDimitry Andric 
26605ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
26615ffd83dbSDimitry Andric                                        MachineIRBuilder &B) const {
26625ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
26635ffd83dbSDimitry Andric   Register Src0 = MI.getOperand(1).getReg();
26645ffd83dbSDimitry Andric   Register Src1 = MI.getOperand(2).getReg();
26655ffd83dbSDimitry Andric   unsigned Flags = MI.getFlags();
26665ffd83dbSDimitry Andric   LLT Ty = B.getMRI()->getType(Dst);
26675ffd83dbSDimitry Andric   const LLT S16 = LLT::scalar(16);
26685ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
26695ffd83dbSDimitry Andric 
26705ffd83dbSDimitry Andric   if (Ty == S32) {
26715ffd83dbSDimitry Andric     auto Log = B.buildFLog2(S32, Src0, Flags);
26725ffd83dbSDimitry Andric     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
26735ffd83dbSDimitry Andric       .addUse(Log.getReg(0))
26745ffd83dbSDimitry Andric       .addUse(Src1)
26755ffd83dbSDimitry Andric       .setMIFlags(Flags);
26765ffd83dbSDimitry Andric     B.buildFExp2(Dst, Mul, Flags);
26775ffd83dbSDimitry Andric   } else if (Ty == S16) {
26785ffd83dbSDimitry Andric     // There's no f16 fmul_legacy, so we need to convert for it.
26795ffd83dbSDimitry Andric     auto Log = B.buildFLog2(S16, Src0, Flags);
26805ffd83dbSDimitry Andric     auto Ext0 = B.buildFPExt(S32, Log, Flags);
26815ffd83dbSDimitry Andric     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
26825ffd83dbSDimitry Andric     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
26835ffd83dbSDimitry Andric       .addUse(Ext0.getReg(0))
26845ffd83dbSDimitry Andric       .addUse(Ext1.getReg(0))
26855ffd83dbSDimitry Andric       .setMIFlags(Flags);
26865ffd83dbSDimitry Andric 
26875ffd83dbSDimitry Andric     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
26885ffd83dbSDimitry Andric   } else
26895ffd83dbSDimitry Andric     return false;
26905ffd83dbSDimitry Andric 
26915ffd83dbSDimitry Andric   MI.eraseFromParent();
26925ffd83dbSDimitry Andric   return true;
26935ffd83dbSDimitry Andric }
26945ffd83dbSDimitry Andric 
26955ffd83dbSDimitry Andric // Find a source register, ignoring any possible source modifiers.
26965ffd83dbSDimitry Andric static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
26975ffd83dbSDimitry Andric   Register ModSrc = OrigSrc;
26985ffd83dbSDimitry Andric   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
26995ffd83dbSDimitry Andric     ModSrc = SrcFNeg->getOperand(1).getReg();
27005ffd83dbSDimitry Andric     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
27015ffd83dbSDimitry Andric       ModSrc = SrcFAbs->getOperand(1).getReg();
27025ffd83dbSDimitry Andric   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
27035ffd83dbSDimitry Andric     ModSrc = SrcFAbs->getOperand(1).getReg();
27045ffd83dbSDimitry Andric   return ModSrc;
27055ffd83dbSDimitry Andric }
27065ffd83dbSDimitry Andric 
27075ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
27085ffd83dbSDimitry Andric                                          MachineRegisterInfo &MRI,
27095ffd83dbSDimitry Andric                                          MachineIRBuilder &B) const {
27105ffd83dbSDimitry Andric 
27115ffd83dbSDimitry Andric   const LLT S1 = LLT::scalar(1);
27125ffd83dbSDimitry Andric   const LLT S64 = LLT::scalar(64);
27135ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
27145ffd83dbSDimitry Andric   Register OrigSrc = MI.getOperand(1).getReg();
27155ffd83dbSDimitry Andric   unsigned Flags = MI.getFlags();
27165ffd83dbSDimitry Andric   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
27175ffd83dbSDimitry Andric          "this should not have been custom lowered");
27185ffd83dbSDimitry Andric 
27195ffd83dbSDimitry Andric   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
27205ffd83dbSDimitry Andric   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
27215ffd83dbSDimitry Andric   // efficient way to implement it is using V_FRACT_F64. The workaround for the
27225ffd83dbSDimitry Andric   // V_FRACT bug is:
27235ffd83dbSDimitry Andric   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
27245ffd83dbSDimitry Andric   //
27255ffd83dbSDimitry Andric   // Convert floor(x) to (x - fract(x))
27265ffd83dbSDimitry Andric 
27275ffd83dbSDimitry Andric   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
27285ffd83dbSDimitry Andric     .addUse(OrigSrc)
27295ffd83dbSDimitry Andric     .setMIFlags(Flags);
27305ffd83dbSDimitry Andric 
27315ffd83dbSDimitry Andric   // Give source modifier matching some assistance before obscuring a foldable
27325ffd83dbSDimitry Andric   // pattern.
27335ffd83dbSDimitry Andric 
27345ffd83dbSDimitry Andric   // TODO: We can avoid the neg on the fract? The input sign to fract
27355ffd83dbSDimitry Andric   // shouldn't matter?
27365ffd83dbSDimitry Andric   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
27375ffd83dbSDimitry Andric 
27385ffd83dbSDimitry Andric   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
27395ffd83dbSDimitry Andric 
27405ffd83dbSDimitry Andric   Register Min = MRI.createGenericVirtualRegister(S64);
27415ffd83dbSDimitry Andric 
27425ffd83dbSDimitry Andric   // We don't need to concern ourselves with the snan handling difference, so
27435ffd83dbSDimitry Andric   // use the one which will directly select.
27445ffd83dbSDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
27455ffd83dbSDimitry Andric   if (MFI->getMode().IEEE)
27465ffd83dbSDimitry Andric     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
27475ffd83dbSDimitry Andric   else
27485ffd83dbSDimitry Andric     B.buildFMinNum(Min, Fract, Const, Flags);
27495ffd83dbSDimitry Andric 
27505ffd83dbSDimitry Andric   Register CorrectedFract = Min;
27515ffd83dbSDimitry Andric   if (!MI.getFlag(MachineInstr::FmNoNans)) {
27525ffd83dbSDimitry Andric     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
27535ffd83dbSDimitry Andric     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
27545ffd83dbSDimitry Andric   }
27555ffd83dbSDimitry Andric 
27565ffd83dbSDimitry Andric   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
27575ffd83dbSDimitry Andric   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
27585ffd83dbSDimitry Andric 
27595ffd83dbSDimitry Andric   MI.eraseFromParent();
27605ffd83dbSDimitry Andric   return true;
27615ffd83dbSDimitry Andric }
27625ffd83dbSDimitry Andric 
27635ffd83dbSDimitry Andric // Turn an illegal packed v2s16 build vector into bit operations.
27645ffd83dbSDimitry Andric // TODO: This should probably be a bitcast action in LegalizerHelper.
27655ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeBuildVector(
27665ffd83dbSDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
27675ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
27685ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
2769fe6060f1SDimitry Andric   assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
27705ffd83dbSDimitry Andric 
27715ffd83dbSDimitry Andric   Register Src0 = MI.getOperand(1).getReg();
27725ffd83dbSDimitry Andric   Register Src1 = MI.getOperand(2).getReg();
27735ffd83dbSDimitry Andric   assert(MRI.getType(Src0) == LLT::scalar(16));
27745ffd83dbSDimitry Andric 
27755ffd83dbSDimitry Andric   auto Merge = B.buildMerge(S32, {Src0, Src1});
27765ffd83dbSDimitry Andric   B.buildBitcast(Dst, Merge);
27775ffd83dbSDimitry Andric 
27785ffd83dbSDimitry Andric   MI.eraseFromParent();
27795ffd83dbSDimitry Andric   return true;
27805ffd83dbSDimitry Andric }
27815ffd83dbSDimitry Andric 
2782*349cc55cSDimitry Andric // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
2783*349cc55cSDimitry Andric // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
2784*349cc55cSDimitry Andric // case with a single min instruction instead of a compare+select.
2785*349cc55cSDimitry Andric bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
2786*349cc55cSDimitry Andric                                             MachineRegisterInfo &MRI,
2787*349cc55cSDimitry Andric                                             MachineIRBuilder &B) const {
2788*349cc55cSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
2789*349cc55cSDimitry Andric   Register Src = MI.getOperand(1).getReg();
2790*349cc55cSDimitry Andric   LLT DstTy = MRI.getType(Dst);
2791*349cc55cSDimitry Andric   LLT SrcTy = MRI.getType(Src);
2792*349cc55cSDimitry Andric 
2793*349cc55cSDimitry Andric   unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
2794*349cc55cSDimitry Andric                         ? AMDGPU::G_AMDGPU_FFBH_U32
2795*349cc55cSDimitry Andric                         : AMDGPU::G_AMDGPU_FFBL_B32;
2796*349cc55cSDimitry Andric   auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
2797*349cc55cSDimitry Andric   B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
2798*349cc55cSDimitry Andric 
2799*349cc55cSDimitry Andric   MI.eraseFromParent();
2800*349cc55cSDimitry Andric   return true;
2801*349cc55cSDimitry Andric }
2802*349cc55cSDimitry Andric 
2803e8d8bef9SDimitry Andric // Check that this is a G_XOR x, -1
2804e8d8bef9SDimitry Andric static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2805e8d8bef9SDimitry Andric   if (MI.getOpcode() != TargetOpcode::G_XOR)
2806e8d8bef9SDimitry Andric     return false;
2807*349cc55cSDimitry Andric   auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2808e8d8bef9SDimitry Andric   return ConstVal && *ConstVal == -1;
2809e8d8bef9SDimitry Andric }
2810e8d8bef9SDimitry Andric 
28110b57cec5SDimitry Andric // Return the use branch instruction, otherwise null if the usage is invalid.
2812e8d8bef9SDimitry Andric static MachineInstr *
2813e8d8bef9SDimitry Andric verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2814e8d8bef9SDimitry Andric                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
28150b57cec5SDimitry Andric   Register CondDef = MI.getOperand(0).getReg();
28160b57cec5SDimitry Andric   if (!MRI.hasOneNonDBGUse(CondDef))
28170b57cec5SDimitry Andric     return nullptr;
28180b57cec5SDimitry Andric 
28195ffd83dbSDimitry Andric   MachineBasicBlock *Parent = MI.getParent();
2820e8d8bef9SDimitry Andric   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2821e8d8bef9SDimitry Andric 
2822e8d8bef9SDimitry Andric   if (isNot(MRI, *UseMI)) {
2823e8d8bef9SDimitry Andric     Register NegatedCond = UseMI->getOperand(0).getReg();
2824e8d8bef9SDimitry Andric     if (!MRI.hasOneNonDBGUse(NegatedCond))
2825e8d8bef9SDimitry Andric       return nullptr;
2826e8d8bef9SDimitry Andric 
2827e8d8bef9SDimitry Andric     // We're deleting the def of this value, so we need to remove it.
2828*349cc55cSDimitry Andric     eraseInstr(*UseMI, MRI);
2829e8d8bef9SDimitry Andric 
2830e8d8bef9SDimitry Andric     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2831e8d8bef9SDimitry Andric     Negated = true;
2832e8d8bef9SDimitry Andric   }
2833e8d8bef9SDimitry Andric 
2834e8d8bef9SDimitry Andric   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2835480093f4SDimitry Andric     return nullptr;
2836480093f4SDimitry Andric 
28375ffd83dbSDimitry Andric   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2838e8d8bef9SDimitry Andric   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
28395ffd83dbSDimitry Andric   if (Next == Parent->end()) {
28405ffd83dbSDimitry Andric     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
28415ffd83dbSDimitry Andric     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
28425ffd83dbSDimitry Andric       return nullptr;
28435ffd83dbSDimitry Andric     UncondBrTarget = &*NextMBB;
28445ffd83dbSDimitry Andric   } else {
2845480093f4SDimitry Andric     if (Next->getOpcode() != AMDGPU::G_BR)
2846480093f4SDimitry Andric       return nullptr;
2847480093f4SDimitry Andric     Br = &*Next;
28485ffd83dbSDimitry Andric     UncondBrTarget = Br->getOperand(0).getMBB();
2849480093f4SDimitry Andric   }
2850480093f4SDimitry Andric 
2851e8d8bef9SDimitry Andric   return UseMI;
28520b57cec5SDimitry Andric }
28530b57cec5SDimitry Andric 
28540b57cec5SDimitry Andric bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2855e8d8bef9SDimitry Andric                                          const ArgDescriptor *Arg,
2856e8d8bef9SDimitry Andric                                          const TargetRegisterClass *ArgRC,
2857e8d8bef9SDimitry Andric                                          LLT ArgTy) const {
2858e8d8bef9SDimitry Andric   MCRegister SrcReg = Arg->getRegister();
2859e8d8bef9SDimitry Andric   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
28605ffd83dbSDimitry Andric   assert(DstReg.isVirtual() && "Virtual register expected");
28610b57cec5SDimitry Andric 
2862e8d8bef9SDimitry Andric   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2863e8d8bef9SDimitry Andric                                              ArgTy);
28640b57cec5SDimitry Andric   if (Arg->isMasked()) {
28650b57cec5SDimitry Andric     // TODO: Should we try to emit this once in the entry block?
28660b57cec5SDimitry Andric     const LLT S32 = LLT::scalar(32);
28670b57cec5SDimitry Andric     const unsigned Mask = Arg->getMask();
28680b57cec5SDimitry Andric     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
28690b57cec5SDimitry Andric 
28708bcb0991SDimitry Andric     Register AndMaskSrc = LiveIn;
28718bcb0991SDimitry Andric 
28728bcb0991SDimitry Andric     if (Shift != 0) {
28730b57cec5SDimitry Andric       auto ShiftAmt = B.buildConstant(S32, Shift);
28748bcb0991SDimitry Andric       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
28758bcb0991SDimitry Andric     }
28768bcb0991SDimitry Andric 
28778bcb0991SDimitry Andric     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
28785ffd83dbSDimitry Andric   } else {
28790b57cec5SDimitry Andric     B.buildCopy(DstReg, LiveIn);
28800b57cec5SDimitry Andric   }
28810b57cec5SDimitry Andric 
28820b57cec5SDimitry Andric   return true;
28830b57cec5SDimitry Andric }
28840b57cec5SDimitry Andric 
2885e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::loadInputValue(
2886e8d8bef9SDimitry Andric     Register DstReg, MachineIRBuilder &B,
2887e8d8bef9SDimitry Andric     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2888e8d8bef9SDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2889e8d8bef9SDimitry Andric   const ArgDescriptor *Arg;
2890e8d8bef9SDimitry Andric   const TargetRegisterClass *ArgRC;
2891e8d8bef9SDimitry Andric   LLT ArgTy;
2892e8d8bef9SDimitry Andric   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2893e8d8bef9SDimitry Andric 
2894*349cc55cSDimitry Andric   if (!Arg) {
2895*349cc55cSDimitry Andric     if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
2896*349cc55cSDimitry Andric       // The intrinsic may appear when we have a 0 sized kernarg segment, in which
2897*349cc55cSDimitry Andric       // case the pointer argument may be missing and we use null.
2898*349cc55cSDimitry Andric       B.buildConstant(DstReg, 0);
2899*349cc55cSDimitry Andric       return true;
2900*349cc55cSDimitry Andric     }
2901*349cc55cSDimitry Andric 
2902*349cc55cSDimitry Andric     // It's undefined behavior if a function marked with the amdgpu-no-*
2903*349cc55cSDimitry Andric     // attributes uses the corresponding intrinsic.
2904*349cc55cSDimitry Andric     B.buildUndef(DstReg);
2905*349cc55cSDimitry Andric     return true;
2906*349cc55cSDimitry Andric   }
2907*349cc55cSDimitry Andric 
2908e8d8bef9SDimitry Andric   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2909e8d8bef9SDimitry Andric     return false; // TODO: Handle these
2910e8d8bef9SDimitry Andric   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2911e8d8bef9SDimitry Andric }
2912e8d8bef9SDimitry Andric 
29130b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
29145ffd83dbSDimitry Andric     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
29150b57cec5SDimitry Andric     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2916e8d8bef9SDimitry Andric   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
29175ffd83dbSDimitry Andric     return false;
29185ffd83dbSDimitry Andric 
29190b57cec5SDimitry Andric   MI.eraseFromParent();
29200b57cec5SDimitry Andric   return true;
29210b57cec5SDimitry Andric }
29220b57cec5SDimitry Andric 
29238bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
29248bcb0991SDimitry Andric                                        MachineRegisterInfo &MRI,
29258bcb0991SDimitry Andric                                        MachineIRBuilder &B) const {
2926480093f4SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
2927480093f4SDimitry Andric   LLT DstTy = MRI.getType(Dst);
2928480093f4SDimitry Andric   LLT S16 = LLT::scalar(16);
2929480093f4SDimitry Andric   LLT S32 = LLT::scalar(32);
2930480093f4SDimitry Andric   LLT S64 = LLT::scalar(64);
29318bcb0991SDimitry Andric 
2932480093f4SDimitry Andric   if (DstTy == S16)
2933480093f4SDimitry Andric     return legalizeFDIV16(MI, MRI, B);
2934480093f4SDimitry Andric   if (DstTy == S32)
2935480093f4SDimitry Andric     return legalizeFDIV32(MI, MRI, B);
2936480093f4SDimitry Andric   if (DstTy == S64)
2937480093f4SDimitry Andric     return legalizeFDIV64(MI, MRI, B);
2938480093f4SDimitry Andric 
29398bcb0991SDimitry Andric   return false;
29408bcb0991SDimitry Andric }
29418bcb0991SDimitry Andric 
2942fe6060f1SDimitry Andric void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
2943fe6060f1SDimitry Andric                                                         Register DstDivReg,
2944fe6060f1SDimitry Andric                                                         Register DstRemReg,
29455ffd83dbSDimitry Andric                                                         Register X,
2946fe6060f1SDimitry Andric                                                         Register Y) const {
29475ffd83dbSDimitry Andric   const LLT S1 = LLT::scalar(1);
29485ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
29495ffd83dbSDimitry Andric 
29505ffd83dbSDimitry Andric   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
29515ffd83dbSDimitry Andric   // algorithm used here.
29525ffd83dbSDimitry Andric 
29535ffd83dbSDimitry Andric   // Initial estimate of inv(y).
29545ffd83dbSDimitry Andric   auto FloatY = B.buildUITOFP(S32, Y);
29555ffd83dbSDimitry Andric   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
29565ffd83dbSDimitry Andric   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
29575ffd83dbSDimitry Andric   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
29585ffd83dbSDimitry Andric   auto Z = B.buildFPTOUI(S32, ScaledY);
29595ffd83dbSDimitry Andric 
29605ffd83dbSDimitry Andric   // One round of UNR.
29615ffd83dbSDimitry Andric   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
29625ffd83dbSDimitry Andric   auto NegYZ = B.buildMul(S32, NegY, Z);
29635ffd83dbSDimitry Andric   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
29645ffd83dbSDimitry Andric 
29655ffd83dbSDimitry Andric   // Quotient/remainder estimate.
29665ffd83dbSDimitry Andric   auto Q = B.buildUMulH(S32, X, Z);
29675ffd83dbSDimitry Andric   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
29685ffd83dbSDimitry Andric 
29695ffd83dbSDimitry Andric   // First quotient/remainder refinement.
29705ffd83dbSDimitry Andric   auto One = B.buildConstant(S32, 1);
29715ffd83dbSDimitry Andric   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2972fe6060f1SDimitry Andric   if (DstDivReg)
29735ffd83dbSDimitry Andric     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
29745ffd83dbSDimitry Andric   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
29755ffd83dbSDimitry Andric 
29765ffd83dbSDimitry Andric   // Second quotient/remainder refinement.
29775ffd83dbSDimitry Andric   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2978fe6060f1SDimitry Andric   if (DstDivReg)
2979fe6060f1SDimitry Andric     B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
29805ffd83dbSDimitry Andric 
2981fe6060f1SDimitry Andric   if (DstRemReg)
2982fe6060f1SDimitry Andric     B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
29835ffd83dbSDimitry Andric }
29845ffd83dbSDimitry Andric 
2985*349cc55cSDimitry Andric // Build integer reciprocal sequence around V_RCP_IFLAG_F32
29865ffd83dbSDimitry Andric //
29875ffd83dbSDimitry Andric // Return lo, hi of result
29885ffd83dbSDimitry Andric //
29895ffd83dbSDimitry Andric // %cvt.lo = G_UITOFP Val.lo
29905ffd83dbSDimitry Andric // %cvt.hi = G_UITOFP Val.hi
29915ffd83dbSDimitry Andric // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
29925ffd83dbSDimitry Andric // %rcp = G_AMDGPU_RCP_IFLAG %mad
29935ffd83dbSDimitry Andric // %mul1 = G_FMUL %rcp, 0x5f7ffffc
29945ffd83dbSDimitry Andric // %mul2 = G_FMUL %mul1, 2**(-32)
29955ffd83dbSDimitry Andric // %trunc = G_INTRINSIC_TRUNC %mul2
29965ffd83dbSDimitry Andric // %mad2 = G_FMAD %trunc, -(2**32), %mul1
29975ffd83dbSDimitry Andric // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
29985ffd83dbSDimitry Andric static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
29995ffd83dbSDimitry Andric                                                        Register Val) {
30005ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
30015ffd83dbSDimitry Andric   auto Unmerge = B.buildUnmerge(S32, Val);
30025ffd83dbSDimitry Andric 
30035ffd83dbSDimitry Andric   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
30045ffd83dbSDimitry Andric   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
30055ffd83dbSDimitry Andric 
30065ffd83dbSDimitry Andric   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
30075ffd83dbSDimitry Andric                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
30085ffd83dbSDimitry Andric 
30095ffd83dbSDimitry Andric   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
30105ffd83dbSDimitry Andric   auto Mul1 =
30115ffd83dbSDimitry Andric       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
30125ffd83dbSDimitry Andric 
30135ffd83dbSDimitry Andric   // 2**(-32)
30145ffd83dbSDimitry Andric   auto Mul2 =
30155ffd83dbSDimitry Andric       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
30165ffd83dbSDimitry Andric   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
30175ffd83dbSDimitry Andric 
30185ffd83dbSDimitry Andric   // -(2**32)
30195ffd83dbSDimitry Andric   auto Mad2 = B.buildFMAD(S32, Trunc,
30205ffd83dbSDimitry Andric                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
30215ffd83dbSDimitry Andric 
30225ffd83dbSDimitry Andric   auto ResultLo = B.buildFPTOUI(S32, Mad2);
30235ffd83dbSDimitry Andric   auto ResultHi = B.buildFPTOUI(S32, Trunc);
30245ffd83dbSDimitry Andric 
30255ffd83dbSDimitry Andric   return {ResultLo.getReg(0), ResultHi.getReg(0)};
30265ffd83dbSDimitry Andric }
30275ffd83dbSDimitry Andric 
3028fe6060f1SDimitry Andric void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
3029fe6060f1SDimitry Andric                                                         Register DstDivReg,
3030fe6060f1SDimitry Andric                                                         Register DstRemReg,
30315ffd83dbSDimitry Andric                                                         Register Numer,
3032fe6060f1SDimitry Andric                                                         Register Denom) const {
30335ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
30345ffd83dbSDimitry Andric   const LLT S64 = LLT::scalar(64);
30355ffd83dbSDimitry Andric   const LLT S1 = LLT::scalar(1);
30365ffd83dbSDimitry Andric   Register RcpLo, RcpHi;
30375ffd83dbSDimitry Andric 
30385ffd83dbSDimitry Andric   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
30395ffd83dbSDimitry Andric 
30405ffd83dbSDimitry Andric   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
30415ffd83dbSDimitry Andric 
30425ffd83dbSDimitry Andric   auto Zero64 = B.buildConstant(S64, 0);
30435ffd83dbSDimitry Andric   auto NegDenom = B.buildSub(S64, Zero64, Denom);
30445ffd83dbSDimitry Andric 
30455ffd83dbSDimitry Andric   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
30465ffd83dbSDimitry Andric   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
30475ffd83dbSDimitry Andric 
30485ffd83dbSDimitry Andric   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
30495ffd83dbSDimitry Andric   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
30505ffd83dbSDimitry Andric   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
30515ffd83dbSDimitry Andric 
30525ffd83dbSDimitry Andric   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
30535ffd83dbSDimitry Andric   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
30545ffd83dbSDimitry Andric   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
30555ffd83dbSDimitry Andric 
30565ffd83dbSDimitry Andric   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
30575ffd83dbSDimitry Andric   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
30585ffd83dbSDimitry Andric   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
30595ffd83dbSDimitry Andric   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
30605ffd83dbSDimitry Andric   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
30615ffd83dbSDimitry Andric 
30625ffd83dbSDimitry Andric   auto Zero32 = B.buildConstant(S32, 0);
30635ffd83dbSDimitry Andric   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
3064*349cc55cSDimitry Andric   auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
30655ffd83dbSDimitry Andric   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
30665ffd83dbSDimitry Andric 
30675ffd83dbSDimitry Andric   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
30685ffd83dbSDimitry Andric   Register NumerLo = UnmergeNumer.getReg(0);
30695ffd83dbSDimitry Andric   Register NumerHi = UnmergeNumer.getReg(1);
30705ffd83dbSDimitry Andric 
30715ffd83dbSDimitry Andric   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
30725ffd83dbSDimitry Andric   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
30735ffd83dbSDimitry Andric   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
30745ffd83dbSDimitry Andric   Register Mul3_Lo = UnmergeMul3.getReg(0);
30755ffd83dbSDimitry Andric   Register Mul3_Hi = UnmergeMul3.getReg(1);
30765ffd83dbSDimitry Andric   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
30775ffd83dbSDimitry Andric   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
30785ffd83dbSDimitry Andric   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
30795ffd83dbSDimitry Andric   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
30805ffd83dbSDimitry Andric 
30815ffd83dbSDimitry Andric   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
30825ffd83dbSDimitry Andric   Register DenomLo = UnmergeDenom.getReg(0);
30835ffd83dbSDimitry Andric   Register DenomHi = UnmergeDenom.getReg(1);
30845ffd83dbSDimitry Andric 
30855ffd83dbSDimitry Andric   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
30865ffd83dbSDimitry Andric   auto C1 = B.buildSExt(S32, CmpHi);
30875ffd83dbSDimitry Andric 
30885ffd83dbSDimitry Andric   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
30895ffd83dbSDimitry Andric   auto C2 = B.buildSExt(S32, CmpLo);
30905ffd83dbSDimitry Andric 
30915ffd83dbSDimitry Andric   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
30925ffd83dbSDimitry Andric   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
30935ffd83dbSDimitry Andric 
30945ffd83dbSDimitry Andric   // TODO: Here and below portions of the code can be enclosed into if/endif.
30955ffd83dbSDimitry Andric   // Currently control flow is unconditional and we have 4 selects after
30965ffd83dbSDimitry Andric   // potential endif to substitute PHIs.
30975ffd83dbSDimitry Andric 
30985ffd83dbSDimitry Andric   // if C3 != 0 ...
30995ffd83dbSDimitry Andric   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
31005ffd83dbSDimitry Andric   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
31015ffd83dbSDimitry Andric   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
31025ffd83dbSDimitry Andric   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
31035ffd83dbSDimitry Andric 
31045ffd83dbSDimitry Andric   auto One64 = B.buildConstant(S64, 1);
31055ffd83dbSDimitry Andric   auto Add3 = B.buildAdd(S64, MulHi3, One64);
31065ffd83dbSDimitry Andric 
31075ffd83dbSDimitry Andric   auto C4 =
31085ffd83dbSDimitry Andric       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
31095ffd83dbSDimitry Andric   auto C5 =
31105ffd83dbSDimitry Andric       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
31115ffd83dbSDimitry Andric   auto C6 = B.buildSelect(
31125ffd83dbSDimitry Andric       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
31135ffd83dbSDimitry Andric 
31145ffd83dbSDimitry Andric   // if (C6 != 0)
31155ffd83dbSDimitry Andric   auto Add4 = B.buildAdd(S64, Add3, One64);
31165ffd83dbSDimitry Andric   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
31175ffd83dbSDimitry Andric 
31185ffd83dbSDimitry Andric   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
31195ffd83dbSDimitry Andric   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
31205ffd83dbSDimitry Andric   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
31215ffd83dbSDimitry Andric 
31225ffd83dbSDimitry Andric   // endif C6
31235ffd83dbSDimitry Andric   // endif C3
31245ffd83dbSDimitry Andric 
3125fe6060f1SDimitry Andric   if (DstDivReg) {
31265ffd83dbSDimitry Andric     auto Sel1 = B.buildSelect(
31275ffd83dbSDimitry Andric         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3128fe6060f1SDimitry Andric     B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3129fe6060f1SDimitry Andric                   Sel1, MulHi3);
3130fe6060f1SDimitry Andric   }
3131fe6060f1SDimitry Andric 
3132fe6060f1SDimitry Andric   if (DstRemReg) {
31335ffd83dbSDimitry Andric     auto Sel2 = B.buildSelect(
31345ffd83dbSDimitry Andric         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3135fe6060f1SDimitry Andric     B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3136fe6060f1SDimitry Andric                   Sel2, Sub1);
31375ffd83dbSDimitry Andric   }
31385ffd83dbSDimitry Andric }
31395ffd83dbSDimitry Andric 
3140fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
31415ffd83dbSDimitry Andric                                                   MachineRegisterInfo &MRI,
31425ffd83dbSDimitry Andric                                                   MachineIRBuilder &B) const {
3143fe6060f1SDimitry Andric   Register DstDivReg, DstRemReg;
3144fe6060f1SDimitry Andric   switch (MI.getOpcode()) {
3145fe6060f1SDimitry Andric   default:
3146fe6060f1SDimitry Andric     llvm_unreachable("Unexpected opcode!");
3147fe6060f1SDimitry Andric   case AMDGPU::G_UDIV: {
3148fe6060f1SDimitry Andric     DstDivReg = MI.getOperand(0).getReg();
3149fe6060f1SDimitry Andric     break;
3150fe6060f1SDimitry Andric   }
3151fe6060f1SDimitry Andric   case AMDGPU::G_UREM: {
3152fe6060f1SDimitry Andric     DstRemReg = MI.getOperand(0).getReg();
3153fe6060f1SDimitry Andric     break;
3154fe6060f1SDimitry Andric   }
3155fe6060f1SDimitry Andric   case AMDGPU::G_UDIVREM: {
3156fe6060f1SDimitry Andric     DstDivReg = MI.getOperand(0).getReg();
3157fe6060f1SDimitry Andric     DstRemReg = MI.getOperand(1).getReg();
3158fe6060f1SDimitry Andric     break;
3159fe6060f1SDimitry Andric   }
3160fe6060f1SDimitry Andric   }
3161fe6060f1SDimitry Andric 
31625ffd83dbSDimitry Andric   const LLT S64 = LLT::scalar(64);
31635ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
3164fe6060f1SDimitry Andric   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3165fe6060f1SDimitry Andric   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3166fe6060f1SDimitry Andric   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3167fe6060f1SDimitry Andric   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
31685ffd83dbSDimitry Andric 
31695ffd83dbSDimitry Andric   if (Ty == S32)
3170fe6060f1SDimitry Andric     legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
31715ffd83dbSDimitry Andric   else if (Ty == S64)
3172fe6060f1SDimitry Andric     legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
31735ffd83dbSDimitry Andric   else
31745ffd83dbSDimitry Andric     return false;
31755ffd83dbSDimitry Andric 
31765ffd83dbSDimitry Andric   MI.eraseFromParent();
31775ffd83dbSDimitry Andric   return true;
31785ffd83dbSDimitry Andric }
31795ffd83dbSDimitry Andric 
3180fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
31815ffd83dbSDimitry Andric                                                 MachineRegisterInfo &MRI,
31825ffd83dbSDimitry Andric                                                 MachineIRBuilder &B) const {
31835ffd83dbSDimitry Andric   const LLT S64 = LLT::scalar(64);
31845ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
31855ffd83dbSDimitry Andric 
3186fe6060f1SDimitry Andric   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
31875ffd83dbSDimitry Andric   if (Ty != S32 && Ty != S64)
31885ffd83dbSDimitry Andric     return false;
31895ffd83dbSDimitry Andric 
3190fe6060f1SDimitry Andric   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3191fe6060f1SDimitry Andric   Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3192fe6060f1SDimitry Andric   Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
31935ffd83dbSDimitry Andric 
31945ffd83dbSDimitry Andric   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
31955ffd83dbSDimitry Andric   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
31965ffd83dbSDimitry Andric   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
31975ffd83dbSDimitry Andric 
31985ffd83dbSDimitry Andric   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
31995ffd83dbSDimitry Andric   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
32005ffd83dbSDimitry Andric 
32015ffd83dbSDimitry Andric   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
32025ffd83dbSDimitry Andric   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
32035ffd83dbSDimitry Andric 
3204fe6060f1SDimitry Andric   Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3205fe6060f1SDimitry Andric   switch (MI.getOpcode()) {
3206fe6060f1SDimitry Andric   default:
3207fe6060f1SDimitry Andric     llvm_unreachable("Unexpected opcode!");
3208fe6060f1SDimitry Andric   case AMDGPU::G_SDIV: {
3209fe6060f1SDimitry Andric     DstDivReg = MI.getOperand(0).getReg();
3210fe6060f1SDimitry Andric     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3211fe6060f1SDimitry Andric     break;
3212fe6060f1SDimitry Andric   }
3213fe6060f1SDimitry Andric   case AMDGPU::G_SREM: {
3214fe6060f1SDimitry Andric     DstRemReg = MI.getOperand(0).getReg();
3215fe6060f1SDimitry Andric     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3216fe6060f1SDimitry Andric     break;
3217fe6060f1SDimitry Andric   }
3218fe6060f1SDimitry Andric   case AMDGPU::G_SDIVREM: {
3219fe6060f1SDimitry Andric     DstDivReg = MI.getOperand(0).getReg();
3220fe6060f1SDimitry Andric     DstRemReg = MI.getOperand(1).getReg();
3221fe6060f1SDimitry Andric     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3222fe6060f1SDimitry Andric     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3223fe6060f1SDimitry Andric     break;
3224fe6060f1SDimitry Andric   }
3225fe6060f1SDimitry Andric   }
3226fe6060f1SDimitry Andric 
32275ffd83dbSDimitry Andric   if (Ty == S32)
3228fe6060f1SDimitry Andric     legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
32295ffd83dbSDimitry Andric   else
3230fe6060f1SDimitry Andric     legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
32315ffd83dbSDimitry Andric 
3232fe6060f1SDimitry Andric   if (DstDivReg) {
3233fe6060f1SDimitry Andric     auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3234fe6060f1SDimitry Andric     auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3235fe6060f1SDimitry Andric     B.buildSub(DstDivReg, SignXor, Sign);
3236fe6060f1SDimitry Andric   }
32375ffd83dbSDimitry Andric 
3238fe6060f1SDimitry Andric   if (DstRemReg) {
3239fe6060f1SDimitry Andric     auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3240fe6060f1SDimitry Andric     auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3241fe6060f1SDimitry Andric     B.buildSub(DstRemReg, SignXor, Sign);
3242fe6060f1SDimitry Andric   }
32435ffd83dbSDimitry Andric 
32445ffd83dbSDimitry Andric   MI.eraseFromParent();
32455ffd83dbSDimitry Andric   return true;
32465ffd83dbSDimitry Andric }
32475ffd83dbSDimitry Andric 
32488bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
32498bcb0991SDimitry Andric                                                  MachineRegisterInfo &MRI,
32508bcb0991SDimitry Andric                                                  MachineIRBuilder &B) const {
32518bcb0991SDimitry Andric   Register Res = MI.getOperand(0).getReg();
32528bcb0991SDimitry Andric   Register LHS = MI.getOperand(1).getReg();
32538bcb0991SDimitry Andric   Register RHS = MI.getOperand(2).getReg();
32548bcb0991SDimitry Andric   uint16_t Flags = MI.getFlags();
32558bcb0991SDimitry Andric   LLT ResTy = MRI.getType(Res);
32568bcb0991SDimitry Andric 
32578bcb0991SDimitry Andric   const MachineFunction &MF = B.getMF();
3258e8d8bef9SDimitry Andric   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3259e8d8bef9SDimitry Andric                             MI.getFlag(MachineInstr::FmAfn);
32608bcb0991SDimitry Andric 
3261e8d8bef9SDimitry Andric   if (!AllowInaccurateRcp)
32628bcb0991SDimitry Andric     return false;
32638bcb0991SDimitry Andric 
32648bcb0991SDimitry Andric   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
32658bcb0991SDimitry Andric     // 1 / x -> RCP(x)
32668bcb0991SDimitry Andric     if (CLHS->isExactlyValue(1.0)) {
32678bcb0991SDimitry Andric       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
32688bcb0991SDimitry Andric         .addUse(RHS)
32698bcb0991SDimitry Andric         .setMIFlags(Flags);
32708bcb0991SDimitry Andric 
32718bcb0991SDimitry Andric       MI.eraseFromParent();
32728bcb0991SDimitry Andric       return true;
32738bcb0991SDimitry Andric     }
32748bcb0991SDimitry Andric 
32758bcb0991SDimitry Andric     // -1 / x -> RCP( FNEG(x) )
32768bcb0991SDimitry Andric     if (CLHS->isExactlyValue(-1.0)) {
32778bcb0991SDimitry Andric       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
32788bcb0991SDimitry Andric       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
32798bcb0991SDimitry Andric         .addUse(FNeg.getReg(0))
32808bcb0991SDimitry Andric         .setMIFlags(Flags);
32818bcb0991SDimitry Andric 
32828bcb0991SDimitry Andric       MI.eraseFromParent();
32838bcb0991SDimitry Andric       return true;
32848bcb0991SDimitry Andric     }
32858bcb0991SDimitry Andric   }
32868bcb0991SDimitry Andric 
32878bcb0991SDimitry Andric   // x / y -> x * (1.0 / y)
32888bcb0991SDimitry Andric   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
32898bcb0991SDimitry Andric     .addUse(RHS)
32908bcb0991SDimitry Andric     .setMIFlags(Flags);
32918bcb0991SDimitry Andric   B.buildFMul(Res, LHS, RCP, Flags);
32928bcb0991SDimitry Andric 
32938bcb0991SDimitry Andric   MI.eraseFromParent();
32948bcb0991SDimitry Andric   return true;
32958bcb0991SDimitry Andric }
32968bcb0991SDimitry Andric 
3297e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3298e8d8bef9SDimitry Andric                                                    MachineRegisterInfo &MRI,
3299e8d8bef9SDimitry Andric                                                    MachineIRBuilder &B) const {
3300e8d8bef9SDimitry Andric   Register Res = MI.getOperand(0).getReg();
3301e8d8bef9SDimitry Andric   Register X = MI.getOperand(1).getReg();
3302e8d8bef9SDimitry Andric   Register Y = MI.getOperand(2).getReg();
3303e8d8bef9SDimitry Andric   uint16_t Flags = MI.getFlags();
3304e8d8bef9SDimitry Andric   LLT ResTy = MRI.getType(Res);
3305e8d8bef9SDimitry Andric 
3306e8d8bef9SDimitry Andric   const MachineFunction &MF = B.getMF();
3307e8d8bef9SDimitry Andric   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3308e8d8bef9SDimitry Andric                             MI.getFlag(MachineInstr::FmAfn);
3309e8d8bef9SDimitry Andric 
3310e8d8bef9SDimitry Andric   if (!AllowInaccurateRcp)
33118bcb0991SDimitry Andric     return false;
3312e8d8bef9SDimitry Andric 
3313e8d8bef9SDimitry Andric   auto NegY = B.buildFNeg(ResTy, Y);
3314e8d8bef9SDimitry Andric   auto One = B.buildFConstant(ResTy, 1.0);
3315e8d8bef9SDimitry Andric 
3316e8d8bef9SDimitry Andric   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3317e8d8bef9SDimitry Andric     .addUse(Y)
3318e8d8bef9SDimitry Andric     .setMIFlags(Flags);
3319e8d8bef9SDimitry Andric 
3320e8d8bef9SDimitry Andric   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3321e8d8bef9SDimitry Andric   R = B.buildFMA(ResTy, Tmp0, R, R);
3322e8d8bef9SDimitry Andric 
3323e8d8bef9SDimitry Andric   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3324e8d8bef9SDimitry Andric   R = B.buildFMA(ResTy, Tmp1, R, R);
3325e8d8bef9SDimitry Andric 
3326e8d8bef9SDimitry Andric   auto Ret = B.buildFMul(ResTy, X, R);
3327e8d8bef9SDimitry Andric   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3328e8d8bef9SDimitry Andric 
3329e8d8bef9SDimitry Andric   B.buildFMA(Res, Tmp2, R, Ret);
3330e8d8bef9SDimitry Andric   MI.eraseFromParent();
3331e8d8bef9SDimitry Andric   return true;
33328bcb0991SDimitry Andric }
33338bcb0991SDimitry Andric 
3334480093f4SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3335480093f4SDimitry Andric                                          MachineRegisterInfo &MRI,
3336480093f4SDimitry Andric                                          MachineIRBuilder &B) const {
3337e8d8bef9SDimitry Andric   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3338e8d8bef9SDimitry Andric     return true;
3339e8d8bef9SDimitry Andric 
3340480093f4SDimitry Andric   Register Res = MI.getOperand(0).getReg();
3341480093f4SDimitry Andric   Register LHS = MI.getOperand(1).getReg();
3342480093f4SDimitry Andric   Register RHS = MI.getOperand(2).getReg();
3343480093f4SDimitry Andric 
3344480093f4SDimitry Andric   uint16_t Flags = MI.getFlags();
3345480093f4SDimitry Andric 
3346480093f4SDimitry Andric   LLT S16 = LLT::scalar(16);
3347480093f4SDimitry Andric   LLT S32 = LLT::scalar(32);
3348480093f4SDimitry Andric 
3349480093f4SDimitry Andric   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3350480093f4SDimitry Andric   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3351480093f4SDimitry Andric 
3352480093f4SDimitry Andric   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3353480093f4SDimitry Andric     .addUse(RHSExt.getReg(0))
3354480093f4SDimitry Andric     .setMIFlags(Flags);
3355480093f4SDimitry Andric 
3356480093f4SDimitry Andric   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3357480093f4SDimitry Andric   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3358480093f4SDimitry Andric 
3359480093f4SDimitry Andric   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3360480093f4SDimitry Andric     .addUse(RDst.getReg(0))
3361480093f4SDimitry Andric     .addUse(RHS)
3362480093f4SDimitry Andric     .addUse(LHS)
3363480093f4SDimitry Andric     .setMIFlags(Flags);
3364480093f4SDimitry Andric 
3365480093f4SDimitry Andric   MI.eraseFromParent();
3366480093f4SDimitry Andric   return true;
3367480093f4SDimitry Andric }
3368480093f4SDimitry Andric 
3369480093f4SDimitry Andric // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3370480093f4SDimitry Andric // to enable denorm mode. When 'Enable' is false, disable denorm mode.
3371480093f4SDimitry Andric static void toggleSPDenormMode(bool Enable,
3372480093f4SDimitry Andric                                MachineIRBuilder &B,
3373480093f4SDimitry Andric                                const GCNSubtarget &ST,
3374480093f4SDimitry Andric                                AMDGPU::SIModeRegisterDefaults Mode) {
3375480093f4SDimitry Andric   // Set SP denorm mode to this value.
3376480093f4SDimitry Andric   unsigned SPDenormMode =
33775ffd83dbSDimitry Andric     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3378480093f4SDimitry Andric 
3379480093f4SDimitry Andric   if (ST.hasDenormModeInst()) {
3380480093f4SDimitry Andric     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
33815ffd83dbSDimitry Andric     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3382480093f4SDimitry Andric 
33835ffd83dbSDimitry Andric     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3384480093f4SDimitry Andric     B.buildInstr(AMDGPU::S_DENORM_MODE)
3385480093f4SDimitry Andric       .addImm(NewDenormModeValue);
3386480093f4SDimitry Andric 
3387480093f4SDimitry Andric   } else {
3388480093f4SDimitry Andric     // Select FP32 bit field in mode register.
3389480093f4SDimitry Andric     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3390480093f4SDimitry Andric                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3391480093f4SDimitry Andric                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3392480093f4SDimitry Andric 
3393480093f4SDimitry Andric     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3394480093f4SDimitry Andric       .addImm(SPDenormMode)
3395480093f4SDimitry Andric       .addImm(SPDenormModeBitField);
3396480093f4SDimitry Andric   }
3397480093f4SDimitry Andric }
3398480093f4SDimitry Andric 
3399480093f4SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3400480093f4SDimitry Andric                                          MachineRegisterInfo &MRI,
3401480093f4SDimitry Andric                                          MachineIRBuilder &B) const {
3402e8d8bef9SDimitry Andric   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3403e8d8bef9SDimitry Andric     return true;
3404e8d8bef9SDimitry Andric 
3405480093f4SDimitry Andric   Register Res = MI.getOperand(0).getReg();
3406480093f4SDimitry Andric   Register LHS = MI.getOperand(1).getReg();
3407480093f4SDimitry Andric   Register RHS = MI.getOperand(2).getReg();
3408480093f4SDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3409480093f4SDimitry Andric   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3410480093f4SDimitry Andric 
3411480093f4SDimitry Andric   uint16_t Flags = MI.getFlags();
3412480093f4SDimitry Andric 
3413480093f4SDimitry Andric   LLT S32 = LLT::scalar(32);
3414480093f4SDimitry Andric   LLT S1 = LLT::scalar(1);
3415480093f4SDimitry Andric 
3416480093f4SDimitry Andric   auto One = B.buildFConstant(S32, 1.0f);
3417480093f4SDimitry Andric 
3418480093f4SDimitry Andric   auto DenominatorScaled =
3419480093f4SDimitry Andric     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3420480093f4SDimitry Andric       .addUse(LHS)
34215ffd83dbSDimitry Andric       .addUse(RHS)
34225ffd83dbSDimitry Andric       .addImm(0)
3423480093f4SDimitry Andric       .setMIFlags(Flags);
3424480093f4SDimitry Andric   auto NumeratorScaled =
3425480093f4SDimitry Andric     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3426480093f4SDimitry Andric       .addUse(LHS)
3427480093f4SDimitry Andric       .addUse(RHS)
34285ffd83dbSDimitry Andric       .addImm(1)
3429480093f4SDimitry Andric       .setMIFlags(Flags);
3430480093f4SDimitry Andric 
3431480093f4SDimitry Andric   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3432480093f4SDimitry Andric     .addUse(DenominatorScaled.getReg(0))
3433480093f4SDimitry Andric     .setMIFlags(Flags);
3434480093f4SDimitry Andric   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3435480093f4SDimitry Andric 
3436480093f4SDimitry Andric   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3437480093f4SDimitry Andric   // aren't modeled as reading it.
34385ffd83dbSDimitry Andric   if (!Mode.allFP32Denormals())
3439480093f4SDimitry Andric     toggleSPDenormMode(true, B, ST, Mode);
3440480093f4SDimitry Andric 
3441480093f4SDimitry Andric   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3442480093f4SDimitry Andric   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3443480093f4SDimitry Andric   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3444480093f4SDimitry Andric   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3445480093f4SDimitry Andric   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3446480093f4SDimitry Andric   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3447480093f4SDimitry Andric 
34485ffd83dbSDimitry Andric   if (!Mode.allFP32Denormals())
3449480093f4SDimitry Andric     toggleSPDenormMode(false, B, ST, Mode);
3450480093f4SDimitry Andric 
3451480093f4SDimitry Andric   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3452480093f4SDimitry Andric     .addUse(Fma4.getReg(0))
3453480093f4SDimitry Andric     .addUse(Fma1.getReg(0))
3454480093f4SDimitry Andric     .addUse(Fma3.getReg(0))
3455480093f4SDimitry Andric     .addUse(NumeratorScaled.getReg(1))
3456480093f4SDimitry Andric     .setMIFlags(Flags);
3457480093f4SDimitry Andric 
3458480093f4SDimitry Andric   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3459480093f4SDimitry Andric     .addUse(Fmas.getReg(0))
3460480093f4SDimitry Andric     .addUse(RHS)
3461480093f4SDimitry Andric     .addUse(LHS)
3462480093f4SDimitry Andric     .setMIFlags(Flags);
3463480093f4SDimitry Andric 
3464480093f4SDimitry Andric   MI.eraseFromParent();
3465480093f4SDimitry Andric   return true;
3466480093f4SDimitry Andric }
3467480093f4SDimitry Andric 
3468480093f4SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3469480093f4SDimitry Andric                                          MachineRegisterInfo &MRI,
3470480093f4SDimitry Andric                                          MachineIRBuilder &B) const {
3471e8d8bef9SDimitry Andric   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3472e8d8bef9SDimitry Andric     return true;
3473e8d8bef9SDimitry Andric 
3474480093f4SDimitry Andric   Register Res = MI.getOperand(0).getReg();
3475480093f4SDimitry Andric   Register LHS = MI.getOperand(1).getReg();
3476480093f4SDimitry Andric   Register RHS = MI.getOperand(2).getReg();
3477480093f4SDimitry Andric 
3478480093f4SDimitry Andric   uint16_t Flags = MI.getFlags();
3479480093f4SDimitry Andric 
3480480093f4SDimitry Andric   LLT S64 = LLT::scalar(64);
3481480093f4SDimitry Andric   LLT S1 = LLT::scalar(1);
3482480093f4SDimitry Andric 
3483480093f4SDimitry Andric   auto One = B.buildFConstant(S64, 1.0);
3484480093f4SDimitry Andric 
3485480093f4SDimitry Andric   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3486480093f4SDimitry Andric     .addUse(LHS)
3487480093f4SDimitry Andric     .addUse(RHS)
34885ffd83dbSDimitry Andric     .addImm(0)
3489480093f4SDimitry Andric     .setMIFlags(Flags);
3490480093f4SDimitry Andric 
3491480093f4SDimitry Andric   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3492480093f4SDimitry Andric 
3493480093f4SDimitry Andric   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3494480093f4SDimitry Andric     .addUse(DivScale0.getReg(0))
3495480093f4SDimitry Andric     .setMIFlags(Flags);
3496480093f4SDimitry Andric 
3497480093f4SDimitry Andric   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3498480093f4SDimitry Andric   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3499480093f4SDimitry Andric   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3500480093f4SDimitry Andric 
3501480093f4SDimitry Andric   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3502480093f4SDimitry Andric     .addUse(LHS)
3503480093f4SDimitry Andric     .addUse(RHS)
35045ffd83dbSDimitry Andric     .addImm(1)
3505480093f4SDimitry Andric     .setMIFlags(Flags);
3506480093f4SDimitry Andric 
3507480093f4SDimitry Andric   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
35085ffd83dbSDimitry Andric   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3509480093f4SDimitry Andric   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3510480093f4SDimitry Andric 
3511480093f4SDimitry Andric   Register Scale;
3512480093f4SDimitry Andric   if (!ST.hasUsableDivScaleConditionOutput()) {
3513480093f4SDimitry Andric     // Workaround a hardware bug on SI where the condition output from div_scale
3514480093f4SDimitry Andric     // is not usable.
3515480093f4SDimitry Andric 
3516480093f4SDimitry Andric     LLT S32 = LLT::scalar(32);
3517480093f4SDimitry Andric 
3518480093f4SDimitry Andric     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3519480093f4SDimitry Andric     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3520480093f4SDimitry Andric     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3521480093f4SDimitry Andric     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3522480093f4SDimitry Andric 
3523480093f4SDimitry Andric     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3524480093f4SDimitry Andric                               Scale1Unmerge.getReg(1));
3525480093f4SDimitry Andric     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3526480093f4SDimitry Andric                               Scale0Unmerge.getReg(1));
35275ffd83dbSDimitry Andric     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3528480093f4SDimitry Andric   } else {
3529480093f4SDimitry Andric     Scale = DivScale1.getReg(1);
3530480093f4SDimitry Andric   }
3531480093f4SDimitry Andric 
3532480093f4SDimitry Andric   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3533480093f4SDimitry Andric     .addUse(Fma4.getReg(0))
3534480093f4SDimitry Andric     .addUse(Fma3.getReg(0))
3535480093f4SDimitry Andric     .addUse(Mul.getReg(0))
3536480093f4SDimitry Andric     .addUse(Scale)
3537480093f4SDimitry Andric     .setMIFlags(Flags);
3538480093f4SDimitry Andric 
3539480093f4SDimitry Andric   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3540480093f4SDimitry Andric     .addUse(Fmas.getReg(0))
3541480093f4SDimitry Andric     .addUse(RHS)
3542480093f4SDimitry Andric     .addUse(LHS)
3543480093f4SDimitry Andric     .setMIFlags(Flags);
3544480093f4SDimitry Andric 
3545480093f4SDimitry Andric   MI.eraseFromParent();
3546480093f4SDimitry Andric   return true;
3547480093f4SDimitry Andric }
3548480093f4SDimitry Andric 
35498bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
35508bcb0991SDimitry Andric                                                  MachineRegisterInfo &MRI,
35518bcb0991SDimitry Andric                                                  MachineIRBuilder &B) const {
35528bcb0991SDimitry Andric   Register Res = MI.getOperand(0).getReg();
35538bcb0991SDimitry Andric   Register LHS = MI.getOperand(2).getReg();
35548bcb0991SDimitry Andric   Register RHS = MI.getOperand(3).getReg();
35558bcb0991SDimitry Andric   uint16_t Flags = MI.getFlags();
35568bcb0991SDimitry Andric 
35578bcb0991SDimitry Andric   LLT S32 = LLT::scalar(32);
35588bcb0991SDimitry Andric   LLT S1 = LLT::scalar(1);
35598bcb0991SDimitry Andric 
35608bcb0991SDimitry Andric   auto Abs = B.buildFAbs(S32, RHS, Flags);
35618bcb0991SDimitry Andric   const APFloat C0Val(1.0f);
35628bcb0991SDimitry Andric 
35638bcb0991SDimitry Andric   auto C0 = B.buildConstant(S32, 0x6f800000);
35648bcb0991SDimitry Andric   auto C1 = B.buildConstant(S32, 0x2f800000);
35658bcb0991SDimitry Andric   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
35668bcb0991SDimitry Andric 
35678bcb0991SDimitry Andric   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
35688bcb0991SDimitry Andric   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
35698bcb0991SDimitry Andric 
35708bcb0991SDimitry Andric   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
35718bcb0991SDimitry Andric 
35728bcb0991SDimitry Andric   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
35738bcb0991SDimitry Andric     .addUse(Mul0.getReg(0))
35748bcb0991SDimitry Andric     .setMIFlags(Flags);
35758bcb0991SDimitry Andric 
35768bcb0991SDimitry Andric   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
35778bcb0991SDimitry Andric 
35788bcb0991SDimitry Andric   B.buildFMul(Res, Sel, Mul1, Flags);
35798bcb0991SDimitry Andric 
35808bcb0991SDimitry Andric   MI.eraseFromParent();
35818bcb0991SDimitry Andric   return true;
35828bcb0991SDimitry Andric }
35838bcb0991SDimitry Andric 
3584e8d8bef9SDimitry Andric // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3585e8d8bef9SDimitry Andric // FIXME: Why do we handle this one but not other removed instructions?
3586e8d8bef9SDimitry Andric //
3587e8d8bef9SDimitry Andric // Reciprocal square root.  The clamp prevents infinite results, clamping
3588e8d8bef9SDimitry Andric // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3589e8d8bef9SDimitry Andric // +-max_float.
3590e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3591e8d8bef9SDimitry Andric                                                     MachineRegisterInfo &MRI,
3592e8d8bef9SDimitry Andric                                                     MachineIRBuilder &B) const {
3593e8d8bef9SDimitry Andric   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3594e8d8bef9SDimitry Andric     return true;
3595e8d8bef9SDimitry Andric 
3596e8d8bef9SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
3597e8d8bef9SDimitry Andric   Register Src = MI.getOperand(2).getReg();
3598e8d8bef9SDimitry Andric   auto Flags = MI.getFlags();
3599e8d8bef9SDimitry Andric 
3600e8d8bef9SDimitry Andric   LLT Ty = MRI.getType(Dst);
3601e8d8bef9SDimitry Andric 
3602e8d8bef9SDimitry Andric   const fltSemantics *FltSemantics;
3603e8d8bef9SDimitry Andric   if (Ty == LLT::scalar(32))
3604e8d8bef9SDimitry Andric     FltSemantics = &APFloat::IEEEsingle();
3605e8d8bef9SDimitry Andric   else if (Ty == LLT::scalar(64))
3606e8d8bef9SDimitry Andric     FltSemantics = &APFloat::IEEEdouble();
3607e8d8bef9SDimitry Andric   else
3608e8d8bef9SDimitry Andric     return false;
3609e8d8bef9SDimitry Andric 
3610e8d8bef9SDimitry Andric   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3611e8d8bef9SDimitry Andric     .addUse(Src)
3612e8d8bef9SDimitry Andric     .setMIFlags(Flags);
3613e8d8bef9SDimitry Andric 
3614e8d8bef9SDimitry Andric   // We don't need to concern ourselves with the snan handling difference, since
3615e8d8bef9SDimitry Andric   // the rsq quieted (or not) so use the one which will directly select.
3616e8d8bef9SDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3617e8d8bef9SDimitry Andric   const bool UseIEEE = MFI->getMode().IEEE;
3618e8d8bef9SDimitry Andric 
3619e8d8bef9SDimitry Andric   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3620e8d8bef9SDimitry Andric   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3621e8d8bef9SDimitry Andric                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3622e8d8bef9SDimitry Andric 
3623e8d8bef9SDimitry Andric   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3624e8d8bef9SDimitry Andric 
3625e8d8bef9SDimitry Andric   if (UseIEEE)
3626e8d8bef9SDimitry Andric     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3627e8d8bef9SDimitry Andric   else
3628e8d8bef9SDimitry Andric     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3629e8d8bef9SDimitry Andric   MI.eraseFromParent();
3630e8d8bef9SDimitry Andric   return true;
3631e8d8bef9SDimitry Andric }
3632e8d8bef9SDimitry Andric 
3633e8d8bef9SDimitry Andric static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3634e8d8bef9SDimitry Andric   switch (IID) {
3635e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_ds_fadd:
3636e8d8bef9SDimitry Andric     return AMDGPU::G_ATOMICRMW_FADD;
3637e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_ds_fmin:
3638e8d8bef9SDimitry Andric     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3639e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_ds_fmax:
3640e8d8bef9SDimitry Andric     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3641e8d8bef9SDimitry Andric   default:
3642e8d8bef9SDimitry Andric     llvm_unreachable("not a DS FP intrinsic");
3643e8d8bef9SDimitry Andric   }
3644e8d8bef9SDimitry Andric }
3645e8d8bef9SDimitry Andric 
3646e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3647e8d8bef9SDimitry Andric                                                       MachineInstr &MI,
3648e8d8bef9SDimitry Andric                                                       Intrinsic::ID IID) const {
3649e8d8bef9SDimitry Andric   GISelChangeObserver &Observer = Helper.Observer;
3650e8d8bef9SDimitry Andric   Observer.changingInstr(MI);
3651e8d8bef9SDimitry Andric 
3652e8d8bef9SDimitry Andric   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3653e8d8bef9SDimitry Andric 
3654e8d8bef9SDimitry Andric   // The remaining operands were used to set fields in the MemOperand on
3655e8d8bef9SDimitry Andric   // construction.
3656e8d8bef9SDimitry Andric   for (int I = 6; I > 3; --I)
3657e8d8bef9SDimitry Andric     MI.RemoveOperand(I);
3658e8d8bef9SDimitry Andric 
3659e8d8bef9SDimitry Andric   MI.RemoveOperand(1); // Remove the intrinsic ID.
3660e8d8bef9SDimitry Andric   Observer.changedInstr(MI);
3661e8d8bef9SDimitry Andric   return true;
3662e8d8bef9SDimitry Andric }
3663e8d8bef9SDimitry Andric 
3664e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3665e8d8bef9SDimitry Andric                                             MachineRegisterInfo &MRI,
3666e8d8bef9SDimitry Andric                                             MachineIRBuilder &B) const {
3667e8d8bef9SDimitry Andric   uint64_t Offset =
3668e8d8bef9SDimitry Andric     ST.getTargetLowering()->getImplicitParameterOffset(
3669e8d8bef9SDimitry Andric       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3670e8d8bef9SDimitry Andric   LLT DstTy = MRI.getType(DstReg);
3671e8d8bef9SDimitry Andric   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3672e8d8bef9SDimitry Andric 
3673e8d8bef9SDimitry Andric   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3674e8d8bef9SDimitry Andric   if (!loadInputValue(KernargPtrReg, B,
3675e8d8bef9SDimitry Andric                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3676e8d8bef9SDimitry Andric     return false;
3677e8d8bef9SDimitry Andric 
3678e8d8bef9SDimitry Andric   // FIXME: This should be nuw
3679e8d8bef9SDimitry Andric   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3680e8d8bef9SDimitry Andric   return true;
3681e8d8bef9SDimitry Andric }
3682e8d8bef9SDimitry Andric 
36830b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
36840b57cec5SDimitry Andric                                                  MachineRegisterInfo &MRI,
36850b57cec5SDimitry Andric                                                  MachineIRBuilder &B) const {
36860b57cec5SDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
36870b57cec5SDimitry Andric   if (!MFI->isEntryFunction()) {
36880b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
36890b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
36900b57cec5SDimitry Andric   }
36910b57cec5SDimitry Andric 
36920b57cec5SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
3693e8d8bef9SDimitry Andric   if (!getImplicitArgPtr(DstReg, MRI, B))
36940b57cec5SDimitry Andric     return false;
36950b57cec5SDimitry Andric 
36960b57cec5SDimitry Andric   MI.eraseFromParent();
36970b57cec5SDimitry Andric   return true;
36980b57cec5SDimitry Andric }
36990b57cec5SDimitry Andric 
37008bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
37018bcb0991SDimitry Andric                                               MachineRegisterInfo &MRI,
37028bcb0991SDimitry Andric                                               MachineIRBuilder &B,
37038bcb0991SDimitry Andric                                               unsigned AddrSpace) const {
37048bcb0991SDimitry Andric   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3705e8d8bef9SDimitry Andric   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3706e8d8bef9SDimitry Andric   Register Hi32 = Unmerge.getReg(1);
3707e8d8bef9SDimitry Andric 
37088bcb0991SDimitry Andric   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
37098bcb0991SDimitry Andric   MI.eraseFromParent();
37108bcb0991SDimitry Andric   return true;
37118bcb0991SDimitry Andric }
37128bcb0991SDimitry Andric 
37135ffd83dbSDimitry Andric // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
37145ffd83dbSDimitry Andric // offset (the offset that is included in bounds checking and swizzling, to be
37155ffd83dbSDimitry Andric // split between the instruction's voffset and immoffset fields) and soffset
37165ffd83dbSDimitry Andric // (the offset that is excluded from bounds checking and swizzling, to go in
37175ffd83dbSDimitry Andric // the instruction's soffset field).  This function takes the first kind of
37185ffd83dbSDimitry Andric // offset and figures out how to split it between voffset and immoffset.
3719fe6060f1SDimitry Andric std::pair<Register, unsigned>
37205ffd83dbSDimitry Andric AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
37215ffd83dbSDimitry Andric                                         Register OrigOffset) const {
37225ffd83dbSDimitry Andric   const unsigned MaxImm = 4095;
37235ffd83dbSDimitry Andric   Register BaseReg;
3724fe6060f1SDimitry Andric   unsigned ImmOffset;
37255ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
3726fe6060f1SDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
37275ffd83dbSDimitry Andric 
3728fe6060f1SDimitry Andric   std::tie(BaseReg, ImmOffset) =
3729fe6060f1SDimitry Andric       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
37305ffd83dbSDimitry Andric 
3731fe6060f1SDimitry Andric   // If BaseReg is a pointer, convert it to int.
3732fe6060f1SDimitry Andric   if (MRI.getType(BaseReg).isPointer())
3733fe6060f1SDimitry Andric     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
37345ffd83dbSDimitry Andric 
37355ffd83dbSDimitry Andric   // If the immediate value is too big for the immoffset field, put the value
37365ffd83dbSDimitry Andric   // and -4096 into the immoffset field so that the value that is copied/added
37375ffd83dbSDimitry Andric   // for the voffset field is a multiple of 4096, and it stands more chance
37385ffd83dbSDimitry Andric   // of being CSEd with the copy/add for another similar load/store.
37395ffd83dbSDimitry Andric   // However, do not do that rounding down to a multiple of 4096 if that is a
37405ffd83dbSDimitry Andric   // negative number, as it appears to be illegal to have a negative offset
37415ffd83dbSDimitry Andric   // in the vgpr, even if adding the immediate offset makes it positive.
37425ffd83dbSDimitry Andric   unsigned Overflow = ImmOffset & ~MaxImm;
37435ffd83dbSDimitry Andric   ImmOffset -= Overflow;
37445ffd83dbSDimitry Andric   if ((int32_t)Overflow < 0) {
37455ffd83dbSDimitry Andric     Overflow += ImmOffset;
37465ffd83dbSDimitry Andric     ImmOffset = 0;
37475ffd83dbSDimitry Andric   }
37485ffd83dbSDimitry Andric 
37495ffd83dbSDimitry Andric   if (Overflow != 0) {
37505ffd83dbSDimitry Andric     if (!BaseReg) {
37515ffd83dbSDimitry Andric       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
37525ffd83dbSDimitry Andric     } else {
37535ffd83dbSDimitry Andric       auto OverflowVal = B.buildConstant(S32, Overflow);
37545ffd83dbSDimitry Andric       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
37555ffd83dbSDimitry Andric     }
37565ffd83dbSDimitry Andric   }
37575ffd83dbSDimitry Andric 
37585ffd83dbSDimitry Andric   if (!BaseReg)
37595ffd83dbSDimitry Andric     BaseReg = B.buildConstant(S32, 0).getReg(0);
37605ffd83dbSDimitry Andric 
3761fe6060f1SDimitry Andric   return std::make_pair(BaseReg, ImmOffset);
3762fe6060f1SDimitry Andric }
3763fe6060f1SDimitry Andric 
3764fe6060f1SDimitry Andric /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
3765fe6060f1SDimitry Andric void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
3766fe6060f1SDimitry Andric                                           Register VOffset, Register SOffset,
3767fe6060f1SDimitry Andric                                           unsigned ImmOffset, Register VIndex,
3768fe6060f1SDimitry Andric                                           MachineRegisterInfo &MRI) const {
3769fe6060f1SDimitry Andric   Optional<ValueAndVReg> MaybeVOffsetVal =
3770*349cc55cSDimitry Andric       getIConstantVRegValWithLookThrough(VOffset, MRI);
3771fe6060f1SDimitry Andric   Optional<ValueAndVReg> MaybeSOffsetVal =
3772*349cc55cSDimitry Andric       getIConstantVRegValWithLookThrough(SOffset, MRI);
3773fe6060f1SDimitry Andric   Optional<ValueAndVReg> MaybeVIndexVal =
3774*349cc55cSDimitry Andric       getIConstantVRegValWithLookThrough(VIndex, MRI);
3775fe6060f1SDimitry Andric   // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
3776fe6060f1SDimitry Andric   // update the MMO with that offset. The stride is unknown so we can only do
3777fe6060f1SDimitry Andric   // this if VIndex is constant 0.
3778fe6060f1SDimitry Andric   if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
3779fe6060f1SDimitry Andric       MaybeVIndexVal->Value == 0) {
3780fe6060f1SDimitry Andric     uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
3781fe6060f1SDimitry Andric                            MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
3782fe6060f1SDimitry Andric     MMO->setOffset(TotalOffset);
3783fe6060f1SDimitry Andric   } else {
3784fe6060f1SDimitry Andric     // We don't have a constant combined offset to use in the MMO. Give up.
3785fe6060f1SDimitry Andric     MMO->setValue((Value *)nullptr);
3786fe6060f1SDimitry Andric   }
37875ffd83dbSDimitry Andric }
37885ffd83dbSDimitry Andric 
37898bcb0991SDimitry Andric /// Handle register layout difference for f16 images for some subtargets.
37908bcb0991SDimitry Andric Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
37918bcb0991SDimitry Andric                                              MachineRegisterInfo &MRI,
3792e8d8bef9SDimitry Andric                                              Register Reg,
3793e8d8bef9SDimitry Andric                                              bool ImageStore) const {
37948bcb0991SDimitry Andric   const LLT S16 = LLT::scalar(16);
37958bcb0991SDimitry Andric   const LLT S32 = LLT::scalar(32);
37968bcb0991SDimitry Andric   LLT StoreVT = MRI.getType(Reg);
37978bcb0991SDimitry Andric   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
37988bcb0991SDimitry Andric 
3799e8d8bef9SDimitry Andric   if (ST.hasUnpackedD16VMem()) {
38008bcb0991SDimitry Andric     auto Unmerge = B.buildUnmerge(S16, Reg);
38018bcb0991SDimitry Andric 
38028bcb0991SDimitry Andric     SmallVector<Register, 4> WideRegs;
38038bcb0991SDimitry Andric     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
38048bcb0991SDimitry Andric       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
38058bcb0991SDimitry Andric 
38068bcb0991SDimitry Andric     int NumElts = StoreVT.getNumElements();
38078bcb0991SDimitry Andric 
3808fe6060f1SDimitry Andric     return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
3809fe6060f1SDimitry Andric         .getReg(0);
38108bcb0991SDimitry Andric   }
38118bcb0991SDimitry Andric 
3812e8d8bef9SDimitry Andric   if (ImageStore && ST.hasImageStoreD16Bug()) {
3813e8d8bef9SDimitry Andric     if (StoreVT.getNumElements() == 2) {
3814e8d8bef9SDimitry Andric       SmallVector<Register, 4> PackedRegs;
3815e8d8bef9SDimitry Andric       Reg = B.buildBitcast(S32, Reg).getReg(0);
3816e8d8bef9SDimitry Andric       PackedRegs.push_back(Reg);
3817e8d8bef9SDimitry Andric       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3818fe6060f1SDimitry Andric       return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
3819fe6060f1SDimitry Andric           .getReg(0);
3820e8d8bef9SDimitry Andric     }
3821e8d8bef9SDimitry Andric 
3822e8d8bef9SDimitry Andric     if (StoreVT.getNumElements() == 3) {
3823e8d8bef9SDimitry Andric       SmallVector<Register, 4> PackedRegs;
3824e8d8bef9SDimitry Andric       auto Unmerge = B.buildUnmerge(S16, Reg);
3825e8d8bef9SDimitry Andric       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3826e8d8bef9SDimitry Andric         PackedRegs.push_back(Unmerge.getReg(I));
3827e8d8bef9SDimitry Andric       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3828fe6060f1SDimitry Andric       Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
3829fe6060f1SDimitry Andric       return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
3830e8d8bef9SDimitry Andric     }
3831e8d8bef9SDimitry Andric 
3832e8d8bef9SDimitry Andric     if (StoreVT.getNumElements() == 4) {
3833e8d8bef9SDimitry Andric       SmallVector<Register, 4> PackedRegs;
3834fe6060f1SDimitry Andric       Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
3835e8d8bef9SDimitry Andric       auto Unmerge = B.buildUnmerge(S32, Reg);
3836e8d8bef9SDimitry Andric       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3837e8d8bef9SDimitry Andric         PackedRegs.push_back(Unmerge.getReg(I));
3838e8d8bef9SDimitry Andric       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3839fe6060f1SDimitry Andric       return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
3840fe6060f1SDimitry Andric           .getReg(0);
3841e8d8bef9SDimitry Andric     }
3842e8d8bef9SDimitry Andric 
3843e8d8bef9SDimitry Andric     llvm_unreachable("invalid data type");
3844e8d8bef9SDimitry Andric   }
3845e8d8bef9SDimitry Andric 
3846e8d8bef9SDimitry Andric   return Reg;
3847e8d8bef9SDimitry Andric }
3848e8d8bef9SDimitry Andric 
38495ffd83dbSDimitry Andric Register AMDGPULegalizerInfo::fixStoreSourceType(
38505ffd83dbSDimitry Andric   MachineIRBuilder &B, Register VData, bool IsFormat) const {
38515ffd83dbSDimitry Andric   MachineRegisterInfo *MRI = B.getMRI();
38525ffd83dbSDimitry Andric   LLT Ty = MRI->getType(VData);
38538bcb0991SDimitry Andric 
38548bcb0991SDimitry Andric   const LLT S16 = LLT::scalar(16);
38558bcb0991SDimitry Andric 
38568bcb0991SDimitry Andric   // Fixup illegal register types for i8 stores.
38578bcb0991SDimitry Andric   if (Ty == LLT::scalar(8) || Ty == S16) {
38588bcb0991SDimitry Andric     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
38595ffd83dbSDimitry Andric     return AnyExt;
38608bcb0991SDimitry Andric   }
38618bcb0991SDimitry Andric 
38628bcb0991SDimitry Andric   if (Ty.isVector()) {
38638bcb0991SDimitry Andric     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
38648bcb0991SDimitry Andric       if (IsFormat)
38655ffd83dbSDimitry Andric         return handleD16VData(B, *MRI, VData);
38665ffd83dbSDimitry Andric     }
38675ffd83dbSDimitry Andric   }
38685ffd83dbSDimitry Andric 
38695ffd83dbSDimitry Andric   return VData;
38705ffd83dbSDimitry Andric }
38715ffd83dbSDimitry Andric 
38725ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
38735ffd83dbSDimitry Andric                                               MachineRegisterInfo &MRI,
38745ffd83dbSDimitry Andric                                               MachineIRBuilder &B,
38755ffd83dbSDimitry Andric                                               bool IsTyped,
38765ffd83dbSDimitry Andric                                               bool IsFormat) const {
38775ffd83dbSDimitry Andric   Register VData = MI.getOperand(1).getReg();
38785ffd83dbSDimitry Andric   LLT Ty = MRI.getType(VData);
38795ffd83dbSDimitry Andric   LLT EltTy = Ty.getScalarType();
38805ffd83dbSDimitry Andric   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
38815ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
38825ffd83dbSDimitry Andric 
38835ffd83dbSDimitry Andric   VData = fixStoreSourceType(B, VData, IsFormat);
38845ffd83dbSDimitry Andric   Register RSrc = MI.getOperand(2).getReg();
38855ffd83dbSDimitry Andric 
38865ffd83dbSDimitry Andric   MachineMemOperand *MMO = *MI.memoperands_begin();
38875ffd83dbSDimitry Andric   const int MemSize = MMO->getSize();
38885ffd83dbSDimitry Andric 
38895ffd83dbSDimitry Andric   unsigned ImmOffset;
38905ffd83dbSDimitry Andric 
38915ffd83dbSDimitry Andric   // The typed intrinsics add an immediate after the registers.
38925ffd83dbSDimitry Andric   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
38935ffd83dbSDimitry Andric 
38945ffd83dbSDimitry Andric   // The struct intrinsic variants add one additional operand over raw.
38955ffd83dbSDimitry Andric   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
38965ffd83dbSDimitry Andric   Register VIndex;
38975ffd83dbSDimitry Andric   int OpOffset = 0;
38985ffd83dbSDimitry Andric   if (HasVIndex) {
38995ffd83dbSDimitry Andric     VIndex = MI.getOperand(3).getReg();
39005ffd83dbSDimitry Andric     OpOffset = 1;
3901fe6060f1SDimitry Andric   } else {
3902fe6060f1SDimitry Andric     VIndex = B.buildConstant(S32, 0).getReg(0);
39035ffd83dbSDimitry Andric   }
39045ffd83dbSDimitry Andric 
39055ffd83dbSDimitry Andric   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
39065ffd83dbSDimitry Andric   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
39075ffd83dbSDimitry Andric 
39085ffd83dbSDimitry Andric   unsigned Format = 0;
39095ffd83dbSDimitry Andric   if (IsTyped) {
39105ffd83dbSDimitry Andric     Format = MI.getOperand(5 + OpOffset).getImm();
39115ffd83dbSDimitry Andric     ++OpOffset;
39125ffd83dbSDimitry Andric   }
39135ffd83dbSDimitry Andric 
39145ffd83dbSDimitry Andric   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
39155ffd83dbSDimitry Andric 
3916fe6060f1SDimitry Andric   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
3917fe6060f1SDimitry Andric   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
39185ffd83dbSDimitry Andric 
39195ffd83dbSDimitry Andric   unsigned Opc;
39205ffd83dbSDimitry Andric   if (IsTyped) {
39215ffd83dbSDimitry Andric     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
39225ffd83dbSDimitry Andric                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
39235ffd83dbSDimitry Andric   } else if (IsFormat) {
39245ffd83dbSDimitry Andric     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
39255ffd83dbSDimitry Andric                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
39265ffd83dbSDimitry Andric   } else {
39275ffd83dbSDimitry Andric     switch (MemSize) {
39285ffd83dbSDimitry Andric     case 1:
39295ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
39305ffd83dbSDimitry Andric       break;
39315ffd83dbSDimitry Andric     case 2:
39325ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
39335ffd83dbSDimitry Andric       break;
39345ffd83dbSDimitry Andric     default:
39355ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
39365ffd83dbSDimitry Andric       break;
39375ffd83dbSDimitry Andric     }
39385ffd83dbSDimitry Andric   }
39395ffd83dbSDimitry Andric 
39405ffd83dbSDimitry Andric   auto MIB = B.buildInstr(Opc)
39415ffd83dbSDimitry Andric     .addUse(VData)              // vdata
39425ffd83dbSDimitry Andric     .addUse(RSrc)               // rsrc
39435ffd83dbSDimitry Andric     .addUse(VIndex)             // vindex
39445ffd83dbSDimitry Andric     .addUse(VOffset)            // voffset
39455ffd83dbSDimitry Andric     .addUse(SOffset)            // soffset
39465ffd83dbSDimitry Andric     .addImm(ImmOffset);         // offset(imm)
39475ffd83dbSDimitry Andric 
39485ffd83dbSDimitry Andric   if (IsTyped)
39495ffd83dbSDimitry Andric     MIB.addImm(Format);
39505ffd83dbSDimitry Andric 
39515ffd83dbSDimitry Andric   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
39525ffd83dbSDimitry Andric      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
39535ffd83dbSDimitry Andric      .addMemOperand(MMO);
39545ffd83dbSDimitry Andric 
39555ffd83dbSDimitry Andric   MI.eraseFromParent();
39568bcb0991SDimitry Andric   return true;
39578bcb0991SDimitry Andric }
39588bcb0991SDimitry Andric 
39595ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
39605ffd83dbSDimitry Andric                                              MachineRegisterInfo &MRI,
39615ffd83dbSDimitry Andric                                              MachineIRBuilder &B,
39625ffd83dbSDimitry Andric                                              bool IsFormat,
39635ffd83dbSDimitry Andric                                              bool IsTyped) const {
39645ffd83dbSDimitry Andric   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
39655ffd83dbSDimitry Andric   MachineMemOperand *MMO = *MI.memoperands_begin();
3966fe6060f1SDimitry Andric   const LLT MemTy = MMO->getMemoryType();
39675ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
39685ffd83dbSDimitry Andric 
39695ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
39705ffd83dbSDimitry Andric   Register RSrc = MI.getOperand(2).getReg();
39715ffd83dbSDimitry Andric 
39725ffd83dbSDimitry Andric   // The typed intrinsics add an immediate after the registers.
39735ffd83dbSDimitry Andric   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
39745ffd83dbSDimitry Andric 
39755ffd83dbSDimitry Andric   // The struct intrinsic variants add one additional operand over raw.
39765ffd83dbSDimitry Andric   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
39775ffd83dbSDimitry Andric   Register VIndex;
39785ffd83dbSDimitry Andric   int OpOffset = 0;
39795ffd83dbSDimitry Andric   if (HasVIndex) {
39805ffd83dbSDimitry Andric     VIndex = MI.getOperand(3).getReg();
39815ffd83dbSDimitry Andric     OpOffset = 1;
3982fe6060f1SDimitry Andric   } else {
3983fe6060f1SDimitry Andric     VIndex = B.buildConstant(S32, 0).getReg(0);
39848bcb0991SDimitry Andric   }
39858bcb0991SDimitry Andric 
39865ffd83dbSDimitry Andric   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
39875ffd83dbSDimitry Andric   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
39885ffd83dbSDimitry Andric 
39895ffd83dbSDimitry Andric   unsigned Format = 0;
39905ffd83dbSDimitry Andric   if (IsTyped) {
39915ffd83dbSDimitry Andric     Format = MI.getOperand(5 + OpOffset).getImm();
39925ffd83dbSDimitry Andric     ++OpOffset;
39938bcb0991SDimitry Andric   }
39948bcb0991SDimitry Andric 
39955ffd83dbSDimitry Andric   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
39965ffd83dbSDimitry Andric   unsigned ImmOffset;
39975ffd83dbSDimitry Andric 
39985ffd83dbSDimitry Andric   LLT Ty = MRI.getType(Dst);
39995ffd83dbSDimitry Andric   LLT EltTy = Ty.getScalarType();
40005ffd83dbSDimitry Andric   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
40015ffd83dbSDimitry Andric   const bool Unpacked = ST.hasUnpackedD16VMem();
40025ffd83dbSDimitry Andric 
4003fe6060f1SDimitry Andric   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4004fe6060f1SDimitry Andric   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
40055ffd83dbSDimitry Andric 
40065ffd83dbSDimitry Andric   unsigned Opc;
40075ffd83dbSDimitry Andric 
40085ffd83dbSDimitry Andric   if (IsTyped) {
40095ffd83dbSDimitry Andric     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
40105ffd83dbSDimitry Andric                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
40115ffd83dbSDimitry Andric   } else if (IsFormat) {
40125ffd83dbSDimitry Andric     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
40135ffd83dbSDimitry Andric                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
40145ffd83dbSDimitry Andric   } else {
4015fe6060f1SDimitry Andric     switch (MemTy.getSizeInBits()) {
4016fe6060f1SDimitry Andric     case 8:
40175ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
40185ffd83dbSDimitry Andric       break;
4019fe6060f1SDimitry Andric     case 16:
40205ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
40215ffd83dbSDimitry Andric       break;
40225ffd83dbSDimitry Andric     default:
40235ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
40245ffd83dbSDimitry Andric       break;
40255ffd83dbSDimitry Andric     }
40265ffd83dbSDimitry Andric   }
40275ffd83dbSDimitry Andric 
40285ffd83dbSDimitry Andric   Register LoadDstReg;
40295ffd83dbSDimitry Andric 
4030fe6060f1SDimitry Andric   bool IsExtLoad =
4031fe6060f1SDimitry Andric       (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
40325ffd83dbSDimitry Andric   LLT UnpackedTy = Ty.changeElementSize(32);
40335ffd83dbSDimitry Andric 
40345ffd83dbSDimitry Andric   if (IsExtLoad)
40355ffd83dbSDimitry Andric     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
40365ffd83dbSDimitry Andric   else if (Unpacked && IsD16 && Ty.isVector())
40375ffd83dbSDimitry Andric     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
40385ffd83dbSDimitry Andric   else
40395ffd83dbSDimitry Andric     LoadDstReg = Dst;
40405ffd83dbSDimitry Andric 
40415ffd83dbSDimitry Andric   auto MIB = B.buildInstr(Opc)
40425ffd83dbSDimitry Andric     .addDef(LoadDstReg)         // vdata
40435ffd83dbSDimitry Andric     .addUse(RSrc)               // rsrc
40445ffd83dbSDimitry Andric     .addUse(VIndex)             // vindex
40455ffd83dbSDimitry Andric     .addUse(VOffset)            // voffset
40465ffd83dbSDimitry Andric     .addUse(SOffset)            // soffset
40475ffd83dbSDimitry Andric     .addImm(ImmOffset);         // offset(imm)
40485ffd83dbSDimitry Andric 
40495ffd83dbSDimitry Andric   if (IsTyped)
40505ffd83dbSDimitry Andric     MIB.addImm(Format);
40515ffd83dbSDimitry Andric 
40525ffd83dbSDimitry Andric   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
40535ffd83dbSDimitry Andric      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
40545ffd83dbSDimitry Andric      .addMemOperand(MMO);
40555ffd83dbSDimitry Andric 
40565ffd83dbSDimitry Andric   if (LoadDstReg != Dst) {
40575ffd83dbSDimitry Andric     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
40585ffd83dbSDimitry Andric 
40595ffd83dbSDimitry Andric     // Widen result for extending loads was widened.
40605ffd83dbSDimitry Andric     if (IsExtLoad)
40615ffd83dbSDimitry Andric       B.buildTrunc(Dst, LoadDstReg);
40625ffd83dbSDimitry Andric     else {
40635ffd83dbSDimitry Andric       // Repack to original 16-bit vector result
40645ffd83dbSDimitry Andric       // FIXME: G_TRUNC should work, but legalization currently fails
40655ffd83dbSDimitry Andric       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
40665ffd83dbSDimitry Andric       SmallVector<Register, 4> Repack;
40675ffd83dbSDimitry Andric       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
40685ffd83dbSDimitry Andric         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
40695ffd83dbSDimitry Andric       B.buildMerge(Dst, Repack);
40705ffd83dbSDimitry Andric     }
40715ffd83dbSDimitry Andric   }
40725ffd83dbSDimitry Andric 
40735ffd83dbSDimitry Andric   MI.eraseFromParent();
40745ffd83dbSDimitry Andric   return true;
40755ffd83dbSDimitry Andric }
40765ffd83dbSDimitry Andric 
40775ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
40785ffd83dbSDimitry Andric                                                MachineIRBuilder &B,
40795ffd83dbSDimitry Andric                                                bool IsInc) const {
40805ffd83dbSDimitry Andric   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
40815ffd83dbSDimitry Andric                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
40825ffd83dbSDimitry Andric   B.buildInstr(Opc)
40835ffd83dbSDimitry Andric     .addDef(MI.getOperand(0).getReg())
40845ffd83dbSDimitry Andric     .addUse(MI.getOperand(2).getReg())
40855ffd83dbSDimitry Andric     .addUse(MI.getOperand(3).getReg())
40865ffd83dbSDimitry Andric     .cloneMemRefs(MI);
40875ffd83dbSDimitry Andric   MI.eraseFromParent();
40885ffd83dbSDimitry Andric   return true;
40895ffd83dbSDimitry Andric }
40905ffd83dbSDimitry Andric 
/// Map a raw/struct buffer atomic intrinsic ID to the corresponding generic
/// G_AMDGPU_BUFFER_ATOMIC_* pseudo opcode used during legalization. Both the
/// raw and struct variants of an operation share one pseudo.
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
  // The legacy (non raw/struct) fadd intrinsic also maps to the FADD pseudo.
  case Intrinsic::amdgcn_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
  default:
    llvm_unreachable("unhandled atomic opcode");
  }
}
41465ffd83dbSDimitry Andric 
/// Lower a raw/struct buffer atomic intrinsic to the generic
/// G_AMDGPU_BUFFER_ATOMIC_* pseudo expected by instruction selection.
///
/// Operands are decoded positionally. OpOffset tracks how the source operand
/// indices shift: down by one when the intrinsic has no return value, and up
/// by one after consuming the extra cmpswap compare value or the struct
/// variant's vindex operand.
bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
                                               MachineIRBuilder &B,
                                               Intrinsic::ID IID) const {
  const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
                         IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
  const bool HasReturn = MI.getNumExplicitDefs() != 0;

  Register Dst;

  int OpOffset = 0;
  if (HasReturn) {
    // A few FP atomics do not support return values.
    Dst = MI.getOperand(0).getReg();
  } else {
    // No explicit def: every subsequent operand index shifts down by one.
    OpOffset = -1;
  }

  Register VData = MI.getOperand(2 + OpOffset).getReg();
  Register CmpVal;

  if (IsCmpSwap) {
    CmpVal = MI.getOperand(3 + OpOffset).getReg();
    ++OpOffset;
  }

  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
  // Operand count the instruction would have if it were the struct form
  // (which adds a vindex operand); cmpswap and a return value add one each.
  const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  if (HasVIndex) {
    VIndex = MI.getOperand(4 + OpOffset).getReg();
    ++OpOffset;
  } else {
    // Raw form: use a constant zero vindex.
    VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
  }

  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();

  MachineMemOperand *MMO = *MI.memoperands_begin();

  // Split the voffset into a variable register part and an immediate part,
  // then update the memory operand to match the rewritten offsets.
  unsigned ImmOffset;
  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
  updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());

  auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));

  if (HasReturn)
    MIB.addDef(Dst);

  MIB.addUse(VData); // vdata

  if (IsCmpSwap)
    MIB.addReg(CmpVal);

  MIB.addUse(RSrc)               // rsrc
     .addUse(VIndex)             // vindex
     .addUse(VOffset)            // voffset
     .addUse(SOffset)            // soffset
     .addImm(ImmOffset)          // offset(imm)
     .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}
42175ffd83dbSDimitry Andric 
/// Turn the set of 16-bit address components of an image intrinsic into
/// dword-sized (<2 x s16>) registers appended to \p PackedAddrs.
///
/// Walks the vaddr operands of \p MI described by \p Intr, starting at
/// operand index \p ArgOffset. Components that should not be packed (those
/// before the gradients, or those whose corresponding \p IsG16 / \p IsA16
/// control is off) are bitcast to <2 x s16> individually; 16-bit gradient and
/// coordinate components are packed pairwise, with any unpaired element
/// padded with undef in the high half.
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
                                      SmallVectorImpl<Register> &PackedAddrs,
                                      unsigned ArgOffset,
                                      const AMDGPU::ImageDimIntrinsicInfo *Intr,
                                      bool IsA16, bool IsG16) {
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  auto EndIdx = Intr->VAddrEnd;

  for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
    MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
    if (!SrcOp.isReg())
      continue; // _L to _LZ may have eliminated this.

    Register AddrReg = SrcOp.getReg();

    if ((I < Intr->GradientStart) ||
        (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
        (I >= Intr->CoordStart && !IsA16)) {
      // Handle any gradient or coordinate operands that should not be packed
      AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
      PackedAddrs.push_back(AddrReg);
    } else {
      // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
      // derivatives dx/dh and dx/dv are packed with undef.
      if (((I + 1) >= EndIdx) ||
          ((Intr->NumGradients / 2) % 2 == 1 &&
           (I == static_cast<unsigned>(Intr->GradientStart +
                                       (Intr->NumGradients / 2) - 1) ||
            I == static_cast<unsigned>(Intr->GradientStart +
                                       Intr->NumGradients - 1))) ||
          // Check for _L to _LZ optimization
          !MI.getOperand(ArgOffset + I + 1).isReg()) {
        // No partner element available: pad the high half with undef.
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        // Pack this component with the following one, and consume both.
        PackedAddrs.push_back(
            B.buildBuildVector(
                 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
                .getReg(0));
        ++I;
      }
    }
  }
}
42665ffd83dbSDimitry Andric 
42675ffd83dbSDimitry Andric /// Convert from separate vaddr components to a single vector address register,
42685ffd83dbSDimitry Andric /// and replace the remaining operands with $noreg.
42695ffd83dbSDimitry Andric static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
42705ffd83dbSDimitry Andric                                      int DimIdx, int NumVAddrs) {
42715ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
42725ffd83dbSDimitry Andric 
42735ffd83dbSDimitry Andric   SmallVector<Register, 8> AddrRegs;
42745ffd83dbSDimitry Andric   for (int I = 0; I != NumVAddrs; ++I) {
42755ffd83dbSDimitry Andric     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
42765ffd83dbSDimitry Andric     if (SrcOp.isReg()) {
42775ffd83dbSDimitry Andric       AddrRegs.push_back(SrcOp.getReg());
42785ffd83dbSDimitry Andric       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
42795ffd83dbSDimitry Andric     }
42805ffd83dbSDimitry Andric   }
42815ffd83dbSDimitry Andric 
42825ffd83dbSDimitry Andric   int NumAddrRegs = AddrRegs.size();
42835ffd83dbSDimitry Andric   if (NumAddrRegs != 1) {
4284fe6060f1SDimitry Andric     // Above 8 elements round up to next power of 2 (i.e. 16).
4285fe6060f1SDimitry Andric     if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
42865ffd83dbSDimitry Andric       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
42875ffd83dbSDimitry Andric       auto Undef = B.buildUndef(S32);
42885ffd83dbSDimitry Andric       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
42895ffd83dbSDimitry Andric       NumAddrRegs = RoundedNumRegs;
42905ffd83dbSDimitry Andric     }
42915ffd83dbSDimitry Andric 
4292fe6060f1SDimitry Andric     auto VAddr =
4293fe6060f1SDimitry Andric         B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
42945ffd83dbSDimitry Andric     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
42955ffd83dbSDimitry Andric   }
42965ffd83dbSDimitry Andric 
42975ffd83dbSDimitry Andric   for (int I = 1; I != NumVAddrs; ++I) {
42985ffd83dbSDimitry Andric     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
42995ffd83dbSDimitry Andric     if (SrcOp.isReg())
43005ffd83dbSDimitry Andric       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
43015ffd83dbSDimitry Andric   }
43025ffd83dbSDimitry Andric }
43035ffd83dbSDimitry Andric 
43045ffd83dbSDimitry Andric /// Rewrite image intrinsics to use register layouts expected by the subtarget.
43055ffd83dbSDimitry Andric ///
43065ffd83dbSDimitry Andric /// Depending on the subtarget, load/store with 16-bit element data need to be
43075ffd83dbSDimitry Andric /// rewritten to use the low half of 32-bit registers, or directly use a packed
43085ffd83dbSDimitry Andric /// layout. 16-bit addresses should also sometimes be packed into 32-bit
43095ffd83dbSDimitry Andric /// registers.
43105ffd83dbSDimitry Andric ///
43115ffd83dbSDimitry Andric /// We don't want to directly select image instructions just yet, but also want
43125ffd83dbSDimitry Andric /// to exposes all register repacking to the legalizer/combiners. We also don't
43135ffd83dbSDimitry Andric /// want a selected instrution entering RegBankSelect. In order to avoid
43145ffd83dbSDimitry Andric /// defining a multitude of intermediate image instructions, directly hack on
4315*349cc55cSDimitry Andric /// the intrinsic's arguments. In cases like a16 addresses, this requires
4316*349cc55cSDimitry Andric /// padding now unnecessary arguments with $noreg.
43175ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4318e8d8bef9SDimitry Andric     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4319e8d8bef9SDimitry Andric     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
43205ffd83dbSDimitry Andric 
4321e8d8bef9SDimitry Andric   const unsigned NumDefs = MI.getNumExplicitDefs();
4322e8d8bef9SDimitry Andric   const unsigned ArgOffset = NumDefs + 1;
43235ffd83dbSDimitry Andric   bool IsTFE = NumDefs == 2;
43245ffd83dbSDimitry Andric   // We are only processing the operands of d16 image operations on subtargets
43255ffd83dbSDimitry Andric   // that use the unpacked register layout, or need to repack the TFE result.
43265ffd83dbSDimitry Andric 
43275ffd83dbSDimitry Andric   // TODO: Do we need to guard against already legalized intrinsics?
43285ffd83dbSDimitry Andric   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4329e8d8bef9SDimitry Andric       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
43305ffd83dbSDimitry Andric 
43315ffd83dbSDimitry Andric   MachineRegisterInfo *MRI = B.getMRI();
43325ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
43335ffd83dbSDimitry Andric   const LLT S16 = LLT::scalar(16);
4334fe6060f1SDimitry Andric   const LLT V2S16 = LLT::fixed_vector(2, 16);
43355ffd83dbSDimitry Andric 
43365ffd83dbSDimitry Andric   unsigned DMask = 0;
43375ffd83dbSDimitry Andric 
43385ffd83dbSDimitry Andric   // Check for 16 bit addresses and pack if true.
4339e8d8bef9SDimitry Andric   LLT GradTy =
4340e8d8bef9SDimitry Andric       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4341e8d8bef9SDimitry Andric   LLT AddrTy =
4342e8d8bef9SDimitry Andric       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
43435ffd83dbSDimitry Andric   const bool IsG16 = GradTy == S16;
43445ffd83dbSDimitry Andric   const bool IsA16 = AddrTy == S16;
43455ffd83dbSDimitry Andric 
43465ffd83dbSDimitry Andric   int DMaskLanes = 0;
43475ffd83dbSDimitry Andric   if (!BaseOpcode->Atomic) {
4348e8d8bef9SDimitry Andric     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
43495ffd83dbSDimitry Andric     if (BaseOpcode->Gather4) {
43505ffd83dbSDimitry Andric       DMaskLanes = 4;
43515ffd83dbSDimitry Andric     } else if (DMask != 0) {
43525ffd83dbSDimitry Andric       DMaskLanes = countPopulation(DMask);
43535ffd83dbSDimitry Andric     } else if (!IsTFE && !BaseOpcode->Store) {
43545ffd83dbSDimitry Andric       // If dmask is 0, this is a no-op load. This can be eliminated.
43555ffd83dbSDimitry Andric       B.buildUndef(MI.getOperand(0));
43565ffd83dbSDimitry Andric       MI.eraseFromParent();
43575ffd83dbSDimitry Andric       return true;
43585ffd83dbSDimitry Andric     }
43595ffd83dbSDimitry Andric   }
43605ffd83dbSDimitry Andric 
43615ffd83dbSDimitry Andric   Observer.changingInstr(MI);
43625ffd83dbSDimitry Andric   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
43635ffd83dbSDimitry Andric 
43645ffd83dbSDimitry Andric   unsigned NewOpcode = NumDefs == 0 ?
43655ffd83dbSDimitry Andric     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
43665ffd83dbSDimitry Andric 
43675ffd83dbSDimitry Andric   // Track that we legalized this
43685ffd83dbSDimitry Andric   MI.setDesc(B.getTII().get(NewOpcode));
43695ffd83dbSDimitry Andric 
43705ffd83dbSDimitry Andric   // Expecting to get an error flag since TFC is on - and dmask is 0 Force
43715ffd83dbSDimitry Andric   // dmask to be at least 1 otherwise the instruction will fail
43725ffd83dbSDimitry Andric   if (IsTFE && DMask == 0) {
43735ffd83dbSDimitry Andric     DMask = 0x1;
43745ffd83dbSDimitry Andric     DMaskLanes = 1;
4375e8d8bef9SDimitry Andric     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
43765ffd83dbSDimitry Andric   }
43775ffd83dbSDimitry Andric 
43785ffd83dbSDimitry Andric   if (BaseOpcode->Atomic) {
43795ffd83dbSDimitry Andric     Register VData0 = MI.getOperand(2).getReg();
43805ffd83dbSDimitry Andric     LLT Ty = MRI->getType(VData0);
43815ffd83dbSDimitry Andric 
43825ffd83dbSDimitry Andric     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
43835ffd83dbSDimitry Andric     if (Ty.isVector())
43845ffd83dbSDimitry Andric       return false;
43855ffd83dbSDimitry Andric 
43865ffd83dbSDimitry Andric     if (BaseOpcode->AtomicX2) {
43875ffd83dbSDimitry Andric       Register VData1 = MI.getOperand(3).getReg();
43885ffd83dbSDimitry Andric       // The two values are packed in one register.
4389fe6060f1SDimitry Andric       LLT PackedTy = LLT::fixed_vector(2, Ty);
43905ffd83dbSDimitry Andric       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
43915ffd83dbSDimitry Andric       MI.getOperand(2).setReg(Concat.getReg(0));
43925ffd83dbSDimitry Andric       MI.getOperand(3).setReg(AMDGPU::NoRegister);
43935ffd83dbSDimitry Andric     }
43945ffd83dbSDimitry Andric   }
43955ffd83dbSDimitry Andric 
4396e8d8bef9SDimitry Andric   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
43975ffd83dbSDimitry Andric 
43985ffd83dbSDimitry Andric   // Optimize _L to _LZ when _L is zero
43995ffd83dbSDimitry Andric   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4400e8d8bef9SDimitry Andric           AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
44015ffd83dbSDimitry Andric     const ConstantFP *ConstantLod;
44025ffd83dbSDimitry Andric 
4403e8d8bef9SDimitry Andric     if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4404e8d8bef9SDimitry Andric                  m_GFCst(ConstantLod))) {
44055ffd83dbSDimitry Andric       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
44065ffd83dbSDimitry Andric         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4407e8d8bef9SDimitry Andric         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4408*349cc55cSDimitry Andric             AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ,
4409e8d8bef9SDimitry Andric                                                      Intr->Dim);
44105ffd83dbSDimitry Andric 
44115ffd83dbSDimitry Andric         // The starting indexes should remain in the same place.
44125ffd83dbSDimitry Andric         --CorrectedNumVAddrs;
44135ffd83dbSDimitry Andric 
4414e8d8bef9SDimitry Andric         MI.getOperand(MI.getNumExplicitDefs())
4415e8d8bef9SDimitry Andric             .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4416e8d8bef9SDimitry Andric         MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4417e8d8bef9SDimitry Andric         Intr = NewImageDimIntr;
44185ffd83dbSDimitry Andric       }
44195ffd83dbSDimitry Andric     }
44205ffd83dbSDimitry Andric   }
44215ffd83dbSDimitry Andric 
44225ffd83dbSDimitry Andric   // Optimize _mip away, when 'lod' is zero
4423e8d8bef9SDimitry Andric   if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
44245ffd83dbSDimitry Andric     int64_t ConstantLod;
4425e8d8bef9SDimitry Andric     if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4426e8d8bef9SDimitry Andric                  m_ICst(ConstantLod))) {
44275ffd83dbSDimitry Andric       if (ConstantLod == 0) {
44285ffd83dbSDimitry Andric         // TODO: Change intrinsic opcode and remove operand instead or replacing
44295ffd83dbSDimitry Andric         // it with 0, as the _L to _LZ handling is done above.
4430e8d8bef9SDimitry Andric         MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
44315ffd83dbSDimitry Andric         --CorrectedNumVAddrs;
44325ffd83dbSDimitry Andric       }
44335ffd83dbSDimitry Andric     }
44345ffd83dbSDimitry Andric   }
44355ffd83dbSDimitry Andric 
44365ffd83dbSDimitry Andric   // Rewrite the addressing register layout before doing anything else.
4437fe6060f1SDimitry Andric   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
4438fe6060f1SDimitry Andric     // 16 bit gradients are supported, but are tied to the A16 control
4439fe6060f1SDimitry Andric     // so both gradients and addresses must be 16 bit
44405ffd83dbSDimitry Andric     return false;
4441fe6060f1SDimitry Andric   }
44425ffd83dbSDimitry Andric 
4443fe6060f1SDimitry Andric   if (IsA16 && !ST.hasA16()) {
4444fe6060f1SDimitry Andric     // A16 not supported
4445fe6060f1SDimitry Andric     return false;
4446fe6060f1SDimitry Andric   }
4447fe6060f1SDimitry Andric 
4448fe6060f1SDimitry Andric   if (IsA16 || IsG16) {
4449e8d8bef9SDimitry Andric     if (Intr->NumVAddrs > 1) {
44505ffd83dbSDimitry Andric       SmallVector<Register, 4> PackedRegs;
44515ffd83dbSDimitry Andric 
4452fe6060f1SDimitry Andric       packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4453fe6060f1SDimitry Andric                                 IsG16);
44545ffd83dbSDimitry Andric 
44555ffd83dbSDimitry Andric       // See also below in the non-a16 branch
4456fe6060f1SDimitry Andric       const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
4457fe6060f1SDimitry Andric                           PackedRegs.size() <= ST.getNSAMaxSize();
44585ffd83dbSDimitry Andric 
44595ffd83dbSDimitry Andric       if (!UseNSA && PackedRegs.size() > 1) {
4460fe6060f1SDimitry Andric         LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
44615ffd83dbSDimitry Andric         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
44625ffd83dbSDimitry Andric         PackedRegs[0] = Concat.getReg(0);
44635ffd83dbSDimitry Andric         PackedRegs.resize(1);
44645ffd83dbSDimitry Andric       }
44655ffd83dbSDimitry Andric 
4466e8d8bef9SDimitry Andric       const unsigned NumPacked = PackedRegs.size();
4467e8d8bef9SDimitry Andric       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4468e8d8bef9SDimitry Andric         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
44695ffd83dbSDimitry Andric         if (!SrcOp.isReg()) {
44705ffd83dbSDimitry Andric           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
44715ffd83dbSDimitry Andric           continue;
44725ffd83dbSDimitry Andric         }
44735ffd83dbSDimitry Andric 
44745ffd83dbSDimitry Andric         assert(SrcOp.getReg() != AMDGPU::NoRegister);
44755ffd83dbSDimitry Andric 
4476e8d8bef9SDimitry Andric         if (I - Intr->VAddrStart < NumPacked)
4477e8d8bef9SDimitry Andric           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
44785ffd83dbSDimitry Andric         else
44795ffd83dbSDimitry Andric           SrcOp.setReg(AMDGPU::NoRegister);
44805ffd83dbSDimitry Andric       }
44815ffd83dbSDimitry Andric     }
44825ffd83dbSDimitry Andric   } else {
44835ffd83dbSDimitry Andric     // If the register allocator cannot place the address registers contiguously
44845ffd83dbSDimitry Andric     // without introducing moves, then using the non-sequential address encoding
44855ffd83dbSDimitry Andric     // is always preferable, since it saves VALU instructions and is usually a
44865ffd83dbSDimitry Andric     // wash in terms of code size or even better.
44875ffd83dbSDimitry Andric     //
44885ffd83dbSDimitry Andric     // However, we currently have no way of hinting to the register allocator
44895ffd83dbSDimitry Andric     // that MIMG addresses should be placed contiguously when it is possible to
44905ffd83dbSDimitry Andric     // do so, so force non-NSA for the common 2-address case as a heuristic.
44915ffd83dbSDimitry Andric     //
44925ffd83dbSDimitry Andric     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
44935ffd83dbSDimitry Andric     // allocation when possible.
4494fe6060f1SDimitry Andric     const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
4495fe6060f1SDimitry Andric                         CorrectedNumVAddrs <= ST.getNSAMaxSize();
44965ffd83dbSDimitry Andric 
4497e8d8bef9SDimitry Andric     if (!UseNSA && Intr->NumVAddrs > 1)
4498e8d8bef9SDimitry Andric       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4499e8d8bef9SDimitry Andric                                Intr->NumVAddrs);
45005ffd83dbSDimitry Andric   }
45015ffd83dbSDimitry Andric 
45025ffd83dbSDimitry Andric   int Flags = 0;
45035ffd83dbSDimitry Andric   if (IsA16)
45045ffd83dbSDimitry Andric     Flags |= 1;
45055ffd83dbSDimitry Andric   if (IsG16)
45065ffd83dbSDimitry Andric     Flags |= 2;
45075ffd83dbSDimitry Andric   MI.addOperand(MachineOperand::CreateImm(Flags));
45085ffd83dbSDimitry Andric 
45095ffd83dbSDimitry Andric   if (BaseOpcode->Store) { // No TFE for stores?
45105ffd83dbSDimitry Andric     // TODO: Handle dmask trim
45115ffd83dbSDimitry Andric     Register VData = MI.getOperand(1).getReg();
45125ffd83dbSDimitry Andric     LLT Ty = MRI->getType(VData);
45135ffd83dbSDimitry Andric     if (!Ty.isVector() || Ty.getElementType() != S16)
45145ffd83dbSDimitry Andric       return true;
45155ffd83dbSDimitry Andric 
4516e8d8bef9SDimitry Andric     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
45175ffd83dbSDimitry Andric     if (RepackedReg != VData) {
45185ffd83dbSDimitry Andric       MI.getOperand(1).setReg(RepackedReg);
45195ffd83dbSDimitry Andric     }
45205ffd83dbSDimitry Andric 
45215ffd83dbSDimitry Andric     return true;
45225ffd83dbSDimitry Andric   }
45235ffd83dbSDimitry Andric 
45245ffd83dbSDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
45255ffd83dbSDimitry Andric   LLT Ty = MRI->getType(DstReg);
45265ffd83dbSDimitry Andric   const LLT EltTy = Ty.getScalarType();
45275ffd83dbSDimitry Andric   const bool IsD16 = Ty.getScalarType() == S16;
45285ffd83dbSDimitry Andric   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
45295ffd83dbSDimitry Andric 
45305ffd83dbSDimitry Andric   // Confirm that the return type is large enough for the dmask specified
45315ffd83dbSDimitry Andric   if (NumElts < DMaskLanes)
45325ffd83dbSDimitry Andric     return false;
45335ffd83dbSDimitry Andric 
45345ffd83dbSDimitry Andric   if (NumElts > 4 || DMaskLanes > 4)
45355ffd83dbSDimitry Andric     return false;
45365ffd83dbSDimitry Andric 
45375ffd83dbSDimitry Andric   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4538fe6060f1SDimitry Andric   const LLT AdjustedTy =
4539fe6060f1SDimitry Andric       Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
45405ffd83dbSDimitry Andric 
45415ffd83dbSDimitry Andric   // The raw dword aligned data component of the load. The only legal cases
45425ffd83dbSDimitry Andric   // where this matters should be when using the packed D16 format, for
45435ffd83dbSDimitry Andric   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
45445ffd83dbSDimitry Andric   LLT RoundedTy;
45455ffd83dbSDimitry Andric 
45465ffd83dbSDimitry Andric   // S32 vector to to cover all data, plus TFE result element.
45475ffd83dbSDimitry Andric   LLT TFETy;
45485ffd83dbSDimitry Andric 
45495ffd83dbSDimitry Andric   // Register type to use for each loaded component. Will be S32 or V2S16.
45505ffd83dbSDimitry Andric   LLT RegTy;
45515ffd83dbSDimitry Andric 
45525ffd83dbSDimitry Andric   if (IsD16 && ST.hasUnpackedD16VMem()) {
4553fe6060f1SDimitry Andric     RoundedTy =
4554fe6060f1SDimitry Andric         LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
4555fe6060f1SDimitry Andric     TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
45565ffd83dbSDimitry Andric     RegTy = S32;
45575ffd83dbSDimitry Andric   } else {
45585ffd83dbSDimitry Andric     unsigned EltSize = EltTy.getSizeInBits();
45595ffd83dbSDimitry Andric     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
45605ffd83dbSDimitry Andric     unsigned RoundedSize = 32 * RoundedElts;
4561fe6060f1SDimitry Andric     RoundedTy = LLT::scalarOrVector(
4562fe6060f1SDimitry Andric         ElementCount::getFixed(RoundedSize / EltSize), EltSize);
4563fe6060f1SDimitry Andric     TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
45645ffd83dbSDimitry Andric     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
45655ffd83dbSDimitry Andric   }
45665ffd83dbSDimitry Andric 
45675ffd83dbSDimitry Andric   // The return type does not need adjustment.
45685ffd83dbSDimitry Andric   // TODO: Should we change s16 case to s32 or <2 x s16>?
45695ffd83dbSDimitry Andric   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
45705ffd83dbSDimitry Andric     return true;
45715ffd83dbSDimitry Andric 
45725ffd83dbSDimitry Andric   Register Dst1Reg;
45735ffd83dbSDimitry Andric 
45745ffd83dbSDimitry Andric   // Insert after the instruction.
45755ffd83dbSDimitry Andric   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
45765ffd83dbSDimitry Andric 
45775ffd83dbSDimitry Andric   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
45785ffd83dbSDimitry Andric   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
45795ffd83dbSDimitry Andric   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
45805ffd83dbSDimitry Andric   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
45815ffd83dbSDimitry Andric 
45825ffd83dbSDimitry Andric   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
45835ffd83dbSDimitry Andric 
45845ffd83dbSDimitry Andric   MI.getOperand(0).setReg(NewResultReg);
45855ffd83dbSDimitry Andric 
45865ffd83dbSDimitry Andric   // In the IR, TFE is supposed to be used with a 2 element struct return
4587*349cc55cSDimitry Andric   // type. The instruction really returns these two values in one contiguous
45885ffd83dbSDimitry Andric   // register, with one additional dword beyond the loaded data. Rewrite the
45895ffd83dbSDimitry Andric   // return type to use a single register result.
45905ffd83dbSDimitry Andric 
45915ffd83dbSDimitry Andric   if (IsTFE) {
45925ffd83dbSDimitry Andric     Dst1Reg = MI.getOperand(1).getReg();
45935ffd83dbSDimitry Andric     if (MRI->getType(Dst1Reg) != S32)
45945ffd83dbSDimitry Andric       return false;
45955ffd83dbSDimitry Andric 
45965ffd83dbSDimitry Andric     // TODO: Make sure the TFE operand bit is set.
45975ffd83dbSDimitry Andric     MI.RemoveOperand(1);
45985ffd83dbSDimitry Andric 
45995ffd83dbSDimitry Andric     // Handle the easy case that requires no repack instructions.
46005ffd83dbSDimitry Andric     if (Ty == S32) {
46015ffd83dbSDimitry Andric       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
46025ffd83dbSDimitry Andric       return true;
46035ffd83dbSDimitry Andric     }
46045ffd83dbSDimitry Andric   }
46055ffd83dbSDimitry Andric 
46065ffd83dbSDimitry Andric   // Now figure out how to copy the new result register back into the old
46075ffd83dbSDimitry Andric   // result.
46085ffd83dbSDimitry Andric   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
46095ffd83dbSDimitry Andric 
46105ffd83dbSDimitry Andric   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
46115ffd83dbSDimitry Andric 
46125ffd83dbSDimitry Andric   if (ResultNumRegs == 1) {
46135ffd83dbSDimitry Andric     assert(!IsTFE);
46145ffd83dbSDimitry Andric     ResultRegs[0] = NewResultReg;
46155ffd83dbSDimitry Andric   } else {
46165ffd83dbSDimitry Andric     // We have to repack into a new vector of some kind.
46175ffd83dbSDimitry Andric     for (int I = 0; I != NumDataRegs; ++I)
46185ffd83dbSDimitry Andric       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
46195ffd83dbSDimitry Andric     B.buildUnmerge(ResultRegs, NewResultReg);
46205ffd83dbSDimitry Andric 
46215ffd83dbSDimitry Andric     // Drop the final TFE element to get the data part. The TFE result is
46225ffd83dbSDimitry Andric     // directly written to the right place already.
46235ffd83dbSDimitry Andric     if (IsTFE)
46245ffd83dbSDimitry Andric       ResultRegs.resize(NumDataRegs);
46255ffd83dbSDimitry Andric   }
46265ffd83dbSDimitry Andric 
46275ffd83dbSDimitry Andric   // For an s16 scalar result, we form an s32 result with a truncate regardless
46285ffd83dbSDimitry Andric   // of packed vs. unpacked.
46295ffd83dbSDimitry Andric   if (IsD16 && !Ty.isVector()) {
46305ffd83dbSDimitry Andric     B.buildTrunc(DstReg, ResultRegs[0]);
46315ffd83dbSDimitry Andric     return true;
46325ffd83dbSDimitry Andric   }
46335ffd83dbSDimitry Andric 
46345ffd83dbSDimitry Andric   // Avoid a build/concat_vector of 1 entry.
46355ffd83dbSDimitry Andric   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
46365ffd83dbSDimitry Andric     B.buildBitcast(DstReg, ResultRegs[0]);
46375ffd83dbSDimitry Andric     return true;
46385ffd83dbSDimitry Andric   }
46395ffd83dbSDimitry Andric 
46405ffd83dbSDimitry Andric   assert(Ty.isVector());
46415ffd83dbSDimitry Andric 
46425ffd83dbSDimitry Andric   if (IsD16) {
46435ffd83dbSDimitry Andric     // For packed D16 results with TFE enabled, all the data components are
46445ffd83dbSDimitry Andric     // S32. Cast back to the expected type.
46455ffd83dbSDimitry Andric     //
46465ffd83dbSDimitry Andric     // TODO: We don't really need to use load s32 elements. We would only need one
46475ffd83dbSDimitry Andric     // cast for the TFE result if a multiple of v2s16 was used.
46485ffd83dbSDimitry Andric     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
46495ffd83dbSDimitry Andric       for (Register &Reg : ResultRegs)
46505ffd83dbSDimitry Andric         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
46515ffd83dbSDimitry Andric     } else if (ST.hasUnpackedD16VMem()) {
46525ffd83dbSDimitry Andric       for (Register &Reg : ResultRegs)
46535ffd83dbSDimitry Andric         Reg = B.buildTrunc(S16, Reg).getReg(0);
46545ffd83dbSDimitry Andric     }
46555ffd83dbSDimitry Andric   }
46565ffd83dbSDimitry Andric 
46575ffd83dbSDimitry Andric   auto padWithUndef = [&](LLT Ty, int NumElts) {
46585ffd83dbSDimitry Andric     if (NumElts == 0)
46595ffd83dbSDimitry Andric       return;
46605ffd83dbSDimitry Andric     Register Undef = B.buildUndef(Ty).getReg(0);
46615ffd83dbSDimitry Andric     for (int I = 0; I != NumElts; ++I)
46625ffd83dbSDimitry Andric       ResultRegs.push_back(Undef);
46635ffd83dbSDimitry Andric   };
46645ffd83dbSDimitry Andric 
46655ffd83dbSDimitry Andric   // Pad out any elements eliminated due to the dmask.
46665ffd83dbSDimitry Andric   LLT ResTy = MRI->getType(ResultRegs[0]);
46675ffd83dbSDimitry Andric   if (!ResTy.isVector()) {
46685ffd83dbSDimitry Andric     padWithUndef(ResTy, NumElts - ResultRegs.size());
46695ffd83dbSDimitry Andric     B.buildBuildVector(DstReg, ResultRegs);
46705ffd83dbSDimitry Andric     return true;
46715ffd83dbSDimitry Andric   }
46725ffd83dbSDimitry Andric 
46735ffd83dbSDimitry Andric   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
46745ffd83dbSDimitry Andric   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
46755ffd83dbSDimitry Andric 
46765ffd83dbSDimitry Andric   // Deal with the one annoying legal case.
4677fe6060f1SDimitry Andric   const LLT V3S16 = LLT::fixed_vector(3, 16);
46785ffd83dbSDimitry Andric   if (Ty == V3S16) {
46795ffd83dbSDimitry Andric     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4680fe6060f1SDimitry Andric     auto Concat = B.buildConcatVectors(LLT::fixed_vector(6, 16), ResultRegs);
46815ffd83dbSDimitry Andric     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
46825ffd83dbSDimitry Andric     return true;
46835ffd83dbSDimitry Andric   }
46845ffd83dbSDimitry Andric 
46855ffd83dbSDimitry Andric   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
46865ffd83dbSDimitry Andric   B.buildConcatVectors(DstReg, ResultRegs);
46875ffd83dbSDimitry Andric   return true;
46885ffd83dbSDimitry Andric }
46895ffd83dbSDimitry Andric 
// Legalize llvm.amdgcn.s.buffer.load: rewrite the intrinsic into the
// G_AMDGPU_S_BUFFER_LOAD pseudo, attach the memory operand the intrinsic
// definition lacks, and legalize the result type (bitcast and/or widen to a
// power-of-2 size).
bool AMDGPULegalizerInfo::legalizeSBufferLoad(
  LegalizerHelper &Helper, MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  GISelChangeObserver &Observer = Helper.Observer;

  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Size = Ty.getSizeInBits();
  MachineFunction &MF = B.getMF();

  // All edits below mutate MI in place, so bracket them with the observer
  // changing/changed notifications.
  Observer.changingInstr(MI);

  // If the target prefers to load this type as an equivalently-sized bitcast
  // type, rewrite the destination first and refresh Dst / the insert point.
  if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
    Ty = getBitcastRegisterType(Ty);
    Helper.bitcastDst(MI, Ty, 0);
    Dst = MI.getOperand(0).getReg();
    B.setInsertPt(B.getMBB(), MI);
  }

  // FIXME: We don't really need this intermediate instruction. The intrinsic
  // should be fixed to have a memory operand. Since it's readnone, we're not
  // allowed to add one.
  MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
  MI.RemoveOperand(1); // Remove intrinsic ID

  // FIXME: When intrinsic definition is fixed, this should have an MMO already.
  // TODO: Should this use datalayout alignment?
  const unsigned MemSize = (Size + 7) / 8; // Result size rounded up to bytes.
  const Align MemAlign(4);
  // No concrete pointer is known; the access is modeled as an invariant,
  // dereferenceable load (matching the readnone intrinsic semantics).
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      MemSize, MemAlign);
  MI.addMemOperand(MF, MMO);

  // There are no 96-bit result scalar loads, but widening to 128-bit should
  // always be legal. We may need to restore this to a 96-bit result if it turns
  // out this needs to be converted to a vector load during RegBankSelect.
  if (!isPowerOf2_32(Size)) {
    if (Ty.isVector())
      Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
    else
      Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
  }

  Observer.changedInstr(MI);
  return true;
}
47395ffd83dbSDimitry Andric 
4740e8d8bef9SDimitry Andric // TODO: Move to selection
47415ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
47420b57cec5SDimitry Andric                                                 MachineRegisterInfo &MRI,
47430b57cec5SDimitry Andric                                                 MachineIRBuilder &B) const {
4744fe6060f1SDimitry Andric   if (!ST.isTrapHandlerEnabled() ||
4745fe6060f1SDimitry Andric       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4746fe6060f1SDimitry Andric     return legalizeTrapEndpgm(MI, MRI, B);
4747fe6060f1SDimitry Andric 
4748fe6060f1SDimitry Andric   if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4749fe6060f1SDimitry Andric     switch (*HsaAbiVer) {
4750fe6060f1SDimitry Andric     case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4751fe6060f1SDimitry Andric     case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4752fe6060f1SDimitry Andric       return legalizeTrapHsaQueuePtr(MI, MRI, B);
4753fe6060f1SDimitry Andric     case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4754fe6060f1SDimitry Andric       return ST.supportsGetDoorbellID() ?
4755fe6060f1SDimitry Andric           legalizeTrapHsa(MI, MRI, B) :
4756fe6060f1SDimitry Andric           legalizeTrapHsaQueuePtr(MI, MRI, B);
4757fe6060f1SDimitry Andric     }
4758fe6060f1SDimitry Andric   }
4759fe6060f1SDimitry Andric 
4760fe6060f1SDimitry Andric   llvm_unreachable("Unknown trap handler");
4761fe6060f1SDimitry Andric }
4762fe6060f1SDimitry Andric 
4763fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4764fe6060f1SDimitry Andric     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
47655ffd83dbSDimitry Andric   B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4766fe6060f1SDimitry Andric   MI.eraseFromParent();
4767fe6060f1SDimitry Andric   return true;
4768fe6060f1SDimitry Andric }
4769fe6060f1SDimitry Andric 
4770fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4771fe6060f1SDimitry Andric     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
47725ffd83dbSDimitry Andric   // Pass queue pointer to trap handler as input, and insert trap instruction
47735ffd83dbSDimitry Andric   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4774e8d8bef9SDimitry Andric   Register LiveIn =
4775e8d8bef9SDimitry Andric     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4776e8d8bef9SDimitry Andric   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
47775ffd83dbSDimitry Andric     return false;
4778e8d8bef9SDimitry Andric 
4779e8d8bef9SDimitry Andric   Register SGPR01(AMDGPU::SGPR0_SGPR1);
47805ffd83dbSDimitry Andric   B.buildCopy(SGPR01, LiveIn);
47815ffd83dbSDimitry Andric   B.buildInstr(AMDGPU::S_TRAP)
4782fe6060f1SDimitry Andric       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
47835ffd83dbSDimitry Andric       .addReg(SGPR01, RegState::Implicit);
4784fe6060f1SDimitry Andric 
4785fe6060f1SDimitry Andric   MI.eraseFromParent();
4786fe6060f1SDimitry Andric   return true;
47875ffd83dbSDimitry Andric }
47885ffd83dbSDimitry Andric 
4789fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeTrapHsa(
4790fe6060f1SDimitry Andric     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4791fe6060f1SDimitry Andric   B.buildInstr(AMDGPU::S_TRAP)
4792fe6060f1SDimitry Andric       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
47935ffd83dbSDimitry Andric   MI.eraseFromParent();
47945ffd83dbSDimitry Andric   return true;
47955ffd83dbSDimitry Andric }
47965ffd83dbSDimitry Andric 
47975ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
47985ffd83dbSDimitry Andric     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4799*349cc55cSDimitry Andric   // Is non-HSA path or trap-handler disabled? Then, report a warning
48005ffd83dbSDimitry Andric   // accordingly
4801fe6060f1SDimitry Andric   if (!ST.isTrapHandlerEnabled() ||
4802fe6060f1SDimitry Andric       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
48035ffd83dbSDimitry Andric     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
48045ffd83dbSDimitry Andric                                      "debugtrap handler not supported",
48055ffd83dbSDimitry Andric                                      MI.getDebugLoc(), DS_Warning);
48065ffd83dbSDimitry Andric     LLVMContext &Ctx = B.getMF().getFunction().getContext();
48075ffd83dbSDimitry Andric     Ctx.diagnose(NoTrap);
48085ffd83dbSDimitry Andric   } else {
48095ffd83dbSDimitry Andric     // Insert debug-trap instruction
4810fe6060f1SDimitry Andric     B.buildInstr(AMDGPU::S_TRAP)
4811fe6060f1SDimitry Andric         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
48125ffd83dbSDimitry Andric   }
48135ffd83dbSDimitry Andric 
48145ffd83dbSDimitry Andric   MI.eraseFromParent();
48155ffd83dbSDimitry Andric   return true;
48165ffd83dbSDimitry Andric }
48175ffd83dbSDimitry Andric 
// Lower the BVH ray-intersection intrinsic to the
// G_AMDGPU_INTRIN_BVH_INTERSECT_RAY pseudo: pick a concrete MIMG opcode based
// on node-pointer width, a16 packing, and NSA availability, then repack the
// ray operands into the dword layout the instruction expects.
bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
                                               MachineIRBuilder &B) const {
  MachineRegisterInfo &MRI = *B.getMRI();
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);

  // Intrinsic operands: result, intrinsic ID, node pointer, ray extent,
  // origin, direction, inverse direction, texture descriptor.
  Register DstReg = MI.getOperand(0).getReg();
  Register NodePtr = MI.getOperand(2).getReg();
  Register RayExtent = MI.getOperand(3).getReg();
  Register RayOrigin = MI.getOperand(4).getReg();
  Register RayDir = MI.getOperand(5).getReg();
  Register RayInvDir = MI.getOperand(6).getReg();
  Register TDescr = MI.getOperand(7).getReg();

  // The instruction only exists on subtargets with the GFX10_A encoding;
  // diagnose instead of crashing on others.
  if (!ST.hasGFX10_AEncoding()) {
    DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
                                        "intrinsic not supported on subtarget",
                                        MI.getDebugLoc());
    B.getMF().getFunction().getContext().diagnose(BadIntrin);
    return false;
  }

  // 16-bit ray direction elements select the _a16 opcode variants; a 64-bit
  // node pointer selects the BVH64 variants.
  const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
  const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
  const unsigned NumVDataDwords = 4;
  // Address dword count depends on pointer width and on whether the
  // direction/inverse-direction components are packed as 16-bit pairs.
  const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
  // Use the non-sequential-address (NSA) encoding when available and the
  // address operand count fits its limit.
  const bool UseNSA =
      ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize();
  // Indexed as [Is64][IsA16].
  const unsigned BaseOpcodes[2][2] = {
      {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
      {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
       AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
  int Opcode;
  if (UseNSA) {
    Opcode =
        AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA,
                              NumVDataDwords, NumVAddrDwords);
  } else {
    // The non-NSA encoding takes one contiguous register tuple, so round the
    // address size up to a power of 2.
    Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
                                   AMDGPU::MIMGEncGfx10Default, NumVDataDwords,
                                   PowerOf2Ceil(NumVAddrDwords));
  }
  assert(Opcode != -1);

  // Collect the address operands as individual 32-bit registers.
  SmallVector<Register, 12> Ops;
  if (Is64) {
    // Split the 64-bit node pointer into two dwords.
    auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
    Ops.push_back(Unmerge.getReg(0));
    Ops.push_back(Unmerge.getReg(1));
  } else {
    Ops.push_back(NodePtr);
  }
  Ops.push_back(RayExtent);

  // Append the first three 32-bit lanes of a 4-element vector (the fourth
  // lane is unused padding).
  auto packLanes = [&Ops, &S32, &B] (Register Src) {
    auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
    Ops.push_back(Unmerge.getReg(0));
    Ops.push_back(Unmerge.getReg(1));
    Ops.push_back(Unmerge.getReg(2));
  };

  packLanes(RayOrigin);
  if (IsA16) {
    // Pack the six 16-bit dir/inv-dir components into three dwords:
    // (dir.x,dir.y), (dir.z,inv.x), (inv.y,inv.z).
    auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
    auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
    Register R1 = MRI.createGenericVirtualRegister(S32);
    Register R2 = MRI.createGenericVirtualRegister(S32);
    Register R3 = MRI.createGenericVirtualRegister(S32);
    B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
    B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
    B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
    Ops.push_back(R1);
    Ops.push_back(R2);
    Ops.push_back(R3);
  } else {
    packLanes(RayDir);
    packLanes(RayInvDir);
  }

  if (!UseNSA) {
    // Build a single vector containing all the operands so far prepared.
    LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
    Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);
    Ops.clear();
    Ops.push_back(MergedOps);
  }

  // Emit the pseudo: result, opcode immediate, address operands, descriptor,
  // a16 flag, and the original memory references.
  auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
    .addDef(DstReg)
    .addImm(Opcode);

  for (Register R : Ops) {
    MIB.addUse(R);
  }

  MIB.addUse(TDescr)
     .addImm(IsA16 ? 1 : 0)
     .cloneMemRefs(MI);

  MI.eraseFromParent();
  return true;
}
4920e8d8bef9SDimitry Andric 
49215ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
49225ffd83dbSDimitry Andric                                             MachineInstr &MI) const {
49235ffd83dbSDimitry Andric   MachineIRBuilder &B = Helper.MIRBuilder;
49245ffd83dbSDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
49255ffd83dbSDimitry Andric 
49260b57cec5SDimitry Andric   // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
4927480093f4SDimitry Andric   auto IntrID = MI.getIntrinsicID();
4928480093f4SDimitry Andric   switch (IntrID) {
4929480093f4SDimitry Andric   case Intrinsic::amdgcn_if:
4930480093f4SDimitry Andric   case Intrinsic::amdgcn_else: {
4931480093f4SDimitry Andric     MachineInstr *Br = nullptr;
49325ffd83dbSDimitry Andric     MachineBasicBlock *UncondBrTarget = nullptr;
4933e8d8bef9SDimitry Andric     bool Negated = false;
4934e8d8bef9SDimitry Andric     if (MachineInstr *BrCond =
4935e8d8bef9SDimitry Andric             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
49360b57cec5SDimitry Andric       const SIRegisterInfo *TRI
49370b57cec5SDimitry Andric         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
49380b57cec5SDimitry Andric 
49390b57cec5SDimitry Andric       Register Def = MI.getOperand(1).getReg();
49400b57cec5SDimitry Andric       Register Use = MI.getOperand(3).getReg();
4941480093f4SDimitry Andric 
49425ffd83dbSDimitry Andric       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4943e8d8bef9SDimitry Andric 
4944e8d8bef9SDimitry Andric       if (Negated)
4945e8d8bef9SDimitry Andric         std::swap(CondBrTarget, UncondBrTarget);
4946e8d8bef9SDimitry Andric 
49475ffd83dbSDimitry Andric       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4948480093f4SDimitry Andric       if (IntrID == Intrinsic::amdgcn_if) {
49490b57cec5SDimitry Andric         B.buildInstr(AMDGPU::SI_IF)
49500b57cec5SDimitry Andric           .addDef(Def)
49510b57cec5SDimitry Andric           .addUse(Use)
49525ffd83dbSDimitry Andric           .addMBB(UncondBrTarget);
4953480093f4SDimitry Andric       } else {
4954480093f4SDimitry Andric         B.buildInstr(AMDGPU::SI_ELSE)
4955480093f4SDimitry Andric             .addDef(Def)
4956480093f4SDimitry Andric             .addUse(Use)
4957e8d8bef9SDimitry Andric             .addMBB(UncondBrTarget);
4958480093f4SDimitry Andric       }
4959480093f4SDimitry Andric 
49605ffd83dbSDimitry Andric       if (Br) {
49615ffd83dbSDimitry Andric         Br->getOperand(0).setMBB(CondBrTarget);
49625ffd83dbSDimitry Andric       } else {
49635ffd83dbSDimitry Andric         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
49645ffd83dbSDimitry Andric         // since we're swapping branch targets it needs to be reinserted.
49655ffd83dbSDimitry Andric         // FIXME: IRTranslator should probably not do this
49665ffd83dbSDimitry Andric         B.buildBr(*CondBrTarget);
49675ffd83dbSDimitry Andric       }
49680b57cec5SDimitry Andric 
49690b57cec5SDimitry Andric       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
49700b57cec5SDimitry Andric       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
49710b57cec5SDimitry Andric       MI.eraseFromParent();
49720b57cec5SDimitry Andric       BrCond->eraseFromParent();
49730b57cec5SDimitry Andric       return true;
49740b57cec5SDimitry Andric     }
49750b57cec5SDimitry Andric 
49760b57cec5SDimitry Andric     return false;
49770b57cec5SDimitry Andric   }
49780b57cec5SDimitry Andric   case Intrinsic::amdgcn_loop: {
4979480093f4SDimitry Andric     MachineInstr *Br = nullptr;
49805ffd83dbSDimitry Andric     MachineBasicBlock *UncondBrTarget = nullptr;
4981e8d8bef9SDimitry Andric     bool Negated = false;
4982e8d8bef9SDimitry Andric     if (MachineInstr *BrCond =
4983e8d8bef9SDimitry Andric             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
49840b57cec5SDimitry Andric       const SIRegisterInfo *TRI
49850b57cec5SDimitry Andric         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
49860b57cec5SDimitry Andric 
49875ffd83dbSDimitry Andric       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
49880b57cec5SDimitry Andric       Register Reg = MI.getOperand(2).getReg();
49895ffd83dbSDimitry Andric 
4990e8d8bef9SDimitry Andric       if (Negated)
4991e8d8bef9SDimitry Andric         std::swap(CondBrTarget, UncondBrTarget);
4992e8d8bef9SDimitry Andric 
49935ffd83dbSDimitry Andric       B.setInsertPt(B.getMBB(), BrCond->getIterator());
49940b57cec5SDimitry Andric       B.buildInstr(AMDGPU::SI_LOOP)
49950b57cec5SDimitry Andric         .addUse(Reg)
49965ffd83dbSDimitry Andric         .addMBB(UncondBrTarget);
49975ffd83dbSDimitry Andric 
49985ffd83dbSDimitry Andric       if (Br)
49995ffd83dbSDimitry Andric         Br->getOperand(0).setMBB(CondBrTarget);
50005ffd83dbSDimitry Andric       else
50015ffd83dbSDimitry Andric         B.buildBr(*CondBrTarget);
50025ffd83dbSDimitry Andric 
50030b57cec5SDimitry Andric       MI.eraseFromParent();
50040b57cec5SDimitry Andric       BrCond->eraseFromParent();
50050b57cec5SDimitry Andric       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
50060b57cec5SDimitry Andric       return true;
50070b57cec5SDimitry Andric     }
50080b57cec5SDimitry Andric 
50090b57cec5SDimitry Andric     return false;
50100b57cec5SDimitry Andric   }
50110b57cec5SDimitry Andric   case Intrinsic::amdgcn_kernarg_segment_ptr:
50125ffd83dbSDimitry Andric     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
50135ffd83dbSDimitry Andric       // This only makes sense to call in a kernel, so just lower to null.
50145ffd83dbSDimitry Andric       B.buildConstant(MI.getOperand(0).getReg(), 0);
50155ffd83dbSDimitry Andric       MI.eraseFromParent();
50165ffd83dbSDimitry Andric       return true;
50175ffd83dbSDimitry Andric     }
50185ffd83dbSDimitry Andric 
50190b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(
50200b57cec5SDimitry Andric       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
50210b57cec5SDimitry Andric   case Intrinsic::amdgcn_implicitarg_ptr:
50220b57cec5SDimitry Andric     return legalizeImplicitArgPtr(MI, MRI, B);
50230b57cec5SDimitry Andric   case Intrinsic::amdgcn_workitem_id_x:
50240b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
50250b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
50260b57cec5SDimitry Andric   case Intrinsic::amdgcn_workitem_id_y:
50270b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
50280b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
50290b57cec5SDimitry Andric   case Intrinsic::amdgcn_workitem_id_z:
50300b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
50310b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
50320b57cec5SDimitry Andric   case Intrinsic::amdgcn_workgroup_id_x:
50330b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
50340b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
50350b57cec5SDimitry Andric   case Intrinsic::amdgcn_workgroup_id_y:
50360b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
50370b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
50380b57cec5SDimitry Andric   case Intrinsic::amdgcn_workgroup_id_z:
50390b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
50400b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
50410b57cec5SDimitry Andric   case Intrinsic::amdgcn_dispatch_ptr:
50420b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
50430b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
50440b57cec5SDimitry Andric   case Intrinsic::amdgcn_queue_ptr:
50450b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
50460b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
50470b57cec5SDimitry Andric   case Intrinsic::amdgcn_implicit_buffer_ptr:
50480b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(
50490b57cec5SDimitry Andric       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
50500b57cec5SDimitry Andric   case Intrinsic::amdgcn_dispatch_id:
50510b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
50520b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
50538bcb0991SDimitry Andric   case Intrinsic::amdgcn_fdiv_fast:
50548bcb0991SDimitry Andric     return legalizeFDIVFastIntrin(MI, MRI, B);
50558bcb0991SDimitry Andric   case Intrinsic::amdgcn_is_shared:
50568bcb0991SDimitry Andric     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
50578bcb0991SDimitry Andric   case Intrinsic::amdgcn_is_private:
50588bcb0991SDimitry Andric     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
50598bcb0991SDimitry Andric   case Intrinsic::amdgcn_wavefrontsize: {
50608bcb0991SDimitry Andric     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
50618bcb0991SDimitry Andric     MI.eraseFromParent();
50628bcb0991SDimitry Andric     return true;
50638bcb0991SDimitry Andric   }
50645ffd83dbSDimitry Andric   case Intrinsic::amdgcn_s_buffer_load:
5065e8d8bef9SDimitry Andric     return legalizeSBufferLoad(Helper, MI);
50668bcb0991SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_store:
50675ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_store:
50685ffd83dbSDimitry Andric     return legalizeBufferStore(MI, MRI, B, false, false);
50698bcb0991SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_store_format:
50705ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_store_format:
50715ffd83dbSDimitry Andric     return legalizeBufferStore(MI, MRI, B, false, true);
50725ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_tbuffer_store:
50735ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_tbuffer_store:
50745ffd83dbSDimitry Andric     return legalizeBufferStore(MI, MRI, B, true, true);
50755ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_load:
50765ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_load:
50775ffd83dbSDimitry Andric     return legalizeBufferLoad(MI, MRI, B, false, false);
50785ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_load_format:
50795ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_load_format:
50805ffd83dbSDimitry Andric     return legalizeBufferLoad(MI, MRI, B, true, false);
50815ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_tbuffer_load:
50825ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_tbuffer_load:
50835ffd83dbSDimitry Andric     return legalizeBufferLoad(MI, MRI, B, true, true);
50845ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
50855ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
50865ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_add:
50875ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_add:
50885ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
50895ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
50905ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
50915ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
50925ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
50935ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
50945ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
50955ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
50965ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
50975ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
50985ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_and:
50995ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_and:
51005ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_or:
51015ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_or:
51025ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
51035ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
51045ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
51055ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
51065ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
51075ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
5108e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
5109e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
51105ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
51115ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
5112fe6060f1SDimitry Andric   case Intrinsic::amdgcn_buffer_atomic_fadd:
5113fe6060f1SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
5114fe6060f1SDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
5115fe6060f1SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
5116fe6060f1SDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
51175ffd83dbSDimitry Andric     return legalizeBufferAtomic(MI, B, IntrID);
51185ffd83dbSDimitry Andric   case Intrinsic::amdgcn_atomic_inc:
51195ffd83dbSDimitry Andric     return legalizeAtomicIncDec(MI, B, true);
51205ffd83dbSDimitry Andric   case Intrinsic::amdgcn_atomic_dec:
51215ffd83dbSDimitry Andric     return legalizeAtomicIncDec(MI, B, false);
51225ffd83dbSDimitry Andric   case Intrinsic::trap:
51235ffd83dbSDimitry Andric     return legalizeTrapIntrinsic(MI, MRI, B);
51245ffd83dbSDimitry Andric   case Intrinsic::debugtrap:
51255ffd83dbSDimitry Andric     return legalizeDebugTrapIntrinsic(MI, MRI, B);
5126e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_rsq_clamp:
5127e8d8bef9SDimitry Andric     return legalizeRsqClampIntrinsic(MI, MRI, B);
5128e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_ds_fadd:
5129e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_ds_fmin:
5130e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_ds_fmax:
5131e8d8bef9SDimitry Andric     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
5132e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_image_bvh_intersect_ray:
5133e8d8bef9SDimitry Andric     return legalizeBVHIntrinsic(MI, B);
51345ffd83dbSDimitry Andric   default: {
51355ffd83dbSDimitry Andric     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
51365ffd83dbSDimitry Andric             AMDGPU::getImageDimIntrinsicInfo(IntrID))
51375ffd83dbSDimitry Andric       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
51380b57cec5SDimitry Andric     return true;
51390b57cec5SDimitry Andric   }
51405ffd83dbSDimitry Andric   }
51410b57cec5SDimitry Andric 
51420b57cec5SDimitry Andric   return true;
51430b57cec5SDimitry Andric }
5144