xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (revision bdd1243df58e60e85101c09001d9812a789b6bc4)
10b57cec5SDimitry Andric //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
20b57cec5SDimitry Andric //
30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
60b57cec5SDimitry Andric //
70b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
80b57cec5SDimitry Andric /// \file
90b57cec5SDimitry Andric /// This file implements the targeting of the Machinelegalizer class for
100b57cec5SDimitry Andric /// AMDGPU.
110b57cec5SDimitry Andric /// \todo This should be generated by TableGen.
120b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
130b57cec5SDimitry Andric 
145ffd83dbSDimitry Andric #include "AMDGPULegalizerInfo.h"
158bcb0991SDimitry Andric 
160b57cec5SDimitry Andric #include "AMDGPU.h"
175ffd83dbSDimitry Andric #include "AMDGPUGlobalISelUtils.h"
18e8d8bef9SDimitry Andric #include "AMDGPUInstrInfo.h"
190b57cec5SDimitry Andric #include "AMDGPUTargetMachine.h"
200b57cec5SDimitry Andric #include "SIMachineFunctionInfo.h"
21fe6060f1SDimitry Andric #include "Utils/AMDGPUBaseInfo.h"
225ffd83dbSDimitry Andric #include "llvm/ADT/ScopeExit.h"
23fe6060f1SDimitry Andric #include "llvm/BinaryFormat/ELF.h"
240b57cec5SDimitry Andric #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
255ffd83dbSDimitry Andric #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26e8d8bef9SDimitry Andric #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
278bcb0991SDimitry Andric #include "llvm/IR/DiagnosticInfo.h"
28e8d8bef9SDimitry Andric #include "llvm/IR/IntrinsicsAMDGPU.h"
2981ad6265SDimitry Andric #include "llvm/IR/IntrinsicsR600.h"
300b57cec5SDimitry Andric 
310b57cec5SDimitry Andric #define DEBUG_TYPE "amdgpu-legalinfo"
320b57cec5SDimitry Andric 
330b57cec5SDimitry Andric using namespace llvm;
340b57cec5SDimitry Andric using namespace LegalizeActions;
350b57cec5SDimitry Andric using namespace LegalizeMutations;
360b57cec5SDimitry Andric using namespace LegalityPredicates;
375ffd83dbSDimitry Andric using namespace MIPatternMatch;
380b57cec5SDimitry Andric 
395ffd83dbSDimitry Andric // Hack until load/store selection patterns support any tuple of legal types.
405ffd83dbSDimitry Andric static cl::opt<bool> EnableNewLegality(
415ffd83dbSDimitry Andric   "amdgpu-global-isel-new-legality",
425ffd83dbSDimitry Andric   cl::desc("Use GlobalISel desired legality, rather than try to use"
435ffd83dbSDimitry Andric            "rules compatible with selection patterns"),
445ffd83dbSDimitry Andric   cl::init(false),
455ffd83dbSDimitry Andric   cl::ReallyHidden);
460b57cec5SDimitry Andric 
475ffd83dbSDimitry Andric static constexpr unsigned MaxRegisterSize = 1024;
485ffd83dbSDimitry Andric 
495ffd83dbSDimitry Andric // Round the number of elements to the next power of two elements
505ffd83dbSDimitry Andric static LLT getPow2VectorType(LLT Ty) {
515ffd83dbSDimitry Andric   unsigned NElts = Ty.getNumElements();
525ffd83dbSDimitry Andric   unsigned Pow2NElts = 1 <<  Log2_32_Ceil(NElts);
53fe6060f1SDimitry Andric   return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
540b57cec5SDimitry Andric }
550b57cec5SDimitry Andric 
565ffd83dbSDimitry Andric // Round the number of bits to the next power of two bits
575ffd83dbSDimitry Andric static LLT getPow2ScalarType(LLT Ty) {
585ffd83dbSDimitry Andric   unsigned Bits = Ty.getSizeInBits();
595ffd83dbSDimitry Andric   unsigned Pow2Bits = 1 <<  Log2_32_Ceil(Bits);
605ffd83dbSDimitry Andric   return LLT::scalar(Pow2Bits);
618bcb0991SDimitry Andric }
628bcb0991SDimitry Andric 
63349cc55cSDimitry Andric /// \returns true if this is an odd sized vector which should widen by adding an
64e8d8bef9SDimitry Andric /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
65e8d8bef9SDimitry Andric /// excludes s1 vectors, which should always be scalarized.
660b57cec5SDimitry Andric static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
670b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
680b57cec5SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
69e8d8bef9SDimitry Andric     if (!Ty.isVector())
70e8d8bef9SDimitry Andric       return false;
71e8d8bef9SDimitry Andric 
72e8d8bef9SDimitry Andric     const LLT EltTy = Ty.getElementType();
73e8d8bef9SDimitry Andric     const unsigned EltSize = EltTy.getSizeInBits();
74e8d8bef9SDimitry Andric     return Ty.getNumElements() % 2 != 0 &&
75e8d8bef9SDimitry Andric            EltSize > 1 && EltSize < 32 &&
768bcb0991SDimitry Andric            Ty.getSizeInBits() % 32 != 0;
778bcb0991SDimitry Andric   };
788bcb0991SDimitry Andric }
798bcb0991SDimitry Andric 
80e8d8bef9SDimitry Andric static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
81e8d8bef9SDimitry Andric   return [=](const LegalityQuery &Query) {
82e8d8bef9SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
83e8d8bef9SDimitry Andric     return Ty.getSizeInBits() % 32 == 0;
84e8d8bef9SDimitry Andric   };
85e8d8bef9SDimitry Andric }
86e8d8bef9SDimitry Andric 
878bcb0991SDimitry Andric static LegalityPredicate isWideVec16(unsigned TypeIdx) {
888bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
898bcb0991SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
908bcb0991SDimitry Andric     const LLT EltTy = Ty.getScalarType();
918bcb0991SDimitry Andric     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
920b57cec5SDimitry Andric   };
930b57cec5SDimitry Andric }
940b57cec5SDimitry Andric 
950b57cec5SDimitry Andric static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
960b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
970b57cec5SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
980b57cec5SDimitry Andric     const LLT EltTy = Ty.getElementType();
99*bdd1243dSDimitry Andric     return std::pair(TypeIdx,
100fe6060f1SDimitry Andric                      LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
1010b57cec5SDimitry Andric   };
1020b57cec5SDimitry Andric }
1030b57cec5SDimitry Andric 
1040b57cec5SDimitry Andric static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
1050b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
1060b57cec5SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
1070b57cec5SDimitry Andric     const LLT EltTy = Ty.getElementType();
1080b57cec5SDimitry Andric     unsigned Size = Ty.getSizeInBits();
1090b57cec5SDimitry Andric     unsigned Pieces = (Size + 63) / 64;
1100b57cec5SDimitry Andric     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
111*bdd1243dSDimitry Andric     return std::pair(TypeIdx, LLT::scalarOrVector(
112*bdd1243dSDimitry Andric                                   ElementCount::getFixed(NewNumElts), EltTy));
1130b57cec5SDimitry Andric   };
1140b57cec5SDimitry Andric }
1150b57cec5SDimitry Andric 
1168bcb0991SDimitry Andric // Increase the number of vector elements to reach the next multiple of 32-bit
1178bcb0991SDimitry Andric // type.
1188bcb0991SDimitry Andric static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
1198bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
1208bcb0991SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
1218bcb0991SDimitry Andric 
1228bcb0991SDimitry Andric     const LLT EltTy = Ty.getElementType();
1238bcb0991SDimitry Andric     const int Size = Ty.getSizeInBits();
1248bcb0991SDimitry Andric     const int EltSize = EltTy.getSizeInBits();
1258bcb0991SDimitry Andric     const int NextMul32 = (Size + 31) / 32;
1268bcb0991SDimitry Andric 
1278bcb0991SDimitry Andric     assert(EltSize < 32);
1288bcb0991SDimitry Andric 
1298bcb0991SDimitry Andric     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
130*bdd1243dSDimitry Andric     return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
1318bcb0991SDimitry Andric   };
1328bcb0991SDimitry Andric }
1338bcb0991SDimitry Andric 
134e8d8bef9SDimitry Andric static LLT getBitcastRegisterType(const LLT Ty) {
135e8d8bef9SDimitry Andric   const unsigned Size = Ty.getSizeInBits();
1365ffd83dbSDimitry Andric 
1375ffd83dbSDimitry Andric   if (Size <= 32) {
1385ffd83dbSDimitry Andric     // <2 x s8> -> s16
1395ffd83dbSDimitry Andric     // <4 x s8> -> s32
140e8d8bef9SDimitry Andric     return LLT::scalar(Size);
141e8d8bef9SDimitry Andric   }
1425ffd83dbSDimitry Andric 
143fe6060f1SDimitry Andric   return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
144e8d8bef9SDimitry Andric }
145e8d8bef9SDimitry Andric 
146e8d8bef9SDimitry Andric static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
147e8d8bef9SDimitry Andric   return [=](const LegalityQuery &Query) {
148e8d8bef9SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
149*bdd1243dSDimitry Andric     return std::pair(TypeIdx, getBitcastRegisterType(Ty));
150e8d8bef9SDimitry Andric   };
151e8d8bef9SDimitry Andric }
152e8d8bef9SDimitry Andric 
153e8d8bef9SDimitry Andric static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
154e8d8bef9SDimitry Andric   return [=](const LegalityQuery &Query) {
155e8d8bef9SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
156e8d8bef9SDimitry Andric     unsigned Size = Ty.getSizeInBits();
157e8d8bef9SDimitry Andric     assert(Size % 32 == 0);
158*bdd1243dSDimitry Andric     return std::pair(
159fe6060f1SDimitry Andric         TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
1605ffd83dbSDimitry Andric   };
1615ffd83dbSDimitry Andric }
1625ffd83dbSDimitry Andric 
1638bcb0991SDimitry Andric static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
1648bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
1658bcb0991SDimitry Andric     const LLT QueryTy = Query.Types[TypeIdx];
1668bcb0991SDimitry Andric     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
1678bcb0991SDimitry Andric   };
1688bcb0991SDimitry Andric }
1698bcb0991SDimitry Andric 
1700b57cec5SDimitry Andric static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
1710b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
1720b57cec5SDimitry Andric     const LLT QueryTy = Query.Types[TypeIdx];
1730b57cec5SDimitry Andric     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
1740b57cec5SDimitry Andric   };
1750b57cec5SDimitry Andric }
1760b57cec5SDimitry Andric 
1770b57cec5SDimitry Andric static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
1780b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
1790b57cec5SDimitry Andric     const LLT QueryTy = Query.Types[TypeIdx];
1800b57cec5SDimitry Andric     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
1810b57cec5SDimitry Andric   };
1820b57cec5SDimitry Andric }
1830b57cec5SDimitry Andric 
1845ffd83dbSDimitry Andric static bool isRegisterSize(unsigned Size) {
1855ffd83dbSDimitry Andric   return Size % 32 == 0 && Size <= MaxRegisterSize;
1865ffd83dbSDimitry Andric }
1875ffd83dbSDimitry Andric 
1885ffd83dbSDimitry Andric static bool isRegisterVectorElementType(LLT EltTy) {
1895ffd83dbSDimitry Andric   const int EltSize = EltTy.getSizeInBits();
1905ffd83dbSDimitry Andric   return EltSize == 16 || EltSize % 32 == 0;
1915ffd83dbSDimitry Andric }
1925ffd83dbSDimitry Andric 
1935ffd83dbSDimitry Andric static bool isRegisterVectorType(LLT Ty) {
1940b57cec5SDimitry Andric   const int EltSize = Ty.getElementType().getSizeInBits();
1950b57cec5SDimitry Andric   return EltSize == 32 || EltSize == 64 ||
1960b57cec5SDimitry Andric          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
1970b57cec5SDimitry Andric          EltSize == 128 || EltSize == 256;
1980b57cec5SDimitry Andric }
1990b57cec5SDimitry Andric 
2005ffd83dbSDimitry Andric static bool isRegisterType(LLT Ty) {
2015ffd83dbSDimitry Andric   if (!isRegisterSize(Ty.getSizeInBits()))
2025ffd83dbSDimitry Andric     return false;
2035ffd83dbSDimitry Andric 
2045ffd83dbSDimitry Andric   if (Ty.isVector())
2055ffd83dbSDimitry Andric     return isRegisterVectorType(Ty);
2065ffd83dbSDimitry Andric 
2075ffd83dbSDimitry Andric   return true;
2085ffd83dbSDimitry Andric }
2095ffd83dbSDimitry Andric 
2105ffd83dbSDimitry Andric // Any combination of 32 or 64-bit elements up the maximum register size, and
2115ffd83dbSDimitry Andric // multiples of v2s16.
2125ffd83dbSDimitry Andric static LegalityPredicate isRegisterType(unsigned TypeIdx) {
2135ffd83dbSDimitry Andric   return [=](const LegalityQuery &Query) {
2145ffd83dbSDimitry Andric     return isRegisterType(Query.Types[TypeIdx]);
2158bcb0991SDimitry Andric   };
2168bcb0991SDimitry Andric }
2178bcb0991SDimitry Andric 
2185ffd83dbSDimitry Andric static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
2198bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
2205ffd83dbSDimitry Andric     const LLT QueryTy = Query.Types[TypeIdx];
2215ffd83dbSDimitry Andric     if (!QueryTy.isVector())
2225ffd83dbSDimitry Andric       return false;
2235ffd83dbSDimitry Andric     const LLT EltTy = QueryTy.getElementType();
2245ffd83dbSDimitry Andric     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
2258bcb0991SDimitry Andric   };
2268bcb0991SDimitry Andric }
2278bcb0991SDimitry Andric 
228fe6060f1SDimitry Andric // If we have a truncating store or an extending load with a data size larger
229fe6060f1SDimitry Andric // than 32-bits, we need to reduce to a 32-bit type.
230fe6060f1SDimitry Andric static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
2318bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
2328bcb0991SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
2338bcb0991SDimitry Andric     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
234fe6060f1SDimitry Andric            Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
2350b57cec5SDimitry Andric   };
2360b57cec5SDimitry Andric }
2370b57cec5SDimitry Andric 
// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
/// \returns the maximum legal memory access width (in bits) for address space
/// \p AS on subtarget \p ST; \p IsLoad distinguishes loads from stores where
/// the limits differ (global/constant).
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    // With flat-scratch enabled, private accesses get the 128-bit flat limit;
    // otherwise they are split down to 32-bit pieces.
    return ST.enableFlatScratch() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    // 128-bit LDS accesses only when the subtarget enables DS128.
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable for
    // global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written in a
    // kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}
2655ffd83dbSDimitry Andric 
/// \returns true if the register type (Types[0]), memory size, address space
/// (Types[1]) and alignment described by \p Query form a directly legal
/// load/store size combination on \p ST. Shared by G_LOAD, G_ZEXTLOAD,
/// G_SEXTLOAD and G_STORE legality rules.
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
  uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // Do not handle extending vector loads.
  if (Ty.isVector() && MemSize != RegSize)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough, but
  // we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  // Reject accesses wider than this address space supports.
  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  // Whitelist of directly supported access widths.
  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    // dwordx3 accesses only exist on some subtargets.
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  // Under-aligned accesses are only legal if the target allows misaligned
  // access of this size in this address space.
  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}
3315ffd83dbSDimitry Andric 
3325ffd83dbSDimitry Andric // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
3335ffd83dbSDimitry Andric // workaround this. Eventually it should ignore the type for loads and only care
3345ffd83dbSDimitry Andric // about the size. Return true in cases where we will workaround this for now by
3355ffd83dbSDimitry Andric // bitcasting.
3365ffd83dbSDimitry Andric static bool loadStoreBitcastWorkaround(const LLT Ty) {
3375ffd83dbSDimitry Andric   if (EnableNewLegality)
3385ffd83dbSDimitry Andric     return false;
3395ffd83dbSDimitry Andric 
3405ffd83dbSDimitry Andric   const unsigned Size = Ty.getSizeInBits();
3415ffd83dbSDimitry Andric   if (Size <= 64)
3425ffd83dbSDimitry Andric     return false;
3435ffd83dbSDimitry Andric   if (!Ty.isVector())
3445ffd83dbSDimitry Andric     return true;
345e8d8bef9SDimitry Andric 
346e8d8bef9SDimitry Andric   LLT EltTy = Ty.getElementType();
347e8d8bef9SDimitry Andric   if (EltTy.isPointer())
348e8d8bef9SDimitry Andric     return true;
349e8d8bef9SDimitry Andric 
350e8d8bef9SDimitry Andric   unsigned EltSize = EltTy.getSizeInBits();
3515ffd83dbSDimitry Andric   return EltSize != 32 && EltSize != 64;
3525ffd83dbSDimitry Andric }
3535ffd83dbSDimitry Andric 
354fe6060f1SDimitry Andric static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
3555ffd83dbSDimitry Andric   const LLT Ty = Query.Types[0];
356fe6060f1SDimitry Andric   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
3575ffd83dbSDimitry Andric          !loadStoreBitcastWorkaround(Ty);
3585ffd83dbSDimitry Andric }
3595ffd83dbSDimitry Andric 
/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const LLT MemTy) {
  const unsigned MemSizeInBits = MemTy.getSizeInBits();
  const unsigned Size = Ty.getSizeInBits();
  // Extending/truncating access: only bitcast small (<= 32-bit) vectors, e.g.
  // <2 x s8> / <4 x s8>.
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  // Types hit by the selector workaround are bitcast if they are otherwise
  // register-representable.
  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;

  // Don't try to handle bitcasting vector ext loads for now.
  return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
         (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}
377e8d8bef9SDimitry Andric 
/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this case when the memory access itself
/// changes, not the size of the result register.
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
                            uint64_t AlignInBits, unsigned AddrSpace,
                            unsigned Opcode) {
  unsigned SizeInBits = MemoryTy.getSizeInBits();
  // We don't want to widen cases that are naturally legal.
  if (isPowerOf2_32(SizeInBits))
    return false;

  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
  // end up widening these for a scalar load during RegBankSelect, since there
  // aren't 96-bit scalar loads.
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
    return false;

  // NOTE(review): Opcode (an unsigned opcode value) is passed where
  // maxSizeForAddrSpace() takes a bool IsLoad, so any nonzero opcode selects
  // the load limit — confirm this is intended.
  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
    return false;

  // A load is known dereferenceable up to the alignment, so it's legal to widen
  // to it.
  //
  // TODO: Could check dereferenceable for less aligned cases.
  unsigned RoundedSize = NextPowerOf2(SizeInBits);
  if (AlignInBits < RoundedSize)
    return false;

  // Do not widen if it would introduce a slow unaligned load.
  const SITargetLowering *TLI = ST.getTargetLowering();
  unsigned Fast = 0;
  return TLI->allowsMisalignedMemoryAccessesImpl(
             RoundedSize, AddrSpace, Align(AlignInBits / 8),
             MachineMemOperand::MOLoad, &Fast) &&
         Fast;
}
414e8d8bef9SDimitry Andric 
415e8d8bef9SDimitry Andric static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
416e8d8bef9SDimitry Andric                             unsigned Opcode) {
417e8d8bef9SDimitry Andric   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
418e8d8bef9SDimitry Andric     return false;
419e8d8bef9SDimitry Andric 
420fe6060f1SDimitry Andric   return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
421e8d8bef9SDimitry Andric                          Query.MMODescrs[0].AlignInBits,
422e8d8bef9SDimitry Andric                          Query.Types[1].getAddressSpace(), Opcode);
423e8d8bef9SDimitry Andric }
424e8d8bef9SDimitry Andric 
4250b57cec5SDimitry Andric AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
4260b57cec5SDimitry Andric                                          const GCNTargetMachine &TM)
4270b57cec5SDimitry Andric   :  ST(ST_) {
4280b57cec5SDimitry Andric   using namespace TargetOpcode;
4290b57cec5SDimitry Andric 
4300b57cec5SDimitry Andric   auto GetAddrSpacePtr = [&TM](unsigned AS) {
4310b57cec5SDimitry Andric     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
4320b57cec5SDimitry Andric   };
4330b57cec5SDimitry Andric 
4340b57cec5SDimitry Andric   const LLT S1 = LLT::scalar(1);
435e8d8bef9SDimitry Andric   const LLT S8 = LLT::scalar(8);
4360b57cec5SDimitry Andric   const LLT S16 = LLT::scalar(16);
4370b57cec5SDimitry Andric   const LLT S32 = LLT::scalar(32);
4380b57cec5SDimitry Andric   const LLT S64 = LLT::scalar(64);
4390b57cec5SDimitry Andric   const LLT S128 = LLT::scalar(128);
4400b57cec5SDimitry Andric   const LLT S256 = LLT::scalar(256);
4415ffd83dbSDimitry Andric   const LLT S512 = LLT::scalar(512);
4425ffd83dbSDimitry Andric   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
4430b57cec5SDimitry Andric 
444fe6060f1SDimitry Andric   const LLT V2S8 = LLT::fixed_vector(2, 8);
445fe6060f1SDimitry Andric   const LLT V2S16 = LLT::fixed_vector(2, 16);
446fe6060f1SDimitry Andric   const LLT V4S16 = LLT::fixed_vector(4, 16);
4470b57cec5SDimitry Andric 
448fe6060f1SDimitry Andric   const LLT V2S32 = LLT::fixed_vector(2, 32);
449fe6060f1SDimitry Andric   const LLT V3S32 = LLT::fixed_vector(3, 32);
450fe6060f1SDimitry Andric   const LLT V4S32 = LLT::fixed_vector(4, 32);
451fe6060f1SDimitry Andric   const LLT V5S32 = LLT::fixed_vector(5, 32);
452fe6060f1SDimitry Andric   const LLT V6S32 = LLT::fixed_vector(6, 32);
453fe6060f1SDimitry Andric   const LLT V7S32 = LLT::fixed_vector(7, 32);
454fe6060f1SDimitry Andric   const LLT V8S32 = LLT::fixed_vector(8, 32);
455fe6060f1SDimitry Andric   const LLT V9S32 = LLT::fixed_vector(9, 32);
456fe6060f1SDimitry Andric   const LLT V10S32 = LLT::fixed_vector(10, 32);
457fe6060f1SDimitry Andric   const LLT V11S32 = LLT::fixed_vector(11, 32);
458fe6060f1SDimitry Andric   const LLT V12S32 = LLT::fixed_vector(12, 32);
459fe6060f1SDimitry Andric   const LLT V13S32 = LLT::fixed_vector(13, 32);
460fe6060f1SDimitry Andric   const LLT V14S32 = LLT::fixed_vector(14, 32);
461fe6060f1SDimitry Andric   const LLT V15S32 = LLT::fixed_vector(15, 32);
462fe6060f1SDimitry Andric   const LLT V16S32 = LLT::fixed_vector(16, 32);
463fe6060f1SDimitry Andric   const LLT V32S32 = LLT::fixed_vector(32, 32);
4640b57cec5SDimitry Andric 
465fe6060f1SDimitry Andric   const LLT V2S64 = LLT::fixed_vector(2, 64);
466fe6060f1SDimitry Andric   const LLT V3S64 = LLT::fixed_vector(3, 64);
467fe6060f1SDimitry Andric   const LLT V4S64 = LLT::fixed_vector(4, 64);
468fe6060f1SDimitry Andric   const LLT V5S64 = LLT::fixed_vector(5, 64);
469fe6060f1SDimitry Andric   const LLT V6S64 = LLT::fixed_vector(6, 64);
470fe6060f1SDimitry Andric   const LLT V7S64 = LLT::fixed_vector(7, 64);
471fe6060f1SDimitry Andric   const LLT V8S64 = LLT::fixed_vector(8, 64);
472fe6060f1SDimitry Andric   const LLT V16S64 = LLT::fixed_vector(16, 64);
4730b57cec5SDimitry Andric 
4740b57cec5SDimitry Andric   std::initializer_list<LLT> AllS32Vectors =
4750b57cec5SDimitry Andric     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
4768bcb0991SDimitry Andric      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
4770b57cec5SDimitry Andric   std::initializer_list<LLT> AllS64Vectors =
4788bcb0991SDimitry Andric     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
4790b57cec5SDimitry Andric 
4800b57cec5SDimitry Andric   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
4810b57cec5SDimitry Andric   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
4828bcb0991SDimitry Andric   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
4830b57cec5SDimitry Andric   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
4848bcb0991SDimitry Andric   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
4850b57cec5SDimitry Andric   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
4860b57cec5SDimitry Andric   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
4870b57cec5SDimitry Andric 
4880b57cec5SDimitry Andric   const LLT CodePtr = FlatPtr;
4890b57cec5SDimitry Andric 
4900b57cec5SDimitry Andric   const std::initializer_list<LLT> AddrSpaces64 = {
4910b57cec5SDimitry Andric     GlobalPtr, ConstantPtr, FlatPtr
4920b57cec5SDimitry Andric   };
4930b57cec5SDimitry Andric 
4940b57cec5SDimitry Andric   const std::initializer_list<LLT> AddrSpaces32 = {
4958bcb0991SDimitry Andric     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
4960b57cec5SDimitry Andric   };
4970b57cec5SDimitry Andric 
4980b57cec5SDimitry Andric   const std::initializer_list<LLT> FPTypesBase = {
4990b57cec5SDimitry Andric     S32, S64
5000b57cec5SDimitry Andric   };
5010b57cec5SDimitry Andric 
5020b57cec5SDimitry Andric   const std::initializer_list<LLT> FPTypes16 = {
5030b57cec5SDimitry Andric     S32, S64, S16
5040b57cec5SDimitry Andric   };
5050b57cec5SDimitry Andric 
5060b57cec5SDimitry Andric   const std::initializer_list<LLT> FPTypesPK16 = {
5070b57cec5SDimitry Andric     S32, S64, S16, V2S16
5080b57cec5SDimitry Andric   };
5090b57cec5SDimitry Andric 
5105ffd83dbSDimitry Andric   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
5115ffd83dbSDimitry Andric 
512fe6060f1SDimitry Andric   // s1 for VCC branches, s32 for SCC branches.
513fe6060f1SDimitry Andric   getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
5140b57cec5SDimitry Andric 
5150b57cec5SDimitry Andric   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
5160b57cec5SDimitry Andric   // elements for v3s16
5170b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_PHI)
518e8d8bef9SDimitry Andric     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
5190b57cec5SDimitry Andric     .legalFor(AllS32Vectors)
5200b57cec5SDimitry Andric     .legalFor(AllS64Vectors)
5210b57cec5SDimitry Andric     .legalFor(AddrSpaces64)
5220b57cec5SDimitry Andric     .legalFor(AddrSpaces32)
523e8d8bef9SDimitry Andric     .legalIf(isPointer(0))
524e8d8bef9SDimitry Andric     .clampScalar(0, S16, S256)
5250b57cec5SDimitry Andric     .widenScalarToNextPow2(0, 32)
5260b57cec5SDimitry Andric     .clampMaxNumElements(0, S32, 16)
5270b57cec5SDimitry Andric     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
528e8d8bef9SDimitry Andric     .scalarize(0);
5290b57cec5SDimitry Andric 
530e8d8bef9SDimitry Andric   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
531e8d8bef9SDimitry Andric     // Full set of gfx9 features.
53281ad6265SDimitry Andric     getActionDefinitionsBuilder({G_ADD, G_SUB})
5335ffd83dbSDimitry Andric       .legalFor({S32, S16, V2S16})
5340eae32dcSDimitry Andric       .clampMaxNumElementsStrict(0, S16, 2)
53581ad6265SDimitry Andric       .scalarize(0)
53681ad6265SDimitry Andric       .minScalar(0, S16)
537349cc55cSDimitry Andric       .widenScalarToNextMultipleOf(0, 32)
53881ad6265SDimitry Andric       .maxScalar(0, S32);
53981ad6265SDimitry Andric 
54081ad6265SDimitry Andric     getActionDefinitionsBuilder(G_MUL)
54181ad6265SDimitry Andric       .legalFor({S32, S16, V2S16})
54281ad6265SDimitry Andric       .clampMaxNumElementsStrict(0, S16, 2)
54381ad6265SDimitry Andric       .scalarize(0)
54481ad6265SDimitry Andric       .minScalar(0, S16)
54581ad6265SDimitry Andric       .widenScalarToNextMultipleOf(0, 32)
54681ad6265SDimitry Andric       .custom();
54781ad6265SDimitry Andric     assert(ST.hasMad64_32());
548e8d8bef9SDimitry Andric 
549e8d8bef9SDimitry Andric     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
550e8d8bef9SDimitry Andric       .legalFor({S32, S16, V2S16}) // Clamp modifier
551e8d8bef9SDimitry Andric       .minScalarOrElt(0, S16)
5520eae32dcSDimitry Andric       .clampMaxNumElementsStrict(0, S16, 2)
553e8d8bef9SDimitry Andric       .scalarize(0)
554e8d8bef9SDimitry Andric       .widenScalarToNextPow2(0, 32)
555e8d8bef9SDimitry Andric       .lower();
5565ffd83dbSDimitry Andric   } else if (ST.has16BitInsts()) {
55781ad6265SDimitry Andric     getActionDefinitionsBuilder({G_ADD, G_SUB})
5580b57cec5SDimitry Andric       .legalFor({S32, S16})
559349cc55cSDimitry Andric       .minScalar(0, S16)
560349cc55cSDimitry Andric       .widenScalarToNextMultipleOf(0, 32)
561349cc55cSDimitry Andric       .maxScalar(0, S32)
562349cc55cSDimitry Andric       .scalarize(0);
563e8d8bef9SDimitry Andric 
56481ad6265SDimitry Andric     getActionDefinitionsBuilder(G_MUL)
56581ad6265SDimitry Andric       .legalFor({S32, S16})
56681ad6265SDimitry Andric       .scalarize(0)
56781ad6265SDimitry Andric       .minScalar(0, S16)
56881ad6265SDimitry Andric       .widenScalarToNextMultipleOf(0, 32)
56981ad6265SDimitry Andric       .custom();
57081ad6265SDimitry Andric     assert(ST.hasMad64_32());
57181ad6265SDimitry Andric 
572e8d8bef9SDimitry Andric     // Technically the saturating operations require clamp bit support, but this
573e8d8bef9SDimitry Andric     // was introduced at the same time as 16-bit operations.
574e8d8bef9SDimitry Andric     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
575e8d8bef9SDimitry Andric       .legalFor({S32, S16}) // Clamp modifier
576e8d8bef9SDimitry Andric       .minScalar(0, S16)
577e8d8bef9SDimitry Andric       .scalarize(0)
578e8d8bef9SDimitry Andric       .widenScalarToNextPow2(0, 16)
579e8d8bef9SDimitry Andric       .lower();
580e8d8bef9SDimitry Andric 
581e8d8bef9SDimitry Andric     // We're just lowering this, but it helps get a better result to try to
582e8d8bef9SDimitry Andric     // coerce to the desired type first.
583e8d8bef9SDimitry Andric     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
584e8d8bef9SDimitry Andric       .minScalar(0, S16)
585e8d8bef9SDimitry Andric       .scalarize(0)
586e8d8bef9SDimitry Andric       .lower();
5870b57cec5SDimitry Andric   } else {
58881ad6265SDimitry Andric     getActionDefinitionsBuilder({G_ADD, G_SUB})
5890b57cec5SDimitry Andric       .legalFor({S32})
590349cc55cSDimitry Andric       .widenScalarToNextMultipleOf(0, 32)
5910b57cec5SDimitry Andric       .clampScalar(0, S32, S32)
5920b57cec5SDimitry Andric       .scalarize(0);
593e8d8bef9SDimitry Andric 
59481ad6265SDimitry Andric     auto &Mul = getActionDefinitionsBuilder(G_MUL)
59581ad6265SDimitry Andric       .legalFor({S32})
59681ad6265SDimitry Andric       .scalarize(0)
59781ad6265SDimitry Andric       .minScalar(0, S32)
59881ad6265SDimitry Andric       .widenScalarToNextMultipleOf(0, 32);
59981ad6265SDimitry Andric 
60081ad6265SDimitry Andric     if (ST.hasMad64_32())
60181ad6265SDimitry Andric       Mul.custom();
60281ad6265SDimitry Andric     else
60381ad6265SDimitry Andric       Mul.maxScalar(0, S32);
60481ad6265SDimitry Andric 
605e8d8bef9SDimitry Andric     if (ST.hasIntClamp()) {
606e8d8bef9SDimitry Andric       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
607e8d8bef9SDimitry Andric         .legalFor({S32}) // Clamp modifier.
608e8d8bef9SDimitry Andric         .scalarize(0)
609e8d8bef9SDimitry Andric         .minScalarOrElt(0, S32)
610e8d8bef9SDimitry Andric         .lower();
611e8d8bef9SDimitry Andric     } else {
612e8d8bef9SDimitry Andric       // Clamp bit support was added in VI, along with 16-bit operations.
613e8d8bef9SDimitry Andric       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
614e8d8bef9SDimitry Andric         .minScalar(0, S32)
615e8d8bef9SDimitry Andric         .scalarize(0)
616e8d8bef9SDimitry Andric         .lower();
6170b57cec5SDimitry Andric     }
6180b57cec5SDimitry Andric 
619e8d8bef9SDimitry Andric     // FIXME: DAG expansion gets better results. The widening uses the smaller
620e8d8bef9SDimitry Andric     // range values and goes for the min/max lowering directly.
621e8d8bef9SDimitry Andric     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
622e8d8bef9SDimitry Andric       .minScalar(0, S32)
623e8d8bef9SDimitry Andric       .scalarize(0)
624e8d8bef9SDimitry Andric       .lower();
625e8d8bef9SDimitry Andric   }
626e8d8bef9SDimitry Andric 
627fe6060f1SDimitry Andric   getActionDefinitionsBuilder(
628fe6060f1SDimitry Andric       {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
6295ffd83dbSDimitry Andric       .customFor({S32, S64})
630480093f4SDimitry Andric       .clampScalar(0, S32, S64)
631480093f4SDimitry Andric       .widenScalarToNextPow2(0, 32)
632480093f4SDimitry Andric       .scalarize(0);
633480093f4SDimitry Andric 
634e8d8bef9SDimitry Andric   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
6350b57cec5SDimitry Andric                    .legalFor({S32})
636349cc55cSDimitry Andric                    .maxScalar(0, S32);
637e8d8bef9SDimitry Andric 
638e8d8bef9SDimitry Andric   if (ST.hasVOP3PInsts()) {
639e8d8bef9SDimitry Andric     Mulh
640e8d8bef9SDimitry Andric       .clampMaxNumElements(0, S8, 2)
641e8d8bef9SDimitry Andric       .lowerFor({V2S8});
642e8d8bef9SDimitry Andric   }
643e8d8bef9SDimitry Andric 
644e8d8bef9SDimitry Andric   Mulh
645e8d8bef9SDimitry Andric     .scalarize(0)
646e8d8bef9SDimitry Andric     .lower();
6470b57cec5SDimitry Andric 
6480b57cec5SDimitry Andric   // Report legal for any types we can handle anywhere. For the cases only legal
6490b57cec5SDimitry Andric   // on the SALU, RegBankSelect will be able to re-legalize.
6500b57cec5SDimitry Andric   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
6510b57cec5SDimitry Andric     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
6520b57cec5SDimitry Andric     .clampScalar(0, S32, S64)
6530b57cec5SDimitry Andric     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
6548bcb0991SDimitry Andric     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
6550b57cec5SDimitry Andric     .widenScalarToNextPow2(0)
6560b57cec5SDimitry Andric     .scalarize(0);
6570b57cec5SDimitry Andric 
658*bdd1243dSDimitry Andric   getActionDefinitionsBuilder(
659*bdd1243dSDimitry Andric       {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
660480093f4SDimitry Andric       .legalFor({{S32, S1}, {S32, S32}})
661*bdd1243dSDimitry Andric       .clampScalar(0, S32, S32)
662*bdd1243dSDimitry Andric       .scalarize(0);
6630b57cec5SDimitry Andric 
6640b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_BITCAST)
6650b57cec5SDimitry Andric     // Don't worry about the size constraint.
6668bcb0991SDimitry Andric     .legalIf(all(isRegisterType(0), isRegisterType(1)))
6675ffd83dbSDimitry Andric     .lower();
6680b57cec5SDimitry Andric 
6690b57cec5SDimitry Andric 
6700b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_CONSTANT)
6718bcb0991SDimitry Andric     .legalFor({S1, S32, S64, S16, GlobalPtr,
6720b57cec5SDimitry Andric                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
673e8d8bef9SDimitry Andric     .legalIf(isPointer(0))
6740b57cec5SDimitry Andric     .clampScalar(0, S32, S64)
675e8d8bef9SDimitry Andric     .widenScalarToNextPow2(0);
6760b57cec5SDimitry Andric 
6775ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_FCONSTANT)
6785ffd83dbSDimitry Andric     .legalFor({S32, S64, S16})
6795ffd83dbSDimitry Andric     .clampScalar(0, S16, S64);
6808bcb0991SDimitry Andric 
6815ffd83dbSDimitry Andric   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
6825ffd83dbSDimitry Andric       .legalIf(isRegisterType(0))
6835ffd83dbSDimitry Andric       // s1 and s16 are special cases because they have legal operations on
6845ffd83dbSDimitry Andric       // them, but don't really occupy registers in the normal way.
6855ffd83dbSDimitry Andric       .legalFor({S1, S16})
6865ffd83dbSDimitry Andric       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
6875ffd83dbSDimitry Andric       .clampScalarOrElt(0, S32, MaxScalar)
6885ffd83dbSDimitry Andric       .widenScalarToNextPow2(0, 32)
6895ffd83dbSDimitry Andric       .clampMaxNumElements(0, S32, 16);
6905ffd83dbSDimitry Andric 
691fe6060f1SDimitry Andric   getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
6925ffd83dbSDimitry Andric 
6935ffd83dbSDimitry Andric   // If the amount is divergent, we have to do a wave reduction to get the
6945ffd83dbSDimitry Andric   // maximum value, so this is expanded during RegBankSelect.
6955ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
6965ffd83dbSDimitry Andric     .legalFor({{PrivatePtr, S32}});
6975ffd83dbSDimitry Andric 
6985ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
699e8d8bef9SDimitry Andric     .customIf(typeIsNot(0, PrivatePtr));
700e8d8bef9SDimitry Andric 
701fe6060f1SDimitry Andric   getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
7020b57cec5SDimitry Andric 
7030b57cec5SDimitry Andric   auto &FPOpActions = getActionDefinitionsBuilder(
704*bdd1243dSDimitry Andric     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
705*bdd1243dSDimitry Andric       G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
7060b57cec5SDimitry Andric     .legalFor({S32, S64});
7078bcb0991SDimitry Andric   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
7088bcb0991SDimitry Andric     .customFor({S32, S64});
7098bcb0991SDimitry Andric   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
7108bcb0991SDimitry Andric     .customFor({S32, S64});
7110b57cec5SDimitry Andric 
7120b57cec5SDimitry Andric   if (ST.has16BitInsts()) {
7130b57cec5SDimitry Andric     if (ST.hasVOP3PInsts())
7140b57cec5SDimitry Andric       FPOpActions.legalFor({S16, V2S16});
7150b57cec5SDimitry Andric     else
7160b57cec5SDimitry Andric       FPOpActions.legalFor({S16});
7178bcb0991SDimitry Andric 
7188bcb0991SDimitry Andric     TrigActions.customFor({S16});
7198bcb0991SDimitry Andric     FDIVActions.customFor({S16});
7200b57cec5SDimitry Andric   }
7210b57cec5SDimitry Andric 
7220b57cec5SDimitry Andric   auto &MinNumMaxNum = getActionDefinitionsBuilder({
7230b57cec5SDimitry Andric       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
7240b57cec5SDimitry Andric 
7250b57cec5SDimitry Andric   if (ST.hasVOP3PInsts()) {
7260b57cec5SDimitry Andric     MinNumMaxNum.customFor(FPTypesPK16)
727480093f4SDimitry Andric       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
7280b57cec5SDimitry Andric       .clampMaxNumElements(0, S16, 2)
7290b57cec5SDimitry Andric       .clampScalar(0, S16, S64)
7300b57cec5SDimitry Andric       .scalarize(0);
7310b57cec5SDimitry Andric   } else if (ST.has16BitInsts()) {
7320b57cec5SDimitry Andric     MinNumMaxNum.customFor(FPTypes16)
7330b57cec5SDimitry Andric       .clampScalar(0, S16, S64)
7340b57cec5SDimitry Andric       .scalarize(0);
7350b57cec5SDimitry Andric   } else {
7360b57cec5SDimitry Andric     MinNumMaxNum.customFor(FPTypesBase)
7370b57cec5SDimitry Andric       .clampScalar(0, S32, S64)
7380b57cec5SDimitry Andric       .scalarize(0);
7390b57cec5SDimitry Andric   }
7400b57cec5SDimitry Andric 
7410b57cec5SDimitry Andric   if (ST.hasVOP3PInsts())
7420eae32dcSDimitry Andric     FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
7438bcb0991SDimitry Andric 
7440b57cec5SDimitry Andric   FPOpActions
7450b57cec5SDimitry Andric     .scalarize(0)
7460b57cec5SDimitry Andric     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
7470b57cec5SDimitry Andric 
7488bcb0991SDimitry Andric   TrigActions
7498bcb0991SDimitry Andric     .scalarize(0)
7508bcb0991SDimitry Andric     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
7518bcb0991SDimitry Andric 
7528bcb0991SDimitry Andric   FDIVActions
7538bcb0991SDimitry Andric     .scalarize(0)
7548bcb0991SDimitry Andric     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
7558bcb0991SDimitry Andric 
7568bcb0991SDimitry Andric   getActionDefinitionsBuilder({G_FNEG, G_FABS})
7578bcb0991SDimitry Andric     .legalFor(FPTypesPK16)
7580eae32dcSDimitry Andric     .clampMaxNumElementsStrict(0, S16, 2)
7598bcb0991SDimitry Andric     .scalarize(0)
7608bcb0991SDimitry Andric     .clampScalar(0, S16, S64);
7618bcb0991SDimitry Andric 
7620b57cec5SDimitry Andric   if (ST.has16BitInsts()) {
7638bcb0991SDimitry Andric     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
7640b57cec5SDimitry Andric       .legalFor({S32, S64, S16})
7650b57cec5SDimitry Andric       .scalarize(0)
7660b57cec5SDimitry Andric       .clampScalar(0, S16, S64);
7670b57cec5SDimitry Andric   } else {
7685ffd83dbSDimitry Andric     getActionDefinitionsBuilder(G_FSQRT)
7695ffd83dbSDimitry Andric       .legalFor({S32, S64})
7705ffd83dbSDimitry Andric       .scalarize(0)
7715ffd83dbSDimitry Andric       .clampScalar(0, S32, S64);
7725ffd83dbSDimitry Andric 
7735ffd83dbSDimitry Andric     if (ST.hasFractBug()) {
7745ffd83dbSDimitry Andric       getActionDefinitionsBuilder(G_FFLOOR)
7755ffd83dbSDimitry Andric         .customFor({S64})
7765ffd83dbSDimitry Andric         .legalFor({S32, S64})
7775ffd83dbSDimitry Andric         .scalarize(0)
7785ffd83dbSDimitry Andric         .clampScalar(0, S32, S64);
7795ffd83dbSDimitry Andric     } else {
7805ffd83dbSDimitry Andric       getActionDefinitionsBuilder(G_FFLOOR)
7810b57cec5SDimitry Andric         .legalFor({S32, S64})
7820b57cec5SDimitry Andric         .scalarize(0)
7830b57cec5SDimitry Andric         .clampScalar(0, S32, S64);
7840b57cec5SDimitry Andric     }
7855ffd83dbSDimitry Andric   }
7860b57cec5SDimitry Andric 
7870b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_FPTRUNC)
7880b57cec5SDimitry Andric     .legalFor({{S32, S64}, {S16, S32}})
7895ffd83dbSDimitry Andric     .scalarize(0)
7905ffd83dbSDimitry Andric     .lower();
7910b57cec5SDimitry Andric 
7920b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_FPEXT)
7930b57cec5SDimitry Andric     .legalFor({{S64, S32}, {S32, S16}})
794e8d8bef9SDimitry Andric     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
7950b57cec5SDimitry Andric     .scalarize(0);
7960b57cec5SDimitry Andric 
797*bdd1243dSDimitry Andric   auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
79881ad6265SDimitry Andric   if (ST.has16BitInsts()) {
79981ad6265SDimitry Andric     FSubActions
80081ad6265SDimitry Andric       // Use actual fsub instruction
80181ad6265SDimitry Andric       .legalFor({S32, S16})
80281ad6265SDimitry Andric       // Must use fadd + fneg
80381ad6265SDimitry Andric       .lowerFor({S64, V2S16});
80481ad6265SDimitry Andric   } else {
80581ad6265SDimitry Andric     FSubActions
8060b57cec5SDimitry Andric       // Use actual fsub instruction
8070b57cec5SDimitry Andric       .legalFor({S32})
8080b57cec5SDimitry Andric       // Must use fadd + fneg
80981ad6265SDimitry Andric       .lowerFor({S64, S16, V2S16});
81081ad6265SDimitry Andric   }
81181ad6265SDimitry Andric 
81281ad6265SDimitry Andric   FSubActions
8130b57cec5SDimitry Andric     .scalarize(0)
8140b57cec5SDimitry Andric     .clampScalar(0, S32, S64);
8150b57cec5SDimitry Andric 
8168bcb0991SDimitry Andric   // Whether this is legal depends on the floating point mode for the function.
8178bcb0991SDimitry Andric   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
8185ffd83dbSDimitry Andric   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
8198bcb0991SDimitry Andric     FMad.customFor({S32, S16});
8205ffd83dbSDimitry Andric   else if (ST.hasMadMacF32Insts())
8218bcb0991SDimitry Andric     FMad.customFor({S32});
8225ffd83dbSDimitry Andric   else if (ST.hasMadF16())
8235ffd83dbSDimitry Andric     FMad.customFor({S16});
8248bcb0991SDimitry Andric   FMad.scalarize(0)
8258bcb0991SDimitry Andric       .lower();
8268bcb0991SDimitry Andric 
827e8d8bef9SDimitry Andric   auto &FRem = getActionDefinitionsBuilder(G_FREM);
828e8d8bef9SDimitry Andric   if (ST.has16BitInsts()) {
829e8d8bef9SDimitry Andric     FRem.customFor({S16, S32, S64});
830e8d8bef9SDimitry Andric   } else {
831e8d8bef9SDimitry Andric     FRem.minScalar(0, S32)
832e8d8bef9SDimitry Andric         .customFor({S32, S64});
833e8d8bef9SDimitry Andric   }
834e8d8bef9SDimitry Andric   FRem.scalarize(0);
835e8d8bef9SDimitry Andric 
8365ffd83dbSDimitry Andric   // TODO: Do we need to clamp maximum bitwidth?
8375ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_TRUNC)
8385ffd83dbSDimitry Andric     .legalIf(isScalar(0))
8395ffd83dbSDimitry Andric     .legalFor({{V2S16, V2S32}})
8405ffd83dbSDimitry Andric     .clampMaxNumElements(0, S16, 2)
8415ffd83dbSDimitry Andric     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
8425ffd83dbSDimitry Andric     // situations (like an invalid implicit use), we don't want to infinite loop
8435ffd83dbSDimitry Andric     // in the legalizer.
8445ffd83dbSDimitry Andric     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
8455ffd83dbSDimitry Andric     .alwaysLegal();
8465ffd83dbSDimitry Andric 
8470b57cec5SDimitry Andric   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
8480b57cec5SDimitry Andric     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
8495ffd83dbSDimitry Andric                {S32, S1}, {S64, S1}, {S16, S1}})
850480093f4SDimitry Andric     .scalarize(0)
8515ffd83dbSDimitry Andric     .clampScalar(0, S32, S64)
8525ffd83dbSDimitry Andric     .widenScalarToNextPow2(1, 32);
8530b57cec5SDimitry Andric 
8548bcb0991SDimitry Andric   // TODO: Split s1->s64 during regbankselect for VALU.
8558bcb0991SDimitry Andric   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
856480093f4SDimitry Andric                     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
857480093f4SDimitry Andric                     .lowerIf(typeIs(1, S1))
858349cc55cSDimitry Andric                     .customFor({{S32, S64}, {S64, S64}});
8598bcb0991SDimitry Andric   if (ST.has16BitInsts())
8608bcb0991SDimitry Andric     IToFP.legalFor({{S16, S16}});
8618bcb0991SDimitry Andric   IToFP.clampScalar(1, S32, S64)
862e8d8bef9SDimitry Andric        .minScalar(0, S32)
8635ffd83dbSDimitry Andric        .scalarize(0)
8645ffd83dbSDimitry Andric        .widenScalarToNextPow2(1);
8650b57cec5SDimitry Andric 
8668bcb0991SDimitry Andric   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
8675ffd83dbSDimitry Andric     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
868fe6060f1SDimitry Andric     .customFor({{S64, S32}, {S64, S64}})
869e8d8bef9SDimitry Andric     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
8708bcb0991SDimitry Andric   if (ST.has16BitInsts())
8718bcb0991SDimitry Andric     FPToI.legalFor({{S16, S16}});
8728bcb0991SDimitry Andric   else
8738bcb0991SDimitry Andric     FPToI.minScalar(1, S32);
8748bcb0991SDimitry Andric 
8758bcb0991SDimitry Andric   FPToI.minScalar(0, S32)
876fe6060f1SDimitry Andric        .widenScalarToNextPow2(0, 32)
8775ffd83dbSDimitry Andric        .scalarize(0)
8785ffd83dbSDimitry Andric        .lower();
8790b57cec5SDimitry Andric 
88081ad6265SDimitry Andric   getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
88181ad6265SDimitry Andric       .customFor({S16, S32})
88281ad6265SDimitry Andric       .scalarize(0)
88381ad6265SDimitry Andric       .lower();
88481ad6265SDimitry Andric 
885e8d8bef9SDimitry Andric   // Lower roundeven into G_FRINT
886e8d8bef9SDimitry Andric   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
887480093f4SDimitry Andric     .scalarize(0)
888480093f4SDimitry Andric     .lower();
8890b57cec5SDimitry Andric 
890480093f4SDimitry Andric   if (ST.has16BitInsts()) {
891480093f4SDimitry Andric     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
892480093f4SDimitry Andric       .legalFor({S16, S32, S64})
893480093f4SDimitry Andric       .clampScalar(0, S16, S64)
894480093f4SDimitry Andric       .scalarize(0);
895480093f4SDimitry Andric   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
8960b57cec5SDimitry Andric     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
8970b57cec5SDimitry Andric       .legalFor({S32, S64})
8980b57cec5SDimitry Andric       .clampScalar(0, S32, S64)
8990b57cec5SDimitry Andric       .scalarize(0);
9000b57cec5SDimitry Andric   } else {
9010b57cec5SDimitry Andric     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
9020b57cec5SDimitry Andric       .legalFor({S32})
9030b57cec5SDimitry Andric       .customFor({S64})
9040b57cec5SDimitry Andric       .clampScalar(0, S32, S64)
9050b57cec5SDimitry Andric       .scalarize(0);
9060b57cec5SDimitry Andric   }
9070b57cec5SDimitry Andric 
908480093f4SDimitry Andric   getActionDefinitionsBuilder(G_PTR_ADD)
909e8d8bef9SDimitry Andric     .legalIf(all(isPointer(0), sameSize(0, 1)))
910e8d8bef9SDimitry Andric     .scalarize(0)
911e8d8bef9SDimitry Andric     .scalarSameSizeAs(1, 0);
9120b57cec5SDimitry Andric 
9135ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_PTRMASK)
914e8d8bef9SDimitry Andric     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
915e8d8bef9SDimitry Andric     .scalarSameSizeAs(1, 0)
9165ffd83dbSDimitry Andric     .scalarize(0);
9170b57cec5SDimitry Andric 
9180b57cec5SDimitry Andric   auto &CmpBuilder =
9190b57cec5SDimitry Andric     getActionDefinitionsBuilder(G_ICMP)
920480093f4SDimitry Andric     // The compare output type differs based on the register bank of the output,
921480093f4SDimitry Andric     // so make both s1 and s32 legal.
922480093f4SDimitry Andric     //
923480093f4SDimitry Andric     // Scalar compares producing output in scc will be promoted to s32, as that
924480093f4SDimitry Andric     // is the allocatable register type that will be needed for the copy from
925480093f4SDimitry Andric     // scc. This will be promoted during RegBankSelect, and we assume something
926480093f4SDimitry Andric     // before that won't try to use s32 result types.
927480093f4SDimitry Andric     //
928480093f4SDimitry Andric     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
929480093f4SDimitry Andric     // bank.
9300b57cec5SDimitry Andric     .legalForCartesianProduct(
9310b57cec5SDimitry Andric       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
932480093f4SDimitry Andric     .legalForCartesianProduct(
933480093f4SDimitry Andric       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
9340b57cec5SDimitry Andric   if (ST.has16BitInsts()) {
9350b57cec5SDimitry Andric     CmpBuilder.legalFor({{S1, S16}});
9360b57cec5SDimitry Andric   }
9370b57cec5SDimitry Andric 
9380b57cec5SDimitry Andric   CmpBuilder
9390b57cec5SDimitry Andric     .widenScalarToNextPow2(1)
9400b57cec5SDimitry Andric     .clampScalar(1, S32, S64)
9410b57cec5SDimitry Andric     .scalarize(0)
942480093f4SDimitry Andric     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
9430b57cec5SDimitry Andric 
9440b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_FCMP)
9450b57cec5SDimitry Andric     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
9460b57cec5SDimitry Andric     .widenScalarToNextPow2(1)
9470b57cec5SDimitry Andric     .clampScalar(1, S32, S64)
9480b57cec5SDimitry Andric     .scalarize(0);
9490b57cec5SDimitry Andric 
9505ffd83dbSDimitry Andric   // FIXME: fpow has a selection pattern that should move to custom lowering.
9515ffd83dbSDimitry Andric   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
9525ffd83dbSDimitry Andric   if (ST.has16BitInsts())
9535ffd83dbSDimitry Andric     Exp2Ops.legalFor({S32, S16});
9545ffd83dbSDimitry Andric   else
9555ffd83dbSDimitry Andric     Exp2Ops.legalFor({S32});
9565ffd83dbSDimitry Andric   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
9575ffd83dbSDimitry Andric   Exp2Ops.scalarize(0);
9585ffd83dbSDimitry Andric 
9595ffd83dbSDimitry Andric   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
9605ffd83dbSDimitry Andric   if (ST.has16BitInsts())
9615ffd83dbSDimitry Andric     ExpOps.customFor({{S32}, {S16}});
9625ffd83dbSDimitry Andric   else
9635ffd83dbSDimitry Andric     ExpOps.customFor({S32});
9645ffd83dbSDimitry Andric   ExpOps.clampScalar(0, MinScalarFPTy, S32)
9650b57cec5SDimitry Andric         .scalarize(0);
9660b57cec5SDimitry Andric 
967e8d8bef9SDimitry Andric   getActionDefinitionsBuilder(G_FPOWI)
968e8d8bef9SDimitry Andric     .clampScalar(0, MinScalarFPTy, S32)
969e8d8bef9SDimitry Andric     .lower();
970e8d8bef9SDimitry Andric 
9710b57cec5SDimitry Andric   // The 64-bit versions produce 32-bit results, but only on the SALU.
9725ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_CTPOP)
9730b57cec5SDimitry Andric     .legalFor({{S32, S32}, {S32, S64}})
9740b57cec5SDimitry Andric     .clampScalar(0, S32, S32)
97504eeddc0SDimitry Andric     .widenScalarToNextPow2(1, 32)
9760b57cec5SDimitry Andric     .clampScalar(1, S32, S64)
9770b57cec5SDimitry Andric     .scalarize(0)
97804eeddc0SDimitry Andric     .widenScalarToNextPow2(0, 32);
97904eeddc0SDimitry Andric 
980*bdd1243dSDimitry Andric   // If no 16 bit instr is available, lower into different instructions.
981*bdd1243dSDimitry Andric   if (ST.has16BitInsts())
982*bdd1243dSDimitry Andric     getActionDefinitionsBuilder(G_IS_FPCLASS)
983*bdd1243dSDimitry Andric         .legalForCartesianProduct({S1}, FPTypes16)
984*bdd1243dSDimitry Andric         .widenScalarToNextPow2(1)
985*bdd1243dSDimitry Andric         .scalarize(0)
986*bdd1243dSDimitry Andric         .lower();
987*bdd1243dSDimitry Andric   else
988*bdd1243dSDimitry Andric     getActionDefinitionsBuilder(G_IS_FPCLASS)
989*bdd1243dSDimitry Andric         .legalForCartesianProduct({S1}, FPTypesBase)
990*bdd1243dSDimitry Andric         .lowerFor({S1, S16})
991*bdd1243dSDimitry Andric         .widenScalarToNextPow2(1)
992*bdd1243dSDimitry Andric         .scalarize(0)
993*bdd1243dSDimitry Andric         .lower();
9940b57cec5SDimitry Andric 
9955ffd83dbSDimitry Andric   // The hardware instructions return a different result on 0 than the generic
9965ffd83dbSDimitry Andric   // instructions expect. The hardware produces -1, but these produce the
9975ffd83dbSDimitry Andric   // bitwidth.
9985ffd83dbSDimitry Andric   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
9995ffd83dbSDimitry Andric     .scalarize(0)
10005ffd83dbSDimitry Andric     .clampScalar(0, S32, S32)
10015ffd83dbSDimitry Andric     .clampScalar(1, S32, S64)
10025ffd83dbSDimitry Andric     .widenScalarToNextPow2(0, 32)
10035ffd83dbSDimitry Andric     .widenScalarToNextPow2(1, 32)
1004349cc55cSDimitry Andric     .custom();
10055ffd83dbSDimitry Andric 
10065ffd83dbSDimitry Andric   // The 64-bit versions produce 32-bit results, but only on the SALU.
10075ffd83dbSDimitry Andric   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
10085ffd83dbSDimitry Andric     .legalFor({{S32, S32}, {S32, S64}})
10095ffd83dbSDimitry Andric     .clampScalar(0, S32, S32)
10105ffd83dbSDimitry Andric     .clampScalar(1, S32, S64)
10115ffd83dbSDimitry Andric     .scalarize(0)
10125ffd83dbSDimitry Andric     .widenScalarToNextPow2(0, 32)
10135ffd83dbSDimitry Andric     .widenScalarToNextPow2(1, 32);
10145ffd83dbSDimitry Andric 
1015fe6060f1SDimitry Andric   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1016fe6060f1SDimitry Andric   // RegBankSelect.
10175ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_BITREVERSE)
1018fe6060f1SDimitry Andric     .legalFor({S32, S64})
1019fe6060f1SDimitry Andric     .clampScalar(0, S32, S64)
1020fe6060f1SDimitry Andric     .scalarize(0)
1021fe6060f1SDimitry Andric     .widenScalarToNextPow2(0);
10220b57cec5SDimitry Andric 
10230b57cec5SDimitry Andric   if (ST.has16BitInsts()) {
10245ffd83dbSDimitry Andric     getActionDefinitionsBuilder(G_BSWAP)
10255ffd83dbSDimitry Andric       .legalFor({S16, S32, V2S16})
10260eae32dcSDimitry Andric       .clampMaxNumElementsStrict(0, S16, 2)
10275ffd83dbSDimitry Andric       // FIXME: Fixing non-power-of-2 before clamp is workaround for
10285ffd83dbSDimitry Andric       // narrowScalar limitation.
10295ffd83dbSDimitry Andric       .widenScalarToNextPow2(0)
10305ffd83dbSDimitry Andric       .clampScalar(0, S16, S32)
10315ffd83dbSDimitry Andric       .scalarize(0);
10325ffd83dbSDimitry Andric 
10330b57cec5SDimitry Andric     if (ST.hasVOP3PInsts()) {
      // Integer min/max and abs with packed 16-bit support: S32, S16 and the
      // packed V2S16 forms are directly legal. (NOTE(review): the governing
      // condition is above this chunk — presumably hasVOP3PInsts; confirm.)
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16, V2S16})
        // Pad odd-sized small vectors by one element so they can form legal
        // packed vectors, then clamp to at most 2 x S16.
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      // 16-bit scalar instructions but no packed forms: only scalar S32/S16
      // are legal; vectors are scalarized and everything else is lowered.
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // Subtarget without 16-bit instructions (condition is above this chunk).
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    // Min/max/abs: only S32 is legal here; narrower scalars are widened to 32
    // bits first, vectors are scalarized, and the rest is lowered.
    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }
10690b57cec5SDimitry Andric 
  // G_INTTOPTR: result pointer is type index 0, integer source is type
  // index 1. The source is resized to exactly match the pointer's width.
  getActionDefinitionsBuilder(G_INTTOPTR)
      // List the common cases
      .legalForCartesianProduct(AddrSpaces64, {S64})
      .legalForCartesianProduct(AddrSpaces32, {S32})
      .scalarize(0)
      // Accept any address space as long as the size matches
      .legalIf(sameSize(0, 1))
      // Integer narrower than the pointer: widen it to the pointer width.
      .widenScalarIf(smallerThan(1, 0),
                     [](const LegalityQuery &Query) {
                       return std::pair(
                           1, LLT::scalar(Query.Types[0].getSizeInBits()));
                     })
      // Integer wider than the pointer: truncate it to the pointer width.
      .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) {
        return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });
10850b57cec5SDimitry Andric 
  // G_PTRTOINT: mirror image of G_INTTOPTR — here the integer result is type
  // index 0 and the pointer source is type index 1, so the result is resized
  // to match the pointer's width.
  getActionDefinitionsBuilder(G_PTRTOINT)
      // List the common cases
      .legalForCartesianProduct(AddrSpaces64, {S64})
      .legalForCartesianProduct(AddrSpaces32, {S32})
      .scalarize(0)
      // Accept any address space as long as the size matches
      .legalIf(sameSize(0, 1))
      // Result narrower than the pointer: widen it to the pointer width.
      .widenScalarIf(smallerThan(0, 1),
                     [](const LegalityQuery &Query) {
                       return std::pair(
                           0, LLT::scalar(Query.Types[1].getSizeInBits()));
                     })
      // Result wider than the pointer: truncate it to the pointer width.
      .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
        return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });
11010b57cec5SDimitry Andric 
  // Address-space casts have target-specific semantics, so after scalarizing
  // they are handled entirely by custom legalization code.
  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();
11050b57cec5SDimitry Andric 
  // Predicate: must this load/store be split into multiple memory operations?
  // True for vector extloads (register type wider than the memory type),
  // accesses larger than the address space's maximum, and memory sizes that
  // don't map to a supported register count (power of 2, or 3 dwords when the
  // subtarget has dwordx3 load/stores).
  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;  // 32-bit registers needed, rounded up
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    return false;
  };
11358bcb0991SDimitry Andric 
  // Minimum alignment (in bits) required for global/buffer accesses of each
  // width. 0 means "no restriction" when the subtarget supports unaligned
  // buffer access.
  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.
11438bcb0991SDimitry Andric 
  // Legalization rules for plain loads and stores. Rule order matters: the
  // explicit legal lists and the dynamic legalIf run before the bitcast /
  // custom / narrowing fallbacks below.
  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Explicitly list some common cases.
    // TODO: Does this help compile time at all?
    // Each row is {register type, pointer type, memory type, min align bits}.
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
                                      {V2S32, GlobalPtr, V2S32, GlobalAlign32},
                                      {V4S32, GlobalPtr, V4S32, GlobalAlign32},
                                      {S64, GlobalPtr, S64, GlobalAlign32},
                                      {V2S64, GlobalPtr, V2S64, GlobalAlign32},
                                      {V2S16, GlobalPtr, V2S16, GlobalAlign32},
                                      {S32, GlobalPtr, S8, GlobalAlign8},
                                      {S32, GlobalPtr, S16, GlobalAlign16},

                                      {S32, LocalPtr, S32, 32},
                                      {S64, LocalPtr, S64, 32},
                                      {V2S32, LocalPtr, V2S32, 32},
                                      {S32, LocalPtr, S8, 8},
                                      {S32, LocalPtr, S16, 16},
                                      {V2S16, LocalPtr, S32, 32},

                                      {S32, PrivatePtr, S32, 32},
                                      {S32, PrivatePtr, S8, 8},
                                      {S32, PrivatePtr, S16, 16},
                                      {V2S16, PrivatePtr, S32, 32},

                                      {S32, ConstantPtr, S32, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32},
                                      {V4S32, ConstantPtr, V4S32, GlobalAlign32},
                                      {S64, ConstantPtr, S64, GlobalAlign32},
                                      // NOTE(review): duplicate of the
                                      // {V2S32, ConstantPtr, ...} row above;
                                      // harmless but could be dropped.
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
    Actions.legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query);
      });

    // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64-bits.
    //
    // TODO: Should generalize bitcast action into coerce, which will also cover
    // inserting addrspacecasts.
    Actions.customIf(typeIs(1, Constant32Ptr));

    // Turn any illegal element vectors into something easier to deal
    // with. These will ultimately produce 32-bit scalar shifts to extract the
    // parts anyway.
    //
    // For odd 16-bit element vectors, prefer to split those into pieces with
    // 16-bit vector parts.
    Actions.bitcastIf(
      [=](const LegalityQuery &Query) -> bool {
        return shouldBitcastLoadStoreType(ST, Query.Types[0],
                                          Query.MMODescrs[0].MemoryTy);
      }, bitcastToRegisterType(0));

    if (!IsStore) {
      // Widen suitably aligned loads by loading extra bytes. The standard
      // legalization actions can't properly express widening memory operands.
      Actions.customIf([=](const LegalityQuery &Query) -> bool {
        return shouldWidenLoad(ST, Query, G_LOAD);
      });
    }

    // FIXME: load/store narrowing should be moved to lower action
    Actions
        // Scalar case: pick the new (smaller) scalar type to narrow to.
        .narrowScalarIf(
            [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              const unsigned DstSize = DstTy.getSizeInBits();
              unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

              // Split extloads.
              if (DstSize > MemSize)
                return std::pair(0, LLT::scalar(MemSize));

              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);
              // Too wide for the address space: clamp to its maximum.
              if (MemSize > MaxSize)
                return std::pair(0, LLT::scalar(MaxSize));

              // Otherwise split down to the known alignment.
              uint64_t Align = Query.MMODescrs[0].AlignInBits;
              return std::pair(0, LLT::scalar(Align));
            })
        // Vector case: split into fewer elements.
        .fewerElementsIf(
            [=](const LegalityQuery &Query) -> bool {
              return Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              LLT EltTy = DstTy.getElementType();
              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);

              // FIXME: Handle widened to power of 2 results better. This ends
              // up scalarizing.
              // FIXME: 3 element stores scalarized on SI

              // Split if it's too large for the address space.
              unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
              if (MemSize > MaxSize) {
                unsigned NumElts = DstTy.getNumElements();
                unsigned EltSize = EltTy.getSizeInBits();

                // Prefer the largest piece that still divides evenly into
                // whole elements.
                if (MaxSize % EltSize == 0) {
                  return std::pair(
                      0, LLT::scalarOrVector(
                             ElementCount::getFixed(MaxSize / EltSize), EltTy));
                }

                unsigned NumPieces = MemSize / MaxSize;

                // FIXME: Refine when odd breakdowns handled
                // The scalars will need to be re-legalized.
                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::pair(0, EltTy);

                return std::pair(0,
                                 LLT::fixed_vector(NumElts / NumPieces, EltTy));
              }

              // FIXME: We could probably handle weird extending loads better.
              if (DstTy.getSizeInBits() > MemSize)
                return std::pair(0, EltTy);

              unsigned EltSize = EltTy.getSizeInBits();
              unsigned DstSize = DstTy.getSizeInBits();
              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to split
                // to the widest type. TODO: Account for alignment. As-is it
                // should be OK, since the new parts will be further legalized.
                unsigned FloorSize = PowerOf2Floor(DstSize);
                return std::pair(
                    0, LLT::scalarOrVector(
                           ElementCount::getFixed(FloorSize / EltSize), EltTy));
              }

              // May need relegalization for the scalars.
              return std::pair(0, EltTy);
            })
    .minScalar(0, S32)
    .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
    .widenScalarToNextPow2(0)
    .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
    .lower();
  }
13020b57cec5SDimitry Andric 
  // Sign/zero-extending loads: S32 results from S8/S16 memory types in the
  // common address spaces (flat added below only when the subtarget has a
  // flat address space). Row format: {reg type, ptr type, mem type, align}.
  // FIXME: Unaligned accesses not lowered.
  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
                                                  {S32, GlobalPtr, S16, 2 * 8},
                                                  {S32, LocalPtr, S8, 8},
                                                  {S32, LocalPtr, S16, 16},
                                                  {S32, PrivatePtr, S8, 8},
                                                  {S32, PrivatePtr, S16, 16},
                                                  {S32, ConstantPtr, S8, 8},
                                                  {S32, ConstantPtr, S16, 2 * 8}})
                       .legalIf(
                         [=](const LegalityQuery &Query) -> bool {
                           return isLoadStoreLegal(ST, Query);
                         });

  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
  }

  // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
  // 64-bits.
  //
  // TODO: Should generalize bitcast action into coerce, which will also cover
  // inserting addrspacecasts.
  ExtLoads.customIf(typeIs(1, Constant32Ptr));

  // All remaining extloads produce an S32 result, then get lowered.
  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .lower();
13330b57cec5SDimitry Andric 
  // Integer read-modify-write atomics: legal for 32/64-bit values on global,
  // local and region pointers; flat pointers are added only when the
  // subtarget has a flat address space.
  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }
13450b57cec5SDimitry Andric 
  // Floating-point atomic add: legality is assembled incrementally from
  // subtarget features (LDS FP atomics, global FP atomics, flat FP atomics).
  auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
  if (ST.hasLDSFPAtomicAdd()) {
    Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
    if (ST.hasGFX90AInsts())
      Atomic.legalFor({{S64, LocalPtr}});
    if (ST.hasGFX940Insts())
      Atomic.legalFor({{V2S16, LocalPtr}});
  }
  if (ST.hasAtomicFaddInsts())
    Atomic.legalFor({{S32, GlobalPtr}});
  if (ST.hasFlatAtomicFaddF32Inst())
    Atomic.legalFor({{S32, FlatPtr}});

  if (ST.hasGFX90AInsts()) {
    // These are legal with some caveats, and should have undergone expansion in
    // the IR in most situations
    // TODO: Move atomic expansion into legalizer
    Atomic.legalFor({
        {S32, GlobalPtr},
        {S64, GlobalPtr},
        {S64, FlatPtr}
      });
  }
136904eeddc0SDimitry Andric 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
  // demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    // Global/flat go through custom code for the marshalling noted above;
    // LDS/region compare-and-swap is directly legal.
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
  getActionDefinitionsBuilder(G_SELECT)
      // Value type (index 0) crossed with condition type (index 1).
      .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
                                 LocalPtr, FlatPtr, PrivatePtr,
                                 LLT::fixed_vector(2, LocalPtr),
                                 LLT::fixed_vector(2, PrivatePtr)},
                                {S1, S32})
      .clampScalar(0, S16, S64)
      .scalarize(1)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .fewerElementsIf(numElementsNotEven(0), scalarize(0))
      .clampMaxNumElements(0, S32, 2)
      .clampMaxNumElements(0, LocalPtr, 2)
      .clampMaxNumElements(0, PrivatePtr, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0)
      // Any pointer value type is fine once the condition is s1/s32.
      .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
13960b57cec5SDimitry Andric 
  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  // Shifts: value is type index 0, shift amount is type index 1.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 16);
    Shifts.clampScalar(0, S16, S64);

    // Saturating shifts have no native form; lower after clamping to S16+.
    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 32);
    Shifts.clampScalar(0, S32, S64);

    // Saturating shifts: lower after clamping to at least S32.
    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }
  Shifts.scalarize(0);
14410b57cec5SDimitry Andric 
  // Vector element extract/insert. The type-index roles differ between the
  // two opcodes, so they are computed up front.
  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      // 32/64-bit elements in dword-multiple vectors with a 32-bit index are
      // handled by custom code.
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          const unsigned EltSize = EltTy.getSizeInBits();
          return (EltSize == 32 || EltSize == 64) &&
                  VecTy.getSizeInBits() % 32 == 0 &&
                  VecTy.getSizeInBits() <= MaxRegisterSize &&
                  IdxTy.getSizeInBits() == 32;
        })
      // Sub-32-bit elements: bitcast the vector to 32-bit elements first.
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
                 bitcastToVectorElement32(VecTypeIdx))
      //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
      .bitcastIf(
        all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
        [=](const LegalityQuery &Query) {
          // For > 64-bit element types, try to turn this into a 64-bit
          // element vector since we may be able to do better indexing
          // if this is scalar. If not, fall back to 32.
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned DstEltSize = EltTy.getSizeInBits();
          const unsigned VecSize = VecTy.getSizeInBits();

          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
          return std::pair(
              VecTypeIdx,
              LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32)
      .clampMaxNumElements(VecTypeIdx, S32, 32)
      // TODO: Clamp elements for 64-bit vectors?
      // It should only be necessary with variable indexes.
      // As a last resort, lower to the stack
      .lower();
  }
14860b57cec5SDimitry Andric 
  // Reject extracts whose result type doesn't match the vector's element
  // type (i.e. implicit extending/truncating extracts are unsupported).
  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });
14920b57cec5SDimitry Andric 
  // G_EXTRACT / G_INSERT: BigTyIdx is the containing value, LitTyIdx the
  // extracted/inserted piece (the roles swap between the two opcodes).
  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      .lowerIf([=](const LegalityQuery &Query) {
          // Sub-vector(or single element) insert and extract.
          // TODO: verify immediate offset here since lower only works with
          // whole elements.
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.isVector();
        })
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      // Widen sub-16-bit scalars/elements on either side up to 16 bits.
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }
15300b57cec5SDimitry Andric 
15318bcb0991SDimitry Andric   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
15320b57cec5SDimitry Andric     .legalForCartesianProduct(AllS32Vectors, {S32})
15330b57cec5SDimitry Andric     .legalForCartesianProduct(AllS64Vectors, {S64})
15348bcb0991SDimitry Andric     .clampNumElements(0, V16S32, V32S32)
15358bcb0991SDimitry Andric     .clampNumElements(0, V2S64, V16S64)
15368bcb0991SDimitry Andric     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
15378bcb0991SDimitry Andric 
15388bcb0991SDimitry Andric   if (ST.hasScalarPackInsts()) {
15395ffd83dbSDimitry Andric     BuildVector
15405ffd83dbSDimitry Andric       // FIXME: Should probably widen s1 vectors straight to s32
15415ffd83dbSDimitry Andric       .minScalarOrElt(0, S16)
1542*bdd1243dSDimitry Andric       .minScalar(1, S16);
15435ffd83dbSDimitry Andric 
15448bcb0991SDimitry Andric     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
15458bcb0991SDimitry Andric       .legalFor({V2S16, S32})
15468bcb0991SDimitry Andric       .lower();
15478bcb0991SDimitry Andric   } else {
15485ffd83dbSDimitry Andric     BuildVector.customFor({V2S16, S16});
15495ffd83dbSDimitry Andric     BuildVector.minScalarOrElt(0, S32);
15505ffd83dbSDimitry Andric 
15518bcb0991SDimitry Andric     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
15525ffd83dbSDimitry Andric       .customFor({V2S16, S32})
15538bcb0991SDimitry Andric       .lower();
15548bcb0991SDimitry Andric   }
15558bcb0991SDimitry Andric 
15565ffd83dbSDimitry Andric   BuildVector.legalIf(isRegisterType(0));
15575ffd83dbSDimitry Andric 
15585ffd83dbSDimitry Andric   // FIXME: Clamp maximum size
15590b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1560e8d8bef9SDimitry Andric     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1561e8d8bef9SDimitry Andric     .clampMaxNumElements(0, S32, 32)
1562e8d8bef9SDimitry Andric     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1563e8d8bef9SDimitry Andric     .clampMaxNumElements(0, S16, 64);
15640b57cec5SDimitry Andric 
15658bcb0991SDimitry Andric   getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
15668bcb0991SDimitry Andric 
15670b57cec5SDimitry Andric   // Merge/Unmerge
15680b57cec5SDimitry Andric   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
15690b57cec5SDimitry Andric     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
15700b57cec5SDimitry Andric     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
15710b57cec5SDimitry Andric 
15720b57cec5SDimitry Andric     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
15735ffd83dbSDimitry Andric       const LLT Ty = Query.Types[TypeIdx];
15740b57cec5SDimitry Andric       if (Ty.isVector()) {
15750b57cec5SDimitry Andric         const LLT &EltTy = Ty.getElementType();
15765ffd83dbSDimitry Andric         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
15770b57cec5SDimitry Andric           return true;
15780b57cec5SDimitry Andric         if (!isPowerOf2_32(EltTy.getSizeInBits()))
15790b57cec5SDimitry Andric           return true;
15800b57cec5SDimitry Andric       }
15810b57cec5SDimitry Andric       return false;
15820b57cec5SDimitry Andric     };
15830b57cec5SDimitry Andric 
15848bcb0991SDimitry Andric     auto &Builder = getActionDefinitionsBuilder(Op)
1585e8d8bef9SDimitry Andric       .legalIf(all(isRegisterType(0), isRegisterType(1)))
15865ffd83dbSDimitry Andric       .lowerFor({{S16, V2S16}})
15875ffd83dbSDimitry Andric       .lowerIf([=](const LegalityQuery &Query) {
15885ffd83dbSDimitry Andric           const LLT BigTy = Query.Types[BigTyIdx];
15895ffd83dbSDimitry Andric           return BigTy.getSizeInBits() == 32;
15905ffd83dbSDimitry Andric         })
15915ffd83dbSDimitry Andric       // Try to widen to s16 first for small types.
15925ffd83dbSDimitry Andric       // TODO: Only do this on targets with legal s16 shifts
15935ffd83dbSDimitry Andric       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
15940b57cec5SDimitry Andric       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
15958bcb0991SDimitry Andric       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
15968bcb0991SDimitry Andric       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
15978bcb0991SDimitry Andric                            elementTypeIs(1, S16)),
15988bcb0991SDimitry Andric                        changeTo(1, V2S16))
15995ffd83dbSDimitry Andric       // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
16005ffd83dbSDimitry Andric       // worth considering the multiples of 64 since 2*192 and 2*384 are not
16015ffd83dbSDimitry Andric       // valid.
16025ffd83dbSDimitry Andric       .clampScalar(LitTyIdx, S32, S512)
16035ffd83dbSDimitry Andric       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
16040b57cec5SDimitry Andric       // Break up vectors with weird elements into scalars
16050b57cec5SDimitry Andric       .fewerElementsIf(
16065ffd83dbSDimitry Andric         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
16070b57cec5SDimitry Andric         scalarize(0))
16080b57cec5SDimitry Andric       .fewerElementsIf(
16095ffd83dbSDimitry Andric         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
16100b57cec5SDimitry Andric         scalarize(1))
16115ffd83dbSDimitry Andric       .clampScalar(BigTyIdx, S32, MaxScalar);
16128bcb0991SDimitry Andric 
16138bcb0991SDimitry Andric     if (Op == G_MERGE_VALUES) {
16148bcb0991SDimitry Andric       Builder.widenScalarIf(
16158bcb0991SDimitry Andric         // TODO: Use 16-bit shifts if legal for 8-bit values?
16160b57cec5SDimitry Andric         [=](const LegalityQuery &Query) {
16178bcb0991SDimitry Andric           const LLT Ty = Query.Types[LitTyIdx];
16188bcb0991SDimitry Andric           return Ty.getSizeInBits() < 32;
16198bcb0991SDimitry Andric         },
16208bcb0991SDimitry Andric         changeTo(LitTyIdx, S32));
16218bcb0991SDimitry Andric     }
16228bcb0991SDimitry Andric 
16238bcb0991SDimitry Andric     Builder.widenScalarIf(
16248bcb0991SDimitry Andric       [=](const LegalityQuery &Query) {
16258bcb0991SDimitry Andric         const LLT Ty = Query.Types[BigTyIdx];
16260b57cec5SDimitry Andric         return !isPowerOf2_32(Ty.getSizeInBits()) &&
16270b57cec5SDimitry Andric           Ty.getSizeInBits() % 16 != 0;
16280b57cec5SDimitry Andric       },
16290b57cec5SDimitry Andric       [=](const LegalityQuery &Query) {
16300b57cec5SDimitry Andric         // Pick the next power of 2, or a multiple of 64 over 128.
16310b57cec5SDimitry Andric         // Whichever is smaller.
16320b57cec5SDimitry Andric         const LLT &Ty = Query.Types[BigTyIdx];
16330b57cec5SDimitry Andric         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
16340b57cec5SDimitry Andric         if (NewSizeInBits >= 256) {
16350b57cec5SDimitry Andric           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
16360b57cec5SDimitry Andric           if (RoundedTo < NewSizeInBits)
16370b57cec5SDimitry Andric             NewSizeInBits = RoundedTo;
16380b57cec5SDimitry Andric         }
1639*bdd1243dSDimitry Andric         return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
16400b57cec5SDimitry Andric       })
16410b57cec5SDimitry Andric       // Any vectors left are the wrong size. Scalarize them.
16420b57cec5SDimitry Andric       .scalarize(0)
16430b57cec5SDimitry Andric       .scalarize(1);
16440b57cec5SDimitry Andric   }
16450b57cec5SDimitry Andric 
16465ffd83dbSDimitry Andric   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
16475ffd83dbSDimitry Andric   // RegBankSelect.
16485ffd83dbSDimitry Andric   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
16495ffd83dbSDimitry Andric     .legalFor({{S32}, {S64}});
16508bcb0991SDimitry Andric 
16515ffd83dbSDimitry Andric   if (ST.hasVOP3PInsts()) {
16525ffd83dbSDimitry Andric     SextInReg.lowerFor({{V2S16}})
16535ffd83dbSDimitry Andric       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
16545ffd83dbSDimitry Andric       // get more vector shift opportunities, since we'll get those when
16555ffd83dbSDimitry Andric       // expanded.
16560eae32dcSDimitry Andric       .clampMaxNumElementsStrict(0, S16, 2);
16575ffd83dbSDimitry Andric   } else if (ST.has16BitInsts()) {
16585ffd83dbSDimitry Andric     SextInReg.lowerFor({{S32}, {S64}, {S16}});
16595ffd83dbSDimitry Andric   } else {
16605ffd83dbSDimitry Andric     // Prefer to promote to s32 before lowering if we don't have 16-bit
16615ffd83dbSDimitry Andric     // shifts. This avoid a lot of intermediate truncate and extend operations.
16625ffd83dbSDimitry Andric     SextInReg.lowerFor({{S32}, {S64}});
16635ffd83dbSDimitry Andric   }
16645ffd83dbSDimitry Andric 
16655ffd83dbSDimitry Andric   SextInReg
16665ffd83dbSDimitry Andric     .scalarize(0)
16675ffd83dbSDimitry Andric     .clampScalar(0, S32, S64)
16685ffd83dbSDimitry Andric     .lower();
16695ffd83dbSDimitry Andric 
1670349cc55cSDimitry Andric   getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1671349cc55cSDimitry Andric     .scalarize(0)
1672349cc55cSDimitry Andric     .lower();
1673349cc55cSDimitry Andric 
1674fe6060f1SDimitry Andric   // TODO: Only Try to form v2s16 with legal packed instructions.
16755ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_FSHR)
16765ffd83dbSDimitry Andric     .legalFor({{S32, S32}})
1677fe6060f1SDimitry Andric     .lowerFor({{V2S16, V2S16}})
16780eae32dcSDimitry Andric     .clampMaxNumElementsStrict(0, S16, 2)
16795ffd83dbSDimitry Andric     .scalarize(0)
16805ffd83dbSDimitry Andric     .lower();
1681480093f4SDimitry Andric 
1682fe6060f1SDimitry Andric   if (ST.hasVOP3PInsts()) {
1683fe6060f1SDimitry Andric     getActionDefinitionsBuilder(G_FSHL)
1684fe6060f1SDimitry Andric       .lowerFor({{V2S16, V2S16}})
16850eae32dcSDimitry Andric       .clampMaxNumElementsStrict(0, S16, 2)
1686fe6060f1SDimitry Andric       .scalarize(0)
1687fe6060f1SDimitry Andric       .lower();
1688fe6060f1SDimitry Andric   } else {
1689fe6060f1SDimitry Andric     getActionDefinitionsBuilder(G_FSHL)
1690fe6060f1SDimitry Andric       .scalarize(0)
1691fe6060f1SDimitry Andric       .lower();
1692fe6060f1SDimitry Andric   }
1693fe6060f1SDimitry Andric 
1694480093f4SDimitry Andric   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1695480093f4SDimitry Andric     .legalFor({S64});
1696480093f4SDimitry Andric 
1697e8d8bef9SDimitry Andric   getActionDefinitionsBuilder(G_FENCE)
1698e8d8bef9SDimitry Andric     .alwaysLegal();
1699e8d8bef9SDimitry Andric 
1700fe6060f1SDimitry Andric   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1701fe6060f1SDimitry Andric       .scalarize(0)
1702fe6060f1SDimitry Andric       .minScalar(0, S32)
1703fe6060f1SDimitry Andric       .lower();
1704fe6060f1SDimitry Andric 
1705fe6060f1SDimitry Andric   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1706fe6060f1SDimitry Andric       .legalFor({{S32, S32}, {S64, S32}})
1707fe6060f1SDimitry Andric       .clampScalar(1, S32, S32)
1708fe6060f1SDimitry Andric       .clampScalar(0, S32, S64)
1709fe6060f1SDimitry Andric       .widenScalarToNextPow2(0)
1710fe6060f1SDimitry Andric       .scalarize(0);
1711fe6060f1SDimitry Andric 
17125ffd83dbSDimitry Andric   getActionDefinitionsBuilder({
17135ffd83dbSDimitry Andric       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
17145ffd83dbSDimitry Andric       G_FCOPYSIGN,
17155ffd83dbSDimitry Andric 
17165ffd83dbSDimitry Andric       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1717e8d8bef9SDimitry Andric       G_ATOMICRMW_NAND,
1718e8d8bef9SDimitry Andric       G_ATOMICRMW_FSUB,
17195ffd83dbSDimitry Andric       G_READ_REGISTER,
17205ffd83dbSDimitry Andric       G_WRITE_REGISTER,
17215ffd83dbSDimitry Andric 
17225ffd83dbSDimitry Andric       G_SADDO, G_SSUBO,
17235ffd83dbSDimitry Andric 
17245ffd83dbSDimitry Andric        // TODO: Implement
1725fe6060f1SDimitry Andric       G_FMINIMUM, G_FMAXIMUM}).lower();
17265ffd83dbSDimitry Andric 
1727349cc55cSDimitry Andric   getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
1728349cc55cSDimitry Andric       .lower();
1729349cc55cSDimitry Andric 
1730480093f4SDimitry Andric   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
17315ffd83dbSDimitry Andric         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1732480093f4SDimitry Andric         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1733480093f4SDimitry Andric     .unsupported();
1734480093f4SDimitry Andric 
1735fe6060f1SDimitry Andric   getLegacyLegalizerInfo().computeTables();
17360b57cec5SDimitry Andric   verify(*ST.getInstrInfo());
17370b57cec5SDimitry Andric }
17380b57cec5SDimitry Andric 
17395ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
17405ffd83dbSDimitry Andric                                          MachineInstr &MI) const {
17415ffd83dbSDimitry Andric   MachineIRBuilder &B = Helper.MIRBuilder;
17425ffd83dbSDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
17435ffd83dbSDimitry Andric 
17440b57cec5SDimitry Andric   switch (MI.getOpcode()) {
17450b57cec5SDimitry Andric   case TargetOpcode::G_ADDRSPACE_CAST:
17468bcb0991SDimitry Andric     return legalizeAddrSpaceCast(MI, MRI, B);
17470b57cec5SDimitry Andric   case TargetOpcode::G_FRINT:
17488bcb0991SDimitry Andric     return legalizeFrint(MI, MRI, B);
17490b57cec5SDimitry Andric   case TargetOpcode::G_FCEIL:
17508bcb0991SDimitry Andric     return legalizeFceil(MI, MRI, B);
1751e8d8bef9SDimitry Andric   case TargetOpcode::G_FREM:
1752e8d8bef9SDimitry Andric     return legalizeFrem(MI, MRI, B);
17530b57cec5SDimitry Andric   case TargetOpcode::G_INTRINSIC_TRUNC:
17548bcb0991SDimitry Andric     return legalizeIntrinsicTrunc(MI, MRI, B);
17550b57cec5SDimitry Andric   case TargetOpcode::G_SITOFP:
17568bcb0991SDimitry Andric     return legalizeITOFP(MI, MRI, B, true);
17570b57cec5SDimitry Andric   case TargetOpcode::G_UITOFP:
17588bcb0991SDimitry Andric     return legalizeITOFP(MI, MRI, B, false);
17595ffd83dbSDimitry Andric   case TargetOpcode::G_FPTOSI:
17605ffd83dbSDimitry Andric     return legalizeFPTOI(MI, MRI, B, true);
17615ffd83dbSDimitry Andric   case TargetOpcode::G_FPTOUI:
17625ffd83dbSDimitry Andric     return legalizeFPTOI(MI, MRI, B, false);
17630b57cec5SDimitry Andric   case TargetOpcode::G_FMINNUM:
17640b57cec5SDimitry Andric   case TargetOpcode::G_FMAXNUM:
17650b57cec5SDimitry Andric   case TargetOpcode::G_FMINNUM_IEEE:
17660b57cec5SDimitry Andric   case TargetOpcode::G_FMAXNUM_IEEE:
17675ffd83dbSDimitry Andric     return legalizeMinNumMaxNum(Helper, MI);
17680b57cec5SDimitry Andric   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
17698bcb0991SDimitry Andric     return legalizeExtractVectorElt(MI, MRI, B);
17700b57cec5SDimitry Andric   case TargetOpcode::G_INSERT_VECTOR_ELT:
17718bcb0991SDimitry Andric     return legalizeInsertVectorElt(MI, MRI, B);
17728bcb0991SDimitry Andric   case TargetOpcode::G_FSIN:
17738bcb0991SDimitry Andric   case TargetOpcode::G_FCOS:
17748bcb0991SDimitry Andric     return legalizeSinCos(MI, MRI, B);
17758bcb0991SDimitry Andric   case TargetOpcode::G_GLOBAL_VALUE:
17768bcb0991SDimitry Andric     return legalizeGlobalValue(MI, MRI, B);
17778bcb0991SDimitry Andric   case TargetOpcode::G_LOAD:
1778fe6060f1SDimitry Andric   case TargetOpcode::G_SEXTLOAD:
1779fe6060f1SDimitry Andric   case TargetOpcode::G_ZEXTLOAD:
1780e8d8bef9SDimitry Andric     return legalizeLoad(Helper, MI);
17818bcb0991SDimitry Andric   case TargetOpcode::G_FMAD:
17828bcb0991SDimitry Andric     return legalizeFMad(MI, MRI, B);
17838bcb0991SDimitry Andric   case TargetOpcode::G_FDIV:
17848bcb0991SDimitry Andric     return legalizeFDIV(MI, MRI, B);
17855ffd83dbSDimitry Andric   case TargetOpcode::G_UDIV:
17865ffd83dbSDimitry Andric   case TargetOpcode::G_UREM:
1787fe6060f1SDimitry Andric   case TargetOpcode::G_UDIVREM:
1788fe6060f1SDimitry Andric     return legalizeUnsignedDIV_REM(MI, MRI, B);
17895ffd83dbSDimitry Andric   case TargetOpcode::G_SDIV:
17905ffd83dbSDimitry Andric   case TargetOpcode::G_SREM:
1791fe6060f1SDimitry Andric   case TargetOpcode::G_SDIVREM:
1792fe6060f1SDimitry Andric     return legalizeSignedDIV_REM(MI, MRI, B);
1793480093f4SDimitry Andric   case TargetOpcode::G_ATOMIC_CMPXCHG:
1794480093f4SDimitry Andric     return legalizeAtomicCmpXChg(MI, MRI, B);
17955ffd83dbSDimitry Andric   case TargetOpcode::G_FLOG:
17965ffd83dbSDimitry Andric     return legalizeFlog(MI, B, numbers::ln2f);
17975ffd83dbSDimitry Andric   case TargetOpcode::G_FLOG10:
17985ffd83dbSDimitry Andric     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
17995ffd83dbSDimitry Andric   case TargetOpcode::G_FEXP:
18005ffd83dbSDimitry Andric     return legalizeFExp(MI, B);
18015ffd83dbSDimitry Andric   case TargetOpcode::G_FPOW:
18025ffd83dbSDimitry Andric     return legalizeFPow(MI, B);
18035ffd83dbSDimitry Andric   case TargetOpcode::G_FFLOOR:
18045ffd83dbSDimitry Andric     return legalizeFFloor(MI, MRI, B);
18055ffd83dbSDimitry Andric   case TargetOpcode::G_BUILD_VECTOR:
1806*bdd1243dSDimitry Andric   case TargetOpcode::G_BUILD_VECTOR_TRUNC:
18075ffd83dbSDimitry Andric     return legalizeBuildVector(MI, MRI, B);
180881ad6265SDimitry Andric   case TargetOpcode::G_MUL:
180981ad6265SDimitry Andric     return legalizeMul(Helper, MI);
1810349cc55cSDimitry Andric   case TargetOpcode::G_CTLZ:
1811349cc55cSDimitry Andric   case TargetOpcode::G_CTTZ:
1812349cc55cSDimitry Andric     return legalizeCTLZ_CTTZ(MI, MRI, B);
181381ad6265SDimitry Andric   case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
181481ad6265SDimitry Andric     return legalizeFPTruncRound(MI, B);
18150b57cec5SDimitry Andric   default:
18160b57cec5SDimitry Andric     return false;
18170b57cec5SDimitry Andric   }
18180b57cec5SDimitry Andric 
18190b57cec5SDimitry Andric   llvm_unreachable("expected switch to return");
18200b57cec5SDimitry Andric }
18210b57cec5SDimitry Andric 
18220b57cec5SDimitry Andric Register AMDGPULegalizerInfo::getSegmentAperture(
18230b57cec5SDimitry Andric   unsigned AS,
18240b57cec5SDimitry Andric   MachineRegisterInfo &MRI,
18258bcb0991SDimitry Andric   MachineIRBuilder &B) const {
18268bcb0991SDimitry Andric   MachineFunction &MF = B.getMF();
18270b57cec5SDimitry Andric   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
18280b57cec5SDimitry Andric   const LLT S32 = LLT::scalar(32);
1829*bdd1243dSDimitry Andric   const LLT S64 = LLT::scalar(64);
18300b57cec5SDimitry Andric 
18318bcb0991SDimitry Andric   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
18328bcb0991SDimitry Andric 
18330b57cec5SDimitry Andric   if (ST.hasApertureRegs()) {
1834*bdd1243dSDimitry Andric     // Note: this register is somewhat broken. When used as a 32-bit operand,
1835*bdd1243dSDimitry Andric     // it only returns zeroes. The real value is in the upper 32 bits.
1836*bdd1243dSDimitry Andric     // Thus, we must emit extract the high 32 bits.
1837*bdd1243dSDimitry Andric     const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
1838*bdd1243dSDimitry Andric                                        ? AMDGPU::SRC_SHARED_BASE
1839*bdd1243dSDimitry Andric                                        : AMDGPU::SRC_PRIVATE_BASE;
1840*bdd1243dSDimitry Andric     // FIXME: It would be more natural to emit a COPY here, but then copy
1841*bdd1243dSDimitry Andric     // coalescing would kick in and it would think it's okay to use the "HI"
1842*bdd1243dSDimitry Andric     // subregister (instead of extracting the HI 32 bits) which is an artificial
1843*bdd1243dSDimitry Andric     // (unusable) register.
1844*bdd1243dSDimitry Andric     //  Register TableGen definitions would need an overhaul to get rid of the
1845*bdd1243dSDimitry Andric     //  artificial "HI" aperture registers and prevent this kind of issue from
1846*bdd1243dSDimitry Andric     //  happening.
1847*bdd1243dSDimitry Andric     Register Dst = MRI.createGenericVirtualRegister(S64);
1848*bdd1243dSDimitry Andric     MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
1849*bdd1243dSDimitry Andric     B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
1850*bdd1243dSDimitry Andric     return B.buildUnmerge(S32, Dst).getReg(1);
18510b57cec5SDimitry Andric   }
18520b57cec5SDimitry Andric 
185381ad6265SDimitry Andric   // TODO: can we be smarter about machine pointer info?
185481ad6265SDimitry Andric   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
185581ad6265SDimitry Andric   Register LoadAddr = MRI.createGenericVirtualRegister(
185681ad6265SDimitry Andric     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
185781ad6265SDimitry Andric   // For code object version 5, private_base and shared_base are passed through
185881ad6265SDimitry Andric   // implicit kernargs.
185981ad6265SDimitry Andric   if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) {
186081ad6265SDimitry Andric     AMDGPUTargetLowering::ImplicitParameter Param =
186181ad6265SDimitry Andric         AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
186281ad6265SDimitry Andric                                       : AMDGPUTargetLowering::PRIVATE_BASE;
186381ad6265SDimitry Andric     uint64_t Offset =
186481ad6265SDimitry Andric         ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
186581ad6265SDimitry Andric 
186681ad6265SDimitry Andric     Register KernargPtrReg = MRI.createGenericVirtualRegister(
186781ad6265SDimitry Andric         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
186881ad6265SDimitry Andric 
186981ad6265SDimitry Andric     if (!loadInputValue(KernargPtrReg, B,
187081ad6265SDimitry Andric                         AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
187181ad6265SDimitry Andric       return Register();
187281ad6265SDimitry Andric 
187381ad6265SDimitry Andric     MachineMemOperand *MMO = MF.getMachineMemOperand(
187481ad6265SDimitry Andric         PtrInfo,
187581ad6265SDimitry Andric         MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
187681ad6265SDimitry Andric             MachineMemOperand::MOInvariant,
187781ad6265SDimitry Andric         LLT::scalar(32), commonAlignment(Align(64), Offset));
187881ad6265SDimitry Andric 
187981ad6265SDimitry Andric     // Pointer address
188081ad6265SDimitry Andric     B.buildPtrAdd(LoadAddr, KernargPtrReg,
188181ad6265SDimitry Andric                   B.buildConstant(LLT::scalar(64), Offset).getReg(0));
188281ad6265SDimitry Andric     // Load address
188381ad6265SDimitry Andric     return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
188481ad6265SDimitry Andric   }
188581ad6265SDimitry Andric 
18860b57cec5SDimitry Andric   Register QueuePtr = MRI.createGenericVirtualRegister(
18870b57cec5SDimitry Andric     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
18880b57cec5SDimitry Andric 
1889e8d8bef9SDimitry Andric   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
18908bcb0991SDimitry Andric     return Register();
18910b57cec5SDimitry Andric 
18920b57cec5SDimitry Andric   // Offset into amd_queue_t for group_segment_aperture_base_hi /
18930b57cec5SDimitry Andric   // private_segment_aperture_base_hi.
18940b57cec5SDimitry Andric   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
18950b57cec5SDimitry Andric 
18960b57cec5SDimitry Andric   MachineMemOperand *MMO = MF.getMachineMemOperand(
18970b57cec5SDimitry Andric       PtrInfo,
18985ffd83dbSDimitry Andric       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
18990b57cec5SDimitry Andric           MachineMemOperand::MOInvariant,
1900fe6060f1SDimitry Andric       LLT::scalar(32), commonAlignment(Align(64), StructOffset));
19010b57cec5SDimitry Andric 
190281ad6265SDimitry Andric   B.buildPtrAdd(LoadAddr, QueuePtr,
190381ad6265SDimitry Andric                 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
19045ffd83dbSDimitry Andric   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
19050b57cec5SDimitry Andric }
19060b57cec5SDimitry Andric 
190704eeddc0SDimitry Andric /// Return true if the value is a known valid address, such that a null check is
190804eeddc0SDimitry Andric /// not necessary.
190904eeddc0SDimitry Andric static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
191004eeddc0SDimitry Andric                            const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
191104eeddc0SDimitry Andric   MachineInstr *Def = MRI.getVRegDef(Val);
191204eeddc0SDimitry Andric   switch (Def->getOpcode()) {
191304eeddc0SDimitry Andric   case AMDGPU::G_FRAME_INDEX:
191404eeddc0SDimitry Andric   case AMDGPU::G_GLOBAL_VALUE:
191504eeddc0SDimitry Andric   case AMDGPU::G_BLOCK_ADDR:
191604eeddc0SDimitry Andric     return true;
191704eeddc0SDimitry Andric   case AMDGPU::G_CONSTANT: {
191804eeddc0SDimitry Andric     const ConstantInt *CI = Def->getOperand(1).getCImm();
191904eeddc0SDimitry Andric     return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
192004eeddc0SDimitry Andric   }
192104eeddc0SDimitry Andric   default:
192204eeddc0SDimitry Andric     return false;
192304eeddc0SDimitry Andric   }
192404eeddc0SDimitry Andric 
192504eeddc0SDimitry Andric   return false;
192604eeddc0SDimitry Andric }
192704eeddc0SDimitry Andric 
19280b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
19290b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
19308bcb0991SDimitry Andric   MachineIRBuilder &B) const {
19318bcb0991SDimitry Andric   MachineFunction &MF = B.getMF();
19320b57cec5SDimitry Andric 
19338bcb0991SDimitry Andric   const LLT S32 = LLT::scalar(32);
19340b57cec5SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
19350b57cec5SDimitry Andric   Register Src = MI.getOperand(1).getReg();
19360b57cec5SDimitry Andric 
19370b57cec5SDimitry Andric   LLT DstTy = MRI.getType(Dst);
19380b57cec5SDimitry Andric   LLT SrcTy = MRI.getType(Src);
19390b57cec5SDimitry Andric   unsigned DestAS = DstTy.getAddressSpace();
19400b57cec5SDimitry Andric   unsigned SrcAS = SrcTy.getAddressSpace();
19410b57cec5SDimitry Andric 
19420b57cec5SDimitry Andric   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
19430b57cec5SDimitry Andric   // vector element.
19440b57cec5SDimitry Andric   assert(!DstTy.isVector());
19450b57cec5SDimitry Andric 
19460b57cec5SDimitry Andric   const AMDGPUTargetMachine &TM
19470b57cec5SDimitry Andric     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
19480b57cec5SDimitry Andric 
1949e8d8bef9SDimitry Andric   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
19508bcb0991SDimitry Andric     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
19518bcb0991SDimitry Andric     return true;
19528bcb0991SDimitry Andric   }
19538bcb0991SDimitry Andric 
195481ad6265SDimitry Andric   if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
195581ad6265SDimitry Andric       (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
195681ad6265SDimitry Andric        DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
195704eeddc0SDimitry Andric     if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
195804eeddc0SDimitry Andric       // Extract low 32-bits of the pointer.
195904eeddc0SDimitry Andric       B.buildExtract(Dst, Src, 0);
196004eeddc0SDimitry Andric       MI.eraseFromParent();
196104eeddc0SDimitry Andric       return true;
196204eeddc0SDimitry Andric     }
196304eeddc0SDimitry Andric 
19640b57cec5SDimitry Andric     unsigned NullVal = TM.getNullPointerValue(DestAS);
19650b57cec5SDimitry Andric 
19668bcb0991SDimitry Andric     auto SegmentNull = B.buildConstant(DstTy, NullVal);
19678bcb0991SDimitry Andric     auto FlatNull = B.buildConstant(SrcTy, 0);
19680b57cec5SDimitry Andric 
19690b57cec5SDimitry Andric     // Extract low 32-bits of the pointer.
19705ffd83dbSDimitry Andric     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
19710b57cec5SDimitry Andric 
19725ffd83dbSDimitry Andric     auto CmpRes =
19735ffd83dbSDimitry Andric         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
19748bcb0991SDimitry Andric     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
19750b57cec5SDimitry Andric 
19760b57cec5SDimitry Andric     MI.eraseFromParent();
19770b57cec5SDimitry Andric     return true;
19780b57cec5SDimitry Andric   }
19790b57cec5SDimitry Andric 
198081ad6265SDimitry Andric   if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
198181ad6265SDimitry Andric       (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
198281ad6265SDimitry Andric        SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
19838bcb0991SDimitry Andric     Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
19848bcb0991SDimitry Andric     if (!ApertureReg.isValid())
19858bcb0991SDimitry Andric       return false;
19860b57cec5SDimitry Andric 
19870b57cec5SDimitry Andric     // Coerce the type of the low half of the result so we can use merge_values.
19885ffd83dbSDimitry Andric     Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
19890b57cec5SDimitry Andric 
19900b57cec5SDimitry Andric     // TODO: Should we allow mismatched types but matching sizes in merges to
19910b57cec5SDimitry Andric     // avoid the ptrtoint?
1992*bdd1243dSDimitry Andric     auto BuildPtr = B.buildMergeLikeInstr(DstTy, {SrcAsInt, ApertureReg});
199304eeddc0SDimitry Andric 
199404eeddc0SDimitry Andric     if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
199504eeddc0SDimitry Andric       B.buildCopy(Dst, BuildPtr);
199604eeddc0SDimitry Andric       MI.eraseFromParent();
199704eeddc0SDimitry Andric       return true;
199804eeddc0SDimitry Andric     }
199904eeddc0SDimitry Andric 
200004eeddc0SDimitry Andric     auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
200104eeddc0SDimitry Andric     auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
200204eeddc0SDimitry Andric 
200381ad6265SDimitry Andric     auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
200481ad6265SDimitry Andric                               SegmentNull.getReg(0));
200504eeddc0SDimitry Andric 
20065ffd83dbSDimitry Andric     B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
20070b57cec5SDimitry Andric 
20080b57cec5SDimitry Andric     MI.eraseFromParent();
20090b57cec5SDimitry Andric     return true;
20100b57cec5SDimitry Andric   }
20110b57cec5SDimitry Andric 
201281ad6265SDimitry Andric   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
201381ad6265SDimitry Andric       SrcTy.getSizeInBits() == 64) {
201481ad6265SDimitry Andric     // Truncate.
201581ad6265SDimitry Andric     B.buildExtract(Dst, Src, 0);
201681ad6265SDimitry Andric     MI.eraseFromParent();
201781ad6265SDimitry Andric     return true;
201881ad6265SDimitry Andric   }
201981ad6265SDimitry Andric 
202081ad6265SDimitry Andric   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
202181ad6265SDimitry Andric       DstTy.getSizeInBits() == 64) {
202281ad6265SDimitry Andric     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
202381ad6265SDimitry Andric     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2024*bdd1243dSDimitry Andric     auto PtrLo = B.buildPtrToInt(S32, Src);
2025*bdd1243dSDimitry Andric     auto HighAddr = B.buildConstant(S32, AddrHiVal);
2026*bdd1243dSDimitry Andric     B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
202781ad6265SDimitry Andric     MI.eraseFromParent();
202881ad6265SDimitry Andric     return true;
202981ad6265SDimitry Andric   }
203081ad6265SDimitry Andric 
203181ad6265SDimitry Andric   DiagnosticInfoUnsupported InvalidAddrSpaceCast(
203281ad6265SDimitry Andric       MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
203381ad6265SDimitry Andric 
203481ad6265SDimitry Andric   LLVMContext &Ctx = MF.getFunction().getContext();
203581ad6265SDimitry Andric   Ctx.diagnose(InvalidAddrSpaceCast);
203681ad6265SDimitry Andric   B.buildUndef(Dst);
203781ad6265SDimitry Andric   MI.eraseFromParent();
203881ad6265SDimitry Andric   return true;
203981ad6265SDimitry Andric }
204081ad6265SDimitry Andric 
20410b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeFrint(
20420b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
20438bcb0991SDimitry Andric   MachineIRBuilder &B) const {
20440b57cec5SDimitry Andric   Register Src = MI.getOperand(1).getReg();
20450b57cec5SDimitry Andric   LLT Ty = MRI.getType(Src);
20460b57cec5SDimitry Andric   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
20470b57cec5SDimitry Andric 
20480b57cec5SDimitry Andric   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
20490b57cec5SDimitry Andric   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
20500b57cec5SDimitry Andric 
20518bcb0991SDimitry Andric   auto C1 = B.buildFConstant(Ty, C1Val);
20528bcb0991SDimitry Andric   auto CopySign = B.buildFCopysign(Ty, C1, Src);
20530b57cec5SDimitry Andric 
20540b57cec5SDimitry Andric   // TODO: Should this propagate fast-math-flags?
20558bcb0991SDimitry Andric   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
20568bcb0991SDimitry Andric   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
20570b57cec5SDimitry Andric 
20588bcb0991SDimitry Andric   auto C2 = B.buildFConstant(Ty, C2Val);
20598bcb0991SDimitry Andric   auto Fabs = B.buildFAbs(Ty, Src);
20600b57cec5SDimitry Andric 
20618bcb0991SDimitry Andric   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
20628bcb0991SDimitry Andric   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2063e8d8bef9SDimitry Andric   MI.eraseFromParent();
20640b57cec5SDimitry Andric   return true;
20650b57cec5SDimitry Andric }
20660b57cec5SDimitry Andric 
20670b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeFceil(
20680b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
20690b57cec5SDimitry Andric   MachineIRBuilder &B) const {
20700b57cec5SDimitry Andric 
20710b57cec5SDimitry Andric   const LLT S1 = LLT::scalar(1);
20720b57cec5SDimitry Andric   const LLT S64 = LLT::scalar(64);
20730b57cec5SDimitry Andric 
20740b57cec5SDimitry Andric   Register Src = MI.getOperand(1).getReg();
20750b57cec5SDimitry Andric   assert(MRI.getType(Src) == S64);
20760b57cec5SDimitry Andric 
20770b57cec5SDimitry Andric   // result = trunc(src)
20780b57cec5SDimitry Andric   // if (src > 0.0 && src != result)
20790b57cec5SDimitry Andric   //   result += 1.0
20800b57cec5SDimitry Andric 
20815ffd83dbSDimitry Andric   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
20820b57cec5SDimitry Andric 
20830b57cec5SDimitry Andric   const auto Zero = B.buildFConstant(S64, 0.0);
20840b57cec5SDimitry Andric   const auto One = B.buildFConstant(S64, 1.0);
20850b57cec5SDimitry Andric   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
20860b57cec5SDimitry Andric   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
20870b57cec5SDimitry Andric   auto And = B.buildAnd(S1, Lt0, NeTrunc);
20880b57cec5SDimitry Andric   auto Add = B.buildSelect(S64, And, One, Zero);
20890b57cec5SDimitry Andric 
20900b57cec5SDimitry Andric   // TODO: Should this propagate fast-math-flags?
20910b57cec5SDimitry Andric   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
209204eeddc0SDimitry Andric   MI.eraseFromParent();
20930b57cec5SDimitry Andric   return true;
20940b57cec5SDimitry Andric }
20950b57cec5SDimitry Andric 
2096e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeFrem(
2097e8d8bef9SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
2098e8d8bef9SDimitry Andric   MachineIRBuilder &B) const {
2099e8d8bef9SDimitry Andric     Register DstReg = MI.getOperand(0).getReg();
2100e8d8bef9SDimitry Andric     Register Src0Reg = MI.getOperand(1).getReg();
2101e8d8bef9SDimitry Andric     Register Src1Reg = MI.getOperand(2).getReg();
2102e8d8bef9SDimitry Andric     auto Flags = MI.getFlags();
2103e8d8bef9SDimitry Andric     LLT Ty = MRI.getType(DstReg);
2104e8d8bef9SDimitry Andric 
2105e8d8bef9SDimitry Andric     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2106e8d8bef9SDimitry Andric     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2107e8d8bef9SDimitry Andric     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2108e8d8bef9SDimitry Andric     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2109e8d8bef9SDimitry Andric     MI.eraseFromParent();
2110e8d8bef9SDimitry Andric     return true;
2111e8d8bef9SDimitry Andric }
2112e8d8bef9SDimitry Andric 
2113e8d8bef9SDimitry Andric static MachineInstrBuilder extractF64Exponent(Register Hi,
21140b57cec5SDimitry Andric                                               MachineIRBuilder &B) {
21150b57cec5SDimitry Andric   const unsigned FractBits = 52;
21160b57cec5SDimitry Andric   const unsigned ExpBits = 11;
21170b57cec5SDimitry Andric   LLT S32 = LLT::scalar(32);
21180b57cec5SDimitry Andric 
21190b57cec5SDimitry Andric   auto Const0 = B.buildConstant(S32, FractBits - 32);
21200b57cec5SDimitry Andric   auto Const1 = B.buildConstant(S32, ExpBits);
21210b57cec5SDimitry Andric 
21220b57cec5SDimitry Andric   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
2123e8d8bef9SDimitry Andric     .addUse(Hi)
21240b57cec5SDimitry Andric     .addUse(Const0.getReg(0))
21250b57cec5SDimitry Andric     .addUse(Const1.getReg(0));
21260b57cec5SDimitry Andric 
21270b57cec5SDimitry Andric   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
21280b57cec5SDimitry Andric }
21290b57cec5SDimitry Andric 
21300b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
21310b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
21320b57cec5SDimitry Andric   MachineIRBuilder &B) const {
21330b57cec5SDimitry Andric   const LLT S1 = LLT::scalar(1);
21340b57cec5SDimitry Andric   const LLT S32 = LLT::scalar(32);
21350b57cec5SDimitry Andric   const LLT S64 = LLT::scalar(64);
21360b57cec5SDimitry Andric 
21370b57cec5SDimitry Andric   Register Src = MI.getOperand(1).getReg();
21380b57cec5SDimitry Andric   assert(MRI.getType(Src) == S64);
21390b57cec5SDimitry Andric 
21400b57cec5SDimitry Andric   // TODO: Should this use extract since the low half is unused?
21410b57cec5SDimitry Andric   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
21420b57cec5SDimitry Andric   Register Hi = Unmerge.getReg(1);
21430b57cec5SDimitry Andric 
21440b57cec5SDimitry Andric   // Extract the upper half, since this is where we will find the sign and
21450b57cec5SDimitry Andric   // exponent.
21460b57cec5SDimitry Andric   auto Exp = extractF64Exponent(Hi, B);
21470b57cec5SDimitry Andric 
21480b57cec5SDimitry Andric   const unsigned FractBits = 52;
21490b57cec5SDimitry Andric 
21500b57cec5SDimitry Andric   // Extract the sign bit.
21510b57cec5SDimitry Andric   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
21520b57cec5SDimitry Andric   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
21530b57cec5SDimitry Andric 
21540b57cec5SDimitry Andric   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
21550b57cec5SDimitry Andric 
21560b57cec5SDimitry Andric   const auto Zero32 = B.buildConstant(S32, 0);
21570b57cec5SDimitry Andric 
21580b57cec5SDimitry Andric   // Extend back to 64-bits.
2159*bdd1243dSDimitry Andric   auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});
21600b57cec5SDimitry Andric 
21610b57cec5SDimitry Andric   auto Shr = B.buildAShr(S64, FractMask, Exp);
21620b57cec5SDimitry Andric   auto Not = B.buildNot(S64, Shr);
21630b57cec5SDimitry Andric   auto Tmp0 = B.buildAnd(S64, Src, Not);
21640b57cec5SDimitry Andric   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
21650b57cec5SDimitry Andric 
21660b57cec5SDimitry Andric   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
21670b57cec5SDimitry Andric   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
21680b57cec5SDimitry Andric 
21690b57cec5SDimitry Andric   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
21700b57cec5SDimitry Andric   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2171e8d8bef9SDimitry Andric   MI.eraseFromParent();
21720b57cec5SDimitry Andric   return true;
21730b57cec5SDimitry Andric }
21740b57cec5SDimitry Andric 
21750b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeITOFP(
21760b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
21770b57cec5SDimitry Andric   MachineIRBuilder &B, bool Signed) const {
21780b57cec5SDimitry Andric 
21790b57cec5SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
21800b57cec5SDimitry Andric   Register Src = MI.getOperand(1).getReg();
21810b57cec5SDimitry Andric 
21820b57cec5SDimitry Andric   const LLT S64 = LLT::scalar(64);
21830b57cec5SDimitry Andric   const LLT S32 = LLT::scalar(32);
21840b57cec5SDimitry Andric 
2185349cc55cSDimitry Andric   assert(MRI.getType(Src) == S64);
21860b57cec5SDimitry Andric 
21870b57cec5SDimitry Andric   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2188349cc55cSDimitry Andric   auto ThirtyTwo = B.buildConstant(S32, 32);
21890b57cec5SDimitry Andric 
2190349cc55cSDimitry Andric   if (MRI.getType(Dst) == S64) {
2191349cc55cSDimitry Andric     auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2192349cc55cSDimitry Andric                         : B.buildUITOFP(S64, Unmerge.getReg(1));
21930b57cec5SDimitry Andric 
21940b57cec5SDimitry Andric     auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
21950b57cec5SDimitry Andric     auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
21960b57cec5SDimitry Andric                      .addUse(CvtHi.getReg(0))
21970b57cec5SDimitry Andric                      .addUse(ThirtyTwo.getReg(0));
21980b57cec5SDimitry Andric 
21990b57cec5SDimitry Andric     // TODO: Should this propagate fast-math-flags?
22000b57cec5SDimitry Andric     B.buildFAdd(Dst, LdExp, CvtLo);
22010b57cec5SDimitry Andric     MI.eraseFromParent();
22020b57cec5SDimitry Andric     return true;
22030b57cec5SDimitry Andric   }
22040b57cec5SDimitry Andric 
2205349cc55cSDimitry Andric   assert(MRI.getType(Dst) == S32);
2206349cc55cSDimitry Andric 
2207349cc55cSDimitry Andric   auto One = B.buildConstant(S32, 1);
2208349cc55cSDimitry Andric 
2209349cc55cSDimitry Andric   MachineInstrBuilder ShAmt;
2210349cc55cSDimitry Andric   if (Signed) {
2211349cc55cSDimitry Andric     auto ThirtyOne = B.buildConstant(S32, 31);
2212349cc55cSDimitry Andric     auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2213349cc55cSDimitry Andric     auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2214349cc55cSDimitry Andric     auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2215349cc55cSDimitry Andric     auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
2216349cc55cSDimitry Andric                                /*HasSideEffects=*/false)
2217349cc55cSDimitry Andric                   .addUse(Unmerge.getReg(1));
2218349cc55cSDimitry Andric     auto LS2 = B.buildSub(S32, LS, One);
2219349cc55cSDimitry Andric     ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2220349cc55cSDimitry Andric   } else
2221349cc55cSDimitry Andric     ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2222349cc55cSDimitry Andric   auto Norm = B.buildShl(S64, Src, ShAmt);
2223349cc55cSDimitry Andric   auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2224349cc55cSDimitry Andric   auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2225349cc55cSDimitry Andric   auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2226349cc55cSDimitry Andric   auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2227349cc55cSDimitry Andric   auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2228349cc55cSDimitry Andric   B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
2229349cc55cSDimitry Andric                    /*HasSideEffects=*/false)
2230349cc55cSDimitry Andric       .addUse(FVal.getReg(0))
2231349cc55cSDimitry Andric       .addUse(Scale.getReg(0));
2232349cc55cSDimitry Andric   MI.eraseFromParent();
2233349cc55cSDimitry Andric   return true;
2234349cc55cSDimitry Andric }
2235349cc55cSDimitry Andric 
22365ffd83dbSDimitry Andric // TODO: Copied from DAG implementation. Verify logic and document how this
22375ffd83dbSDimitry Andric // actually works.
2238fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2239fe6060f1SDimitry Andric                                         MachineRegisterInfo &MRI,
2240fe6060f1SDimitry Andric                                         MachineIRBuilder &B,
2241fe6060f1SDimitry Andric                                         bool Signed) const {
22425ffd83dbSDimitry Andric 
22435ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
22445ffd83dbSDimitry Andric   Register Src = MI.getOperand(1).getReg();
22455ffd83dbSDimitry Andric 
22465ffd83dbSDimitry Andric   const LLT S64 = LLT::scalar(64);
22475ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
22485ffd83dbSDimitry Andric 
2249fe6060f1SDimitry Andric   const LLT SrcLT = MRI.getType(Src);
2250fe6060f1SDimitry Andric   assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
22515ffd83dbSDimitry Andric 
22525ffd83dbSDimitry Andric   unsigned Flags = MI.getFlags();
22535ffd83dbSDimitry Andric 
2254fe6060f1SDimitry Andric   // The basic idea of converting a floating point number into a pair of 32-bit
2255fe6060f1SDimitry Andric   // integers is illustrated as follows:
2256fe6060f1SDimitry Andric   //
2257fe6060f1SDimitry Andric   //     tf := trunc(val);
2258fe6060f1SDimitry Andric   //    hif := floor(tf * 2^-32);
2259fe6060f1SDimitry Andric   //    lof := tf - hif * 2^32; // lof is always positive due to floor.
2260fe6060f1SDimitry Andric   //     hi := fptoi(hif);
2261fe6060f1SDimitry Andric   //     lo := fptoi(lof);
2262fe6060f1SDimitry Andric   //
2263fe6060f1SDimitry Andric   auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2264fe6060f1SDimitry Andric   MachineInstrBuilder Sign;
2265fe6060f1SDimitry Andric   if (Signed && SrcLT == S32) {
2266fe6060f1SDimitry Andric     // However, a 32-bit floating point number has only 23 bits mantissa and
2267fe6060f1SDimitry Andric     // it's not enough to hold all the significant bits of `lof` if val is
2268fe6060f1SDimitry Andric     // negative. To avoid the loss of precision, We need to take the absolute
2269fe6060f1SDimitry Andric     // value after truncating and flip the result back based on the original
2270fe6060f1SDimitry Andric     // signedness.
2271fe6060f1SDimitry Andric     Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2272fe6060f1SDimitry Andric     Trunc = B.buildFAbs(S32, Trunc, Flags);
2273fe6060f1SDimitry Andric   }
2274fe6060f1SDimitry Andric   MachineInstrBuilder K0, K1;
2275fe6060f1SDimitry Andric   if (SrcLT == S64) {
2276fe6060f1SDimitry Andric     K0 = B.buildFConstant(S64,
2277fe6060f1SDimitry Andric                           BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2278fe6060f1SDimitry Andric     K1 = B.buildFConstant(S64,
2279fe6060f1SDimitry Andric                           BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2280fe6060f1SDimitry Andric   } else {
2281fe6060f1SDimitry Andric     K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
2282fe6060f1SDimitry Andric     K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
2283fe6060f1SDimitry Andric   }
22845ffd83dbSDimitry Andric 
2285fe6060f1SDimitry Andric   auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2286fe6060f1SDimitry Andric   auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2287fe6060f1SDimitry Andric   auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
22885ffd83dbSDimitry Andric 
2289fe6060f1SDimitry Andric   auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2290fe6060f1SDimitry Andric                                      : B.buildFPTOUI(S32, FloorMul);
22915ffd83dbSDimitry Andric   auto Lo = B.buildFPTOUI(S32, Fma);
22925ffd83dbSDimitry Andric 
2293fe6060f1SDimitry Andric   if (Signed && SrcLT == S32) {
2294fe6060f1SDimitry Andric     // Flip the result based on the signedness, which is either all 0s or 1s.
2295*bdd1243dSDimitry Andric     Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
2296fe6060f1SDimitry Andric     // r := xor({lo, hi}, sign) - sign;
2297*bdd1243dSDimitry Andric     B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
2298*bdd1243dSDimitry Andric                Sign);
2299fe6060f1SDimitry Andric   } else
2300*bdd1243dSDimitry Andric     B.buildMergeLikeInstr(Dst, {Lo, Hi});
23015ffd83dbSDimitry Andric   MI.eraseFromParent();
23025ffd83dbSDimitry Andric 
23035ffd83dbSDimitry Andric   return true;
23045ffd83dbSDimitry Andric }
23055ffd83dbSDimitry Andric 
23065ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
23075ffd83dbSDimitry Andric                                                MachineInstr &MI) const {
23085ffd83dbSDimitry Andric   MachineFunction &MF = Helper.MIRBuilder.getMF();
23090b57cec5SDimitry Andric   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
23100b57cec5SDimitry Andric 
23110b57cec5SDimitry Andric   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
23120b57cec5SDimitry Andric                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
23130b57cec5SDimitry Andric 
23140b57cec5SDimitry Andric   // With ieee_mode disabled, the instructions have the correct behavior
23150b57cec5SDimitry Andric   // already for G_FMINNUM/G_FMAXNUM
23160b57cec5SDimitry Andric   if (!MFI->getMode().IEEE)
23170b57cec5SDimitry Andric     return !IsIEEEOp;
23180b57cec5SDimitry Andric 
23190b57cec5SDimitry Andric   if (IsIEEEOp)
23200b57cec5SDimitry Andric     return true;
23210b57cec5SDimitry Andric 
23220b57cec5SDimitry Andric   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
23230b57cec5SDimitry Andric }
23240b57cec5SDimitry Andric 
23250b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
23260b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
23270b57cec5SDimitry Andric   MachineIRBuilder &B) const {
23280b57cec5SDimitry Andric   // TODO: Should move some of this into LegalizerHelper.
23290b57cec5SDimitry Andric 
23300b57cec5SDimitry Andric   // TODO: Promote dynamic indexing of s16 to s32
23315ffd83dbSDimitry Andric 
23325ffd83dbSDimitry Andric   // FIXME: Artifact combiner probably should have replaced the truncated
23335ffd83dbSDimitry Andric   // constant before this, so we shouldn't need
2334349cc55cSDimitry Andric   // getIConstantVRegValWithLookThrough.
2335*bdd1243dSDimitry Andric   std::optional<ValueAndVReg> MaybeIdxVal =
2336349cc55cSDimitry Andric       getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2337e8d8bef9SDimitry Andric   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
23380b57cec5SDimitry Andric     return true;
2339*bdd1243dSDimitry Andric   const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
23400b57cec5SDimitry Andric 
23410b57cec5SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
23420b57cec5SDimitry Andric   Register Vec = MI.getOperand(1).getReg();
23430b57cec5SDimitry Andric 
23440b57cec5SDimitry Andric   LLT VecTy = MRI.getType(Vec);
23450b57cec5SDimitry Andric   LLT EltTy = VecTy.getElementType();
23460b57cec5SDimitry Andric   assert(EltTy == MRI.getType(Dst));
23470b57cec5SDimitry Andric 
234804eeddc0SDimitry Andric   if (IdxVal < VecTy.getNumElements()) {
234904eeddc0SDimitry Andric     auto Unmerge = B.buildUnmerge(EltTy, Vec);
235004eeddc0SDimitry Andric     B.buildCopy(Dst, Unmerge.getReg(IdxVal));
235104eeddc0SDimitry Andric   } else {
23520b57cec5SDimitry Andric     B.buildUndef(Dst);
235304eeddc0SDimitry Andric   }
23540b57cec5SDimitry Andric 
23550b57cec5SDimitry Andric   MI.eraseFromParent();
23560b57cec5SDimitry Andric   return true;
23570b57cec5SDimitry Andric }
23580b57cec5SDimitry Andric 
23590b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
23600b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
23610b57cec5SDimitry Andric   MachineIRBuilder &B) const {
23620b57cec5SDimitry Andric   // TODO: Should move some of this into LegalizerHelper.
23630b57cec5SDimitry Andric 
23640b57cec5SDimitry Andric   // TODO: Promote dynamic indexing of s16 to s32
23655ffd83dbSDimitry Andric 
23665ffd83dbSDimitry Andric   // FIXME: Artifact combiner probably should have replaced the truncated
23675ffd83dbSDimitry Andric   // constant before this, so we shouldn't need
2368349cc55cSDimitry Andric   // getIConstantVRegValWithLookThrough.
2369*bdd1243dSDimitry Andric   std::optional<ValueAndVReg> MaybeIdxVal =
2370349cc55cSDimitry Andric       getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2371e8d8bef9SDimitry Andric   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
23720b57cec5SDimitry Andric     return true;
23730b57cec5SDimitry Andric 
2374*bdd1243dSDimitry Andric   const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
23750b57cec5SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
23760b57cec5SDimitry Andric   Register Vec = MI.getOperand(1).getReg();
23770b57cec5SDimitry Andric   Register Ins = MI.getOperand(2).getReg();
23780b57cec5SDimitry Andric 
23790b57cec5SDimitry Andric   LLT VecTy = MRI.getType(Vec);
23800b57cec5SDimitry Andric   LLT EltTy = VecTy.getElementType();
23810b57cec5SDimitry Andric   assert(EltTy == MRI.getType(Ins));
238204eeddc0SDimitry Andric   (void)Ins;
23830b57cec5SDimitry Andric 
238404eeddc0SDimitry Andric   unsigned NumElts = VecTy.getNumElements();
238504eeddc0SDimitry Andric   if (IdxVal < NumElts) {
238604eeddc0SDimitry Andric     SmallVector<Register, 8> SrcRegs;
238704eeddc0SDimitry Andric     for (unsigned i = 0; i < NumElts; ++i)
238804eeddc0SDimitry Andric       SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
238904eeddc0SDimitry Andric     B.buildUnmerge(SrcRegs, Vec);
239004eeddc0SDimitry Andric 
239104eeddc0SDimitry Andric     SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2392*bdd1243dSDimitry Andric     B.buildMergeLikeInstr(Dst, SrcRegs);
239304eeddc0SDimitry Andric   } else {
23940b57cec5SDimitry Andric     B.buildUndef(Dst);
239504eeddc0SDimitry Andric   }
23960b57cec5SDimitry Andric 
23970b57cec5SDimitry Andric   MI.eraseFromParent();
23980b57cec5SDimitry Andric   return true;
23990b57cec5SDimitry Andric }
24000b57cec5SDimitry Andric 
24018bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeSinCos(
24028bcb0991SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
24038bcb0991SDimitry Andric   MachineIRBuilder &B) const {
24048bcb0991SDimitry Andric 
24058bcb0991SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
24068bcb0991SDimitry Andric   Register SrcReg = MI.getOperand(1).getReg();
24078bcb0991SDimitry Andric   LLT Ty = MRI.getType(DstReg);
24088bcb0991SDimitry Andric   unsigned Flags = MI.getFlags();
24098bcb0991SDimitry Andric 
24108bcb0991SDimitry Andric   Register TrigVal;
24115ffd83dbSDimitry Andric   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
24128bcb0991SDimitry Andric   if (ST.hasTrigReducedRange()) {
24138bcb0991SDimitry Andric     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
24148bcb0991SDimitry Andric     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
24158bcb0991SDimitry Andric       .addUse(MulVal.getReg(0))
24168bcb0991SDimitry Andric       .setMIFlags(Flags).getReg(0);
24178bcb0991SDimitry Andric   } else
24188bcb0991SDimitry Andric     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
24198bcb0991SDimitry Andric 
24208bcb0991SDimitry Andric   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
24218bcb0991SDimitry Andric     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2422*bdd1243dSDimitry Andric   B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg), false)
24238bcb0991SDimitry Andric       .addUse(TrigVal)
24248bcb0991SDimitry Andric       .setMIFlags(Flags);
24258bcb0991SDimitry Andric   MI.eraseFromParent();
24268bcb0991SDimitry Andric   return true;
24278bcb0991SDimitry Andric }
24288bcb0991SDimitry Andric 
24295ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
24305ffd83dbSDimitry Andric                                                   MachineIRBuilder &B,
24315ffd83dbSDimitry Andric                                                   const GlobalValue *GV,
24325ffd83dbSDimitry Andric                                                   int64_t Offset,
24335ffd83dbSDimitry Andric                                                   unsigned GAFlags) const {
24345ffd83dbSDimitry Andric   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
24358bcb0991SDimitry Andric   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
24368bcb0991SDimitry Andric   // to the following code sequence:
24378bcb0991SDimitry Andric   //
24388bcb0991SDimitry Andric   // For constant address space:
24398bcb0991SDimitry Andric   //   s_getpc_b64 s[0:1]
24408bcb0991SDimitry Andric   //   s_add_u32 s0, s0, $symbol
24418bcb0991SDimitry Andric   //   s_addc_u32 s1, s1, 0
24428bcb0991SDimitry Andric   //
24438bcb0991SDimitry Andric   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
24448bcb0991SDimitry Andric   //   a fixup or relocation is emitted to replace $symbol with a literal
24458bcb0991SDimitry Andric   //   constant, which is a pc-relative offset from the encoding of the $symbol
24468bcb0991SDimitry Andric   //   operand to the global variable.
24478bcb0991SDimitry Andric   //
24488bcb0991SDimitry Andric   // For global address space:
24498bcb0991SDimitry Andric   //   s_getpc_b64 s[0:1]
24508bcb0991SDimitry Andric   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
24518bcb0991SDimitry Andric   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
24528bcb0991SDimitry Andric   //
24538bcb0991SDimitry Andric   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
24548bcb0991SDimitry Andric   //   fixups or relocations are emitted to replace $symbol@*@lo and
24558bcb0991SDimitry Andric   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
24568bcb0991SDimitry Andric   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
24578bcb0991SDimitry Andric   //   operand to the global variable.
24588bcb0991SDimitry Andric   //
24598bcb0991SDimitry Andric   // What we want here is an offset from the value returned by s_getpc
24608bcb0991SDimitry Andric   // (which is the address of the s_add_u32 instruction) to the global
24618bcb0991SDimitry Andric   // variable, but since the encoding of $symbol starts 4 bytes after the start
24628bcb0991SDimitry Andric   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
24638bcb0991SDimitry Andric   // small. This requires us to add 4 to the global variable offset in order to
2464e8d8bef9SDimitry Andric   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2465e8d8bef9SDimitry Andric   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2466e8d8bef9SDimitry Andric   // instruction.
24678bcb0991SDimitry Andric 
24688bcb0991SDimitry Andric   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
24698bcb0991SDimitry Andric 
24708bcb0991SDimitry Andric   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
24718bcb0991SDimitry Andric     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
24728bcb0991SDimitry Andric 
24738bcb0991SDimitry Andric   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
24748bcb0991SDimitry Andric     .addDef(PCReg);
24758bcb0991SDimitry Andric 
24768bcb0991SDimitry Andric   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
24778bcb0991SDimitry Andric   if (GAFlags == SIInstrInfo::MO_NONE)
24788bcb0991SDimitry Andric     MIB.addImm(0);
24798bcb0991SDimitry Andric   else
2480e8d8bef9SDimitry Andric     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
24818bcb0991SDimitry Andric 
24828bcb0991SDimitry Andric   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
24838bcb0991SDimitry Andric 
24848bcb0991SDimitry Andric   if (PtrTy.getSizeInBits() == 32)
24858bcb0991SDimitry Andric     B.buildExtract(DstReg, PCReg, 0);
24868bcb0991SDimitry Andric   return true;
24878bcb0991SDimitry Andric  }
24888bcb0991SDimitry Andric 
24898bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeGlobalValue(
24908bcb0991SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
24918bcb0991SDimitry Andric   MachineIRBuilder &B) const {
24928bcb0991SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
24938bcb0991SDimitry Andric   LLT Ty = MRI.getType(DstReg);
24948bcb0991SDimitry Andric   unsigned AS = Ty.getAddressSpace();
24958bcb0991SDimitry Andric 
24968bcb0991SDimitry Andric   const GlobalValue *GV = MI.getOperand(1).getGlobal();
24978bcb0991SDimitry Andric   MachineFunction &MF = B.getMF();
24988bcb0991SDimitry Andric   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
24998bcb0991SDimitry Andric 
25008bcb0991SDimitry Andric   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2501fe6060f1SDimitry Andric     if (!MFI->isModuleEntryFunction() &&
2502fe6060f1SDimitry Andric         !GV->getName().equals("llvm.amdgcn.module.lds")) {
25038bcb0991SDimitry Andric       const Function &Fn = MF.getFunction();
25048bcb0991SDimitry Andric       DiagnosticInfoUnsupported BadLDSDecl(
25055ffd83dbSDimitry Andric         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
25065ffd83dbSDimitry Andric         DS_Warning);
25078bcb0991SDimitry Andric       Fn.getContext().diagnose(BadLDSDecl);
25085ffd83dbSDimitry Andric 
25095ffd83dbSDimitry Andric       // We currently don't have a way to correctly allocate LDS objects that
25105ffd83dbSDimitry Andric       // aren't directly associated with a kernel. We do force inlining of
25115ffd83dbSDimitry Andric       // functions that use local objects. However, if these dead functions are
25125ffd83dbSDimitry Andric       // not eliminated, we don't want a compile time error. Just emit a warning
25135ffd83dbSDimitry Andric       // and a trap, since there should be no callable path here.
25145ffd83dbSDimitry Andric       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
25155ffd83dbSDimitry Andric       B.buildUndef(DstReg);
25165ffd83dbSDimitry Andric       MI.eraseFromParent();
25175ffd83dbSDimitry Andric       return true;
25188bcb0991SDimitry Andric     }
25198bcb0991SDimitry Andric 
25208bcb0991SDimitry Andric     // TODO: We could emit code to handle the initialization somewhere.
2521349cc55cSDimitry Andric     // We ignore the initializer for now and legalize it to allow selection.
2522349cc55cSDimitry Andric     // The initializer will anyway get errored out during assembly emission.
25235ffd83dbSDimitry Andric     const SITargetLowering *TLI = ST.getTargetLowering();
25245ffd83dbSDimitry Andric     if (!TLI->shouldUseLDSConstAddress(GV)) {
25255ffd83dbSDimitry Andric       MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
25265ffd83dbSDimitry Andric       return true; // Leave in place;
25275ffd83dbSDimitry Andric     }
25285ffd83dbSDimitry Andric 
2529e8d8bef9SDimitry Andric     if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2530e8d8bef9SDimitry Andric       Type *Ty = GV->getValueType();
2531e8d8bef9SDimitry Andric       // HIP uses an unsized array `extern __shared__ T s[]` or similar
2532e8d8bef9SDimitry Andric       // zero-sized type in other languages to declare the dynamic shared
2533e8d8bef9SDimitry Andric       // memory which size is not known at the compile time. They will be
2534e8d8bef9SDimitry Andric       // allocated by the runtime and placed directly after the static
2535e8d8bef9SDimitry Andric       // allocated ones. They all share the same offset.
2536e8d8bef9SDimitry Andric       if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2537e8d8bef9SDimitry Andric         // Adjust alignment for that dynamic shared memory array.
2538e8d8bef9SDimitry Andric         MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2539e8d8bef9SDimitry Andric         LLT S32 = LLT::scalar(32);
2540e8d8bef9SDimitry Andric         auto Sz =
2541e8d8bef9SDimitry Andric             B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2542e8d8bef9SDimitry Andric         B.buildIntToPtr(DstReg, Sz);
2543e8d8bef9SDimitry Andric         MI.eraseFromParent();
2544e8d8bef9SDimitry Andric         return true;
2545e8d8bef9SDimitry Andric       }
2546e8d8bef9SDimitry Andric     }
2547e8d8bef9SDimitry Andric 
2548349cc55cSDimitry Andric     B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
2549349cc55cSDimitry Andric                                                    *cast<GlobalVariable>(GV)));
25508bcb0991SDimitry Andric     MI.eraseFromParent();
25518bcb0991SDimitry Andric     return true;
25528bcb0991SDimitry Andric   }
25538bcb0991SDimitry Andric 
25548bcb0991SDimitry Andric   const SITargetLowering *TLI = ST.getTargetLowering();
25558bcb0991SDimitry Andric 
25568bcb0991SDimitry Andric   if (TLI->shouldEmitFixup(GV)) {
25578bcb0991SDimitry Andric     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
25588bcb0991SDimitry Andric     MI.eraseFromParent();
25598bcb0991SDimitry Andric     return true;
25608bcb0991SDimitry Andric   }
25618bcb0991SDimitry Andric 
25628bcb0991SDimitry Andric   if (TLI->shouldEmitPCReloc(GV)) {
25638bcb0991SDimitry Andric     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
25648bcb0991SDimitry Andric     MI.eraseFromParent();
25658bcb0991SDimitry Andric     return true;
25668bcb0991SDimitry Andric   }
25678bcb0991SDimitry Andric 
25688bcb0991SDimitry Andric   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
25698bcb0991SDimitry Andric   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
25708bcb0991SDimitry Andric 
2571fe6060f1SDimitry Andric   LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
25728bcb0991SDimitry Andric   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
25738bcb0991SDimitry Andric       MachinePointerInfo::getGOT(MF),
25748bcb0991SDimitry Andric       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
25758bcb0991SDimitry Andric           MachineMemOperand::MOInvariant,
2576fe6060f1SDimitry Andric       LoadTy, Align(8));
25778bcb0991SDimitry Andric 
25788bcb0991SDimitry Andric   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
25798bcb0991SDimitry Andric 
25808bcb0991SDimitry Andric   if (Ty.getSizeInBits() == 32) {
2581349cc55cSDimitry Andric     // Truncate if this is a 32-bit constant address.
25828bcb0991SDimitry Andric     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
25838bcb0991SDimitry Andric     B.buildExtract(DstReg, Load, 0);
25848bcb0991SDimitry Andric   } else
25858bcb0991SDimitry Andric     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
25868bcb0991SDimitry Andric 
25878bcb0991SDimitry Andric   MI.eraseFromParent();
25888bcb0991SDimitry Andric   return true;
25898bcb0991SDimitry Andric }
25908bcb0991SDimitry Andric 
2591e8d8bef9SDimitry Andric static LLT widenToNextPowerOf2(LLT Ty) {
2592e8d8bef9SDimitry Andric   if (Ty.isVector())
2593fe6060f1SDimitry Andric     return Ty.changeElementCount(
2594fe6060f1SDimitry Andric         ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2595e8d8bef9SDimitry Andric   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2596e8d8bef9SDimitry Andric }
2597e8d8bef9SDimitry Andric 
2598e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2599e8d8bef9SDimitry Andric                                        MachineInstr &MI) const {
2600e8d8bef9SDimitry Andric   MachineIRBuilder &B = Helper.MIRBuilder;
2601e8d8bef9SDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
2602e8d8bef9SDimitry Andric   GISelChangeObserver &Observer = Helper.Observer;
2603e8d8bef9SDimitry Andric 
2604e8d8bef9SDimitry Andric   Register PtrReg = MI.getOperand(1).getReg();
2605e8d8bef9SDimitry Andric   LLT PtrTy = MRI.getType(PtrReg);
2606e8d8bef9SDimitry Andric   unsigned AddrSpace = PtrTy.getAddressSpace();
2607e8d8bef9SDimitry Andric 
2608e8d8bef9SDimitry Andric   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
26098bcb0991SDimitry Andric     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2610e8d8bef9SDimitry Andric     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
26118bcb0991SDimitry Andric     Observer.changingInstr(MI);
26128bcb0991SDimitry Andric     MI.getOperand(1).setReg(Cast.getReg(0));
26138bcb0991SDimitry Andric     Observer.changedInstr(MI);
26148bcb0991SDimitry Andric     return true;
26158bcb0991SDimitry Andric   }
26168bcb0991SDimitry Andric 
2617fe6060f1SDimitry Andric   if (MI.getOpcode() != AMDGPU::G_LOAD)
2618fe6060f1SDimitry Andric     return false;
2619fe6060f1SDimitry Andric 
2620e8d8bef9SDimitry Andric   Register ValReg = MI.getOperand(0).getReg();
2621e8d8bef9SDimitry Andric   LLT ValTy = MRI.getType(ValReg);
2622e8d8bef9SDimitry Andric 
2623e8d8bef9SDimitry Andric   MachineMemOperand *MMO = *MI.memoperands_begin();
2624e8d8bef9SDimitry Andric   const unsigned ValSize = ValTy.getSizeInBits();
2625fe6060f1SDimitry Andric   const LLT MemTy = MMO->getMemoryType();
2626e8d8bef9SDimitry Andric   const Align MemAlign = MMO->getAlign();
2627fe6060f1SDimitry Andric   const unsigned MemSize = MemTy.getSizeInBits();
262804eeddc0SDimitry Andric   const uint64_t AlignInBits = 8 * MemAlign.value();
2629e8d8bef9SDimitry Andric 
2630e8d8bef9SDimitry Andric   // Widen non-power-of-2 loads to the alignment if needed
2631fe6060f1SDimitry Andric   if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
2632e8d8bef9SDimitry Andric     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2633e8d8bef9SDimitry Andric 
2634e8d8bef9SDimitry Andric     // This was already the correct extending load result type, so just adjust
2635e8d8bef9SDimitry Andric     // the memory type.
2636e8d8bef9SDimitry Andric     if (WideMemSize == ValSize) {
2637e8d8bef9SDimitry Andric       MachineFunction &MF = B.getMF();
2638e8d8bef9SDimitry Andric 
2639e8d8bef9SDimitry Andric       MachineMemOperand *WideMMO =
2640e8d8bef9SDimitry Andric           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2641e8d8bef9SDimitry Andric       Observer.changingInstr(MI);
2642e8d8bef9SDimitry Andric       MI.setMemRefs(MF, {WideMMO});
2643e8d8bef9SDimitry Andric       Observer.changedInstr(MI);
2644e8d8bef9SDimitry Andric       return true;
2645e8d8bef9SDimitry Andric     }
2646e8d8bef9SDimitry Andric 
2647e8d8bef9SDimitry Andric     // Don't bother handling edge case that should probably never be produced.
2648e8d8bef9SDimitry Andric     if (ValSize > WideMemSize)
2649e8d8bef9SDimitry Andric       return false;
2650e8d8bef9SDimitry Andric 
2651e8d8bef9SDimitry Andric     LLT WideTy = widenToNextPowerOf2(ValTy);
2652e8d8bef9SDimitry Andric 
2653e8d8bef9SDimitry Andric     Register WideLoad;
2654e8d8bef9SDimitry Andric     if (!WideTy.isVector()) {
2655e8d8bef9SDimitry Andric       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2656e8d8bef9SDimitry Andric       B.buildTrunc(ValReg, WideLoad).getReg(0);
2657e8d8bef9SDimitry Andric     } else {
2658e8d8bef9SDimitry Andric       // Extract the subvector.
2659e8d8bef9SDimitry Andric 
2660e8d8bef9SDimitry Andric       if (isRegisterType(ValTy)) {
2661e8d8bef9SDimitry Andric         // If this a case where G_EXTRACT is legal, use it.
2662e8d8bef9SDimitry Andric         // (e.g. <3 x s32> -> <4 x s32>)
2663e8d8bef9SDimitry Andric         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2664e8d8bef9SDimitry Andric         B.buildExtract(ValReg, WideLoad, 0);
2665e8d8bef9SDimitry Andric       } else {
2666e8d8bef9SDimitry Andric         // For cases where the widened type isn't a nice register value, unmerge
2667e8d8bef9SDimitry Andric         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
26680eae32dcSDimitry Andric         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
26690eae32dcSDimitry Andric         B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
2670e8d8bef9SDimitry Andric       }
2671e8d8bef9SDimitry Andric     }
2672e8d8bef9SDimitry Andric 
2673e8d8bef9SDimitry Andric     MI.eraseFromParent();
2674e8d8bef9SDimitry Andric     return true;
2675e8d8bef9SDimitry Andric   }
2676e8d8bef9SDimitry Andric 
2677e8d8bef9SDimitry Andric   return false;
2678e8d8bef9SDimitry Andric }
2679e8d8bef9SDimitry Andric 
26808bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFMad(
26818bcb0991SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
26828bcb0991SDimitry Andric   MachineIRBuilder &B) const {
26838bcb0991SDimitry Andric   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
26848bcb0991SDimitry Andric   assert(Ty.isScalar());
26858bcb0991SDimitry Andric 
2686480093f4SDimitry Andric   MachineFunction &MF = B.getMF();
2687480093f4SDimitry Andric   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2688480093f4SDimitry Andric 
26898bcb0991SDimitry Andric   // TODO: Always legal with future ftz flag.
26905ffd83dbSDimitry Andric   // FIXME: Do we need just output?
26915ffd83dbSDimitry Andric   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
26928bcb0991SDimitry Andric     return true;
26935ffd83dbSDimitry Andric   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
26948bcb0991SDimitry Andric     return true;
26958bcb0991SDimitry Andric 
26968bcb0991SDimitry Andric   MachineIRBuilder HelperBuilder(MI);
26978bcb0991SDimitry Andric   GISelObserverWrapper DummyObserver;
26988bcb0991SDimitry Andric   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
26998bcb0991SDimitry Andric   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
27008bcb0991SDimitry Andric }
27018bcb0991SDimitry Andric 
2702480093f4SDimitry Andric bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2703480093f4SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2704480093f4SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
2705480093f4SDimitry Andric   Register PtrReg = MI.getOperand(1).getReg();
2706480093f4SDimitry Andric   Register CmpVal = MI.getOperand(2).getReg();
2707480093f4SDimitry Andric   Register NewVal = MI.getOperand(3).getReg();
2708480093f4SDimitry Andric 
2709e8d8bef9SDimitry Andric   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2710480093f4SDimitry Andric          "this should not have been custom lowered");
2711480093f4SDimitry Andric 
2712480093f4SDimitry Andric   LLT ValTy = MRI.getType(CmpVal);
2713fe6060f1SDimitry Andric   LLT VecTy = LLT::fixed_vector(2, ValTy);
2714480093f4SDimitry Andric 
2715480093f4SDimitry Andric   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2716480093f4SDimitry Andric 
2717480093f4SDimitry Andric   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2718480093f4SDimitry Andric     .addDef(DstReg)
2719480093f4SDimitry Andric     .addUse(PtrReg)
2720480093f4SDimitry Andric     .addUse(PackedVal)
2721480093f4SDimitry Andric     .setMemRefs(MI.memoperands());
2722480093f4SDimitry Andric 
2723480093f4SDimitry Andric   MI.eraseFromParent();
2724480093f4SDimitry Andric   return true;
2725480093f4SDimitry Andric }
2726480093f4SDimitry Andric 
27275ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeFlog(
27285ffd83dbSDimitry Andric   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
27295ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
27305ffd83dbSDimitry Andric   Register Src = MI.getOperand(1).getReg();
27315ffd83dbSDimitry Andric   LLT Ty = B.getMRI()->getType(Dst);
27325ffd83dbSDimitry Andric   unsigned Flags = MI.getFlags();
27335ffd83dbSDimitry Andric 
27345ffd83dbSDimitry Andric   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
27355ffd83dbSDimitry Andric   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
27365ffd83dbSDimitry Andric 
27375ffd83dbSDimitry Andric   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
27385ffd83dbSDimitry Andric   MI.eraseFromParent();
27395ffd83dbSDimitry Andric   return true;
27405ffd83dbSDimitry Andric }
27415ffd83dbSDimitry Andric 
27425ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
27435ffd83dbSDimitry Andric                                        MachineIRBuilder &B) const {
27445ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
27455ffd83dbSDimitry Andric   Register Src = MI.getOperand(1).getReg();
27465ffd83dbSDimitry Andric   unsigned Flags = MI.getFlags();
27475ffd83dbSDimitry Andric   LLT Ty = B.getMRI()->getType(Dst);
27485ffd83dbSDimitry Andric 
27495ffd83dbSDimitry Andric   auto K = B.buildFConstant(Ty, numbers::log2e);
27505ffd83dbSDimitry Andric   auto Mul = B.buildFMul(Ty, Src, K, Flags);
27515ffd83dbSDimitry Andric   B.buildFExp2(Dst, Mul, Flags);
27525ffd83dbSDimitry Andric   MI.eraseFromParent();
27535ffd83dbSDimitry Andric   return true;
27545ffd83dbSDimitry Andric }
27555ffd83dbSDimitry Andric 
27565ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
27575ffd83dbSDimitry Andric                                        MachineIRBuilder &B) const {
27585ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
27595ffd83dbSDimitry Andric   Register Src0 = MI.getOperand(1).getReg();
27605ffd83dbSDimitry Andric   Register Src1 = MI.getOperand(2).getReg();
27615ffd83dbSDimitry Andric   unsigned Flags = MI.getFlags();
27625ffd83dbSDimitry Andric   LLT Ty = B.getMRI()->getType(Dst);
27635ffd83dbSDimitry Andric   const LLT S16 = LLT::scalar(16);
27645ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
27655ffd83dbSDimitry Andric 
27665ffd83dbSDimitry Andric   if (Ty == S32) {
27675ffd83dbSDimitry Andric     auto Log = B.buildFLog2(S32, Src0, Flags);
27685ffd83dbSDimitry Andric     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
27695ffd83dbSDimitry Andric       .addUse(Log.getReg(0))
27705ffd83dbSDimitry Andric       .addUse(Src1)
27715ffd83dbSDimitry Andric       .setMIFlags(Flags);
27725ffd83dbSDimitry Andric     B.buildFExp2(Dst, Mul, Flags);
27735ffd83dbSDimitry Andric   } else if (Ty == S16) {
27745ffd83dbSDimitry Andric     // There's no f16 fmul_legacy, so we need to convert for it.
27755ffd83dbSDimitry Andric     auto Log = B.buildFLog2(S16, Src0, Flags);
27765ffd83dbSDimitry Andric     auto Ext0 = B.buildFPExt(S32, Log, Flags);
27775ffd83dbSDimitry Andric     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
27785ffd83dbSDimitry Andric     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
27795ffd83dbSDimitry Andric       .addUse(Ext0.getReg(0))
27805ffd83dbSDimitry Andric       .addUse(Ext1.getReg(0))
27815ffd83dbSDimitry Andric       .setMIFlags(Flags);
27825ffd83dbSDimitry Andric 
27835ffd83dbSDimitry Andric     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
27845ffd83dbSDimitry Andric   } else
27855ffd83dbSDimitry Andric     return false;
27865ffd83dbSDimitry Andric 
27875ffd83dbSDimitry Andric   MI.eraseFromParent();
27885ffd83dbSDimitry Andric   return true;
27895ffd83dbSDimitry Andric }
27905ffd83dbSDimitry Andric 
27915ffd83dbSDimitry Andric // Find a source register, ignoring any possible source modifiers.
27925ffd83dbSDimitry Andric static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
27935ffd83dbSDimitry Andric   Register ModSrc = OrigSrc;
27945ffd83dbSDimitry Andric   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
27955ffd83dbSDimitry Andric     ModSrc = SrcFNeg->getOperand(1).getReg();
27965ffd83dbSDimitry Andric     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
27975ffd83dbSDimitry Andric       ModSrc = SrcFAbs->getOperand(1).getReg();
27985ffd83dbSDimitry Andric   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
27995ffd83dbSDimitry Andric     ModSrc = SrcFAbs->getOperand(1).getReg();
28005ffd83dbSDimitry Andric   return ModSrc;
28015ffd83dbSDimitry Andric }
28025ffd83dbSDimitry Andric 
28035ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
28045ffd83dbSDimitry Andric                                          MachineRegisterInfo &MRI,
28055ffd83dbSDimitry Andric                                          MachineIRBuilder &B) const {
28065ffd83dbSDimitry Andric 
28075ffd83dbSDimitry Andric   const LLT S1 = LLT::scalar(1);
28085ffd83dbSDimitry Andric   const LLT S64 = LLT::scalar(64);
28095ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
28105ffd83dbSDimitry Andric   Register OrigSrc = MI.getOperand(1).getReg();
28115ffd83dbSDimitry Andric   unsigned Flags = MI.getFlags();
28125ffd83dbSDimitry Andric   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
28135ffd83dbSDimitry Andric          "this should not have been custom lowered");
28145ffd83dbSDimitry Andric 
28155ffd83dbSDimitry Andric   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
28165ffd83dbSDimitry Andric   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
28175ffd83dbSDimitry Andric   // efficient way to implement it is using V_FRACT_F64. The workaround for the
28185ffd83dbSDimitry Andric   // V_FRACT bug is:
28195ffd83dbSDimitry Andric   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
28205ffd83dbSDimitry Andric   //
28215ffd83dbSDimitry Andric   // Convert floor(x) to (x - fract(x))
28225ffd83dbSDimitry Andric 
28235ffd83dbSDimitry Andric   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
28245ffd83dbSDimitry Andric     .addUse(OrigSrc)
28255ffd83dbSDimitry Andric     .setMIFlags(Flags);
28265ffd83dbSDimitry Andric 
28275ffd83dbSDimitry Andric   // Give source modifier matching some assistance before obscuring a foldable
28285ffd83dbSDimitry Andric   // pattern.
28295ffd83dbSDimitry Andric 
28305ffd83dbSDimitry Andric   // TODO: We can avoid the neg on the fract? The input sign to fract
28315ffd83dbSDimitry Andric   // shouldn't matter?
28325ffd83dbSDimitry Andric   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
28335ffd83dbSDimitry Andric 
28345ffd83dbSDimitry Andric   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
28355ffd83dbSDimitry Andric 
28365ffd83dbSDimitry Andric   Register Min = MRI.createGenericVirtualRegister(S64);
28375ffd83dbSDimitry Andric 
28385ffd83dbSDimitry Andric   // We don't need to concern ourselves with the snan handling difference, so
28395ffd83dbSDimitry Andric   // use the one which will directly select.
28405ffd83dbSDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
28415ffd83dbSDimitry Andric   if (MFI->getMode().IEEE)
28425ffd83dbSDimitry Andric     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
28435ffd83dbSDimitry Andric   else
28445ffd83dbSDimitry Andric     B.buildFMinNum(Min, Fract, Const, Flags);
28455ffd83dbSDimitry Andric 
28465ffd83dbSDimitry Andric   Register CorrectedFract = Min;
28475ffd83dbSDimitry Andric   if (!MI.getFlag(MachineInstr::FmNoNans)) {
28485ffd83dbSDimitry Andric     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
28495ffd83dbSDimitry Andric     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
28505ffd83dbSDimitry Andric   }
28515ffd83dbSDimitry Andric 
28525ffd83dbSDimitry Andric   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
28535ffd83dbSDimitry Andric   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
28545ffd83dbSDimitry Andric 
28555ffd83dbSDimitry Andric   MI.eraseFromParent();
28565ffd83dbSDimitry Andric   return true;
28575ffd83dbSDimitry Andric }
28585ffd83dbSDimitry Andric 
28595ffd83dbSDimitry Andric // Turn an illegal packed v2s16 build vector into bit operations.
28605ffd83dbSDimitry Andric // TODO: This should probably be a bitcast action in LegalizerHelper.
28615ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeBuildVector(
28625ffd83dbSDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
28635ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
28645ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
2865*bdd1243dSDimitry Andric   const LLT S16 = LLT::scalar(16);
2866fe6060f1SDimitry Andric   assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
28675ffd83dbSDimitry Andric 
28685ffd83dbSDimitry Andric   Register Src0 = MI.getOperand(1).getReg();
28695ffd83dbSDimitry Andric   Register Src1 = MI.getOperand(2).getReg();
28705ffd83dbSDimitry Andric 
2871*bdd1243dSDimitry Andric   if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
2872*bdd1243dSDimitry Andric     assert(MRI.getType(Src0) == S32);
2873*bdd1243dSDimitry Andric     Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
2874*bdd1243dSDimitry Andric     Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
2875*bdd1243dSDimitry Andric   }
2876*bdd1243dSDimitry Andric 
2877*bdd1243dSDimitry Andric   auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
28785ffd83dbSDimitry Andric   B.buildBitcast(Dst, Merge);
28795ffd83dbSDimitry Andric 
28805ffd83dbSDimitry Andric   MI.eraseFromParent();
28815ffd83dbSDimitry Andric   return true;
28825ffd83dbSDimitry Andric }
28835ffd83dbSDimitry Andric 
288481ad6265SDimitry Andric // Build a big integer multiply or multiply-add using MAD_64_32 instructions.
288581ad6265SDimitry Andric //
288681ad6265SDimitry Andric // Source and accumulation registers must all be 32-bits.
288781ad6265SDimitry Andric //
288881ad6265SDimitry Andric // TODO: When the multiply is uniform, we should produce a code sequence
288981ad6265SDimitry Andric // that is better suited to instruction selection on the SALU. Instead of
289081ad6265SDimitry Andric // the outer loop going over parts of the result, the outer loop should go
289181ad6265SDimitry Andric // over parts of one of the factors. This should result in instruction
289281ad6265SDimitry Andric // selection that makes full use of S_ADDC_U32 instructions.
289381ad6265SDimitry Andric void AMDGPULegalizerInfo::buildMultiply(
289481ad6265SDimitry Andric     LegalizerHelper &Helper, MutableArrayRef<Register> Accum,
289581ad6265SDimitry Andric     ArrayRef<Register> Src0, ArrayRef<Register> Src1,
289681ad6265SDimitry Andric     bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const {
289781ad6265SDimitry Andric   // Use (possibly empty) vectors of S1 registers to represent the set of
289881ad6265SDimitry Andric   // carries from one pair of positions to the next.
289981ad6265SDimitry Andric   using Carry = SmallVector<Register, 2>;
290081ad6265SDimitry Andric 
290181ad6265SDimitry Andric   MachineIRBuilder &B = Helper.MIRBuilder;
290281ad6265SDimitry Andric 
290381ad6265SDimitry Andric   const LLT S1 = LLT::scalar(1);
290481ad6265SDimitry Andric   const LLT S32 = LLT::scalar(32);
290581ad6265SDimitry Andric   const LLT S64 = LLT::scalar(64);
290681ad6265SDimitry Andric 
290781ad6265SDimitry Andric   Register Zero32;
290881ad6265SDimitry Andric   Register Zero64;
290981ad6265SDimitry Andric 
291081ad6265SDimitry Andric   auto getZero32 = [&]() -> Register {
291181ad6265SDimitry Andric     if (!Zero32)
291281ad6265SDimitry Andric       Zero32 = B.buildConstant(S32, 0).getReg(0);
291381ad6265SDimitry Andric     return Zero32;
291481ad6265SDimitry Andric   };
291581ad6265SDimitry Andric   auto getZero64 = [&]() -> Register {
291681ad6265SDimitry Andric     if (!Zero64)
291781ad6265SDimitry Andric       Zero64 = B.buildConstant(S64, 0).getReg(0);
291881ad6265SDimitry Andric     return Zero64;
291981ad6265SDimitry Andric   };
292081ad6265SDimitry Andric 
292181ad6265SDimitry Andric   // Merge the given carries into the 32-bit LocalAccum, which is modified
292281ad6265SDimitry Andric   // in-place.
292381ad6265SDimitry Andric   //
292481ad6265SDimitry Andric   // Returns the carry-out, which is a single S1 register or null.
292581ad6265SDimitry Andric   auto mergeCarry =
292681ad6265SDimitry Andric       [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
292781ad6265SDimitry Andric         if (CarryIn.empty())
292881ad6265SDimitry Andric           return Register();
292981ad6265SDimitry Andric 
293081ad6265SDimitry Andric         bool HaveCarryOut = true;
293181ad6265SDimitry Andric         Register CarryAccum;
293281ad6265SDimitry Andric         if (CarryIn.size() == 1) {
293381ad6265SDimitry Andric           if (!LocalAccum) {
293481ad6265SDimitry Andric             LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
293581ad6265SDimitry Andric             return Register();
293681ad6265SDimitry Andric           }
293781ad6265SDimitry Andric 
293881ad6265SDimitry Andric           CarryAccum = getZero32();
293981ad6265SDimitry Andric         } else {
294081ad6265SDimitry Andric           CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
294181ad6265SDimitry Andric           for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
294281ad6265SDimitry Andric             CarryAccum =
294381ad6265SDimitry Andric                 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
294481ad6265SDimitry Andric                     .getReg(0);
294581ad6265SDimitry Andric           }
294681ad6265SDimitry Andric 
294781ad6265SDimitry Andric           if (!LocalAccum) {
294881ad6265SDimitry Andric             LocalAccum = getZero32();
294981ad6265SDimitry Andric             HaveCarryOut = false;
295081ad6265SDimitry Andric           }
295181ad6265SDimitry Andric         }
295281ad6265SDimitry Andric 
295381ad6265SDimitry Andric         auto Add =
295481ad6265SDimitry Andric             B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
295581ad6265SDimitry Andric         LocalAccum = Add.getReg(0);
295681ad6265SDimitry Andric         return HaveCarryOut ? Add.getReg(1) : Register();
295781ad6265SDimitry Andric       };
295881ad6265SDimitry Andric 
295981ad6265SDimitry Andric   // Build a multiply-add chain to compute
296081ad6265SDimitry Andric   //
296181ad6265SDimitry Andric   //   LocalAccum + (partial products at DstIndex)
296281ad6265SDimitry Andric   //       + (opportunistic subset of CarryIn)
296381ad6265SDimitry Andric   //
296481ad6265SDimitry Andric   // LocalAccum is an array of one or two 32-bit registers that are updated
296581ad6265SDimitry Andric   // in-place. The incoming registers may be null.
296681ad6265SDimitry Andric   //
296781ad6265SDimitry Andric   // In some edge cases, carry-ins can be consumed "for free". In that case,
296881ad6265SDimitry Andric   // the consumed carry bits are removed from CarryIn in-place.
296981ad6265SDimitry Andric   auto buildMadChain =
297081ad6265SDimitry Andric       [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
297181ad6265SDimitry Andric           -> Carry {
297281ad6265SDimitry Andric         assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
297381ad6265SDimitry Andric                (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
297481ad6265SDimitry Andric 
297581ad6265SDimitry Andric         Carry CarryOut;
297681ad6265SDimitry Andric         unsigned j0 = 0;
297781ad6265SDimitry Andric 
297881ad6265SDimitry Andric         // Use plain 32-bit multiplication for the most significant part of the
297981ad6265SDimitry Andric         // result by default.
298081ad6265SDimitry Andric         if (LocalAccum.size() == 1 &&
298181ad6265SDimitry Andric             (!UsePartialMad64_32 || !CarryIn.empty())) {
298281ad6265SDimitry Andric           do {
298381ad6265SDimitry Andric             unsigned j1 = DstIndex - j0;
298481ad6265SDimitry Andric             auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
298581ad6265SDimitry Andric             if (!LocalAccum[0]) {
298681ad6265SDimitry Andric               LocalAccum[0] = Mul.getReg(0);
298781ad6265SDimitry Andric             } else {
298881ad6265SDimitry Andric               if (CarryIn.empty()) {
298981ad6265SDimitry Andric                 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
299081ad6265SDimitry Andric               } else {
299181ad6265SDimitry Andric                 LocalAccum[0] =
299281ad6265SDimitry Andric                     B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
299381ad6265SDimitry Andric                         .getReg(0);
299481ad6265SDimitry Andric                 CarryIn.pop_back();
299581ad6265SDimitry Andric               }
299681ad6265SDimitry Andric             }
299781ad6265SDimitry Andric             ++j0;
299881ad6265SDimitry Andric           } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
299981ad6265SDimitry Andric         }
300081ad6265SDimitry Andric 
300181ad6265SDimitry Andric         // Build full 64-bit multiplies.
300281ad6265SDimitry Andric         if (j0 <= DstIndex) {
300381ad6265SDimitry Andric           bool HaveSmallAccum = false;
300481ad6265SDimitry Andric           Register Tmp;
300581ad6265SDimitry Andric 
300681ad6265SDimitry Andric           if (LocalAccum[0]) {
300781ad6265SDimitry Andric             if (LocalAccum.size() == 1) {
300881ad6265SDimitry Andric               Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
300981ad6265SDimitry Andric               HaveSmallAccum = true;
301081ad6265SDimitry Andric             } else if (LocalAccum[1]) {
3011*bdd1243dSDimitry Andric               Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
301281ad6265SDimitry Andric               HaveSmallAccum = false;
301381ad6265SDimitry Andric             } else {
301481ad6265SDimitry Andric               Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
301581ad6265SDimitry Andric               HaveSmallAccum = true;
301681ad6265SDimitry Andric             }
301781ad6265SDimitry Andric           } else {
301881ad6265SDimitry Andric             assert(LocalAccum.size() == 1 || !LocalAccum[1]);
301981ad6265SDimitry Andric             Tmp = getZero64();
302081ad6265SDimitry Andric             HaveSmallAccum = true;
302181ad6265SDimitry Andric           }
302281ad6265SDimitry Andric 
302381ad6265SDimitry Andric           do {
302481ad6265SDimitry Andric             unsigned j1 = DstIndex - j0;
302581ad6265SDimitry Andric             auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
302681ad6265SDimitry Andric                                     {Src0[j0], Src1[j1], Tmp});
302781ad6265SDimitry Andric             Tmp = Mad.getReg(0);
302881ad6265SDimitry Andric             if (!HaveSmallAccum)
302981ad6265SDimitry Andric               CarryOut.push_back(Mad.getReg(1));
303081ad6265SDimitry Andric             HaveSmallAccum = false;
303181ad6265SDimitry Andric             ++j0;
303281ad6265SDimitry Andric           } while (j0 <= DstIndex);
303381ad6265SDimitry Andric 
303481ad6265SDimitry Andric           auto Unmerge = B.buildUnmerge(S32, Tmp);
303581ad6265SDimitry Andric           LocalAccum[0] = Unmerge.getReg(0);
303681ad6265SDimitry Andric           if (LocalAccum.size() > 1)
303781ad6265SDimitry Andric             LocalAccum[1] = Unmerge.getReg(1);
303881ad6265SDimitry Andric         }
303981ad6265SDimitry Andric 
304081ad6265SDimitry Andric         return CarryOut;
304181ad6265SDimitry Andric       };
304281ad6265SDimitry Andric 
304381ad6265SDimitry Andric   // Outer multiply loop, iterating over destination parts from least
304481ad6265SDimitry Andric   // significant to most significant parts.
304581ad6265SDimitry Andric   //
304681ad6265SDimitry Andric   // The columns of the following diagram correspond to the destination parts
304781ad6265SDimitry Andric   // affected by one iteration of the outer loop (ignoring boundary
304881ad6265SDimitry Andric   // conditions).
304981ad6265SDimitry Andric   //
305081ad6265SDimitry Andric   //   Dest index relative to 2 * i:      1 0 -1
305181ad6265SDimitry Andric   //                                      ------
305281ad6265SDimitry Andric   //   Carries from previous iteration:     e o
305381ad6265SDimitry Andric   //   Even-aligned partial product sum:  E E .
305481ad6265SDimitry Andric   //   Odd-aligned partial product sum:     O O
305581ad6265SDimitry Andric   //
305681ad6265SDimitry Andric   // 'o' is OddCarry, 'e' is EvenCarry.
305781ad6265SDimitry Andric   // EE and OO are computed from partial products via buildMadChain and use
305881ad6265SDimitry Andric   // accumulation where possible and appropriate.
305981ad6265SDimitry Andric   //
306081ad6265SDimitry Andric   Register SeparateOddCarry;
306181ad6265SDimitry Andric   Carry EvenCarry;
306281ad6265SDimitry Andric   Carry OddCarry;
306381ad6265SDimitry Andric 
306481ad6265SDimitry Andric   for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
306581ad6265SDimitry Andric     Carry OddCarryIn = std::move(OddCarry);
306681ad6265SDimitry Andric     Carry EvenCarryIn = std::move(EvenCarry);
306781ad6265SDimitry Andric     OddCarry.clear();
306881ad6265SDimitry Andric     EvenCarry.clear();
306981ad6265SDimitry Andric 
307081ad6265SDimitry Andric     // Partial products at offset 2 * i.
307181ad6265SDimitry Andric     if (2 * i < Accum.size()) {
307281ad6265SDimitry Andric       auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
307381ad6265SDimitry Andric       EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
307481ad6265SDimitry Andric     }
307581ad6265SDimitry Andric 
307681ad6265SDimitry Andric     // Partial products at offset 2 * i - 1.
307781ad6265SDimitry Andric     if (i > 0) {
307881ad6265SDimitry Andric       if (!SeparateOddAlignedProducts) {
307981ad6265SDimitry Andric         auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
308081ad6265SDimitry Andric         OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
308181ad6265SDimitry Andric       } else {
308281ad6265SDimitry Andric         bool IsHighest = 2 * i >= Accum.size();
308381ad6265SDimitry Andric         Register SeparateOddOut[2];
3084*bdd1243dSDimitry Andric         auto LocalAccum = MutableArrayRef(SeparateOddOut)
308581ad6265SDimitry Andric                               .take_front(IsHighest ? 1 : 2);
308681ad6265SDimitry Andric         OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
308781ad6265SDimitry Andric 
308881ad6265SDimitry Andric         MachineInstr *Lo;
308981ad6265SDimitry Andric 
309081ad6265SDimitry Andric         if (i == 1) {
309181ad6265SDimitry Andric           if (!IsHighest)
309281ad6265SDimitry Andric             Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
309381ad6265SDimitry Andric           else
309481ad6265SDimitry Andric             Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
309581ad6265SDimitry Andric         } else {
309681ad6265SDimitry Andric           Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
309781ad6265SDimitry Andric                             SeparateOddCarry);
309881ad6265SDimitry Andric         }
309981ad6265SDimitry Andric         Accum[2 * i - 1] = Lo->getOperand(0).getReg();
310081ad6265SDimitry Andric 
310181ad6265SDimitry Andric         if (!IsHighest) {
310281ad6265SDimitry Andric           auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
310381ad6265SDimitry Andric                                 Lo->getOperand(1).getReg());
310481ad6265SDimitry Andric           Accum[2 * i] = Hi.getReg(0);
310581ad6265SDimitry Andric           SeparateOddCarry = Hi.getReg(1);
310681ad6265SDimitry Andric         }
310781ad6265SDimitry Andric       }
310881ad6265SDimitry Andric     }
310981ad6265SDimitry Andric 
311081ad6265SDimitry Andric     // Add in the carries from the previous iteration
311181ad6265SDimitry Andric     if (i > 0) {
311281ad6265SDimitry Andric       if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
311381ad6265SDimitry Andric         EvenCarryIn.push_back(CarryOut);
311481ad6265SDimitry Andric 
311581ad6265SDimitry Andric       if (2 * i < Accum.size()) {
311681ad6265SDimitry Andric         if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
311781ad6265SDimitry Andric           OddCarry.push_back(CarryOut);
311881ad6265SDimitry Andric       }
311981ad6265SDimitry Andric     }
312081ad6265SDimitry Andric   }
312181ad6265SDimitry Andric }
312281ad6265SDimitry Andric 
312381ad6265SDimitry Andric // Custom narrowing of wide multiplies using wide multiply-add instructions.
312481ad6265SDimitry Andric //
312581ad6265SDimitry Andric // TODO: If the multiply is followed by an addition, we should attempt to
312681ad6265SDimitry Andric // integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
312781ad6265SDimitry Andric bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
312881ad6265SDimitry Andric                                       MachineInstr &MI) const {
312981ad6265SDimitry Andric   assert(ST.hasMad64_32());
313081ad6265SDimitry Andric   assert(MI.getOpcode() == TargetOpcode::G_MUL);
313181ad6265SDimitry Andric 
313281ad6265SDimitry Andric   MachineIRBuilder &B = Helper.MIRBuilder;
313381ad6265SDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
313481ad6265SDimitry Andric 
313581ad6265SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
313681ad6265SDimitry Andric   Register Src0 = MI.getOperand(1).getReg();
313781ad6265SDimitry Andric   Register Src1 = MI.getOperand(2).getReg();
313881ad6265SDimitry Andric 
313981ad6265SDimitry Andric   LLT Ty = MRI.getType(DstReg);
314081ad6265SDimitry Andric   assert(Ty.isScalar());
314181ad6265SDimitry Andric 
314281ad6265SDimitry Andric   unsigned Size = Ty.getSizeInBits();
314381ad6265SDimitry Andric   unsigned NumParts = Size / 32;
314481ad6265SDimitry Andric   assert((Size % 32) == 0);
314581ad6265SDimitry Andric   assert(NumParts >= 2);
314681ad6265SDimitry Andric 
314781ad6265SDimitry Andric   // Whether to use MAD_64_32 for partial products whose high half is
314881ad6265SDimitry Andric   // discarded. This avoids some ADD instructions but risks false dependency
314981ad6265SDimitry Andric   // stalls on some subtargets in some cases.
315081ad6265SDimitry Andric   const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
315181ad6265SDimitry Andric 
315281ad6265SDimitry Andric   // Whether to compute odd-aligned partial products separately. This is
315381ad6265SDimitry Andric   // advisable on subtargets where the accumulator of MAD_64_32 must be placed
315481ad6265SDimitry Andric   // in an even-aligned VGPR.
315581ad6265SDimitry Andric   const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
315681ad6265SDimitry Andric 
315781ad6265SDimitry Andric   LLT S32 = LLT::scalar(32);
315881ad6265SDimitry Andric   SmallVector<Register, 2> Src0Parts, Src1Parts;
315981ad6265SDimitry Andric   for (unsigned i = 0; i < NumParts; ++i) {
316081ad6265SDimitry Andric     Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
316181ad6265SDimitry Andric     Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
316281ad6265SDimitry Andric   }
316381ad6265SDimitry Andric   B.buildUnmerge(Src0Parts, Src0);
316481ad6265SDimitry Andric   B.buildUnmerge(Src1Parts, Src1);
316581ad6265SDimitry Andric 
316681ad6265SDimitry Andric   SmallVector<Register, 2> AccumRegs(NumParts);
316781ad6265SDimitry Andric   buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
316881ad6265SDimitry Andric                 SeparateOddAlignedProducts);
316981ad6265SDimitry Andric 
3170*bdd1243dSDimitry Andric   B.buildMergeLikeInstr(DstReg, AccumRegs);
317181ad6265SDimitry Andric   MI.eraseFromParent();
317281ad6265SDimitry Andric   return true;
317381ad6265SDimitry Andric 
317481ad6265SDimitry Andric }
317581ad6265SDimitry Andric 
3176349cc55cSDimitry Andric // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
3177349cc55cSDimitry Andric // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
3178349cc55cSDimitry Andric // case with a single min instruction instead of a compare+select.
3179349cc55cSDimitry Andric bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
3180349cc55cSDimitry Andric                                             MachineRegisterInfo &MRI,
3181349cc55cSDimitry Andric                                             MachineIRBuilder &B) const {
3182349cc55cSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
3183349cc55cSDimitry Andric   Register Src = MI.getOperand(1).getReg();
3184349cc55cSDimitry Andric   LLT DstTy = MRI.getType(Dst);
3185349cc55cSDimitry Andric   LLT SrcTy = MRI.getType(Src);
3186349cc55cSDimitry Andric 
3187349cc55cSDimitry Andric   unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
3188349cc55cSDimitry Andric                         ? AMDGPU::G_AMDGPU_FFBH_U32
3189349cc55cSDimitry Andric                         : AMDGPU::G_AMDGPU_FFBL_B32;
3190349cc55cSDimitry Andric   auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
3191349cc55cSDimitry Andric   B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
3192349cc55cSDimitry Andric 
3193349cc55cSDimitry Andric   MI.eraseFromParent();
3194349cc55cSDimitry Andric   return true;
3195349cc55cSDimitry Andric }
3196349cc55cSDimitry Andric 
3197e8d8bef9SDimitry Andric // Check that this is a G_XOR x, -1
3198e8d8bef9SDimitry Andric static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
3199e8d8bef9SDimitry Andric   if (MI.getOpcode() != TargetOpcode::G_XOR)
3200e8d8bef9SDimitry Andric     return false;
3201349cc55cSDimitry Andric   auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
3202e8d8bef9SDimitry Andric   return ConstVal && *ConstVal == -1;
3203e8d8bef9SDimitry Andric }
3204e8d8bef9SDimitry Andric 
32050b57cec5SDimitry Andric // Return the use branch instruction, otherwise null if the usage is invalid.
3206e8d8bef9SDimitry Andric static MachineInstr *
3207e8d8bef9SDimitry Andric verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
3208e8d8bef9SDimitry Andric                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
32090b57cec5SDimitry Andric   Register CondDef = MI.getOperand(0).getReg();
32100b57cec5SDimitry Andric   if (!MRI.hasOneNonDBGUse(CondDef))
32110b57cec5SDimitry Andric     return nullptr;
32120b57cec5SDimitry Andric 
32135ffd83dbSDimitry Andric   MachineBasicBlock *Parent = MI.getParent();
3214e8d8bef9SDimitry Andric   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
3215e8d8bef9SDimitry Andric 
3216e8d8bef9SDimitry Andric   if (isNot(MRI, *UseMI)) {
3217e8d8bef9SDimitry Andric     Register NegatedCond = UseMI->getOperand(0).getReg();
3218e8d8bef9SDimitry Andric     if (!MRI.hasOneNonDBGUse(NegatedCond))
3219e8d8bef9SDimitry Andric       return nullptr;
3220e8d8bef9SDimitry Andric 
3221e8d8bef9SDimitry Andric     // We're deleting the def of this value, so we need to remove it.
3222349cc55cSDimitry Andric     eraseInstr(*UseMI, MRI);
3223e8d8bef9SDimitry Andric 
3224e8d8bef9SDimitry Andric     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
3225e8d8bef9SDimitry Andric     Negated = true;
3226e8d8bef9SDimitry Andric   }
3227e8d8bef9SDimitry Andric 
3228e8d8bef9SDimitry Andric   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
3229480093f4SDimitry Andric     return nullptr;
3230480093f4SDimitry Andric 
32315ffd83dbSDimitry Andric   // Make sure the cond br is followed by a G_BR, or is the last instruction.
3232e8d8bef9SDimitry Andric   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
32335ffd83dbSDimitry Andric   if (Next == Parent->end()) {
32345ffd83dbSDimitry Andric     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
32355ffd83dbSDimitry Andric     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
32365ffd83dbSDimitry Andric       return nullptr;
32375ffd83dbSDimitry Andric     UncondBrTarget = &*NextMBB;
32385ffd83dbSDimitry Andric   } else {
3239480093f4SDimitry Andric     if (Next->getOpcode() != AMDGPU::G_BR)
3240480093f4SDimitry Andric       return nullptr;
3241480093f4SDimitry Andric     Br = &*Next;
32425ffd83dbSDimitry Andric     UncondBrTarget = Br->getOperand(0).getMBB();
3243480093f4SDimitry Andric   }
3244480093f4SDimitry Andric 
3245e8d8bef9SDimitry Andric   return UseMI;
32460b57cec5SDimitry Andric }
32470b57cec5SDimitry Andric 
32480b57cec5SDimitry Andric bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
3249e8d8bef9SDimitry Andric                                          const ArgDescriptor *Arg,
3250e8d8bef9SDimitry Andric                                          const TargetRegisterClass *ArgRC,
3251e8d8bef9SDimitry Andric                                          LLT ArgTy) const {
3252e8d8bef9SDimitry Andric   MCRegister SrcReg = Arg->getRegister();
3253e8d8bef9SDimitry Andric   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
32545ffd83dbSDimitry Andric   assert(DstReg.isVirtual() && "Virtual register expected");
32550b57cec5SDimitry Andric 
325604eeddc0SDimitry Andric   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
325704eeddc0SDimitry Andric                                              *ArgRC, B.getDebugLoc(), ArgTy);
32580b57cec5SDimitry Andric   if (Arg->isMasked()) {
32590b57cec5SDimitry Andric     // TODO: Should we try to emit this once in the entry block?
32600b57cec5SDimitry Andric     const LLT S32 = LLT::scalar(32);
32610b57cec5SDimitry Andric     const unsigned Mask = Arg->getMask();
32620b57cec5SDimitry Andric     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
32630b57cec5SDimitry Andric 
32648bcb0991SDimitry Andric     Register AndMaskSrc = LiveIn;
32658bcb0991SDimitry Andric 
326604eeddc0SDimitry Andric     // TODO: Avoid clearing the high bits if we know workitem id y/z are always
326704eeddc0SDimitry Andric     // 0.
32688bcb0991SDimitry Andric     if (Shift != 0) {
32690b57cec5SDimitry Andric       auto ShiftAmt = B.buildConstant(S32, Shift);
32708bcb0991SDimitry Andric       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
32718bcb0991SDimitry Andric     }
32728bcb0991SDimitry Andric 
32738bcb0991SDimitry Andric     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
32745ffd83dbSDimitry Andric   } else {
32750b57cec5SDimitry Andric     B.buildCopy(DstReg, LiveIn);
32760b57cec5SDimitry Andric   }
32770b57cec5SDimitry Andric 
32780b57cec5SDimitry Andric   return true;
32790b57cec5SDimitry Andric }
32800b57cec5SDimitry Andric 
3281e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::loadInputValue(
3282e8d8bef9SDimitry Andric     Register DstReg, MachineIRBuilder &B,
3283e8d8bef9SDimitry Andric     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
3284e8d8bef9SDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3285e8d8bef9SDimitry Andric   const ArgDescriptor *Arg;
3286e8d8bef9SDimitry Andric   const TargetRegisterClass *ArgRC;
3287e8d8bef9SDimitry Andric   LLT ArgTy;
3288e8d8bef9SDimitry Andric   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
3289e8d8bef9SDimitry Andric 
3290349cc55cSDimitry Andric   if (!Arg) {
3291349cc55cSDimitry Andric     if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
3292349cc55cSDimitry Andric       // The intrinsic may appear when we have a 0 sized kernarg segment, in which
3293349cc55cSDimitry Andric       // case the pointer argument may be missing and we use null.
3294349cc55cSDimitry Andric       B.buildConstant(DstReg, 0);
3295349cc55cSDimitry Andric       return true;
3296349cc55cSDimitry Andric     }
3297349cc55cSDimitry Andric 
3298349cc55cSDimitry Andric     // It's undefined behavior if a function marked with the amdgpu-no-*
3299349cc55cSDimitry Andric     // attributes uses the corresponding intrinsic.
3300349cc55cSDimitry Andric     B.buildUndef(DstReg);
3301349cc55cSDimitry Andric     return true;
3302349cc55cSDimitry Andric   }
3303349cc55cSDimitry Andric 
3304e8d8bef9SDimitry Andric   if (!Arg->isRegister() || !Arg->getRegister().isValid())
3305e8d8bef9SDimitry Andric     return false; // TODO: Handle these
3306e8d8bef9SDimitry Andric   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
3307e8d8bef9SDimitry Andric }
3308e8d8bef9SDimitry Andric 
33090b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
33105ffd83dbSDimitry Andric     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
33110b57cec5SDimitry Andric     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
3312e8d8bef9SDimitry Andric   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
33135ffd83dbSDimitry Andric     return false;
33145ffd83dbSDimitry Andric 
33150b57cec5SDimitry Andric   MI.eraseFromParent();
33160b57cec5SDimitry Andric   return true;
33170b57cec5SDimitry Andric }
33180b57cec5SDimitry Andric 
331981ad6265SDimitry Andric static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
332081ad6265SDimitry Andric                                 int64_t C) {
332181ad6265SDimitry Andric   B.buildConstant(MI.getOperand(0).getReg(), C);
332281ad6265SDimitry Andric   MI.eraseFromParent();
332381ad6265SDimitry Andric   return true;
332481ad6265SDimitry Andric }
332581ad6265SDimitry Andric 
332681ad6265SDimitry Andric bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
332781ad6265SDimitry Andric     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
332881ad6265SDimitry Andric     unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
332981ad6265SDimitry Andric   unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
333081ad6265SDimitry Andric   if (MaxID == 0)
333181ad6265SDimitry Andric     return replaceWithConstant(B, MI, 0);
333281ad6265SDimitry Andric 
333381ad6265SDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
333481ad6265SDimitry Andric   const ArgDescriptor *Arg;
333581ad6265SDimitry Andric   const TargetRegisterClass *ArgRC;
333681ad6265SDimitry Andric   LLT ArgTy;
333781ad6265SDimitry Andric   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
333881ad6265SDimitry Andric 
333981ad6265SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
334081ad6265SDimitry Andric   if (!Arg) {
334181ad6265SDimitry Andric     // It's undefined behavior if a function marked with the amdgpu-no-*
334281ad6265SDimitry Andric     // attributes uses the corresponding intrinsic.
334381ad6265SDimitry Andric     B.buildUndef(DstReg);
334481ad6265SDimitry Andric     MI.eraseFromParent();
334581ad6265SDimitry Andric     return true;
334681ad6265SDimitry Andric   }
334781ad6265SDimitry Andric 
334881ad6265SDimitry Andric   if (Arg->isMasked()) {
334981ad6265SDimitry Andric     // Don't bother inserting AssertZext for packed IDs since we're emitting the
335081ad6265SDimitry Andric     // masking operations anyway.
335181ad6265SDimitry Andric     //
335281ad6265SDimitry Andric     // TODO: We could assert the top bit is 0 for the source copy.
335381ad6265SDimitry Andric     if (!loadInputValue(DstReg, B, ArgType))
335481ad6265SDimitry Andric       return false;
335581ad6265SDimitry Andric   } else {
335681ad6265SDimitry Andric     Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
335781ad6265SDimitry Andric     if (!loadInputValue(TmpReg, B, ArgType))
335881ad6265SDimitry Andric       return false;
3359*bdd1243dSDimitry Andric     B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID));
336081ad6265SDimitry Andric   }
336181ad6265SDimitry Andric 
336281ad6265SDimitry Andric   MI.eraseFromParent();
336381ad6265SDimitry Andric   return true;
336481ad6265SDimitry Andric }
336581ad6265SDimitry Andric 
336681ad6265SDimitry Andric Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
336781ad6265SDimitry Andric                                                      int64_t Offset) const {
336881ad6265SDimitry Andric   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
336981ad6265SDimitry Andric   Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
337081ad6265SDimitry Andric 
337181ad6265SDimitry Andric   // TODO: If we passed in the base kernel offset we could have a better
337281ad6265SDimitry Andric   // alignment than 4, but we don't really need it.
337381ad6265SDimitry Andric   if (!loadInputValue(KernArgReg, B,
337481ad6265SDimitry Andric                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
337581ad6265SDimitry Andric     llvm_unreachable("failed to find kernarg segment ptr");
337681ad6265SDimitry Andric 
337781ad6265SDimitry Andric   auto COffset = B.buildConstant(LLT::scalar(64), Offset);
337881ad6265SDimitry Andric   // TODO: Should get nuw
337981ad6265SDimitry Andric   return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
338081ad6265SDimitry Andric }
338181ad6265SDimitry Andric 
338281ad6265SDimitry Andric /// Legalize a value that's loaded from kernel arguments. This is only used by
338381ad6265SDimitry Andric /// legacy intrinsics.
338481ad6265SDimitry Andric bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
338581ad6265SDimitry Andric                                                       MachineIRBuilder &B,
338681ad6265SDimitry Andric                                                       uint64_t Offset,
338781ad6265SDimitry Andric                                                       Align Alignment) const {
338881ad6265SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
338981ad6265SDimitry Andric 
339081ad6265SDimitry Andric   assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
339181ad6265SDimitry Andric          "unexpected kernarg parameter type");
339281ad6265SDimitry Andric 
339381ad6265SDimitry Andric   Register Ptr = getKernargParameterPtr(B, Offset);
339481ad6265SDimitry Andric   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
339581ad6265SDimitry Andric   B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
339681ad6265SDimitry Andric               MachineMemOperand::MODereferenceable |
339781ad6265SDimitry Andric                   MachineMemOperand::MOInvariant);
339881ad6265SDimitry Andric   MI.eraseFromParent();
339981ad6265SDimitry Andric   return true;
340081ad6265SDimitry Andric }
340181ad6265SDimitry Andric 
34028bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
34038bcb0991SDimitry Andric                                        MachineRegisterInfo &MRI,
34048bcb0991SDimitry Andric                                        MachineIRBuilder &B) const {
3405480093f4SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
3406480093f4SDimitry Andric   LLT DstTy = MRI.getType(Dst);
3407480093f4SDimitry Andric   LLT S16 = LLT::scalar(16);
3408480093f4SDimitry Andric   LLT S32 = LLT::scalar(32);
3409480093f4SDimitry Andric   LLT S64 = LLT::scalar(64);
34108bcb0991SDimitry Andric 
3411480093f4SDimitry Andric   if (DstTy == S16)
3412480093f4SDimitry Andric     return legalizeFDIV16(MI, MRI, B);
3413480093f4SDimitry Andric   if (DstTy == S32)
3414480093f4SDimitry Andric     return legalizeFDIV32(MI, MRI, B);
3415480093f4SDimitry Andric   if (DstTy == S64)
3416480093f4SDimitry Andric     return legalizeFDIV64(MI, MRI, B);
3417480093f4SDimitry Andric 
34188bcb0991SDimitry Andric   return false;
34198bcb0991SDimitry Andric }
34208bcb0991SDimitry Andric 
3421fe6060f1SDimitry Andric void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
3422fe6060f1SDimitry Andric                                                         Register DstDivReg,
3423fe6060f1SDimitry Andric                                                         Register DstRemReg,
34245ffd83dbSDimitry Andric                                                         Register X,
3425fe6060f1SDimitry Andric                                                         Register Y) const {
34265ffd83dbSDimitry Andric   const LLT S1 = LLT::scalar(1);
34275ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
34285ffd83dbSDimitry Andric 
34295ffd83dbSDimitry Andric   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
34305ffd83dbSDimitry Andric   // algorithm used here.
34315ffd83dbSDimitry Andric 
34325ffd83dbSDimitry Andric   // Initial estimate of inv(y).
34335ffd83dbSDimitry Andric   auto FloatY = B.buildUITOFP(S32, Y);
34345ffd83dbSDimitry Andric   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
34355ffd83dbSDimitry Andric   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
34365ffd83dbSDimitry Andric   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
34375ffd83dbSDimitry Andric   auto Z = B.buildFPTOUI(S32, ScaledY);
34385ffd83dbSDimitry Andric 
34395ffd83dbSDimitry Andric   // One round of UNR.
34405ffd83dbSDimitry Andric   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
34415ffd83dbSDimitry Andric   auto NegYZ = B.buildMul(S32, NegY, Z);
34425ffd83dbSDimitry Andric   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
34435ffd83dbSDimitry Andric 
34445ffd83dbSDimitry Andric   // Quotient/remainder estimate.
34455ffd83dbSDimitry Andric   auto Q = B.buildUMulH(S32, X, Z);
34465ffd83dbSDimitry Andric   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
34475ffd83dbSDimitry Andric 
34485ffd83dbSDimitry Andric   // First quotient/remainder refinement.
34495ffd83dbSDimitry Andric   auto One = B.buildConstant(S32, 1);
34505ffd83dbSDimitry Andric   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
3451fe6060f1SDimitry Andric   if (DstDivReg)
34525ffd83dbSDimitry Andric     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
34535ffd83dbSDimitry Andric   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
34545ffd83dbSDimitry Andric 
34555ffd83dbSDimitry Andric   // Second quotient/remainder refinement.
34565ffd83dbSDimitry Andric   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
3457fe6060f1SDimitry Andric   if (DstDivReg)
3458fe6060f1SDimitry Andric     B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
34595ffd83dbSDimitry Andric 
3460fe6060f1SDimitry Andric   if (DstRemReg)
3461fe6060f1SDimitry Andric     B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
34625ffd83dbSDimitry Andric }
34635ffd83dbSDimitry Andric 
3464349cc55cSDimitry Andric // Build integer reciprocal sequence around V_RCP_IFLAG_F32
34655ffd83dbSDimitry Andric //
34665ffd83dbSDimitry Andric // Return lo, hi of result
34675ffd83dbSDimitry Andric //
34685ffd83dbSDimitry Andric // %cvt.lo = G_UITOFP Val.lo
34695ffd83dbSDimitry Andric // %cvt.hi = G_UITOFP Val.hi
34705ffd83dbSDimitry Andric // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
34715ffd83dbSDimitry Andric // %rcp = G_AMDGPU_RCP_IFLAG %mad
34725ffd83dbSDimitry Andric // %mul1 = G_FMUL %rcp, 0x5f7ffffc
34735ffd83dbSDimitry Andric // %mul2 = G_FMUL %mul1, 2**(-32)
34745ffd83dbSDimitry Andric // %trunc = G_INTRINSIC_TRUNC %mul2
34755ffd83dbSDimitry Andric // %mad2 = G_FMAD %trunc, -(2**32), %mul1
34765ffd83dbSDimitry Andric // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
34775ffd83dbSDimitry Andric static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
34785ffd83dbSDimitry Andric                                                        Register Val) {
34795ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
34805ffd83dbSDimitry Andric   auto Unmerge = B.buildUnmerge(S32, Val);
34815ffd83dbSDimitry Andric 
34825ffd83dbSDimitry Andric   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
34835ffd83dbSDimitry Andric   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
34845ffd83dbSDimitry Andric 
34855ffd83dbSDimitry Andric   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
34865ffd83dbSDimitry Andric                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
34875ffd83dbSDimitry Andric 
34885ffd83dbSDimitry Andric   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
34895ffd83dbSDimitry Andric   auto Mul1 =
34905ffd83dbSDimitry Andric       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
34915ffd83dbSDimitry Andric 
34925ffd83dbSDimitry Andric   // 2**(-32)
34935ffd83dbSDimitry Andric   auto Mul2 =
34945ffd83dbSDimitry Andric       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
34955ffd83dbSDimitry Andric   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
34965ffd83dbSDimitry Andric 
34975ffd83dbSDimitry Andric   // -(2**32)
34985ffd83dbSDimitry Andric   auto Mad2 = B.buildFMAD(S32, Trunc,
34995ffd83dbSDimitry Andric                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
35005ffd83dbSDimitry Andric 
35015ffd83dbSDimitry Andric   auto ResultLo = B.buildFPTOUI(S32, Mad2);
35025ffd83dbSDimitry Andric   auto ResultHi = B.buildFPTOUI(S32, Trunc);
35035ffd83dbSDimitry Andric 
35045ffd83dbSDimitry Andric   return {ResultLo.getReg(0), ResultHi.getReg(0)};
35055ffd83dbSDimitry Andric }
35065ffd83dbSDimitry Andric 
3507fe6060f1SDimitry Andric void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
3508fe6060f1SDimitry Andric                                                         Register DstDivReg,
3509fe6060f1SDimitry Andric                                                         Register DstRemReg,
35105ffd83dbSDimitry Andric                                                         Register Numer,
3511fe6060f1SDimitry Andric                                                         Register Denom) const {
35125ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
35135ffd83dbSDimitry Andric   const LLT S64 = LLT::scalar(64);
35145ffd83dbSDimitry Andric   const LLT S1 = LLT::scalar(1);
35155ffd83dbSDimitry Andric   Register RcpLo, RcpHi;
35165ffd83dbSDimitry Andric 
35175ffd83dbSDimitry Andric   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
35185ffd83dbSDimitry Andric 
3519*bdd1243dSDimitry Andric   auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});
35205ffd83dbSDimitry Andric 
35215ffd83dbSDimitry Andric   auto Zero64 = B.buildConstant(S64, 0);
35225ffd83dbSDimitry Andric   auto NegDenom = B.buildSub(S64, Zero64, Denom);
35235ffd83dbSDimitry Andric 
35245ffd83dbSDimitry Andric   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
35255ffd83dbSDimitry Andric   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
35265ffd83dbSDimitry Andric 
35275ffd83dbSDimitry Andric   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
35285ffd83dbSDimitry Andric   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
35295ffd83dbSDimitry Andric   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
35305ffd83dbSDimitry Andric 
35315ffd83dbSDimitry Andric   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
35325ffd83dbSDimitry Andric   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
3533*bdd1243dSDimitry Andric   auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});
35345ffd83dbSDimitry Andric 
35355ffd83dbSDimitry Andric   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
35365ffd83dbSDimitry Andric   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
35375ffd83dbSDimitry Andric   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
35385ffd83dbSDimitry Andric   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
35395ffd83dbSDimitry Andric   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
35405ffd83dbSDimitry Andric 
35415ffd83dbSDimitry Andric   auto Zero32 = B.buildConstant(S32, 0);
35425ffd83dbSDimitry Andric   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
3543349cc55cSDimitry Andric   auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
3544*bdd1243dSDimitry Andric   auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});
35455ffd83dbSDimitry Andric 
35465ffd83dbSDimitry Andric   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
35475ffd83dbSDimitry Andric   Register NumerLo = UnmergeNumer.getReg(0);
35485ffd83dbSDimitry Andric   Register NumerHi = UnmergeNumer.getReg(1);
35495ffd83dbSDimitry Andric 
35505ffd83dbSDimitry Andric   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
35515ffd83dbSDimitry Andric   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
35525ffd83dbSDimitry Andric   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
35535ffd83dbSDimitry Andric   Register Mul3_Lo = UnmergeMul3.getReg(0);
35545ffd83dbSDimitry Andric   Register Mul3_Hi = UnmergeMul3.getReg(1);
35555ffd83dbSDimitry Andric   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
35565ffd83dbSDimitry Andric   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
35575ffd83dbSDimitry Andric   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
3558*bdd1243dSDimitry Andric   auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});
35595ffd83dbSDimitry Andric 
35605ffd83dbSDimitry Andric   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
35615ffd83dbSDimitry Andric   Register DenomLo = UnmergeDenom.getReg(0);
35625ffd83dbSDimitry Andric   Register DenomHi = UnmergeDenom.getReg(1);
35635ffd83dbSDimitry Andric 
35645ffd83dbSDimitry Andric   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
35655ffd83dbSDimitry Andric   auto C1 = B.buildSExt(S32, CmpHi);
35665ffd83dbSDimitry Andric 
35675ffd83dbSDimitry Andric   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
35685ffd83dbSDimitry Andric   auto C2 = B.buildSExt(S32, CmpLo);
35695ffd83dbSDimitry Andric 
35705ffd83dbSDimitry Andric   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
35715ffd83dbSDimitry Andric   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
35725ffd83dbSDimitry Andric 
35735ffd83dbSDimitry Andric   // TODO: Here and below portions of the code can be enclosed into if/endif.
35745ffd83dbSDimitry Andric   // Currently control flow is unconditional and we have 4 selects after
35755ffd83dbSDimitry Andric   // potential endif to substitute PHIs.
35765ffd83dbSDimitry Andric 
35775ffd83dbSDimitry Andric   // if C3 != 0 ...
35785ffd83dbSDimitry Andric   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
35795ffd83dbSDimitry Andric   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
35805ffd83dbSDimitry Andric   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
3581*bdd1243dSDimitry Andric   auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});
35825ffd83dbSDimitry Andric 
35835ffd83dbSDimitry Andric   auto One64 = B.buildConstant(S64, 1);
35845ffd83dbSDimitry Andric   auto Add3 = B.buildAdd(S64, MulHi3, One64);
35855ffd83dbSDimitry Andric 
35865ffd83dbSDimitry Andric   auto C4 =
35875ffd83dbSDimitry Andric       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
35885ffd83dbSDimitry Andric   auto C5 =
35895ffd83dbSDimitry Andric       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
35905ffd83dbSDimitry Andric   auto C6 = B.buildSelect(
35915ffd83dbSDimitry Andric       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
35925ffd83dbSDimitry Andric 
35935ffd83dbSDimitry Andric   // if (C6 != 0)
35945ffd83dbSDimitry Andric   auto Add4 = B.buildAdd(S64, Add3, One64);
35955ffd83dbSDimitry Andric   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
35965ffd83dbSDimitry Andric 
35975ffd83dbSDimitry Andric   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
35985ffd83dbSDimitry Andric   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
3599*bdd1243dSDimitry Andric   auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});
36005ffd83dbSDimitry Andric 
36015ffd83dbSDimitry Andric   // endif C6
36025ffd83dbSDimitry Andric   // endif C3
36035ffd83dbSDimitry Andric 
3604fe6060f1SDimitry Andric   if (DstDivReg) {
36055ffd83dbSDimitry Andric     auto Sel1 = B.buildSelect(
36065ffd83dbSDimitry Andric         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3607fe6060f1SDimitry Andric     B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3608fe6060f1SDimitry Andric                   Sel1, MulHi3);
3609fe6060f1SDimitry Andric   }
3610fe6060f1SDimitry Andric 
3611fe6060f1SDimitry Andric   if (DstRemReg) {
36125ffd83dbSDimitry Andric     auto Sel2 = B.buildSelect(
36135ffd83dbSDimitry Andric         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3614fe6060f1SDimitry Andric     B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3615fe6060f1SDimitry Andric                   Sel2, Sub1);
36165ffd83dbSDimitry Andric   }
36175ffd83dbSDimitry Andric }
36185ffd83dbSDimitry Andric 
3619fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
36205ffd83dbSDimitry Andric                                                   MachineRegisterInfo &MRI,
36215ffd83dbSDimitry Andric                                                   MachineIRBuilder &B) const {
3622fe6060f1SDimitry Andric   Register DstDivReg, DstRemReg;
3623fe6060f1SDimitry Andric   switch (MI.getOpcode()) {
3624fe6060f1SDimitry Andric   default:
3625fe6060f1SDimitry Andric     llvm_unreachable("Unexpected opcode!");
3626fe6060f1SDimitry Andric   case AMDGPU::G_UDIV: {
3627fe6060f1SDimitry Andric     DstDivReg = MI.getOperand(0).getReg();
3628fe6060f1SDimitry Andric     break;
3629fe6060f1SDimitry Andric   }
3630fe6060f1SDimitry Andric   case AMDGPU::G_UREM: {
3631fe6060f1SDimitry Andric     DstRemReg = MI.getOperand(0).getReg();
3632fe6060f1SDimitry Andric     break;
3633fe6060f1SDimitry Andric   }
3634fe6060f1SDimitry Andric   case AMDGPU::G_UDIVREM: {
3635fe6060f1SDimitry Andric     DstDivReg = MI.getOperand(0).getReg();
3636fe6060f1SDimitry Andric     DstRemReg = MI.getOperand(1).getReg();
3637fe6060f1SDimitry Andric     break;
3638fe6060f1SDimitry Andric   }
3639fe6060f1SDimitry Andric   }
3640fe6060f1SDimitry Andric 
36415ffd83dbSDimitry Andric   const LLT S64 = LLT::scalar(64);
36425ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
3643fe6060f1SDimitry Andric   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3644fe6060f1SDimitry Andric   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3645fe6060f1SDimitry Andric   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3646fe6060f1SDimitry Andric   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
36475ffd83dbSDimitry Andric 
36485ffd83dbSDimitry Andric   if (Ty == S32)
3649fe6060f1SDimitry Andric     legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
36505ffd83dbSDimitry Andric   else if (Ty == S64)
3651fe6060f1SDimitry Andric     legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
36525ffd83dbSDimitry Andric   else
36535ffd83dbSDimitry Andric     return false;
36545ffd83dbSDimitry Andric 
36555ffd83dbSDimitry Andric   MI.eraseFromParent();
36565ffd83dbSDimitry Andric   return true;
36575ffd83dbSDimitry Andric }
36585ffd83dbSDimitry Andric 
3659fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
36605ffd83dbSDimitry Andric                                                 MachineRegisterInfo &MRI,
36615ffd83dbSDimitry Andric                                                 MachineIRBuilder &B) const {
36625ffd83dbSDimitry Andric   const LLT S64 = LLT::scalar(64);
36635ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
36645ffd83dbSDimitry Andric 
3665fe6060f1SDimitry Andric   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
36665ffd83dbSDimitry Andric   if (Ty != S32 && Ty != S64)
36675ffd83dbSDimitry Andric     return false;
36685ffd83dbSDimitry Andric 
3669fe6060f1SDimitry Andric   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3670fe6060f1SDimitry Andric   Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3671fe6060f1SDimitry Andric   Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
36725ffd83dbSDimitry Andric 
36735ffd83dbSDimitry Andric   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
36745ffd83dbSDimitry Andric   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
36755ffd83dbSDimitry Andric   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
36765ffd83dbSDimitry Andric 
36775ffd83dbSDimitry Andric   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
36785ffd83dbSDimitry Andric   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
36795ffd83dbSDimitry Andric 
36805ffd83dbSDimitry Andric   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
36815ffd83dbSDimitry Andric   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
36825ffd83dbSDimitry Andric 
3683fe6060f1SDimitry Andric   Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3684fe6060f1SDimitry Andric   switch (MI.getOpcode()) {
3685fe6060f1SDimitry Andric   default:
3686fe6060f1SDimitry Andric     llvm_unreachable("Unexpected opcode!");
3687fe6060f1SDimitry Andric   case AMDGPU::G_SDIV: {
3688fe6060f1SDimitry Andric     DstDivReg = MI.getOperand(0).getReg();
3689fe6060f1SDimitry Andric     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3690fe6060f1SDimitry Andric     break;
3691fe6060f1SDimitry Andric   }
3692fe6060f1SDimitry Andric   case AMDGPU::G_SREM: {
3693fe6060f1SDimitry Andric     DstRemReg = MI.getOperand(0).getReg();
3694fe6060f1SDimitry Andric     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3695fe6060f1SDimitry Andric     break;
3696fe6060f1SDimitry Andric   }
3697fe6060f1SDimitry Andric   case AMDGPU::G_SDIVREM: {
3698fe6060f1SDimitry Andric     DstDivReg = MI.getOperand(0).getReg();
3699fe6060f1SDimitry Andric     DstRemReg = MI.getOperand(1).getReg();
3700fe6060f1SDimitry Andric     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3701fe6060f1SDimitry Andric     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3702fe6060f1SDimitry Andric     break;
3703fe6060f1SDimitry Andric   }
3704fe6060f1SDimitry Andric   }
3705fe6060f1SDimitry Andric 
37065ffd83dbSDimitry Andric   if (Ty == S32)
3707fe6060f1SDimitry Andric     legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
37085ffd83dbSDimitry Andric   else
3709fe6060f1SDimitry Andric     legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
37105ffd83dbSDimitry Andric 
3711fe6060f1SDimitry Andric   if (DstDivReg) {
3712fe6060f1SDimitry Andric     auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3713fe6060f1SDimitry Andric     auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3714fe6060f1SDimitry Andric     B.buildSub(DstDivReg, SignXor, Sign);
3715fe6060f1SDimitry Andric   }
37165ffd83dbSDimitry Andric 
3717fe6060f1SDimitry Andric   if (DstRemReg) {
3718fe6060f1SDimitry Andric     auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3719fe6060f1SDimitry Andric     auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3720fe6060f1SDimitry Andric     B.buildSub(DstRemReg, SignXor, Sign);
3721fe6060f1SDimitry Andric   }
37225ffd83dbSDimitry Andric 
37235ffd83dbSDimitry Andric   MI.eraseFromParent();
37245ffd83dbSDimitry Andric   return true;
37255ffd83dbSDimitry Andric }
37265ffd83dbSDimitry Andric 
37278bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
37288bcb0991SDimitry Andric                                                  MachineRegisterInfo &MRI,
37298bcb0991SDimitry Andric                                                  MachineIRBuilder &B) const {
37308bcb0991SDimitry Andric   Register Res = MI.getOperand(0).getReg();
37318bcb0991SDimitry Andric   Register LHS = MI.getOperand(1).getReg();
37328bcb0991SDimitry Andric   Register RHS = MI.getOperand(2).getReg();
37338bcb0991SDimitry Andric   uint16_t Flags = MI.getFlags();
37348bcb0991SDimitry Andric   LLT ResTy = MRI.getType(Res);
37358bcb0991SDimitry Andric 
37368bcb0991SDimitry Andric   const MachineFunction &MF = B.getMF();
3737e8d8bef9SDimitry Andric   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3738e8d8bef9SDimitry Andric                             MI.getFlag(MachineInstr::FmAfn);
37398bcb0991SDimitry Andric 
3740e8d8bef9SDimitry Andric   if (!AllowInaccurateRcp)
37418bcb0991SDimitry Andric     return false;
37428bcb0991SDimitry Andric 
37438bcb0991SDimitry Andric   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
37448bcb0991SDimitry Andric     // 1 / x -> RCP(x)
37458bcb0991SDimitry Andric     if (CLHS->isExactlyValue(1.0)) {
37468bcb0991SDimitry Andric       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
37478bcb0991SDimitry Andric         .addUse(RHS)
37488bcb0991SDimitry Andric         .setMIFlags(Flags);
37498bcb0991SDimitry Andric 
37508bcb0991SDimitry Andric       MI.eraseFromParent();
37518bcb0991SDimitry Andric       return true;
37528bcb0991SDimitry Andric     }
37538bcb0991SDimitry Andric 
37548bcb0991SDimitry Andric     // -1 / x -> RCP( FNEG(x) )
37558bcb0991SDimitry Andric     if (CLHS->isExactlyValue(-1.0)) {
37568bcb0991SDimitry Andric       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
37578bcb0991SDimitry Andric       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
37588bcb0991SDimitry Andric         .addUse(FNeg.getReg(0))
37598bcb0991SDimitry Andric         .setMIFlags(Flags);
37608bcb0991SDimitry Andric 
37618bcb0991SDimitry Andric       MI.eraseFromParent();
37628bcb0991SDimitry Andric       return true;
37638bcb0991SDimitry Andric     }
37648bcb0991SDimitry Andric   }
37658bcb0991SDimitry Andric 
37668bcb0991SDimitry Andric   // x / y -> x * (1.0 / y)
37678bcb0991SDimitry Andric   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
37688bcb0991SDimitry Andric     .addUse(RHS)
37698bcb0991SDimitry Andric     .setMIFlags(Flags);
37708bcb0991SDimitry Andric   B.buildFMul(Res, LHS, RCP, Flags);
37718bcb0991SDimitry Andric 
37728bcb0991SDimitry Andric   MI.eraseFromParent();
37738bcb0991SDimitry Andric   return true;
37748bcb0991SDimitry Andric }
37758bcb0991SDimitry Andric 
3776e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3777e8d8bef9SDimitry Andric                                                    MachineRegisterInfo &MRI,
3778e8d8bef9SDimitry Andric                                                    MachineIRBuilder &B) const {
3779e8d8bef9SDimitry Andric   Register Res = MI.getOperand(0).getReg();
3780e8d8bef9SDimitry Andric   Register X = MI.getOperand(1).getReg();
3781e8d8bef9SDimitry Andric   Register Y = MI.getOperand(2).getReg();
3782e8d8bef9SDimitry Andric   uint16_t Flags = MI.getFlags();
3783e8d8bef9SDimitry Andric   LLT ResTy = MRI.getType(Res);
3784e8d8bef9SDimitry Andric 
3785e8d8bef9SDimitry Andric   const MachineFunction &MF = B.getMF();
3786e8d8bef9SDimitry Andric   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3787e8d8bef9SDimitry Andric                             MI.getFlag(MachineInstr::FmAfn);
3788e8d8bef9SDimitry Andric 
3789e8d8bef9SDimitry Andric   if (!AllowInaccurateRcp)
37908bcb0991SDimitry Andric     return false;
3791e8d8bef9SDimitry Andric 
3792e8d8bef9SDimitry Andric   auto NegY = B.buildFNeg(ResTy, Y);
3793e8d8bef9SDimitry Andric   auto One = B.buildFConstant(ResTy, 1.0);
3794e8d8bef9SDimitry Andric 
3795e8d8bef9SDimitry Andric   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3796e8d8bef9SDimitry Andric     .addUse(Y)
3797e8d8bef9SDimitry Andric     .setMIFlags(Flags);
3798e8d8bef9SDimitry Andric 
3799e8d8bef9SDimitry Andric   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3800e8d8bef9SDimitry Andric   R = B.buildFMA(ResTy, Tmp0, R, R);
3801e8d8bef9SDimitry Andric 
3802e8d8bef9SDimitry Andric   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3803e8d8bef9SDimitry Andric   R = B.buildFMA(ResTy, Tmp1, R, R);
3804e8d8bef9SDimitry Andric 
3805e8d8bef9SDimitry Andric   auto Ret = B.buildFMul(ResTy, X, R);
3806e8d8bef9SDimitry Andric   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3807e8d8bef9SDimitry Andric 
3808e8d8bef9SDimitry Andric   B.buildFMA(Res, Tmp2, R, Ret);
3809e8d8bef9SDimitry Andric   MI.eraseFromParent();
3810e8d8bef9SDimitry Andric   return true;
38118bcb0991SDimitry Andric }
38128bcb0991SDimitry Andric 
3813480093f4SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3814480093f4SDimitry Andric                                          MachineRegisterInfo &MRI,
3815480093f4SDimitry Andric                                          MachineIRBuilder &B) const {
3816e8d8bef9SDimitry Andric   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3817e8d8bef9SDimitry Andric     return true;
3818e8d8bef9SDimitry Andric 
3819480093f4SDimitry Andric   Register Res = MI.getOperand(0).getReg();
3820480093f4SDimitry Andric   Register LHS = MI.getOperand(1).getReg();
3821480093f4SDimitry Andric   Register RHS = MI.getOperand(2).getReg();
3822480093f4SDimitry Andric 
3823480093f4SDimitry Andric   uint16_t Flags = MI.getFlags();
3824480093f4SDimitry Andric 
3825480093f4SDimitry Andric   LLT S16 = LLT::scalar(16);
3826480093f4SDimitry Andric   LLT S32 = LLT::scalar(32);
3827480093f4SDimitry Andric 
3828480093f4SDimitry Andric   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3829480093f4SDimitry Andric   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3830480093f4SDimitry Andric 
3831480093f4SDimitry Andric   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3832480093f4SDimitry Andric     .addUse(RHSExt.getReg(0))
3833480093f4SDimitry Andric     .setMIFlags(Flags);
3834480093f4SDimitry Andric 
3835480093f4SDimitry Andric   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3836480093f4SDimitry Andric   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3837480093f4SDimitry Andric 
3838480093f4SDimitry Andric   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3839480093f4SDimitry Andric     .addUse(RDst.getReg(0))
3840480093f4SDimitry Andric     .addUse(RHS)
3841480093f4SDimitry Andric     .addUse(LHS)
3842480093f4SDimitry Andric     .setMIFlags(Flags);
3843480093f4SDimitry Andric 
3844480093f4SDimitry Andric   MI.eraseFromParent();
3845480093f4SDimitry Andric   return true;
3846480093f4SDimitry Andric }
3847480093f4SDimitry Andric 
3848480093f4SDimitry Andric // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3849480093f4SDimitry Andric // to enable denorm mode. When 'Enable' is false, disable denorm mode.
3850480093f4SDimitry Andric static void toggleSPDenormMode(bool Enable,
3851480093f4SDimitry Andric                                MachineIRBuilder &B,
3852480093f4SDimitry Andric                                const GCNSubtarget &ST,
3853480093f4SDimitry Andric                                AMDGPU::SIModeRegisterDefaults Mode) {
3854480093f4SDimitry Andric   // Set SP denorm mode to this value.
3855480093f4SDimitry Andric   unsigned SPDenormMode =
38565ffd83dbSDimitry Andric     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3857480093f4SDimitry Andric 
3858480093f4SDimitry Andric   if (ST.hasDenormModeInst()) {
3859480093f4SDimitry Andric     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
38605ffd83dbSDimitry Andric     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3861480093f4SDimitry Andric 
38625ffd83dbSDimitry Andric     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3863480093f4SDimitry Andric     B.buildInstr(AMDGPU::S_DENORM_MODE)
3864480093f4SDimitry Andric       .addImm(NewDenormModeValue);
3865480093f4SDimitry Andric 
3866480093f4SDimitry Andric   } else {
3867480093f4SDimitry Andric     // Select FP32 bit field in mode register.
3868480093f4SDimitry Andric     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3869480093f4SDimitry Andric                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3870480093f4SDimitry Andric                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3871480093f4SDimitry Andric 
3872480093f4SDimitry Andric     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3873480093f4SDimitry Andric       .addImm(SPDenormMode)
3874480093f4SDimitry Andric       .addImm(SPDenormModeBitField);
3875480093f4SDimitry Andric   }
3876480093f4SDimitry Andric }
3877480093f4SDimitry Andric 
3878480093f4SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
3879480093f4SDimitry Andric                                          MachineRegisterInfo &MRI,
3880480093f4SDimitry Andric                                          MachineIRBuilder &B) const {
3881e8d8bef9SDimitry Andric   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3882e8d8bef9SDimitry Andric     return true;
3883e8d8bef9SDimitry Andric 
3884480093f4SDimitry Andric   Register Res = MI.getOperand(0).getReg();
3885480093f4SDimitry Andric   Register LHS = MI.getOperand(1).getReg();
3886480093f4SDimitry Andric   Register RHS = MI.getOperand(2).getReg();
3887480093f4SDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3888480093f4SDimitry Andric   AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
3889480093f4SDimitry Andric 
3890480093f4SDimitry Andric   uint16_t Flags = MI.getFlags();
3891480093f4SDimitry Andric 
3892480093f4SDimitry Andric   LLT S32 = LLT::scalar(32);
3893480093f4SDimitry Andric   LLT S1 = LLT::scalar(1);
3894480093f4SDimitry Andric 
3895480093f4SDimitry Andric   auto One = B.buildFConstant(S32, 1.0f);
3896480093f4SDimitry Andric 
3897480093f4SDimitry Andric   auto DenominatorScaled =
3898480093f4SDimitry Andric     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3899480093f4SDimitry Andric       .addUse(LHS)
39005ffd83dbSDimitry Andric       .addUse(RHS)
39015ffd83dbSDimitry Andric       .addImm(0)
3902480093f4SDimitry Andric       .setMIFlags(Flags);
3903480093f4SDimitry Andric   auto NumeratorScaled =
3904480093f4SDimitry Andric     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
3905480093f4SDimitry Andric       .addUse(LHS)
3906480093f4SDimitry Andric       .addUse(RHS)
39075ffd83dbSDimitry Andric       .addImm(1)
3908480093f4SDimitry Andric       .setMIFlags(Flags);
3909480093f4SDimitry Andric 
3910480093f4SDimitry Andric   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3911480093f4SDimitry Andric     .addUse(DenominatorScaled.getReg(0))
3912480093f4SDimitry Andric     .setMIFlags(Flags);
3913480093f4SDimitry Andric   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
3914480093f4SDimitry Andric 
3915480093f4SDimitry Andric   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
3916480093f4SDimitry Andric   // aren't modeled as reading it.
39175ffd83dbSDimitry Andric   if (!Mode.allFP32Denormals())
3918480093f4SDimitry Andric     toggleSPDenormMode(true, B, ST, Mode);
3919480093f4SDimitry Andric 
3920480093f4SDimitry Andric   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
3921480093f4SDimitry Andric   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
3922480093f4SDimitry Andric   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
3923480093f4SDimitry Andric   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
3924480093f4SDimitry Andric   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
3925480093f4SDimitry Andric   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
3926480093f4SDimitry Andric 
39275ffd83dbSDimitry Andric   if (!Mode.allFP32Denormals())
3928480093f4SDimitry Andric     toggleSPDenormMode(false, B, ST, Mode);
3929480093f4SDimitry Andric 
3930480093f4SDimitry Andric   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
3931480093f4SDimitry Andric     .addUse(Fma4.getReg(0))
3932480093f4SDimitry Andric     .addUse(Fma1.getReg(0))
3933480093f4SDimitry Andric     .addUse(Fma3.getReg(0))
3934480093f4SDimitry Andric     .addUse(NumeratorScaled.getReg(1))
3935480093f4SDimitry Andric     .setMIFlags(Flags);
3936480093f4SDimitry Andric 
3937480093f4SDimitry Andric   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3938480093f4SDimitry Andric     .addUse(Fmas.getReg(0))
3939480093f4SDimitry Andric     .addUse(RHS)
3940480093f4SDimitry Andric     .addUse(LHS)
3941480093f4SDimitry Andric     .setMIFlags(Flags);
3942480093f4SDimitry Andric 
3943480093f4SDimitry Andric   MI.eraseFromParent();
3944480093f4SDimitry Andric   return true;
3945480093f4SDimitry Andric }
3946480093f4SDimitry Andric 
3947480093f4SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
3948480093f4SDimitry Andric                                          MachineRegisterInfo &MRI,
3949480093f4SDimitry Andric                                          MachineIRBuilder &B) const {
3950e8d8bef9SDimitry Andric   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3951e8d8bef9SDimitry Andric     return true;
3952e8d8bef9SDimitry Andric 
3953480093f4SDimitry Andric   Register Res = MI.getOperand(0).getReg();
3954480093f4SDimitry Andric   Register LHS = MI.getOperand(1).getReg();
3955480093f4SDimitry Andric   Register RHS = MI.getOperand(2).getReg();
3956480093f4SDimitry Andric 
3957480093f4SDimitry Andric   uint16_t Flags = MI.getFlags();
3958480093f4SDimitry Andric 
3959480093f4SDimitry Andric   LLT S64 = LLT::scalar(64);
3960480093f4SDimitry Andric   LLT S1 = LLT::scalar(1);
3961480093f4SDimitry Andric 
3962480093f4SDimitry Andric   auto One = B.buildFConstant(S64, 1.0);
3963480093f4SDimitry Andric 
3964480093f4SDimitry Andric   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3965480093f4SDimitry Andric     .addUse(LHS)
3966480093f4SDimitry Andric     .addUse(RHS)
39675ffd83dbSDimitry Andric     .addImm(0)
3968480093f4SDimitry Andric     .setMIFlags(Flags);
3969480093f4SDimitry Andric 
3970480093f4SDimitry Andric   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3971480093f4SDimitry Andric 
3972480093f4SDimitry Andric   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3973480093f4SDimitry Andric     .addUse(DivScale0.getReg(0))
3974480093f4SDimitry Andric     .setMIFlags(Flags);
3975480093f4SDimitry Andric 
3976480093f4SDimitry Andric   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3977480093f4SDimitry Andric   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3978480093f4SDimitry Andric   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3979480093f4SDimitry Andric 
3980480093f4SDimitry Andric   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3981480093f4SDimitry Andric     .addUse(LHS)
3982480093f4SDimitry Andric     .addUse(RHS)
39835ffd83dbSDimitry Andric     .addImm(1)
3984480093f4SDimitry Andric     .setMIFlags(Flags);
3985480093f4SDimitry Andric 
3986480093f4SDimitry Andric   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
39875ffd83dbSDimitry Andric   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3988480093f4SDimitry Andric   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3989480093f4SDimitry Andric 
3990480093f4SDimitry Andric   Register Scale;
3991480093f4SDimitry Andric   if (!ST.hasUsableDivScaleConditionOutput()) {
3992480093f4SDimitry Andric     // Workaround a hardware bug on SI where the condition output from div_scale
3993480093f4SDimitry Andric     // is not usable.
3994480093f4SDimitry Andric 
3995480093f4SDimitry Andric     LLT S32 = LLT::scalar(32);
3996480093f4SDimitry Andric 
3997480093f4SDimitry Andric     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3998480093f4SDimitry Andric     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3999480093f4SDimitry Andric     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
4000480093f4SDimitry Andric     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
4001480093f4SDimitry Andric 
4002480093f4SDimitry Andric     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
4003480093f4SDimitry Andric                               Scale1Unmerge.getReg(1));
4004480093f4SDimitry Andric     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
4005480093f4SDimitry Andric                               Scale0Unmerge.getReg(1));
40065ffd83dbSDimitry Andric     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
4007480093f4SDimitry Andric   } else {
4008480093f4SDimitry Andric     Scale = DivScale1.getReg(1);
4009480093f4SDimitry Andric   }
4010480093f4SDimitry Andric 
4011480093f4SDimitry Andric   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
4012480093f4SDimitry Andric     .addUse(Fma4.getReg(0))
4013480093f4SDimitry Andric     .addUse(Fma3.getReg(0))
4014480093f4SDimitry Andric     .addUse(Mul.getReg(0))
4015480093f4SDimitry Andric     .addUse(Scale)
4016480093f4SDimitry Andric     .setMIFlags(Flags);
4017480093f4SDimitry Andric 
4018*bdd1243dSDimitry Andric   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res), false)
4019480093f4SDimitry Andric       .addUse(Fmas.getReg(0))
4020480093f4SDimitry Andric       .addUse(RHS)
4021480093f4SDimitry Andric       .addUse(LHS)
4022480093f4SDimitry Andric       .setMIFlags(Flags);
4023480093f4SDimitry Andric 
4024480093f4SDimitry Andric   MI.eraseFromParent();
4025480093f4SDimitry Andric   return true;
4026480093f4SDimitry Andric }
4027480093f4SDimitry Andric 
40288bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
40298bcb0991SDimitry Andric                                                  MachineRegisterInfo &MRI,
40308bcb0991SDimitry Andric                                                  MachineIRBuilder &B) const {
40318bcb0991SDimitry Andric   Register Res = MI.getOperand(0).getReg();
40328bcb0991SDimitry Andric   Register LHS = MI.getOperand(2).getReg();
40338bcb0991SDimitry Andric   Register RHS = MI.getOperand(3).getReg();
40348bcb0991SDimitry Andric   uint16_t Flags = MI.getFlags();
40358bcb0991SDimitry Andric 
40368bcb0991SDimitry Andric   LLT S32 = LLT::scalar(32);
40378bcb0991SDimitry Andric   LLT S1 = LLT::scalar(1);
40388bcb0991SDimitry Andric 
40398bcb0991SDimitry Andric   auto Abs = B.buildFAbs(S32, RHS, Flags);
40408bcb0991SDimitry Andric   const APFloat C0Val(1.0f);
40418bcb0991SDimitry Andric 
40428bcb0991SDimitry Andric   auto C0 = B.buildConstant(S32, 0x6f800000);
40438bcb0991SDimitry Andric   auto C1 = B.buildConstant(S32, 0x2f800000);
40448bcb0991SDimitry Andric   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
40458bcb0991SDimitry Andric 
40468bcb0991SDimitry Andric   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
40478bcb0991SDimitry Andric   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
40488bcb0991SDimitry Andric 
40498bcb0991SDimitry Andric   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
40508bcb0991SDimitry Andric 
40518bcb0991SDimitry Andric   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
40528bcb0991SDimitry Andric     .addUse(Mul0.getReg(0))
40538bcb0991SDimitry Andric     .setMIFlags(Flags);
40548bcb0991SDimitry Andric 
40558bcb0991SDimitry Andric   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
40568bcb0991SDimitry Andric 
40578bcb0991SDimitry Andric   B.buildFMul(Res, Sel, Mul1, Flags);
40588bcb0991SDimitry Andric 
40598bcb0991SDimitry Andric   MI.eraseFromParent();
40608bcb0991SDimitry Andric   return true;
40618bcb0991SDimitry Andric }
40628bcb0991SDimitry Andric 
4063e8d8bef9SDimitry Andric // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
4064e8d8bef9SDimitry Andric // FIXME: Why do we handle this one but not other removed instructions?
4065e8d8bef9SDimitry Andric //
4066e8d8bef9SDimitry Andric // Reciprocal square root.  The clamp prevents infinite results, clamping
4067e8d8bef9SDimitry Andric // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
4068e8d8bef9SDimitry Andric // +-max_float.
4069e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
4070e8d8bef9SDimitry Andric                                                     MachineRegisterInfo &MRI,
4071e8d8bef9SDimitry Andric                                                     MachineIRBuilder &B) const {
4072e8d8bef9SDimitry Andric   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
4073e8d8bef9SDimitry Andric     return true;
4074e8d8bef9SDimitry Andric 
4075e8d8bef9SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
4076e8d8bef9SDimitry Andric   Register Src = MI.getOperand(2).getReg();
4077e8d8bef9SDimitry Andric   auto Flags = MI.getFlags();
4078e8d8bef9SDimitry Andric 
4079e8d8bef9SDimitry Andric   LLT Ty = MRI.getType(Dst);
4080e8d8bef9SDimitry Andric 
4081e8d8bef9SDimitry Andric   const fltSemantics *FltSemantics;
4082e8d8bef9SDimitry Andric   if (Ty == LLT::scalar(32))
4083e8d8bef9SDimitry Andric     FltSemantics = &APFloat::IEEEsingle();
4084e8d8bef9SDimitry Andric   else if (Ty == LLT::scalar(64))
4085e8d8bef9SDimitry Andric     FltSemantics = &APFloat::IEEEdouble();
4086e8d8bef9SDimitry Andric   else
4087e8d8bef9SDimitry Andric     return false;
4088e8d8bef9SDimitry Andric 
4089e8d8bef9SDimitry Andric   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
4090e8d8bef9SDimitry Andric     .addUse(Src)
4091e8d8bef9SDimitry Andric     .setMIFlags(Flags);
4092e8d8bef9SDimitry Andric 
4093e8d8bef9SDimitry Andric   // We don't need to concern ourselves with the snan handling difference, since
4094e8d8bef9SDimitry Andric   // the rsq quieted (or not) so use the one which will directly select.
4095e8d8bef9SDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4096e8d8bef9SDimitry Andric   const bool UseIEEE = MFI->getMode().IEEE;
4097e8d8bef9SDimitry Andric 
4098e8d8bef9SDimitry Andric   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
4099e8d8bef9SDimitry Andric   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
4100e8d8bef9SDimitry Andric                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
4101e8d8bef9SDimitry Andric 
4102e8d8bef9SDimitry Andric   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
4103e8d8bef9SDimitry Andric 
4104e8d8bef9SDimitry Andric   if (UseIEEE)
4105e8d8bef9SDimitry Andric     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
4106e8d8bef9SDimitry Andric   else
4107e8d8bef9SDimitry Andric     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
4108e8d8bef9SDimitry Andric   MI.eraseFromParent();
4109e8d8bef9SDimitry Andric   return true;
4110e8d8bef9SDimitry Andric }
4111e8d8bef9SDimitry Andric 
4112e8d8bef9SDimitry Andric static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
4113e8d8bef9SDimitry Andric   switch (IID) {
4114e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_ds_fadd:
4115e8d8bef9SDimitry Andric     return AMDGPU::G_ATOMICRMW_FADD;
4116e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_ds_fmin:
4117e8d8bef9SDimitry Andric     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
4118e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_ds_fmax:
4119e8d8bef9SDimitry Andric     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
4120e8d8bef9SDimitry Andric   default:
4121e8d8bef9SDimitry Andric     llvm_unreachable("not a DS FP intrinsic");
4122e8d8bef9SDimitry Andric   }
4123e8d8bef9SDimitry Andric }
4124e8d8bef9SDimitry Andric 
4125e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
4126e8d8bef9SDimitry Andric                                                       MachineInstr &MI,
4127e8d8bef9SDimitry Andric                                                       Intrinsic::ID IID) const {
4128e8d8bef9SDimitry Andric   GISelChangeObserver &Observer = Helper.Observer;
4129e8d8bef9SDimitry Andric   Observer.changingInstr(MI);
4130e8d8bef9SDimitry Andric 
4131e8d8bef9SDimitry Andric   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
4132e8d8bef9SDimitry Andric 
4133e8d8bef9SDimitry Andric   // The remaining operands were used to set fields in the MemOperand on
4134e8d8bef9SDimitry Andric   // construction.
4135e8d8bef9SDimitry Andric   for (int I = 6; I > 3; --I)
413681ad6265SDimitry Andric     MI.removeOperand(I);
4137e8d8bef9SDimitry Andric 
413881ad6265SDimitry Andric   MI.removeOperand(1); // Remove the intrinsic ID.
4139e8d8bef9SDimitry Andric   Observer.changedInstr(MI);
4140e8d8bef9SDimitry Andric   return true;
4141e8d8bef9SDimitry Andric }
4142e8d8bef9SDimitry Andric 
4143e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
4144e8d8bef9SDimitry Andric                                             MachineRegisterInfo &MRI,
4145e8d8bef9SDimitry Andric                                             MachineIRBuilder &B) const {
4146e8d8bef9SDimitry Andric   uint64_t Offset =
4147e8d8bef9SDimitry Andric     ST.getTargetLowering()->getImplicitParameterOffset(
4148e8d8bef9SDimitry Andric       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
4149e8d8bef9SDimitry Andric   LLT DstTy = MRI.getType(DstReg);
4150e8d8bef9SDimitry Andric   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
4151e8d8bef9SDimitry Andric 
4152e8d8bef9SDimitry Andric   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
4153e8d8bef9SDimitry Andric   if (!loadInputValue(KernargPtrReg, B,
4154e8d8bef9SDimitry Andric                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
4155e8d8bef9SDimitry Andric     return false;
4156e8d8bef9SDimitry Andric 
4157e8d8bef9SDimitry Andric   // FIXME: This should be nuw
4158e8d8bef9SDimitry Andric   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
4159e8d8bef9SDimitry Andric   return true;
4160e8d8bef9SDimitry Andric }
4161e8d8bef9SDimitry Andric 
41620b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
41630b57cec5SDimitry Andric                                                  MachineRegisterInfo &MRI,
41640b57cec5SDimitry Andric                                                  MachineIRBuilder &B) const {
41650b57cec5SDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
41660b57cec5SDimitry Andric   if (!MFI->isEntryFunction()) {
41670b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
41680b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
41690b57cec5SDimitry Andric   }
41700b57cec5SDimitry Andric 
41710b57cec5SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
4172e8d8bef9SDimitry Andric   if (!getImplicitArgPtr(DstReg, MRI, B))
41730b57cec5SDimitry Andric     return false;
41740b57cec5SDimitry Andric 
41750b57cec5SDimitry Andric   MI.eraseFromParent();
41760b57cec5SDimitry Andric   return true;
41770b57cec5SDimitry Andric }
41780b57cec5SDimitry Andric 
4179fcaf7f86SDimitry Andric bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg,
4180fcaf7f86SDimitry Andric                                          MachineRegisterInfo &MRI,
4181fcaf7f86SDimitry Andric                                          MachineIRBuilder &B) const {
4182fcaf7f86SDimitry Andric   Function &F = B.getMF().getFunction();
4183*bdd1243dSDimitry Andric   std::optional<uint32_t> KnownSize =
4184fcaf7f86SDimitry Andric       AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
4185fcaf7f86SDimitry Andric   if (KnownSize.has_value())
4186*bdd1243dSDimitry Andric     B.buildConstant(DstReg, *KnownSize);
4187fcaf7f86SDimitry Andric   return false;
4188fcaf7f86SDimitry Andric }
4189fcaf7f86SDimitry Andric 
4190fcaf7f86SDimitry Andric bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI,
4191fcaf7f86SDimitry Andric                                               MachineRegisterInfo &MRI,
4192fcaf7f86SDimitry Andric                                               MachineIRBuilder &B) const {
4193fcaf7f86SDimitry Andric 
4194fcaf7f86SDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4195fcaf7f86SDimitry Andric   if (!MFI->isEntryFunction()) {
4196fcaf7f86SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
4197fcaf7f86SDimitry Andric                                       AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
4198fcaf7f86SDimitry Andric   }
4199fcaf7f86SDimitry Andric 
4200fcaf7f86SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
4201fcaf7f86SDimitry Andric   if (!getLDSKernelId(DstReg, MRI, B))
4202fcaf7f86SDimitry Andric     return false;
4203fcaf7f86SDimitry Andric 
4204fcaf7f86SDimitry Andric   MI.eraseFromParent();
4205fcaf7f86SDimitry Andric   return true;
4206fcaf7f86SDimitry Andric }
4207fcaf7f86SDimitry Andric 
42088bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
42098bcb0991SDimitry Andric                                               MachineRegisterInfo &MRI,
42108bcb0991SDimitry Andric                                               MachineIRBuilder &B,
42118bcb0991SDimitry Andric                                               unsigned AddrSpace) const {
42128bcb0991SDimitry Andric   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
4213e8d8bef9SDimitry Andric   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
4214e8d8bef9SDimitry Andric   Register Hi32 = Unmerge.getReg(1);
4215e8d8bef9SDimitry Andric 
42168bcb0991SDimitry Andric   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
42178bcb0991SDimitry Andric   MI.eraseFromParent();
42188bcb0991SDimitry Andric   return true;
42198bcb0991SDimitry Andric }
42208bcb0991SDimitry Andric 
42215ffd83dbSDimitry Andric // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
42225ffd83dbSDimitry Andric // offset (the offset that is included in bounds checking and swizzling, to be
42235ffd83dbSDimitry Andric // split between the instruction's voffset and immoffset fields) and soffset
42245ffd83dbSDimitry Andric // (the offset that is excluded from bounds checking and swizzling, to go in
42255ffd83dbSDimitry Andric // the instruction's soffset field).  This function takes the first kind of
42265ffd83dbSDimitry Andric // offset and figures out how to split it between voffset and immoffset.
4227fe6060f1SDimitry Andric std::pair<Register, unsigned>
42285ffd83dbSDimitry Andric AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
42295ffd83dbSDimitry Andric                                         Register OrigOffset) const {
42305ffd83dbSDimitry Andric   const unsigned MaxImm = 4095;
42315ffd83dbSDimitry Andric   Register BaseReg;
4232fe6060f1SDimitry Andric   unsigned ImmOffset;
42335ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
4234fe6060f1SDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
42355ffd83dbSDimitry Andric 
4236fe6060f1SDimitry Andric   std::tie(BaseReg, ImmOffset) =
4237fe6060f1SDimitry Andric       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
42385ffd83dbSDimitry Andric 
4239fe6060f1SDimitry Andric   // If BaseReg is a pointer, convert it to int.
4240fe6060f1SDimitry Andric   if (MRI.getType(BaseReg).isPointer())
4241fe6060f1SDimitry Andric     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
42425ffd83dbSDimitry Andric 
42435ffd83dbSDimitry Andric   // If the immediate value is too big for the immoffset field, put the value
42445ffd83dbSDimitry Andric   // and -4096 into the immoffset field so that the value that is copied/added
42455ffd83dbSDimitry Andric   // for the voffset field is a multiple of 4096, and it stands more chance
42465ffd83dbSDimitry Andric   // of being CSEd with the copy/add for another similar load/store.
42475ffd83dbSDimitry Andric   // However, do not do that rounding down to a multiple of 4096 if that is a
42485ffd83dbSDimitry Andric   // negative number, as it appears to be illegal to have a negative offset
42495ffd83dbSDimitry Andric   // in the vgpr, even if adding the immediate offset makes it positive.
42505ffd83dbSDimitry Andric   unsigned Overflow = ImmOffset & ~MaxImm;
42515ffd83dbSDimitry Andric   ImmOffset -= Overflow;
42525ffd83dbSDimitry Andric   if ((int32_t)Overflow < 0) {
42535ffd83dbSDimitry Andric     Overflow += ImmOffset;
42545ffd83dbSDimitry Andric     ImmOffset = 0;
42555ffd83dbSDimitry Andric   }
42565ffd83dbSDimitry Andric 
42575ffd83dbSDimitry Andric   if (Overflow != 0) {
42585ffd83dbSDimitry Andric     if (!BaseReg) {
42595ffd83dbSDimitry Andric       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
42605ffd83dbSDimitry Andric     } else {
42615ffd83dbSDimitry Andric       auto OverflowVal = B.buildConstant(S32, Overflow);
42625ffd83dbSDimitry Andric       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
42635ffd83dbSDimitry Andric     }
42645ffd83dbSDimitry Andric   }
42655ffd83dbSDimitry Andric 
42665ffd83dbSDimitry Andric   if (!BaseReg)
42675ffd83dbSDimitry Andric     BaseReg = B.buildConstant(S32, 0).getReg(0);
42685ffd83dbSDimitry Andric 
4269*bdd1243dSDimitry Andric   return std::pair(BaseReg, ImmOffset);
4270fe6060f1SDimitry Andric }
4271fe6060f1SDimitry Andric 
4272fe6060f1SDimitry Andric /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
4273fe6060f1SDimitry Andric void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
4274fe6060f1SDimitry Andric                                           Register VOffset, Register SOffset,
4275fe6060f1SDimitry Andric                                           unsigned ImmOffset, Register VIndex,
4276fe6060f1SDimitry Andric                                           MachineRegisterInfo &MRI) const {
4277*bdd1243dSDimitry Andric   std::optional<ValueAndVReg> MaybeVOffsetVal =
4278349cc55cSDimitry Andric       getIConstantVRegValWithLookThrough(VOffset, MRI);
4279*bdd1243dSDimitry Andric   std::optional<ValueAndVReg> MaybeSOffsetVal =
4280349cc55cSDimitry Andric       getIConstantVRegValWithLookThrough(SOffset, MRI);
4281*bdd1243dSDimitry Andric   std::optional<ValueAndVReg> MaybeVIndexVal =
4282349cc55cSDimitry Andric       getIConstantVRegValWithLookThrough(VIndex, MRI);
4283fe6060f1SDimitry Andric   // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
4284fe6060f1SDimitry Andric   // update the MMO with that offset. The stride is unknown so we can only do
4285fe6060f1SDimitry Andric   // this if VIndex is constant 0.
4286fe6060f1SDimitry Andric   if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
4287fe6060f1SDimitry Andric       MaybeVIndexVal->Value == 0) {
4288fe6060f1SDimitry Andric     uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
4289fe6060f1SDimitry Andric                            MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
4290fe6060f1SDimitry Andric     MMO->setOffset(TotalOffset);
4291fe6060f1SDimitry Andric   } else {
4292fe6060f1SDimitry Andric     // We don't have a constant combined offset to use in the MMO. Give up.
4293fe6060f1SDimitry Andric     MMO->setValue((Value *)nullptr);
4294fe6060f1SDimitry Andric   }
42955ffd83dbSDimitry Andric }
42965ffd83dbSDimitry Andric 
42978bcb0991SDimitry Andric /// Handle register layout difference for f16 images for some subtargets.
42988bcb0991SDimitry Andric Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
42998bcb0991SDimitry Andric                                              MachineRegisterInfo &MRI,
4300e8d8bef9SDimitry Andric                                              Register Reg,
4301e8d8bef9SDimitry Andric                                              bool ImageStore) const {
43028bcb0991SDimitry Andric   const LLT S16 = LLT::scalar(16);
43038bcb0991SDimitry Andric   const LLT S32 = LLT::scalar(32);
43048bcb0991SDimitry Andric   LLT StoreVT = MRI.getType(Reg);
43058bcb0991SDimitry Andric   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
43068bcb0991SDimitry Andric 
4307e8d8bef9SDimitry Andric   if (ST.hasUnpackedD16VMem()) {
43088bcb0991SDimitry Andric     auto Unmerge = B.buildUnmerge(S16, Reg);
43098bcb0991SDimitry Andric 
43108bcb0991SDimitry Andric     SmallVector<Register, 4> WideRegs;
43118bcb0991SDimitry Andric     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
43128bcb0991SDimitry Andric       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
43138bcb0991SDimitry Andric 
43148bcb0991SDimitry Andric     int NumElts = StoreVT.getNumElements();
43158bcb0991SDimitry Andric 
4316fe6060f1SDimitry Andric     return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
4317fe6060f1SDimitry Andric         .getReg(0);
43188bcb0991SDimitry Andric   }
43198bcb0991SDimitry Andric 
4320e8d8bef9SDimitry Andric   if (ImageStore && ST.hasImageStoreD16Bug()) {
4321e8d8bef9SDimitry Andric     if (StoreVT.getNumElements() == 2) {
4322e8d8bef9SDimitry Andric       SmallVector<Register, 4> PackedRegs;
4323e8d8bef9SDimitry Andric       Reg = B.buildBitcast(S32, Reg).getReg(0);
4324e8d8bef9SDimitry Andric       PackedRegs.push_back(Reg);
4325e8d8bef9SDimitry Andric       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
4326fe6060f1SDimitry Andric       return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
4327fe6060f1SDimitry Andric           .getReg(0);
4328e8d8bef9SDimitry Andric     }
4329e8d8bef9SDimitry Andric 
4330e8d8bef9SDimitry Andric     if (StoreVT.getNumElements() == 3) {
4331e8d8bef9SDimitry Andric       SmallVector<Register, 4> PackedRegs;
4332e8d8bef9SDimitry Andric       auto Unmerge = B.buildUnmerge(S16, Reg);
4333e8d8bef9SDimitry Andric       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
4334e8d8bef9SDimitry Andric         PackedRegs.push_back(Unmerge.getReg(I));
4335e8d8bef9SDimitry Andric       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
4336fe6060f1SDimitry Andric       Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
4337fe6060f1SDimitry Andric       return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
4338e8d8bef9SDimitry Andric     }
4339e8d8bef9SDimitry Andric 
4340e8d8bef9SDimitry Andric     if (StoreVT.getNumElements() == 4) {
4341e8d8bef9SDimitry Andric       SmallVector<Register, 4> PackedRegs;
4342fe6060f1SDimitry Andric       Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
4343e8d8bef9SDimitry Andric       auto Unmerge = B.buildUnmerge(S32, Reg);
4344e8d8bef9SDimitry Andric       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
4345e8d8bef9SDimitry Andric         PackedRegs.push_back(Unmerge.getReg(I));
4346e8d8bef9SDimitry Andric       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
4347fe6060f1SDimitry Andric       return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
4348fe6060f1SDimitry Andric           .getReg(0);
4349e8d8bef9SDimitry Andric     }
4350e8d8bef9SDimitry Andric 
4351e8d8bef9SDimitry Andric     llvm_unreachable("invalid data type");
4352e8d8bef9SDimitry Andric   }
4353e8d8bef9SDimitry Andric 
43540eae32dcSDimitry Andric   if (StoreVT == LLT::fixed_vector(3, S16)) {
43550eae32dcSDimitry Andric     Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
43560eae32dcSDimitry Andric               .getReg(0);
43570eae32dcSDimitry Andric   }
4358e8d8bef9SDimitry Andric   return Reg;
4359e8d8bef9SDimitry Andric }
4360e8d8bef9SDimitry Andric 
43615ffd83dbSDimitry Andric Register AMDGPULegalizerInfo::fixStoreSourceType(
43625ffd83dbSDimitry Andric   MachineIRBuilder &B, Register VData, bool IsFormat) const {
43635ffd83dbSDimitry Andric   MachineRegisterInfo *MRI = B.getMRI();
43645ffd83dbSDimitry Andric   LLT Ty = MRI->getType(VData);
43658bcb0991SDimitry Andric 
43668bcb0991SDimitry Andric   const LLT S16 = LLT::scalar(16);
43678bcb0991SDimitry Andric 
43688bcb0991SDimitry Andric   // Fixup illegal register types for i8 stores.
43698bcb0991SDimitry Andric   if (Ty == LLT::scalar(8) || Ty == S16) {
43708bcb0991SDimitry Andric     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
43715ffd83dbSDimitry Andric     return AnyExt;
43728bcb0991SDimitry Andric   }
43738bcb0991SDimitry Andric 
43748bcb0991SDimitry Andric   if (Ty.isVector()) {
43758bcb0991SDimitry Andric     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
43768bcb0991SDimitry Andric       if (IsFormat)
43775ffd83dbSDimitry Andric         return handleD16VData(B, *MRI, VData);
43785ffd83dbSDimitry Andric     }
43795ffd83dbSDimitry Andric   }
43805ffd83dbSDimitry Andric 
43815ffd83dbSDimitry Andric   return VData;
43825ffd83dbSDimitry Andric }
43835ffd83dbSDimitry Andric 
43845ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
43855ffd83dbSDimitry Andric                                               MachineRegisterInfo &MRI,
43865ffd83dbSDimitry Andric                                               MachineIRBuilder &B,
43875ffd83dbSDimitry Andric                                               bool IsTyped,
43885ffd83dbSDimitry Andric                                               bool IsFormat) const {
43895ffd83dbSDimitry Andric   Register VData = MI.getOperand(1).getReg();
43905ffd83dbSDimitry Andric   LLT Ty = MRI.getType(VData);
43915ffd83dbSDimitry Andric   LLT EltTy = Ty.getScalarType();
43925ffd83dbSDimitry Andric   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
43935ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
43945ffd83dbSDimitry Andric 
43955ffd83dbSDimitry Andric   VData = fixStoreSourceType(B, VData, IsFormat);
43965ffd83dbSDimitry Andric   Register RSrc = MI.getOperand(2).getReg();
43975ffd83dbSDimitry Andric 
43985ffd83dbSDimitry Andric   MachineMemOperand *MMO = *MI.memoperands_begin();
43995ffd83dbSDimitry Andric   const int MemSize = MMO->getSize();
44005ffd83dbSDimitry Andric 
44015ffd83dbSDimitry Andric   unsigned ImmOffset;
44025ffd83dbSDimitry Andric 
44035ffd83dbSDimitry Andric   // The typed intrinsics add an immediate after the registers.
44045ffd83dbSDimitry Andric   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
44055ffd83dbSDimitry Andric 
44065ffd83dbSDimitry Andric   // The struct intrinsic variants add one additional operand over raw.
44075ffd83dbSDimitry Andric   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
44085ffd83dbSDimitry Andric   Register VIndex;
44095ffd83dbSDimitry Andric   int OpOffset = 0;
44105ffd83dbSDimitry Andric   if (HasVIndex) {
44115ffd83dbSDimitry Andric     VIndex = MI.getOperand(3).getReg();
44125ffd83dbSDimitry Andric     OpOffset = 1;
4413fe6060f1SDimitry Andric   } else {
4414fe6060f1SDimitry Andric     VIndex = B.buildConstant(S32, 0).getReg(0);
44155ffd83dbSDimitry Andric   }
44165ffd83dbSDimitry Andric 
44175ffd83dbSDimitry Andric   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
44185ffd83dbSDimitry Andric   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
44195ffd83dbSDimitry Andric 
44205ffd83dbSDimitry Andric   unsigned Format = 0;
44215ffd83dbSDimitry Andric   if (IsTyped) {
44225ffd83dbSDimitry Andric     Format = MI.getOperand(5 + OpOffset).getImm();
44235ffd83dbSDimitry Andric     ++OpOffset;
44245ffd83dbSDimitry Andric   }
44255ffd83dbSDimitry Andric 
44265ffd83dbSDimitry Andric   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
44275ffd83dbSDimitry Andric 
4428fe6060f1SDimitry Andric   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4429fe6060f1SDimitry Andric   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
44305ffd83dbSDimitry Andric 
44315ffd83dbSDimitry Andric   unsigned Opc;
44325ffd83dbSDimitry Andric   if (IsTyped) {
44335ffd83dbSDimitry Andric     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
44345ffd83dbSDimitry Andric                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
44355ffd83dbSDimitry Andric   } else if (IsFormat) {
44365ffd83dbSDimitry Andric     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
44375ffd83dbSDimitry Andric                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
44385ffd83dbSDimitry Andric   } else {
44395ffd83dbSDimitry Andric     switch (MemSize) {
44405ffd83dbSDimitry Andric     case 1:
44415ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
44425ffd83dbSDimitry Andric       break;
44435ffd83dbSDimitry Andric     case 2:
44445ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
44455ffd83dbSDimitry Andric       break;
44465ffd83dbSDimitry Andric     default:
44475ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
44485ffd83dbSDimitry Andric       break;
44495ffd83dbSDimitry Andric     }
44505ffd83dbSDimitry Andric   }
44515ffd83dbSDimitry Andric 
44525ffd83dbSDimitry Andric   auto MIB = B.buildInstr(Opc)
44535ffd83dbSDimitry Andric     .addUse(VData)              // vdata
44545ffd83dbSDimitry Andric     .addUse(RSrc)               // rsrc
44555ffd83dbSDimitry Andric     .addUse(VIndex)             // vindex
44565ffd83dbSDimitry Andric     .addUse(VOffset)            // voffset
44575ffd83dbSDimitry Andric     .addUse(SOffset)            // soffset
44585ffd83dbSDimitry Andric     .addImm(ImmOffset);         // offset(imm)
44595ffd83dbSDimitry Andric 
44605ffd83dbSDimitry Andric   if (IsTyped)
44615ffd83dbSDimitry Andric     MIB.addImm(Format);
44625ffd83dbSDimitry Andric 
44635ffd83dbSDimitry Andric   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
44645ffd83dbSDimitry Andric      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
44655ffd83dbSDimitry Andric      .addMemOperand(MMO);
44665ffd83dbSDimitry Andric 
44675ffd83dbSDimitry Andric   MI.eraseFromParent();
44688bcb0991SDimitry Andric   return true;
44698bcb0991SDimitry Andric }
44708bcb0991SDimitry Andric 
4471*bdd1243dSDimitry Andric static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
4472*bdd1243dSDimitry Andric                             Register VIndex, Register VOffset, Register SOffset,
4473*bdd1243dSDimitry Andric                             unsigned ImmOffset, unsigned Format,
4474*bdd1243dSDimitry Andric                             unsigned AuxiliaryData, MachineMemOperand *MMO,
4475*bdd1243dSDimitry Andric                             bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
4476*bdd1243dSDimitry Andric   auto MIB = B.buildInstr(Opc)
4477*bdd1243dSDimitry Andric                  .addDef(LoadDstReg) // vdata
4478*bdd1243dSDimitry Andric                  .addUse(RSrc)       // rsrc
4479*bdd1243dSDimitry Andric                  .addUse(VIndex)     // vindex
4480*bdd1243dSDimitry Andric                  .addUse(VOffset)    // voffset
4481*bdd1243dSDimitry Andric                  .addUse(SOffset)    // soffset
4482*bdd1243dSDimitry Andric                  .addImm(ImmOffset); // offset(imm)
4483*bdd1243dSDimitry Andric 
4484*bdd1243dSDimitry Andric   if (IsTyped)
4485*bdd1243dSDimitry Andric     MIB.addImm(Format);
4486*bdd1243dSDimitry Andric 
4487*bdd1243dSDimitry Andric   MIB.addImm(AuxiliaryData)       // cachepolicy, swizzled buffer(imm)
4488*bdd1243dSDimitry Andric       .addImm(HasVIndex ? -1 : 0) // idxen(imm)
4489*bdd1243dSDimitry Andric       .addMemOperand(MMO);
4490*bdd1243dSDimitry Andric }
4491*bdd1243dSDimitry Andric 
44925ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
44935ffd83dbSDimitry Andric                                              MachineRegisterInfo &MRI,
44945ffd83dbSDimitry Andric                                              MachineIRBuilder &B,
44955ffd83dbSDimitry Andric                                              bool IsFormat,
44965ffd83dbSDimitry Andric                                              bool IsTyped) const {
44975ffd83dbSDimitry Andric   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
44985ffd83dbSDimitry Andric   MachineMemOperand *MMO = *MI.memoperands_begin();
4499fe6060f1SDimitry Andric   const LLT MemTy = MMO->getMemoryType();
45005ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
45015ffd83dbSDimitry Andric 
45025ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
4503*bdd1243dSDimitry Andric 
4504*bdd1243dSDimitry Andric   Register StatusDst;
4505*bdd1243dSDimitry Andric   int OpOffset = 0;
4506*bdd1243dSDimitry Andric   assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
4507*bdd1243dSDimitry Andric   bool IsTFE = MI.getNumExplicitDefs() == 2;
4508*bdd1243dSDimitry Andric   if (IsTFE) {
4509*bdd1243dSDimitry Andric     StatusDst = MI.getOperand(1).getReg();
4510*bdd1243dSDimitry Andric     ++OpOffset;
4511*bdd1243dSDimitry Andric   }
4512*bdd1243dSDimitry Andric 
4513*bdd1243dSDimitry Andric   Register RSrc = MI.getOperand(2 + OpOffset).getReg();
45145ffd83dbSDimitry Andric 
45155ffd83dbSDimitry Andric   // The typed intrinsics add an immediate after the registers.
45165ffd83dbSDimitry Andric   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
45175ffd83dbSDimitry Andric 
45185ffd83dbSDimitry Andric   // The struct intrinsic variants add one additional operand over raw.
4519*bdd1243dSDimitry Andric   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
45205ffd83dbSDimitry Andric   Register VIndex;
45215ffd83dbSDimitry Andric   if (HasVIndex) {
4522*bdd1243dSDimitry Andric     VIndex = MI.getOperand(3 + OpOffset).getReg();
4523*bdd1243dSDimitry Andric     ++OpOffset;
4524fe6060f1SDimitry Andric   } else {
4525fe6060f1SDimitry Andric     VIndex = B.buildConstant(S32, 0).getReg(0);
45268bcb0991SDimitry Andric   }
45278bcb0991SDimitry Andric 
45285ffd83dbSDimitry Andric   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
45295ffd83dbSDimitry Andric   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
45305ffd83dbSDimitry Andric 
45315ffd83dbSDimitry Andric   unsigned Format = 0;
45325ffd83dbSDimitry Andric   if (IsTyped) {
45335ffd83dbSDimitry Andric     Format = MI.getOperand(5 + OpOffset).getImm();
45345ffd83dbSDimitry Andric     ++OpOffset;
45358bcb0991SDimitry Andric   }
45368bcb0991SDimitry Andric 
45375ffd83dbSDimitry Andric   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
45385ffd83dbSDimitry Andric   unsigned ImmOffset;
45395ffd83dbSDimitry Andric 
45405ffd83dbSDimitry Andric   LLT Ty = MRI.getType(Dst);
45415ffd83dbSDimitry Andric   LLT EltTy = Ty.getScalarType();
45425ffd83dbSDimitry Andric   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
45435ffd83dbSDimitry Andric   const bool Unpacked = ST.hasUnpackedD16VMem();
45445ffd83dbSDimitry Andric 
4545fe6060f1SDimitry Andric   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4546fe6060f1SDimitry Andric   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
45475ffd83dbSDimitry Andric 
45485ffd83dbSDimitry Andric   unsigned Opc;
45495ffd83dbSDimitry Andric 
4550*bdd1243dSDimitry Andric   // TODO: Support TFE for typed and narrow loads.
45515ffd83dbSDimitry Andric   if (IsTyped) {
4552*bdd1243dSDimitry Andric     if (IsTFE)
4553*bdd1243dSDimitry Andric       return false;
45545ffd83dbSDimitry Andric     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
45555ffd83dbSDimitry Andric                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
45565ffd83dbSDimitry Andric   } else if (IsFormat) {
4557*bdd1243dSDimitry Andric     if (IsD16) {
4558*bdd1243dSDimitry Andric       if (IsTFE)
4559*bdd1243dSDimitry Andric         return false;
4560*bdd1243dSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
45615ffd83dbSDimitry Andric     } else {
4562*bdd1243dSDimitry Andric       Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
4563*bdd1243dSDimitry Andric                   : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
4564*bdd1243dSDimitry Andric     }
4565*bdd1243dSDimitry Andric   } else {
4566*bdd1243dSDimitry Andric     if (IsTFE)
4567*bdd1243dSDimitry Andric       return false;
4568fe6060f1SDimitry Andric     switch (MemTy.getSizeInBits()) {
4569fe6060f1SDimitry Andric     case 8:
45705ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
45715ffd83dbSDimitry Andric       break;
4572fe6060f1SDimitry Andric     case 16:
45735ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
45745ffd83dbSDimitry Andric       break;
45755ffd83dbSDimitry Andric     default:
45765ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
45775ffd83dbSDimitry Andric       break;
45785ffd83dbSDimitry Andric     }
45795ffd83dbSDimitry Andric   }
45805ffd83dbSDimitry Andric 
4581*bdd1243dSDimitry Andric   if (IsTFE) {
4582*bdd1243dSDimitry Andric     unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
4583*bdd1243dSDimitry Andric     unsigned NumLoadDWords = NumValueDWords + 1;
4584*bdd1243dSDimitry Andric     LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32);
4585*bdd1243dSDimitry Andric     Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
4586*bdd1243dSDimitry Andric     buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
4587*bdd1243dSDimitry Andric                     Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
4588*bdd1243dSDimitry Andric     if (NumValueDWords == 1) {
4589*bdd1243dSDimitry Andric       B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
4590*bdd1243dSDimitry Andric     } else {
4591*bdd1243dSDimitry Andric       SmallVector<Register, 5> LoadElts;
4592*bdd1243dSDimitry Andric       for (unsigned I = 0; I != NumValueDWords; ++I)
4593*bdd1243dSDimitry Andric         LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
4594*bdd1243dSDimitry Andric       LoadElts.push_back(StatusDst);
4595*bdd1243dSDimitry Andric       B.buildUnmerge(LoadElts, LoadDstReg);
4596*bdd1243dSDimitry Andric       LoadElts.truncate(NumValueDWords);
4597*bdd1243dSDimitry Andric       B.buildMergeLikeInstr(Dst, LoadElts);
4598*bdd1243dSDimitry Andric     }
4599*bdd1243dSDimitry Andric   } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
4600*bdd1243dSDimitry Andric              (IsD16 && !Ty.isVector())) {
4601*bdd1243dSDimitry Andric     Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
4602*bdd1243dSDimitry Andric     buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
4603*bdd1243dSDimitry Andric                     Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
46045ffd83dbSDimitry Andric     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
46055ffd83dbSDimitry Andric     B.buildTrunc(Dst, LoadDstReg);
4606*bdd1243dSDimitry Andric   } else if (Unpacked && IsD16 && Ty.isVector()) {
4607*bdd1243dSDimitry Andric     LLT UnpackedTy = Ty.changeElementSize(32);
4608*bdd1243dSDimitry Andric     Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
4609*bdd1243dSDimitry Andric     buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
4610*bdd1243dSDimitry Andric                     Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
4611*bdd1243dSDimitry Andric     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
46125ffd83dbSDimitry Andric     // FIXME: G_TRUNC should work, but legalization currently fails
46135ffd83dbSDimitry Andric     auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
46145ffd83dbSDimitry Andric     SmallVector<Register, 4> Repack;
46155ffd83dbSDimitry Andric     for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
46165ffd83dbSDimitry Andric       Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
4617*bdd1243dSDimitry Andric     B.buildMergeLikeInstr(Dst, Repack);
4618*bdd1243dSDimitry Andric   } else {
4619*bdd1243dSDimitry Andric     buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
4620*bdd1243dSDimitry Andric                     AuxiliaryData, MMO, IsTyped, HasVIndex, B);
46215ffd83dbSDimitry Andric   }
46225ffd83dbSDimitry Andric 
46235ffd83dbSDimitry Andric   MI.eraseFromParent();
46245ffd83dbSDimitry Andric   return true;
46255ffd83dbSDimitry Andric }
46265ffd83dbSDimitry Andric 
46275ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
46285ffd83dbSDimitry Andric                                                MachineIRBuilder &B,
46295ffd83dbSDimitry Andric                                                bool IsInc) const {
46305ffd83dbSDimitry Andric   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
46315ffd83dbSDimitry Andric                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
46325ffd83dbSDimitry Andric   B.buildInstr(Opc)
46335ffd83dbSDimitry Andric     .addDef(MI.getOperand(0).getReg())
46345ffd83dbSDimitry Andric     .addUse(MI.getOperand(2).getReg())
46355ffd83dbSDimitry Andric     .addUse(MI.getOperand(3).getReg())
46365ffd83dbSDimitry Andric     .cloneMemRefs(MI);
46375ffd83dbSDimitry Andric   MI.eraseFromParent();
46385ffd83dbSDimitry Andric   return true;
46395ffd83dbSDimitry Andric }
46405ffd83dbSDimitry Andric 
46415ffd83dbSDimitry Andric static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
46425ffd83dbSDimitry Andric   switch (IntrID) {
46435ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
46445ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
46455ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
46465ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_add:
46475ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_add:
46485ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
46495ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
46505ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
46515ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
46525ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
46535ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
46545ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
46555ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
46565ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
46575ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
46585ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
46595ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
46605ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
46615ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
46625ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
46635ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
46645ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_and:
46655ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_and:
46665ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
46675ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_or:
46685ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_or:
46695ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
46705ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
46715ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
46725ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
46735ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
46745ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
46755ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
46765ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
46775ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
46785ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
46795ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
46805ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
46815ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4682e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4683e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4684e8d8bef9SDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4685fe6060f1SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4686fe6060f1SDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4687fe6060f1SDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4688fe6060f1SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4689fe6060f1SDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4690fe6060f1SDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
46915ffd83dbSDimitry Andric   default:
46925ffd83dbSDimitry Andric     llvm_unreachable("unhandled atomic opcode");
46935ffd83dbSDimitry Andric   }
46945ffd83dbSDimitry Andric }
46955ffd83dbSDimitry Andric 
46965ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
46975ffd83dbSDimitry Andric                                                MachineIRBuilder &B,
46985ffd83dbSDimitry Andric                                                Intrinsic::ID IID) const {
46995ffd83dbSDimitry Andric   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
47005ffd83dbSDimitry Andric                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4701e8d8bef9SDimitry Andric   const bool HasReturn = MI.getNumExplicitDefs() != 0;
47025ffd83dbSDimitry Andric 
4703e8d8bef9SDimitry Andric   Register Dst;
47045ffd83dbSDimitry Andric 
47055ffd83dbSDimitry Andric   int OpOffset = 0;
4706e8d8bef9SDimitry Andric   if (HasReturn) {
4707e8d8bef9SDimitry Andric     // A few FP atomics do not support return values.
4708e8d8bef9SDimitry Andric     Dst = MI.getOperand(0).getReg();
4709e8d8bef9SDimitry Andric   } else {
4710e8d8bef9SDimitry Andric     OpOffset = -1;
4711e8d8bef9SDimitry Andric   }
4712e8d8bef9SDimitry Andric 
4713e8d8bef9SDimitry Andric   Register VData = MI.getOperand(2 + OpOffset).getReg();
4714e8d8bef9SDimitry Andric   Register CmpVal;
47155ffd83dbSDimitry Andric 
47165ffd83dbSDimitry Andric   if (IsCmpSwap) {
47175ffd83dbSDimitry Andric     CmpVal = MI.getOperand(3 + OpOffset).getReg();
47185ffd83dbSDimitry Andric     ++OpOffset;
47195ffd83dbSDimitry Andric   }
47205ffd83dbSDimitry Andric 
47215ffd83dbSDimitry Andric   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4722e8d8bef9SDimitry Andric   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
47235ffd83dbSDimitry Andric 
47245ffd83dbSDimitry Andric   // The struct intrinsic variants add one additional operand over raw.
47255ffd83dbSDimitry Andric   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
47265ffd83dbSDimitry Andric   Register VIndex;
47275ffd83dbSDimitry Andric   if (HasVIndex) {
47285ffd83dbSDimitry Andric     VIndex = MI.getOperand(4 + OpOffset).getReg();
47295ffd83dbSDimitry Andric     ++OpOffset;
4730fe6060f1SDimitry Andric   } else {
4731fe6060f1SDimitry Andric     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
47325ffd83dbSDimitry Andric   }
47335ffd83dbSDimitry Andric 
47345ffd83dbSDimitry Andric   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
47355ffd83dbSDimitry Andric   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
47365ffd83dbSDimitry Andric   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
47375ffd83dbSDimitry Andric 
47385ffd83dbSDimitry Andric   MachineMemOperand *MMO = *MI.memoperands_begin();
47395ffd83dbSDimitry Andric 
47405ffd83dbSDimitry Andric   unsigned ImmOffset;
4741fe6060f1SDimitry Andric   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4742fe6060f1SDimitry Andric   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());
47435ffd83dbSDimitry Andric 
4744e8d8bef9SDimitry Andric   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4745e8d8bef9SDimitry Andric 
4746e8d8bef9SDimitry Andric   if (HasReturn)
4747e8d8bef9SDimitry Andric     MIB.addDef(Dst);
4748e8d8bef9SDimitry Andric 
4749e8d8bef9SDimitry Andric   MIB.addUse(VData); // vdata
47505ffd83dbSDimitry Andric 
47515ffd83dbSDimitry Andric   if (IsCmpSwap)
47525ffd83dbSDimitry Andric     MIB.addReg(CmpVal);
47535ffd83dbSDimitry Andric 
47545ffd83dbSDimitry Andric   MIB.addUse(RSrc)               // rsrc
47555ffd83dbSDimitry Andric      .addUse(VIndex)             // vindex
47565ffd83dbSDimitry Andric      .addUse(VOffset)            // voffset
47575ffd83dbSDimitry Andric      .addUse(SOffset)            // soffset
47585ffd83dbSDimitry Andric      .addImm(ImmOffset)          // offset(imm)
47595ffd83dbSDimitry Andric      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
47605ffd83dbSDimitry Andric      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
47615ffd83dbSDimitry Andric      .addMemOperand(MMO);
47625ffd83dbSDimitry Andric 
47635ffd83dbSDimitry Andric   MI.eraseFromParent();
47645ffd83dbSDimitry Andric   return true;
47655ffd83dbSDimitry Andric }
47665ffd83dbSDimitry Andric 
4767fe6060f1SDimitry Andric /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
47685ffd83dbSDimitry Andric /// vector with s16 typed elements.
4769fe6060f1SDimitry Andric static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4770fe6060f1SDimitry Andric                                       SmallVectorImpl<Register> &PackedAddrs,
4771fe6060f1SDimitry Andric                                       unsigned ArgOffset,
4772fe6060f1SDimitry Andric                                       const AMDGPU::ImageDimIntrinsicInfo *Intr,
4773fe6060f1SDimitry Andric                                       bool IsA16, bool IsG16) {
47745ffd83dbSDimitry Andric   const LLT S16 = LLT::scalar(16);
4775fe6060f1SDimitry Andric   const LLT V2S16 = LLT::fixed_vector(2, 16);
4776fe6060f1SDimitry Andric   auto EndIdx = Intr->VAddrEnd;
47775ffd83dbSDimitry Andric 
4778e8d8bef9SDimitry Andric   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4779e8d8bef9SDimitry Andric     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
47805ffd83dbSDimitry Andric     if (!SrcOp.isReg())
47815ffd83dbSDimitry Andric       continue; // _L to _LZ may have eliminated this.
47825ffd83dbSDimitry Andric 
47835ffd83dbSDimitry Andric     Register AddrReg = SrcOp.getReg();
47845ffd83dbSDimitry Andric 
4785fe6060f1SDimitry Andric     if ((I < Intr->GradientStart) ||
4786fe6060f1SDimitry Andric         (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4787fe6060f1SDimitry Andric         (I >= Intr->CoordStart && !IsA16)) {
47880eae32dcSDimitry Andric       if ((I < Intr->GradientStart) && IsA16 &&
47890eae32dcSDimitry Andric           (B.getMRI()->getType(AddrReg) == S16)) {
479004eeddc0SDimitry Andric         assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
47910eae32dcSDimitry Andric         // Special handling of bias when A16 is on. Bias is of type half but
47920eae32dcSDimitry Andric         // occupies full 32-bit.
47930eae32dcSDimitry Andric         PackedAddrs.push_back(
47940eae32dcSDimitry Andric             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
47950eae32dcSDimitry Andric                 .getReg(0));
47960eae32dcSDimitry Andric       } else {
479704eeddc0SDimitry Andric         assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
479804eeddc0SDimitry Andric                "Bias needs to be converted to 16 bit in A16 mode");
479904eeddc0SDimitry Andric         // Handle any gradient or coordinate operands that should not be packed
48005ffd83dbSDimitry Andric         AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
48015ffd83dbSDimitry Andric         PackedAddrs.push_back(AddrReg);
48020eae32dcSDimitry Andric       }
48035ffd83dbSDimitry Andric     } else {
48045ffd83dbSDimitry Andric       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
48055ffd83dbSDimitry Andric       // derivatives dx/dh and dx/dv are packed with undef.
48065ffd83dbSDimitry Andric       if (((I + 1) >= EndIdx) ||
4807e8d8bef9SDimitry Andric           ((Intr->NumGradients / 2) % 2 == 1 &&
4808e8d8bef9SDimitry Andric            (I == static_cast<unsigned>(Intr->GradientStart +
4809e8d8bef9SDimitry Andric                                        (Intr->NumGradients / 2) - 1) ||
4810e8d8bef9SDimitry Andric             I == static_cast<unsigned>(Intr->GradientStart +
4811e8d8bef9SDimitry Andric                                        Intr->NumGradients - 1))) ||
48125ffd83dbSDimitry Andric           // Check for _L to _LZ optimization
4813e8d8bef9SDimitry Andric           !MI.getOperand(ArgOffset + I + 1).isReg()) {
48145ffd83dbSDimitry Andric         PackedAddrs.push_back(
48155ffd83dbSDimitry Andric             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
48165ffd83dbSDimitry Andric                 .getReg(0));
48175ffd83dbSDimitry Andric       } else {
48185ffd83dbSDimitry Andric         PackedAddrs.push_back(
4819e8d8bef9SDimitry Andric             B.buildBuildVector(
4820e8d8bef9SDimitry Andric                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
48215ffd83dbSDimitry Andric                 .getReg(0));
48225ffd83dbSDimitry Andric         ++I;
48235ffd83dbSDimitry Andric       }
48245ffd83dbSDimitry Andric     }
48255ffd83dbSDimitry Andric   }
48265ffd83dbSDimitry Andric }
48275ffd83dbSDimitry Andric 
48285ffd83dbSDimitry Andric /// Convert from separate vaddr components to a single vector address register,
48295ffd83dbSDimitry Andric /// and replace the remaining operands with $noreg.
48305ffd83dbSDimitry Andric static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
48315ffd83dbSDimitry Andric                                      int DimIdx, int NumVAddrs) {
48325ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
4833*bdd1243dSDimitry Andric   (void)S32;
48345ffd83dbSDimitry Andric   SmallVector<Register, 8> AddrRegs;
48355ffd83dbSDimitry Andric   for (int I = 0; I != NumVAddrs; ++I) {
48365ffd83dbSDimitry Andric     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
48375ffd83dbSDimitry Andric     if (SrcOp.isReg()) {
48385ffd83dbSDimitry Andric       AddrRegs.push_back(SrcOp.getReg());
48395ffd83dbSDimitry Andric       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
48405ffd83dbSDimitry Andric     }
48415ffd83dbSDimitry Andric   }
48425ffd83dbSDimitry Andric 
48435ffd83dbSDimitry Andric   int NumAddrRegs = AddrRegs.size();
48445ffd83dbSDimitry Andric   if (NumAddrRegs != 1) {
4845fe6060f1SDimitry Andric     auto VAddr =
4846fe6060f1SDimitry Andric         B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
48475ffd83dbSDimitry Andric     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
48485ffd83dbSDimitry Andric   }
48495ffd83dbSDimitry Andric 
48505ffd83dbSDimitry Andric   for (int I = 1; I != NumVAddrs; ++I) {
48515ffd83dbSDimitry Andric     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
48525ffd83dbSDimitry Andric     if (SrcOp.isReg())
48535ffd83dbSDimitry Andric       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
48545ffd83dbSDimitry Andric   }
48555ffd83dbSDimitry Andric }
48565ffd83dbSDimitry Andric 
48575ffd83dbSDimitry Andric /// Rewrite image intrinsics to use register layouts expected by the subtarget.
48585ffd83dbSDimitry Andric ///
48595ffd83dbSDimitry Andric /// Depending on the subtarget, load/store with 16-bit element data need to be
48605ffd83dbSDimitry Andric /// rewritten to use the low half of 32-bit registers, or directly use a packed
48615ffd83dbSDimitry Andric /// layout. 16-bit addresses should also sometimes be packed into 32-bit
48625ffd83dbSDimitry Andric /// registers.
48635ffd83dbSDimitry Andric ///
48645ffd83dbSDimitry Andric /// We don't want to directly select image instructions just yet, but also want
48655ffd83dbSDimitry Andric /// to exposes all register repacking to the legalizer/combiners. We also don't
486681ad6265SDimitry Andric /// want a selected instruction entering RegBankSelect. In order to avoid
48675ffd83dbSDimitry Andric /// defining a multitude of intermediate image instructions, directly hack on
4868349cc55cSDimitry Andric /// the intrinsic's arguments. In cases like a16 addresses, this requires
4869349cc55cSDimitry Andric /// padding now unnecessary arguments with $noreg.
48705ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4871e8d8bef9SDimitry Andric     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4872e8d8bef9SDimitry Andric     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
48735ffd83dbSDimitry Andric 
4874*bdd1243dSDimitry Andric   const MachineFunction &MF = *MI.getMF();
4875e8d8bef9SDimitry Andric   const unsigned NumDefs = MI.getNumExplicitDefs();
4876e8d8bef9SDimitry Andric   const unsigned ArgOffset = NumDefs + 1;
48775ffd83dbSDimitry Andric   bool IsTFE = NumDefs == 2;
48785ffd83dbSDimitry Andric   // We are only processing the operands of d16 image operations on subtargets
48795ffd83dbSDimitry Andric   // that use the unpacked register layout, or need to repack the TFE result.
48805ffd83dbSDimitry Andric 
48815ffd83dbSDimitry Andric   // TODO: Do we need to guard against already legalized intrinsics?
48825ffd83dbSDimitry Andric   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4883e8d8bef9SDimitry Andric       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
48845ffd83dbSDimitry Andric 
48855ffd83dbSDimitry Andric   MachineRegisterInfo *MRI = B.getMRI();
48865ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
48875ffd83dbSDimitry Andric   const LLT S16 = LLT::scalar(16);
4888fe6060f1SDimitry Andric   const LLT V2S16 = LLT::fixed_vector(2, 16);
48895ffd83dbSDimitry Andric 
48905ffd83dbSDimitry Andric   unsigned DMask = 0;
489104eeddc0SDimitry Andric   Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
489204eeddc0SDimitry Andric   LLT Ty = MRI->getType(VData);
48935ffd83dbSDimitry Andric 
48945ffd83dbSDimitry Andric   // Check for 16 bit addresses and pack if true.
4895e8d8bef9SDimitry Andric   LLT GradTy =
4896e8d8bef9SDimitry Andric       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4897e8d8bef9SDimitry Andric   LLT AddrTy =
4898e8d8bef9SDimitry Andric       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
48995ffd83dbSDimitry Andric   const bool IsG16 = GradTy == S16;
49005ffd83dbSDimitry Andric   const bool IsA16 = AddrTy == S16;
490104eeddc0SDimitry Andric   const bool IsD16 = Ty.getScalarType() == S16;
49025ffd83dbSDimitry Andric 
49035ffd83dbSDimitry Andric   int DMaskLanes = 0;
49045ffd83dbSDimitry Andric   if (!BaseOpcode->Atomic) {
4905e8d8bef9SDimitry Andric     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
49065ffd83dbSDimitry Andric     if (BaseOpcode->Gather4) {
49075ffd83dbSDimitry Andric       DMaskLanes = 4;
49085ffd83dbSDimitry Andric     } else if (DMask != 0) {
4909*bdd1243dSDimitry Andric       DMaskLanes = llvm::popcount(DMask);
49105ffd83dbSDimitry Andric     } else if (!IsTFE && !BaseOpcode->Store) {
49115ffd83dbSDimitry Andric       // If dmask is 0, this is a no-op load. This can be eliminated.
49125ffd83dbSDimitry Andric       B.buildUndef(MI.getOperand(0));
49135ffd83dbSDimitry Andric       MI.eraseFromParent();
49145ffd83dbSDimitry Andric       return true;
49155ffd83dbSDimitry Andric     }
49165ffd83dbSDimitry Andric   }
49175ffd83dbSDimitry Andric 
49185ffd83dbSDimitry Andric   Observer.changingInstr(MI);
49195ffd83dbSDimitry Andric   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
49205ffd83dbSDimitry Andric 
492104eeddc0SDimitry Andric   const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
492204eeddc0SDimitry Andric                                      : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
492304eeddc0SDimitry Andric   const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
492404eeddc0SDimitry Andric                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
492504eeddc0SDimitry Andric   unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;
49265ffd83dbSDimitry Andric 
49275ffd83dbSDimitry Andric   // Track that we legalized this
49285ffd83dbSDimitry Andric   MI.setDesc(B.getTII().get(NewOpcode));
49295ffd83dbSDimitry Andric 
49305ffd83dbSDimitry Andric   // Expecting to get an error flag since TFC is on - and dmask is 0 Force
49315ffd83dbSDimitry Andric   // dmask to be at least 1 otherwise the instruction will fail
49325ffd83dbSDimitry Andric   if (IsTFE && DMask == 0) {
49335ffd83dbSDimitry Andric     DMask = 0x1;
49345ffd83dbSDimitry Andric     DMaskLanes = 1;
4935e8d8bef9SDimitry Andric     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
49365ffd83dbSDimitry Andric   }
49375ffd83dbSDimitry Andric 
49385ffd83dbSDimitry Andric   if (BaseOpcode->Atomic) {
49395ffd83dbSDimitry Andric     Register VData0 = MI.getOperand(2).getReg();
49405ffd83dbSDimitry Andric     LLT Ty = MRI->getType(VData0);
49415ffd83dbSDimitry Andric 
49425ffd83dbSDimitry Andric     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
49435ffd83dbSDimitry Andric     if (Ty.isVector())
49445ffd83dbSDimitry Andric       return false;
49455ffd83dbSDimitry Andric 
49465ffd83dbSDimitry Andric     if (BaseOpcode->AtomicX2) {
49475ffd83dbSDimitry Andric       Register VData1 = MI.getOperand(3).getReg();
49485ffd83dbSDimitry Andric       // The two values are packed in one register.
4949fe6060f1SDimitry Andric       LLT PackedTy = LLT::fixed_vector(2, Ty);
49505ffd83dbSDimitry Andric       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
49515ffd83dbSDimitry Andric       MI.getOperand(2).setReg(Concat.getReg(0));
49525ffd83dbSDimitry Andric       MI.getOperand(3).setReg(AMDGPU::NoRegister);
49535ffd83dbSDimitry Andric     }
49545ffd83dbSDimitry Andric   }
49555ffd83dbSDimitry Andric 
4956e8d8bef9SDimitry Andric   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
49575ffd83dbSDimitry Andric 
49585ffd83dbSDimitry Andric   // Rewrite the addressing register layout before doing anything else.
4959fe6060f1SDimitry Andric   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
4960fe6060f1SDimitry Andric     // 16 bit gradients are supported, but are tied to the A16 control
4961fe6060f1SDimitry Andric     // so both gradients and addresses must be 16 bit
49625ffd83dbSDimitry Andric     return false;
4963fe6060f1SDimitry Andric   }
49645ffd83dbSDimitry Andric 
4965fe6060f1SDimitry Andric   if (IsA16 && !ST.hasA16()) {
4966fe6060f1SDimitry Andric     // A16 not supported
4967fe6060f1SDimitry Andric     return false;
4968fe6060f1SDimitry Andric   }
4969fe6060f1SDimitry Andric 
4970fe6060f1SDimitry Andric   if (IsA16 || IsG16) {
4971e8d8bef9SDimitry Andric     if (Intr->NumVAddrs > 1) {
49725ffd83dbSDimitry Andric       SmallVector<Register, 4> PackedRegs;
49735ffd83dbSDimitry Andric 
4974fe6060f1SDimitry Andric       packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4975fe6060f1SDimitry Andric                                 IsG16);
49765ffd83dbSDimitry Andric 
49775ffd83dbSDimitry Andric       // See also below in the non-a16 branch
4978*bdd1243dSDimitry Andric       const bool UseNSA = ST.hasNSAEncoding() &&
4979*bdd1243dSDimitry Andric                           PackedRegs.size() >= ST.getNSAThreshold(MF) &&
4980fe6060f1SDimitry Andric                           PackedRegs.size() <= ST.getNSAMaxSize();
49815ffd83dbSDimitry Andric 
49825ffd83dbSDimitry Andric       if (!UseNSA && PackedRegs.size() > 1) {
4983fe6060f1SDimitry Andric         LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
49845ffd83dbSDimitry Andric         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
49855ffd83dbSDimitry Andric         PackedRegs[0] = Concat.getReg(0);
49865ffd83dbSDimitry Andric         PackedRegs.resize(1);
49875ffd83dbSDimitry Andric       }
49885ffd83dbSDimitry Andric 
4989e8d8bef9SDimitry Andric       const unsigned NumPacked = PackedRegs.size();
4990e8d8bef9SDimitry Andric       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4991e8d8bef9SDimitry Andric         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
49925ffd83dbSDimitry Andric         if (!SrcOp.isReg()) {
49935ffd83dbSDimitry Andric           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
49945ffd83dbSDimitry Andric           continue;
49955ffd83dbSDimitry Andric         }
49965ffd83dbSDimitry Andric 
49975ffd83dbSDimitry Andric         assert(SrcOp.getReg() != AMDGPU::NoRegister);
49985ffd83dbSDimitry Andric 
4999e8d8bef9SDimitry Andric         if (I - Intr->VAddrStart < NumPacked)
5000e8d8bef9SDimitry Andric           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
50015ffd83dbSDimitry Andric         else
50025ffd83dbSDimitry Andric           SrcOp.setReg(AMDGPU::NoRegister);
50035ffd83dbSDimitry Andric       }
50045ffd83dbSDimitry Andric     }
50055ffd83dbSDimitry Andric   } else {
50065ffd83dbSDimitry Andric     // If the register allocator cannot place the address registers contiguously
50075ffd83dbSDimitry Andric     // without introducing moves, then using the non-sequential address encoding
50085ffd83dbSDimitry Andric     // is always preferable, since it saves VALU instructions and is usually a
50095ffd83dbSDimitry Andric     // wash in terms of code size or even better.
50105ffd83dbSDimitry Andric     //
50115ffd83dbSDimitry Andric     // However, we currently have no way of hinting to the register allocator
50125ffd83dbSDimitry Andric     // that MIMG addresses should be placed contiguously when it is possible to
50135ffd83dbSDimitry Andric     // do so, so force non-NSA for the common 2-address case as a heuristic.
50145ffd83dbSDimitry Andric     //
50155ffd83dbSDimitry Andric     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
50165ffd83dbSDimitry Andric     // allocation when possible.
501781ad6265SDimitry Andric     //
501881ad6265SDimitry Andric     // TODO: we can actually allow partial NSA where the final register is a
501981ad6265SDimitry Andric     // contiguous set of the remaining addresses.
502081ad6265SDimitry Andric     // This could help where there are more addresses than supported.
5021*bdd1243dSDimitry Andric     const bool UseNSA = ST.hasNSAEncoding() &&
5022*bdd1243dSDimitry Andric                         CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
5023fe6060f1SDimitry Andric                         CorrectedNumVAddrs <= ST.getNSAMaxSize();
50245ffd83dbSDimitry Andric 
5025e8d8bef9SDimitry Andric     if (!UseNSA && Intr->NumVAddrs > 1)
5026e8d8bef9SDimitry Andric       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
5027e8d8bef9SDimitry Andric                                Intr->NumVAddrs);
50285ffd83dbSDimitry Andric   }
50295ffd83dbSDimitry Andric 
50305ffd83dbSDimitry Andric   int Flags = 0;
50315ffd83dbSDimitry Andric   if (IsA16)
50325ffd83dbSDimitry Andric     Flags |= 1;
50335ffd83dbSDimitry Andric   if (IsG16)
50345ffd83dbSDimitry Andric     Flags |= 2;
50355ffd83dbSDimitry Andric   MI.addOperand(MachineOperand::CreateImm(Flags));
50365ffd83dbSDimitry Andric 
50375ffd83dbSDimitry Andric   if (BaseOpcode->Store) { // No TFE for stores?
50385ffd83dbSDimitry Andric     // TODO: Handle dmask trim
503904eeddc0SDimitry Andric     if (!Ty.isVector() || !IsD16)
50405ffd83dbSDimitry Andric       return true;
50415ffd83dbSDimitry Andric 
5042e8d8bef9SDimitry Andric     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
50435ffd83dbSDimitry Andric     if (RepackedReg != VData) {
50445ffd83dbSDimitry Andric       MI.getOperand(1).setReg(RepackedReg);
50455ffd83dbSDimitry Andric     }
50465ffd83dbSDimitry Andric 
50475ffd83dbSDimitry Andric     return true;
50485ffd83dbSDimitry Andric   }
50495ffd83dbSDimitry Andric 
50505ffd83dbSDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
50515ffd83dbSDimitry Andric   const LLT EltTy = Ty.getScalarType();
50525ffd83dbSDimitry Andric   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
50535ffd83dbSDimitry Andric 
50545ffd83dbSDimitry Andric   // Confirm that the return type is large enough for the dmask specified
50555ffd83dbSDimitry Andric   if (NumElts < DMaskLanes)
50565ffd83dbSDimitry Andric     return false;
50575ffd83dbSDimitry Andric 
50585ffd83dbSDimitry Andric   if (NumElts > 4 || DMaskLanes > 4)
50595ffd83dbSDimitry Andric     return false;
50605ffd83dbSDimitry Andric 
50615ffd83dbSDimitry Andric   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
5062fe6060f1SDimitry Andric   const LLT AdjustedTy =
5063fe6060f1SDimitry Andric       Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
50645ffd83dbSDimitry Andric 
50655ffd83dbSDimitry Andric   // The raw dword aligned data component of the load. The only legal cases
50665ffd83dbSDimitry Andric   // where this matters should be when using the packed D16 format, for
50675ffd83dbSDimitry Andric   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
50685ffd83dbSDimitry Andric   LLT RoundedTy;
50695ffd83dbSDimitry Andric 
5070*bdd1243dSDimitry Andric   // S32 vector to cover all data, plus TFE result element.
50715ffd83dbSDimitry Andric   LLT TFETy;
50725ffd83dbSDimitry Andric 
50735ffd83dbSDimitry Andric   // Register type to use for each loaded component. Will be S32 or V2S16.
50745ffd83dbSDimitry Andric   LLT RegTy;
50755ffd83dbSDimitry Andric 
50765ffd83dbSDimitry Andric   if (IsD16 && ST.hasUnpackedD16VMem()) {
5077fe6060f1SDimitry Andric     RoundedTy =
5078fe6060f1SDimitry Andric         LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
5079fe6060f1SDimitry Andric     TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
50805ffd83dbSDimitry Andric     RegTy = S32;
50815ffd83dbSDimitry Andric   } else {
50825ffd83dbSDimitry Andric     unsigned EltSize = EltTy.getSizeInBits();
50835ffd83dbSDimitry Andric     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
50845ffd83dbSDimitry Andric     unsigned RoundedSize = 32 * RoundedElts;
5085fe6060f1SDimitry Andric     RoundedTy = LLT::scalarOrVector(
5086fe6060f1SDimitry Andric         ElementCount::getFixed(RoundedSize / EltSize), EltSize);
5087fe6060f1SDimitry Andric     TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
50885ffd83dbSDimitry Andric     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
50895ffd83dbSDimitry Andric   }
50905ffd83dbSDimitry Andric 
50915ffd83dbSDimitry Andric   // The return type does not need adjustment.
50925ffd83dbSDimitry Andric   // TODO: Should we change s16 case to s32 or <2 x s16>?
50935ffd83dbSDimitry Andric   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
50945ffd83dbSDimitry Andric     return true;
50955ffd83dbSDimitry Andric 
50965ffd83dbSDimitry Andric   Register Dst1Reg;
50975ffd83dbSDimitry Andric 
50985ffd83dbSDimitry Andric   // Insert after the instruction.
50995ffd83dbSDimitry Andric   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
51005ffd83dbSDimitry Andric 
51015ffd83dbSDimitry Andric   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
51025ffd83dbSDimitry Andric   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
51035ffd83dbSDimitry Andric   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
51045ffd83dbSDimitry Andric   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
51055ffd83dbSDimitry Andric 
51065ffd83dbSDimitry Andric   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
51075ffd83dbSDimitry Andric 
51085ffd83dbSDimitry Andric   MI.getOperand(0).setReg(NewResultReg);
51095ffd83dbSDimitry Andric 
51105ffd83dbSDimitry Andric   // In the IR, TFE is supposed to be used with a 2 element struct return
5111349cc55cSDimitry Andric   // type. The instruction really returns these two values in one contiguous
51125ffd83dbSDimitry Andric   // register, with one additional dword beyond the loaded data. Rewrite the
51135ffd83dbSDimitry Andric   // return type to use a single register result.
51145ffd83dbSDimitry Andric 
51155ffd83dbSDimitry Andric   if (IsTFE) {
51165ffd83dbSDimitry Andric     Dst1Reg = MI.getOperand(1).getReg();
51175ffd83dbSDimitry Andric     if (MRI->getType(Dst1Reg) != S32)
51185ffd83dbSDimitry Andric       return false;
51195ffd83dbSDimitry Andric 
51205ffd83dbSDimitry Andric     // TODO: Make sure the TFE operand bit is set.
512181ad6265SDimitry Andric     MI.removeOperand(1);
51225ffd83dbSDimitry Andric 
51235ffd83dbSDimitry Andric     // Handle the easy case that requires no repack instructions.
51245ffd83dbSDimitry Andric     if (Ty == S32) {
51255ffd83dbSDimitry Andric       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
51265ffd83dbSDimitry Andric       return true;
51275ffd83dbSDimitry Andric     }
51285ffd83dbSDimitry Andric   }
51295ffd83dbSDimitry Andric 
51305ffd83dbSDimitry Andric   // Now figure out how to copy the new result register back into the old
51315ffd83dbSDimitry Andric   // result.
51325ffd83dbSDimitry Andric   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
51335ffd83dbSDimitry Andric 
51345ffd83dbSDimitry Andric   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
51355ffd83dbSDimitry Andric 
51365ffd83dbSDimitry Andric   if (ResultNumRegs == 1) {
51375ffd83dbSDimitry Andric     assert(!IsTFE);
51385ffd83dbSDimitry Andric     ResultRegs[0] = NewResultReg;
51395ffd83dbSDimitry Andric   } else {
51405ffd83dbSDimitry Andric     // We have to repack into a new vector of some kind.
51415ffd83dbSDimitry Andric     for (int I = 0; I != NumDataRegs; ++I)
51425ffd83dbSDimitry Andric       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
51435ffd83dbSDimitry Andric     B.buildUnmerge(ResultRegs, NewResultReg);
51445ffd83dbSDimitry Andric 
51455ffd83dbSDimitry Andric     // Drop the final TFE element to get the data part. The TFE result is
51465ffd83dbSDimitry Andric     // directly written to the right place already.
51475ffd83dbSDimitry Andric     if (IsTFE)
51485ffd83dbSDimitry Andric       ResultRegs.resize(NumDataRegs);
51495ffd83dbSDimitry Andric   }
51505ffd83dbSDimitry Andric 
51515ffd83dbSDimitry Andric   // For an s16 scalar result, we form an s32 result with a truncate regardless
51525ffd83dbSDimitry Andric   // of packed vs. unpacked.
51535ffd83dbSDimitry Andric   if (IsD16 && !Ty.isVector()) {
51545ffd83dbSDimitry Andric     B.buildTrunc(DstReg, ResultRegs[0]);
51555ffd83dbSDimitry Andric     return true;
51565ffd83dbSDimitry Andric   }
51575ffd83dbSDimitry Andric 
51585ffd83dbSDimitry Andric   // Avoid a build/concat_vector of 1 entry.
51595ffd83dbSDimitry Andric   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
51605ffd83dbSDimitry Andric     B.buildBitcast(DstReg, ResultRegs[0]);
51615ffd83dbSDimitry Andric     return true;
51625ffd83dbSDimitry Andric   }
51635ffd83dbSDimitry Andric 
51645ffd83dbSDimitry Andric   assert(Ty.isVector());
51655ffd83dbSDimitry Andric 
51665ffd83dbSDimitry Andric   if (IsD16) {
51675ffd83dbSDimitry Andric     // For packed D16 results with TFE enabled, all the data components are
51685ffd83dbSDimitry Andric     // S32. Cast back to the expected type.
51695ffd83dbSDimitry Andric     //
51705ffd83dbSDimitry Andric     // TODO: We don't really need to use load s32 elements. We would only need one
51715ffd83dbSDimitry Andric     // cast for the TFE result if a multiple of v2s16 was used.
51725ffd83dbSDimitry Andric     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
51735ffd83dbSDimitry Andric       for (Register &Reg : ResultRegs)
51745ffd83dbSDimitry Andric         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
51755ffd83dbSDimitry Andric     } else if (ST.hasUnpackedD16VMem()) {
51765ffd83dbSDimitry Andric       for (Register &Reg : ResultRegs)
51775ffd83dbSDimitry Andric         Reg = B.buildTrunc(S16, Reg).getReg(0);
51785ffd83dbSDimitry Andric     }
51795ffd83dbSDimitry Andric   }
51805ffd83dbSDimitry Andric 
51815ffd83dbSDimitry Andric   auto padWithUndef = [&](LLT Ty, int NumElts) {
51825ffd83dbSDimitry Andric     if (NumElts == 0)
51835ffd83dbSDimitry Andric       return;
51845ffd83dbSDimitry Andric     Register Undef = B.buildUndef(Ty).getReg(0);
51855ffd83dbSDimitry Andric     for (int I = 0; I != NumElts; ++I)
51865ffd83dbSDimitry Andric       ResultRegs.push_back(Undef);
51875ffd83dbSDimitry Andric   };
51885ffd83dbSDimitry Andric 
51895ffd83dbSDimitry Andric   // Pad out any elements eliminated due to the dmask.
51905ffd83dbSDimitry Andric   LLT ResTy = MRI->getType(ResultRegs[0]);
51915ffd83dbSDimitry Andric   if (!ResTy.isVector()) {
51925ffd83dbSDimitry Andric     padWithUndef(ResTy, NumElts - ResultRegs.size());
51935ffd83dbSDimitry Andric     B.buildBuildVector(DstReg, ResultRegs);
51945ffd83dbSDimitry Andric     return true;
51955ffd83dbSDimitry Andric   }
51965ffd83dbSDimitry Andric 
51975ffd83dbSDimitry Andric   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
51985ffd83dbSDimitry Andric   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
51995ffd83dbSDimitry Andric 
52005ffd83dbSDimitry Andric   // Deal with the one annoying legal case.
5201fe6060f1SDimitry Andric   const LLT V3S16 = LLT::fixed_vector(3, 16);
52025ffd83dbSDimitry Andric   if (Ty == V3S16) {
52030eae32dcSDimitry Andric     if (IsTFE) {
52040eae32dcSDimitry Andric       if (ResultRegs.size() == 1) {
52050eae32dcSDimitry Andric         NewResultReg = ResultRegs[0];
52060eae32dcSDimitry Andric       } else if (ResultRegs.size() == 2) {
52070eae32dcSDimitry Andric         LLT V4S16 = LLT::fixed_vector(4, 16);
52080eae32dcSDimitry Andric         NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
52090eae32dcSDimitry Andric       } else {
52100eae32dcSDimitry Andric         return false;
52110eae32dcSDimitry Andric       }
52120eae32dcSDimitry Andric     }
52130eae32dcSDimitry Andric 
52140eae32dcSDimitry Andric     if (MRI->getType(DstReg).getNumElements() <
52150eae32dcSDimitry Andric         MRI->getType(NewResultReg).getNumElements()) {
52160eae32dcSDimitry Andric       B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
52170eae32dcSDimitry Andric     } else {
52180eae32dcSDimitry Andric       B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
52190eae32dcSDimitry Andric     }
52205ffd83dbSDimitry Andric     return true;
52215ffd83dbSDimitry Andric   }
52225ffd83dbSDimitry Andric 
52235ffd83dbSDimitry Andric   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
52245ffd83dbSDimitry Andric   B.buildConcatVectors(DstReg, ResultRegs);
52255ffd83dbSDimitry Andric   return true;
52265ffd83dbSDimitry Andric }
52275ffd83dbSDimitry Andric 
52285ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeSBufferLoad(
5229e8d8bef9SDimitry Andric   LegalizerHelper &Helper, MachineInstr &MI) const {
5230e8d8bef9SDimitry Andric   MachineIRBuilder &B = Helper.MIRBuilder;
5231e8d8bef9SDimitry Andric   GISelChangeObserver &Observer = Helper.Observer;
5232e8d8bef9SDimitry Andric 
52335ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
52345ffd83dbSDimitry Andric   LLT Ty = B.getMRI()->getType(Dst);
52355ffd83dbSDimitry Andric   unsigned Size = Ty.getSizeInBits();
52365ffd83dbSDimitry Andric   MachineFunction &MF = B.getMF();
52375ffd83dbSDimitry Andric 
52385ffd83dbSDimitry Andric   Observer.changingInstr(MI);
52395ffd83dbSDimitry Andric 
5240fe6060f1SDimitry Andric   if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
5241e8d8bef9SDimitry Andric     Ty = getBitcastRegisterType(Ty);
5242e8d8bef9SDimitry Andric     Helper.bitcastDst(MI, Ty, 0);
5243e8d8bef9SDimitry Andric     Dst = MI.getOperand(0).getReg();
5244e8d8bef9SDimitry Andric     B.setInsertPt(B.getMBB(), MI);
5245e8d8bef9SDimitry Andric   }
5246e8d8bef9SDimitry Andric 
52475ffd83dbSDimitry Andric   // FIXME: We don't really need this intermediate instruction. The intrinsic
52485ffd83dbSDimitry Andric   // should be fixed to have a memory operand. Since it's readnone, we're not
52495ffd83dbSDimitry Andric   // allowed to add one.
52505ffd83dbSDimitry Andric   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
525181ad6265SDimitry Andric   MI.removeOperand(1); // Remove intrinsic ID
52525ffd83dbSDimitry Andric 
52535ffd83dbSDimitry Andric   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
52545ffd83dbSDimitry Andric   // TODO: Should this use datalayout alignment?
52555ffd83dbSDimitry Andric   const unsigned MemSize = (Size + 7) / 8;
52565ffd83dbSDimitry Andric   const Align MemAlign(4);
52575ffd83dbSDimitry Andric   MachineMemOperand *MMO = MF.getMachineMemOperand(
52585ffd83dbSDimitry Andric       MachinePointerInfo(),
52595ffd83dbSDimitry Andric       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
52605ffd83dbSDimitry Andric           MachineMemOperand::MOInvariant,
52615ffd83dbSDimitry Andric       MemSize, MemAlign);
52625ffd83dbSDimitry Andric   MI.addMemOperand(MF, MMO);
52635ffd83dbSDimitry Andric 
52645ffd83dbSDimitry Andric   // There are no 96-bit result scalar loads, but widening to 128-bit should
52655ffd83dbSDimitry Andric   // always be legal. We may need to restore this to a 96-bit result if it turns
52665ffd83dbSDimitry Andric   // out this needs to be converted to a vector load during RegBankSelect.
52675ffd83dbSDimitry Andric   if (!isPowerOf2_32(Size)) {
52685ffd83dbSDimitry Andric     if (Ty.isVector())
52695ffd83dbSDimitry Andric       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
52705ffd83dbSDimitry Andric     else
52715ffd83dbSDimitry Andric       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
52725ffd83dbSDimitry Andric   }
52735ffd83dbSDimitry Andric 
52745ffd83dbSDimitry Andric   Observer.changedInstr(MI);
52755ffd83dbSDimitry Andric   return true;
52765ffd83dbSDimitry Andric }
52775ffd83dbSDimitry Andric 
5278e8d8bef9SDimitry Andric // TODO: Move to selection
52795ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
52800b57cec5SDimitry Andric                                                 MachineRegisterInfo &MRI,
52810b57cec5SDimitry Andric                                                 MachineIRBuilder &B) const {
5282fe6060f1SDimitry Andric   if (!ST.isTrapHandlerEnabled() ||
5283fe6060f1SDimitry Andric       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
5284fe6060f1SDimitry Andric     return legalizeTrapEndpgm(MI, MRI, B);
5285fe6060f1SDimitry Andric 
5286*bdd1243dSDimitry Andric   if (std::optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
5287fe6060f1SDimitry Andric     switch (*HsaAbiVer) {
5288fe6060f1SDimitry Andric     case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
5289fe6060f1SDimitry Andric     case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
5290fe6060f1SDimitry Andric       return legalizeTrapHsaQueuePtr(MI, MRI, B);
5291fe6060f1SDimitry Andric     case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
52921fd87a68SDimitry Andric     case ELF::ELFABIVERSION_AMDGPU_HSA_V5:
5293fe6060f1SDimitry Andric       return ST.supportsGetDoorbellID() ?
5294fe6060f1SDimitry Andric           legalizeTrapHsa(MI, MRI, B) :
5295fe6060f1SDimitry Andric           legalizeTrapHsaQueuePtr(MI, MRI, B);
5296fe6060f1SDimitry Andric     }
5297fe6060f1SDimitry Andric   }
5298fe6060f1SDimitry Andric 
5299fe6060f1SDimitry Andric   llvm_unreachable("Unknown trap handler");
5300fe6060f1SDimitry Andric }
5301fe6060f1SDimitry Andric 
5302fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
5303fe6060f1SDimitry Andric     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
53045ffd83dbSDimitry Andric   B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
5305fe6060f1SDimitry Andric   MI.eraseFromParent();
5306fe6060f1SDimitry Andric   return true;
5307fe6060f1SDimitry Andric }
5308fe6060f1SDimitry Andric 
5309fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
5310fe6060f1SDimitry Andric     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
531181ad6265SDimitry Andric   MachineFunction &MF = B.getMF();
531281ad6265SDimitry Andric   const LLT S64 = LLT::scalar(64);
531381ad6265SDimitry Andric 
531481ad6265SDimitry Andric   Register SGPR01(AMDGPU::SGPR0_SGPR1);
531581ad6265SDimitry Andric   // For code object version 5, queue_ptr is passed through implicit kernarg.
531681ad6265SDimitry Andric   if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) {
531781ad6265SDimitry Andric     AMDGPUTargetLowering::ImplicitParameter Param =
531881ad6265SDimitry Andric         AMDGPUTargetLowering::QUEUE_PTR;
531981ad6265SDimitry Andric     uint64_t Offset =
532081ad6265SDimitry Andric         ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
532181ad6265SDimitry Andric 
532281ad6265SDimitry Andric     Register KernargPtrReg = MRI.createGenericVirtualRegister(
532381ad6265SDimitry Andric         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
532481ad6265SDimitry Andric 
532581ad6265SDimitry Andric     if (!loadInputValue(KernargPtrReg, B,
532681ad6265SDimitry Andric                         AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
532781ad6265SDimitry Andric       return false;
532881ad6265SDimitry Andric 
532981ad6265SDimitry Andric     // TODO: can we be smarter about machine pointer info?
533081ad6265SDimitry Andric     MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
533181ad6265SDimitry Andric     MachineMemOperand *MMO = MF.getMachineMemOperand(
533281ad6265SDimitry Andric         PtrInfo,
533381ad6265SDimitry Andric         MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
533481ad6265SDimitry Andric             MachineMemOperand::MOInvariant,
533581ad6265SDimitry Andric         LLT::scalar(64), commonAlignment(Align(64), Offset));
533681ad6265SDimitry Andric 
533781ad6265SDimitry Andric     // Pointer address
533881ad6265SDimitry Andric     Register LoadAddr = MRI.createGenericVirtualRegister(
533981ad6265SDimitry Andric         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
534081ad6265SDimitry Andric     B.buildPtrAdd(LoadAddr, KernargPtrReg,
534181ad6265SDimitry Andric                   B.buildConstant(LLT::scalar(64), Offset).getReg(0));
534281ad6265SDimitry Andric     // Load address
534381ad6265SDimitry Andric     Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
534481ad6265SDimitry Andric     B.buildCopy(SGPR01, Temp);
534581ad6265SDimitry Andric     B.buildInstr(AMDGPU::S_TRAP)
534681ad6265SDimitry Andric         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
534781ad6265SDimitry Andric         .addReg(SGPR01, RegState::Implicit);
534881ad6265SDimitry Andric     MI.eraseFromParent();
534981ad6265SDimitry Andric     return true;
535081ad6265SDimitry Andric   }
535181ad6265SDimitry Andric 
53525ffd83dbSDimitry Andric   // Pass queue pointer to trap handler as input, and insert trap instruction
53535ffd83dbSDimitry Andric   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
5354e8d8bef9SDimitry Andric   Register LiveIn =
5355e8d8bef9SDimitry Andric     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
5356e8d8bef9SDimitry Andric   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
53575ffd83dbSDimitry Andric     return false;
5358e8d8bef9SDimitry Andric 
53595ffd83dbSDimitry Andric   B.buildCopy(SGPR01, LiveIn);
53605ffd83dbSDimitry Andric   B.buildInstr(AMDGPU::S_TRAP)
5361fe6060f1SDimitry Andric       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
53625ffd83dbSDimitry Andric       .addReg(SGPR01, RegState::Implicit);
5363fe6060f1SDimitry Andric 
5364fe6060f1SDimitry Andric   MI.eraseFromParent();
5365fe6060f1SDimitry Andric   return true;
53665ffd83dbSDimitry Andric }
53675ffd83dbSDimitry Andric 
5368fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeTrapHsa(
5369fe6060f1SDimitry Andric     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5370fe6060f1SDimitry Andric   B.buildInstr(AMDGPU::S_TRAP)
5371fe6060f1SDimitry Andric       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
53725ffd83dbSDimitry Andric   MI.eraseFromParent();
53735ffd83dbSDimitry Andric   return true;
53745ffd83dbSDimitry Andric }
53755ffd83dbSDimitry Andric 
53765ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
53775ffd83dbSDimitry Andric     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5378349cc55cSDimitry Andric   // Is non-HSA path or trap-handler disabled? Then, report a warning
53795ffd83dbSDimitry Andric   // accordingly
5380fe6060f1SDimitry Andric   if (!ST.isTrapHandlerEnabled() ||
5381fe6060f1SDimitry Andric       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
53825ffd83dbSDimitry Andric     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
53835ffd83dbSDimitry Andric                                      "debugtrap handler not supported",
53845ffd83dbSDimitry Andric                                      MI.getDebugLoc(), DS_Warning);
53855ffd83dbSDimitry Andric     LLVMContext &Ctx = B.getMF().getFunction().getContext();
53865ffd83dbSDimitry Andric     Ctx.diagnose(NoTrap);
53875ffd83dbSDimitry Andric   } else {
53885ffd83dbSDimitry Andric     // Insert debug-trap instruction
5389fe6060f1SDimitry Andric     B.buildInstr(AMDGPU::S_TRAP)
5390fe6060f1SDimitry Andric         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
53915ffd83dbSDimitry Andric   }
53925ffd83dbSDimitry Andric 
53935ffd83dbSDimitry Andric   MI.eraseFromParent();
53945ffd83dbSDimitry Andric   return true;
53955ffd83dbSDimitry Andric }
53965ffd83dbSDimitry Andric 
5397e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
5398e8d8bef9SDimitry Andric                                                MachineIRBuilder &B) const {
5399e8d8bef9SDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
5400e8d8bef9SDimitry Andric   const LLT S16 = LLT::scalar(16);
5401e8d8bef9SDimitry Andric   const LLT S32 = LLT::scalar(32);
540281ad6265SDimitry Andric   const LLT V2S16 = LLT::fixed_vector(2, 16);
540381ad6265SDimitry Andric   const LLT V3S32 = LLT::fixed_vector(3, 32);
5404e8d8bef9SDimitry Andric 
5405e8d8bef9SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
5406e8d8bef9SDimitry Andric   Register NodePtr = MI.getOperand(2).getReg();
5407e8d8bef9SDimitry Andric   Register RayExtent = MI.getOperand(3).getReg();
5408e8d8bef9SDimitry Andric   Register RayOrigin = MI.getOperand(4).getReg();
5409e8d8bef9SDimitry Andric   Register RayDir = MI.getOperand(5).getReg();
5410e8d8bef9SDimitry Andric   Register RayInvDir = MI.getOperand(6).getReg();
5411e8d8bef9SDimitry Andric   Register TDescr = MI.getOperand(7).getReg();
5412e8d8bef9SDimitry Andric 
5413fe6060f1SDimitry Andric   if (!ST.hasGFX10_AEncoding()) {
5414fe6060f1SDimitry Andric     DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
5415fe6060f1SDimitry Andric                                         "intrinsic not supported on subtarget",
5416fe6060f1SDimitry Andric                                         MI.getDebugLoc());
5417fe6060f1SDimitry Andric     B.getMF().getFunction().getContext().diagnose(BadIntrin);
5418fe6060f1SDimitry Andric     return false;
5419fe6060f1SDimitry Andric   }
5420fe6060f1SDimitry Andric 
542181ad6265SDimitry Andric   const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
5422349cc55cSDimitry Andric   const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
5423349cc55cSDimitry Andric   const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
5424349cc55cSDimitry Andric   const unsigned NumVDataDwords = 4;
5425349cc55cSDimitry Andric   const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
542681ad6265SDimitry Andric   const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
542781ad6265SDimitry Andric   const bool UseNSA = ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize();
5428349cc55cSDimitry Andric   const unsigned BaseOpcodes[2][2] = {
5429349cc55cSDimitry Andric       {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
5430349cc55cSDimitry Andric       {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
5431349cc55cSDimitry Andric        AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
5432349cc55cSDimitry Andric   int Opcode;
5433349cc55cSDimitry Andric   if (UseNSA) {
543481ad6265SDimitry Andric     Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
543581ad6265SDimitry Andric                                    IsGFX11Plus ? AMDGPU::MIMGEncGfx11NSA
543681ad6265SDimitry Andric                                                : AMDGPU::MIMGEncGfx10NSA,
5437349cc55cSDimitry Andric                                    NumVDataDwords, NumVAddrDwords);
5438349cc55cSDimitry Andric   } else {
543981ad6265SDimitry Andric     Opcode = AMDGPU::getMIMGOpcode(
544081ad6265SDimitry Andric         BaseOpcodes[Is64][IsA16],
544181ad6265SDimitry Andric         IsGFX11Plus ? AMDGPU::MIMGEncGfx11Default : AMDGPU::MIMGEncGfx10Default,
5442*bdd1243dSDimitry Andric         NumVDataDwords, NumVAddrDwords);
5443349cc55cSDimitry Andric   }
5444349cc55cSDimitry Andric   assert(Opcode != -1);
5445e8d8bef9SDimitry Andric 
5446e8d8bef9SDimitry Andric   SmallVector<Register, 12> Ops;
544781ad6265SDimitry Andric   if (UseNSA && IsGFX11Plus) {
544881ad6265SDimitry Andric     auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
544981ad6265SDimitry Andric       auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
5450*bdd1243dSDimitry Andric       auto Merged = B.buildMergeLikeInstr(
545181ad6265SDimitry Andric           V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
545281ad6265SDimitry Andric       Ops.push_back(Merged.getReg(0));
545381ad6265SDimitry Andric     };
545481ad6265SDimitry Andric 
545581ad6265SDimitry Andric     Ops.push_back(NodePtr);
545681ad6265SDimitry Andric     Ops.push_back(RayExtent);
545781ad6265SDimitry Andric     packLanes(RayOrigin);
545881ad6265SDimitry Andric 
545981ad6265SDimitry Andric     if (IsA16) {
546081ad6265SDimitry Andric       auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
546181ad6265SDimitry Andric       auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
5462*bdd1243dSDimitry Andric       auto MergedDir = B.buildMergeLikeInstr(
546381ad6265SDimitry Andric           V3S32,
5464*bdd1243dSDimitry Andric           {B.buildBitcast(
5465*bdd1243dSDimitry Andric                 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
546681ad6265SDimitry Andric                                                    UnmergeRayDir.getReg(0)}))
546781ad6265SDimitry Andric                .getReg(0),
5468*bdd1243dSDimitry Andric            B.buildBitcast(
5469*bdd1243dSDimitry Andric                 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
547081ad6265SDimitry Andric                                                    UnmergeRayDir.getReg(1)}))
547181ad6265SDimitry Andric                .getReg(0),
5472*bdd1243dSDimitry Andric            B.buildBitcast(
5473*bdd1243dSDimitry Andric                 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
547481ad6265SDimitry Andric                                                    UnmergeRayDir.getReg(2)}))
547581ad6265SDimitry Andric                .getReg(0)});
547681ad6265SDimitry Andric       Ops.push_back(MergedDir.getReg(0));
547781ad6265SDimitry Andric     } else {
547881ad6265SDimitry Andric       packLanes(RayDir);
547981ad6265SDimitry Andric       packLanes(RayInvDir);
548081ad6265SDimitry Andric     }
548181ad6265SDimitry Andric   } else {
5482e8d8bef9SDimitry Andric     if (Is64) {
5483e8d8bef9SDimitry Andric       auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
5484e8d8bef9SDimitry Andric       Ops.push_back(Unmerge.getReg(0));
5485e8d8bef9SDimitry Andric       Ops.push_back(Unmerge.getReg(1));
5486e8d8bef9SDimitry Andric     } else {
5487e8d8bef9SDimitry Andric       Ops.push_back(NodePtr);
5488e8d8bef9SDimitry Andric     }
5489e8d8bef9SDimitry Andric     Ops.push_back(RayExtent);
5490e8d8bef9SDimitry Andric 
5491e8d8bef9SDimitry Andric     auto packLanes = [&Ops, &S32, &B](Register Src) {
54920eae32dcSDimitry Andric       auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
5493e8d8bef9SDimitry Andric       Ops.push_back(Unmerge.getReg(0));
5494e8d8bef9SDimitry Andric       Ops.push_back(Unmerge.getReg(1));
5495e8d8bef9SDimitry Andric       Ops.push_back(Unmerge.getReg(2));
5496e8d8bef9SDimitry Andric     };
5497e8d8bef9SDimitry Andric 
5498e8d8bef9SDimitry Andric     packLanes(RayOrigin);
5499e8d8bef9SDimitry Andric     if (IsA16) {
55000eae32dcSDimitry Andric       auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
55010eae32dcSDimitry Andric       auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
5502e8d8bef9SDimitry Andric       Register R1 = MRI.createGenericVirtualRegister(S32);
5503e8d8bef9SDimitry Andric       Register R2 = MRI.createGenericVirtualRegister(S32);
5504e8d8bef9SDimitry Andric       Register R3 = MRI.createGenericVirtualRegister(S32);
5505*bdd1243dSDimitry Andric       B.buildMergeLikeInstr(R1,
5506*bdd1243dSDimitry Andric                             {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
5507*bdd1243dSDimitry Andric       B.buildMergeLikeInstr(
5508*bdd1243dSDimitry Andric           R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
5509*bdd1243dSDimitry Andric       B.buildMergeLikeInstr(
5510*bdd1243dSDimitry Andric           R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
5511e8d8bef9SDimitry Andric       Ops.push_back(R1);
5512e8d8bef9SDimitry Andric       Ops.push_back(R2);
5513e8d8bef9SDimitry Andric       Ops.push_back(R3);
5514e8d8bef9SDimitry Andric     } else {
5515e8d8bef9SDimitry Andric       packLanes(RayDir);
5516e8d8bef9SDimitry Andric       packLanes(RayInvDir);
5517e8d8bef9SDimitry Andric     }
551881ad6265SDimitry Andric   }
5519e8d8bef9SDimitry Andric 
5520349cc55cSDimitry Andric   if (!UseNSA) {
5521349cc55cSDimitry Andric     // Build a single vector containing all the operands so far prepared.
5522349cc55cSDimitry Andric     LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
5523*bdd1243dSDimitry Andric     Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
5524349cc55cSDimitry Andric     Ops.clear();
5525349cc55cSDimitry Andric     Ops.push_back(MergedOps);
5526349cc55cSDimitry Andric   }
5527349cc55cSDimitry Andric 
5528e8d8bef9SDimitry Andric   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
5529e8d8bef9SDimitry Andric     .addDef(DstReg)
5530e8d8bef9SDimitry Andric     .addImm(Opcode);
5531e8d8bef9SDimitry Andric 
5532e8d8bef9SDimitry Andric   for (Register R : Ops) {
5533e8d8bef9SDimitry Andric     MIB.addUse(R);
5534e8d8bef9SDimitry Andric   }
5535e8d8bef9SDimitry Andric 
5536e8d8bef9SDimitry Andric   MIB.addUse(TDescr)
5537e8d8bef9SDimitry Andric      .addImm(IsA16 ? 1 : 0)
5538e8d8bef9SDimitry Andric      .cloneMemRefs(MI);
5539e8d8bef9SDimitry Andric 
5540e8d8bef9SDimitry Andric   MI.eraseFromParent();
5541e8d8bef9SDimitry Andric   return true;
5542e8d8bef9SDimitry Andric }
5543e8d8bef9SDimitry Andric 
554481ad6265SDimitry Andric bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
554581ad6265SDimitry Andric                                                MachineIRBuilder &B) const {
554681ad6265SDimitry Andric   unsigned Opc;
554781ad6265SDimitry Andric   int RoundMode = MI.getOperand(2).getImm();
554881ad6265SDimitry Andric 
554981ad6265SDimitry Andric   if (RoundMode == (int)RoundingMode::TowardPositive)
555081ad6265SDimitry Andric     Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
555181ad6265SDimitry Andric   else if (RoundMode == (int)RoundingMode::TowardNegative)
555281ad6265SDimitry Andric     Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
555381ad6265SDimitry Andric   else
555481ad6265SDimitry Andric     return false;
555581ad6265SDimitry Andric 
555681ad6265SDimitry Andric   B.buildInstr(Opc)
555781ad6265SDimitry Andric       .addDef(MI.getOperand(0).getReg())
555881ad6265SDimitry Andric       .addUse(MI.getOperand(1).getReg());
555981ad6265SDimitry Andric 
556004eeddc0SDimitry Andric   MI.eraseFromParent();
556181ad6265SDimitry Andric 
556204eeddc0SDimitry Andric   return true;
556304eeddc0SDimitry Andric }
556404eeddc0SDimitry Andric 
/// Main dispatch for legalizing target intrinsics. Returns true if the
/// intrinsic was handled (possibly by erasing/replacing \p MI), false if
/// legalization failed for a case that expected a recognizable pattern
/// (e.g. a control-flow intrinsic whose G_BRCOND user could not be verified).
bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                            MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    // verifyCFIntrinsic locates the G_BRCOND consuming this intrinsic's
    // result (and the optional trailing G_BR / fallthrough target). If the
    // pattern doesn't match, we cannot lower and must fail.
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();

      // If the condition was fed through a NOT, lowering swaps which target
      // is the taken vs. fallthrough destination.
      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      // Insert the SI_IF/SI_ELSE pseudo where the G_BRCOND was; it both
      // manipulates exec and branches to UncondBrTarget.
      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
          .addDef(Def)
          .addUse(Use)
          .addMBB(UncondBrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
            .addDef(Def)
            .addUse(Use)
            .addMBB(UncondBrTarget);
      }

      if (Br) {
        // Redirect the existing unconditional branch at the condition target.
        Br->getOperand(0).setMBB(CondBrTarget);
      } else {
        // The IRTranslator skips inserting the G_BR for fallthrough cases, but
        // since we're swapping branch targets it needs to be reinserted.
        // FIXME: IRTranslator should probably not do this
        B.buildBr(*CondBrTarget);
      }

      // The pseudo's def/use are lane masks, so pin them to the wave mask
      // register class (SGPR pair / single SGPR depending on wavesize).
      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      // Erase the intrinsic first, then the branch it fed.
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    // Same pattern as amdgcn_if/else, but lowers to SI_LOOP which consumes
    // (rather than defines) the lane-mask register.
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      Register Reg = MI.getOperand(2).getReg();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_lds_kernel_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  // The r600 read intrinsics are lowered to loads from fixed offsets in the
  // kernarg segment.
  case Intrinsic::r600_read_ngroups_x:
    // TODO: Emit error for hsa
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_X);
  case Intrinsic::r600_read_ngroups_y:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_Y);
  case Intrinsic::r600_read_ngroups_z:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_Z);
  case Intrinsic::r600_read_local_size_x:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X);
  case Intrinsic::r600_read_local_size_y:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B,  SI::KernelInputOffsets::LOCAL_SIZE_Y);
  case Intrinsic::r600_read_local_size_z:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z);
  case Intrinsic::r600_read_global_size_x:
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X);
  case Intrinsic::r600_read_global_size_y:
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y);
  case Intrinsic::r600_read_global_size_z:
    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    // Wavefront size is a subtarget constant; fold it directly.
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
  // Buffer store/load variants share one legalizer, parameterized on
  // (IsTyped, IsFormat).
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  // All buffer atomics go through one legalizer keyed on the intrinsic ID.
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
  default: {
    // Image intrinsics have their own table-driven legalization; everything
    // else is considered legal as-is.
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}
5815