xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (revision fcaf7f8644a9988098ac6be2165bce3ea4786e91)
10b57cec5SDimitry Andric //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
20b57cec5SDimitry Andric //
30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
60b57cec5SDimitry Andric //
70b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
80b57cec5SDimitry Andric /// \file
90b57cec5SDimitry Andric /// This file implements the targeting of the Machinelegalizer class for
100b57cec5SDimitry Andric /// AMDGPU.
110b57cec5SDimitry Andric /// \todo This should be generated by TableGen.
120b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
130b57cec5SDimitry Andric 
145ffd83dbSDimitry Andric #include "AMDGPULegalizerInfo.h"
158bcb0991SDimitry Andric 
160b57cec5SDimitry Andric #include "AMDGPU.h"
175ffd83dbSDimitry Andric #include "AMDGPUGlobalISelUtils.h"
18e8d8bef9SDimitry Andric #include "AMDGPUInstrInfo.h"
190b57cec5SDimitry Andric #include "AMDGPUTargetMachine.h"
200b57cec5SDimitry Andric #include "SIMachineFunctionInfo.h"
21fe6060f1SDimitry Andric #include "Utils/AMDGPUBaseInfo.h"
225ffd83dbSDimitry Andric #include "llvm/ADT/ScopeExit.h"
23fe6060f1SDimitry Andric #include "llvm/BinaryFormat/ELF.h"
240b57cec5SDimitry Andric #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
255ffd83dbSDimitry Andric #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26e8d8bef9SDimitry Andric #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
278bcb0991SDimitry Andric #include "llvm/IR/DiagnosticInfo.h"
28e8d8bef9SDimitry Andric #include "llvm/IR/IntrinsicsAMDGPU.h"
2981ad6265SDimitry Andric #include "llvm/IR/IntrinsicsR600.h"
300b57cec5SDimitry Andric 
310b57cec5SDimitry Andric #define DEBUG_TYPE "amdgpu-legalinfo"
320b57cec5SDimitry Andric 
330b57cec5SDimitry Andric using namespace llvm;
340b57cec5SDimitry Andric using namespace LegalizeActions;
350b57cec5SDimitry Andric using namespace LegalizeMutations;
360b57cec5SDimitry Andric using namespace LegalityPredicates;
375ffd83dbSDimitry Andric using namespace MIPatternMatch;
380b57cec5SDimitry Andric 
395ffd83dbSDimitry Andric // Hack until load/store selection patterns support any tuple of legal types.
405ffd83dbSDimitry Andric static cl::opt<bool> EnableNewLegality(
415ffd83dbSDimitry Andric   "amdgpu-global-isel-new-legality",
425ffd83dbSDimitry Andric   cl::desc("Use GlobalISel desired legality, rather than try to use"
435ffd83dbSDimitry Andric            "rules compatible with selection patterns"),
445ffd83dbSDimitry Andric   cl::init(false),
455ffd83dbSDimitry Andric   cl::ReallyHidden);
460b57cec5SDimitry Andric 
475ffd83dbSDimitry Andric static constexpr unsigned MaxRegisterSize = 1024;
485ffd83dbSDimitry Andric 
495ffd83dbSDimitry Andric // Round the number of elements to the next power of two elements
505ffd83dbSDimitry Andric static LLT getPow2VectorType(LLT Ty) {
515ffd83dbSDimitry Andric   unsigned NElts = Ty.getNumElements();
525ffd83dbSDimitry Andric   unsigned Pow2NElts = 1 <<  Log2_32_Ceil(NElts);
53fe6060f1SDimitry Andric   return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
540b57cec5SDimitry Andric }
550b57cec5SDimitry Andric 
565ffd83dbSDimitry Andric // Round the number of bits to the next power of two bits
575ffd83dbSDimitry Andric static LLT getPow2ScalarType(LLT Ty) {
585ffd83dbSDimitry Andric   unsigned Bits = Ty.getSizeInBits();
595ffd83dbSDimitry Andric   unsigned Pow2Bits = 1 <<  Log2_32_Ceil(Bits);
605ffd83dbSDimitry Andric   return LLT::scalar(Pow2Bits);
618bcb0991SDimitry Andric }
628bcb0991SDimitry Andric 
63349cc55cSDimitry Andric /// \returns true if this is an odd sized vector which should widen by adding an
64e8d8bef9SDimitry Andric /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
65e8d8bef9SDimitry Andric /// excludes s1 vectors, which should always be scalarized.
660b57cec5SDimitry Andric static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
670b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
680b57cec5SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
69e8d8bef9SDimitry Andric     if (!Ty.isVector())
70e8d8bef9SDimitry Andric       return false;
71e8d8bef9SDimitry Andric 
72e8d8bef9SDimitry Andric     const LLT EltTy = Ty.getElementType();
73e8d8bef9SDimitry Andric     const unsigned EltSize = EltTy.getSizeInBits();
74e8d8bef9SDimitry Andric     return Ty.getNumElements() % 2 != 0 &&
75e8d8bef9SDimitry Andric            EltSize > 1 && EltSize < 32 &&
768bcb0991SDimitry Andric            Ty.getSizeInBits() % 32 != 0;
778bcb0991SDimitry Andric   };
788bcb0991SDimitry Andric }
798bcb0991SDimitry Andric 
80e8d8bef9SDimitry Andric static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
81e8d8bef9SDimitry Andric   return [=](const LegalityQuery &Query) {
82e8d8bef9SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
83e8d8bef9SDimitry Andric     return Ty.getSizeInBits() % 32 == 0;
84e8d8bef9SDimitry Andric   };
85e8d8bef9SDimitry Andric }
86e8d8bef9SDimitry Andric 
878bcb0991SDimitry Andric static LegalityPredicate isWideVec16(unsigned TypeIdx) {
888bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
898bcb0991SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
908bcb0991SDimitry Andric     const LLT EltTy = Ty.getScalarType();
918bcb0991SDimitry Andric     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
920b57cec5SDimitry Andric   };
930b57cec5SDimitry Andric }
940b57cec5SDimitry Andric 
950b57cec5SDimitry Andric static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
960b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
970b57cec5SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
980b57cec5SDimitry Andric     const LLT EltTy = Ty.getElementType();
99fe6060f1SDimitry Andric     return std::make_pair(TypeIdx,
100fe6060f1SDimitry Andric                           LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
1010b57cec5SDimitry Andric   };
1020b57cec5SDimitry Andric }
1030b57cec5SDimitry Andric 
// Mutation: split the vector at \p TypeIdx into pieces of at most 64 bits by
// reducing the element count; the element type is preserved.
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    // Number of <=64-bit pieces needed to cover the whole vector (round up).
    unsigned Pieces = (Size + 63) / 64;
    // Elements per piece; the +1 rounds odd element counts up so no element
    // is dropped.
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    // scalarOrVector collapses a single-element result to a plain scalar.
    return std::make_pair(
        TypeIdx,
        LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
  };
}
1160b57cec5SDimitry Andric 
// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    // Number of 32-bit dwords needed to hold the vector, rounded up.
    const int NextMul32 = (Size + 31) / 32;

    // Only sub-dword elements need padding; >=32-bit elements already fill
    // whole dwords.
    assert(EltSize < 32);

    // Smallest element count that covers NextMul32 dwords (round up).
    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
  };
}
1348bcb0991SDimitry Andric 
135e8d8bef9SDimitry Andric static LLT getBitcastRegisterType(const LLT Ty) {
136e8d8bef9SDimitry Andric   const unsigned Size = Ty.getSizeInBits();
1375ffd83dbSDimitry Andric 
1385ffd83dbSDimitry Andric   if (Size <= 32) {
1395ffd83dbSDimitry Andric     // <2 x s8> -> s16
1405ffd83dbSDimitry Andric     // <4 x s8> -> s32
141e8d8bef9SDimitry Andric     return LLT::scalar(Size);
142e8d8bef9SDimitry Andric   }
1435ffd83dbSDimitry Andric 
144fe6060f1SDimitry Andric   return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
145e8d8bef9SDimitry Andric }
146e8d8bef9SDimitry Andric 
147e8d8bef9SDimitry Andric static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
148e8d8bef9SDimitry Andric   return [=](const LegalityQuery &Query) {
149e8d8bef9SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
150e8d8bef9SDimitry Andric     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
151e8d8bef9SDimitry Andric   };
152e8d8bef9SDimitry Andric }
153e8d8bef9SDimitry Andric 
154e8d8bef9SDimitry Andric static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
155e8d8bef9SDimitry Andric   return [=](const LegalityQuery &Query) {
156e8d8bef9SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
157e8d8bef9SDimitry Andric     unsigned Size = Ty.getSizeInBits();
158e8d8bef9SDimitry Andric     assert(Size % 32 == 0);
159fe6060f1SDimitry Andric     return std::make_pair(
160fe6060f1SDimitry Andric         TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
1615ffd83dbSDimitry Andric   };
1625ffd83dbSDimitry Andric }
1635ffd83dbSDimitry Andric 
1648bcb0991SDimitry Andric static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
1658bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
1668bcb0991SDimitry Andric     const LLT QueryTy = Query.Types[TypeIdx];
1678bcb0991SDimitry Andric     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
1688bcb0991SDimitry Andric   };
1698bcb0991SDimitry Andric }
1708bcb0991SDimitry Andric 
1710b57cec5SDimitry Andric static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
1720b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
1730b57cec5SDimitry Andric     const LLT QueryTy = Query.Types[TypeIdx];
1740b57cec5SDimitry Andric     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
1750b57cec5SDimitry Andric   };
1760b57cec5SDimitry Andric }
1770b57cec5SDimitry Andric 
1780b57cec5SDimitry Andric static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
1790b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
1800b57cec5SDimitry Andric     const LLT QueryTy = Query.Types[TypeIdx];
1810b57cec5SDimitry Andric     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
1820b57cec5SDimitry Andric   };
1830b57cec5SDimitry Andric }
1840b57cec5SDimitry Andric 
1855ffd83dbSDimitry Andric static bool isRegisterSize(unsigned Size) {
1865ffd83dbSDimitry Andric   return Size % 32 == 0 && Size <= MaxRegisterSize;
1875ffd83dbSDimitry Andric }
1885ffd83dbSDimitry Andric 
1895ffd83dbSDimitry Andric static bool isRegisterVectorElementType(LLT EltTy) {
1905ffd83dbSDimitry Andric   const int EltSize = EltTy.getSizeInBits();
1915ffd83dbSDimitry Andric   return EltSize == 16 || EltSize % 32 == 0;
1925ffd83dbSDimitry Andric }
1935ffd83dbSDimitry Andric 
1945ffd83dbSDimitry Andric static bool isRegisterVectorType(LLT Ty) {
1950b57cec5SDimitry Andric   const int EltSize = Ty.getElementType().getSizeInBits();
1960b57cec5SDimitry Andric   return EltSize == 32 || EltSize == 64 ||
1970b57cec5SDimitry Andric          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
1980b57cec5SDimitry Andric          EltSize == 128 || EltSize == 256;
1990b57cec5SDimitry Andric }
2000b57cec5SDimitry Andric 
2015ffd83dbSDimitry Andric static bool isRegisterType(LLT Ty) {
2025ffd83dbSDimitry Andric   if (!isRegisterSize(Ty.getSizeInBits()))
2035ffd83dbSDimitry Andric     return false;
2045ffd83dbSDimitry Andric 
2055ffd83dbSDimitry Andric   if (Ty.isVector())
2065ffd83dbSDimitry Andric     return isRegisterVectorType(Ty);
2075ffd83dbSDimitry Andric 
2085ffd83dbSDimitry Andric   return true;
2095ffd83dbSDimitry Andric }
2105ffd83dbSDimitry Andric 
2115ffd83dbSDimitry Andric // Any combination of 32 or 64-bit elements up the maximum register size, and
2125ffd83dbSDimitry Andric // multiples of v2s16.
2135ffd83dbSDimitry Andric static LegalityPredicate isRegisterType(unsigned TypeIdx) {
2145ffd83dbSDimitry Andric   return [=](const LegalityQuery &Query) {
2155ffd83dbSDimitry Andric     return isRegisterType(Query.Types[TypeIdx]);
2168bcb0991SDimitry Andric   };
2178bcb0991SDimitry Andric }
2188bcb0991SDimitry Andric 
2195ffd83dbSDimitry Andric static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
2208bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
2215ffd83dbSDimitry Andric     const LLT QueryTy = Query.Types[TypeIdx];
2225ffd83dbSDimitry Andric     if (!QueryTy.isVector())
2235ffd83dbSDimitry Andric       return false;
2245ffd83dbSDimitry Andric     const LLT EltTy = QueryTy.getElementType();
2255ffd83dbSDimitry Andric     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
2268bcb0991SDimitry Andric   };
2278bcb0991SDimitry Andric }
2288bcb0991SDimitry Andric 
229fe6060f1SDimitry Andric // If we have a truncating store or an extending load with a data size larger
230fe6060f1SDimitry Andric // than 32-bits, we need to reduce to a 32-bit type.
231fe6060f1SDimitry Andric static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
2328bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
2338bcb0991SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
2348bcb0991SDimitry Andric     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
235fe6060f1SDimitry Andric            Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
2360b57cec5SDimitry Andric   };
2370b57cec5SDimitry Andric }
2380b57cec5SDimitry Andric 
// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
//
// \returns the maximum load (\p IsLoad) or store size in bits that is legal
// for address space \p AS on subtarget \p ST.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return ST.enableFlatScratch() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable for
    // global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written in a
    // kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}
2665ffd83dbSDimitry Andric 
// \returns true if a load/store described by \p Query has a combination of
// register type, memory size, address space and alignment that the selector
// can handle directly (no lowering/splitting needed).
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
  uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // Do not handle extending vector loads.
  if (Ty.isVector() && MemSize != RegSize)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough, but
  // we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  // Reject accesses wider than the address space supports.
  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  // Whitelist of memory access widths the selector handles.
  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    // dwordx3 accesses only exist on some subtargets.
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  // Under-aligned accesses are only legal if the target tolerates the
  // misalignment for this size/address space.
  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}
3325ffd83dbSDimitry Andric 
3335ffd83dbSDimitry Andric // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
3345ffd83dbSDimitry Andric // workaround this. Eventually it should ignore the type for loads and only care
3355ffd83dbSDimitry Andric // about the size. Return true in cases where we will workaround this for now by
3365ffd83dbSDimitry Andric // bitcasting.
3375ffd83dbSDimitry Andric static bool loadStoreBitcastWorkaround(const LLT Ty) {
3385ffd83dbSDimitry Andric   if (EnableNewLegality)
3395ffd83dbSDimitry Andric     return false;
3405ffd83dbSDimitry Andric 
3415ffd83dbSDimitry Andric   const unsigned Size = Ty.getSizeInBits();
3425ffd83dbSDimitry Andric   if (Size <= 64)
3435ffd83dbSDimitry Andric     return false;
3445ffd83dbSDimitry Andric   if (!Ty.isVector())
3455ffd83dbSDimitry Andric     return true;
346e8d8bef9SDimitry Andric 
347e8d8bef9SDimitry Andric   LLT EltTy = Ty.getElementType();
348e8d8bef9SDimitry Andric   if (EltTy.isPointer())
349e8d8bef9SDimitry Andric     return true;
350e8d8bef9SDimitry Andric 
351e8d8bef9SDimitry Andric   unsigned EltSize = EltTy.getSizeInBits();
3525ffd83dbSDimitry Andric   return EltSize != 32 && EltSize != 64;
3535ffd83dbSDimitry Andric }
3545ffd83dbSDimitry Andric 
355fe6060f1SDimitry Andric static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
3565ffd83dbSDimitry Andric   const LLT Ty = Query.Types[0];
357fe6060f1SDimitry Andric   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
3585ffd83dbSDimitry Andric          !loadStoreBitcastWorkaround(Ty);
3595ffd83dbSDimitry Andric }
3605ffd83dbSDimitry Andric 
/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const LLT MemTy) {
  const unsigned MemSizeInBits = MemTy.getSizeInBits();
  const unsigned Size = Ty.getSizeInBits();
  // Extending/truncating access: only bitcast small (<=32-bit) vectors.
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  // Types hit by the selector workaround are bitcast if they are otherwise
  // register-shaped.
  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;

  // Don't try to handle bitcasting vector ext loads for now.
  return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
         (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}
378e8d8bef9SDimitry Andric 
/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this case when the memory access itself
/// changes, not the size of the result register.
///
/// \p MemoryTy is the in-memory type, \p AlignInBits the known alignment,
/// \p AddrSpace the pointer's address space, and \p Opcode the load opcode
/// (forwarded to maxSizeForAddrSpace as its IsLoad argument).
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
                            uint64_t AlignInBits, unsigned AddrSpace,
                            unsigned Opcode) {
  unsigned SizeInBits = MemoryTy.getSizeInBits();
  // We don't want to widen cases that are naturally legal.
  if (isPowerOf2_32(SizeInBits))
    return false;

  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
  // end up widening these for a scalar load during RegBankSelect, since there
  // aren't 96-bit scalar loads.
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
    return false;

  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
    return false;

  // A load is known dereferenceable up to the alignment, so it's legal to widen
  // to it.
  //
  // TODO: Could check dereferenceable for less aligned cases.
  unsigned RoundedSize = NextPowerOf2(SizeInBits);
  if (AlignInBits < RoundedSize)
    return false;

  // Do not widen if it would introduce a slow unaligned load.
  const SITargetLowering *TLI = ST.getTargetLowering();
  bool Fast = false;
  return TLI->allowsMisalignedMemoryAccessesImpl(
             RoundedSize, AddrSpace, Align(AlignInBits / 8),
             MachineMemOperand::MOLoad, &Fast) &&
         Fast;
}
415e8d8bef9SDimitry Andric 
416e8d8bef9SDimitry Andric static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
417e8d8bef9SDimitry Andric                             unsigned Opcode) {
418e8d8bef9SDimitry Andric   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
419e8d8bef9SDimitry Andric     return false;
420e8d8bef9SDimitry Andric 
421fe6060f1SDimitry Andric   return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
422e8d8bef9SDimitry Andric                          Query.MMODescrs[0].AlignInBits,
423e8d8bef9SDimitry Andric                          Query.Types[1].getAddressSpace(), Opcode);
424e8d8bef9SDimitry Andric }
425e8d8bef9SDimitry Andric 
4260b57cec5SDimitry Andric AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
4270b57cec5SDimitry Andric                                          const GCNTargetMachine &TM)
4280b57cec5SDimitry Andric   :  ST(ST_) {
4290b57cec5SDimitry Andric   using namespace TargetOpcode;
4300b57cec5SDimitry Andric 
4310b57cec5SDimitry Andric   auto GetAddrSpacePtr = [&TM](unsigned AS) {
4320b57cec5SDimitry Andric     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
4330b57cec5SDimitry Andric   };
4340b57cec5SDimitry Andric 
4350b57cec5SDimitry Andric   const LLT S1 = LLT::scalar(1);
436e8d8bef9SDimitry Andric   const LLT S8 = LLT::scalar(8);
4370b57cec5SDimitry Andric   const LLT S16 = LLT::scalar(16);
4380b57cec5SDimitry Andric   const LLT S32 = LLT::scalar(32);
4390b57cec5SDimitry Andric   const LLT S64 = LLT::scalar(64);
4400b57cec5SDimitry Andric   const LLT S128 = LLT::scalar(128);
4410b57cec5SDimitry Andric   const LLT S256 = LLT::scalar(256);
4425ffd83dbSDimitry Andric   const LLT S512 = LLT::scalar(512);
4435ffd83dbSDimitry Andric   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
4440b57cec5SDimitry Andric 
445fe6060f1SDimitry Andric   const LLT V2S8 = LLT::fixed_vector(2, 8);
446fe6060f1SDimitry Andric   const LLT V2S16 = LLT::fixed_vector(2, 16);
447fe6060f1SDimitry Andric   const LLT V4S16 = LLT::fixed_vector(4, 16);
4480b57cec5SDimitry Andric 
449fe6060f1SDimitry Andric   const LLT V2S32 = LLT::fixed_vector(2, 32);
450fe6060f1SDimitry Andric   const LLT V3S32 = LLT::fixed_vector(3, 32);
451fe6060f1SDimitry Andric   const LLT V4S32 = LLT::fixed_vector(4, 32);
452fe6060f1SDimitry Andric   const LLT V5S32 = LLT::fixed_vector(5, 32);
453fe6060f1SDimitry Andric   const LLT V6S32 = LLT::fixed_vector(6, 32);
454fe6060f1SDimitry Andric   const LLT V7S32 = LLT::fixed_vector(7, 32);
455fe6060f1SDimitry Andric   const LLT V8S32 = LLT::fixed_vector(8, 32);
456fe6060f1SDimitry Andric   const LLT V9S32 = LLT::fixed_vector(9, 32);
457fe6060f1SDimitry Andric   const LLT V10S32 = LLT::fixed_vector(10, 32);
458fe6060f1SDimitry Andric   const LLT V11S32 = LLT::fixed_vector(11, 32);
459fe6060f1SDimitry Andric   const LLT V12S32 = LLT::fixed_vector(12, 32);
460fe6060f1SDimitry Andric   const LLT V13S32 = LLT::fixed_vector(13, 32);
461fe6060f1SDimitry Andric   const LLT V14S32 = LLT::fixed_vector(14, 32);
462fe6060f1SDimitry Andric   const LLT V15S32 = LLT::fixed_vector(15, 32);
463fe6060f1SDimitry Andric   const LLT V16S32 = LLT::fixed_vector(16, 32);
464fe6060f1SDimitry Andric   const LLT V32S32 = LLT::fixed_vector(32, 32);
4650b57cec5SDimitry Andric 
466fe6060f1SDimitry Andric   const LLT V2S64 = LLT::fixed_vector(2, 64);
467fe6060f1SDimitry Andric   const LLT V3S64 = LLT::fixed_vector(3, 64);
468fe6060f1SDimitry Andric   const LLT V4S64 = LLT::fixed_vector(4, 64);
469fe6060f1SDimitry Andric   const LLT V5S64 = LLT::fixed_vector(5, 64);
470fe6060f1SDimitry Andric   const LLT V6S64 = LLT::fixed_vector(6, 64);
471fe6060f1SDimitry Andric   const LLT V7S64 = LLT::fixed_vector(7, 64);
472fe6060f1SDimitry Andric   const LLT V8S64 = LLT::fixed_vector(8, 64);
473fe6060f1SDimitry Andric   const LLT V16S64 = LLT::fixed_vector(16, 64);
4740b57cec5SDimitry Andric 
4750b57cec5SDimitry Andric   std::initializer_list<LLT> AllS32Vectors =
4760b57cec5SDimitry Andric     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
4778bcb0991SDimitry Andric      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
4780b57cec5SDimitry Andric   std::initializer_list<LLT> AllS64Vectors =
4798bcb0991SDimitry Andric     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
4800b57cec5SDimitry Andric 
4810b57cec5SDimitry Andric   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
4820b57cec5SDimitry Andric   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
4838bcb0991SDimitry Andric   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
4840b57cec5SDimitry Andric   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
4858bcb0991SDimitry Andric   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
4860b57cec5SDimitry Andric   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
4870b57cec5SDimitry Andric   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
4880b57cec5SDimitry Andric 
4890b57cec5SDimitry Andric   const LLT CodePtr = FlatPtr;
4900b57cec5SDimitry Andric 
4910b57cec5SDimitry Andric   const std::initializer_list<LLT> AddrSpaces64 = {
4920b57cec5SDimitry Andric     GlobalPtr, ConstantPtr, FlatPtr
4930b57cec5SDimitry Andric   };
4940b57cec5SDimitry Andric 
4950b57cec5SDimitry Andric   const std::initializer_list<LLT> AddrSpaces32 = {
4968bcb0991SDimitry Andric     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
4970b57cec5SDimitry Andric   };
4980b57cec5SDimitry Andric 
4990b57cec5SDimitry Andric   const std::initializer_list<LLT> FPTypesBase = {
5000b57cec5SDimitry Andric     S32, S64
5010b57cec5SDimitry Andric   };
5020b57cec5SDimitry Andric 
5030b57cec5SDimitry Andric   const std::initializer_list<LLT> FPTypes16 = {
5040b57cec5SDimitry Andric     S32, S64, S16
5050b57cec5SDimitry Andric   };
5060b57cec5SDimitry Andric 
5070b57cec5SDimitry Andric   const std::initializer_list<LLT> FPTypesPK16 = {
5080b57cec5SDimitry Andric     S32, S64, S16, V2S16
5090b57cec5SDimitry Andric   };
5100b57cec5SDimitry Andric 
5115ffd83dbSDimitry Andric   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
5125ffd83dbSDimitry Andric 
513fe6060f1SDimitry Andric   // s1 for VCC branches, s32 for SCC branches.
514fe6060f1SDimitry Andric   getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
5150b57cec5SDimitry Andric 
5160b57cec5SDimitry Andric   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
5170b57cec5SDimitry Andric   // elements for v3s16
5180b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_PHI)
519e8d8bef9SDimitry Andric     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
5200b57cec5SDimitry Andric     .legalFor(AllS32Vectors)
5210b57cec5SDimitry Andric     .legalFor(AllS64Vectors)
5220b57cec5SDimitry Andric     .legalFor(AddrSpaces64)
5230b57cec5SDimitry Andric     .legalFor(AddrSpaces32)
524e8d8bef9SDimitry Andric     .legalIf(isPointer(0))
525e8d8bef9SDimitry Andric     .clampScalar(0, S16, S256)
5260b57cec5SDimitry Andric     .widenScalarToNextPow2(0, 32)
5270b57cec5SDimitry Andric     .clampMaxNumElements(0, S32, 16)
5280b57cec5SDimitry Andric     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
529e8d8bef9SDimitry Andric     .scalarize(0);
5300b57cec5SDimitry Andric 
531e8d8bef9SDimitry Andric   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
532e8d8bef9SDimitry Andric     // Full set of gfx9 features.
53381ad6265SDimitry Andric     getActionDefinitionsBuilder({G_ADD, G_SUB})
5345ffd83dbSDimitry Andric       .legalFor({S32, S16, V2S16})
5350eae32dcSDimitry Andric       .clampMaxNumElementsStrict(0, S16, 2)
53681ad6265SDimitry Andric       .scalarize(0)
53781ad6265SDimitry Andric       .minScalar(0, S16)
538349cc55cSDimitry Andric       .widenScalarToNextMultipleOf(0, 32)
53981ad6265SDimitry Andric       .maxScalar(0, S32);
54081ad6265SDimitry Andric 
54181ad6265SDimitry Andric     getActionDefinitionsBuilder(G_MUL)
54281ad6265SDimitry Andric       .legalFor({S32, S16, V2S16})
54381ad6265SDimitry Andric       .clampMaxNumElementsStrict(0, S16, 2)
54481ad6265SDimitry Andric       .scalarize(0)
54581ad6265SDimitry Andric       .minScalar(0, S16)
54681ad6265SDimitry Andric       .widenScalarToNextMultipleOf(0, 32)
54781ad6265SDimitry Andric       .custom();
54881ad6265SDimitry Andric     assert(ST.hasMad64_32());
549e8d8bef9SDimitry Andric 
550e8d8bef9SDimitry Andric     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
551e8d8bef9SDimitry Andric       .legalFor({S32, S16, V2S16}) // Clamp modifier
552e8d8bef9SDimitry Andric       .minScalarOrElt(0, S16)
5530eae32dcSDimitry Andric       .clampMaxNumElementsStrict(0, S16, 2)
554e8d8bef9SDimitry Andric       .scalarize(0)
555e8d8bef9SDimitry Andric       .widenScalarToNextPow2(0, 32)
556e8d8bef9SDimitry Andric       .lower();
5575ffd83dbSDimitry Andric   } else if (ST.has16BitInsts()) {
55881ad6265SDimitry Andric     getActionDefinitionsBuilder({G_ADD, G_SUB})
5590b57cec5SDimitry Andric       .legalFor({S32, S16})
560349cc55cSDimitry Andric       .minScalar(0, S16)
561349cc55cSDimitry Andric       .widenScalarToNextMultipleOf(0, 32)
562349cc55cSDimitry Andric       .maxScalar(0, S32)
563349cc55cSDimitry Andric       .scalarize(0);
564e8d8bef9SDimitry Andric 
56581ad6265SDimitry Andric     getActionDefinitionsBuilder(G_MUL)
56681ad6265SDimitry Andric       .legalFor({S32, S16})
56781ad6265SDimitry Andric       .scalarize(0)
56881ad6265SDimitry Andric       .minScalar(0, S16)
56981ad6265SDimitry Andric       .widenScalarToNextMultipleOf(0, 32)
57081ad6265SDimitry Andric       .custom();
57181ad6265SDimitry Andric     assert(ST.hasMad64_32());
57281ad6265SDimitry Andric 
573e8d8bef9SDimitry Andric     // Technically the saturating operations require clamp bit support, but this
574e8d8bef9SDimitry Andric     // was introduced at the same time as 16-bit operations.
575e8d8bef9SDimitry Andric     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
576e8d8bef9SDimitry Andric       .legalFor({S32, S16}) // Clamp modifier
577e8d8bef9SDimitry Andric       .minScalar(0, S16)
578e8d8bef9SDimitry Andric       .scalarize(0)
579e8d8bef9SDimitry Andric       .widenScalarToNextPow2(0, 16)
580e8d8bef9SDimitry Andric       .lower();
581e8d8bef9SDimitry Andric 
582e8d8bef9SDimitry Andric     // We're just lowering this, but it helps get a better result to try to
583e8d8bef9SDimitry Andric     // coerce to the desired type first.
584e8d8bef9SDimitry Andric     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
585e8d8bef9SDimitry Andric       .minScalar(0, S16)
586e8d8bef9SDimitry Andric       .scalarize(0)
587e8d8bef9SDimitry Andric       .lower();
5880b57cec5SDimitry Andric   } else {
58981ad6265SDimitry Andric     getActionDefinitionsBuilder({G_ADD, G_SUB})
5900b57cec5SDimitry Andric       .legalFor({S32})
591349cc55cSDimitry Andric       .widenScalarToNextMultipleOf(0, 32)
5920b57cec5SDimitry Andric       .clampScalar(0, S32, S32)
5930b57cec5SDimitry Andric       .scalarize(0);
594e8d8bef9SDimitry Andric 
59581ad6265SDimitry Andric     auto &Mul = getActionDefinitionsBuilder(G_MUL)
59681ad6265SDimitry Andric       .legalFor({S32})
59781ad6265SDimitry Andric       .scalarize(0)
59881ad6265SDimitry Andric       .minScalar(0, S32)
59981ad6265SDimitry Andric       .widenScalarToNextMultipleOf(0, 32);
60081ad6265SDimitry Andric 
60181ad6265SDimitry Andric     if (ST.hasMad64_32())
60281ad6265SDimitry Andric       Mul.custom();
60381ad6265SDimitry Andric     else
60481ad6265SDimitry Andric       Mul.maxScalar(0, S32);
60581ad6265SDimitry Andric 
606e8d8bef9SDimitry Andric     if (ST.hasIntClamp()) {
607e8d8bef9SDimitry Andric       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
608e8d8bef9SDimitry Andric         .legalFor({S32}) // Clamp modifier.
609e8d8bef9SDimitry Andric         .scalarize(0)
610e8d8bef9SDimitry Andric         .minScalarOrElt(0, S32)
611e8d8bef9SDimitry Andric         .lower();
612e8d8bef9SDimitry Andric     } else {
613e8d8bef9SDimitry Andric       // Clamp bit support was added in VI, along with 16-bit operations.
614e8d8bef9SDimitry Andric       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
615e8d8bef9SDimitry Andric         .minScalar(0, S32)
616e8d8bef9SDimitry Andric         .scalarize(0)
617e8d8bef9SDimitry Andric         .lower();
6180b57cec5SDimitry Andric     }
6190b57cec5SDimitry Andric 
620e8d8bef9SDimitry Andric     // FIXME: DAG expansion gets better results. The widening uses the smaller
621e8d8bef9SDimitry Andric     // range values and goes for the min/max lowering directly.
622e8d8bef9SDimitry Andric     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
623e8d8bef9SDimitry Andric       .minScalar(0, S32)
624e8d8bef9SDimitry Andric       .scalarize(0)
625e8d8bef9SDimitry Andric       .lower();
626e8d8bef9SDimitry Andric   }
627e8d8bef9SDimitry Andric 
628fe6060f1SDimitry Andric   getActionDefinitionsBuilder(
629fe6060f1SDimitry Andric       {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
6305ffd83dbSDimitry Andric       .customFor({S32, S64})
631480093f4SDimitry Andric       .clampScalar(0, S32, S64)
632480093f4SDimitry Andric       .widenScalarToNextPow2(0, 32)
633480093f4SDimitry Andric       .scalarize(0);
634480093f4SDimitry Andric 
635e8d8bef9SDimitry Andric   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
6360b57cec5SDimitry Andric                    .legalFor({S32})
637349cc55cSDimitry Andric                    .maxScalar(0, S32);
638e8d8bef9SDimitry Andric 
639e8d8bef9SDimitry Andric   if (ST.hasVOP3PInsts()) {
640e8d8bef9SDimitry Andric     Mulh
641e8d8bef9SDimitry Andric       .clampMaxNumElements(0, S8, 2)
642e8d8bef9SDimitry Andric       .lowerFor({V2S8});
643e8d8bef9SDimitry Andric   }
644e8d8bef9SDimitry Andric 
645e8d8bef9SDimitry Andric   Mulh
646e8d8bef9SDimitry Andric     .scalarize(0)
647e8d8bef9SDimitry Andric     .lower();
6480b57cec5SDimitry Andric 
6490b57cec5SDimitry Andric   // Report legal for any types we can handle anywhere. For the cases only legal
6500b57cec5SDimitry Andric   // on the SALU, RegBankSelect will be able to re-legalize.
6510b57cec5SDimitry Andric   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
6520b57cec5SDimitry Andric     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
6530b57cec5SDimitry Andric     .clampScalar(0, S32, S64)
6540b57cec5SDimitry Andric     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
6558bcb0991SDimitry Andric     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
6560b57cec5SDimitry Andric     .widenScalarToNextPow2(0)
6570b57cec5SDimitry Andric     .scalarize(0);
6580b57cec5SDimitry Andric 
6598bcb0991SDimitry Andric   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
6600b57cec5SDimitry Andric                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
661480093f4SDimitry Andric     .legalFor({{S32, S1}, {S32, S32}})
6625ffd83dbSDimitry Andric     .minScalar(0, S32)
66381ad6265SDimitry Andric     .scalarize(0)
6648bcb0991SDimitry Andric     .lower();
6650b57cec5SDimitry Andric 
6660b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_BITCAST)
6670b57cec5SDimitry Andric     // Don't worry about the size constraint.
6688bcb0991SDimitry Andric     .legalIf(all(isRegisterType(0), isRegisterType(1)))
6695ffd83dbSDimitry Andric     .lower();
6700b57cec5SDimitry Andric 
6710b57cec5SDimitry Andric 
6720b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_CONSTANT)
6738bcb0991SDimitry Andric     .legalFor({S1, S32, S64, S16, GlobalPtr,
6740b57cec5SDimitry Andric                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
675e8d8bef9SDimitry Andric     .legalIf(isPointer(0))
6760b57cec5SDimitry Andric     .clampScalar(0, S32, S64)
677e8d8bef9SDimitry Andric     .widenScalarToNextPow2(0);
6780b57cec5SDimitry Andric 
6795ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_FCONSTANT)
6805ffd83dbSDimitry Andric     .legalFor({S32, S64, S16})
6815ffd83dbSDimitry Andric     .clampScalar(0, S16, S64);
6828bcb0991SDimitry Andric 
6835ffd83dbSDimitry Andric   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
6845ffd83dbSDimitry Andric       .legalIf(isRegisterType(0))
6855ffd83dbSDimitry Andric       // s1 and s16 are special cases because they have legal operations on
6865ffd83dbSDimitry Andric       // them, but don't really occupy registers in the normal way.
6875ffd83dbSDimitry Andric       .legalFor({S1, S16})
6885ffd83dbSDimitry Andric       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
6895ffd83dbSDimitry Andric       .clampScalarOrElt(0, S32, MaxScalar)
6905ffd83dbSDimitry Andric       .widenScalarToNextPow2(0, 32)
6915ffd83dbSDimitry Andric       .clampMaxNumElements(0, S32, 16);
6925ffd83dbSDimitry Andric 
693fe6060f1SDimitry Andric   getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
6945ffd83dbSDimitry Andric 
6955ffd83dbSDimitry Andric   // If the amount is divergent, we have to do a wave reduction to get the
6965ffd83dbSDimitry Andric   // maximum value, so this is expanded during RegBankSelect.
6975ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
6985ffd83dbSDimitry Andric     .legalFor({{PrivatePtr, S32}});
6995ffd83dbSDimitry Andric 
7005ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
701e8d8bef9SDimitry Andric     .customIf(typeIsNot(0, PrivatePtr));
702e8d8bef9SDimitry Andric 
703fe6060f1SDimitry Andric   getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
7040b57cec5SDimitry Andric 
7050b57cec5SDimitry Andric   auto &FPOpActions = getActionDefinitionsBuilder(
7068bcb0991SDimitry Andric     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
7070b57cec5SDimitry Andric     .legalFor({S32, S64});
7088bcb0991SDimitry Andric   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
7098bcb0991SDimitry Andric     .customFor({S32, S64});
7108bcb0991SDimitry Andric   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
7118bcb0991SDimitry Andric     .customFor({S32, S64});
7120b57cec5SDimitry Andric 
7130b57cec5SDimitry Andric   if (ST.has16BitInsts()) {
7140b57cec5SDimitry Andric     if (ST.hasVOP3PInsts())
7150b57cec5SDimitry Andric       FPOpActions.legalFor({S16, V2S16});
7160b57cec5SDimitry Andric     else
7170b57cec5SDimitry Andric       FPOpActions.legalFor({S16});
7188bcb0991SDimitry Andric 
7198bcb0991SDimitry Andric     TrigActions.customFor({S16});
7208bcb0991SDimitry Andric     FDIVActions.customFor({S16});
7210b57cec5SDimitry Andric   }
7220b57cec5SDimitry Andric 
7230b57cec5SDimitry Andric   auto &MinNumMaxNum = getActionDefinitionsBuilder({
7240b57cec5SDimitry Andric       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
7250b57cec5SDimitry Andric 
7260b57cec5SDimitry Andric   if (ST.hasVOP3PInsts()) {
7270b57cec5SDimitry Andric     MinNumMaxNum.customFor(FPTypesPK16)
728480093f4SDimitry Andric       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
7290b57cec5SDimitry Andric       .clampMaxNumElements(0, S16, 2)
7300b57cec5SDimitry Andric       .clampScalar(0, S16, S64)
7310b57cec5SDimitry Andric       .scalarize(0);
7320b57cec5SDimitry Andric   } else if (ST.has16BitInsts()) {
7330b57cec5SDimitry Andric     MinNumMaxNum.customFor(FPTypes16)
7340b57cec5SDimitry Andric       .clampScalar(0, S16, S64)
7350b57cec5SDimitry Andric       .scalarize(0);
7360b57cec5SDimitry Andric   } else {
7370b57cec5SDimitry Andric     MinNumMaxNum.customFor(FPTypesBase)
7380b57cec5SDimitry Andric       .clampScalar(0, S32, S64)
7390b57cec5SDimitry Andric       .scalarize(0);
7400b57cec5SDimitry Andric   }
7410b57cec5SDimitry Andric 
7420b57cec5SDimitry Andric   if (ST.hasVOP3PInsts())
7430eae32dcSDimitry Andric     FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
7448bcb0991SDimitry Andric 
7450b57cec5SDimitry Andric   FPOpActions
7460b57cec5SDimitry Andric     .scalarize(0)
7470b57cec5SDimitry Andric     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
7480b57cec5SDimitry Andric 
7498bcb0991SDimitry Andric   TrigActions
7508bcb0991SDimitry Andric     .scalarize(0)
7518bcb0991SDimitry Andric     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
7528bcb0991SDimitry Andric 
7538bcb0991SDimitry Andric   FDIVActions
7548bcb0991SDimitry Andric     .scalarize(0)
7558bcb0991SDimitry Andric     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
7568bcb0991SDimitry Andric 
7578bcb0991SDimitry Andric   getActionDefinitionsBuilder({G_FNEG, G_FABS})
7588bcb0991SDimitry Andric     .legalFor(FPTypesPK16)
7590eae32dcSDimitry Andric     .clampMaxNumElementsStrict(0, S16, 2)
7608bcb0991SDimitry Andric     .scalarize(0)
7618bcb0991SDimitry Andric     .clampScalar(0, S16, S64);
7628bcb0991SDimitry Andric 
7630b57cec5SDimitry Andric   if (ST.has16BitInsts()) {
7648bcb0991SDimitry Andric     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
7650b57cec5SDimitry Andric       .legalFor({S32, S64, S16})
7660b57cec5SDimitry Andric       .scalarize(0)
7670b57cec5SDimitry Andric       .clampScalar(0, S16, S64);
7680b57cec5SDimitry Andric   } else {
7695ffd83dbSDimitry Andric     getActionDefinitionsBuilder(G_FSQRT)
7705ffd83dbSDimitry Andric       .legalFor({S32, S64})
7715ffd83dbSDimitry Andric       .scalarize(0)
7725ffd83dbSDimitry Andric       .clampScalar(0, S32, S64);
7735ffd83dbSDimitry Andric 
7745ffd83dbSDimitry Andric     if (ST.hasFractBug()) {
7755ffd83dbSDimitry Andric       getActionDefinitionsBuilder(G_FFLOOR)
7765ffd83dbSDimitry Andric         .customFor({S64})
7775ffd83dbSDimitry Andric         .legalFor({S32, S64})
7785ffd83dbSDimitry Andric         .scalarize(0)
7795ffd83dbSDimitry Andric         .clampScalar(0, S32, S64);
7805ffd83dbSDimitry Andric     } else {
7815ffd83dbSDimitry Andric       getActionDefinitionsBuilder(G_FFLOOR)
7820b57cec5SDimitry Andric         .legalFor({S32, S64})
7830b57cec5SDimitry Andric         .scalarize(0)
7840b57cec5SDimitry Andric         .clampScalar(0, S32, S64);
7850b57cec5SDimitry Andric     }
7865ffd83dbSDimitry Andric   }
7870b57cec5SDimitry Andric 
7880b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_FPTRUNC)
7890b57cec5SDimitry Andric     .legalFor({{S32, S64}, {S16, S32}})
7905ffd83dbSDimitry Andric     .scalarize(0)
7915ffd83dbSDimitry Andric     .lower();
7920b57cec5SDimitry Andric 
7930b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_FPEXT)
7940b57cec5SDimitry Andric     .legalFor({{S64, S32}, {S32, S16}})
795e8d8bef9SDimitry Andric     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
7960b57cec5SDimitry Andric     .scalarize(0);
7970b57cec5SDimitry Andric 
79881ad6265SDimitry Andric   auto &FSubActions = getActionDefinitionsBuilder(G_FSUB);
79981ad6265SDimitry Andric   if (ST.has16BitInsts()) {
80081ad6265SDimitry Andric     FSubActions
80181ad6265SDimitry Andric       // Use actual fsub instruction
80281ad6265SDimitry Andric       .legalFor({S32, S16})
80381ad6265SDimitry Andric       // Must use fadd + fneg
80481ad6265SDimitry Andric       .lowerFor({S64, V2S16});
80581ad6265SDimitry Andric   } else {
80681ad6265SDimitry Andric     FSubActions
8070b57cec5SDimitry Andric       // Use actual fsub instruction
8080b57cec5SDimitry Andric       .legalFor({S32})
8090b57cec5SDimitry Andric       // Must use fadd + fneg
81081ad6265SDimitry Andric       .lowerFor({S64, S16, V2S16});
81181ad6265SDimitry Andric   }
81281ad6265SDimitry Andric 
81381ad6265SDimitry Andric   FSubActions
8140b57cec5SDimitry Andric     .scalarize(0)
8150b57cec5SDimitry Andric     .clampScalar(0, S32, S64);
8160b57cec5SDimitry Andric 
8178bcb0991SDimitry Andric   // Whether this is legal depends on the floating point mode for the function.
8188bcb0991SDimitry Andric   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
8195ffd83dbSDimitry Andric   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
8208bcb0991SDimitry Andric     FMad.customFor({S32, S16});
8215ffd83dbSDimitry Andric   else if (ST.hasMadMacF32Insts())
8228bcb0991SDimitry Andric     FMad.customFor({S32});
8235ffd83dbSDimitry Andric   else if (ST.hasMadF16())
8245ffd83dbSDimitry Andric     FMad.customFor({S16});
8258bcb0991SDimitry Andric   FMad.scalarize(0)
8268bcb0991SDimitry Andric       .lower();
8278bcb0991SDimitry Andric 
828e8d8bef9SDimitry Andric   auto &FRem = getActionDefinitionsBuilder(G_FREM);
829e8d8bef9SDimitry Andric   if (ST.has16BitInsts()) {
830e8d8bef9SDimitry Andric     FRem.customFor({S16, S32, S64});
831e8d8bef9SDimitry Andric   } else {
832e8d8bef9SDimitry Andric     FRem.minScalar(0, S32)
833e8d8bef9SDimitry Andric         .customFor({S32, S64});
834e8d8bef9SDimitry Andric   }
835e8d8bef9SDimitry Andric   FRem.scalarize(0);
836e8d8bef9SDimitry Andric 
8375ffd83dbSDimitry Andric   // TODO: Do we need to clamp maximum bitwidth?
8385ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_TRUNC)
8395ffd83dbSDimitry Andric     .legalIf(isScalar(0))
8405ffd83dbSDimitry Andric     .legalFor({{V2S16, V2S32}})
8415ffd83dbSDimitry Andric     .clampMaxNumElements(0, S16, 2)
8425ffd83dbSDimitry Andric     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
8435ffd83dbSDimitry Andric     // situations (like an invalid implicit use), we don't want to infinite loop
8445ffd83dbSDimitry Andric     // in the legalizer.
8455ffd83dbSDimitry Andric     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
8465ffd83dbSDimitry Andric     .alwaysLegal();
8475ffd83dbSDimitry Andric 
8480b57cec5SDimitry Andric   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
8490b57cec5SDimitry Andric     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
8505ffd83dbSDimitry Andric                {S32, S1}, {S64, S1}, {S16, S1}})
851480093f4SDimitry Andric     .scalarize(0)
8525ffd83dbSDimitry Andric     .clampScalar(0, S32, S64)
8535ffd83dbSDimitry Andric     .widenScalarToNextPow2(1, 32);
8540b57cec5SDimitry Andric 
8558bcb0991SDimitry Andric   // TODO: Split s1->s64 during regbankselect for VALU.
8568bcb0991SDimitry Andric   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
857480093f4SDimitry Andric                     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
858480093f4SDimitry Andric                     .lowerIf(typeIs(1, S1))
859349cc55cSDimitry Andric                     .customFor({{S32, S64}, {S64, S64}});
8608bcb0991SDimitry Andric   if (ST.has16BitInsts())
8618bcb0991SDimitry Andric     IToFP.legalFor({{S16, S16}});
8628bcb0991SDimitry Andric   IToFP.clampScalar(1, S32, S64)
863e8d8bef9SDimitry Andric        .minScalar(0, S32)
8645ffd83dbSDimitry Andric        .scalarize(0)
8655ffd83dbSDimitry Andric        .widenScalarToNextPow2(1);
8660b57cec5SDimitry Andric 
8678bcb0991SDimitry Andric   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
8685ffd83dbSDimitry Andric     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
869fe6060f1SDimitry Andric     .customFor({{S64, S32}, {S64, S64}})
870e8d8bef9SDimitry Andric     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
8718bcb0991SDimitry Andric   if (ST.has16BitInsts())
8728bcb0991SDimitry Andric     FPToI.legalFor({{S16, S16}});
8738bcb0991SDimitry Andric   else
8748bcb0991SDimitry Andric     FPToI.minScalar(1, S32);
8758bcb0991SDimitry Andric 
8768bcb0991SDimitry Andric   FPToI.minScalar(0, S32)
877fe6060f1SDimitry Andric        .widenScalarToNextPow2(0, 32)
8785ffd83dbSDimitry Andric        .scalarize(0)
8795ffd83dbSDimitry Andric        .lower();
8800b57cec5SDimitry Andric 
88181ad6265SDimitry Andric   getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
88281ad6265SDimitry Andric       .customFor({S16, S32})
88381ad6265SDimitry Andric       .scalarize(0)
88481ad6265SDimitry Andric       .lower();
88581ad6265SDimitry Andric 
886e8d8bef9SDimitry Andric   // Lower roundeven into G_FRINT
887e8d8bef9SDimitry Andric   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
888480093f4SDimitry Andric     .scalarize(0)
889480093f4SDimitry Andric     .lower();
8900b57cec5SDimitry Andric 
891480093f4SDimitry Andric   if (ST.has16BitInsts()) {
892480093f4SDimitry Andric     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
893480093f4SDimitry Andric       .legalFor({S16, S32, S64})
894480093f4SDimitry Andric       .clampScalar(0, S16, S64)
895480093f4SDimitry Andric       .scalarize(0);
896480093f4SDimitry Andric   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
8970b57cec5SDimitry Andric     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
8980b57cec5SDimitry Andric       .legalFor({S32, S64})
8990b57cec5SDimitry Andric       .clampScalar(0, S32, S64)
9000b57cec5SDimitry Andric       .scalarize(0);
9010b57cec5SDimitry Andric   } else {
9020b57cec5SDimitry Andric     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
9030b57cec5SDimitry Andric       .legalFor({S32})
9040b57cec5SDimitry Andric       .customFor({S64})
9050b57cec5SDimitry Andric       .clampScalar(0, S32, S64)
9060b57cec5SDimitry Andric       .scalarize(0);
9070b57cec5SDimitry Andric   }
9080b57cec5SDimitry Andric 
909480093f4SDimitry Andric   getActionDefinitionsBuilder(G_PTR_ADD)
910e8d8bef9SDimitry Andric     .legalIf(all(isPointer(0), sameSize(0, 1)))
911e8d8bef9SDimitry Andric     .scalarize(0)
912e8d8bef9SDimitry Andric     .scalarSameSizeAs(1, 0);
9130b57cec5SDimitry Andric 
9145ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_PTRMASK)
915e8d8bef9SDimitry Andric     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
916e8d8bef9SDimitry Andric     .scalarSameSizeAs(1, 0)
9175ffd83dbSDimitry Andric     .scalarize(0);
9180b57cec5SDimitry Andric 
9190b57cec5SDimitry Andric   auto &CmpBuilder =
9200b57cec5SDimitry Andric     getActionDefinitionsBuilder(G_ICMP)
921480093f4SDimitry Andric     // The compare output type differs based on the register bank of the output,
922480093f4SDimitry Andric     // so make both s1 and s32 legal.
923480093f4SDimitry Andric     //
924480093f4SDimitry Andric     // Scalar compares producing output in scc will be promoted to s32, as that
925480093f4SDimitry Andric     // is the allocatable register type that will be needed for the copy from
926480093f4SDimitry Andric     // scc. This will be promoted during RegBankSelect, and we assume something
927480093f4SDimitry Andric     // before that won't try to use s32 result types.
928480093f4SDimitry Andric     //
929480093f4SDimitry Andric     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
930480093f4SDimitry Andric     // bank.
9310b57cec5SDimitry Andric     .legalForCartesianProduct(
9320b57cec5SDimitry Andric       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
933480093f4SDimitry Andric     .legalForCartesianProduct(
934480093f4SDimitry Andric       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
9350b57cec5SDimitry Andric   if (ST.has16BitInsts()) {
9360b57cec5SDimitry Andric     CmpBuilder.legalFor({{S1, S16}});
9370b57cec5SDimitry Andric   }
9380b57cec5SDimitry Andric 
9390b57cec5SDimitry Andric   CmpBuilder
9400b57cec5SDimitry Andric     .widenScalarToNextPow2(1)
9410b57cec5SDimitry Andric     .clampScalar(1, S32, S64)
9420b57cec5SDimitry Andric     .scalarize(0)
943480093f4SDimitry Andric     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
9440b57cec5SDimitry Andric 
9450b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_FCMP)
9460b57cec5SDimitry Andric     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
9470b57cec5SDimitry Andric     .widenScalarToNextPow2(1)
9480b57cec5SDimitry Andric     .clampScalar(1, S32, S64)
9490b57cec5SDimitry Andric     .scalarize(0);
9500b57cec5SDimitry Andric 
9515ffd83dbSDimitry Andric   // FIXME: fpow has a selection pattern that should move to custom lowering.
9525ffd83dbSDimitry Andric   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
9535ffd83dbSDimitry Andric   if (ST.has16BitInsts())
9545ffd83dbSDimitry Andric     Exp2Ops.legalFor({S32, S16});
9555ffd83dbSDimitry Andric   else
9565ffd83dbSDimitry Andric     Exp2Ops.legalFor({S32});
9575ffd83dbSDimitry Andric   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
9585ffd83dbSDimitry Andric   Exp2Ops.scalarize(0);
9595ffd83dbSDimitry Andric 
9605ffd83dbSDimitry Andric   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
9615ffd83dbSDimitry Andric   if (ST.has16BitInsts())
9625ffd83dbSDimitry Andric     ExpOps.customFor({{S32}, {S16}});
9635ffd83dbSDimitry Andric   else
9645ffd83dbSDimitry Andric     ExpOps.customFor({S32});
9655ffd83dbSDimitry Andric   ExpOps.clampScalar(0, MinScalarFPTy, S32)
9660b57cec5SDimitry Andric         .scalarize(0);
9670b57cec5SDimitry Andric 
968e8d8bef9SDimitry Andric   getActionDefinitionsBuilder(G_FPOWI)
969e8d8bef9SDimitry Andric     .clampScalar(0, MinScalarFPTy, S32)
970e8d8bef9SDimitry Andric     .lower();
971e8d8bef9SDimitry Andric 
9720b57cec5SDimitry Andric   // The 64-bit versions produce 32-bit results, but only on the SALU.
9735ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_CTPOP)
9740b57cec5SDimitry Andric     .legalFor({{S32, S32}, {S32, S64}})
9750b57cec5SDimitry Andric     .clampScalar(0, S32, S32)
97604eeddc0SDimitry Andric     .widenScalarToNextPow2(1, 32)
9770b57cec5SDimitry Andric     .clampScalar(1, S32, S64)
9780b57cec5SDimitry Andric     .scalarize(0)
97904eeddc0SDimitry Andric     .widenScalarToNextPow2(0, 32);
98004eeddc0SDimitry Andric 
9810b57cec5SDimitry Andric 
9825ffd83dbSDimitry Andric   // The hardware instructions return a different result on 0 than the generic
9835ffd83dbSDimitry Andric   // instructions expect. The hardware produces -1, but these produce the
9845ffd83dbSDimitry Andric   // bitwidth.
9855ffd83dbSDimitry Andric   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
9865ffd83dbSDimitry Andric     .scalarize(0)
9875ffd83dbSDimitry Andric     .clampScalar(0, S32, S32)
9885ffd83dbSDimitry Andric     .clampScalar(1, S32, S64)
9895ffd83dbSDimitry Andric     .widenScalarToNextPow2(0, 32)
9905ffd83dbSDimitry Andric     .widenScalarToNextPow2(1, 32)
991349cc55cSDimitry Andric     .custom();
9925ffd83dbSDimitry Andric 
9935ffd83dbSDimitry Andric   // The 64-bit versions produce 32-bit results, but only on the SALU.
9945ffd83dbSDimitry Andric   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
9955ffd83dbSDimitry Andric     .legalFor({{S32, S32}, {S32, S64}})
9965ffd83dbSDimitry Andric     .clampScalar(0, S32, S32)
9975ffd83dbSDimitry Andric     .clampScalar(1, S32, S64)
9985ffd83dbSDimitry Andric     .scalarize(0)
9995ffd83dbSDimitry Andric     .widenScalarToNextPow2(0, 32)
10005ffd83dbSDimitry Andric     .widenScalarToNextPow2(1, 32);
10015ffd83dbSDimitry Andric 
1002fe6060f1SDimitry Andric   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1003fe6060f1SDimitry Andric   // RegBankSelect.
10045ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_BITREVERSE)
1005fe6060f1SDimitry Andric     .legalFor({S32, S64})
1006fe6060f1SDimitry Andric     .clampScalar(0, S32, S64)
1007fe6060f1SDimitry Andric     .scalarize(0)
1008fe6060f1SDimitry Andric     .widenScalarToNextPow2(0);
10090b57cec5SDimitry Andric 
10100b57cec5SDimitry Andric   if (ST.has16BitInsts()) {
10115ffd83dbSDimitry Andric     getActionDefinitionsBuilder(G_BSWAP)
10125ffd83dbSDimitry Andric       .legalFor({S16, S32, V2S16})
10130eae32dcSDimitry Andric       .clampMaxNumElementsStrict(0, S16, 2)
10145ffd83dbSDimitry Andric       // FIXME: Fixing non-power-of-2 before clamp is workaround for
10155ffd83dbSDimitry Andric       // narrowScalar limitation.
10165ffd83dbSDimitry Andric       .widenScalarToNextPow2(0)
10175ffd83dbSDimitry Andric       .clampScalar(0, S16, S32)
10185ffd83dbSDimitry Andric       .scalarize(0);
10195ffd83dbSDimitry Andric 
10200b57cec5SDimitry Andric     if (ST.hasVOP3PInsts()) {
1021fe6060f1SDimitry Andric       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
10220b57cec5SDimitry Andric         .legalFor({S32, S16, V2S16})
10230b57cec5SDimitry Andric         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
10240b57cec5SDimitry Andric         .clampMaxNumElements(0, S16, 2)
10255ffd83dbSDimitry Andric         .minScalar(0, S16)
10260b57cec5SDimitry Andric         .widenScalarToNextPow2(0)
10275ffd83dbSDimitry Andric         .scalarize(0)
10285ffd83dbSDimitry Andric         .lower();
10290b57cec5SDimitry Andric     } else {
1030fe6060f1SDimitry Andric       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
10310b57cec5SDimitry Andric         .legalFor({S32, S16})
10320b57cec5SDimitry Andric         .widenScalarToNextPow2(0)
10335ffd83dbSDimitry Andric         .minScalar(0, S16)
10345ffd83dbSDimitry Andric         .scalarize(0)
10355ffd83dbSDimitry Andric         .lower();
10360b57cec5SDimitry Andric     }
10370b57cec5SDimitry Andric   } else {
10385ffd83dbSDimitry Andric     // TODO: Should have same legality without v_perm_b32
10395ffd83dbSDimitry Andric     getActionDefinitionsBuilder(G_BSWAP)
10405ffd83dbSDimitry Andric       .legalFor({S32})
10415ffd83dbSDimitry Andric       .lowerIf(scalarNarrowerThan(0, 32))
10425ffd83dbSDimitry Andric       // FIXME: Fixing non-power-of-2 before clamp is workaround for
10435ffd83dbSDimitry Andric       // narrowScalar limitation.
10445ffd83dbSDimitry Andric       .widenScalarToNextPow2(0)
10455ffd83dbSDimitry Andric       .maxScalar(0, S32)
10465ffd83dbSDimitry Andric       .scalarize(0)
10475ffd83dbSDimitry Andric       .lower();
10485ffd83dbSDimitry Andric 
1049fe6060f1SDimitry Andric     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
10500b57cec5SDimitry Andric       .legalFor({S32})
10515ffd83dbSDimitry Andric       .minScalar(0, S32)
10520b57cec5SDimitry Andric       .widenScalarToNextPow2(0)
10535ffd83dbSDimitry Andric       .scalarize(0)
10545ffd83dbSDimitry Andric       .lower();
10550b57cec5SDimitry Andric   }
10560b57cec5SDimitry Andric 
  // G_INTTOPTR: resize the integer source (type index 1) until it matches the
  // pointer result's bit width, then the pair is legal.
  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  // G_PTRTOINT: mirror image of G_INTTOPTR; here the integer result (type
  // index 0) is resized to match the pointer operand (type index 1).
  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      largerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  // Address space casts are always handled by target custom code after
  // scalarization.
  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();
10930b57cec5SDimitry Andric 
10945ffd83dbSDimitry Andric   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
10955ffd83dbSDimitry Andric                                     bool IsLoad) -> bool {
10968bcb0991SDimitry Andric     const LLT DstTy = Query.Types[0];
10978bcb0991SDimitry Andric 
10988bcb0991SDimitry Andric     // Split vector extloads.
1099fe6060f1SDimitry Andric     unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1100480093f4SDimitry Andric 
11018bcb0991SDimitry Andric     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
11028bcb0991SDimitry Andric       return true;
11038bcb0991SDimitry Andric 
11048bcb0991SDimitry Andric     const LLT PtrTy = Query.Types[1];
11058bcb0991SDimitry Andric     unsigned AS = PtrTy.getAddressSpace();
11065ffd83dbSDimitry Andric     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
11078bcb0991SDimitry Andric       return true;
11088bcb0991SDimitry Andric 
11098bcb0991SDimitry Andric     // Catch weird sized loads that don't evenly divide into the access sizes
11108bcb0991SDimitry Andric     // TODO: May be able to widen depending on alignment etc.
11115ffd83dbSDimitry Andric     unsigned NumRegs = (MemSize + 31) / 32;
11125ffd83dbSDimitry Andric     if (NumRegs == 3) {
11135ffd83dbSDimitry Andric       if (!ST.hasDwordx3LoadStores())
11148bcb0991SDimitry Andric         return true;
11155ffd83dbSDimitry Andric     } else {
11165ffd83dbSDimitry Andric       // If the alignment allows, these should have been widened.
11175ffd83dbSDimitry Andric       if (!isPowerOf2_32(NumRegs))
11185ffd83dbSDimitry Andric         return true;
11195ffd83dbSDimitry Andric     }
11208bcb0991SDimitry Andric 
11218bcb0991SDimitry Andric     return false;
11228bcb0991SDimitry Andric   };
11238bcb0991SDimitry Andric 
  // Minimum required alignments (in bits) for global/constant-space accesses;
  // a value of 0 removes the alignment constraint when the subtarget enables
  // unaligned buffer accesses.
  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Explicitly list some common cases.
    // TODO: Does this help compile time at all?
    // Entry format: {result type, pointer type, memory type, align in bits}.
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
                                      {V2S32, GlobalPtr, V2S32, GlobalAlign32},
                                      {V4S32, GlobalPtr, V4S32, GlobalAlign32},
                                      {S64, GlobalPtr, S64, GlobalAlign32},
                                      {V2S64, GlobalPtr, V2S64, GlobalAlign32},
                                      {V2S16, GlobalPtr, V2S16, GlobalAlign32},
                                      {S32, GlobalPtr, S8, GlobalAlign8},
                                      {S32, GlobalPtr, S16, GlobalAlign16},

                                      {S32, LocalPtr, S32, 32},
                                      {S64, LocalPtr, S64, 32},
                                      {V2S32, LocalPtr, V2S32, 32},
                                      {S32, LocalPtr, S8, 8},
                                      {S32, LocalPtr, S16, 16},
                                      {V2S16, LocalPtr, S32, 32},

                                      {S32, PrivatePtr, S32, 32},
                                      {S32, PrivatePtr, S8, 8},
                                      {S32, PrivatePtr, S16, 16},
                                      {V2S16, PrivatePtr, S32, 32},

                                      {S32, ConstantPtr, S32, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32},
                                      {V4S32, ConstantPtr, V4S32, GlobalAlign32},
                                      {S64, ConstantPtr, S64, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
    // Beyond the explicit table, accept anything the subtarget query says is
    // directly usable.
    Actions.legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query);
      });

    // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64-bits.
    //
    // TODO: Should generalize bitcast action into coerce, which will also cover
    // inserting addrspacecasts.
    Actions.customIf(typeIs(1, Constant32Ptr));

    // Turn any illegal element vectors into something easier to deal
    // with. These will ultimately produce 32-bit scalar shifts to extract the
    // parts anyway.
    //
    // For odd 16-bit element vectors, prefer to split those into pieces with
    // 16-bit vector parts.
    Actions.bitcastIf(
      [=](const LegalityQuery &Query) -> bool {
        return shouldBitcastLoadStoreType(ST, Query.Types[0],
                                          Query.MMODescrs[0].MemoryTy);
      }, bitcastToRegisterType(0));

    if (!IsStore) {
      // Widen suitably aligned loads by loading extra bytes. The standard
      // legalization actions can't properly express widening memory operands.
      Actions.customIf([=](const LegalityQuery &Query) -> bool {
        return shouldWidenLoad(ST, Query, G_LOAD);
      });
    }

    // FIXME: load/store narrowing should be moved to lower action
    Actions
        // Scalar case: narrow accesses that needToSplitMemOp flags.
        .narrowScalarIf(
            [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              const unsigned DstSize = DstTy.getSizeInBits();
              unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

              // Split extloads.
              if (DstSize > MemSize)
                return std::make_pair(0, LLT::scalar(MemSize));

              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);
              if (MemSize > MaxSize)
                return std::make_pair(0, LLT::scalar(MaxSize));

              // Last resort: narrow to the access's alignment in bits.
              uint64_t Align = Query.MMODescrs[0].AlignInBits;
              return std::make_pair(0, LLT::scalar(Align));
            })
        // Vector case: break the vector into pieces the target can access.
        .fewerElementsIf(
            [=](const LegalityQuery &Query) -> bool {
              return Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              LLT EltTy = DstTy.getElementType();
              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);

              // FIXME: Handle widened to power of 2 results better. This ends
              // up scalarizing.
              // FIXME: 3 element stores scalarized on SI

              // Split if it's too large for the address space.
              unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
              if (MemSize > MaxSize) {
                unsigned NumElts = DstTy.getNumElements();
                unsigned EltSize = EltTy.getSizeInBits();

                if (MaxSize % EltSize == 0) {
                  return std::make_pair(
                      0, LLT::scalarOrVector(
                             ElementCount::getFixed(MaxSize / EltSize), EltTy));
                }

                unsigned NumPieces = MemSize / MaxSize;

                // FIXME: Refine when odd breakdowns handled
                // The scalars will need to be re-legalized.
                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::make_pair(0, EltTy);

                return std::make_pair(
                    0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
              }

              // FIXME: We could probably handle weird extending loads better.
              if (DstTy.getSizeInBits() > MemSize)
                return std::make_pair(0, EltTy);

              unsigned EltSize = EltTy.getSizeInBits();
              unsigned DstSize = DstTy.getSizeInBits();
              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to split
                // to the widest type. TODO: Account for alignment. As-is it
                // should be OK, since the new parts will be further legalized.
                unsigned FloorSize = PowerOf2Floor(DstSize);
                return std::make_pair(
                    0, LLT::scalarOrVector(
                           ElementCount::getFixed(FloorSize / EltSize), EltTy));
              }

              // May need relegalization for the scalars.
              return std::make_pair(0, EltTy);
            })
    // Final cleanup: keep scalars at least 32 bits, truncate wide
    // extload/truncstore scalars to s32, and pad tiny vectors.
    .minScalar(0, S32)
    .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
    .widenScalarToNextPow2(0)
    .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
    .lower();
  }
12900b57cec5SDimitry Andric 
  // Sign/zero-extending loads: only 8- and 16-bit memory types extended into
  // an s32 result are listed; all other forms are clamped to s32 and then
  // expanded via lower().
  // FIXME: Unaligned accesses not lowered.
  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
                                                  {S32, GlobalPtr, S16, 2 * 8},
                                                  {S32, LocalPtr, S8, 8},
                                                  {S32, LocalPtr, S16, 16},
                                                  {S32, PrivatePtr, S8, 8},
                                                  {S32, PrivatePtr, S16, 16},
                                                  {S32, ConstantPtr, S8, 8},
                                                  {S32, ConstantPtr, S16, 2 * 8}})
                       .legalIf(
                         [=](const LegalityQuery &Query) -> bool {
                           return isLoadStoreLegal(ST, Query);
                         });

  // Flat-pointer variants are only available when the subtarget has a flat
  // address space.
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
  }

  // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
  // 64-bits.
  //
  // TODO: Should generalize bitcast action into coerce, which will also cover
  // inserting addrspacecasts.
  ExtLoads.customIf(typeIs(1, Constant32Ptr));

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .lower();
13210b57cec5SDimitry Andric 
  // Integer read-modify-write atomics: natively supported for s32/s64 in the
  // global, local, and region address spaces.
  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  // Floating-point atomic add: legality accumulates per subtarget feature
  // generation (LDS fadd, GFX90A s64, GFX940 v2s16, buffer fadd).
  auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
  if (ST.hasLDSFPAtomicAdd()) {
    Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
    if (ST.hasGFX90AInsts())
      Atomic.legalFor({{S64, LocalPtr}});
    if (ST.hasGFX940Insts())
      Atomic.legalFor({{V2S16, LocalPtr}});
  }
  if (ST.hasAtomicFaddInsts())
    Atomic.legalFor({{S32, GlobalPtr}});

  if (ST.hasGFX90AInsts()) {
    // These are legal with some caveats, and should have undergone expansion in
    // the IR in most situations
    // TODO: Move atomic expansion into legalizer
    // TODO: Also supports <2 x f16>
    Atomic.legalFor({
        {S32, GlobalPtr},
        {S64, GlobalPtr},
        {S64, FlatPtr}
      });
  }

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
  // demarshalling
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
  getActionDefinitionsBuilder(G_SELECT)
      .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
                                 LocalPtr, FlatPtr, PrivatePtr,
                                 LLT::fixed_vector(2, LocalPtr),
                                 LLT::fixed_vector(2, PrivatePtr)},
                                {S1, S32})
      .clampScalar(0, S16, S64)
      // Scalarize the condition before massaging the result type.
      .scalarize(1)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .fewerElementsIf(numElementsNotEven(0), scalarize(0))
      .clampMaxNumElements(0, S32, 2)
      .clampMaxNumElements(0, LocalPtr, 2)
      .clampMaxNumElements(0, PrivatePtr, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0)
      // Any pointer-valued select with an s1 or s32 condition is legal.
      .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
13830b57cec5SDimitry Andric 
  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      // Packed-math subtargets also shift <2 x s16> directly; wider 16-bit
      // vectors are clamped down to two elements.
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 16);
    Shifts.clampScalar(0, S16, S64);

    // No saturating-shift rule is marked legal; widen to at least s16,
    // scalarize, and expand generically.
    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 32);
    Shifts.clampScalar(0, S32, S64);

    // Without 16-bit instructions, saturating shifts widen to s32 before the
    // generic expansion.
    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }
  Shifts.scalarize(0);
14280b57cec5SDimitry Andric 
  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    // For extracts the vector is operand 1 and the element is the result; for
    // inserts the roles are swapped. The index is always operand 2.
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          // Custom-handle 32/64-bit-element vectors whose total size is a
          // multiple of 32 bits, fits in MaxRegisterSize, and is indexed by
          // an s32.
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          const unsigned EltSize = EltTy.getSizeInBits();
          return (EltSize == 32 || EltSize == 64) &&
                  VecTy.getSizeInBits() % 32 == 0 &&
                  VecTy.getSizeInBits() <= MaxRegisterSize &&
                  IdxTy.getSizeInBits() == 32;
        })
      // Sub-32-bit elements: bitcast the vector to one with 32-bit elements.
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
                 bitcastToVectorElement32(VecTypeIdx))
      //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
      .bitcastIf(
        all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
        [=](const LegalityQuery &Query) {
          // For > 64-bit element types, try to turn this into a 64-bit
          // element vector since we may be able to do better indexing
          // if this is scalar. If not, fall back to 32.
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned DstEltSize = EltTy.getSizeInBits();
          const unsigned VecSize = VecTy.getSizeInBits();

          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
          return std::make_pair(
              VecTypeIdx,
              LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32)
      .clampMaxNumElements(VecTypeIdx, S32, 32)
      // TODO: Clamp elements for 64-bit vectors?
      // It should only be necessary with variable indexes.
      // As a last resort, lower to the stack
      .lower();
  }
14730b57cec5SDimitry Andric 
  // A G_EXTRACT_VECTOR_ELT whose result type differs from the vector's
  // element type is malformed; mark it unsupported outright.
  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });
14790b57cec5SDimitry Andric 
  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    // For G_EXTRACT the "big" (whole) type is operand 1; for G_INSERT it is
    // the result.
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      .lowerIf([=](const LegalityQuery &Query) {
          // Sub-vector(or single element) insert and extract.
          // TODO: verify immediate offset here since lower only works with
          // whole elements.
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.isVector();
        })
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      // Widen either side to at least 16-bit (power-of-2) scalars/elements.
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }
15170b57cec5SDimitry Andric 
  // G_BUILD_VECTOR: 32- and 64-bit element vectors built from matching
  // scalars are legal up to the clamped element counts.
  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    // Without scalar pack instructions, {V2S16, S16} build_vectors need
    // custom handling.
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  // Anything that forms a valid register type is also legal.
  BuildVector.legalIf(isRegisterType(0));
15465ffd83dbSDimitry Andric 
15475ffd83dbSDimitry Andric   // FIXME: Clamp maximum size
15480b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1549e8d8bef9SDimitry Andric     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1550e8d8bef9SDimitry Andric     .clampMaxNumElements(0, S32, 32)
1551e8d8bef9SDimitry Andric     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1552e8d8bef9SDimitry Andric     .clampMaxNumElements(0, S16, 64);
15530b57cec5SDimitry Andric 
155481ad6265SDimitry Andric   // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
15555ffd83dbSDimitry Andric   // pre-legalize.
15565ffd83dbSDimitry Andric   if (ST.hasVOP3PInsts()) {
15575ffd83dbSDimitry Andric     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
15585ffd83dbSDimitry Andric       .customFor({V2S16, V2S16})
15595ffd83dbSDimitry Andric       .lower();
15605ffd83dbSDimitry Andric   } else
15618bcb0991SDimitry Andric     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
15628bcb0991SDimitry Andric 
15630b57cec5SDimitry Andric   // Merge/Unmerge
15640b57cec5SDimitry Andric   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
15650b57cec5SDimitry Andric     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
15660b57cec5SDimitry Andric     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
15670b57cec5SDimitry Andric 
15680b57cec5SDimitry Andric     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
15695ffd83dbSDimitry Andric       const LLT Ty = Query.Types[TypeIdx];
15700b57cec5SDimitry Andric       if (Ty.isVector()) {
15710b57cec5SDimitry Andric         const LLT &EltTy = Ty.getElementType();
15725ffd83dbSDimitry Andric         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
15730b57cec5SDimitry Andric           return true;
15740b57cec5SDimitry Andric         if (!isPowerOf2_32(EltTy.getSizeInBits()))
15750b57cec5SDimitry Andric           return true;
15760b57cec5SDimitry Andric       }
15770b57cec5SDimitry Andric       return false;
15780b57cec5SDimitry Andric     };
15790b57cec5SDimitry Andric 
15808bcb0991SDimitry Andric     auto &Builder = getActionDefinitionsBuilder(Op)
1581e8d8bef9SDimitry Andric       .legalIf(all(isRegisterType(0), isRegisterType(1)))
15825ffd83dbSDimitry Andric       .lowerFor({{S16, V2S16}})
15835ffd83dbSDimitry Andric       .lowerIf([=](const LegalityQuery &Query) {
15845ffd83dbSDimitry Andric           const LLT BigTy = Query.Types[BigTyIdx];
15855ffd83dbSDimitry Andric           return BigTy.getSizeInBits() == 32;
15865ffd83dbSDimitry Andric         })
15875ffd83dbSDimitry Andric       // Try to widen to s16 first for small types.
15885ffd83dbSDimitry Andric       // TODO: Only do this on targets with legal s16 shifts
15895ffd83dbSDimitry Andric       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
15900b57cec5SDimitry Andric       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
15918bcb0991SDimitry Andric       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
15928bcb0991SDimitry Andric       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
15938bcb0991SDimitry Andric                            elementTypeIs(1, S16)),
15948bcb0991SDimitry Andric                        changeTo(1, V2S16))
15955ffd83dbSDimitry Andric       // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
15965ffd83dbSDimitry Andric       // worth considering the multiples of 64 since 2*192 and 2*384 are not
15975ffd83dbSDimitry Andric       // valid.
15985ffd83dbSDimitry Andric       .clampScalar(LitTyIdx, S32, S512)
15995ffd83dbSDimitry Andric       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
16000b57cec5SDimitry Andric       // Break up vectors with weird elements into scalars
16010b57cec5SDimitry Andric       .fewerElementsIf(
16025ffd83dbSDimitry Andric         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
16030b57cec5SDimitry Andric         scalarize(0))
16040b57cec5SDimitry Andric       .fewerElementsIf(
16055ffd83dbSDimitry Andric         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
16060b57cec5SDimitry Andric         scalarize(1))
16075ffd83dbSDimitry Andric       .clampScalar(BigTyIdx, S32, MaxScalar);
16088bcb0991SDimitry Andric 
16098bcb0991SDimitry Andric     if (Op == G_MERGE_VALUES) {
16108bcb0991SDimitry Andric       Builder.widenScalarIf(
16118bcb0991SDimitry Andric         // TODO: Use 16-bit shifts if legal for 8-bit values?
16120b57cec5SDimitry Andric         [=](const LegalityQuery &Query) {
16138bcb0991SDimitry Andric           const LLT Ty = Query.Types[LitTyIdx];
16148bcb0991SDimitry Andric           return Ty.getSizeInBits() < 32;
16158bcb0991SDimitry Andric         },
16168bcb0991SDimitry Andric         changeTo(LitTyIdx, S32));
16178bcb0991SDimitry Andric     }
16188bcb0991SDimitry Andric 
16198bcb0991SDimitry Andric     Builder.widenScalarIf(
16208bcb0991SDimitry Andric       [=](const LegalityQuery &Query) {
16218bcb0991SDimitry Andric         const LLT Ty = Query.Types[BigTyIdx];
16220b57cec5SDimitry Andric         return !isPowerOf2_32(Ty.getSizeInBits()) &&
16230b57cec5SDimitry Andric           Ty.getSizeInBits() % 16 != 0;
16240b57cec5SDimitry Andric       },
16250b57cec5SDimitry Andric       [=](const LegalityQuery &Query) {
16260b57cec5SDimitry Andric         // Pick the next power of 2, or a multiple of 64 over 128.
16270b57cec5SDimitry Andric         // Whichever is smaller.
16280b57cec5SDimitry Andric         const LLT &Ty = Query.Types[BigTyIdx];
16290b57cec5SDimitry Andric         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
16300b57cec5SDimitry Andric         if (NewSizeInBits >= 256) {
16310b57cec5SDimitry Andric           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
16320b57cec5SDimitry Andric           if (RoundedTo < NewSizeInBits)
16330b57cec5SDimitry Andric             NewSizeInBits = RoundedTo;
16340b57cec5SDimitry Andric         }
16350b57cec5SDimitry Andric         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
16360b57cec5SDimitry Andric       })
16370b57cec5SDimitry Andric       // Any vectors left are the wrong size. Scalarize them.
16380b57cec5SDimitry Andric       .scalarize(0)
16390b57cec5SDimitry Andric       .scalarize(1);
16400b57cec5SDimitry Andric   }
16410b57cec5SDimitry Andric 
16425ffd83dbSDimitry Andric   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
16435ffd83dbSDimitry Andric   // RegBankSelect.
16445ffd83dbSDimitry Andric   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
16455ffd83dbSDimitry Andric     .legalFor({{S32}, {S64}});
16468bcb0991SDimitry Andric 
16475ffd83dbSDimitry Andric   if (ST.hasVOP3PInsts()) {
16485ffd83dbSDimitry Andric     SextInReg.lowerFor({{V2S16}})
16495ffd83dbSDimitry Andric       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
16505ffd83dbSDimitry Andric       // get more vector shift opportunities, since we'll get those when
16515ffd83dbSDimitry Andric       // expanded.
16520eae32dcSDimitry Andric       .clampMaxNumElementsStrict(0, S16, 2);
16535ffd83dbSDimitry Andric   } else if (ST.has16BitInsts()) {
16545ffd83dbSDimitry Andric     SextInReg.lowerFor({{S32}, {S64}, {S16}});
16555ffd83dbSDimitry Andric   } else {
16565ffd83dbSDimitry Andric     // Prefer to promote to s32 before lowering if we don't have 16-bit
16575ffd83dbSDimitry Andric     // shifts. This avoid a lot of intermediate truncate and extend operations.
16585ffd83dbSDimitry Andric     SextInReg.lowerFor({{S32}, {S64}});
16595ffd83dbSDimitry Andric   }
16605ffd83dbSDimitry Andric 
16615ffd83dbSDimitry Andric   SextInReg
16625ffd83dbSDimitry Andric     .scalarize(0)
16635ffd83dbSDimitry Andric     .clampScalar(0, S32, S64)
16645ffd83dbSDimitry Andric     .lower();
16655ffd83dbSDimitry Andric 
1666349cc55cSDimitry Andric   getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1667349cc55cSDimitry Andric     .scalarize(0)
1668349cc55cSDimitry Andric     .lower();
1669349cc55cSDimitry Andric 
1670fe6060f1SDimitry Andric   // TODO: Only Try to form v2s16 with legal packed instructions.
16715ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_FSHR)
16725ffd83dbSDimitry Andric     .legalFor({{S32, S32}})
1673fe6060f1SDimitry Andric     .lowerFor({{V2S16, V2S16}})
16740eae32dcSDimitry Andric     .clampMaxNumElementsStrict(0, S16, 2)
16755ffd83dbSDimitry Andric     .scalarize(0)
16765ffd83dbSDimitry Andric     .lower();
1677480093f4SDimitry Andric 
1678fe6060f1SDimitry Andric   if (ST.hasVOP3PInsts()) {
1679fe6060f1SDimitry Andric     getActionDefinitionsBuilder(G_FSHL)
1680fe6060f1SDimitry Andric       .lowerFor({{V2S16, V2S16}})
16810eae32dcSDimitry Andric       .clampMaxNumElementsStrict(0, S16, 2)
1682fe6060f1SDimitry Andric       .scalarize(0)
1683fe6060f1SDimitry Andric       .lower();
1684fe6060f1SDimitry Andric   } else {
1685fe6060f1SDimitry Andric     getActionDefinitionsBuilder(G_FSHL)
1686fe6060f1SDimitry Andric       .scalarize(0)
1687fe6060f1SDimitry Andric       .lower();
1688fe6060f1SDimitry Andric   }
1689fe6060f1SDimitry Andric 
1690480093f4SDimitry Andric   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1691480093f4SDimitry Andric     .legalFor({S64});
1692480093f4SDimitry Andric 
1693e8d8bef9SDimitry Andric   getActionDefinitionsBuilder(G_FENCE)
1694e8d8bef9SDimitry Andric     .alwaysLegal();
1695e8d8bef9SDimitry Andric 
1696fe6060f1SDimitry Andric   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1697fe6060f1SDimitry Andric       .scalarize(0)
1698fe6060f1SDimitry Andric       .minScalar(0, S32)
1699fe6060f1SDimitry Andric       .lower();
1700fe6060f1SDimitry Andric 
1701fe6060f1SDimitry Andric   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1702fe6060f1SDimitry Andric       .legalFor({{S32, S32}, {S64, S32}})
1703fe6060f1SDimitry Andric       .clampScalar(1, S32, S32)
1704fe6060f1SDimitry Andric       .clampScalar(0, S32, S64)
1705fe6060f1SDimitry Andric       .widenScalarToNextPow2(0)
1706fe6060f1SDimitry Andric       .scalarize(0);
1707fe6060f1SDimitry Andric 
17085ffd83dbSDimitry Andric   getActionDefinitionsBuilder({
17095ffd83dbSDimitry Andric       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
17105ffd83dbSDimitry Andric       G_FCOPYSIGN,
17115ffd83dbSDimitry Andric 
17125ffd83dbSDimitry Andric       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1713e8d8bef9SDimitry Andric       G_ATOMICRMW_NAND,
1714e8d8bef9SDimitry Andric       G_ATOMICRMW_FSUB,
17155ffd83dbSDimitry Andric       G_READ_REGISTER,
17165ffd83dbSDimitry Andric       G_WRITE_REGISTER,
17175ffd83dbSDimitry Andric 
17185ffd83dbSDimitry Andric       G_SADDO, G_SSUBO,
17195ffd83dbSDimitry Andric 
17205ffd83dbSDimitry Andric        // TODO: Implement
1721fe6060f1SDimitry Andric       G_FMINIMUM, G_FMAXIMUM}).lower();
17225ffd83dbSDimitry Andric 
1723349cc55cSDimitry Andric   getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
1724349cc55cSDimitry Andric       .lower();
1725349cc55cSDimitry Andric 
1726480093f4SDimitry Andric   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
17275ffd83dbSDimitry Andric         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1728480093f4SDimitry Andric         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1729480093f4SDimitry Andric     .unsupported();
1730480093f4SDimitry Andric 
1731fe6060f1SDimitry Andric   getLegacyLegalizerInfo().computeTables();
17320b57cec5SDimitry Andric   verify(*ST.getInstrInfo());
17330b57cec5SDimitry Andric }
17340b57cec5SDimitry Andric 
// Entry point for opcodes the legalization rules marked as Custom. Dispatches
// to the per-opcode legalize* helper. Returns true if the instruction was
// handled (typically rewritten and erased), false for opcodes with no custom
// handler, which the LegalizerHelper treats as a legalization failure.
bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                         MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_FREM:
    return legalizeFrem(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  // Signed/unsigned int-to-fp share one helper; the bool selects signedness.
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(Helper, MI);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  // All load flavors funnel through one helper.
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_ZEXTLOAD:
    return legalizeLoad(Helper, MI);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_UDIVREM:
    return legalizeUnsignedDIV_REM(MI, MRI, B);
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_SDIVREM:
    return legalizeSignedDIV_REM(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  // log/log10 are lowered via log2 scaled by a compile-time constant factor.
  case TargetOpcode::G_FLOG:
    return legalizeFlog(MI, B, numbers::ln2f);
  case TargetOpcode::G_FLOG10:
    return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
  case TargetOpcode::G_FEXP:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FPOW:
    return legalizeFPow(MI, B);
  case TargetOpcode::G_FFLOOR:
    return legalizeFFloor(MI, MRI, B);
  case TargetOpcode::G_BUILD_VECTOR:
    return legalizeBuildVector(MI, MRI, B);
  case TargetOpcode::G_MUL:
    return legalizeMul(Helper, MI);
  case TargetOpcode::G_CTLZ:
  case TargetOpcode::G_CTTZ:
    return legalizeCTLZ_CTTZ(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
    return legalizeFPTruncRound(MI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}
18180b57cec5SDimitry Andric 
// Materialize the 32-bit aperture (the high half of the 64-bit flat address)
// for the given segment address space (LOCAL or PRIVATE), used when casting a
// segment pointer to a flat pointer. Three strategies, in preference order:
//   1. Subtargets with aperture registers: read via S_GETREG_B32.
//   2. Code object v5: load from the implicit kernarg block.
//   3. Otherwise: load from the amd_queue_t pointed to by the queue pointer.
// Returns an invalid Register() if the required input value can't be loaded.
Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  // Only segment address spaces have apertures.
  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    // Pack the hwreg id / bit-offset / width-minus-one fields into the
    // S_GETREG_B32 immediate encoding.
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    // The aperture base occupies the high bits; shift the extracted field up
    // into position.
    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
  }

  // TODO: can we be smarter about machine pointer info?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  Register LoadAddr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
  // For code object version 5, private_base and shared_base are passed through
  // implicit kernargs.
  if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) {
    AMDGPUTargetLowering::ImplicitParameter Param =
        AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
                                      : AMDGPUTargetLowering::PRIVATE_BASE;
    uint64_t Offset =
        ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);

    Register KernargPtrReg = MRI.createGenericVirtualRegister(
        LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

    // Bail out with an invalid register if the kernarg segment pointer is
    // unavailable in this function.
    if (!loadInputValue(KernargPtrReg, B,
                        AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
      return Register();

    // Invariant, dereferenceable 32-bit load from the implicit kernarg area.
    MachineMemOperand *MMO = MF.getMachineMemOperand(
        PtrInfo,
        MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
            MachineMemOperand::MOInvariant,
        LLT::scalar(32), commonAlignment(Align(64), Offset));

    // Pointer address
    B.buildPtrAdd(LoadAddr, KernargPtrReg,
                  B.buildConstant(LLT::scalar(64), Offset).getReg(0));
    // Load address
    return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo,
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      LLT::scalar(32), commonAlignment(Align(64), StructOffset));

  B.buildPtrAdd(LoadAddr, QueuePtr,
                B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}
19070b57cec5SDimitry Andric 
190804eeddc0SDimitry Andric /// Return true if the value is a known valid address, such that a null check is
190904eeddc0SDimitry Andric /// not necessary.
191004eeddc0SDimitry Andric static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
191104eeddc0SDimitry Andric                            const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
191204eeddc0SDimitry Andric   MachineInstr *Def = MRI.getVRegDef(Val);
191304eeddc0SDimitry Andric   switch (Def->getOpcode()) {
191404eeddc0SDimitry Andric   case AMDGPU::G_FRAME_INDEX:
191504eeddc0SDimitry Andric   case AMDGPU::G_GLOBAL_VALUE:
191604eeddc0SDimitry Andric   case AMDGPU::G_BLOCK_ADDR:
191704eeddc0SDimitry Andric     return true;
191804eeddc0SDimitry Andric   case AMDGPU::G_CONSTANT: {
191904eeddc0SDimitry Andric     const ConstantInt *CI = Def->getOperand(1).getCImm();
192004eeddc0SDimitry Andric     return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
192104eeddc0SDimitry Andric   }
192204eeddc0SDimitry Andric   default:
192304eeddc0SDimitry Andric     return false;
192404eeddc0SDimitry Andric   }
192504eeddc0SDimitry Andric 
192604eeddc0SDimitry Andric   return false;
192704eeddc0SDimitry Andric }
192804eeddc0SDimitry Andric 
// Lower G_ADDRSPACE_CAST. Handles, in order:
//   * no-op casts (same representation) -> G_BITCAST in place,
//   * flat -> local/private: truncate to low 32 bits, with a null-compare
//     select unless the source is provably non-null,
//   * local/private -> flat: merge the low half with the segment aperture,
//     again guarded by a null check unless provably non-null,
//   * 64-bit -> constant32bit: truncate,
//   * constant32bit -> 64-bit: merge with the function's high address bits.
// Any other combination is diagnosed as unsupported and replaced with undef.
// Always returns true except via the fallthrough cases above.
bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
    // Same pointer representation: rewrite the cast to a bitcast in place.
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
      (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
       DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
    if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
      // Extract low 32-bits of the pointer.
      B.buildExtract(Dst, Src, 0);
      MI.eraseFromParent();
      return true;
    }

    // Flat null must map to the segment's null value (which may be nonzero),
    // so select between the truncated pointer and segment null.
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    // Extract low 32-bits of the pointer.
    auto PtrLo32 = B.buildExtract(DstTy, Src, 0);

    auto CmpRes =
        B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
      (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
       SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
    if (!ST.hasFlatAddressSpace())
      return false;

    // High half of the flat pointer comes from the segment aperture.
    Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
    if (!ApertureReg.isValid())
      return false;

    // Coerce the type of the low half of the result so we can use merge_values.
    Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);

    // TODO: Should we allow mismatched types but matching sizes in merges to
    // avoid the ptrtoint?
    auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});

    if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
      // No null check needed; the merged pointer is the result.
      B.buildCopy(Dst, BuildPtr);
      MI.eraseFromParent();
      return true;
    }

    // Segment null must map to flat null: compare against the segment's null
    // encoding and select.
    auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
    auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

    auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
                              SegmentNull.getReg(0));

    B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);

    MI.eraseFromParent();
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
      SrcTy.getSizeInBits() == 64) {
    // Truncate.
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
      DstTy.getSizeInBits() == 64) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another. Merge operands are required to be the same type, but creating an
    // extra ptrtoint would be kind of pointless.
    auto HighAddr = B.buildConstant(
        LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    B.buildMerge(Dst, {Src, HighAddr});
    MI.eraseFromParent();
    return true;
  }

  // Unsupported combination: emit a diagnostic and produce undef so
  // compilation can continue.
  DiagnosticInfoUnsupported InvalidAddrSpaceCast(
      MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());

  LLVMContext &Ctx = MF.getFunction().getContext();
  Ctx.diagnose(InvalidAddrSpaceCast);
  B.buildUndef(Dst);
  MI.eraseFromParent();
  return true;
}
204881ad6265SDimitry Andric 
20490b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeFrint(
20500b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
20518bcb0991SDimitry Andric   MachineIRBuilder &B) const {
20520b57cec5SDimitry Andric   Register Src = MI.getOperand(1).getReg();
20530b57cec5SDimitry Andric   LLT Ty = MRI.getType(Src);
20540b57cec5SDimitry Andric   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
20550b57cec5SDimitry Andric 
20560b57cec5SDimitry Andric   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
20570b57cec5SDimitry Andric   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
20580b57cec5SDimitry Andric 
20598bcb0991SDimitry Andric   auto C1 = B.buildFConstant(Ty, C1Val);
20608bcb0991SDimitry Andric   auto CopySign = B.buildFCopysign(Ty, C1, Src);
20610b57cec5SDimitry Andric 
20620b57cec5SDimitry Andric   // TODO: Should this propagate fast-math-flags?
20638bcb0991SDimitry Andric   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
20648bcb0991SDimitry Andric   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
20650b57cec5SDimitry Andric 
20668bcb0991SDimitry Andric   auto C2 = B.buildFConstant(Ty, C2Val);
20678bcb0991SDimitry Andric   auto Fabs = B.buildFAbs(Ty, Src);
20680b57cec5SDimitry Andric 
20698bcb0991SDimitry Andric   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
20708bcb0991SDimitry Andric   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2071e8d8bef9SDimitry Andric   MI.eraseFromParent();
20720b57cec5SDimitry Andric   return true;
20730b57cec5SDimitry Andric }
20740b57cec5SDimitry Andric 
20750b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeFceil(
20760b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
20770b57cec5SDimitry Andric   MachineIRBuilder &B) const {
20780b57cec5SDimitry Andric 
20790b57cec5SDimitry Andric   const LLT S1 = LLT::scalar(1);
20800b57cec5SDimitry Andric   const LLT S64 = LLT::scalar(64);
20810b57cec5SDimitry Andric 
20820b57cec5SDimitry Andric   Register Src = MI.getOperand(1).getReg();
20830b57cec5SDimitry Andric   assert(MRI.getType(Src) == S64);
20840b57cec5SDimitry Andric 
20850b57cec5SDimitry Andric   // result = trunc(src)
20860b57cec5SDimitry Andric   // if (src > 0.0 && src != result)
20870b57cec5SDimitry Andric   //   result += 1.0
20880b57cec5SDimitry Andric 
20895ffd83dbSDimitry Andric   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
20900b57cec5SDimitry Andric 
20910b57cec5SDimitry Andric   const auto Zero = B.buildFConstant(S64, 0.0);
20920b57cec5SDimitry Andric   const auto One = B.buildFConstant(S64, 1.0);
20930b57cec5SDimitry Andric   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
20940b57cec5SDimitry Andric   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
20950b57cec5SDimitry Andric   auto And = B.buildAnd(S1, Lt0, NeTrunc);
20960b57cec5SDimitry Andric   auto Add = B.buildSelect(S64, And, One, Zero);
20970b57cec5SDimitry Andric 
20980b57cec5SDimitry Andric   // TODO: Should this propagate fast-math-flags?
20990b57cec5SDimitry Andric   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
210004eeddc0SDimitry Andric   MI.eraseFromParent();
21010b57cec5SDimitry Andric   return true;
21020b57cec5SDimitry Andric }
21030b57cec5SDimitry Andric 
2104e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeFrem(
2105e8d8bef9SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
2106e8d8bef9SDimitry Andric   MachineIRBuilder &B) const {
2107e8d8bef9SDimitry Andric     Register DstReg = MI.getOperand(0).getReg();
2108e8d8bef9SDimitry Andric     Register Src0Reg = MI.getOperand(1).getReg();
2109e8d8bef9SDimitry Andric     Register Src1Reg = MI.getOperand(2).getReg();
2110e8d8bef9SDimitry Andric     auto Flags = MI.getFlags();
2111e8d8bef9SDimitry Andric     LLT Ty = MRI.getType(DstReg);
2112e8d8bef9SDimitry Andric 
2113e8d8bef9SDimitry Andric     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2114e8d8bef9SDimitry Andric     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2115e8d8bef9SDimitry Andric     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2116e8d8bef9SDimitry Andric     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2117e8d8bef9SDimitry Andric     MI.eraseFromParent();
2118e8d8bef9SDimitry Andric     return true;
2119e8d8bef9SDimitry Andric }
2120e8d8bef9SDimitry Andric 
2121e8d8bef9SDimitry Andric static MachineInstrBuilder extractF64Exponent(Register Hi,
21220b57cec5SDimitry Andric                                               MachineIRBuilder &B) {
21230b57cec5SDimitry Andric   const unsigned FractBits = 52;
21240b57cec5SDimitry Andric   const unsigned ExpBits = 11;
21250b57cec5SDimitry Andric   LLT S32 = LLT::scalar(32);
21260b57cec5SDimitry Andric 
21270b57cec5SDimitry Andric   auto Const0 = B.buildConstant(S32, FractBits - 32);
21280b57cec5SDimitry Andric   auto Const1 = B.buildConstant(S32, ExpBits);
21290b57cec5SDimitry Andric 
21300b57cec5SDimitry Andric   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
2131e8d8bef9SDimitry Andric     .addUse(Hi)
21320b57cec5SDimitry Andric     .addUse(Const0.getReg(0))
21330b57cec5SDimitry Andric     .addUse(Const1.getReg(0));
21340b57cec5SDimitry Andric 
21350b57cec5SDimitry Andric   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
21360b57cec5SDimitry Andric }
21370b57cec5SDimitry Andric 
21380b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
21390b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
21400b57cec5SDimitry Andric   MachineIRBuilder &B) const {
21410b57cec5SDimitry Andric   const LLT S1 = LLT::scalar(1);
21420b57cec5SDimitry Andric   const LLT S32 = LLT::scalar(32);
21430b57cec5SDimitry Andric   const LLT S64 = LLT::scalar(64);
21440b57cec5SDimitry Andric 
21450b57cec5SDimitry Andric   Register Src = MI.getOperand(1).getReg();
21460b57cec5SDimitry Andric   assert(MRI.getType(Src) == S64);
21470b57cec5SDimitry Andric 
21480b57cec5SDimitry Andric   // TODO: Should this use extract since the low half is unused?
21490b57cec5SDimitry Andric   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
21500b57cec5SDimitry Andric   Register Hi = Unmerge.getReg(1);
21510b57cec5SDimitry Andric 
21520b57cec5SDimitry Andric   // Extract the upper half, since this is where we will find the sign and
21530b57cec5SDimitry Andric   // exponent.
21540b57cec5SDimitry Andric   auto Exp = extractF64Exponent(Hi, B);
21550b57cec5SDimitry Andric 
21560b57cec5SDimitry Andric   const unsigned FractBits = 52;
21570b57cec5SDimitry Andric 
21580b57cec5SDimitry Andric   // Extract the sign bit.
21590b57cec5SDimitry Andric   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
21600b57cec5SDimitry Andric   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
21610b57cec5SDimitry Andric 
21620b57cec5SDimitry Andric   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
21630b57cec5SDimitry Andric 
21640b57cec5SDimitry Andric   const auto Zero32 = B.buildConstant(S32, 0);
21650b57cec5SDimitry Andric 
21660b57cec5SDimitry Andric   // Extend back to 64-bits.
21675ffd83dbSDimitry Andric   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
21680b57cec5SDimitry Andric 
21690b57cec5SDimitry Andric   auto Shr = B.buildAShr(S64, FractMask, Exp);
21700b57cec5SDimitry Andric   auto Not = B.buildNot(S64, Shr);
21710b57cec5SDimitry Andric   auto Tmp0 = B.buildAnd(S64, Src, Not);
21720b57cec5SDimitry Andric   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
21730b57cec5SDimitry Andric 
21740b57cec5SDimitry Andric   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
21750b57cec5SDimitry Andric   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
21760b57cec5SDimitry Andric 
21770b57cec5SDimitry Andric   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
21780b57cec5SDimitry Andric   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2179e8d8bef9SDimitry Andric   MI.eraseFromParent();
21800b57cec5SDimitry Andric   return true;
21810b57cec5SDimitry Andric }
21820b57cec5SDimitry Andric 
21830b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeITOFP(
21840b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
21850b57cec5SDimitry Andric   MachineIRBuilder &B, bool Signed) const {
21860b57cec5SDimitry Andric 
21870b57cec5SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
21880b57cec5SDimitry Andric   Register Src = MI.getOperand(1).getReg();
21890b57cec5SDimitry Andric 
21900b57cec5SDimitry Andric   const LLT S64 = LLT::scalar(64);
21910b57cec5SDimitry Andric   const LLT S32 = LLT::scalar(32);
21920b57cec5SDimitry Andric 
2193349cc55cSDimitry Andric   assert(MRI.getType(Src) == S64);
21940b57cec5SDimitry Andric 
21950b57cec5SDimitry Andric   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2196349cc55cSDimitry Andric   auto ThirtyTwo = B.buildConstant(S32, 32);
21970b57cec5SDimitry Andric 
2198349cc55cSDimitry Andric   if (MRI.getType(Dst) == S64) {
2199349cc55cSDimitry Andric     auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2200349cc55cSDimitry Andric                         : B.buildUITOFP(S64, Unmerge.getReg(1));
22010b57cec5SDimitry Andric 
22020b57cec5SDimitry Andric     auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
22030b57cec5SDimitry Andric     auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
22040b57cec5SDimitry Andric                      .addUse(CvtHi.getReg(0))
22050b57cec5SDimitry Andric                      .addUse(ThirtyTwo.getReg(0));
22060b57cec5SDimitry Andric 
22070b57cec5SDimitry Andric     // TODO: Should this propagate fast-math-flags?
22080b57cec5SDimitry Andric     B.buildFAdd(Dst, LdExp, CvtLo);
22090b57cec5SDimitry Andric     MI.eraseFromParent();
22100b57cec5SDimitry Andric     return true;
22110b57cec5SDimitry Andric   }
22120b57cec5SDimitry Andric 
2213349cc55cSDimitry Andric   assert(MRI.getType(Dst) == S32);
2214349cc55cSDimitry Andric 
2215349cc55cSDimitry Andric   auto One = B.buildConstant(S32, 1);
2216349cc55cSDimitry Andric 
2217349cc55cSDimitry Andric   MachineInstrBuilder ShAmt;
2218349cc55cSDimitry Andric   if (Signed) {
2219349cc55cSDimitry Andric     auto ThirtyOne = B.buildConstant(S32, 31);
2220349cc55cSDimitry Andric     auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2221349cc55cSDimitry Andric     auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2222349cc55cSDimitry Andric     auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2223349cc55cSDimitry Andric     auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
2224349cc55cSDimitry Andric                                /*HasSideEffects=*/false)
2225349cc55cSDimitry Andric                   .addUse(Unmerge.getReg(1));
2226349cc55cSDimitry Andric     auto LS2 = B.buildSub(S32, LS, One);
2227349cc55cSDimitry Andric     ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2228349cc55cSDimitry Andric   } else
2229349cc55cSDimitry Andric     ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2230349cc55cSDimitry Andric   auto Norm = B.buildShl(S64, Src, ShAmt);
2231349cc55cSDimitry Andric   auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2232349cc55cSDimitry Andric   auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2233349cc55cSDimitry Andric   auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2234349cc55cSDimitry Andric   auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2235349cc55cSDimitry Andric   auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2236349cc55cSDimitry Andric   B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
2237349cc55cSDimitry Andric                    /*HasSideEffects=*/false)
2238349cc55cSDimitry Andric       .addUse(FVal.getReg(0))
2239349cc55cSDimitry Andric       .addUse(Scale.getReg(0));
2240349cc55cSDimitry Andric   MI.eraseFromParent();
2241349cc55cSDimitry Andric   return true;
2242349cc55cSDimitry Andric }
2243349cc55cSDimitry Andric 
22445ffd83dbSDimitry Andric // TODO: Copied from DAG implementation. Verify logic and document how this
22455ffd83dbSDimitry Andric // actually works.
2246fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2247fe6060f1SDimitry Andric                                         MachineRegisterInfo &MRI,
2248fe6060f1SDimitry Andric                                         MachineIRBuilder &B,
2249fe6060f1SDimitry Andric                                         bool Signed) const {
22505ffd83dbSDimitry Andric 
22515ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
22525ffd83dbSDimitry Andric   Register Src = MI.getOperand(1).getReg();
22535ffd83dbSDimitry Andric 
22545ffd83dbSDimitry Andric   const LLT S64 = LLT::scalar(64);
22555ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
22565ffd83dbSDimitry Andric 
2257fe6060f1SDimitry Andric   const LLT SrcLT = MRI.getType(Src);
2258fe6060f1SDimitry Andric   assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
22595ffd83dbSDimitry Andric 
22605ffd83dbSDimitry Andric   unsigned Flags = MI.getFlags();
22615ffd83dbSDimitry Andric 
2262fe6060f1SDimitry Andric   // The basic idea of converting a floating point number into a pair of 32-bit
2263fe6060f1SDimitry Andric   // integers is illustrated as follows:
2264fe6060f1SDimitry Andric   //
2265fe6060f1SDimitry Andric   //     tf := trunc(val);
2266fe6060f1SDimitry Andric   //    hif := floor(tf * 2^-32);
2267fe6060f1SDimitry Andric   //    lof := tf - hif * 2^32; // lof is always positive due to floor.
2268fe6060f1SDimitry Andric   //     hi := fptoi(hif);
2269fe6060f1SDimitry Andric   //     lo := fptoi(lof);
2270fe6060f1SDimitry Andric   //
2271fe6060f1SDimitry Andric   auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2272fe6060f1SDimitry Andric   MachineInstrBuilder Sign;
2273fe6060f1SDimitry Andric   if (Signed && SrcLT == S32) {
2274fe6060f1SDimitry Andric     // However, a 32-bit floating point number has only 23 bits mantissa and
2275fe6060f1SDimitry Andric     // it's not enough to hold all the significant bits of `lof` if val is
2276fe6060f1SDimitry Andric     // negative. To avoid the loss of precision, We need to take the absolute
2277fe6060f1SDimitry Andric     // value after truncating and flip the result back based on the original
2278fe6060f1SDimitry Andric     // signedness.
2279fe6060f1SDimitry Andric     Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2280fe6060f1SDimitry Andric     Trunc = B.buildFAbs(S32, Trunc, Flags);
2281fe6060f1SDimitry Andric   }
2282fe6060f1SDimitry Andric   MachineInstrBuilder K0, K1;
2283fe6060f1SDimitry Andric   if (SrcLT == S64) {
2284fe6060f1SDimitry Andric     K0 = B.buildFConstant(S64,
2285fe6060f1SDimitry Andric                           BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2286fe6060f1SDimitry Andric     K1 = B.buildFConstant(S64,
2287fe6060f1SDimitry Andric                           BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2288fe6060f1SDimitry Andric   } else {
2289fe6060f1SDimitry Andric     K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
2290fe6060f1SDimitry Andric     K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
2291fe6060f1SDimitry Andric   }
22925ffd83dbSDimitry Andric 
2293fe6060f1SDimitry Andric   auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2294fe6060f1SDimitry Andric   auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2295fe6060f1SDimitry Andric   auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
22965ffd83dbSDimitry Andric 
2297fe6060f1SDimitry Andric   auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2298fe6060f1SDimitry Andric                                      : B.buildFPTOUI(S32, FloorMul);
22995ffd83dbSDimitry Andric   auto Lo = B.buildFPTOUI(S32, Fma);
23005ffd83dbSDimitry Andric 
2301fe6060f1SDimitry Andric   if (Signed && SrcLT == S32) {
2302fe6060f1SDimitry Andric     // Flip the result based on the signedness, which is either all 0s or 1s.
2303fe6060f1SDimitry Andric     Sign = B.buildMerge(S64, {Sign, Sign});
2304fe6060f1SDimitry Andric     // r := xor({lo, hi}, sign) - sign;
2305fe6060f1SDimitry Andric     B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
2306fe6060f1SDimitry Andric   } else
23075ffd83dbSDimitry Andric     B.buildMerge(Dst, {Lo, Hi});
23085ffd83dbSDimitry Andric   MI.eraseFromParent();
23095ffd83dbSDimitry Andric 
23105ffd83dbSDimitry Andric   return true;
23115ffd83dbSDimitry Andric }
23125ffd83dbSDimitry Andric 
23135ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
23145ffd83dbSDimitry Andric                                                MachineInstr &MI) const {
23155ffd83dbSDimitry Andric   MachineFunction &MF = Helper.MIRBuilder.getMF();
23160b57cec5SDimitry Andric   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
23170b57cec5SDimitry Andric 
23180b57cec5SDimitry Andric   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
23190b57cec5SDimitry Andric                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
23200b57cec5SDimitry Andric 
23210b57cec5SDimitry Andric   // With ieee_mode disabled, the instructions have the correct behavior
23220b57cec5SDimitry Andric   // already for G_FMINNUM/G_FMAXNUM
23230b57cec5SDimitry Andric   if (!MFI->getMode().IEEE)
23240b57cec5SDimitry Andric     return !IsIEEEOp;
23250b57cec5SDimitry Andric 
23260b57cec5SDimitry Andric   if (IsIEEEOp)
23270b57cec5SDimitry Andric     return true;
23280b57cec5SDimitry Andric 
23290b57cec5SDimitry Andric   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
23300b57cec5SDimitry Andric }
23310b57cec5SDimitry Andric 
23320b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
23330b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
23340b57cec5SDimitry Andric   MachineIRBuilder &B) const {
23350b57cec5SDimitry Andric   // TODO: Should move some of this into LegalizerHelper.
23360b57cec5SDimitry Andric 
23370b57cec5SDimitry Andric   // TODO: Promote dynamic indexing of s16 to s32
23385ffd83dbSDimitry Andric 
23395ffd83dbSDimitry Andric   // FIXME: Artifact combiner probably should have replaced the truncated
23405ffd83dbSDimitry Andric   // constant before this, so we shouldn't need
2341349cc55cSDimitry Andric   // getIConstantVRegValWithLookThrough.
2342e8d8bef9SDimitry Andric   Optional<ValueAndVReg> MaybeIdxVal =
2343349cc55cSDimitry Andric       getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2344e8d8bef9SDimitry Andric   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
23450b57cec5SDimitry Andric     return true;
2346e8d8bef9SDimitry Andric   const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
23470b57cec5SDimitry Andric 
23480b57cec5SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
23490b57cec5SDimitry Andric   Register Vec = MI.getOperand(1).getReg();
23500b57cec5SDimitry Andric 
23510b57cec5SDimitry Andric   LLT VecTy = MRI.getType(Vec);
23520b57cec5SDimitry Andric   LLT EltTy = VecTy.getElementType();
23530b57cec5SDimitry Andric   assert(EltTy == MRI.getType(Dst));
23540b57cec5SDimitry Andric 
235504eeddc0SDimitry Andric   if (IdxVal < VecTy.getNumElements()) {
235604eeddc0SDimitry Andric     auto Unmerge = B.buildUnmerge(EltTy, Vec);
235704eeddc0SDimitry Andric     B.buildCopy(Dst, Unmerge.getReg(IdxVal));
235804eeddc0SDimitry Andric   } else {
23590b57cec5SDimitry Andric     B.buildUndef(Dst);
236004eeddc0SDimitry Andric   }
23610b57cec5SDimitry Andric 
23620b57cec5SDimitry Andric   MI.eraseFromParent();
23630b57cec5SDimitry Andric   return true;
23640b57cec5SDimitry Andric }
23650b57cec5SDimitry Andric 
23660b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
23670b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
23680b57cec5SDimitry Andric   MachineIRBuilder &B) const {
23690b57cec5SDimitry Andric   // TODO: Should move some of this into LegalizerHelper.
23700b57cec5SDimitry Andric 
23710b57cec5SDimitry Andric   // TODO: Promote dynamic indexing of s16 to s32
23725ffd83dbSDimitry Andric 
23735ffd83dbSDimitry Andric   // FIXME: Artifact combiner probably should have replaced the truncated
23745ffd83dbSDimitry Andric   // constant before this, so we shouldn't need
2375349cc55cSDimitry Andric   // getIConstantVRegValWithLookThrough.
2376e8d8bef9SDimitry Andric   Optional<ValueAndVReg> MaybeIdxVal =
2377349cc55cSDimitry Andric       getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2378e8d8bef9SDimitry Andric   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
23790b57cec5SDimitry Andric     return true;
23800b57cec5SDimitry Andric 
2381e8d8bef9SDimitry Andric   int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
23820b57cec5SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
23830b57cec5SDimitry Andric   Register Vec = MI.getOperand(1).getReg();
23840b57cec5SDimitry Andric   Register Ins = MI.getOperand(2).getReg();
23850b57cec5SDimitry Andric 
23860b57cec5SDimitry Andric   LLT VecTy = MRI.getType(Vec);
23870b57cec5SDimitry Andric   LLT EltTy = VecTy.getElementType();
23880b57cec5SDimitry Andric   assert(EltTy == MRI.getType(Ins));
238904eeddc0SDimitry Andric   (void)Ins;
23900b57cec5SDimitry Andric 
239104eeddc0SDimitry Andric   unsigned NumElts = VecTy.getNumElements();
239204eeddc0SDimitry Andric   if (IdxVal < NumElts) {
239304eeddc0SDimitry Andric     SmallVector<Register, 8> SrcRegs;
239404eeddc0SDimitry Andric     for (unsigned i = 0; i < NumElts; ++i)
239504eeddc0SDimitry Andric       SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
239604eeddc0SDimitry Andric     B.buildUnmerge(SrcRegs, Vec);
239704eeddc0SDimitry Andric 
239804eeddc0SDimitry Andric     SrcRegs[IdxVal] = MI.getOperand(2).getReg();
239904eeddc0SDimitry Andric     B.buildMerge(Dst, SrcRegs);
240004eeddc0SDimitry Andric   } else {
24010b57cec5SDimitry Andric     B.buildUndef(Dst);
240204eeddc0SDimitry Andric   }
24030b57cec5SDimitry Andric 
24040b57cec5SDimitry Andric   MI.eraseFromParent();
24050b57cec5SDimitry Andric   return true;
24060b57cec5SDimitry Andric }
24070b57cec5SDimitry Andric 
24085ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeShuffleVector(
24095ffd83dbSDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
24105ffd83dbSDimitry Andric   MachineIRBuilder &B) const {
2411fe6060f1SDimitry Andric   const LLT V2S16 = LLT::fixed_vector(2, 16);
24125ffd83dbSDimitry Andric 
24135ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
24145ffd83dbSDimitry Andric   Register Src0 = MI.getOperand(1).getReg();
24155ffd83dbSDimitry Andric   LLT DstTy = MRI.getType(Dst);
24165ffd83dbSDimitry Andric   LLT SrcTy = MRI.getType(Src0);
24175ffd83dbSDimitry Andric 
24185ffd83dbSDimitry Andric   if (SrcTy == V2S16 && DstTy == V2S16 &&
24195ffd83dbSDimitry Andric       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
24205ffd83dbSDimitry Andric     return true;
24215ffd83dbSDimitry Andric 
24225ffd83dbSDimitry Andric   MachineIRBuilder HelperBuilder(MI);
24235ffd83dbSDimitry Andric   GISelObserverWrapper DummyObserver;
24245ffd83dbSDimitry Andric   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
24255ffd83dbSDimitry Andric   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
24265ffd83dbSDimitry Andric }
24275ffd83dbSDimitry Andric 
24288bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeSinCos(
24298bcb0991SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
24308bcb0991SDimitry Andric   MachineIRBuilder &B) const {
24318bcb0991SDimitry Andric 
24328bcb0991SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
24338bcb0991SDimitry Andric   Register SrcReg = MI.getOperand(1).getReg();
24348bcb0991SDimitry Andric   LLT Ty = MRI.getType(DstReg);
24358bcb0991SDimitry Andric   unsigned Flags = MI.getFlags();
24368bcb0991SDimitry Andric 
24378bcb0991SDimitry Andric   Register TrigVal;
24385ffd83dbSDimitry Andric   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
24398bcb0991SDimitry Andric   if (ST.hasTrigReducedRange()) {
24408bcb0991SDimitry Andric     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
24418bcb0991SDimitry Andric     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
24428bcb0991SDimitry Andric       .addUse(MulVal.getReg(0))
24438bcb0991SDimitry Andric       .setMIFlags(Flags).getReg(0);
24448bcb0991SDimitry Andric   } else
24458bcb0991SDimitry Andric     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
24468bcb0991SDimitry Andric 
24478bcb0991SDimitry Andric   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
24488bcb0991SDimitry Andric     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
24498bcb0991SDimitry Andric   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
24508bcb0991SDimitry Andric     .addUse(TrigVal)
24518bcb0991SDimitry Andric     .setMIFlags(Flags);
24528bcb0991SDimitry Andric   MI.eraseFromParent();
24538bcb0991SDimitry Andric   return true;
24548bcb0991SDimitry Andric }
24558bcb0991SDimitry Andric 
24565ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
24575ffd83dbSDimitry Andric                                                   MachineIRBuilder &B,
24585ffd83dbSDimitry Andric                                                   const GlobalValue *GV,
24595ffd83dbSDimitry Andric                                                   int64_t Offset,
24605ffd83dbSDimitry Andric                                                   unsigned GAFlags) const {
24615ffd83dbSDimitry Andric   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
24628bcb0991SDimitry Andric   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
24638bcb0991SDimitry Andric   // to the following code sequence:
24648bcb0991SDimitry Andric   //
24658bcb0991SDimitry Andric   // For constant address space:
24668bcb0991SDimitry Andric   //   s_getpc_b64 s[0:1]
24678bcb0991SDimitry Andric   //   s_add_u32 s0, s0, $symbol
24688bcb0991SDimitry Andric   //   s_addc_u32 s1, s1, 0
24698bcb0991SDimitry Andric   //
24708bcb0991SDimitry Andric   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
24718bcb0991SDimitry Andric   //   a fixup or relocation is emitted to replace $symbol with a literal
24728bcb0991SDimitry Andric   //   constant, which is a pc-relative offset from the encoding of the $symbol
24738bcb0991SDimitry Andric   //   operand to the global variable.
24748bcb0991SDimitry Andric   //
24758bcb0991SDimitry Andric   // For global address space:
24768bcb0991SDimitry Andric   //   s_getpc_b64 s[0:1]
24778bcb0991SDimitry Andric   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
24788bcb0991SDimitry Andric   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
24798bcb0991SDimitry Andric   //
24808bcb0991SDimitry Andric   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
24818bcb0991SDimitry Andric   //   fixups or relocations are emitted to replace $symbol@*@lo and
24828bcb0991SDimitry Andric   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
24838bcb0991SDimitry Andric   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
24848bcb0991SDimitry Andric   //   operand to the global variable.
24858bcb0991SDimitry Andric   //
24868bcb0991SDimitry Andric   // What we want here is an offset from the value returned by s_getpc
24878bcb0991SDimitry Andric   // (which is the address of the s_add_u32 instruction) to the global
24888bcb0991SDimitry Andric   // variable, but since the encoding of $symbol starts 4 bytes after the start
24898bcb0991SDimitry Andric   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
24908bcb0991SDimitry Andric   // small. This requires us to add 4 to the global variable offset in order to
2491e8d8bef9SDimitry Andric   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2492e8d8bef9SDimitry Andric   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2493e8d8bef9SDimitry Andric   // instruction.
24948bcb0991SDimitry Andric 
24958bcb0991SDimitry Andric   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
24968bcb0991SDimitry Andric 
24978bcb0991SDimitry Andric   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
24988bcb0991SDimitry Andric     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
24998bcb0991SDimitry Andric 
25008bcb0991SDimitry Andric   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
25018bcb0991SDimitry Andric     .addDef(PCReg);
25028bcb0991SDimitry Andric 
25038bcb0991SDimitry Andric   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
25048bcb0991SDimitry Andric   if (GAFlags == SIInstrInfo::MO_NONE)
25058bcb0991SDimitry Andric     MIB.addImm(0);
25068bcb0991SDimitry Andric   else
2507e8d8bef9SDimitry Andric     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
25088bcb0991SDimitry Andric 
25098bcb0991SDimitry Andric   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
25108bcb0991SDimitry Andric 
25118bcb0991SDimitry Andric   if (PtrTy.getSizeInBits() == 32)
25128bcb0991SDimitry Andric     B.buildExtract(DstReg, PCReg, 0);
25138bcb0991SDimitry Andric   return true;
25148bcb0991SDimitry Andric  }
25158bcb0991SDimitry Andric 
25168bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeGlobalValue(
25178bcb0991SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
25188bcb0991SDimitry Andric   MachineIRBuilder &B) const {
25198bcb0991SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
25208bcb0991SDimitry Andric   LLT Ty = MRI.getType(DstReg);
25218bcb0991SDimitry Andric   unsigned AS = Ty.getAddressSpace();
25228bcb0991SDimitry Andric 
25238bcb0991SDimitry Andric   const GlobalValue *GV = MI.getOperand(1).getGlobal();
25248bcb0991SDimitry Andric   MachineFunction &MF = B.getMF();
25258bcb0991SDimitry Andric   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
25268bcb0991SDimitry Andric 
25278bcb0991SDimitry Andric   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2528fe6060f1SDimitry Andric     if (!MFI->isModuleEntryFunction() &&
2529fe6060f1SDimitry Andric         !GV->getName().equals("llvm.amdgcn.module.lds")) {
25308bcb0991SDimitry Andric       const Function &Fn = MF.getFunction();
25318bcb0991SDimitry Andric       DiagnosticInfoUnsupported BadLDSDecl(
25325ffd83dbSDimitry Andric         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
25335ffd83dbSDimitry Andric         DS_Warning);
25348bcb0991SDimitry Andric       Fn.getContext().diagnose(BadLDSDecl);
25355ffd83dbSDimitry Andric 
25365ffd83dbSDimitry Andric       // We currently don't have a way to correctly allocate LDS objects that
25375ffd83dbSDimitry Andric       // aren't directly associated with a kernel. We do force inlining of
25385ffd83dbSDimitry Andric       // functions that use local objects. However, if these dead functions are
25395ffd83dbSDimitry Andric       // not eliminated, we don't want a compile time error. Just emit a warning
25405ffd83dbSDimitry Andric       // and a trap, since there should be no callable path here.
25415ffd83dbSDimitry Andric       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
25425ffd83dbSDimitry Andric       B.buildUndef(DstReg);
25435ffd83dbSDimitry Andric       MI.eraseFromParent();
25445ffd83dbSDimitry Andric       return true;
25458bcb0991SDimitry Andric     }
25468bcb0991SDimitry Andric 
25478bcb0991SDimitry Andric     // TODO: We could emit code to handle the initialization somewhere.
2548349cc55cSDimitry Andric     // We ignore the initializer for now and legalize it to allow selection.
2549349cc55cSDimitry Andric     // The initializer will anyway get errored out during assembly emission.
25505ffd83dbSDimitry Andric     const SITargetLowering *TLI = ST.getTargetLowering();
25515ffd83dbSDimitry Andric     if (!TLI->shouldUseLDSConstAddress(GV)) {
25525ffd83dbSDimitry Andric       MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
25535ffd83dbSDimitry Andric       return true; // Leave in place;
25545ffd83dbSDimitry Andric     }
25555ffd83dbSDimitry Andric 
2556e8d8bef9SDimitry Andric     if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2557e8d8bef9SDimitry Andric       Type *Ty = GV->getValueType();
2558e8d8bef9SDimitry Andric       // HIP uses an unsized array `extern __shared__ T s[]` or similar
2559e8d8bef9SDimitry Andric       // zero-sized type in other languages to declare the dynamic shared
2560e8d8bef9SDimitry Andric       // memory which size is not known at the compile time. They will be
2561e8d8bef9SDimitry Andric       // allocated by the runtime and placed directly after the static
2562e8d8bef9SDimitry Andric       // allocated ones. They all share the same offset.
2563e8d8bef9SDimitry Andric       if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2564e8d8bef9SDimitry Andric         // Adjust alignment for that dynamic shared memory array.
2565e8d8bef9SDimitry Andric         MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2566e8d8bef9SDimitry Andric         LLT S32 = LLT::scalar(32);
2567e8d8bef9SDimitry Andric         auto Sz =
2568e8d8bef9SDimitry Andric             B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2569e8d8bef9SDimitry Andric         B.buildIntToPtr(DstReg, Sz);
2570e8d8bef9SDimitry Andric         MI.eraseFromParent();
2571e8d8bef9SDimitry Andric         return true;
2572e8d8bef9SDimitry Andric       }
2573e8d8bef9SDimitry Andric     }
2574e8d8bef9SDimitry Andric 
2575349cc55cSDimitry Andric     B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
2576349cc55cSDimitry Andric                                                    *cast<GlobalVariable>(GV)));
25778bcb0991SDimitry Andric     MI.eraseFromParent();
25788bcb0991SDimitry Andric     return true;
25798bcb0991SDimitry Andric   }
25808bcb0991SDimitry Andric 
25818bcb0991SDimitry Andric   const SITargetLowering *TLI = ST.getTargetLowering();
25828bcb0991SDimitry Andric 
25838bcb0991SDimitry Andric   if (TLI->shouldEmitFixup(GV)) {
25848bcb0991SDimitry Andric     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
25858bcb0991SDimitry Andric     MI.eraseFromParent();
25868bcb0991SDimitry Andric     return true;
25878bcb0991SDimitry Andric   }
25888bcb0991SDimitry Andric 
25898bcb0991SDimitry Andric   if (TLI->shouldEmitPCReloc(GV)) {
25908bcb0991SDimitry Andric     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
25918bcb0991SDimitry Andric     MI.eraseFromParent();
25928bcb0991SDimitry Andric     return true;
25938bcb0991SDimitry Andric   }
25948bcb0991SDimitry Andric 
25958bcb0991SDimitry Andric   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
25968bcb0991SDimitry Andric   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
25978bcb0991SDimitry Andric 
2598fe6060f1SDimitry Andric   LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
25998bcb0991SDimitry Andric   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
26008bcb0991SDimitry Andric       MachinePointerInfo::getGOT(MF),
26018bcb0991SDimitry Andric       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
26028bcb0991SDimitry Andric           MachineMemOperand::MOInvariant,
2603fe6060f1SDimitry Andric       LoadTy, Align(8));
26048bcb0991SDimitry Andric 
26058bcb0991SDimitry Andric   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
26068bcb0991SDimitry Andric 
26078bcb0991SDimitry Andric   if (Ty.getSizeInBits() == 32) {
2608349cc55cSDimitry Andric     // Truncate if this is a 32-bit constant address.
26098bcb0991SDimitry Andric     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
26108bcb0991SDimitry Andric     B.buildExtract(DstReg, Load, 0);
26118bcb0991SDimitry Andric   } else
26128bcb0991SDimitry Andric     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
26138bcb0991SDimitry Andric 
26148bcb0991SDimitry Andric   MI.eraseFromParent();
26158bcb0991SDimitry Andric   return true;
26168bcb0991SDimitry Andric }
26178bcb0991SDimitry Andric 
2618e8d8bef9SDimitry Andric static LLT widenToNextPowerOf2(LLT Ty) {
2619e8d8bef9SDimitry Andric   if (Ty.isVector())
2620fe6060f1SDimitry Andric     return Ty.changeElementCount(
2621fe6060f1SDimitry Andric         ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2622e8d8bef9SDimitry Andric   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2623e8d8bef9SDimitry Andric }
2624e8d8bef9SDimitry Andric 
bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
                                       MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();
  GISelChangeObserver &Observer = Helper.Observer;

  Register PtrReg = MI.getOperand(1).getReg();
  LLT PtrTy = MRI.getType(PtrReg);
  unsigned AddrSpace = PtrTy.getAddressSpace();

  // 32-bit constant-address loads are handled by rewriting the pointer into
  // the 64-bit constant address space via an addrspacecast; the load
  // instruction itself is kept and just repointed.
  if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
    auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
    Observer.changingInstr(MI);
    MI.getOperand(1).setReg(Cast.getReg(0));
    Observer.changedInstr(MI);
    return true;
  }

  // Only plain G_LOAD is widened below; extending loads fall through to the
  // generic legalizer.
  if (MI.getOpcode() != AMDGPU::G_LOAD)
    return false;

  Register ValReg = MI.getOperand(0).getReg();
  LLT ValTy = MRI.getType(ValReg);

  MachineMemOperand *MMO = *MI.memoperands_begin();
  const unsigned ValSize = ValTy.getSizeInBits();
  const LLT MemTy = MMO->getMemoryType();
  const Align MemAlign = MMO->getAlign();
  const unsigned MemSize = MemTy.getSizeInBits();
  const uint64_t AlignInBits = 8 * MemAlign.value();

  // Widen non-power-of-2 loads to the alignment if needed
  if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
    const unsigned WideMemSize = PowerOf2Ceil(MemSize);

    // This was already the correct extending load result type, so just adjust
    // the memory type.
    if (WideMemSize == ValSize) {
      MachineFunction &MF = B.getMF();

      // Keep the instruction; only replace its memory operand with one
      // covering the widened (power-of-2) size.
      MachineMemOperand *WideMMO =
          MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
      Observer.changingInstr(MI);
      MI.setMemRefs(MF, {WideMMO});
      Observer.changedInstr(MI);
      return true;
    }

    // Don't bother handling edge case that should probably never be produced.
    if (ValSize > WideMemSize)
      return false;

    LLT WideTy = widenToNextPowerOf2(ValTy);

    // Load the widened type, then narrow the result back to ValTy. The
    // original load is replaced, so MI is erased at the end.
    Register WideLoad;
    if (!WideTy.isVector()) {
      WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
      B.buildTrunc(ValReg, WideLoad).getReg(0);
    } else {
      // Extract the subvector.

      if (isRegisterType(ValTy)) {
        // If this a case where G_EXTRACT is legal, use it.
        // (e.g. <3 x s32> -> <4 x s32>)
        WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
        B.buildExtract(ValReg, WideLoad, 0);
      } else {
        // For cases where the widened type isn't a nice register value, unmerge
        // from a widened register (e.g. <3 x s16> -> <4 x s16>)
        WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
        B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
      }
    }

    MI.eraseFromParent();
    return true;
  }

  return false;
}
2706e8d8bef9SDimitry Andric 
27078bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFMad(
27088bcb0991SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
27098bcb0991SDimitry Andric   MachineIRBuilder &B) const {
27108bcb0991SDimitry Andric   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
27118bcb0991SDimitry Andric   assert(Ty.isScalar());
27128bcb0991SDimitry Andric 
2713480093f4SDimitry Andric   MachineFunction &MF = B.getMF();
2714480093f4SDimitry Andric   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2715480093f4SDimitry Andric 
27168bcb0991SDimitry Andric   // TODO: Always legal with future ftz flag.
27175ffd83dbSDimitry Andric   // FIXME: Do we need just output?
27185ffd83dbSDimitry Andric   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
27198bcb0991SDimitry Andric     return true;
27205ffd83dbSDimitry Andric   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
27218bcb0991SDimitry Andric     return true;
27228bcb0991SDimitry Andric 
27238bcb0991SDimitry Andric   MachineIRBuilder HelperBuilder(MI);
27248bcb0991SDimitry Andric   GISelObserverWrapper DummyObserver;
27258bcb0991SDimitry Andric   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
27268bcb0991SDimitry Andric   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
27278bcb0991SDimitry Andric }
27288bcb0991SDimitry Andric 
2729480093f4SDimitry Andric bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2730480093f4SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2731480093f4SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
2732480093f4SDimitry Andric   Register PtrReg = MI.getOperand(1).getReg();
2733480093f4SDimitry Andric   Register CmpVal = MI.getOperand(2).getReg();
2734480093f4SDimitry Andric   Register NewVal = MI.getOperand(3).getReg();
2735480093f4SDimitry Andric 
2736e8d8bef9SDimitry Andric   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2737480093f4SDimitry Andric          "this should not have been custom lowered");
2738480093f4SDimitry Andric 
2739480093f4SDimitry Andric   LLT ValTy = MRI.getType(CmpVal);
2740fe6060f1SDimitry Andric   LLT VecTy = LLT::fixed_vector(2, ValTy);
2741480093f4SDimitry Andric 
2742480093f4SDimitry Andric   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2743480093f4SDimitry Andric 
2744480093f4SDimitry Andric   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2745480093f4SDimitry Andric     .addDef(DstReg)
2746480093f4SDimitry Andric     .addUse(PtrReg)
2747480093f4SDimitry Andric     .addUse(PackedVal)
2748480093f4SDimitry Andric     .setMemRefs(MI.memoperands());
2749480093f4SDimitry Andric 
2750480093f4SDimitry Andric   MI.eraseFromParent();
2751480093f4SDimitry Andric   return true;
2752480093f4SDimitry Andric }
2753480093f4SDimitry Andric 
27545ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeFlog(
27555ffd83dbSDimitry Andric   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
27565ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
27575ffd83dbSDimitry Andric   Register Src = MI.getOperand(1).getReg();
27585ffd83dbSDimitry Andric   LLT Ty = B.getMRI()->getType(Dst);
27595ffd83dbSDimitry Andric   unsigned Flags = MI.getFlags();
27605ffd83dbSDimitry Andric 
27615ffd83dbSDimitry Andric   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
27625ffd83dbSDimitry Andric   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
27635ffd83dbSDimitry Andric 
27645ffd83dbSDimitry Andric   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
27655ffd83dbSDimitry Andric   MI.eraseFromParent();
27665ffd83dbSDimitry Andric   return true;
27675ffd83dbSDimitry Andric }
27685ffd83dbSDimitry Andric 
27695ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
27705ffd83dbSDimitry Andric                                        MachineIRBuilder &B) const {
27715ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
27725ffd83dbSDimitry Andric   Register Src = MI.getOperand(1).getReg();
27735ffd83dbSDimitry Andric   unsigned Flags = MI.getFlags();
27745ffd83dbSDimitry Andric   LLT Ty = B.getMRI()->getType(Dst);
27755ffd83dbSDimitry Andric 
27765ffd83dbSDimitry Andric   auto K = B.buildFConstant(Ty, numbers::log2e);
27775ffd83dbSDimitry Andric   auto Mul = B.buildFMul(Ty, Src, K, Flags);
27785ffd83dbSDimitry Andric   B.buildFExp2(Dst, Mul, Flags);
27795ffd83dbSDimitry Andric   MI.eraseFromParent();
27805ffd83dbSDimitry Andric   return true;
27815ffd83dbSDimitry Andric }
27825ffd83dbSDimitry Andric 
27835ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
27845ffd83dbSDimitry Andric                                        MachineIRBuilder &B) const {
27855ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
27865ffd83dbSDimitry Andric   Register Src0 = MI.getOperand(1).getReg();
27875ffd83dbSDimitry Andric   Register Src1 = MI.getOperand(2).getReg();
27885ffd83dbSDimitry Andric   unsigned Flags = MI.getFlags();
27895ffd83dbSDimitry Andric   LLT Ty = B.getMRI()->getType(Dst);
27905ffd83dbSDimitry Andric   const LLT S16 = LLT::scalar(16);
27915ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
27925ffd83dbSDimitry Andric 
27935ffd83dbSDimitry Andric   if (Ty == S32) {
27945ffd83dbSDimitry Andric     auto Log = B.buildFLog2(S32, Src0, Flags);
27955ffd83dbSDimitry Andric     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
27965ffd83dbSDimitry Andric       .addUse(Log.getReg(0))
27975ffd83dbSDimitry Andric       .addUse(Src1)
27985ffd83dbSDimitry Andric       .setMIFlags(Flags);
27995ffd83dbSDimitry Andric     B.buildFExp2(Dst, Mul, Flags);
28005ffd83dbSDimitry Andric   } else if (Ty == S16) {
28015ffd83dbSDimitry Andric     // There's no f16 fmul_legacy, so we need to convert for it.
28025ffd83dbSDimitry Andric     auto Log = B.buildFLog2(S16, Src0, Flags);
28035ffd83dbSDimitry Andric     auto Ext0 = B.buildFPExt(S32, Log, Flags);
28045ffd83dbSDimitry Andric     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
28055ffd83dbSDimitry Andric     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
28065ffd83dbSDimitry Andric       .addUse(Ext0.getReg(0))
28075ffd83dbSDimitry Andric       .addUse(Ext1.getReg(0))
28085ffd83dbSDimitry Andric       .setMIFlags(Flags);
28095ffd83dbSDimitry Andric 
28105ffd83dbSDimitry Andric     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
28115ffd83dbSDimitry Andric   } else
28125ffd83dbSDimitry Andric     return false;
28135ffd83dbSDimitry Andric 
28145ffd83dbSDimitry Andric   MI.eraseFromParent();
28155ffd83dbSDimitry Andric   return true;
28165ffd83dbSDimitry Andric }
28175ffd83dbSDimitry Andric 
28185ffd83dbSDimitry Andric // Find a source register, ignoring any possible source modifiers.
28195ffd83dbSDimitry Andric static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
28205ffd83dbSDimitry Andric   Register ModSrc = OrigSrc;
28215ffd83dbSDimitry Andric   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
28225ffd83dbSDimitry Andric     ModSrc = SrcFNeg->getOperand(1).getReg();
28235ffd83dbSDimitry Andric     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
28245ffd83dbSDimitry Andric       ModSrc = SrcFAbs->getOperand(1).getReg();
28255ffd83dbSDimitry Andric   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
28265ffd83dbSDimitry Andric     ModSrc = SrcFAbs->getOperand(1).getReg();
28275ffd83dbSDimitry Andric   return ModSrc;
28285ffd83dbSDimitry Andric }
28295ffd83dbSDimitry Andric 
28305ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
28315ffd83dbSDimitry Andric                                          MachineRegisterInfo &MRI,
28325ffd83dbSDimitry Andric                                          MachineIRBuilder &B) const {
28335ffd83dbSDimitry Andric 
28345ffd83dbSDimitry Andric   const LLT S1 = LLT::scalar(1);
28355ffd83dbSDimitry Andric   const LLT S64 = LLT::scalar(64);
28365ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
28375ffd83dbSDimitry Andric   Register OrigSrc = MI.getOperand(1).getReg();
28385ffd83dbSDimitry Andric   unsigned Flags = MI.getFlags();
28395ffd83dbSDimitry Andric   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
28405ffd83dbSDimitry Andric          "this should not have been custom lowered");
28415ffd83dbSDimitry Andric 
28425ffd83dbSDimitry Andric   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
28435ffd83dbSDimitry Andric   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
28445ffd83dbSDimitry Andric   // efficient way to implement it is using V_FRACT_F64. The workaround for the
28455ffd83dbSDimitry Andric   // V_FRACT bug is:
28465ffd83dbSDimitry Andric   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
28475ffd83dbSDimitry Andric   //
28485ffd83dbSDimitry Andric   // Convert floor(x) to (x - fract(x))
28495ffd83dbSDimitry Andric 
28505ffd83dbSDimitry Andric   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
28515ffd83dbSDimitry Andric     .addUse(OrigSrc)
28525ffd83dbSDimitry Andric     .setMIFlags(Flags);
28535ffd83dbSDimitry Andric 
28545ffd83dbSDimitry Andric   // Give source modifier matching some assistance before obscuring a foldable
28555ffd83dbSDimitry Andric   // pattern.
28565ffd83dbSDimitry Andric 
28575ffd83dbSDimitry Andric   // TODO: We can avoid the neg on the fract? The input sign to fract
28585ffd83dbSDimitry Andric   // shouldn't matter?
28595ffd83dbSDimitry Andric   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
28605ffd83dbSDimitry Andric 
28615ffd83dbSDimitry Andric   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
28625ffd83dbSDimitry Andric 
28635ffd83dbSDimitry Andric   Register Min = MRI.createGenericVirtualRegister(S64);
28645ffd83dbSDimitry Andric 
28655ffd83dbSDimitry Andric   // We don't need to concern ourselves with the snan handling difference, so
28665ffd83dbSDimitry Andric   // use the one which will directly select.
28675ffd83dbSDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
28685ffd83dbSDimitry Andric   if (MFI->getMode().IEEE)
28695ffd83dbSDimitry Andric     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
28705ffd83dbSDimitry Andric   else
28715ffd83dbSDimitry Andric     B.buildFMinNum(Min, Fract, Const, Flags);
28725ffd83dbSDimitry Andric 
28735ffd83dbSDimitry Andric   Register CorrectedFract = Min;
28745ffd83dbSDimitry Andric   if (!MI.getFlag(MachineInstr::FmNoNans)) {
28755ffd83dbSDimitry Andric     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
28765ffd83dbSDimitry Andric     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
28775ffd83dbSDimitry Andric   }
28785ffd83dbSDimitry Andric 
28795ffd83dbSDimitry Andric   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
28805ffd83dbSDimitry Andric   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
28815ffd83dbSDimitry Andric 
28825ffd83dbSDimitry Andric   MI.eraseFromParent();
28835ffd83dbSDimitry Andric   return true;
28845ffd83dbSDimitry Andric }
28855ffd83dbSDimitry Andric 
28865ffd83dbSDimitry Andric // Turn an illegal packed v2s16 build vector into bit operations.
28875ffd83dbSDimitry Andric // TODO: This should probably be a bitcast action in LegalizerHelper.
28885ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeBuildVector(
28895ffd83dbSDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
28905ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
28915ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
2892fe6060f1SDimitry Andric   assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
28935ffd83dbSDimitry Andric 
28945ffd83dbSDimitry Andric   Register Src0 = MI.getOperand(1).getReg();
28955ffd83dbSDimitry Andric   Register Src1 = MI.getOperand(2).getReg();
28965ffd83dbSDimitry Andric   assert(MRI.getType(Src0) == LLT::scalar(16));
28975ffd83dbSDimitry Andric 
28985ffd83dbSDimitry Andric   auto Merge = B.buildMerge(S32, {Src0, Src1});
28995ffd83dbSDimitry Andric   B.buildBitcast(Dst, Merge);
29005ffd83dbSDimitry Andric 
29015ffd83dbSDimitry Andric   MI.eraseFromParent();
29025ffd83dbSDimitry Andric   return true;
29035ffd83dbSDimitry Andric }
29045ffd83dbSDimitry Andric 
290581ad6265SDimitry Andric // Build a big integer multiply or multiply-add using MAD_64_32 instructions.
290681ad6265SDimitry Andric //
290781ad6265SDimitry Andric // Source and accumulation registers must all be 32-bits.
290881ad6265SDimitry Andric //
290981ad6265SDimitry Andric // TODO: When the multiply is uniform, we should produce a code sequence
291081ad6265SDimitry Andric // that is better suited to instruction selection on the SALU. Instead of
291181ad6265SDimitry Andric // the outer loop going over parts of the result, the outer loop should go
291281ad6265SDimitry Andric // over parts of one of the factors. This should result in instruction
291381ad6265SDimitry Andric // selection that makes full use of S_ADDC_U32 instructions.
291481ad6265SDimitry Andric void AMDGPULegalizerInfo::buildMultiply(
291581ad6265SDimitry Andric     LegalizerHelper &Helper, MutableArrayRef<Register> Accum,
291681ad6265SDimitry Andric     ArrayRef<Register> Src0, ArrayRef<Register> Src1,
291781ad6265SDimitry Andric     bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const {
291881ad6265SDimitry Andric   // Use (possibly empty) vectors of S1 registers to represent the set of
291981ad6265SDimitry Andric   // carries from one pair of positions to the next.
292081ad6265SDimitry Andric   using Carry = SmallVector<Register, 2>;
292181ad6265SDimitry Andric 
292281ad6265SDimitry Andric   MachineIRBuilder &B = Helper.MIRBuilder;
292381ad6265SDimitry Andric 
292481ad6265SDimitry Andric   const LLT S1 = LLT::scalar(1);
292581ad6265SDimitry Andric   const LLT S32 = LLT::scalar(32);
292681ad6265SDimitry Andric   const LLT S64 = LLT::scalar(64);
292781ad6265SDimitry Andric 
292881ad6265SDimitry Andric   Register Zero32;
292981ad6265SDimitry Andric   Register Zero64;
293081ad6265SDimitry Andric 
293181ad6265SDimitry Andric   auto getZero32 = [&]() -> Register {
293281ad6265SDimitry Andric     if (!Zero32)
293381ad6265SDimitry Andric       Zero32 = B.buildConstant(S32, 0).getReg(0);
293481ad6265SDimitry Andric     return Zero32;
293581ad6265SDimitry Andric   };
293681ad6265SDimitry Andric   auto getZero64 = [&]() -> Register {
293781ad6265SDimitry Andric     if (!Zero64)
293881ad6265SDimitry Andric       Zero64 = B.buildConstant(S64, 0).getReg(0);
293981ad6265SDimitry Andric     return Zero64;
294081ad6265SDimitry Andric   };
294181ad6265SDimitry Andric 
294281ad6265SDimitry Andric   // Merge the given carries into the 32-bit LocalAccum, which is modified
294381ad6265SDimitry Andric   // in-place.
294481ad6265SDimitry Andric   //
294581ad6265SDimitry Andric   // Returns the carry-out, which is a single S1 register or null.
294681ad6265SDimitry Andric   auto mergeCarry =
294781ad6265SDimitry Andric       [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
294881ad6265SDimitry Andric         if (CarryIn.empty())
294981ad6265SDimitry Andric           return Register();
295081ad6265SDimitry Andric 
295181ad6265SDimitry Andric         bool HaveCarryOut = true;
295281ad6265SDimitry Andric         Register CarryAccum;
295381ad6265SDimitry Andric         if (CarryIn.size() == 1) {
295481ad6265SDimitry Andric           if (!LocalAccum) {
295581ad6265SDimitry Andric             LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
295681ad6265SDimitry Andric             return Register();
295781ad6265SDimitry Andric           }
295881ad6265SDimitry Andric 
295981ad6265SDimitry Andric           CarryAccum = getZero32();
296081ad6265SDimitry Andric         } else {
296181ad6265SDimitry Andric           CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
296281ad6265SDimitry Andric           for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
296381ad6265SDimitry Andric             CarryAccum =
296481ad6265SDimitry Andric                 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
296581ad6265SDimitry Andric                     .getReg(0);
296681ad6265SDimitry Andric           }
296781ad6265SDimitry Andric 
296881ad6265SDimitry Andric           if (!LocalAccum) {
296981ad6265SDimitry Andric             LocalAccum = getZero32();
297081ad6265SDimitry Andric             HaveCarryOut = false;
297181ad6265SDimitry Andric           }
297281ad6265SDimitry Andric         }
297381ad6265SDimitry Andric 
297481ad6265SDimitry Andric         auto Add =
297581ad6265SDimitry Andric             B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
297681ad6265SDimitry Andric         LocalAccum = Add.getReg(0);
297781ad6265SDimitry Andric         return HaveCarryOut ? Add.getReg(1) : Register();
297881ad6265SDimitry Andric       };
297981ad6265SDimitry Andric 
298081ad6265SDimitry Andric   // Build a multiply-add chain to compute
298181ad6265SDimitry Andric   //
298281ad6265SDimitry Andric   //   LocalAccum + (partial products at DstIndex)
298381ad6265SDimitry Andric   //       + (opportunistic subset of CarryIn)
298481ad6265SDimitry Andric   //
298581ad6265SDimitry Andric   // LocalAccum is an array of one or two 32-bit registers that are updated
298681ad6265SDimitry Andric   // in-place. The incoming registers may be null.
298781ad6265SDimitry Andric   //
298881ad6265SDimitry Andric   // In some edge cases, carry-ins can be consumed "for free". In that case,
298981ad6265SDimitry Andric   // the consumed carry bits are removed from CarryIn in-place.
299081ad6265SDimitry Andric   auto buildMadChain =
299181ad6265SDimitry Andric       [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
299281ad6265SDimitry Andric           -> Carry {
299381ad6265SDimitry Andric         assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
299481ad6265SDimitry Andric                (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
299581ad6265SDimitry Andric 
299681ad6265SDimitry Andric         Carry CarryOut;
299781ad6265SDimitry Andric         unsigned j0 = 0;
299881ad6265SDimitry Andric 
299981ad6265SDimitry Andric         // Use plain 32-bit multiplication for the most significant part of the
300081ad6265SDimitry Andric         // result by default.
300181ad6265SDimitry Andric         if (LocalAccum.size() == 1 &&
300281ad6265SDimitry Andric             (!UsePartialMad64_32 || !CarryIn.empty())) {
300381ad6265SDimitry Andric           do {
300481ad6265SDimitry Andric             unsigned j1 = DstIndex - j0;
300581ad6265SDimitry Andric             auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
300681ad6265SDimitry Andric             if (!LocalAccum[0]) {
300781ad6265SDimitry Andric               LocalAccum[0] = Mul.getReg(0);
300881ad6265SDimitry Andric             } else {
300981ad6265SDimitry Andric               if (CarryIn.empty()) {
301081ad6265SDimitry Andric                 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
301181ad6265SDimitry Andric               } else {
301281ad6265SDimitry Andric                 LocalAccum[0] =
301381ad6265SDimitry Andric                     B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
301481ad6265SDimitry Andric                         .getReg(0);
301581ad6265SDimitry Andric                 CarryIn.pop_back();
301681ad6265SDimitry Andric               }
301781ad6265SDimitry Andric             }
301881ad6265SDimitry Andric             ++j0;
301981ad6265SDimitry Andric           } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
302081ad6265SDimitry Andric         }
302181ad6265SDimitry Andric 
302281ad6265SDimitry Andric         // Build full 64-bit multiplies.
302381ad6265SDimitry Andric         if (j0 <= DstIndex) {
302481ad6265SDimitry Andric           bool HaveSmallAccum = false;
302581ad6265SDimitry Andric           Register Tmp;
302681ad6265SDimitry Andric 
302781ad6265SDimitry Andric           if (LocalAccum[0]) {
302881ad6265SDimitry Andric             if (LocalAccum.size() == 1) {
302981ad6265SDimitry Andric               Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
303081ad6265SDimitry Andric               HaveSmallAccum = true;
303181ad6265SDimitry Andric             } else if (LocalAccum[1]) {
303281ad6265SDimitry Andric               Tmp = B.buildMerge(S64, LocalAccum).getReg(0);
303381ad6265SDimitry Andric               HaveSmallAccum = false;
303481ad6265SDimitry Andric             } else {
303581ad6265SDimitry Andric               Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
303681ad6265SDimitry Andric               HaveSmallAccum = true;
303781ad6265SDimitry Andric             }
303881ad6265SDimitry Andric           } else {
303981ad6265SDimitry Andric             assert(LocalAccum.size() == 1 || !LocalAccum[1]);
304081ad6265SDimitry Andric             Tmp = getZero64();
304181ad6265SDimitry Andric             HaveSmallAccum = true;
304281ad6265SDimitry Andric           }
304381ad6265SDimitry Andric 
304481ad6265SDimitry Andric           do {
304581ad6265SDimitry Andric             unsigned j1 = DstIndex - j0;
304681ad6265SDimitry Andric             auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
304781ad6265SDimitry Andric                                     {Src0[j0], Src1[j1], Tmp});
304881ad6265SDimitry Andric             Tmp = Mad.getReg(0);
304981ad6265SDimitry Andric             if (!HaveSmallAccum)
305081ad6265SDimitry Andric               CarryOut.push_back(Mad.getReg(1));
305181ad6265SDimitry Andric             HaveSmallAccum = false;
305281ad6265SDimitry Andric             ++j0;
305381ad6265SDimitry Andric           } while (j0 <= DstIndex);
305481ad6265SDimitry Andric 
305581ad6265SDimitry Andric           auto Unmerge = B.buildUnmerge(S32, Tmp);
305681ad6265SDimitry Andric           LocalAccum[0] = Unmerge.getReg(0);
305781ad6265SDimitry Andric           if (LocalAccum.size() > 1)
305881ad6265SDimitry Andric             LocalAccum[1] = Unmerge.getReg(1);
305981ad6265SDimitry Andric         }
306081ad6265SDimitry Andric 
306181ad6265SDimitry Andric         return CarryOut;
306281ad6265SDimitry Andric       };
306381ad6265SDimitry Andric 
306481ad6265SDimitry Andric   // Outer multiply loop, iterating over destination parts from least
306581ad6265SDimitry Andric   // significant to most significant parts.
306681ad6265SDimitry Andric   //
306781ad6265SDimitry Andric   // The columns of the following diagram correspond to the destination parts
306881ad6265SDimitry Andric   // affected by one iteration of the outer loop (ignoring boundary
306981ad6265SDimitry Andric   // conditions).
307081ad6265SDimitry Andric   //
307181ad6265SDimitry Andric   //   Dest index relative to 2 * i:      1 0 -1
307281ad6265SDimitry Andric   //                                      ------
307381ad6265SDimitry Andric   //   Carries from previous iteration:     e o
307481ad6265SDimitry Andric   //   Even-aligned partial product sum:  E E .
307581ad6265SDimitry Andric   //   Odd-aligned partial product sum:     O O
307681ad6265SDimitry Andric   //
307781ad6265SDimitry Andric   // 'o' is OddCarry, 'e' is EvenCarry.
307881ad6265SDimitry Andric   // EE and OO are computed from partial products via buildMadChain and use
307981ad6265SDimitry Andric   // accumulation where possible and appropriate.
308081ad6265SDimitry Andric   //
308181ad6265SDimitry Andric   Register SeparateOddCarry;
308281ad6265SDimitry Andric   Carry EvenCarry;
308381ad6265SDimitry Andric   Carry OddCarry;
308481ad6265SDimitry Andric 
308581ad6265SDimitry Andric   for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
308681ad6265SDimitry Andric     Carry OddCarryIn = std::move(OddCarry);
308781ad6265SDimitry Andric     Carry EvenCarryIn = std::move(EvenCarry);
308881ad6265SDimitry Andric     OddCarry.clear();
308981ad6265SDimitry Andric     EvenCarry.clear();
309081ad6265SDimitry Andric 
309181ad6265SDimitry Andric     // Partial products at offset 2 * i.
309281ad6265SDimitry Andric     if (2 * i < Accum.size()) {
309381ad6265SDimitry Andric       auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
309481ad6265SDimitry Andric       EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
309581ad6265SDimitry Andric     }
309681ad6265SDimitry Andric 
309781ad6265SDimitry Andric     // Partial products at offset 2 * i - 1.
309881ad6265SDimitry Andric     if (i > 0) {
309981ad6265SDimitry Andric       if (!SeparateOddAlignedProducts) {
310081ad6265SDimitry Andric         auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
310181ad6265SDimitry Andric         OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
310281ad6265SDimitry Andric       } else {
310381ad6265SDimitry Andric         bool IsHighest = 2 * i >= Accum.size();
310481ad6265SDimitry Andric         Register SeparateOddOut[2];
310581ad6265SDimitry Andric         auto LocalAccum = makeMutableArrayRef(SeparateOddOut)
310681ad6265SDimitry Andric                               .take_front(IsHighest ? 1 : 2);
310781ad6265SDimitry Andric         OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
310881ad6265SDimitry Andric 
310981ad6265SDimitry Andric         MachineInstr *Lo;
311081ad6265SDimitry Andric 
311181ad6265SDimitry Andric         if (i == 1) {
311281ad6265SDimitry Andric           if (!IsHighest)
311381ad6265SDimitry Andric             Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
311481ad6265SDimitry Andric           else
311581ad6265SDimitry Andric             Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
311681ad6265SDimitry Andric         } else {
311781ad6265SDimitry Andric           Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
311881ad6265SDimitry Andric                             SeparateOddCarry);
311981ad6265SDimitry Andric         }
312081ad6265SDimitry Andric         Accum[2 * i - 1] = Lo->getOperand(0).getReg();
312181ad6265SDimitry Andric 
312281ad6265SDimitry Andric         if (!IsHighest) {
312381ad6265SDimitry Andric           auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
312481ad6265SDimitry Andric                                 Lo->getOperand(1).getReg());
312581ad6265SDimitry Andric           Accum[2 * i] = Hi.getReg(0);
312681ad6265SDimitry Andric           SeparateOddCarry = Hi.getReg(1);
312781ad6265SDimitry Andric         }
312881ad6265SDimitry Andric       }
312981ad6265SDimitry Andric     }
313081ad6265SDimitry Andric 
313181ad6265SDimitry Andric     // Add in the carries from the previous iteration
313281ad6265SDimitry Andric     if (i > 0) {
313381ad6265SDimitry Andric       if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
313481ad6265SDimitry Andric         EvenCarryIn.push_back(CarryOut);
313581ad6265SDimitry Andric 
313681ad6265SDimitry Andric       if (2 * i < Accum.size()) {
313781ad6265SDimitry Andric         if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
313881ad6265SDimitry Andric           OddCarry.push_back(CarryOut);
313981ad6265SDimitry Andric       }
314081ad6265SDimitry Andric     }
314181ad6265SDimitry Andric   }
314281ad6265SDimitry Andric }
314381ad6265SDimitry Andric 
314481ad6265SDimitry Andric // Custom narrowing of wide multiplies using wide multiply-add instructions.
314581ad6265SDimitry Andric //
314681ad6265SDimitry Andric // TODO: If the multiply is followed by an addition, we should attempt to
314781ad6265SDimitry Andric // integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
314881ad6265SDimitry Andric bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
314981ad6265SDimitry Andric                                       MachineInstr &MI) const {
315081ad6265SDimitry Andric   assert(ST.hasMad64_32());
315181ad6265SDimitry Andric   assert(MI.getOpcode() == TargetOpcode::G_MUL);
315281ad6265SDimitry Andric 
315381ad6265SDimitry Andric   MachineIRBuilder &B = Helper.MIRBuilder;
315481ad6265SDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
315581ad6265SDimitry Andric 
315681ad6265SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
315781ad6265SDimitry Andric   Register Src0 = MI.getOperand(1).getReg();
315881ad6265SDimitry Andric   Register Src1 = MI.getOperand(2).getReg();
315981ad6265SDimitry Andric 
316081ad6265SDimitry Andric   LLT Ty = MRI.getType(DstReg);
316181ad6265SDimitry Andric   assert(Ty.isScalar());
316281ad6265SDimitry Andric 
316381ad6265SDimitry Andric   unsigned Size = Ty.getSizeInBits();
316481ad6265SDimitry Andric   unsigned NumParts = Size / 32;
316581ad6265SDimitry Andric   assert((Size % 32) == 0);
316681ad6265SDimitry Andric   assert(NumParts >= 2);
316781ad6265SDimitry Andric 
316881ad6265SDimitry Andric   // Whether to use MAD_64_32 for partial products whose high half is
316981ad6265SDimitry Andric   // discarded. This avoids some ADD instructions but risks false dependency
317081ad6265SDimitry Andric   // stalls on some subtargets in some cases.
317181ad6265SDimitry Andric   const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
317281ad6265SDimitry Andric 
317381ad6265SDimitry Andric   // Whether to compute odd-aligned partial products separately. This is
317481ad6265SDimitry Andric   // advisable on subtargets where the accumulator of MAD_64_32 must be placed
317581ad6265SDimitry Andric   // in an even-aligned VGPR.
317681ad6265SDimitry Andric   const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
317781ad6265SDimitry Andric 
317881ad6265SDimitry Andric   LLT S32 = LLT::scalar(32);
317981ad6265SDimitry Andric   SmallVector<Register, 2> Src0Parts, Src1Parts;
318081ad6265SDimitry Andric   for (unsigned i = 0; i < NumParts; ++i) {
318181ad6265SDimitry Andric     Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
318281ad6265SDimitry Andric     Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
318381ad6265SDimitry Andric   }
318481ad6265SDimitry Andric   B.buildUnmerge(Src0Parts, Src0);
318581ad6265SDimitry Andric   B.buildUnmerge(Src1Parts, Src1);
318681ad6265SDimitry Andric 
318781ad6265SDimitry Andric   SmallVector<Register, 2> AccumRegs(NumParts);
318881ad6265SDimitry Andric   buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
318981ad6265SDimitry Andric                 SeparateOddAlignedProducts);
319081ad6265SDimitry Andric 
319181ad6265SDimitry Andric   B.buildMerge(DstReg, AccumRegs);
319281ad6265SDimitry Andric   MI.eraseFromParent();
319381ad6265SDimitry Andric   return true;
319481ad6265SDimitry Andric 
319581ad6265SDimitry Andric }
319681ad6265SDimitry Andric 
3197349cc55cSDimitry Andric // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
3198349cc55cSDimitry Andric // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
3199349cc55cSDimitry Andric // case with a single min instruction instead of a compare+select.
3200349cc55cSDimitry Andric bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
3201349cc55cSDimitry Andric                                             MachineRegisterInfo &MRI,
3202349cc55cSDimitry Andric                                             MachineIRBuilder &B) const {
3203349cc55cSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
3204349cc55cSDimitry Andric   Register Src = MI.getOperand(1).getReg();
3205349cc55cSDimitry Andric   LLT DstTy = MRI.getType(Dst);
3206349cc55cSDimitry Andric   LLT SrcTy = MRI.getType(Src);
3207349cc55cSDimitry Andric 
3208349cc55cSDimitry Andric   unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
3209349cc55cSDimitry Andric                         ? AMDGPU::G_AMDGPU_FFBH_U32
3210349cc55cSDimitry Andric                         : AMDGPU::G_AMDGPU_FFBL_B32;
3211349cc55cSDimitry Andric   auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
3212349cc55cSDimitry Andric   B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
3213349cc55cSDimitry Andric 
3214349cc55cSDimitry Andric   MI.eraseFromParent();
3215349cc55cSDimitry Andric   return true;
3216349cc55cSDimitry Andric }
3217349cc55cSDimitry Andric 
3218e8d8bef9SDimitry Andric // Check that this is a G_XOR x, -1
3219e8d8bef9SDimitry Andric static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
3220e8d8bef9SDimitry Andric   if (MI.getOpcode() != TargetOpcode::G_XOR)
3221e8d8bef9SDimitry Andric     return false;
3222349cc55cSDimitry Andric   auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
3223e8d8bef9SDimitry Andric   return ConstVal && *ConstVal == -1;
3224e8d8bef9SDimitry Andric }
3225e8d8bef9SDimitry Andric 
32260b57cec5SDimitry Andric // Return the use branch instruction, otherwise null if the usage is invalid.
3227e8d8bef9SDimitry Andric static MachineInstr *
3228e8d8bef9SDimitry Andric verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
3229e8d8bef9SDimitry Andric                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
32300b57cec5SDimitry Andric   Register CondDef = MI.getOperand(0).getReg();
32310b57cec5SDimitry Andric   if (!MRI.hasOneNonDBGUse(CondDef))
32320b57cec5SDimitry Andric     return nullptr;
32330b57cec5SDimitry Andric 
32345ffd83dbSDimitry Andric   MachineBasicBlock *Parent = MI.getParent();
3235e8d8bef9SDimitry Andric   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
3236e8d8bef9SDimitry Andric 
3237e8d8bef9SDimitry Andric   if (isNot(MRI, *UseMI)) {
3238e8d8bef9SDimitry Andric     Register NegatedCond = UseMI->getOperand(0).getReg();
3239e8d8bef9SDimitry Andric     if (!MRI.hasOneNonDBGUse(NegatedCond))
3240e8d8bef9SDimitry Andric       return nullptr;
3241e8d8bef9SDimitry Andric 
3242e8d8bef9SDimitry Andric     // We're deleting the def of this value, so we need to remove it.
3243349cc55cSDimitry Andric     eraseInstr(*UseMI, MRI);
3244e8d8bef9SDimitry Andric 
3245e8d8bef9SDimitry Andric     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
3246e8d8bef9SDimitry Andric     Negated = true;
3247e8d8bef9SDimitry Andric   }
3248e8d8bef9SDimitry Andric 
3249e8d8bef9SDimitry Andric   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
3250480093f4SDimitry Andric     return nullptr;
3251480093f4SDimitry Andric 
32525ffd83dbSDimitry Andric   // Make sure the cond br is followed by a G_BR, or is the last instruction.
3253e8d8bef9SDimitry Andric   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
32545ffd83dbSDimitry Andric   if (Next == Parent->end()) {
32555ffd83dbSDimitry Andric     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
32565ffd83dbSDimitry Andric     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
32575ffd83dbSDimitry Andric       return nullptr;
32585ffd83dbSDimitry Andric     UncondBrTarget = &*NextMBB;
32595ffd83dbSDimitry Andric   } else {
3260480093f4SDimitry Andric     if (Next->getOpcode() != AMDGPU::G_BR)
3261480093f4SDimitry Andric       return nullptr;
3262480093f4SDimitry Andric     Br = &*Next;
32635ffd83dbSDimitry Andric     UncondBrTarget = Br->getOperand(0).getMBB();
3264480093f4SDimitry Andric   }
3265480093f4SDimitry Andric 
3266e8d8bef9SDimitry Andric   return UseMI;
32670b57cec5SDimitry Andric }
32680b57cec5SDimitry Andric 
32690b57cec5SDimitry Andric bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
3270e8d8bef9SDimitry Andric                                          const ArgDescriptor *Arg,
3271e8d8bef9SDimitry Andric                                          const TargetRegisterClass *ArgRC,
3272e8d8bef9SDimitry Andric                                          LLT ArgTy) const {
3273e8d8bef9SDimitry Andric   MCRegister SrcReg = Arg->getRegister();
3274e8d8bef9SDimitry Andric   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
32755ffd83dbSDimitry Andric   assert(DstReg.isVirtual() && "Virtual register expected");
32760b57cec5SDimitry Andric 
327704eeddc0SDimitry Andric   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
327804eeddc0SDimitry Andric                                              *ArgRC, B.getDebugLoc(), ArgTy);
32790b57cec5SDimitry Andric   if (Arg->isMasked()) {
32800b57cec5SDimitry Andric     // TODO: Should we try to emit this once in the entry block?
32810b57cec5SDimitry Andric     const LLT S32 = LLT::scalar(32);
32820b57cec5SDimitry Andric     const unsigned Mask = Arg->getMask();
32830b57cec5SDimitry Andric     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
32840b57cec5SDimitry Andric 
32858bcb0991SDimitry Andric     Register AndMaskSrc = LiveIn;
32868bcb0991SDimitry Andric 
328704eeddc0SDimitry Andric     // TODO: Avoid clearing the high bits if we know workitem id y/z are always
328804eeddc0SDimitry Andric     // 0.
32898bcb0991SDimitry Andric     if (Shift != 0) {
32900b57cec5SDimitry Andric       auto ShiftAmt = B.buildConstant(S32, Shift);
32918bcb0991SDimitry Andric       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
32928bcb0991SDimitry Andric     }
32938bcb0991SDimitry Andric 
32948bcb0991SDimitry Andric     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
32955ffd83dbSDimitry Andric   } else {
32960b57cec5SDimitry Andric     B.buildCopy(DstReg, LiveIn);
32970b57cec5SDimitry Andric   }
32980b57cec5SDimitry Andric 
32990b57cec5SDimitry Andric   return true;
33000b57cec5SDimitry Andric }
33010b57cec5SDimitry Andric 
3302e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::loadInputValue(
3303e8d8bef9SDimitry Andric     Register DstReg, MachineIRBuilder &B,
3304e8d8bef9SDimitry Andric     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
3305e8d8bef9SDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3306e8d8bef9SDimitry Andric   const ArgDescriptor *Arg;
3307e8d8bef9SDimitry Andric   const TargetRegisterClass *ArgRC;
3308e8d8bef9SDimitry Andric   LLT ArgTy;
3309e8d8bef9SDimitry Andric   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
3310e8d8bef9SDimitry Andric 
3311349cc55cSDimitry Andric   if (!Arg) {
3312349cc55cSDimitry Andric     if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
3313349cc55cSDimitry Andric       // The intrinsic may appear when we have a 0 sized kernarg segment, in which
3314349cc55cSDimitry Andric       // case the pointer argument may be missing and we use null.
3315349cc55cSDimitry Andric       B.buildConstant(DstReg, 0);
3316349cc55cSDimitry Andric       return true;
3317349cc55cSDimitry Andric     }
3318349cc55cSDimitry Andric 
3319349cc55cSDimitry Andric     // It's undefined behavior if a function marked with the amdgpu-no-*
3320349cc55cSDimitry Andric     // attributes uses the corresponding intrinsic.
3321349cc55cSDimitry Andric     B.buildUndef(DstReg);
3322349cc55cSDimitry Andric     return true;
3323349cc55cSDimitry Andric   }
3324349cc55cSDimitry Andric 
3325e8d8bef9SDimitry Andric   if (!Arg->isRegister() || !Arg->getRegister().isValid())
3326e8d8bef9SDimitry Andric     return false; // TODO: Handle these
3327e8d8bef9SDimitry Andric   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
3328e8d8bef9SDimitry Andric }
3329e8d8bef9SDimitry Andric 
33300b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
33315ffd83dbSDimitry Andric     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
33320b57cec5SDimitry Andric     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
3333e8d8bef9SDimitry Andric   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
33345ffd83dbSDimitry Andric     return false;
33355ffd83dbSDimitry Andric 
33360b57cec5SDimitry Andric   MI.eraseFromParent();
33370b57cec5SDimitry Andric   return true;
33380b57cec5SDimitry Andric }
33390b57cec5SDimitry Andric 
334081ad6265SDimitry Andric static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
334181ad6265SDimitry Andric                                 int64_t C) {
334281ad6265SDimitry Andric   B.buildConstant(MI.getOperand(0).getReg(), C);
334381ad6265SDimitry Andric   MI.eraseFromParent();
334481ad6265SDimitry Andric   return true;
334581ad6265SDimitry Andric }
334681ad6265SDimitry Andric 
334781ad6265SDimitry Andric bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
334881ad6265SDimitry Andric     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
334981ad6265SDimitry Andric     unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
335081ad6265SDimitry Andric   unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
335181ad6265SDimitry Andric   if (MaxID == 0)
335281ad6265SDimitry Andric     return replaceWithConstant(B, MI, 0);
335381ad6265SDimitry Andric 
335481ad6265SDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
335581ad6265SDimitry Andric   const ArgDescriptor *Arg;
335681ad6265SDimitry Andric   const TargetRegisterClass *ArgRC;
335781ad6265SDimitry Andric   LLT ArgTy;
335881ad6265SDimitry Andric   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
335981ad6265SDimitry Andric 
336081ad6265SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
336181ad6265SDimitry Andric   if (!Arg) {
336281ad6265SDimitry Andric     // It's undefined behavior if a function marked with the amdgpu-no-*
336381ad6265SDimitry Andric     // attributes uses the corresponding intrinsic.
336481ad6265SDimitry Andric     B.buildUndef(DstReg);
336581ad6265SDimitry Andric     MI.eraseFromParent();
336681ad6265SDimitry Andric     return true;
336781ad6265SDimitry Andric   }
336881ad6265SDimitry Andric 
336981ad6265SDimitry Andric   if (Arg->isMasked()) {
337081ad6265SDimitry Andric     // Don't bother inserting AssertZext for packed IDs since we're emitting the
337181ad6265SDimitry Andric     // masking operations anyway.
337281ad6265SDimitry Andric     //
337381ad6265SDimitry Andric     // TODO: We could assert the top bit is 0 for the source copy.
337481ad6265SDimitry Andric     if (!loadInputValue(DstReg, B, ArgType))
337581ad6265SDimitry Andric       return false;
337681ad6265SDimitry Andric   } else {
337781ad6265SDimitry Andric     Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
337881ad6265SDimitry Andric     if (!loadInputValue(TmpReg, B, ArgType))
337981ad6265SDimitry Andric       return false;
338081ad6265SDimitry Andric     B.buildAssertZExt(DstReg, TmpReg, 32 - countLeadingZeros(MaxID));
338181ad6265SDimitry Andric   }
338281ad6265SDimitry Andric 
338381ad6265SDimitry Andric   MI.eraseFromParent();
338481ad6265SDimitry Andric   return true;
338581ad6265SDimitry Andric }
338681ad6265SDimitry Andric 
338781ad6265SDimitry Andric Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
338881ad6265SDimitry Andric                                                      int64_t Offset) const {
338981ad6265SDimitry Andric   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
339081ad6265SDimitry Andric   Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
339181ad6265SDimitry Andric 
339281ad6265SDimitry Andric   // TODO: If we passed in the base kernel offset we could have a better
339381ad6265SDimitry Andric   // alignment than 4, but we don't really need it.
339481ad6265SDimitry Andric   if (!loadInputValue(KernArgReg, B,
339581ad6265SDimitry Andric                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
339681ad6265SDimitry Andric     llvm_unreachable("failed to find kernarg segment ptr");
339781ad6265SDimitry Andric 
339881ad6265SDimitry Andric   auto COffset = B.buildConstant(LLT::scalar(64), Offset);
339981ad6265SDimitry Andric   // TODO: Should get nuw
340081ad6265SDimitry Andric   return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
340181ad6265SDimitry Andric }
340281ad6265SDimitry Andric 
340381ad6265SDimitry Andric /// Legalize a value that's loaded from kernel arguments. This is only used by
340481ad6265SDimitry Andric /// legacy intrinsics.
340581ad6265SDimitry Andric bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
340681ad6265SDimitry Andric                                                       MachineIRBuilder &B,
340781ad6265SDimitry Andric                                                       uint64_t Offset,
340881ad6265SDimitry Andric                                                       Align Alignment) const {
340981ad6265SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
341081ad6265SDimitry Andric 
341181ad6265SDimitry Andric   assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
341281ad6265SDimitry Andric          "unexpected kernarg parameter type");
341381ad6265SDimitry Andric 
341481ad6265SDimitry Andric   Register Ptr = getKernargParameterPtr(B, Offset);
341581ad6265SDimitry Andric   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
341681ad6265SDimitry Andric   B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
341781ad6265SDimitry Andric               MachineMemOperand::MODereferenceable |
341881ad6265SDimitry Andric                   MachineMemOperand::MOInvariant);
341981ad6265SDimitry Andric   MI.eraseFromParent();
342081ad6265SDimitry Andric   return true;
342181ad6265SDimitry Andric }
342281ad6265SDimitry Andric 
34238bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
34248bcb0991SDimitry Andric                                        MachineRegisterInfo &MRI,
34258bcb0991SDimitry Andric                                        MachineIRBuilder &B) const {
3426480093f4SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
3427480093f4SDimitry Andric   LLT DstTy = MRI.getType(Dst);
3428480093f4SDimitry Andric   LLT S16 = LLT::scalar(16);
3429480093f4SDimitry Andric   LLT S32 = LLT::scalar(32);
3430480093f4SDimitry Andric   LLT S64 = LLT::scalar(64);
34318bcb0991SDimitry Andric 
3432480093f4SDimitry Andric   if (DstTy == S16)
3433480093f4SDimitry Andric     return legalizeFDIV16(MI, MRI, B);
3434480093f4SDimitry Andric   if (DstTy == S32)
3435480093f4SDimitry Andric     return legalizeFDIV32(MI, MRI, B);
3436480093f4SDimitry Andric   if (DstTy == S64)
3437480093f4SDimitry Andric     return legalizeFDIV64(MI, MRI, B);
3438480093f4SDimitry Andric 
34398bcb0991SDimitry Andric   return false;
34408bcb0991SDimitry Andric }
34418bcb0991SDimitry Andric 
3442fe6060f1SDimitry Andric void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
3443fe6060f1SDimitry Andric                                                         Register DstDivReg,
3444fe6060f1SDimitry Andric                                                         Register DstRemReg,
34455ffd83dbSDimitry Andric                                                         Register X,
3446fe6060f1SDimitry Andric                                                         Register Y) const {
34475ffd83dbSDimitry Andric   const LLT S1 = LLT::scalar(1);
34485ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
34495ffd83dbSDimitry Andric 
34505ffd83dbSDimitry Andric   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
34515ffd83dbSDimitry Andric   // algorithm used here.
34525ffd83dbSDimitry Andric 
34535ffd83dbSDimitry Andric   // Initial estimate of inv(y).
34545ffd83dbSDimitry Andric   auto FloatY = B.buildUITOFP(S32, Y);
34555ffd83dbSDimitry Andric   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
34565ffd83dbSDimitry Andric   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
34575ffd83dbSDimitry Andric   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
34585ffd83dbSDimitry Andric   auto Z = B.buildFPTOUI(S32, ScaledY);
34595ffd83dbSDimitry Andric 
34605ffd83dbSDimitry Andric   // One round of UNR.
34615ffd83dbSDimitry Andric   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
34625ffd83dbSDimitry Andric   auto NegYZ = B.buildMul(S32, NegY, Z);
34635ffd83dbSDimitry Andric   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
34645ffd83dbSDimitry Andric 
34655ffd83dbSDimitry Andric   // Quotient/remainder estimate.
34665ffd83dbSDimitry Andric   auto Q = B.buildUMulH(S32, X, Z);
34675ffd83dbSDimitry Andric   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
34685ffd83dbSDimitry Andric 
34695ffd83dbSDimitry Andric   // First quotient/remainder refinement.
34705ffd83dbSDimitry Andric   auto One = B.buildConstant(S32, 1);
34715ffd83dbSDimitry Andric   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
3472fe6060f1SDimitry Andric   if (DstDivReg)
34735ffd83dbSDimitry Andric     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
34745ffd83dbSDimitry Andric   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
34755ffd83dbSDimitry Andric 
34765ffd83dbSDimitry Andric   // Second quotient/remainder refinement.
34775ffd83dbSDimitry Andric   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
3478fe6060f1SDimitry Andric   if (DstDivReg)
3479fe6060f1SDimitry Andric     B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
34805ffd83dbSDimitry Andric 
3481fe6060f1SDimitry Andric   if (DstRemReg)
3482fe6060f1SDimitry Andric     B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
34835ffd83dbSDimitry Andric }
34845ffd83dbSDimitry Andric 
3485349cc55cSDimitry Andric // Build integer reciprocal sequence around V_RCP_IFLAG_F32
34865ffd83dbSDimitry Andric //
34875ffd83dbSDimitry Andric // Return lo, hi of result
34885ffd83dbSDimitry Andric //
34895ffd83dbSDimitry Andric // %cvt.lo = G_UITOFP Val.lo
34905ffd83dbSDimitry Andric // %cvt.hi = G_UITOFP Val.hi
34915ffd83dbSDimitry Andric // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
34925ffd83dbSDimitry Andric // %rcp = G_AMDGPU_RCP_IFLAG %mad
34935ffd83dbSDimitry Andric // %mul1 = G_FMUL %rcp, 0x5f7ffffc
34945ffd83dbSDimitry Andric // %mul2 = G_FMUL %mul1, 2**(-32)
34955ffd83dbSDimitry Andric // %trunc = G_INTRINSIC_TRUNC %mul2
34965ffd83dbSDimitry Andric // %mad2 = G_FMAD %trunc, -(2**32), %mul1
34975ffd83dbSDimitry Andric // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
34985ffd83dbSDimitry Andric static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
34995ffd83dbSDimitry Andric                                                        Register Val) {
35005ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
35015ffd83dbSDimitry Andric   auto Unmerge = B.buildUnmerge(S32, Val);
35025ffd83dbSDimitry Andric 
35035ffd83dbSDimitry Andric   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
35045ffd83dbSDimitry Andric   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
35055ffd83dbSDimitry Andric 
35065ffd83dbSDimitry Andric   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
35075ffd83dbSDimitry Andric                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
35085ffd83dbSDimitry Andric 
35095ffd83dbSDimitry Andric   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
35105ffd83dbSDimitry Andric   auto Mul1 =
35115ffd83dbSDimitry Andric       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
35125ffd83dbSDimitry Andric 
35135ffd83dbSDimitry Andric   // 2**(-32)
35145ffd83dbSDimitry Andric   auto Mul2 =
35155ffd83dbSDimitry Andric       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
35165ffd83dbSDimitry Andric   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
35175ffd83dbSDimitry Andric 
35185ffd83dbSDimitry Andric   // -(2**32)
35195ffd83dbSDimitry Andric   auto Mad2 = B.buildFMAD(S32, Trunc,
35205ffd83dbSDimitry Andric                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
35215ffd83dbSDimitry Andric 
35225ffd83dbSDimitry Andric   auto ResultLo = B.buildFPTOUI(S32, Mad2);
35235ffd83dbSDimitry Andric   auto ResultHi = B.buildFPTOUI(S32, Trunc);
35245ffd83dbSDimitry Andric 
35255ffd83dbSDimitry Andric   return {ResultLo.getReg(0), ResultHi.getReg(0)};
35265ffd83dbSDimitry Andric }
35275ffd83dbSDimitry Andric 
// Expand a 64-bit unsigned division and/or remainder. The quotient is
// formed from a fixed-point reciprocal estimate of the denominator that is
// refined twice, followed by up to two correction steps on the candidate
// quotient/remainder. Either destination register may be invalid, in which
// case that result is simply not emitted.
void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
                                                        Register DstDivReg,
                                                        Register DstRemReg,
                                                        Register Numer,
                                                        Register Denom) const {
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S1 = LLT::scalar(1);
  Register RcpLo, RcpHi;

  // Initial reciprocal estimate of Denom, produced as two 32-bit halves.
  std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);

  auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});

  auto Zero64 = B.buildConstant(S64, 0);
  auto NegDenom = B.buildSub(S64, Zero64, Denom);

  // First refinement step: Rcp += mulhu(Rcp, -Denom * Rcp), performed in
  // 32-bit pieces with an explicit carry chain.
  auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
  auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);

  auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
  Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
  Register MulHi1_Hi = UnmergeMulHi1.getReg(1);

  auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
  auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
  auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});

  // Second refinement step on the improved estimate Add1.
  auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
  auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
  auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
  Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
  Register MulHi2_Hi = UnmergeMulHi2.getReg(1);

  auto Zero32 = B.buildConstant(S32, 0);
  auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
  auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
  auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});

  auto UnmergeNumer = B.buildUnmerge(S32, Numer);
  Register NumerLo = UnmergeNumer.getReg(0);
  Register NumerHi = UnmergeNumer.getReg(1);

  // Candidate quotient (MulHi3) and remainder (Sub1 = Numer - Denom*q).
  auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
  auto Mul3 = B.buildMul(S64, Denom, MulHi3);
  auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
  Register Mul3_Lo = UnmergeMul3.getReg(0);
  Register Mul3_Hi = UnmergeMul3.getReg(1);
  auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
  auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
  // Sub1_Mi is the high half WITHOUT the borrow from the low half; the
  // later carry chains re-apply that borrow explicitly.
  auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
  auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});

  auto UnmergeDenom = B.buildUnmerge(S32, Denom);
  Register DenomLo = UnmergeDenom.getReg(0);
  Register DenomHi = UnmergeDenom.getReg(1);

  // C3 != 0 iff the remainder candidate is still >= Denom (compare high
  // halves, falling back to the low halves when the high halves are equal).
  auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
  auto C1 = B.buildSExt(S32, CmpHi);

  auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
  auto C2 = B.buildSExt(S32, CmpLo);

  auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
  auto C3 = B.buildSelect(S32, CmpEq, C2, C1);

  // TODO: Here and below portions of the code can be enclosed into if/endif.
  // Currently control flow is unconditional and we have 4 selects after
  // potential endif to substitute PHIs.

  // if C3 != 0 ...
  // First correction: subtract Denom once more and bump the quotient.
  auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
  auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
  auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
  auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});

  auto One64 = B.buildConstant(S64, 1);
  auto Add3 = B.buildAdd(S64, MulHi3, One64);

  // C6 != 0 iff a second correction is required (same comparison scheme).
  auto C4 =
      B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
  auto C5 =
      B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
  auto C6 = B.buildSelect(
      S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);

  // if (C6 != 0)
  // Second correction: one more Denom subtraction / quotient increment.
  auto Add4 = B.buildAdd(S64, Add3, One64);
  auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);

  auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
  auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
  auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});

  // endif C6
  // endif C3

  // Select the final quotient/remainder from the uncorrected, once-
  // corrected, and twice-corrected candidates.
  if (DstDivReg) {
    auto Sel1 = B.buildSelect(
        S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
    B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
                  Sel1, MulHi3);
  }

  if (DstRemReg) {
    auto Sel2 = B.buildSelect(
        S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
    B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
                  Sel2, Sub1);
  }
}
36395ffd83dbSDimitry Andric 
3640fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
36415ffd83dbSDimitry Andric                                                   MachineRegisterInfo &MRI,
36425ffd83dbSDimitry Andric                                                   MachineIRBuilder &B) const {
3643fe6060f1SDimitry Andric   Register DstDivReg, DstRemReg;
3644fe6060f1SDimitry Andric   switch (MI.getOpcode()) {
3645fe6060f1SDimitry Andric   default:
3646fe6060f1SDimitry Andric     llvm_unreachable("Unexpected opcode!");
3647fe6060f1SDimitry Andric   case AMDGPU::G_UDIV: {
3648fe6060f1SDimitry Andric     DstDivReg = MI.getOperand(0).getReg();
3649fe6060f1SDimitry Andric     break;
3650fe6060f1SDimitry Andric   }
3651fe6060f1SDimitry Andric   case AMDGPU::G_UREM: {
3652fe6060f1SDimitry Andric     DstRemReg = MI.getOperand(0).getReg();
3653fe6060f1SDimitry Andric     break;
3654fe6060f1SDimitry Andric   }
3655fe6060f1SDimitry Andric   case AMDGPU::G_UDIVREM: {
3656fe6060f1SDimitry Andric     DstDivReg = MI.getOperand(0).getReg();
3657fe6060f1SDimitry Andric     DstRemReg = MI.getOperand(1).getReg();
3658fe6060f1SDimitry Andric     break;
3659fe6060f1SDimitry Andric   }
3660fe6060f1SDimitry Andric   }
3661fe6060f1SDimitry Andric 
36625ffd83dbSDimitry Andric   const LLT S64 = LLT::scalar(64);
36635ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
3664fe6060f1SDimitry Andric   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3665fe6060f1SDimitry Andric   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3666fe6060f1SDimitry Andric   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3667fe6060f1SDimitry Andric   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
36685ffd83dbSDimitry Andric 
36695ffd83dbSDimitry Andric   if (Ty == S32)
3670fe6060f1SDimitry Andric     legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
36715ffd83dbSDimitry Andric   else if (Ty == S64)
3672fe6060f1SDimitry Andric     legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
36735ffd83dbSDimitry Andric   else
36745ffd83dbSDimitry Andric     return false;
36755ffd83dbSDimitry Andric 
36765ffd83dbSDimitry Andric   MI.eraseFromParent();
36775ffd83dbSDimitry Andric   return true;
36785ffd83dbSDimitry Andric }
36795ffd83dbSDimitry Andric 
3680fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
36815ffd83dbSDimitry Andric                                                 MachineRegisterInfo &MRI,
36825ffd83dbSDimitry Andric                                                 MachineIRBuilder &B) const {
36835ffd83dbSDimitry Andric   const LLT S64 = LLT::scalar(64);
36845ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
36855ffd83dbSDimitry Andric 
3686fe6060f1SDimitry Andric   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
36875ffd83dbSDimitry Andric   if (Ty != S32 && Ty != S64)
36885ffd83dbSDimitry Andric     return false;
36895ffd83dbSDimitry Andric 
3690fe6060f1SDimitry Andric   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3691fe6060f1SDimitry Andric   Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
3692fe6060f1SDimitry Andric   Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
36935ffd83dbSDimitry Andric 
36945ffd83dbSDimitry Andric   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
36955ffd83dbSDimitry Andric   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
36965ffd83dbSDimitry Andric   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
36975ffd83dbSDimitry Andric 
36985ffd83dbSDimitry Andric   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
36995ffd83dbSDimitry Andric   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
37005ffd83dbSDimitry Andric 
37015ffd83dbSDimitry Andric   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
37025ffd83dbSDimitry Andric   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
37035ffd83dbSDimitry Andric 
3704fe6060f1SDimitry Andric   Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
3705fe6060f1SDimitry Andric   switch (MI.getOpcode()) {
3706fe6060f1SDimitry Andric   default:
3707fe6060f1SDimitry Andric     llvm_unreachable("Unexpected opcode!");
3708fe6060f1SDimitry Andric   case AMDGPU::G_SDIV: {
3709fe6060f1SDimitry Andric     DstDivReg = MI.getOperand(0).getReg();
3710fe6060f1SDimitry Andric     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3711fe6060f1SDimitry Andric     break;
3712fe6060f1SDimitry Andric   }
3713fe6060f1SDimitry Andric   case AMDGPU::G_SREM: {
3714fe6060f1SDimitry Andric     DstRemReg = MI.getOperand(0).getReg();
3715fe6060f1SDimitry Andric     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3716fe6060f1SDimitry Andric     break;
3717fe6060f1SDimitry Andric   }
3718fe6060f1SDimitry Andric   case AMDGPU::G_SDIVREM: {
3719fe6060f1SDimitry Andric     DstDivReg = MI.getOperand(0).getReg();
3720fe6060f1SDimitry Andric     DstRemReg = MI.getOperand(1).getReg();
3721fe6060f1SDimitry Andric     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
3722fe6060f1SDimitry Andric     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
3723fe6060f1SDimitry Andric     break;
3724fe6060f1SDimitry Andric   }
3725fe6060f1SDimitry Andric   }
3726fe6060f1SDimitry Andric 
37275ffd83dbSDimitry Andric   if (Ty == S32)
3728fe6060f1SDimitry Andric     legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
37295ffd83dbSDimitry Andric   else
3730fe6060f1SDimitry Andric     legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
37315ffd83dbSDimitry Andric 
3732fe6060f1SDimitry Andric   if (DstDivReg) {
3733fe6060f1SDimitry Andric     auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
3734fe6060f1SDimitry Andric     auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
3735fe6060f1SDimitry Andric     B.buildSub(DstDivReg, SignXor, Sign);
3736fe6060f1SDimitry Andric   }
37375ffd83dbSDimitry Andric 
3738fe6060f1SDimitry Andric   if (DstRemReg) {
3739fe6060f1SDimitry Andric     auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
3740fe6060f1SDimitry Andric     auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
3741fe6060f1SDimitry Andric     B.buildSub(DstRemReg, SignXor, Sign);
3742fe6060f1SDimitry Andric   }
37435ffd83dbSDimitry Andric 
37445ffd83dbSDimitry Andric   MI.eraseFromParent();
37455ffd83dbSDimitry Andric   return true;
37465ffd83dbSDimitry Andric }
37475ffd83dbSDimitry Andric 
37488bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
37498bcb0991SDimitry Andric                                                  MachineRegisterInfo &MRI,
37508bcb0991SDimitry Andric                                                  MachineIRBuilder &B) const {
37518bcb0991SDimitry Andric   Register Res = MI.getOperand(0).getReg();
37528bcb0991SDimitry Andric   Register LHS = MI.getOperand(1).getReg();
37538bcb0991SDimitry Andric   Register RHS = MI.getOperand(2).getReg();
37548bcb0991SDimitry Andric   uint16_t Flags = MI.getFlags();
37558bcb0991SDimitry Andric   LLT ResTy = MRI.getType(Res);
37568bcb0991SDimitry Andric 
37578bcb0991SDimitry Andric   const MachineFunction &MF = B.getMF();
3758e8d8bef9SDimitry Andric   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3759e8d8bef9SDimitry Andric                             MI.getFlag(MachineInstr::FmAfn);
37608bcb0991SDimitry Andric 
3761e8d8bef9SDimitry Andric   if (!AllowInaccurateRcp)
37628bcb0991SDimitry Andric     return false;
37638bcb0991SDimitry Andric 
37648bcb0991SDimitry Andric   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
37658bcb0991SDimitry Andric     // 1 / x -> RCP(x)
37668bcb0991SDimitry Andric     if (CLHS->isExactlyValue(1.0)) {
37678bcb0991SDimitry Andric       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
37688bcb0991SDimitry Andric         .addUse(RHS)
37698bcb0991SDimitry Andric         .setMIFlags(Flags);
37708bcb0991SDimitry Andric 
37718bcb0991SDimitry Andric       MI.eraseFromParent();
37728bcb0991SDimitry Andric       return true;
37738bcb0991SDimitry Andric     }
37748bcb0991SDimitry Andric 
37758bcb0991SDimitry Andric     // -1 / x -> RCP( FNEG(x) )
37768bcb0991SDimitry Andric     if (CLHS->isExactlyValue(-1.0)) {
37778bcb0991SDimitry Andric       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
37788bcb0991SDimitry Andric       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
37798bcb0991SDimitry Andric         .addUse(FNeg.getReg(0))
37808bcb0991SDimitry Andric         .setMIFlags(Flags);
37818bcb0991SDimitry Andric 
37828bcb0991SDimitry Andric       MI.eraseFromParent();
37838bcb0991SDimitry Andric       return true;
37848bcb0991SDimitry Andric     }
37858bcb0991SDimitry Andric   }
37868bcb0991SDimitry Andric 
37878bcb0991SDimitry Andric   // x / y -> x * (1.0 / y)
37888bcb0991SDimitry Andric   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
37898bcb0991SDimitry Andric     .addUse(RHS)
37908bcb0991SDimitry Andric     .setMIFlags(Flags);
37918bcb0991SDimitry Andric   B.buildFMul(Res, LHS, RCP, Flags);
37928bcb0991SDimitry Andric 
37938bcb0991SDimitry Andric   MI.eraseFromParent();
37948bcb0991SDimitry Andric   return true;
37958bcb0991SDimitry Andric }
37968bcb0991SDimitry Andric 
3797e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
3798e8d8bef9SDimitry Andric                                                    MachineRegisterInfo &MRI,
3799e8d8bef9SDimitry Andric                                                    MachineIRBuilder &B) const {
3800e8d8bef9SDimitry Andric   Register Res = MI.getOperand(0).getReg();
3801e8d8bef9SDimitry Andric   Register X = MI.getOperand(1).getReg();
3802e8d8bef9SDimitry Andric   Register Y = MI.getOperand(2).getReg();
3803e8d8bef9SDimitry Andric   uint16_t Flags = MI.getFlags();
3804e8d8bef9SDimitry Andric   LLT ResTy = MRI.getType(Res);
3805e8d8bef9SDimitry Andric 
3806e8d8bef9SDimitry Andric   const MachineFunction &MF = B.getMF();
3807e8d8bef9SDimitry Andric   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3808e8d8bef9SDimitry Andric                             MI.getFlag(MachineInstr::FmAfn);
3809e8d8bef9SDimitry Andric 
3810e8d8bef9SDimitry Andric   if (!AllowInaccurateRcp)
38118bcb0991SDimitry Andric     return false;
3812e8d8bef9SDimitry Andric 
3813e8d8bef9SDimitry Andric   auto NegY = B.buildFNeg(ResTy, Y);
3814e8d8bef9SDimitry Andric   auto One = B.buildFConstant(ResTy, 1.0);
3815e8d8bef9SDimitry Andric 
3816e8d8bef9SDimitry Andric   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
3817e8d8bef9SDimitry Andric     .addUse(Y)
3818e8d8bef9SDimitry Andric     .setMIFlags(Flags);
3819e8d8bef9SDimitry Andric 
3820e8d8bef9SDimitry Andric   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
3821e8d8bef9SDimitry Andric   R = B.buildFMA(ResTy, Tmp0, R, R);
3822e8d8bef9SDimitry Andric 
3823e8d8bef9SDimitry Andric   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
3824e8d8bef9SDimitry Andric   R = B.buildFMA(ResTy, Tmp1, R, R);
3825e8d8bef9SDimitry Andric 
3826e8d8bef9SDimitry Andric   auto Ret = B.buildFMul(ResTy, X, R);
3827e8d8bef9SDimitry Andric   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
3828e8d8bef9SDimitry Andric 
3829e8d8bef9SDimitry Andric   B.buildFMA(Res, Tmp2, R, Ret);
3830e8d8bef9SDimitry Andric   MI.eraseFromParent();
3831e8d8bef9SDimitry Andric   return true;
38328bcb0991SDimitry Andric }
38338bcb0991SDimitry Andric 
3834480093f4SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3835480093f4SDimitry Andric                                          MachineRegisterInfo &MRI,
3836480093f4SDimitry Andric                                          MachineIRBuilder &B) const {
3837e8d8bef9SDimitry Andric   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3838e8d8bef9SDimitry Andric     return true;
3839e8d8bef9SDimitry Andric 
3840480093f4SDimitry Andric   Register Res = MI.getOperand(0).getReg();
3841480093f4SDimitry Andric   Register LHS = MI.getOperand(1).getReg();
3842480093f4SDimitry Andric   Register RHS = MI.getOperand(2).getReg();
3843480093f4SDimitry Andric 
3844480093f4SDimitry Andric   uint16_t Flags = MI.getFlags();
3845480093f4SDimitry Andric 
3846480093f4SDimitry Andric   LLT S16 = LLT::scalar(16);
3847480093f4SDimitry Andric   LLT S32 = LLT::scalar(32);
3848480093f4SDimitry Andric 
3849480093f4SDimitry Andric   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3850480093f4SDimitry Andric   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3851480093f4SDimitry Andric 
3852480093f4SDimitry Andric   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3853480093f4SDimitry Andric     .addUse(RHSExt.getReg(0))
3854480093f4SDimitry Andric     .setMIFlags(Flags);
3855480093f4SDimitry Andric 
3856480093f4SDimitry Andric   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3857480093f4SDimitry Andric   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3858480093f4SDimitry Andric 
3859480093f4SDimitry Andric   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3860480093f4SDimitry Andric     .addUse(RDst.getReg(0))
3861480093f4SDimitry Andric     .addUse(RHS)
3862480093f4SDimitry Andric     .addUse(LHS)
3863480093f4SDimitry Andric     .setMIFlags(Flags);
3864480093f4SDimitry Andric 
3865480093f4SDimitry Andric   MI.eraseFromParent();
3866480093f4SDimitry Andric   return true;
3867480093f4SDimitry Andric }
3868480093f4SDimitry Andric 
3869480093f4SDimitry Andric // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3870480093f4SDimitry Andric // to enable denorm mode. When 'Enable' is false, disable denorm mode.
3871480093f4SDimitry Andric static void toggleSPDenormMode(bool Enable,
3872480093f4SDimitry Andric                                MachineIRBuilder &B,
3873480093f4SDimitry Andric                                const GCNSubtarget &ST,
3874480093f4SDimitry Andric                                AMDGPU::SIModeRegisterDefaults Mode) {
3875480093f4SDimitry Andric   // Set SP denorm mode to this value.
3876480093f4SDimitry Andric   unsigned SPDenormMode =
38775ffd83dbSDimitry Andric     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3878480093f4SDimitry Andric 
3879480093f4SDimitry Andric   if (ST.hasDenormModeInst()) {
3880480093f4SDimitry Andric     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
38815ffd83dbSDimitry Andric     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3882480093f4SDimitry Andric 
38835ffd83dbSDimitry Andric     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3884480093f4SDimitry Andric     B.buildInstr(AMDGPU::S_DENORM_MODE)
3885480093f4SDimitry Andric       .addImm(NewDenormModeValue);
3886480093f4SDimitry Andric 
3887480093f4SDimitry Andric   } else {
3888480093f4SDimitry Andric     // Select FP32 bit field in mode register.
3889480093f4SDimitry Andric     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3890480093f4SDimitry Andric                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3891480093f4SDimitry Andric                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3892480093f4SDimitry Andric 
3893480093f4SDimitry Andric     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3894480093f4SDimitry Andric       .addImm(SPDenormMode)
3895480093f4SDimitry Andric       .addImm(SPDenormModeBitField);
3896480093f4SDimitry Andric   }
3897480093f4SDimitry Andric }
3898480093f4SDimitry Andric 
// Legalize an f32 G_FDIV. Tries the fast rcp-based path first; otherwise
// emits the full div_scale / rcp / FMA-refinement / div_fmas / div_fixup
// sequence, temporarily enabling FP32 denormals around the refinement if
// the function's default mode flushes them.
bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  if (legalizeFastUnsafeFDIV(MI, MRI, B))
    return true;

  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();

  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S32, 1.0f);

  // Scale the operands with div_scale; the trailing immediate selects
  // which operand the scaled result corresponds to (0 = denominator,
  // 1 = numerator). The second (s1) result is a scale-applied flag
  // consumed by div_fmas below.
  auto DenominatorScaled =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
      .addUse(LHS)
      .addUse(RHS)
      .addImm(0)
      .setMIFlags(Flags);
  auto NumeratorScaled =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
      .addUse(LHS)
      .addUse(RHS)
      .addImm(1)
      .setMIFlags(Flags);

  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(DenominatorScaled.getReg(0))
    .setMIFlags(Flags);
  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);

  // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
  // aren't modeled as reading it.
  // Enable FP32 denormals for the FMA refinement sequence if the default
  // mode flushes them.
  if (!Mode.allFP32Denormals())
    toggleSPDenormMode(true, B, ST, Mode);

  // Refine the reciprocal (Fma0/Fma1), form a quotient estimate (Mul),
  // then refine the quotient (Fma2..Fma4).
  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

  // Restore the default FP32 denorm mode.
  if (!Mode.allFP32Denormals())
    toggleSPDenormMode(false, B, ST, Mode);

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
    .addUse(Fma4.getReg(0))
    .addUse(Fma1.getReg(0))
    .addUse(Fma3.getReg(0))
    .addUse(NumeratorScaled.getReg(1))
    .setMIFlags(Flags);

  // div_fixup undoes the scaling and patches up special-case inputs using
  // the original operands.
  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
    .addUse(Fmas.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}
3967480093f4SDimitry Andric 
// Legalize an f64 G_FDIV. Tries the fast rcp-based path first; otherwise
// emits the precise div_scale / rcp / FMA-refinement / div_fmas /
// div_fixup sequence, including a workaround for SI parts whose div_scale
// condition output is unusable.
bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  if (legalizeFastUnsafeFDIV64(MI, MRI, B))
    return true;

  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S64 = LLT::scalar(64);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S64, 1.0);

  // Scaled denominator (trailing immediate 0 selects the denominator).
  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
    .addUse(LHS)
    .addUse(RHS)
    .addImm(0)
    .setMIFlags(Flags);

  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);

  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
    .addUse(DivScale0.getReg(0))
    .setMIFlags(Flags);

  // Refine the reciprocal estimate with FMAs.
  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);

  // Scaled numerator (trailing immediate 1 selects the numerator).
  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
    .addUse(LHS)
    .addUse(RHS)
    .addImm(1)
    .setMIFlags(Flags);

  // Quotient estimate and a correction term on its residual.
  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);

  Register Scale;
  if (!ST.hasUsableDivScaleConditionOutput()) {
    // Workaround a hardware bug on SI where the condition output from div_scale
    // is not usable.

    LLT S32 = LLT::scalar(32);

    // Recompute the flag manually: compare the high halves of the operands
    // against the high halves of the scaled values and XOR the results.
    auto NumUnmerge = B.buildUnmerge(S32, LHS);
    auto DenUnmerge = B.buildUnmerge(S32, RHS);
    auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
    auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);

    auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
                              Scale1Unmerge.getReg(1));
    auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
                              Scale0Unmerge.getReg(1));
    Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
  } else {
    Scale = DivScale1.getReg(1);
  }

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
    .addUse(Fma4.getReg(0))
    .addUse(Fma3.getReg(0))
    .addUse(Mul.getReg(0))
    .addUse(Scale)
    .setMIFlags(Flags);

  // div_fixup undoes the scaling and patches special cases using the
  // original operands.
  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
    .addUse(Fmas.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}
4048480093f4SDimitry Andric 
40498bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
40508bcb0991SDimitry Andric                                                  MachineRegisterInfo &MRI,
40518bcb0991SDimitry Andric                                                  MachineIRBuilder &B) const {
40528bcb0991SDimitry Andric   Register Res = MI.getOperand(0).getReg();
40538bcb0991SDimitry Andric   Register LHS = MI.getOperand(2).getReg();
40548bcb0991SDimitry Andric   Register RHS = MI.getOperand(3).getReg();
40558bcb0991SDimitry Andric   uint16_t Flags = MI.getFlags();
40568bcb0991SDimitry Andric 
40578bcb0991SDimitry Andric   LLT S32 = LLT::scalar(32);
40588bcb0991SDimitry Andric   LLT S1 = LLT::scalar(1);
40598bcb0991SDimitry Andric 
40608bcb0991SDimitry Andric   auto Abs = B.buildFAbs(S32, RHS, Flags);
40618bcb0991SDimitry Andric   const APFloat C0Val(1.0f);
40628bcb0991SDimitry Andric 
40638bcb0991SDimitry Andric   auto C0 = B.buildConstant(S32, 0x6f800000);
40648bcb0991SDimitry Andric   auto C1 = B.buildConstant(S32, 0x2f800000);
40658bcb0991SDimitry Andric   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
40668bcb0991SDimitry Andric 
40678bcb0991SDimitry Andric   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
40688bcb0991SDimitry Andric   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
40698bcb0991SDimitry Andric 
40708bcb0991SDimitry Andric   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
40718bcb0991SDimitry Andric 
40728bcb0991SDimitry Andric   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
40738bcb0991SDimitry Andric     .addUse(Mul0.getReg(0))
40748bcb0991SDimitry Andric     .setMIFlags(Flags);
40758bcb0991SDimitry Andric 
40768bcb0991SDimitry Andric   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
40778bcb0991SDimitry Andric 
40788bcb0991SDimitry Andric   B.buildFMul(Res, Sel, Mul1, Flags);
40798bcb0991SDimitry Andric 
40808bcb0991SDimitry Andric   MI.eraseFromParent();
40818bcb0991SDimitry Andric   return true;
40828bcb0991SDimitry Andric }
40838bcb0991SDimitry Andric 
4084e8d8bef9SDimitry Andric // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
4085e8d8bef9SDimitry Andric // FIXME: Why do we handle this one but not other removed instructions?
4086e8d8bef9SDimitry Andric //
4087e8d8bef9SDimitry Andric // Reciprocal square root.  The clamp prevents infinite results, clamping
4088e8d8bef9SDimitry Andric // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
4089e8d8bef9SDimitry Andric // +-max_float.
4090e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
4091e8d8bef9SDimitry Andric                                                     MachineRegisterInfo &MRI,
4092e8d8bef9SDimitry Andric                                                     MachineIRBuilder &B) const {
4093e8d8bef9SDimitry Andric   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
4094e8d8bef9SDimitry Andric     return true;
4095e8d8bef9SDimitry Andric 
4096e8d8bef9SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
4097e8d8bef9SDimitry Andric   Register Src = MI.getOperand(2).getReg();
4098e8d8bef9SDimitry Andric   auto Flags = MI.getFlags();
4099e8d8bef9SDimitry Andric 
4100e8d8bef9SDimitry Andric   LLT Ty = MRI.getType(Dst);
4101e8d8bef9SDimitry Andric 
4102e8d8bef9SDimitry Andric   const fltSemantics *FltSemantics;
4103e8d8bef9SDimitry Andric   if (Ty == LLT::scalar(32))
4104e8d8bef9SDimitry Andric     FltSemantics = &APFloat::IEEEsingle();
4105e8d8bef9SDimitry Andric   else if (Ty == LLT::scalar(64))
4106e8d8bef9SDimitry Andric     FltSemantics = &APFloat::IEEEdouble();
4107e8d8bef9SDimitry Andric   else
4108e8d8bef9SDimitry Andric     return false;
4109e8d8bef9SDimitry Andric 
4110e8d8bef9SDimitry Andric   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
4111e8d8bef9SDimitry Andric     .addUse(Src)
4112e8d8bef9SDimitry Andric     .setMIFlags(Flags);
4113e8d8bef9SDimitry Andric 
4114e8d8bef9SDimitry Andric   // We don't need to concern ourselves with the snan handling difference, since
4115e8d8bef9SDimitry Andric   // the rsq quieted (or not) so use the one which will directly select.
4116e8d8bef9SDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4117e8d8bef9SDimitry Andric   const bool UseIEEE = MFI->getMode().IEEE;
4118e8d8bef9SDimitry Andric 
4119e8d8bef9SDimitry Andric   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
4120e8d8bef9SDimitry Andric   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
4121e8d8bef9SDimitry Andric                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
4122e8d8bef9SDimitry Andric 
4123e8d8bef9SDimitry Andric   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
4124e8d8bef9SDimitry Andric 
4125e8d8bef9SDimitry Andric   if (UseIEEE)
4126e8d8bef9SDimitry Andric     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
4127e8d8bef9SDimitry Andric   else
4128e8d8bef9SDimitry Andric     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
4129e8d8bef9SDimitry Andric   MI.eraseFromParent();
4130e8d8bef9SDimitry Andric   return true;
4131e8d8bef9SDimitry Andric }
4132e8d8bef9SDimitry Andric 
4133e8d8bef9SDimitry Andric static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
4134e8d8bef9SDimitry Andric   switch (IID) {
4135e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_ds_fadd:
4136e8d8bef9SDimitry Andric     return AMDGPU::G_ATOMICRMW_FADD;
4137e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_ds_fmin:
4138e8d8bef9SDimitry Andric     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
4139e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_ds_fmax:
4140e8d8bef9SDimitry Andric     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
4141e8d8bef9SDimitry Andric   default:
4142e8d8bef9SDimitry Andric     llvm_unreachable("not a DS FP intrinsic");
4143e8d8bef9SDimitry Andric   }
4144e8d8bef9SDimitry Andric }
4145e8d8bef9SDimitry Andric 
4146e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
4147e8d8bef9SDimitry Andric                                                       MachineInstr &MI,
4148e8d8bef9SDimitry Andric                                                       Intrinsic::ID IID) const {
4149e8d8bef9SDimitry Andric   GISelChangeObserver &Observer = Helper.Observer;
4150e8d8bef9SDimitry Andric   Observer.changingInstr(MI);
4151e8d8bef9SDimitry Andric 
4152e8d8bef9SDimitry Andric   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
4153e8d8bef9SDimitry Andric 
4154e8d8bef9SDimitry Andric   // The remaining operands were used to set fields in the MemOperand on
4155e8d8bef9SDimitry Andric   // construction.
4156e8d8bef9SDimitry Andric   for (int I = 6; I > 3; --I)
415781ad6265SDimitry Andric     MI.removeOperand(I);
4158e8d8bef9SDimitry Andric 
415981ad6265SDimitry Andric   MI.removeOperand(1); // Remove the intrinsic ID.
4160e8d8bef9SDimitry Andric   Observer.changedInstr(MI);
4161e8d8bef9SDimitry Andric   return true;
4162e8d8bef9SDimitry Andric }
4163e8d8bef9SDimitry Andric 
4164e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
4165e8d8bef9SDimitry Andric                                             MachineRegisterInfo &MRI,
4166e8d8bef9SDimitry Andric                                             MachineIRBuilder &B) const {
4167e8d8bef9SDimitry Andric   uint64_t Offset =
4168e8d8bef9SDimitry Andric     ST.getTargetLowering()->getImplicitParameterOffset(
4169e8d8bef9SDimitry Andric       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
4170e8d8bef9SDimitry Andric   LLT DstTy = MRI.getType(DstReg);
4171e8d8bef9SDimitry Andric   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
4172e8d8bef9SDimitry Andric 
4173e8d8bef9SDimitry Andric   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
4174e8d8bef9SDimitry Andric   if (!loadInputValue(KernargPtrReg, B,
4175e8d8bef9SDimitry Andric                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
4176e8d8bef9SDimitry Andric     return false;
4177e8d8bef9SDimitry Andric 
4178e8d8bef9SDimitry Andric   // FIXME: This should be nuw
4179e8d8bef9SDimitry Andric   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
4180e8d8bef9SDimitry Andric   return true;
4181e8d8bef9SDimitry Andric }
4182e8d8bef9SDimitry Andric 
41830b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
41840b57cec5SDimitry Andric                                                  MachineRegisterInfo &MRI,
41850b57cec5SDimitry Andric                                                  MachineIRBuilder &B) const {
41860b57cec5SDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
41870b57cec5SDimitry Andric   if (!MFI->isEntryFunction()) {
41880b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
41890b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
41900b57cec5SDimitry Andric   }
41910b57cec5SDimitry Andric 
41920b57cec5SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
4193e8d8bef9SDimitry Andric   if (!getImplicitArgPtr(DstReg, MRI, B))
41940b57cec5SDimitry Andric     return false;
41950b57cec5SDimitry Andric 
41960b57cec5SDimitry Andric   MI.eraseFromParent();
41970b57cec5SDimitry Andric   return true;
41980b57cec5SDimitry Andric }
41990b57cec5SDimitry Andric 
4200*fcaf7f86SDimitry Andric bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg,
4201*fcaf7f86SDimitry Andric                                          MachineRegisterInfo &MRI,
4202*fcaf7f86SDimitry Andric                                          MachineIRBuilder &B) const {
4203*fcaf7f86SDimitry Andric   Function &F = B.getMF().getFunction();
4204*fcaf7f86SDimitry Andric   Optional<uint32_t> KnownSize =
4205*fcaf7f86SDimitry Andric       AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
4206*fcaf7f86SDimitry Andric   if (KnownSize.has_value())
4207*fcaf7f86SDimitry Andric     B.buildConstant(DstReg, KnownSize.value());
4208*fcaf7f86SDimitry Andric   return false;
4209*fcaf7f86SDimitry Andric }
4210*fcaf7f86SDimitry Andric 
4211*fcaf7f86SDimitry Andric bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI,
4212*fcaf7f86SDimitry Andric                                               MachineRegisterInfo &MRI,
4213*fcaf7f86SDimitry Andric                                               MachineIRBuilder &B) const {
4214*fcaf7f86SDimitry Andric 
4215*fcaf7f86SDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4216*fcaf7f86SDimitry Andric   if (!MFI->isEntryFunction()) {
4217*fcaf7f86SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
4218*fcaf7f86SDimitry Andric                                       AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
4219*fcaf7f86SDimitry Andric   }
4220*fcaf7f86SDimitry Andric 
4221*fcaf7f86SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
4222*fcaf7f86SDimitry Andric   if (!getLDSKernelId(DstReg, MRI, B))
4223*fcaf7f86SDimitry Andric     return false;
4224*fcaf7f86SDimitry Andric 
4225*fcaf7f86SDimitry Andric   MI.eraseFromParent();
4226*fcaf7f86SDimitry Andric   return true;
4227*fcaf7f86SDimitry Andric }
4228*fcaf7f86SDimitry Andric 
42298bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
42308bcb0991SDimitry Andric                                               MachineRegisterInfo &MRI,
42318bcb0991SDimitry Andric                                               MachineIRBuilder &B,
42328bcb0991SDimitry Andric                                               unsigned AddrSpace) const {
42338bcb0991SDimitry Andric   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
4234e8d8bef9SDimitry Andric   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
4235e8d8bef9SDimitry Andric   Register Hi32 = Unmerge.getReg(1);
4236e8d8bef9SDimitry Andric 
42378bcb0991SDimitry Andric   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
42388bcb0991SDimitry Andric   MI.eraseFromParent();
42398bcb0991SDimitry Andric   return true;
42408bcb0991SDimitry Andric }
42418bcb0991SDimitry Andric 
42425ffd83dbSDimitry Andric // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
42435ffd83dbSDimitry Andric // offset (the offset that is included in bounds checking and swizzling, to be
42445ffd83dbSDimitry Andric // split between the instruction's voffset and immoffset fields) and soffset
42455ffd83dbSDimitry Andric // (the offset that is excluded from bounds checking and swizzling, to go in
42465ffd83dbSDimitry Andric // the instruction's soffset field).  This function takes the first kind of
42475ffd83dbSDimitry Andric // offset and figures out how to split it between voffset and immoffset.
4248fe6060f1SDimitry Andric std::pair<Register, unsigned>
42495ffd83dbSDimitry Andric AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
42505ffd83dbSDimitry Andric                                         Register OrigOffset) const {
42515ffd83dbSDimitry Andric   const unsigned MaxImm = 4095;
42525ffd83dbSDimitry Andric   Register BaseReg;
4253fe6060f1SDimitry Andric   unsigned ImmOffset;
42545ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
4255fe6060f1SDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
42565ffd83dbSDimitry Andric 
4257fe6060f1SDimitry Andric   std::tie(BaseReg, ImmOffset) =
4258fe6060f1SDimitry Andric       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
42595ffd83dbSDimitry Andric 
4260fe6060f1SDimitry Andric   // If BaseReg is a pointer, convert it to int.
4261fe6060f1SDimitry Andric   if (MRI.getType(BaseReg).isPointer())
4262fe6060f1SDimitry Andric     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
42635ffd83dbSDimitry Andric 
42645ffd83dbSDimitry Andric   // If the immediate value is too big for the immoffset field, put the value
42655ffd83dbSDimitry Andric   // and -4096 into the immoffset field so that the value that is copied/added
42665ffd83dbSDimitry Andric   // for the voffset field is a multiple of 4096, and it stands more chance
42675ffd83dbSDimitry Andric   // of being CSEd with the copy/add for another similar load/store.
42685ffd83dbSDimitry Andric   // However, do not do that rounding down to a multiple of 4096 if that is a
42695ffd83dbSDimitry Andric   // negative number, as it appears to be illegal to have a negative offset
42705ffd83dbSDimitry Andric   // in the vgpr, even if adding the immediate offset makes it positive.
42715ffd83dbSDimitry Andric   unsigned Overflow = ImmOffset & ~MaxImm;
42725ffd83dbSDimitry Andric   ImmOffset -= Overflow;
42735ffd83dbSDimitry Andric   if ((int32_t)Overflow < 0) {
42745ffd83dbSDimitry Andric     Overflow += ImmOffset;
42755ffd83dbSDimitry Andric     ImmOffset = 0;
42765ffd83dbSDimitry Andric   }
42775ffd83dbSDimitry Andric 
42785ffd83dbSDimitry Andric   if (Overflow != 0) {
42795ffd83dbSDimitry Andric     if (!BaseReg) {
42805ffd83dbSDimitry Andric       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
42815ffd83dbSDimitry Andric     } else {
42825ffd83dbSDimitry Andric       auto OverflowVal = B.buildConstant(S32, Overflow);
42835ffd83dbSDimitry Andric       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
42845ffd83dbSDimitry Andric     }
42855ffd83dbSDimitry Andric   }
42865ffd83dbSDimitry Andric 
42875ffd83dbSDimitry Andric   if (!BaseReg)
42885ffd83dbSDimitry Andric     BaseReg = B.buildConstant(S32, 0).getReg(0);
42895ffd83dbSDimitry Andric 
4290fe6060f1SDimitry Andric   return std::make_pair(BaseReg, ImmOffset);
4291fe6060f1SDimitry Andric }
4292fe6060f1SDimitry Andric 
4293fe6060f1SDimitry Andric /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
4294fe6060f1SDimitry Andric void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
4295fe6060f1SDimitry Andric                                           Register VOffset, Register SOffset,
4296fe6060f1SDimitry Andric                                           unsigned ImmOffset, Register VIndex,
4297fe6060f1SDimitry Andric                                           MachineRegisterInfo &MRI) const {
4298fe6060f1SDimitry Andric   Optional<ValueAndVReg> MaybeVOffsetVal =
4299349cc55cSDimitry Andric       getIConstantVRegValWithLookThrough(VOffset, MRI);
4300fe6060f1SDimitry Andric   Optional<ValueAndVReg> MaybeSOffsetVal =
4301349cc55cSDimitry Andric       getIConstantVRegValWithLookThrough(SOffset, MRI);
4302fe6060f1SDimitry Andric   Optional<ValueAndVReg> MaybeVIndexVal =
4303349cc55cSDimitry Andric       getIConstantVRegValWithLookThrough(VIndex, MRI);
4304fe6060f1SDimitry Andric   // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
4305fe6060f1SDimitry Andric   // update the MMO with that offset. The stride is unknown so we can only do
4306fe6060f1SDimitry Andric   // this if VIndex is constant 0.
4307fe6060f1SDimitry Andric   if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
4308fe6060f1SDimitry Andric       MaybeVIndexVal->Value == 0) {
4309fe6060f1SDimitry Andric     uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
4310fe6060f1SDimitry Andric                            MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
4311fe6060f1SDimitry Andric     MMO->setOffset(TotalOffset);
4312fe6060f1SDimitry Andric   } else {
4313fe6060f1SDimitry Andric     // We don't have a constant combined offset to use in the MMO. Give up.
4314fe6060f1SDimitry Andric     MMO->setValue((Value *)nullptr);
4315fe6060f1SDimitry Andric   }
43165ffd83dbSDimitry Andric }
43175ffd83dbSDimitry Andric 
43188bcb0991SDimitry Andric /// Handle register layout difference for f16 images for some subtargets.
43198bcb0991SDimitry Andric Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
43208bcb0991SDimitry Andric                                              MachineRegisterInfo &MRI,
4321e8d8bef9SDimitry Andric                                              Register Reg,
4322e8d8bef9SDimitry Andric                                              bool ImageStore) const {
43238bcb0991SDimitry Andric   const LLT S16 = LLT::scalar(16);
43248bcb0991SDimitry Andric   const LLT S32 = LLT::scalar(32);
43258bcb0991SDimitry Andric   LLT StoreVT = MRI.getType(Reg);
43268bcb0991SDimitry Andric   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
43278bcb0991SDimitry Andric 
4328e8d8bef9SDimitry Andric   if (ST.hasUnpackedD16VMem()) {
43298bcb0991SDimitry Andric     auto Unmerge = B.buildUnmerge(S16, Reg);
43308bcb0991SDimitry Andric 
43318bcb0991SDimitry Andric     SmallVector<Register, 4> WideRegs;
43328bcb0991SDimitry Andric     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
43338bcb0991SDimitry Andric       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
43348bcb0991SDimitry Andric 
43358bcb0991SDimitry Andric     int NumElts = StoreVT.getNumElements();
43368bcb0991SDimitry Andric 
4337fe6060f1SDimitry Andric     return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
4338fe6060f1SDimitry Andric         .getReg(0);
43398bcb0991SDimitry Andric   }
43408bcb0991SDimitry Andric 
4341e8d8bef9SDimitry Andric   if (ImageStore && ST.hasImageStoreD16Bug()) {
4342e8d8bef9SDimitry Andric     if (StoreVT.getNumElements() == 2) {
4343e8d8bef9SDimitry Andric       SmallVector<Register, 4> PackedRegs;
4344e8d8bef9SDimitry Andric       Reg = B.buildBitcast(S32, Reg).getReg(0);
4345e8d8bef9SDimitry Andric       PackedRegs.push_back(Reg);
4346e8d8bef9SDimitry Andric       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
4347fe6060f1SDimitry Andric       return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
4348fe6060f1SDimitry Andric           .getReg(0);
4349e8d8bef9SDimitry Andric     }
4350e8d8bef9SDimitry Andric 
4351e8d8bef9SDimitry Andric     if (StoreVT.getNumElements() == 3) {
4352e8d8bef9SDimitry Andric       SmallVector<Register, 4> PackedRegs;
4353e8d8bef9SDimitry Andric       auto Unmerge = B.buildUnmerge(S16, Reg);
4354e8d8bef9SDimitry Andric       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
4355e8d8bef9SDimitry Andric         PackedRegs.push_back(Unmerge.getReg(I));
4356e8d8bef9SDimitry Andric       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
4357fe6060f1SDimitry Andric       Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
4358fe6060f1SDimitry Andric       return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
4359e8d8bef9SDimitry Andric     }
4360e8d8bef9SDimitry Andric 
4361e8d8bef9SDimitry Andric     if (StoreVT.getNumElements() == 4) {
4362e8d8bef9SDimitry Andric       SmallVector<Register, 4> PackedRegs;
4363fe6060f1SDimitry Andric       Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
4364e8d8bef9SDimitry Andric       auto Unmerge = B.buildUnmerge(S32, Reg);
4365e8d8bef9SDimitry Andric       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
4366e8d8bef9SDimitry Andric         PackedRegs.push_back(Unmerge.getReg(I));
4367e8d8bef9SDimitry Andric       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
4368fe6060f1SDimitry Andric       return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
4369fe6060f1SDimitry Andric           .getReg(0);
4370e8d8bef9SDimitry Andric     }
4371e8d8bef9SDimitry Andric 
4372e8d8bef9SDimitry Andric     llvm_unreachable("invalid data type");
4373e8d8bef9SDimitry Andric   }
4374e8d8bef9SDimitry Andric 
43750eae32dcSDimitry Andric   if (StoreVT == LLT::fixed_vector(3, S16)) {
43760eae32dcSDimitry Andric     Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
43770eae32dcSDimitry Andric               .getReg(0);
43780eae32dcSDimitry Andric   }
4379e8d8bef9SDimitry Andric   return Reg;
4380e8d8bef9SDimitry Andric }
4381e8d8bef9SDimitry Andric 
43825ffd83dbSDimitry Andric Register AMDGPULegalizerInfo::fixStoreSourceType(
43835ffd83dbSDimitry Andric   MachineIRBuilder &B, Register VData, bool IsFormat) const {
43845ffd83dbSDimitry Andric   MachineRegisterInfo *MRI = B.getMRI();
43855ffd83dbSDimitry Andric   LLT Ty = MRI->getType(VData);
43868bcb0991SDimitry Andric 
43878bcb0991SDimitry Andric   const LLT S16 = LLT::scalar(16);
43888bcb0991SDimitry Andric 
43898bcb0991SDimitry Andric   // Fixup illegal register types for i8 stores.
43908bcb0991SDimitry Andric   if (Ty == LLT::scalar(8) || Ty == S16) {
43918bcb0991SDimitry Andric     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
43925ffd83dbSDimitry Andric     return AnyExt;
43938bcb0991SDimitry Andric   }
43948bcb0991SDimitry Andric 
43958bcb0991SDimitry Andric   if (Ty.isVector()) {
43968bcb0991SDimitry Andric     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
43978bcb0991SDimitry Andric       if (IsFormat)
43985ffd83dbSDimitry Andric         return handleD16VData(B, *MRI, VData);
43995ffd83dbSDimitry Andric     }
44005ffd83dbSDimitry Andric   }
44015ffd83dbSDimitry Andric 
44025ffd83dbSDimitry Andric   return VData;
44035ffd83dbSDimitry Andric }
44045ffd83dbSDimitry Andric 
44055ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
44065ffd83dbSDimitry Andric                                               MachineRegisterInfo &MRI,
44075ffd83dbSDimitry Andric                                               MachineIRBuilder &B,
44085ffd83dbSDimitry Andric                                               bool IsTyped,
44095ffd83dbSDimitry Andric                                               bool IsFormat) const {
44105ffd83dbSDimitry Andric   Register VData = MI.getOperand(1).getReg();
44115ffd83dbSDimitry Andric   LLT Ty = MRI.getType(VData);
44125ffd83dbSDimitry Andric   LLT EltTy = Ty.getScalarType();
44135ffd83dbSDimitry Andric   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
44145ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
44155ffd83dbSDimitry Andric 
44165ffd83dbSDimitry Andric   VData = fixStoreSourceType(B, VData, IsFormat);
44175ffd83dbSDimitry Andric   Register RSrc = MI.getOperand(2).getReg();
44185ffd83dbSDimitry Andric 
44195ffd83dbSDimitry Andric   MachineMemOperand *MMO = *MI.memoperands_begin();
44205ffd83dbSDimitry Andric   const int MemSize = MMO->getSize();
44215ffd83dbSDimitry Andric 
44225ffd83dbSDimitry Andric   unsigned ImmOffset;
44235ffd83dbSDimitry Andric 
44245ffd83dbSDimitry Andric   // The typed intrinsics add an immediate after the registers.
44255ffd83dbSDimitry Andric   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
44265ffd83dbSDimitry Andric 
44275ffd83dbSDimitry Andric   // The struct intrinsic variants add one additional operand over raw.
44285ffd83dbSDimitry Andric   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
44295ffd83dbSDimitry Andric   Register VIndex;
44305ffd83dbSDimitry Andric   int OpOffset = 0;
44315ffd83dbSDimitry Andric   if (HasVIndex) {
44325ffd83dbSDimitry Andric     VIndex = MI.getOperand(3).getReg();
44335ffd83dbSDimitry Andric     OpOffset = 1;
4434fe6060f1SDimitry Andric   } else {
4435fe6060f1SDimitry Andric     VIndex = B.buildConstant(S32, 0).getReg(0);
44365ffd83dbSDimitry Andric   }
44375ffd83dbSDimitry Andric 
44385ffd83dbSDimitry Andric   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
44395ffd83dbSDimitry Andric   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
44405ffd83dbSDimitry Andric 
44415ffd83dbSDimitry Andric   unsigned Format = 0;
44425ffd83dbSDimitry Andric   if (IsTyped) {
44435ffd83dbSDimitry Andric     Format = MI.getOperand(5 + OpOffset).getImm();
44445ffd83dbSDimitry Andric     ++OpOffset;
44455ffd83dbSDimitry Andric   }
44465ffd83dbSDimitry Andric 
44475ffd83dbSDimitry Andric   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
44485ffd83dbSDimitry Andric 
4449fe6060f1SDimitry Andric   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4450fe6060f1SDimitry Andric   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
44515ffd83dbSDimitry Andric 
44525ffd83dbSDimitry Andric   unsigned Opc;
44535ffd83dbSDimitry Andric   if (IsTyped) {
44545ffd83dbSDimitry Andric     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
44555ffd83dbSDimitry Andric                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
44565ffd83dbSDimitry Andric   } else if (IsFormat) {
44575ffd83dbSDimitry Andric     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
44585ffd83dbSDimitry Andric                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
44595ffd83dbSDimitry Andric   } else {
44605ffd83dbSDimitry Andric     switch (MemSize) {
44615ffd83dbSDimitry Andric     case 1:
44625ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
44635ffd83dbSDimitry Andric       break;
44645ffd83dbSDimitry Andric     case 2:
44655ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
44665ffd83dbSDimitry Andric       break;
44675ffd83dbSDimitry Andric     default:
44685ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
44695ffd83dbSDimitry Andric       break;
44705ffd83dbSDimitry Andric     }
44715ffd83dbSDimitry Andric   }
44725ffd83dbSDimitry Andric 
44735ffd83dbSDimitry Andric   auto MIB = B.buildInstr(Opc)
44745ffd83dbSDimitry Andric     .addUse(VData)              // vdata
44755ffd83dbSDimitry Andric     .addUse(RSrc)               // rsrc
44765ffd83dbSDimitry Andric     .addUse(VIndex)             // vindex
44775ffd83dbSDimitry Andric     .addUse(VOffset)            // voffset
44785ffd83dbSDimitry Andric     .addUse(SOffset)            // soffset
44795ffd83dbSDimitry Andric     .addImm(ImmOffset);         // offset(imm)
44805ffd83dbSDimitry Andric 
44815ffd83dbSDimitry Andric   if (IsTyped)
44825ffd83dbSDimitry Andric     MIB.addImm(Format);
44835ffd83dbSDimitry Andric 
44845ffd83dbSDimitry Andric   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
44855ffd83dbSDimitry Andric      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
44865ffd83dbSDimitry Andric      .addMemOperand(MMO);
44875ffd83dbSDimitry Andric 
44885ffd83dbSDimitry Andric   MI.eraseFromParent();
44898bcb0991SDimitry Andric   return true;
44908bcb0991SDimitry Andric }
44918bcb0991SDimitry Andric 
44925ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
44935ffd83dbSDimitry Andric                                              MachineRegisterInfo &MRI,
44945ffd83dbSDimitry Andric                                              MachineIRBuilder &B,
44955ffd83dbSDimitry Andric                                              bool IsFormat,
44965ffd83dbSDimitry Andric                                              bool IsTyped) const {
44975ffd83dbSDimitry Andric   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
44985ffd83dbSDimitry Andric   MachineMemOperand *MMO = *MI.memoperands_begin();
4499fe6060f1SDimitry Andric   const LLT MemTy = MMO->getMemoryType();
45005ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
45015ffd83dbSDimitry Andric 
45025ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
45035ffd83dbSDimitry Andric   Register RSrc = MI.getOperand(2).getReg();
45045ffd83dbSDimitry Andric 
45055ffd83dbSDimitry Andric   // The typed intrinsics add an immediate after the registers.
45065ffd83dbSDimitry Andric   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
45075ffd83dbSDimitry Andric 
45085ffd83dbSDimitry Andric   // The struct intrinsic variants add one additional operand over raw.
45095ffd83dbSDimitry Andric   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
45105ffd83dbSDimitry Andric   Register VIndex;
45115ffd83dbSDimitry Andric   int OpOffset = 0;
45125ffd83dbSDimitry Andric   if (HasVIndex) {
45135ffd83dbSDimitry Andric     VIndex = MI.getOperand(3).getReg();
45145ffd83dbSDimitry Andric     OpOffset = 1;
4515fe6060f1SDimitry Andric   } else {
4516fe6060f1SDimitry Andric     VIndex = B.buildConstant(S32, 0).getReg(0);
45178bcb0991SDimitry Andric   }
45188bcb0991SDimitry Andric 
45195ffd83dbSDimitry Andric   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
45205ffd83dbSDimitry Andric   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
45215ffd83dbSDimitry Andric 
45225ffd83dbSDimitry Andric   unsigned Format = 0;
45235ffd83dbSDimitry Andric   if (IsTyped) {
45245ffd83dbSDimitry Andric     Format = MI.getOperand(5 + OpOffset).getImm();
45255ffd83dbSDimitry Andric     ++OpOffset;
45268bcb0991SDimitry Andric   }
45278bcb0991SDimitry Andric 
45285ffd83dbSDimitry Andric   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
45295ffd83dbSDimitry Andric   unsigned ImmOffset;
45305ffd83dbSDimitry Andric 
45315ffd83dbSDimitry Andric   LLT Ty = MRI.getType(Dst);
45325ffd83dbSDimitry Andric   LLT EltTy = Ty.getScalarType();
45335ffd83dbSDimitry Andric   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
45345ffd83dbSDimitry Andric   const bool Unpacked = ST.hasUnpackedD16VMem();
45355ffd83dbSDimitry Andric 
4536fe6060f1SDimitry Andric   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4537fe6060f1SDimitry Andric   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
45385ffd83dbSDimitry Andric 
45395ffd83dbSDimitry Andric   unsigned Opc;
45405ffd83dbSDimitry Andric 
45415ffd83dbSDimitry Andric   if (IsTyped) {
45425ffd83dbSDimitry Andric     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
45435ffd83dbSDimitry Andric                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
45445ffd83dbSDimitry Andric   } else if (IsFormat) {
45455ffd83dbSDimitry Andric     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
45465ffd83dbSDimitry Andric                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
45475ffd83dbSDimitry Andric   } else {
4548fe6060f1SDimitry Andric     switch (MemTy.getSizeInBits()) {
4549fe6060f1SDimitry Andric     case 8:
45505ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
45515ffd83dbSDimitry Andric       break;
4552fe6060f1SDimitry Andric     case 16:
45535ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
45545ffd83dbSDimitry Andric       break;
45555ffd83dbSDimitry Andric     default:
45565ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
45575ffd83dbSDimitry Andric       break;
45585ffd83dbSDimitry Andric     }
45595ffd83dbSDimitry Andric   }
45605ffd83dbSDimitry Andric 
45615ffd83dbSDimitry Andric   Register LoadDstReg;
45625ffd83dbSDimitry Andric 
4563fe6060f1SDimitry Andric   bool IsExtLoad =
4564fe6060f1SDimitry Andric       (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
45655ffd83dbSDimitry Andric   LLT UnpackedTy = Ty.changeElementSize(32);
45665ffd83dbSDimitry Andric 
45675ffd83dbSDimitry Andric   if (IsExtLoad)
45685ffd83dbSDimitry Andric     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
45695ffd83dbSDimitry Andric   else if (Unpacked && IsD16 && Ty.isVector())
45705ffd83dbSDimitry Andric     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
45715ffd83dbSDimitry Andric   else
45725ffd83dbSDimitry Andric     LoadDstReg = Dst;
45735ffd83dbSDimitry Andric 
45745ffd83dbSDimitry Andric   auto MIB = B.buildInstr(Opc)
45755ffd83dbSDimitry Andric     .addDef(LoadDstReg)         // vdata
45765ffd83dbSDimitry Andric     .addUse(RSrc)               // rsrc
45775ffd83dbSDimitry Andric     .addUse(VIndex)             // vindex
45785ffd83dbSDimitry Andric     .addUse(VOffset)            // voffset
45795ffd83dbSDimitry Andric     .addUse(SOffset)            // soffset
45805ffd83dbSDimitry Andric     .addImm(ImmOffset);         // offset(imm)
45815ffd83dbSDimitry Andric 
45825ffd83dbSDimitry Andric   if (IsTyped)
45835ffd83dbSDimitry Andric     MIB.addImm(Format);
45845ffd83dbSDimitry Andric 
45855ffd83dbSDimitry Andric   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
45865ffd83dbSDimitry Andric      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
45875ffd83dbSDimitry Andric      .addMemOperand(MMO);
45885ffd83dbSDimitry Andric 
45895ffd83dbSDimitry Andric   if (LoadDstReg != Dst) {
45905ffd83dbSDimitry Andric     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
45915ffd83dbSDimitry Andric 
    // The result was widened for the extending load; truncate it back to the
    // original result type.
45935ffd83dbSDimitry Andric     if (IsExtLoad)
45945ffd83dbSDimitry Andric       B.buildTrunc(Dst, LoadDstReg);
45955ffd83dbSDimitry Andric     else {
45965ffd83dbSDimitry Andric       // Repack to original 16-bit vector result
45975ffd83dbSDimitry Andric       // FIXME: G_TRUNC should work, but legalization currently fails
45985ffd83dbSDimitry Andric       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
45995ffd83dbSDimitry Andric       SmallVector<Register, 4> Repack;
46005ffd83dbSDimitry Andric       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
46015ffd83dbSDimitry Andric         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
46025ffd83dbSDimitry Andric       B.buildMerge(Dst, Repack);
46035ffd83dbSDimitry Andric     }
46045ffd83dbSDimitry Andric   }
46055ffd83dbSDimitry Andric 
46065ffd83dbSDimitry Andric   MI.eraseFromParent();
46075ffd83dbSDimitry Andric   return true;
46085ffd83dbSDimitry Andric }
46095ffd83dbSDimitry Andric 
46105ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
46115ffd83dbSDimitry Andric                                                MachineIRBuilder &B,
46125ffd83dbSDimitry Andric                                                bool IsInc) const {
46135ffd83dbSDimitry Andric   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
46145ffd83dbSDimitry Andric                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
46155ffd83dbSDimitry Andric   B.buildInstr(Opc)
46165ffd83dbSDimitry Andric     .addDef(MI.getOperand(0).getReg())
46175ffd83dbSDimitry Andric     .addUse(MI.getOperand(2).getReg())
46185ffd83dbSDimitry Andric     .addUse(MI.getOperand(3).getReg())
46195ffd83dbSDimitry Andric     .cloneMemRefs(MI);
46205ffd83dbSDimitry Andric   MI.eraseFromParent();
46215ffd83dbSDimitry Andric   return true;
46225ffd83dbSDimitry Andric }
46235ffd83dbSDimitry Andric 
/// Map a raw/struct buffer atomic intrinsic ID to the corresponding generic
/// G_AMDGPU_BUFFER_ATOMIC_* pseudo-opcode. The raw and struct variants of an
/// operation share a pseudo; the struct form is distinguished later by its
/// extra vindex operand / idxen flag.
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
  default:
    llvm_unreachable("unhandled atomic opcode");
  }
}
46785ffd83dbSDimitry Andric 
/// Legalize a raw/struct buffer atomic intrinsic into the corresponding
/// G_AMDGPU_BUFFER_ATOMIC_* pseudo with a normalized operand layout:
///   [dst,] vdata, [cmp,] rsrc, vindex, voffset, soffset, offset(imm),
///   cachepolicy(imm), idxen(imm), memoperand.
/// The operand indices of the incoming intrinsic shift depending on whether
/// it has a result, a compare value (cmpswap), and a vindex (struct form);
/// OpOffset tracks that running adjustment.
bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
                                               MachineIRBuilder &B,
                                               Intrinsic::ID IID) const {
  const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
                         IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
  const bool HasReturn = MI.getNumExplicitDefs() != 0;

  Register Dst;

  int OpOffset = 0;
  if (HasReturn) {
    // A few FP atomics do not support return values.
    Dst = MI.getOperand(0).getReg();
  } else {
    // No def operand, so every subsequent operand index shifts down by one.
    OpOffset = -1;
  }

  // Skipping the def (if any) and the intrinsic ID operand, the value to
  // atomically combine comes first.
  Register VData = MI.getOperand(2 + OpOffset).getReg();
  Register CmpVal;

  if (IsCmpSwap) {
    CmpVal = MI.getOperand(3 + OpOffset).getReg();
    ++OpOffset;
  }

  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
  // Total operand count of the struct variant; the raw variant has one fewer.
  const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  if (HasVIndex) {
    VIndex = MI.getOperand(4 + OpOffset).getReg();
    ++OpOffset;
  } else {
    // Raw form: synthesize a zero vindex so the pseudo always has the same
    // operand shape.
    VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
  }

  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();

  MachineMemOperand *MMO = *MI.memoperands_begin();

  // Split the voffset into a constant part folded into the immediate offset
  // field and a remaining register part, then reflect that in the MMO.
  unsigned ImmOffset;
  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
  updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());

  auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));

  if (HasReturn)
    MIB.addDef(Dst);

  MIB.addUse(VData); // vdata

  if (IsCmpSwap)
    MIB.addReg(CmpVal);

  MIB.addUse(RSrc)               // rsrc
     .addUse(VIndex)             // vindex
     .addUse(VOffset)            // voffset
     .addUse(SOffset)            // soffset
     .addImm(ImmOffset)          // offset(imm)
     .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}
47495ffd83dbSDimitry Andric 
/// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
/// vector with s16 typed elements.
///
/// Walks the vaddr operand range [VAddrStart, VAddrEnd) of the image
/// instruction \p MI and appends packed <2 x s16> registers to
/// \p PackedAddrs. Which operands are packed (vs. bitcast through as whole
/// dwords) depends on \p IsA16 / \p IsG16 and on whether the operand lies in
/// the gradient or coordinate sections described by \p Intr.
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
                                      SmallVectorImpl<Register> &PackedAddrs,
                                      unsigned ArgOffset,
                                      const AMDGPU::ImageDimIntrinsicInfo *Intr,
                                      bool IsA16, bool IsG16) {
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  auto EndIdx = Intr->VAddrEnd;

  for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
    MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
    if (!SrcOp.isReg())
      continue; // _L to _LZ may have eliminated this.

    Register AddrReg = SrcOp.getReg();

    // First branch: operands that already occupy a full dword (extra args
    // before the gradients, 32-bit gradients when !IsG16, 32-bit coordinates
    // when !IsA16). These are passed through as dword-sized values.
    if ((I < Intr->GradientStart) ||
        (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
        (I >= Intr->CoordStart && !IsA16)) {
      if ((I < Intr->GradientStart) && IsA16 &&
          (B.getMRI()->getType(AddrReg) == S16)) {
        assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
        // Special handling of bias when A16 is on. Bias is of type half but
        // occupies full 32-bit.
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
               "Bias needs to be converted to 16 bit in A16 mode");
        // Handle any gradient or coordinate operands that should not be packed
        AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
        PackedAddrs.push_back(AddrReg);
      }
    } else {
      // Second branch: 16-bit operands to be packed in pairs into dwords.
      // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
      // derivatives dx/dh and dx/dv are packed with undef.
      if (((I + 1) >= EndIdx) ||
          ((Intr->NumGradients / 2) % 2 == 1 &&
           // Odd per-axis gradient count: the last dh and last dv gradient
           // each sit alone in their dword.
           (I == static_cast<unsigned>(Intr->GradientStart +
                                       (Intr->NumGradients / 2) - 1) ||
            I == static_cast<unsigned>(Intr->GradientStart +
                                       Intr->NumGradients - 1))) ||
          // Check for _L to _LZ optimization
          !MI.getOperand(ArgOffset + I + 1).isReg()) {
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        // Pack this operand with its successor and skip the successor.
        PackedAddrs.push_back(
            B.buildBuildVector(
                 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
                .getReg(0));
        ++I;
      }
    }
  }
}
48105ffd83dbSDimitry Andric 
48115ffd83dbSDimitry Andric /// Convert from separate vaddr components to a single vector address register,
48125ffd83dbSDimitry Andric /// and replace the remaining operands with $noreg.
48135ffd83dbSDimitry Andric static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
48145ffd83dbSDimitry Andric                                      int DimIdx, int NumVAddrs) {
48155ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
48165ffd83dbSDimitry Andric 
48175ffd83dbSDimitry Andric   SmallVector<Register, 8> AddrRegs;
48185ffd83dbSDimitry Andric   for (int I = 0; I != NumVAddrs; ++I) {
48195ffd83dbSDimitry Andric     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
48205ffd83dbSDimitry Andric     if (SrcOp.isReg()) {
48215ffd83dbSDimitry Andric       AddrRegs.push_back(SrcOp.getReg());
48225ffd83dbSDimitry Andric       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
48235ffd83dbSDimitry Andric     }
48245ffd83dbSDimitry Andric   }
48255ffd83dbSDimitry Andric 
48265ffd83dbSDimitry Andric   int NumAddrRegs = AddrRegs.size();
48275ffd83dbSDimitry Andric   if (NumAddrRegs != 1) {
4828fe6060f1SDimitry Andric     // Above 8 elements round up to next power of 2 (i.e. 16).
4829fe6060f1SDimitry Andric     if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
48305ffd83dbSDimitry Andric       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
48315ffd83dbSDimitry Andric       auto Undef = B.buildUndef(S32);
48325ffd83dbSDimitry Andric       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
48335ffd83dbSDimitry Andric       NumAddrRegs = RoundedNumRegs;
48345ffd83dbSDimitry Andric     }
48355ffd83dbSDimitry Andric 
4836fe6060f1SDimitry Andric     auto VAddr =
4837fe6060f1SDimitry Andric         B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
48385ffd83dbSDimitry Andric     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
48395ffd83dbSDimitry Andric   }
48405ffd83dbSDimitry Andric 
48415ffd83dbSDimitry Andric   for (int I = 1; I != NumVAddrs; ++I) {
48425ffd83dbSDimitry Andric     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
48435ffd83dbSDimitry Andric     if (SrcOp.isReg())
48445ffd83dbSDimitry Andric       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
48455ffd83dbSDimitry Andric   }
48465ffd83dbSDimitry Andric }
48475ffd83dbSDimitry Andric 
48485ffd83dbSDimitry Andric /// Rewrite image intrinsics to use register layouts expected by the subtarget.
48495ffd83dbSDimitry Andric ///
48505ffd83dbSDimitry Andric /// Depending on the subtarget, load/store with 16-bit element data need to be
48515ffd83dbSDimitry Andric /// rewritten to use the low half of 32-bit registers, or directly use a packed
48525ffd83dbSDimitry Andric /// layout. 16-bit addresses should also sometimes be packed into 32-bit
48535ffd83dbSDimitry Andric /// registers.
48545ffd83dbSDimitry Andric ///
48555ffd83dbSDimitry Andric /// We don't want to directly select image instructions just yet, but also want
48565ffd83dbSDimitry Andric /// to exposes all register repacking to the legalizer/combiners. We also don't
/// to expose all register repacking to the legalizer/combiners. We also don't
48585ffd83dbSDimitry Andric /// defining a multitude of intermediate image instructions, directly hack on
4859349cc55cSDimitry Andric /// the intrinsic's arguments. In cases like a16 addresses, this requires
4860349cc55cSDimitry Andric /// padding now unnecessary arguments with $noreg.
48615ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4862e8d8bef9SDimitry Andric     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4863e8d8bef9SDimitry Andric     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
48645ffd83dbSDimitry Andric 
4865e8d8bef9SDimitry Andric   const unsigned NumDefs = MI.getNumExplicitDefs();
4866e8d8bef9SDimitry Andric   const unsigned ArgOffset = NumDefs + 1;
48675ffd83dbSDimitry Andric   bool IsTFE = NumDefs == 2;
48685ffd83dbSDimitry Andric   // We are only processing the operands of d16 image operations on subtargets
48695ffd83dbSDimitry Andric   // that use the unpacked register layout, or need to repack the TFE result.
48705ffd83dbSDimitry Andric 
48715ffd83dbSDimitry Andric   // TODO: Do we need to guard against already legalized intrinsics?
48725ffd83dbSDimitry Andric   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4873e8d8bef9SDimitry Andric       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
48745ffd83dbSDimitry Andric 
48755ffd83dbSDimitry Andric   MachineRegisterInfo *MRI = B.getMRI();
48765ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
48775ffd83dbSDimitry Andric   const LLT S16 = LLT::scalar(16);
4878fe6060f1SDimitry Andric   const LLT V2S16 = LLT::fixed_vector(2, 16);
48795ffd83dbSDimitry Andric 
48805ffd83dbSDimitry Andric   unsigned DMask = 0;
488104eeddc0SDimitry Andric   Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
488204eeddc0SDimitry Andric   LLT Ty = MRI->getType(VData);
48835ffd83dbSDimitry Andric 
48845ffd83dbSDimitry Andric   // Check for 16 bit addresses and pack if true.
4885e8d8bef9SDimitry Andric   LLT GradTy =
4886e8d8bef9SDimitry Andric       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4887e8d8bef9SDimitry Andric   LLT AddrTy =
4888e8d8bef9SDimitry Andric       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
48895ffd83dbSDimitry Andric   const bool IsG16 = GradTy == S16;
48905ffd83dbSDimitry Andric   const bool IsA16 = AddrTy == S16;
489104eeddc0SDimitry Andric   const bool IsD16 = Ty.getScalarType() == S16;
48925ffd83dbSDimitry Andric 
48935ffd83dbSDimitry Andric   int DMaskLanes = 0;
48945ffd83dbSDimitry Andric   if (!BaseOpcode->Atomic) {
4895e8d8bef9SDimitry Andric     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
48965ffd83dbSDimitry Andric     if (BaseOpcode->Gather4) {
48975ffd83dbSDimitry Andric       DMaskLanes = 4;
48985ffd83dbSDimitry Andric     } else if (DMask != 0) {
48995ffd83dbSDimitry Andric       DMaskLanes = countPopulation(DMask);
49005ffd83dbSDimitry Andric     } else if (!IsTFE && !BaseOpcode->Store) {
49015ffd83dbSDimitry Andric       // If dmask is 0, this is a no-op load. This can be eliminated.
49025ffd83dbSDimitry Andric       B.buildUndef(MI.getOperand(0));
49035ffd83dbSDimitry Andric       MI.eraseFromParent();
49045ffd83dbSDimitry Andric       return true;
49055ffd83dbSDimitry Andric     }
49065ffd83dbSDimitry Andric   }
49075ffd83dbSDimitry Andric 
49085ffd83dbSDimitry Andric   Observer.changingInstr(MI);
49095ffd83dbSDimitry Andric   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
49105ffd83dbSDimitry Andric 
491104eeddc0SDimitry Andric   const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
491204eeddc0SDimitry Andric                                      : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
491304eeddc0SDimitry Andric   const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
491404eeddc0SDimitry Andric                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
491504eeddc0SDimitry Andric   unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;
49165ffd83dbSDimitry Andric 
49175ffd83dbSDimitry Andric   // Track that we legalized this
49185ffd83dbSDimitry Andric   MI.setDesc(B.getTII().get(NewOpcode));
49195ffd83dbSDimitry Andric 
49205ffd83dbSDimitry Andric   // Expecting to get an error flag since TFC is on - and dmask is 0 Force
49215ffd83dbSDimitry Andric   // dmask to be at least 1 otherwise the instruction will fail
49225ffd83dbSDimitry Andric   if (IsTFE && DMask == 0) {
49235ffd83dbSDimitry Andric     DMask = 0x1;
49245ffd83dbSDimitry Andric     DMaskLanes = 1;
4925e8d8bef9SDimitry Andric     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
49265ffd83dbSDimitry Andric   }
49275ffd83dbSDimitry Andric 
49285ffd83dbSDimitry Andric   if (BaseOpcode->Atomic) {
49295ffd83dbSDimitry Andric     Register VData0 = MI.getOperand(2).getReg();
49305ffd83dbSDimitry Andric     LLT Ty = MRI->getType(VData0);
49315ffd83dbSDimitry Andric 
49325ffd83dbSDimitry Andric     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
49335ffd83dbSDimitry Andric     if (Ty.isVector())
49345ffd83dbSDimitry Andric       return false;
49355ffd83dbSDimitry Andric 
49365ffd83dbSDimitry Andric     if (BaseOpcode->AtomicX2) {
49375ffd83dbSDimitry Andric       Register VData1 = MI.getOperand(3).getReg();
49385ffd83dbSDimitry Andric       // The two values are packed in one register.
4939fe6060f1SDimitry Andric       LLT PackedTy = LLT::fixed_vector(2, Ty);
49405ffd83dbSDimitry Andric       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
49415ffd83dbSDimitry Andric       MI.getOperand(2).setReg(Concat.getReg(0));
49425ffd83dbSDimitry Andric       MI.getOperand(3).setReg(AMDGPU::NoRegister);
49435ffd83dbSDimitry Andric     }
49445ffd83dbSDimitry Andric   }
49455ffd83dbSDimitry Andric 
4946e8d8bef9SDimitry Andric   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
49475ffd83dbSDimitry Andric 
49485ffd83dbSDimitry Andric   // Rewrite the addressing register layout before doing anything else.
4949fe6060f1SDimitry Andric   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
4950fe6060f1SDimitry Andric     // 16 bit gradients are supported, but are tied to the A16 control
4951fe6060f1SDimitry Andric     // so both gradients and addresses must be 16 bit
49525ffd83dbSDimitry Andric     return false;
4953fe6060f1SDimitry Andric   }
49545ffd83dbSDimitry Andric 
4955fe6060f1SDimitry Andric   if (IsA16 && !ST.hasA16()) {
4956fe6060f1SDimitry Andric     // A16 not supported
4957fe6060f1SDimitry Andric     return false;
4958fe6060f1SDimitry Andric   }
4959fe6060f1SDimitry Andric 
4960fe6060f1SDimitry Andric   if (IsA16 || IsG16) {
4961e8d8bef9SDimitry Andric     if (Intr->NumVAddrs > 1) {
49625ffd83dbSDimitry Andric       SmallVector<Register, 4> PackedRegs;
49635ffd83dbSDimitry Andric 
4964fe6060f1SDimitry Andric       packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4965fe6060f1SDimitry Andric                                 IsG16);
49665ffd83dbSDimitry Andric 
49675ffd83dbSDimitry Andric       // See also below in the non-a16 branch
4968fe6060f1SDimitry Andric       const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
4969fe6060f1SDimitry Andric                           PackedRegs.size() <= ST.getNSAMaxSize();
49705ffd83dbSDimitry Andric 
49715ffd83dbSDimitry Andric       if (!UseNSA && PackedRegs.size() > 1) {
4972fe6060f1SDimitry Andric         LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
49735ffd83dbSDimitry Andric         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
49745ffd83dbSDimitry Andric         PackedRegs[0] = Concat.getReg(0);
49755ffd83dbSDimitry Andric         PackedRegs.resize(1);
49765ffd83dbSDimitry Andric       }
49775ffd83dbSDimitry Andric 
4978e8d8bef9SDimitry Andric       const unsigned NumPacked = PackedRegs.size();
4979e8d8bef9SDimitry Andric       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4980e8d8bef9SDimitry Andric         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
49815ffd83dbSDimitry Andric         if (!SrcOp.isReg()) {
49825ffd83dbSDimitry Andric           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
49835ffd83dbSDimitry Andric           continue;
49845ffd83dbSDimitry Andric         }
49855ffd83dbSDimitry Andric 
49865ffd83dbSDimitry Andric         assert(SrcOp.getReg() != AMDGPU::NoRegister);
49875ffd83dbSDimitry Andric 
4988e8d8bef9SDimitry Andric         if (I - Intr->VAddrStart < NumPacked)
4989e8d8bef9SDimitry Andric           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
49905ffd83dbSDimitry Andric         else
49915ffd83dbSDimitry Andric           SrcOp.setReg(AMDGPU::NoRegister);
49925ffd83dbSDimitry Andric       }
49935ffd83dbSDimitry Andric     }
49945ffd83dbSDimitry Andric   } else {
49955ffd83dbSDimitry Andric     // If the register allocator cannot place the address registers contiguously
49965ffd83dbSDimitry Andric     // without introducing moves, then using the non-sequential address encoding
49975ffd83dbSDimitry Andric     // is always preferable, since it saves VALU instructions and is usually a
49985ffd83dbSDimitry Andric     // wash in terms of code size or even better.
49995ffd83dbSDimitry Andric     //
50005ffd83dbSDimitry Andric     // However, we currently have no way of hinting to the register allocator
50015ffd83dbSDimitry Andric     // that MIMG addresses should be placed contiguously when it is possible to
50025ffd83dbSDimitry Andric     // do so, so force non-NSA for the common 2-address case as a heuristic.
50035ffd83dbSDimitry Andric     //
50045ffd83dbSDimitry Andric     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
50055ffd83dbSDimitry Andric     // allocation when possible.
500681ad6265SDimitry Andric     //
500781ad6265SDimitry Andric     // TODO: we can actually allow partial NSA where the final register is a
500881ad6265SDimitry Andric     // contiguous set of the remaining addresses.
500981ad6265SDimitry Andric     // This could help where there are more addresses than supported.
5010fe6060f1SDimitry Andric     const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
5011fe6060f1SDimitry Andric                         CorrectedNumVAddrs <= ST.getNSAMaxSize();
50125ffd83dbSDimitry Andric 
5013e8d8bef9SDimitry Andric     if (!UseNSA && Intr->NumVAddrs > 1)
5014e8d8bef9SDimitry Andric       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
5015e8d8bef9SDimitry Andric                                Intr->NumVAddrs);
50165ffd83dbSDimitry Andric   }
50175ffd83dbSDimitry Andric 
50185ffd83dbSDimitry Andric   int Flags = 0;
50195ffd83dbSDimitry Andric   if (IsA16)
50205ffd83dbSDimitry Andric     Flags |= 1;
50215ffd83dbSDimitry Andric   if (IsG16)
50225ffd83dbSDimitry Andric     Flags |= 2;
50235ffd83dbSDimitry Andric   MI.addOperand(MachineOperand::CreateImm(Flags));
50245ffd83dbSDimitry Andric 
50255ffd83dbSDimitry Andric   if (BaseOpcode->Store) { // No TFE for stores?
50265ffd83dbSDimitry Andric     // TODO: Handle dmask trim
502704eeddc0SDimitry Andric     if (!Ty.isVector() || !IsD16)
50285ffd83dbSDimitry Andric       return true;
50295ffd83dbSDimitry Andric 
5030e8d8bef9SDimitry Andric     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
50315ffd83dbSDimitry Andric     if (RepackedReg != VData) {
50325ffd83dbSDimitry Andric       MI.getOperand(1).setReg(RepackedReg);
50335ffd83dbSDimitry Andric     }
50345ffd83dbSDimitry Andric 
50355ffd83dbSDimitry Andric     return true;
50365ffd83dbSDimitry Andric   }
50375ffd83dbSDimitry Andric 
50385ffd83dbSDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
50395ffd83dbSDimitry Andric   const LLT EltTy = Ty.getScalarType();
50405ffd83dbSDimitry Andric   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
50415ffd83dbSDimitry Andric 
50425ffd83dbSDimitry Andric   // Confirm that the return type is large enough for the dmask specified
50435ffd83dbSDimitry Andric   if (NumElts < DMaskLanes)
50445ffd83dbSDimitry Andric     return false;
50455ffd83dbSDimitry Andric 
50465ffd83dbSDimitry Andric   if (NumElts > 4 || DMaskLanes > 4)
50475ffd83dbSDimitry Andric     return false;
50485ffd83dbSDimitry Andric 
50495ffd83dbSDimitry Andric   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
5050fe6060f1SDimitry Andric   const LLT AdjustedTy =
5051fe6060f1SDimitry Andric       Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
50525ffd83dbSDimitry Andric 
50535ffd83dbSDimitry Andric   // The raw dword aligned data component of the load. The only legal cases
50545ffd83dbSDimitry Andric   // where this matters should be when using the packed D16 format, for
50555ffd83dbSDimitry Andric   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
50565ffd83dbSDimitry Andric   LLT RoundedTy;
50575ffd83dbSDimitry Andric 
  // S32 vector to cover all data, plus TFE result element.
50595ffd83dbSDimitry Andric   LLT TFETy;
50605ffd83dbSDimitry Andric 
50615ffd83dbSDimitry Andric   // Register type to use for each loaded component. Will be S32 or V2S16.
50625ffd83dbSDimitry Andric   LLT RegTy;
50635ffd83dbSDimitry Andric 
50645ffd83dbSDimitry Andric   if (IsD16 && ST.hasUnpackedD16VMem()) {
5065fe6060f1SDimitry Andric     RoundedTy =
5066fe6060f1SDimitry Andric         LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
5067fe6060f1SDimitry Andric     TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
50685ffd83dbSDimitry Andric     RegTy = S32;
50695ffd83dbSDimitry Andric   } else {
50705ffd83dbSDimitry Andric     unsigned EltSize = EltTy.getSizeInBits();
50715ffd83dbSDimitry Andric     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
50725ffd83dbSDimitry Andric     unsigned RoundedSize = 32 * RoundedElts;
5073fe6060f1SDimitry Andric     RoundedTy = LLT::scalarOrVector(
5074fe6060f1SDimitry Andric         ElementCount::getFixed(RoundedSize / EltSize), EltSize);
5075fe6060f1SDimitry Andric     TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
50765ffd83dbSDimitry Andric     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
50775ffd83dbSDimitry Andric   }
50785ffd83dbSDimitry Andric 
50795ffd83dbSDimitry Andric   // The return type does not need adjustment.
50805ffd83dbSDimitry Andric   // TODO: Should we change s16 case to s32 or <2 x s16>?
50815ffd83dbSDimitry Andric   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
50825ffd83dbSDimitry Andric     return true;
50835ffd83dbSDimitry Andric 
50845ffd83dbSDimitry Andric   Register Dst1Reg;
50855ffd83dbSDimitry Andric 
50865ffd83dbSDimitry Andric   // Insert after the instruction.
50875ffd83dbSDimitry Andric   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
50885ffd83dbSDimitry Andric 
50895ffd83dbSDimitry Andric   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
50905ffd83dbSDimitry Andric   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
50915ffd83dbSDimitry Andric   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
50925ffd83dbSDimitry Andric   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
50935ffd83dbSDimitry Andric 
50945ffd83dbSDimitry Andric   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
50955ffd83dbSDimitry Andric 
50965ffd83dbSDimitry Andric   MI.getOperand(0).setReg(NewResultReg);
50975ffd83dbSDimitry Andric 
50985ffd83dbSDimitry Andric   // In the IR, TFE is supposed to be used with a 2 element struct return
5099349cc55cSDimitry Andric   // type. The instruction really returns these two values in one contiguous
51005ffd83dbSDimitry Andric   // register, with one additional dword beyond the loaded data. Rewrite the
51015ffd83dbSDimitry Andric   // return type to use a single register result.
51025ffd83dbSDimitry Andric 
51035ffd83dbSDimitry Andric   if (IsTFE) {
51045ffd83dbSDimitry Andric     Dst1Reg = MI.getOperand(1).getReg();
51055ffd83dbSDimitry Andric     if (MRI->getType(Dst1Reg) != S32)
51065ffd83dbSDimitry Andric       return false;
51075ffd83dbSDimitry Andric 
51085ffd83dbSDimitry Andric     // TODO: Make sure the TFE operand bit is set.
510981ad6265SDimitry Andric     MI.removeOperand(1);
51105ffd83dbSDimitry Andric 
51115ffd83dbSDimitry Andric     // Handle the easy case that requires no repack instructions.
51125ffd83dbSDimitry Andric     if (Ty == S32) {
51135ffd83dbSDimitry Andric       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
51145ffd83dbSDimitry Andric       return true;
51155ffd83dbSDimitry Andric     }
51165ffd83dbSDimitry Andric   }
51175ffd83dbSDimitry Andric 
51185ffd83dbSDimitry Andric   // Now figure out how to copy the new result register back into the old
51195ffd83dbSDimitry Andric   // result.
51205ffd83dbSDimitry Andric   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
51215ffd83dbSDimitry Andric 
51225ffd83dbSDimitry Andric   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
51235ffd83dbSDimitry Andric 
51245ffd83dbSDimitry Andric   if (ResultNumRegs == 1) {
51255ffd83dbSDimitry Andric     assert(!IsTFE);
51265ffd83dbSDimitry Andric     ResultRegs[0] = NewResultReg;
51275ffd83dbSDimitry Andric   } else {
51285ffd83dbSDimitry Andric     // We have to repack into a new vector of some kind.
51295ffd83dbSDimitry Andric     for (int I = 0; I != NumDataRegs; ++I)
51305ffd83dbSDimitry Andric       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
51315ffd83dbSDimitry Andric     B.buildUnmerge(ResultRegs, NewResultReg);
51325ffd83dbSDimitry Andric 
51335ffd83dbSDimitry Andric     // Drop the final TFE element to get the data part. The TFE result is
51345ffd83dbSDimitry Andric     // directly written to the right place already.
51355ffd83dbSDimitry Andric     if (IsTFE)
51365ffd83dbSDimitry Andric       ResultRegs.resize(NumDataRegs);
51375ffd83dbSDimitry Andric   }
51385ffd83dbSDimitry Andric 
51395ffd83dbSDimitry Andric   // For an s16 scalar result, we form an s32 result with a truncate regardless
51405ffd83dbSDimitry Andric   // of packed vs. unpacked.
51415ffd83dbSDimitry Andric   if (IsD16 && !Ty.isVector()) {
51425ffd83dbSDimitry Andric     B.buildTrunc(DstReg, ResultRegs[0]);
51435ffd83dbSDimitry Andric     return true;
51445ffd83dbSDimitry Andric   }
51455ffd83dbSDimitry Andric 
51465ffd83dbSDimitry Andric   // Avoid a build/concat_vector of 1 entry.
51475ffd83dbSDimitry Andric   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
51485ffd83dbSDimitry Andric     B.buildBitcast(DstReg, ResultRegs[0]);
51495ffd83dbSDimitry Andric     return true;
51505ffd83dbSDimitry Andric   }
51515ffd83dbSDimitry Andric 
51525ffd83dbSDimitry Andric   assert(Ty.isVector());
51535ffd83dbSDimitry Andric 
51545ffd83dbSDimitry Andric   if (IsD16) {
51555ffd83dbSDimitry Andric     // For packed D16 results with TFE enabled, all the data components are
51565ffd83dbSDimitry Andric     // S32. Cast back to the expected type.
51575ffd83dbSDimitry Andric     //
51585ffd83dbSDimitry Andric     // TODO: We don't really need to use load s32 elements. We would only need one
51595ffd83dbSDimitry Andric     // cast for the TFE result if a multiple of v2s16 was used.
51605ffd83dbSDimitry Andric     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
51615ffd83dbSDimitry Andric       for (Register &Reg : ResultRegs)
51625ffd83dbSDimitry Andric         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
51635ffd83dbSDimitry Andric     } else if (ST.hasUnpackedD16VMem()) {
51645ffd83dbSDimitry Andric       for (Register &Reg : ResultRegs)
51655ffd83dbSDimitry Andric         Reg = B.buildTrunc(S16, Reg).getReg(0);
51665ffd83dbSDimitry Andric     }
51675ffd83dbSDimitry Andric   }
51685ffd83dbSDimitry Andric 
51695ffd83dbSDimitry Andric   auto padWithUndef = [&](LLT Ty, int NumElts) {
51705ffd83dbSDimitry Andric     if (NumElts == 0)
51715ffd83dbSDimitry Andric       return;
51725ffd83dbSDimitry Andric     Register Undef = B.buildUndef(Ty).getReg(0);
51735ffd83dbSDimitry Andric     for (int I = 0; I != NumElts; ++I)
51745ffd83dbSDimitry Andric       ResultRegs.push_back(Undef);
51755ffd83dbSDimitry Andric   };
51765ffd83dbSDimitry Andric 
51775ffd83dbSDimitry Andric   // Pad out any elements eliminated due to the dmask.
51785ffd83dbSDimitry Andric   LLT ResTy = MRI->getType(ResultRegs[0]);
51795ffd83dbSDimitry Andric   if (!ResTy.isVector()) {
51805ffd83dbSDimitry Andric     padWithUndef(ResTy, NumElts - ResultRegs.size());
51815ffd83dbSDimitry Andric     B.buildBuildVector(DstReg, ResultRegs);
51825ffd83dbSDimitry Andric     return true;
51835ffd83dbSDimitry Andric   }
51845ffd83dbSDimitry Andric 
51855ffd83dbSDimitry Andric   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
51865ffd83dbSDimitry Andric   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
51875ffd83dbSDimitry Andric 
51885ffd83dbSDimitry Andric   // Deal with the one annoying legal case.
5189fe6060f1SDimitry Andric   const LLT V3S16 = LLT::fixed_vector(3, 16);
51905ffd83dbSDimitry Andric   if (Ty == V3S16) {
51910eae32dcSDimitry Andric     if (IsTFE) {
51920eae32dcSDimitry Andric       if (ResultRegs.size() == 1) {
51930eae32dcSDimitry Andric         NewResultReg = ResultRegs[0];
51940eae32dcSDimitry Andric       } else if (ResultRegs.size() == 2) {
51950eae32dcSDimitry Andric         LLT V4S16 = LLT::fixed_vector(4, 16);
51960eae32dcSDimitry Andric         NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
51970eae32dcSDimitry Andric       } else {
51980eae32dcSDimitry Andric         return false;
51990eae32dcSDimitry Andric       }
52000eae32dcSDimitry Andric     }
52010eae32dcSDimitry Andric 
52020eae32dcSDimitry Andric     if (MRI->getType(DstReg).getNumElements() <
52030eae32dcSDimitry Andric         MRI->getType(NewResultReg).getNumElements()) {
52040eae32dcSDimitry Andric       B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
52050eae32dcSDimitry Andric     } else {
52060eae32dcSDimitry Andric       B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
52070eae32dcSDimitry Andric     }
52085ffd83dbSDimitry Andric     return true;
52095ffd83dbSDimitry Andric   }
52105ffd83dbSDimitry Andric 
52115ffd83dbSDimitry Andric   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
52125ffd83dbSDimitry Andric   B.buildConcatVectors(DstReg, ResultRegs);
52135ffd83dbSDimitry Andric   return true;
52145ffd83dbSDimitry Andric }
52155ffd83dbSDimitry Andric 
52165ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeSBufferLoad(
5217e8d8bef9SDimitry Andric   LegalizerHelper &Helper, MachineInstr &MI) const {
5218e8d8bef9SDimitry Andric   MachineIRBuilder &B = Helper.MIRBuilder;
5219e8d8bef9SDimitry Andric   GISelChangeObserver &Observer = Helper.Observer;
5220e8d8bef9SDimitry Andric 
52215ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
52225ffd83dbSDimitry Andric   LLT Ty = B.getMRI()->getType(Dst);
52235ffd83dbSDimitry Andric   unsigned Size = Ty.getSizeInBits();
52245ffd83dbSDimitry Andric   MachineFunction &MF = B.getMF();
52255ffd83dbSDimitry Andric 
52265ffd83dbSDimitry Andric   Observer.changingInstr(MI);
52275ffd83dbSDimitry Andric 
5228fe6060f1SDimitry Andric   if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
5229e8d8bef9SDimitry Andric     Ty = getBitcastRegisterType(Ty);
5230e8d8bef9SDimitry Andric     Helper.bitcastDst(MI, Ty, 0);
5231e8d8bef9SDimitry Andric     Dst = MI.getOperand(0).getReg();
5232e8d8bef9SDimitry Andric     B.setInsertPt(B.getMBB(), MI);
5233e8d8bef9SDimitry Andric   }
5234e8d8bef9SDimitry Andric 
52355ffd83dbSDimitry Andric   // FIXME: We don't really need this intermediate instruction. The intrinsic
52365ffd83dbSDimitry Andric   // should be fixed to have a memory operand. Since it's readnone, we're not
52375ffd83dbSDimitry Andric   // allowed to add one.
52385ffd83dbSDimitry Andric   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
523981ad6265SDimitry Andric   MI.removeOperand(1); // Remove intrinsic ID
52405ffd83dbSDimitry Andric 
52415ffd83dbSDimitry Andric   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
52425ffd83dbSDimitry Andric   // TODO: Should this use datalayout alignment?
52435ffd83dbSDimitry Andric   const unsigned MemSize = (Size + 7) / 8;
52445ffd83dbSDimitry Andric   const Align MemAlign(4);
52455ffd83dbSDimitry Andric   MachineMemOperand *MMO = MF.getMachineMemOperand(
52465ffd83dbSDimitry Andric       MachinePointerInfo(),
52475ffd83dbSDimitry Andric       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
52485ffd83dbSDimitry Andric           MachineMemOperand::MOInvariant,
52495ffd83dbSDimitry Andric       MemSize, MemAlign);
52505ffd83dbSDimitry Andric   MI.addMemOperand(MF, MMO);
52515ffd83dbSDimitry Andric 
52525ffd83dbSDimitry Andric   // There are no 96-bit result scalar loads, but widening to 128-bit should
52535ffd83dbSDimitry Andric   // always be legal. We may need to restore this to a 96-bit result if it turns
52545ffd83dbSDimitry Andric   // out this needs to be converted to a vector load during RegBankSelect.
52555ffd83dbSDimitry Andric   if (!isPowerOf2_32(Size)) {
52565ffd83dbSDimitry Andric     if (Ty.isVector())
52575ffd83dbSDimitry Andric       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
52585ffd83dbSDimitry Andric     else
52595ffd83dbSDimitry Andric       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
52605ffd83dbSDimitry Andric   }
52615ffd83dbSDimitry Andric 
52625ffd83dbSDimitry Andric   Observer.changedInstr(MI);
52635ffd83dbSDimitry Andric   return true;
52645ffd83dbSDimitry Andric }
52655ffd83dbSDimitry Andric 
5266e8d8bef9SDimitry Andric // TODO: Move to selection
52675ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
52680b57cec5SDimitry Andric                                                 MachineRegisterInfo &MRI,
52690b57cec5SDimitry Andric                                                 MachineIRBuilder &B) const {
5270fe6060f1SDimitry Andric   if (!ST.isTrapHandlerEnabled() ||
5271fe6060f1SDimitry Andric       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
5272fe6060f1SDimitry Andric     return legalizeTrapEndpgm(MI, MRI, B);
5273fe6060f1SDimitry Andric 
5274fe6060f1SDimitry Andric   if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
5275fe6060f1SDimitry Andric     switch (*HsaAbiVer) {
5276fe6060f1SDimitry Andric     case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
5277fe6060f1SDimitry Andric     case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
5278fe6060f1SDimitry Andric       return legalizeTrapHsaQueuePtr(MI, MRI, B);
5279fe6060f1SDimitry Andric     case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
52801fd87a68SDimitry Andric     case ELF::ELFABIVERSION_AMDGPU_HSA_V5:
5281fe6060f1SDimitry Andric       return ST.supportsGetDoorbellID() ?
5282fe6060f1SDimitry Andric           legalizeTrapHsa(MI, MRI, B) :
5283fe6060f1SDimitry Andric           legalizeTrapHsaQueuePtr(MI, MRI, B);
5284fe6060f1SDimitry Andric     }
5285fe6060f1SDimitry Andric   }
5286fe6060f1SDimitry Andric 
5287fe6060f1SDimitry Andric   llvm_unreachable("Unknown trap handler");
5288fe6060f1SDimitry Andric }
5289fe6060f1SDimitry Andric 
5290fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
5291fe6060f1SDimitry Andric     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
52925ffd83dbSDimitry Andric   B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
5293fe6060f1SDimitry Andric   MI.eraseFromParent();
5294fe6060f1SDimitry Andric   return true;
5295fe6060f1SDimitry Andric }
5296fe6060f1SDimitry Andric 
5297fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
5298fe6060f1SDimitry Andric     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
529981ad6265SDimitry Andric   MachineFunction &MF = B.getMF();
530081ad6265SDimitry Andric   const LLT S64 = LLT::scalar(64);
530181ad6265SDimitry Andric 
530281ad6265SDimitry Andric   Register SGPR01(AMDGPU::SGPR0_SGPR1);
530381ad6265SDimitry Andric   // For code object version 5, queue_ptr is passed through implicit kernarg.
530481ad6265SDimitry Andric   if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) {
530581ad6265SDimitry Andric     AMDGPUTargetLowering::ImplicitParameter Param =
530681ad6265SDimitry Andric         AMDGPUTargetLowering::QUEUE_PTR;
530781ad6265SDimitry Andric     uint64_t Offset =
530881ad6265SDimitry Andric         ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
530981ad6265SDimitry Andric 
531081ad6265SDimitry Andric     Register KernargPtrReg = MRI.createGenericVirtualRegister(
531181ad6265SDimitry Andric         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
531281ad6265SDimitry Andric 
531381ad6265SDimitry Andric     if (!loadInputValue(KernargPtrReg, B,
531481ad6265SDimitry Andric                         AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
531581ad6265SDimitry Andric       return false;
531681ad6265SDimitry Andric 
531781ad6265SDimitry Andric     // TODO: can we be smarter about machine pointer info?
531881ad6265SDimitry Andric     MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
531981ad6265SDimitry Andric     MachineMemOperand *MMO = MF.getMachineMemOperand(
532081ad6265SDimitry Andric         PtrInfo,
532181ad6265SDimitry Andric         MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
532281ad6265SDimitry Andric             MachineMemOperand::MOInvariant,
532381ad6265SDimitry Andric         LLT::scalar(64), commonAlignment(Align(64), Offset));
532481ad6265SDimitry Andric 
532581ad6265SDimitry Andric     // Pointer address
532681ad6265SDimitry Andric     Register LoadAddr = MRI.createGenericVirtualRegister(
532781ad6265SDimitry Andric         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
532881ad6265SDimitry Andric     B.buildPtrAdd(LoadAddr, KernargPtrReg,
532981ad6265SDimitry Andric                   B.buildConstant(LLT::scalar(64), Offset).getReg(0));
533081ad6265SDimitry Andric     // Load address
533181ad6265SDimitry Andric     Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
533281ad6265SDimitry Andric     B.buildCopy(SGPR01, Temp);
533381ad6265SDimitry Andric     B.buildInstr(AMDGPU::S_TRAP)
533481ad6265SDimitry Andric         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
533581ad6265SDimitry Andric         .addReg(SGPR01, RegState::Implicit);
533681ad6265SDimitry Andric     MI.eraseFromParent();
533781ad6265SDimitry Andric     return true;
533881ad6265SDimitry Andric   }
533981ad6265SDimitry Andric 
53405ffd83dbSDimitry Andric   // Pass queue pointer to trap handler as input, and insert trap instruction
53415ffd83dbSDimitry Andric   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
5342e8d8bef9SDimitry Andric   Register LiveIn =
5343e8d8bef9SDimitry Andric     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
5344e8d8bef9SDimitry Andric   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
53455ffd83dbSDimitry Andric     return false;
5346e8d8bef9SDimitry Andric 
53475ffd83dbSDimitry Andric   B.buildCopy(SGPR01, LiveIn);
53485ffd83dbSDimitry Andric   B.buildInstr(AMDGPU::S_TRAP)
5349fe6060f1SDimitry Andric       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
53505ffd83dbSDimitry Andric       .addReg(SGPR01, RegState::Implicit);
5351fe6060f1SDimitry Andric 
5352fe6060f1SDimitry Andric   MI.eraseFromParent();
5353fe6060f1SDimitry Andric   return true;
53545ffd83dbSDimitry Andric }
53555ffd83dbSDimitry Andric 
5356fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeTrapHsa(
5357fe6060f1SDimitry Andric     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5358fe6060f1SDimitry Andric   B.buildInstr(AMDGPU::S_TRAP)
5359fe6060f1SDimitry Andric       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
53605ffd83dbSDimitry Andric   MI.eraseFromParent();
53615ffd83dbSDimitry Andric   return true;
53625ffd83dbSDimitry Andric }
53635ffd83dbSDimitry Andric 
53645ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
53655ffd83dbSDimitry Andric     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5366349cc55cSDimitry Andric   // Is non-HSA path or trap-handler disabled? Then, report a warning
53675ffd83dbSDimitry Andric   // accordingly
5368fe6060f1SDimitry Andric   if (!ST.isTrapHandlerEnabled() ||
5369fe6060f1SDimitry Andric       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
53705ffd83dbSDimitry Andric     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
53715ffd83dbSDimitry Andric                                      "debugtrap handler not supported",
53725ffd83dbSDimitry Andric                                      MI.getDebugLoc(), DS_Warning);
53735ffd83dbSDimitry Andric     LLVMContext &Ctx = B.getMF().getFunction().getContext();
53745ffd83dbSDimitry Andric     Ctx.diagnose(NoTrap);
53755ffd83dbSDimitry Andric   } else {
53765ffd83dbSDimitry Andric     // Insert debug-trap instruction
5377fe6060f1SDimitry Andric     B.buildInstr(AMDGPU::S_TRAP)
5378fe6060f1SDimitry Andric         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
53795ffd83dbSDimitry Andric   }
53805ffd83dbSDimitry Andric 
53815ffd83dbSDimitry Andric   MI.eraseFromParent();
53825ffd83dbSDimitry Andric   return true;
53835ffd83dbSDimitry Andric }
53845ffd83dbSDimitry Andric 
5385e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
5386e8d8bef9SDimitry Andric                                                MachineIRBuilder &B) const {
5387e8d8bef9SDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
5388e8d8bef9SDimitry Andric   const LLT S16 = LLT::scalar(16);
5389e8d8bef9SDimitry Andric   const LLT S32 = LLT::scalar(32);
539081ad6265SDimitry Andric   const LLT V2S16 = LLT::fixed_vector(2, 16);
539181ad6265SDimitry Andric   const LLT V3S32 = LLT::fixed_vector(3, 32);
5392e8d8bef9SDimitry Andric 
5393e8d8bef9SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
5394e8d8bef9SDimitry Andric   Register NodePtr = MI.getOperand(2).getReg();
5395e8d8bef9SDimitry Andric   Register RayExtent = MI.getOperand(3).getReg();
5396e8d8bef9SDimitry Andric   Register RayOrigin = MI.getOperand(4).getReg();
5397e8d8bef9SDimitry Andric   Register RayDir = MI.getOperand(5).getReg();
5398e8d8bef9SDimitry Andric   Register RayInvDir = MI.getOperand(6).getReg();
5399e8d8bef9SDimitry Andric   Register TDescr = MI.getOperand(7).getReg();
5400e8d8bef9SDimitry Andric 
5401fe6060f1SDimitry Andric   if (!ST.hasGFX10_AEncoding()) {
5402fe6060f1SDimitry Andric     DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
5403fe6060f1SDimitry Andric                                         "intrinsic not supported on subtarget",
5404fe6060f1SDimitry Andric                                         MI.getDebugLoc());
5405fe6060f1SDimitry Andric     B.getMF().getFunction().getContext().diagnose(BadIntrin);
5406fe6060f1SDimitry Andric     return false;
5407fe6060f1SDimitry Andric   }
5408fe6060f1SDimitry Andric 
540981ad6265SDimitry Andric   const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
5410349cc55cSDimitry Andric   const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
5411349cc55cSDimitry Andric   const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
5412349cc55cSDimitry Andric   const unsigned NumVDataDwords = 4;
5413349cc55cSDimitry Andric   const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
541481ad6265SDimitry Andric   const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
541581ad6265SDimitry Andric   const bool UseNSA = ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize();
5416349cc55cSDimitry Andric   const unsigned BaseOpcodes[2][2] = {
5417349cc55cSDimitry Andric       {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
5418349cc55cSDimitry Andric       {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
5419349cc55cSDimitry Andric        AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
5420349cc55cSDimitry Andric   int Opcode;
5421349cc55cSDimitry Andric   if (UseNSA) {
542281ad6265SDimitry Andric     Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
542381ad6265SDimitry Andric                                    IsGFX11Plus ? AMDGPU::MIMGEncGfx11NSA
542481ad6265SDimitry Andric                                                : AMDGPU::MIMGEncGfx10NSA,
5425349cc55cSDimitry Andric                                    NumVDataDwords, NumVAddrDwords);
5426349cc55cSDimitry Andric   } else {
542781ad6265SDimitry Andric     Opcode = AMDGPU::getMIMGOpcode(
542881ad6265SDimitry Andric         BaseOpcodes[Is64][IsA16],
542981ad6265SDimitry Andric         IsGFX11Plus ? AMDGPU::MIMGEncGfx11Default : AMDGPU::MIMGEncGfx10Default,
543081ad6265SDimitry Andric         NumVDataDwords, PowerOf2Ceil(NumVAddrDwords));
5431349cc55cSDimitry Andric   }
5432349cc55cSDimitry Andric   assert(Opcode != -1);
5433e8d8bef9SDimitry Andric 
5434e8d8bef9SDimitry Andric   SmallVector<Register, 12> Ops;
543581ad6265SDimitry Andric   if (UseNSA && IsGFX11Plus) {
543681ad6265SDimitry Andric     auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
543781ad6265SDimitry Andric       auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
543881ad6265SDimitry Andric       auto Merged = B.buildMerge(
543981ad6265SDimitry Andric           V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
544081ad6265SDimitry Andric       Ops.push_back(Merged.getReg(0));
544181ad6265SDimitry Andric     };
544281ad6265SDimitry Andric 
544381ad6265SDimitry Andric     Ops.push_back(NodePtr);
544481ad6265SDimitry Andric     Ops.push_back(RayExtent);
544581ad6265SDimitry Andric     packLanes(RayOrigin);
544681ad6265SDimitry Andric 
544781ad6265SDimitry Andric     if (IsA16) {
544881ad6265SDimitry Andric       auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
544981ad6265SDimitry Andric       auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
545081ad6265SDimitry Andric       auto MergedDir = B.buildMerge(
545181ad6265SDimitry Andric           V3S32,
545281ad6265SDimitry Andric           {B.buildBitcast(S32, B.buildMerge(V2S16, {UnmergeRayInvDir.getReg(0),
545381ad6265SDimitry Andric                                                     UnmergeRayDir.getReg(0)}))
545481ad6265SDimitry Andric                .getReg(0),
545581ad6265SDimitry Andric            B.buildBitcast(S32, B.buildMerge(V2S16, {UnmergeRayInvDir.getReg(1),
545681ad6265SDimitry Andric                                                     UnmergeRayDir.getReg(1)}))
545781ad6265SDimitry Andric                .getReg(0),
545881ad6265SDimitry Andric            B.buildBitcast(S32, B.buildMerge(V2S16, {UnmergeRayInvDir.getReg(2),
545981ad6265SDimitry Andric                                                     UnmergeRayDir.getReg(2)}))
546081ad6265SDimitry Andric                .getReg(0)});
546181ad6265SDimitry Andric       Ops.push_back(MergedDir.getReg(0));
546281ad6265SDimitry Andric     } else {
546381ad6265SDimitry Andric       packLanes(RayDir);
546481ad6265SDimitry Andric       packLanes(RayInvDir);
546581ad6265SDimitry Andric     }
546681ad6265SDimitry Andric   } else {
5467e8d8bef9SDimitry Andric     if (Is64) {
5468e8d8bef9SDimitry Andric       auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
5469e8d8bef9SDimitry Andric       Ops.push_back(Unmerge.getReg(0));
5470e8d8bef9SDimitry Andric       Ops.push_back(Unmerge.getReg(1));
5471e8d8bef9SDimitry Andric     } else {
5472e8d8bef9SDimitry Andric       Ops.push_back(NodePtr);
5473e8d8bef9SDimitry Andric     }
5474e8d8bef9SDimitry Andric     Ops.push_back(RayExtent);
5475e8d8bef9SDimitry Andric 
5476e8d8bef9SDimitry Andric     auto packLanes = [&Ops, &S32, &B](Register Src) {
54770eae32dcSDimitry Andric       auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
5478e8d8bef9SDimitry Andric       Ops.push_back(Unmerge.getReg(0));
5479e8d8bef9SDimitry Andric       Ops.push_back(Unmerge.getReg(1));
5480e8d8bef9SDimitry Andric       Ops.push_back(Unmerge.getReg(2));
5481e8d8bef9SDimitry Andric     };
5482e8d8bef9SDimitry Andric 
5483e8d8bef9SDimitry Andric     packLanes(RayOrigin);
5484e8d8bef9SDimitry Andric     if (IsA16) {
54850eae32dcSDimitry Andric       auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
54860eae32dcSDimitry Andric       auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
5487e8d8bef9SDimitry Andric       Register R1 = MRI.createGenericVirtualRegister(S32);
5488e8d8bef9SDimitry Andric       Register R2 = MRI.createGenericVirtualRegister(S32);
5489e8d8bef9SDimitry Andric       Register R3 = MRI.createGenericVirtualRegister(S32);
5490e8d8bef9SDimitry Andric       B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
5491e8d8bef9SDimitry Andric       B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
549281ad6265SDimitry Andric       B.buildMerge(R3,
549381ad6265SDimitry Andric                    {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
5494e8d8bef9SDimitry Andric       Ops.push_back(R1);
5495e8d8bef9SDimitry Andric       Ops.push_back(R2);
5496e8d8bef9SDimitry Andric       Ops.push_back(R3);
5497e8d8bef9SDimitry Andric     } else {
5498e8d8bef9SDimitry Andric       packLanes(RayDir);
5499e8d8bef9SDimitry Andric       packLanes(RayInvDir);
5500e8d8bef9SDimitry Andric     }
550181ad6265SDimitry Andric   }
5502e8d8bef9SDimitry Andric 
5503349cc55cSDimitry Andric   if (!UseNSA) {
5504349cc55cSDimitry Andric     // Build a single vector containing all the operands so far prepared.
5505349cc55cSDimitry Andric     LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
5506349cc55cSDimitry Andric     Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0);
5507349cc55cSDimitry Andric     Ops.clear();
5508349cc55cSDimitry Andric     Ops.push_back(MergedOps);
5509349cc55cSDimitry Andric   }
5510349cc55cSDimitry Andric 
5511e8d8bef9SDimitry Andric   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
5512e8d8bef9SDimitry Andric     .addDef(DstReg)
5513e8d8bef9SDimitry Andric     .addImm(Opcode);
5514e8d8bef9SDimitry Andric 
5515e8d8bef9SDimitry Andric   for (Register R : Ops) {
5516e8d8bef9SDimitry Andric     MIB.addUse(R);
5517e8d8bef9SDimitry Andric   }
5518e8d8bef9SDimitry Andric 
5519e8d8bef9SDimitry Andric   MIB.addUse(TDescr)
5520e8d8bef9SDimitry Andric      .addImm(IsA16 ? 1 : 0)
5521e8d8bef9SDimitry Andric      .cloneMemRefs(MI);
5522e8d8bef9SDimitry Andric 
5523e8d8bef9SDimitry Andric   MI.eraseFromParent();
5524e8d8bef9SDimitry Andric   return true;
5525e8d8bef9SDimitry Andric }
5526e8d8bef9SDimitry Andric 
552781ad6265SDimitry Andric bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
552881ad6265SDimitry Andric                                                MachineIRBuilder &B) const {
552981ad6265SDimitry Andric   unsigned Opc;
553081ad6265SDimitry Andric   int RoundMode = MI.getOperand(2).getImm();
553181ad6265SDimitry Andric 
553281ad6265SDimitry Andric   if (RoundMode == (int)RoundingMode::TowardPositive)
553381ad6265SDimitry Andric     Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
553481ad6265SDimitry Andric   else if (RoundMode == (int)RoundingMode::TowardNegative)
553581ad6265SDimitry Andric     Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
553681ad6265SDimitry Andric   else
553781ad6265SDimitry Andric     return false;
553881ad6265SDimitry Andric 
553981ad6265SDimitry Andric   B.buildInstr(Opc)
554081ad6265SDimitry Andric       .addDef(MI.getOperand(0).getReg())
554181ad6265SDimitry Andric       .addUse(MI.getOperand(1).getReg());
554281ad6265SDimitry Andric 
554304eeddc0SDimitry Andric   MI.eraseFromParent();
554481ad6265SDimitry Andric 
554504eeddc0SDimitry Andric   return true;
554604eeddc0SDimitry Andric }
554704eeddc0SDimitry Andric 
55485ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
55495ffd83dbSDimitry Andric                                             MachineInstr &MI) const {
55505ffd83dbSDimitry Andric   MachineIRBuilder &B = Helper.MIRBuilder;
55515ffd83dbSDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
55525ffd83dbSDimitry Andric 
55530b57cec5SDimitry Andric   // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
5554480093f4SDimitry Andric   auto IntrID = MI.getIntrinsicID();
5555480093f4SDimitry Andric   switch (IntrID) {
5556480093f4SDimitry Andric   case Intrinsic::amdgcn_if:
5557480093f4SDimitry Andric   case Intrinsic::amdgcn_else: {
5558480093f4SDimitry Andric     MachineInstr *Br = nullptr;
55595ffd83dbSDimitry Andric     MachineBasicBlock *UncondBrTarget = nullptr;
5560e8d8bef9SDimitry Andric     bool Negated = false;
5561e8d8bef9SDimitry Andric     if (MachineInstr *BrCond =
5562e8d8bef9SDimitry Andric             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
55630b57cec5SDimitry Andric       const SIRegisterInfo *TRI
55640b57cec5SDimitry Andric         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
55650b57cec5SDimitry Andric 
55660b57cec5SDimitry Andric       Register Def = MI.getOperand(1).getReg();
55670b57cec5SDimitry Andric       Register Use = MI.getOperand(3).getReg();
5568480093f4SDimitry Andric 
55695ffd83dbSDimitry Andric       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
5570e8d8bef9SDimitry Andric 
5571e8d8bef9SDimitry Andric       if (Negated)
5572e8d8bef9SDimitry Andric         std::swap(CondBrTarget, UncondBrTarget);
5573e8d8bef9SDimitry Andric 
55745ffd83dbSDimitry Andric       B.setInsertPt(B.getMBB(), BrCond->getIterator());
5575480093f4SDimitry Andric       if (IntrID == Intrinsic::amdgcn_if) {
55760b57cec5SDimitry Andric         B.buildInstr(AMDGPU::SI_IF)
55770b57cec5SDimitry Andric           .addDef(Def)
55780b57cec5SDimitry Andric           .addUse(Use)
55795ffd83dbSDimitry Andric           .addMBB(UncondBrTarget);
5580480093f4SDimitry Andric       } else {
5581480093f4SDimitry Andric         B.buildInstr(AMDGPU::SI_ELSE)
5582480093f4SDimitry Andric             .addDef(Def)
5583480093f4SDimitry Andric             .addUse(Use)
5584e8d8bef9SDimitry Andric             .addMBB(UncondBrTarget);
5585480093f4SDimitry Andric       }
5586480093f4SDimitry Andric 
55875ffd83dbSDimitry Andric       if (Br) {
55885ffd83dbSDimitry Andric         Br->getOperand(0).setMBB(CondBrTarget);
55895ffd83dbSDimitry Andric       } else {
55905ffd83dbSDimitry Andric         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
55915ffd83dbSDimitry Andric         // since we're swapping branch targets it needs to be reinserted.
55925ffd83dbSDimitry Andric         // FIXME: IRTranslator should probably not do this
55935ffd83dbSDimitry Andric         B.buildBr(*CondBrTarget);
55945ffd83dbSDimitry Andric       }
55950b57cec5SDimitry Andric 
55960b57cec5SDimitry Andric       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
55970b57cec5SDimitry Andric       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
55980b57cec5SDimitry Andric       MI.eraseFromParent();
55990b57cec5SDimitry Andric       BrCond->eraseFromParent();
56000b57cec5SDimitry Andric       return true;
56010b57cec5SDimitry Andric     }
56020b57cec5SDimitry Andric 
56030b57cec5SDimitry Andric     return false;
56040b57cec5SDimitry Andric   }
56050b57cec5SDimitry Andric   case Intrinsic::amdgcn_loop: {
5606480093f4SDimitry Andric     MachineInstr *Br = nullptr;
56075ffd83dbSDimitry Andric     MachineBasicBlock *UncondBrTarget = nullptr;
5608e8d8bef9SDimitry Andric     bool Negated = false;
5609e8d8bef9SDimitry Andric     if (MachineInstr *BrCond =
5610e8d8bef9SDimitry Andric             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
56110b57cec5SDimitry Andric       const SIRegisterInfo *TRI
56120b57cec5SDimitry Andric         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
56130b57cec5SDimitry Andric 
56145ffd83dbSDimitry Andric       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
56150b57cec5SDimitry Andric       Register Reg = MI.getOperand(2).getReg();
56165ffd83dbSDimitry Andric 
5617e8d8bef9SDimitry Andric       if (Negated)
5618e8d8bef9SDimitry Andric         std::swap(CondBrTarget, UncondBrTarget);
5619e8d8bef9SDimitry Andric 
56205ffd83dbSDimitry Andric       B.setInsertPt(B.getMBB(), BrCond->getIterator());
56210b57cec5SDimitry Andric       B.buildInstr(AMDGPU::SI_LOOP)
56220b57cec5SDimitry Andric         .addUse(Reg)
56235ffd83dbSDimitry Andric         .addMBB(UncondBrTarget);
56245ffd83dbSDimitry Andric 
56255ffd83dbSDimitry Andric       if (Br)
56265ffd83dbSDimitry Andric         Br->getOperand(0).setMBB(CondBrTarget);
56275ffd83dbSDimitry Andric       else
56285ffd83dbSDimitry Andric         B.buildBr(*CondBrTarget);
56295ffd83dbSDimitry Andric 
56300b57cec5SDimitry Andric       MI.eraseFromParent();
56310b57cec5SDimitry Andric       BrCond->eraseFromParent();
56320b57cec5SDimitry Andric       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
56330b57cec5SDimitry Andric       return true;
56340b57cec5SDimitry Andric     }
56350b57cec5SDimitry Andric 
56360b57cec5SDimitry Andric     return false;
56370b57cec5SDimitry Andric   }
56380b57cec5SDimitry Andric   case Intrinsic::amdgcn_kernarg_segment_ptr:
56395ffd83dbSDimitry Andric     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
56405ffd83dbSDimitry Andric       // This only makes sense to call in a kernel, so just lower to null.
56415ffd83dbSDimitry Andric       B.buildConstant(MI.getOperand(0).getReg(), 0);
56425ffd83dbSDimitry Andric       MI.eraseFromParent();
56435ffd83dbSDimitry Andric       return true;
56445ffd83dbSDimitry Andric     }
56455ffd83dbSDimitry Andric 
56460b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(
56470b57cec5SDimitry Andric       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
56480b57cec5SDimitry Andric   case Intrinsic::amdgcn_implicitarg_ptr:
56490b57cec5SDimitry Andric     return legalizeImplicitArgPtr(MI, MRI, B);
56500b57cec5SDimitry Andric   case Intrinsic::amdgcn_workitem_id_x:
565181ad6265SDimitry Andric     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
56520b57cec5SDimitry Andric                                        AMDGPUFunctionArgInfo::WORKITEM_ID_X);
56530b57cec5SDimitry Andric   case Intrinsic::amdgcn_workitem_id_y:
565481ad6265SDimitry Andric     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
56550b57cec5SDimitry Andric                                        AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
56560b57cec5SDimitry Andric   case Intrinsic::amdgcn_workitem_id_z:
565781ad6265SDimitry Andric     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
56580b57cec5SDimitry Andric                                        AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
56590b57cec5SDimitry Andric   case Intrinsic::amdgcn_workgroup_id_x:
56600b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
56610b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
56620b57cec5SDimitry Andric   case Intrinsic::amdgcn_workgroup_id_y:
56630b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
56640b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
56650b57cec5SDimitry Andric   case Intrinsic::amdgcn_workgroup_id_z:
56660b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
56670b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
5668*fcaf7f86SDimitry Andric   case Intrinsic::amdgcn_lds_kernel_id:
5669*fcaf7f86SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
5670*fcaf7f86SDimitry Andric                                       AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
56710b57cec5SDimitry Andric   case Intrinsic::amdgcn_dispatch_ptr:
56720b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
56730b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
56740b57cec5SDimitry Andric   case Intrinsic::amdgcn_queue_ptr:
56750b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
56760b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
56770b57cec5SDimitry Andric   case Intrinsic::amdgcn_implicit_buffer_ptr:
56780b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(
56790b57cec5SDimitry Andric       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
56800b57cec5SDimitry Andric   case Intrinsic::amdgcn_dispatch_id:
56810b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
56820b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
568381ad6265SDimitry Andric   case Intrinsic::r600_read_ngroups_x:
568481ad6265SDimitry Andric     // TODO: Emit error for hsa
568581ad6265SDimitry Andric     return legalizeKernargMemParameter(MI, B,
568681ad6265SDimitry Andric                                        SI::KernelInputOffsets::NGROUPS_X);
568781ad6265SDimitry Andric   case Intrinsic::r600_read_ngroups_y:
568881ad6265SDimitry Andric     return legalizeKernargMemParameter(MI, B,
568981ad6265SDimitry Andric                                        SI::KernelInputOffsets::NGROUPS_Y);
569081ad6265SDimitry Andric   case Intrinsic::r600_read_ngroups_z:
569181ad6265SDimitry Andric     return legalizeKernargMemParameter(MI, B,
569281ad6265SDimitry Andric                                        SI::KernelInputOffsets::NGROUPS_Z);
569381ad6265SDimitry Andric   case Intrinsic::r600_read_local_size_x:
569481ad6265SDimitry Andric     // TODO: Could insert G_ASSERT_ZEXT from s16
569581ad6265SDimitry Andric     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X);
569681ad6265SDimitry Andric   case Intrinsic::r600_read_local_size_y:
569781ad6265SDimitry Andric     // TODO: Could insert G_ASSERT_ZEXT from s16
569881ad6265SDimitry Andric     return legalizeKernargMemParameter(MI, B,  SI::KernelInputOffsets::LOCAL_SIZE_Y);
569981ad6265SDimitry Andric     // TODO: Could insert G_ASSERT_ZEXT from s16
570081ad6265SDimitry Andric   case Intrinsic::r600_read_local_size_z:
570181ad6265SDimitry Andric     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z);
570281ad6265SDimitry Andric   case Intrinsic::r600_read_global_size_x:
570381ad6265SDimitry Andric     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X);
570481ad6265SDimitry Andric   case Intrinsic::r600_read_global_size_y:
570581ad6265SDimitry Andric     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y);
570681ad6265SDimitry Andric   case Intrinsic::r600_read_global_size_z:
570781ad6265SDimitry Andric     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z);
57088bcb0991SDimitry Andric   case Intrinsic::amdgcn_fdiv_fast:
57098bcb0991SDimitry Andric     return legalizeFDIVFastIntrin(MI, MRI, B);
57108bcb0991SDimitry Andric   case Intrinsic::amdgcn_is_shared:
57118bcb0991SDimitry Andric     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
57128bcb0991SDimitry Andric   case Intrinsic::amdgcn_is_private:
57138bcb0991SDimitry Andric     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
57148bcb0991SDimitry Andric   case Intrinsic::amdgcn_wavefrontsize: {
57158bcb0991SDimitry Andric     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
57168bcb0991SDimitry Andric     MI.eraseFromParent();
57178bcb0991SDimitry Andric     return true;
57188bcb0991SDimitry Andric   }
57195ffd83dbSDimitry Andric   case Intrinsic::amdgcn_s_buffer_load:
5720e8d8bef9SDimitry Andric     return legalizeSBufferLoad(Helper, MI);
57218bcb0991SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_store:
57225ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_store:
57235ffd83dbSDimitry Andric     return legalizeBufferStore(MI, MRI, B, false, false);
57248bcb0991SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_store_format:
57255ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_store_format:
57265ffd83dbSDimitry Andric     return legalizeBufferStore(MI, MRI, B, false, true);
57275ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_tbuffer_store:
57285ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_tbuffer_store:
57295ffd83dbSDimitry Andric     return legalizeBufferStore(MI, MRI, B, true, true);
57305ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_load:
57315ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_load:
57325ffd83dbSDimitry Andric     return legalizeBufferLoad(MI, MRI, B, false, false);
57335ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_load_format:
57345ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_load_format:
57355ffd83dbSDimitry Andric     return legalizeBufferLoad(MI, MRI, B, true, false);
57365ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_tbuffer_load:
57375ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_tbuffer_load:
57385ffd83dbSDimitry Andric     return legalizeBufferLoad(MI, MRI, B, true, true);
57395ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
57405ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
57415ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_add:
57425ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_add:
57435ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
57445ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
57455ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
57465ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
57475ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
57485ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
57495ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
57505ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
57515ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
57525ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
57535ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_and:
57545ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_and:
57555ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_or:
57565ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_or:
57575ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
57585ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
57595ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
57605ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
57615ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
57625ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
57635ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
57645ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
5765fe6060f1SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
5766fe6060f1SDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
5767fe6060f1SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
5768fe6060f1SDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
57695ffd83dbSDimitry Andric     return legalizeBufferAtomic(MI, B, IntrID);
577004eeddc0SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
577104eeddc0SDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_fadd: {
577204eeddc0SDimitry Andric     Register DstReg = MI.getOperand(0).getReg();
577381ad6265SDimitry Andric     if (!MRI.use_empty(DstReg) &&
577481ad6265SDimitry Andric         !AMDGPU::hasAtomicFaddRtnForTy(ST, MRI.getType(DstReg))) {
577504eeddc0SDimitry Andric       Function &F = B.getMF().getFunction();
577604eeddc0SDimitry Andric       DiagnosticInfoUnsupported NoFpRet(
577704eeddc0SDimitry Andric           F, "return versions of fp atomics not supported", B.getDebugLoc(),
577804eeddc0SDimitry Andric           DS_Error);
577904eeddc0SDimitry Andric       F.getContext().diagnose(NoFpRet);
578004eeddc0SDimitry Andric       B.buildUndef(DstReg);
578104eeddc0SDimitry Andric       MI.eraseFromParent();
578204eeddc0SDimitry Andric       return true;
578304eeddc0SDimitry Andric     }
578404eeddc0SDimitry Andric 
578504eeddc0SDimitry Andric     return legalizeBufferAtomic(MI, B, IntrID);
578604eeddc0SDimitry Andric   }
57875ffd83dbSDimitry Andric   case Intrinsic::amdgcn_atomic_inc:
57885ffd83dbSDimitry Andric     return legalizeAtomicIncDec(MI, B, true);
57895ffd83dbSDimitry Andric   case Intrinsic::amdgcn_atomic_dec:
57905ffd83dbSDimitry Andric     return legalizeAtomicIncDec(MI, B, false);
57915ffd83dbSDimitry Andric   case Intrinsic::trap:
57925ffd83dbSDimitry Andric     return legalizeTrapIntrinsic(MI, MRI, B);
57935ffd83dbSDimitry Andric   case Intrinsic::debugtrap:
57945ffd83dbSDimitry Andric     return legalizeDebugTrapIntrinsic(MI, MRI, B);
5795e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_rsq_clamp:
5796e8d8bef9SDimitry Andric     return legalizeRsqClampIntrinsic(MI, MRI, B);
5797e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_ds_fadd:
5798e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_ds_fmin:
5799e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_ds_fmax:
5800e8d8bef9SDimitry Andric     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
5801e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_image_bvh_intersect_ray:
5802e8d8bef9SDimitry Andric     return legalizeBVHIntrinsic(MI, B);
58035ffd83dbSDimitry Andric   default: {
58045ffd83dbSDimitry Andric     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
58055ffd83dbSDimitry Andric             AMDGPU::getImageDimIntrinsicInfo(IntrID))
58065ffd83dbSDimitry Andric       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
58070b57cec5SDimitry Andric     return true;
58080b57cec5SDimitry Andric   }
58095ffd83dbSDimitry Andric   }
58100b57cec5SDimitry Andric 
58110b57cec5SDimitry Andric   return true;
58120b57cec5SDimitry Andric }
5813