xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (revision fe6060f10f634930ff71b7c50291ddc610da2475)
10b57cec5SDimitry Andric //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
20b57cec5SDimitry Andric //
30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
60b57cec5SDimitry Andric //
70b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
80b57cec5SDimitry Andric /// \file
90b57cec5SDimitry Andric /// This file implements the targeting of the Machinelegalizer class for
100b57cec5SDimitry Andric /// AMDGPU.
110b57cec5SDimitry Andric /// \todo This should be generated by TableGen.
120b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
130b57cec5SDimitry Andric 
145ffd83dbSDimitry Andric #include "AMDGPULegalizerInfo.h"
158bcb0991SDimitry Andric 
160b57cec5SDimitry Andric #include "AMDGPU.h"
175ffd83dbSDimitry Andric #include "AMDGPUGlobalISelUtils.h"
18e8d8bef9SDimitry Andric #include "AMDGPUInstrInfo.h"
190b57cec5SDimitry Andric #include "AMDGPUTargetMachine.h"
200b57cec5SDimitry Andric #include "SIMachineFunctionInfo.h"
21*fe6060f1SDimitry Andric #include "Utils/AMDGPUBaseInfo.h"
225ffd83dbSDimitry Andric #include "llvm/ADT/ScopeExit.h"
23*fe6060f1SDimitry Andric #include "llvm/BinaryFormat/ELF.h"
240b57cec5SDimitry Andric #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
255ffd83dbSDimitry Andric #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26e8d8bef9SDimitry Andric #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
278bcb0991SDimitry Andric #include "llvm/IR/DiagnosticInfo.h"
28e8d8bef9SDimitry Andric #include "llvm/IR/IntrinsicsAMDGPU.h"
290b57cec5SDimitry Andric 
300b57cec5SDimitry Andric #define DEBUG_TYPE "amdgpu-legalinfo"
310b57cec5SDimitry Andric 
320b57cec5SDimitry Andric using namespace llvm;
330b57cec5SDimitry Andric using namespace LegalizeActions;
340b57cec5SDimitry Andric using namespace LegalizeMutations;
350b57cec5SDimitry Andric using namespace LegalityPredicates;
365ffd83dbSDimitry Andric using namespace MIPatternMatch;
370b57cec5SDimitry Andric 
385ffd83dbSDimitry Andric // Hack until load/store selection patterns support any tuple of legal types.
395ffd83dbSDimitry Andric static cl::opt<bool> EnableNewLegality(
405ffd83dbSDimitry Andric   "amdgpu-global-isel-new-legality",
415ffd83dbSDimitry Andric   cl::desc("Use GlobalISel desired legality, rather than try to use"
425ffd83dbSDimitry Andric            "rules compatible with selection patterns"),
435ffd83dbSDimitry Andric   cl::init(false),
445ffd83dbSDimitry Andric   cl::ReallyHidden);
450b57cec5SDimitry Andric 
// Largest type size, in bits, treated as fitting a register for legalization
// purposes (see isRegisterSize and the MaxScalar clamp below).
static constexpr unsigned MaxRegisterSize = 1024;
475ffd83dbSDimitry Andric 
485ffd83dbSDimitry Andric // Round the number of elements to the next power of two elements
495ffd83dbSDimitry Andric static LLT getPow2VectorType(LLT Ty) {
505ffd83dbSDimitry Andric   unsigned NElts = Ty.getNumElements();
515ffd83dbSDimitry Andric   unsigned Pow2NElts = 1 <<  Log2_32_Ceil(NElts);
52*fe6060f1SDimitry Andric   return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
530b57cec5SDimitry Andric }
540b57cec5SDimitry Andric 
555ffd83dbSDimitry Andric // Round the number of bits to the next power of two bits
565ffd83dbSDimitry Andric static LLT getPow2ScalarType(LLT Ty) {
575ffd83dbSDimitry Andric   unsigned Bits = Ty.getSizeInBits();
585ffd83dbSDimitry Andric   unsigned Pow2Bits = 1 <<  Log2_32_Ceil(Bits);
595ffd83dbSDimitry Andric   return LLT::scalar(Pow2Bits);
608bcb0991SDimitry Andric }
618bcb0991SDimitry Andric 
62e8d8bef9SDimitry Andric /// \returs true if this is an odd sized vector which should widen by adding an
63e8d8bef9SDimitry Andric /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
64e8d8bef9SDimitry Andric /// excludes s1 vectors, which should always be scalarized.
650b57cec5SDimitry Andric static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
660b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
670b57cec5SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
68e8d8bef9SDimitry Andric     if (!Ty.isVector())
69e8d8bef9SDimitry Andric       return false;
70e8d8bef9SDimitry Andric 
71e8d8bef9SDimitry Andric     const LLT EltTy = Ty.getElementType();
72e8d8bef9SDimitry Andric     const unsigned EltSize = EltTy.getSizeInBits();
73e8d8bef9SDimitry Andric     return Ty.getNumElements() % 2 != 0 &&
74e8d8bef9SDimitry Andric            EltSize > 1 && EltSize < 32 &&
758bcb0991SDimitry Andric            Ty.getSizeInBits() % 32 != 0;
768bcb0991SDimitry Andric   };
778bcb0991SDimitry Andric }
788bcb0991SDimitry Andric 
79e8d8bef9SDimitry Andric static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
80e8d8bef9SDimitry Andric   return [=](const LegalityQuery &Query) {
81e8d8bef9SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
82e8d8bef9SDimitry Andric     return Ty.getSizeInBits() % 32 == 0;
83e8d8bef9SDimitry Andric   };
84e8d8bef9SDimitry Andric }
85e8d8bef9SDimitry Andric 
868bcb0991SDimitry Andric static LegalityPredicate isWideVec16(unsigned TypeIdx) {
878bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
888bcb0991SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
898bcb0991SDimitry Andric     const LLT EltTy = Ty.getScalarType();
908bcb0991SDimitry Andric     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
910b57cec5SDimitry Andric   };
920b57cec5SDimitry Andric }
930b57cec5SDimitry Andric 
940b57cec5SDimitry Andric static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
950b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
960b57cec5SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
970b57cec5SDimitry Andric     const LLT EltTy = Ty.getElementType();
98*fe6060f1SDimitry Andric     return std::make_pair(TypeIdx,
99*fe6060f1SDimitry Andric                           LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
1000b57cec5SDimitry Andric   };
1010b57cec5SDimitry Andric }
1020b57cec5SDimitry Andric 
1030b57cec5SDimitry Andric static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
1040b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
1050b57cec5SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
1060b57cec5SDimitry Andric     const LLT EltTy = Ty.getElementType();
1070b57cec5SDimitry Andric     unsigned Size = Ty.getSizeInBits();
1080b57cec5SDimitry Andric     unsigned Pieces = (Size + 63) / 64;
1090b57cec5SDimitry Andric     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
110*fe6060f1SDimitry Andric     return std::make_pair(
111*fe6060f1SDimitry Andric         TypeIdx,
112*fe6060f1SDimitry Andric         LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy));
1130b57cec5SDimitry Andric   };
1140b57cec5SDimitry Andric }
1150b57cec5SDimitry Andric 
1168bcb0991SDimitry Andric // Increase the number of vector elements to reach the next multiple of 32-bit
1178bcb0991SDimitry Andric // type.
1188bcb0991SDimitry Andric static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
1198bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
1208bcb0991SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
1218bcb0991SDimitry Andric 
1228bcb0991SDimitry Andric     const LLT EltTy = Ty.getElementType();
1238bcb0991SDimitry Andric     const int Size = Ty.getSizeInBits();
1248bcb0991SDimitry Andric     const int EltSize = EltTy.getSizeInBits();
1258bcb0991SDimitry Andric     const int NextMul32 = (Size + 31) / 32;
1268bcb0991SDimitry Andric 
1278bcb0991SDimitry Andric     assert(EltSize < 32);
1288bcb0991SDimitry Andric 
1298bcb0991SDimitry Andric     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
130*fe6060f1SDimitry Andric     return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
1318bcb0991SDimitry Andric   };
1328bcb0991SDimitry Andric }
1338bcb0991SDimitry Andric 
134e8d8bef9SDimitry Andric static LLT getBitcastRegisterType(const LLT Ty) {
135e8d8bef9SDimitry Andric   const unsigned Size = Ty.getSizeInBits();
1365ffd83dbSDimitry Andric 
1375ffd83dbSDimitry Andric   LLT CoercedTy;
1385ffd83dbSDimitry Andric   if (Size <= 32) {
1395ffd83dbSDimitry Andric     // <2 x s8> -> s16
1405ffd83dbSDimitry Andric     // <4 x s8> -> s32
141e8d8bef9SDimitry Andric     return LLT::scalar(Size);
142e8d8bef9SDimitry Andric   }
1435ffd83dbSDimitry Andric 
144*fe6060f1SDimitry Andric   return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
145e8d8bef9SDimitry Andric }
146e8d8bef9SDimitry Andric 
147e8d8bef9SDimitry Andric static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
148e8d8bef9SDimitry Andric   return [=](const LegalityQuery &Query) {
149e8d8bef9SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
150e8d8bef9SDimitry Andric     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
151e8d8bef9SDimitry Andric   };
152e8d8bef9SDimitry Andric }
153e8d8bef9SDimitry Andric 
154e8d8bef9SDimitry Andric static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
155e8d8bef9SDimitry Andric   return [=](const LegalityQuery &Query) {
156e8d8bef9SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
157e8d8bef9SDimitry Andric     unsigned Size = Ty.getSizeInBits();
158e8d8bef9SDimitry Andric     assert(Size % 32 == 0);
159*fe6060f1SDimitry Andric     return std::make_pair(
160*fe6060f1SDimitry Andric         TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
1615ffd83dbSDimitry Andric   };
1625ffd83dbSDimitry Andric }
1635ffd83dbSDimitry Andric 
1648bcb0991SDimitry Andric static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
1658bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
1668bcb0991SDimitry Andric     const LLT QueryTy = Query.Types[TypeIdx];
1678bcb0991SDimitry Andric     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
1688bcb0991SDimitry Andric   };
1698bcb0991SDimitry Andric }
1708bcb0991SDimitry Andric 
1710b57cec5SDimitry Andric static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
1720b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
1730b57cec5SDimitry Andric     const LLT QueryTy = Query.Types[TypeIdx];
1740b57cec5SDimitry Andric     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
1750b57cec5SDimitry Andric   };
1760b57cec5SDimitry Andric }
1770b57cec5SDimitry Andric 
1780b57cec5SDimitry Andric static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
1790b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
1800b57cec5SDimitry Andric     const LLT QueryTy = Query.Types[TypeIdx];
1810b57cec5SDimitry Andric     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
1820b57cec5SDimitry Andric   };
1830b57cec5SDimitry Andric }
1840b57cec5SDimitry Andric 
1855ffd83dbSDimitry Andric static bool isRegisterSize(unsigned Size) {
1865ffd83dbSDimitry Andric   return Size % 32 == 0 && Size <= MaxRegisterSize;
1875ffd83dbSDimitry Andric }
1885ffd83dbSDimitry Andric 
1895ffd83dbSDimitry Andric static bool isRegisterVectorElementType(LLT EltTy) {
1905ffd83dbSDimitry Andric   const int EltSize = EltTy.getSizeInBits();
1915ffd83dbSDimitry Andric   return EltSize == 16 || EltSize % 32 == 0;
1925ffd83dbSDimitry Andric }
1935ffd83dbSDimitry Andric 
1945ffd83dbSDimitry Andric static bool isRegisterVectorType(LLT Ty) {
1950b57cec5SDimitry Andric   const int EltSize = Ty.getElementType().getSizeInBits();
1960b57cec5SDimitry Andric   return EltSize == 32 || EltSize == 64 ||
1970b57cec5SDimitry Andric          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
1980b57cec5SDimitry Andric          EltSize == 128 || EltSize == 256;
1990b57cec5SDimitry Andric }
2000b57cec5SDimitry Andric 
2015ffd83dbSDimitry Andric static bool isRegisterType(LLT Ty) {
2025ffd83dbSDimitry Andric   if (!isRegisterSize(Ty.getSizeInBits()))
2035ffd83dbSDimitry Andric     return false;
2045ffd83dbSDimitry Andric 
2055ffd83dbSDimitry Andric   if (Ty.isVector())
2065ffd83dbSDimitry Andric     return isRegisterVectorType(Ty);
2075ffd83dbSDimitry Andric 
2085ffd83dbSDimitry Andric   return true;
2095ffd83dbSDimitry Andric }
2105ffd83dbSDimitry Andric 
2115ffd83dbSDimitry Andric // Any combination of 32 or 64-bit elements up the maximum register size, and
2125ffd83dbSDimitry Andric // multiples of v2s16.
2135ffd83dbSDimitry Andric static LegalityPredicate isRegisterType(unsigned TypeIdx) {
2145ffd83dbSDimitry Andric   return [=](const LegalityQuery &Query) {
2155ffd83dbSDimitry Andric     return isRegisterType(Query.Types[TypeIdx]);
2168bcb0991SDimitry Andric   };
2178bcb0991SDimitry Andric }
2188bcb0991SDimitry Andric 
2195ffd83dbSDimitry Andric static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
2208bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
2215ffd83dbSDimitry Andric     const LLT QueryTy = Query.Types[TypeIdx];
2225ffd83dbSDimitry Andric     if (!QueryTy.isVector())
2235ffd83dbSDimitry Andric       return false;
2245ffd83dbSDimitry Andric     const LLT EltTy = QueryTy.getElementType();
2255ffd83dbSDimitry Andric     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
2268bcb0991SDimitry Andric   };
2278bcb0991SDimitry Andric }
2288bcb0991SDimitry Andric 
229*fe6060f1SDimitry Andric // If we have a truncating store or an extending load with a data size larger
230*fe6060f1SDimitry Andric // than 32-bits, we need to reduce to a 32-bit type.
231*fe6060f1SDimitry Andric static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
2328bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
2338bcb0991SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
2348bcb0991SDimitry Andric     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
235*fe6060f1SDimitry Andric            Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
2360b57cec5SDimitry Andric   };
2370b57cec5SDimitry Andric }
2380b57cec5SDimitry Andric 
// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
//
// Returns the widest memory access size, in bits, considered legal for a load
// (\p IsLoad true) or store in address space \p AS on subtarget \p ST.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    // Flat-scratch lowering supports wide accesses; otherwise scratch is
    // limited to one dword per access.
    return ST.enableFlatScratch() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable for
    // global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written in a
    // kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}
2665ffd83dbSDimitry Andric 
// Returns true if the combination of register type (Types[0]), pointer type
// (Types[1]), memory type, and alignment described by \p Query is a legal
// load/store size for subtarget \p ST. Checks are ordered: address space,
// vector ext-loads, scalar ext-load widths, per-address-space maximums, the
// whitelist of supported access sizes, and finally alignment.
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
  unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // Do not handle extending vector loads.
  if (Ty.isVector() && MemSize != RegSize)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough, but
  // we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    // 96-bit (dwordx3) accesses only exist on some subtargets.
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  // Underaligned accesses are only legal if the target says the misaligned
  // form is allowed for this size/address space.
  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}
3325ffd83dbSDimitry Andric 
3335ffd83dbSDimitry Andric // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
3345ffd83dbSDimitry Andric // workaround this. Eventually it should ignore the type for loads and only care
3355ffd83dbSDimitry Andric // about the size. Return true in cases where we will workaround this for now by
3365ffd83dbSDimitry Andric // bitcasting.
3375ffd83dbSDimitry Andric static bool loadStoreBitcastWorkaround(const LLT Ty) {
3385ffd83dbSDimitry Andric   if (EnableNewLegality)
3395ffd83dbSDimitry Andric     return false;
3405ffd83dbSDimitry Andric 
3415ffd83dbSDimitry Andric   const unsigned Size = Ty.getSizeInBits();
3425ffd83dbSDimitry Andric   if (Size <= 64)
3435ffd83dbSDimitry Andric     return false;
3445ffd83dbSDimitry Andric   if (!Ty.isVector())
3455ffd83dbSDimitry Andric     return true;
346e8d8bef9SDimitry Andric 
347e8d8bef9SDimitry Andric   LLT EltTy = Ty.getElementType();
348e8d8bef9SDimitry Andric   if (EltTy.isPointer())
349e8d8bef9SDimitry Andric     return true;
350e8d8bef9SDimitry Andric 
351e8d8bef9SDimitry Andric   unsigned EltSize = EltTy.getSizeInBits();
3525ffd83dbSDimitry Andric   return EltSize != 32 && EltSize != 64;
3535ffd83dbSDimitry Andric }
3545ffd83dbSDimitry Andric 
355*fe6060f1SDimitry Andric static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
3565ffd83dbSDimitry Andric   const LLT Ty = Query.Types[0];
357*fe6060f1SDimitry Andric   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
3585ffd83dbSDimitry Andric          !loadStoreBitcastWorkaround(Ty);
3595ffd83dbSDimitry Andric }
3605ffd83dbSDimitry Andric 
/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
///
/// \p Ty is the register type and \p MemTy the memory type of the access.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const LLT MemTy) {
  const unsigned MemSizeInBits = MemTy.getSizeInBits();
  const unsigned Size = Ty.getSizeInBits();
  // Extending/truncating access: only bitcast small vectors (<= 32 bits).
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  // Register-legal types the selector can't yet handle go through a bitcast.
  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;

  // Don't try to handle bitcasting vector ext loads for now.
  return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
         (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}
378e8d8bef9SDimitry Andric 
/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this case when the memory access itself
/// changes, not the size of the result register.
///
/// \p MemoryTy is the memory type, \p AlignInBits the access alignment,
/// \p AddrSpace the pointer's address space, and \p Opcode the load opcode
/// (passed through to maxSizeForAddrSpace as its IsLoad-like argument).
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
                            unsigned AlignInBits, unsigned AddrSpace,
                            unsigned Opcode) {
  unsigned SizeInBits = MemoryTy.getSizeInBits();
  // We don't want to widen cases that are naturally legal.
  if (isPowerOf2_32(SizeInBits))
    return false;

  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
  // end up widening these for a scalar load during RegBankSelect, since there
  // aren't 96-bit scalar loads.
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
    return false;

  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
    return false;

  // A load is known dereferenceable up to the alignment, so it's legal to widen
  // to it.
  //
  // TODO: Could check dereferenceable for less aligned cases.
  unsigned RoundedSize = NextPowerOf2(SizeInBits);
  if (AlignInBits < RoundedSize)
    return false;

  // Do not widen if it would introduce a slow unaligned load.
  const SITargetLowering *TLI = ST.getTargetLowering();
  bool Fast = false;
  return TLI->allowsMisalignedMemoryAccessesImpl(
             RoundedSize, AddrSpace, Align(AlignInBits / 8),
             MachineMemOperand::MOLoad, &Fast) &&
         Fast;
}
415e8d8bef9SDimitry Andric 
416e8d8bef9SDimitry Andric static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
417e8d8bef9SDimitry Andric                             unsigned Opcode) {
418e8d8bef9SDimitry Andric   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
419e8d8bef9SDimitry Andric     return false;
420e8d8bef9SDimitry Andric 
421*fe6060f1SDimitry Andric   return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
422e8d8bef9SDimitry Andric                          Query.MMODescrs[0].AlignInBits,
423e8d8bef9SDimitry Andric                          Query.Types[1].getAddressSpace(), Opcode);
424e8d8bef9SDimitry Andric }
425e8d8bef9SDimitry Andric 
4260b57cec5SDimitry Andric AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
4270b57cec5SDimitry Andric                                          const GCNTargetMachine &TM)
4280b57cec5SDimitry Andric   :  ST(ST_) {
4290b57cec5SDimitry Andric   using namespace TargetOpcode;
4300b57cec5SDimitry Andric 
4310b57cec5SDimitry Andric   auto GetAddrSpacePtr = [&TM](unsigned AS) {
4320b57cec5SDimitry Andric     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
4330b57cec5SDimitry Andric   };
4340b57cec5SDimitry Andric 
4350b57cec5SDimitry Andric   const LLT S1 = LLT::scalar(1);
436e8d8bef9SDimitry Andric   const LLT S8 = LLT::scalar(8);
4370b57cec5SDimitry Andric   const LLT S16 = LLT::scalar(16);
4380b57cec5SDimitry Andric   const LLT S32 = LLT::scalar(32);
4390b57cec5SDimitry Andric   const LLT S64 = LLT::scalar(64);
4400b57cec5SDimitry Andric   const LLT S128 = LLT::scalar(128);
4410b57cec5SDimitry Andric   const LLT S256 = LLT::scalar(256);
4425ffd83dbSDimitry Andric   const LLT S512 = LLT::scalar(512);
4435ffd83dbSDimitry Andric   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
4440b57cec5SDimitry Andric 
445*fe6060f1SDimitry Andric   const LLT V2S8 = LLT::fixed_vector(2, 8);
446*fe6060f1SDimitry Andric   const LLT V2S16 = LLT::fixed_vector(2, 16);
447*fe6060f1SDimitry Andric   const LLT V4S16 = LLT::fixed_vector(4, 16);
4480b57cec5SDimitry Andric 
449*fe6060f1SDimitry Andric   const LLT V2S32 = LLT::fixed_vector(2, 32);
450*fe6060f1SDimitry Andric   const LLT V3S32 = LLT::fixed_vector(3, 32);
451*fe6060f1SDimitry Andric   const LLT V4S32 = LLT::fixed_vector(4, 32);
452*fe6060f1SDimitry Andric   const LLT V5S32 = LLT::fixed_vector(5, 32);
453*fe6060f1SDimitry Andric   const LLT V6S32 = LLT::fixed_vector(6, 32);
454*fe6060f1SDimitry Andric   const LLT V7S32 = LLT::fixed_vector(7, 32);
455*fe6060f1SDimitry Andric   const LLT V8S32 = LLT::fixed_vector(8, 32);
456*fe6060f1SDimitry Andric   const LLT V9S32 = LLT::fixed_vector(9, 32);
457*fe6060f1SDimitry Andric   const LLT V10S32 = LLT::fixed_vector(10, 32);
458*fe6060f1SDimitry Andric   const LLT V11S32 = LLT::fixed_vector(11, 32);
459*fe6060f1SDimitry Andric   const LLT V12S32 = LLT::fixed_vector(12, 32);
460*fe6060f1SDimitry Andric   const LLT V13S32 = LLT::fixed_vector(13, 32);
461*fe6060f1SDimitry Andric   const LLT V14S32 = LLT::fixed_vector(14, 32);
462*fe6060f1SDimitry Andric   const LLT V15S32 = LLT::fixed_vector(15, 32);
463*fe6060f1SDimitry Andric   const LLT V16S32 = LLT::fixed_vector(16, 32);
464*fe6060f1SDimitry Andric   const LLT V32S32 = LLT::fixed_vector(32, 32);
4650b57cec5SDimitry Andric 
466*fe6060f1SDimitry Andric   const LLT V2S64 = LLT::fixed_vector(2, 64);
467*fe6060f1SDimitry Andric   const LLT V3S64 = LLT::fixed_vector(3, 64);
468*fe6060f1SDimitry Andric   const LLT V4S64 = LLT::fixed_vector(4, 64);
469*fe6060f1SDimitry Andric   const LLT V5S64 = LLT::fixed_vector(5, 64);
470*fe6060f1SDimitry Andric   const LLT V6S64 = LLT::fixed_vector(6, 64);
471*fe6060f1SDimitry Andric   const LLT V7S64 = LLT::fixed_vector(7, 64);
472*fe6060f1SDimitry Andric   const LLT V8S64 = LLT::fixed_vector(8, 64);
473*fe6060f1SDimitry Andric   const LLT V16S64 = LLT::fixed_vector(16, 64);
4740b57cec5SDimitry Andric 
4750b57cec5SDimitry Andric   std::initializer_list<LLT> AllS32Vectors =
4760b57cec5SDimitry Andric     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
4778bcb0991SDimitry Andric      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
4780b57cec5SDimitry Andric   std::initializer_list<LLT> AllS64Vectors =
4798bcb0991SDimitry Andric     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
4800b57cec5SDimitry Andric 
4810b57cec5SDimitry Andric   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
4820b57cec5SDimitry Andric   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
4838bcb0991SDimitry Andric   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
4840b57cec5SDimitry Andric   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
4858bcb0991SDimitry Andric   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
4860b57cec5SDimitry Andric   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
4870b57cec5SDimitry Andric   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
4880b57cec5SDimitry Andric 
4890b57cec5SDimitry Andric   const LLT CodePtr = FlatPtr;
4900b57cec5SDimitry Andric 
4910b57cec5SDimitry Andric   const std::initializer_list<LLT> AddrSpaces64 = {
4920b57cec5SDimitry Andric     GlobalPtr, ConstantPtr, FlatPtr
4930b57cec5SDimitry Andric   };
4940b57cec5SDimitry Andric 
4950b57cec5SDimitry Andric   const std::initializer_list<LLT> AddrSpaces32 = {
4968bcb0991SDimitry Andric     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
4970b57cec5SDimitry Andric   };
4980b57cec5SDimitry Andric 
4990b57cec5SDimitry Andric   const std::initializer_list<LLT> FPTypesBase = {
5000b57cec5SDimitry Andric     S32, S64
5010b57cec5SDimitry Andric   };
5020b57cec5SDimitry Andric 
5030b57cec5SDimitry Andric   const std::initializer_list<LLT> FPTypes16 = {
5040b57cec5SDimitry Andric     S32, S64, S16
5050b57cec5SDimitry Andric   };
5060b57cec5SDimitry Andric 
5070b57cec5SDimitry Andric   const std::initializer_list<LLT> FPTypesPK16 = {
5080b57cec5SDimitry Andric     S32, S64, S16, V2S16
5090b57cec5SDimitry Andric   };
5100b57cec5SDimitry Andric 
5115ffd83dbSDimitry Andric   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
5125ffd83dbSDimitry Andric 
513*fe6060f1SDimitry Andric   // s1 for VCC branches, s32 for SCC branches.
514*fe6060f1SDimitry Andric   getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
5150b57cec5SDimitry Andric 
5160b57cec5SDimitry Andric   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
5170b57cec5SDimitry Andric   // elements for v3s16
5180b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_PHI)
519e8d8bef9SDimitry Andric     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
5200b57cec5SDimitry Andric     .legalFor(AllS32Vectors)
5210b57cec5SDimitry Andric     .legalFor(AllS64Vectors)
5220b57cec5SDimitry Andric     .legalFor(AddrSpaces64)
5230b57cec5SDimitry Andric     .legalFor(AddrSpaces32)
524e8d8bef9SDimitry Andric     .legalIf(isPointer(0))
525e8d8bef9SDimitry Andric     .clampScalar(0, S16, S256)
5260b57cec5SDimitry Andric     .widenScalarToNextPow2(0, 32)
5270b57cec5SDimitry Andric     .clampMaxNumElements(0, S32, 16)
5280b57cec5SDimitry Andric     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
529e8d8bef9SDimitry Andric     .scalarize(0);
5300b57cec5SDimitry Andric 
531e8d8bef9SDimitry Andric   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
532e8d8bef9SDimitry Andric     // Full set of gfx9 features.
5335ffd83dbSDimitry Andric     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
5345ffd83dbSDimitry Andric       .legalFor({S32, S16, V2S16})
5355ffd83dbSDimitry Andric       .clampScalar(0, S16, S32)
5365ffd83dbSDimitry Andric       .clampMaxNumElements(0, S16, 2)
5375ffd83dbSDimitry Andric       .scalarize(0)
5385ffd83dbSDimitry Andric       .widenScalarToNextPow2(0, 32);
539e8d8bef9SDimitry Andric 
540e8d8bef9SDimitry Andric     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
541e8d8bef9SDimitry Andric       .legalFor({S32, S16, V2S16}) // Clamp modifier
542e8d8bef9SDimitry Andric       .minScalarOrElt(0, S16)
543e8d8bef9SDimitry Andric       .clampMaxNumElements(0, S16, 2)
544e8d8bef9SDimitry Andric       .scalarize(0)
545e8d8bef9SDimitry Andric       .widenScalarToNextPow2(0, 32)
546e8d8bef9SDimitry Andric       .lower();
5475ffd83dbSDimitry Andric   } else if (ST.has16BitInsts()) {
5480b57cec5SDimitry Andric     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
5490b57cec5SDimitry Andric       .legalFor({S32, S16})
5500b57cec5SDimitry Andric       .clampScalar(0, S16, S32)
5515ffd83dbSDimitry Andric       .scalarize(0)
552e8d8bef9SDimitry Andric       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
553e8d8bef9SDimitry Andric 
554e8d8bef9SDimitry Andric     // Technically the saturating operations require clamp bit support, but this
555e8d8bef9SDimitry Andric     // was introduced at the same time as 16-bit operations.
556e8d8bef9SDimitry Andric     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
557e8d8bef9SDimitry Andric       .legalFor({S32, S16}) // Clamp modifier
558e8d8bef9SDimitry Andric       .minScalar(0, S16)
559e8d8bef9SDimitry Andric       .scalarize(0)
560e8d8bef9SDimitry Andric       .widenScalarToNextPow2(0, 16)
561e8d8bef9SDimitry Andric       .lower();
562e8d8bef9SDimitry Andric 
563e8d8bef9SDimitry Andric     // We're just lowering this, but it helps get a better result to try to
564e8d8bef9SDimitry Andric     // coerce to the desired type first.
565e8d8bef9SDimitry Andric     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
566e8d8bef9SDimitry Andric       .minScalar(0, S16)
567e8d8bef9SDimitry Andric       .scalarize(0)
568e8d8bef9SDimitry Andric       .lower();
5690b57cec5SDimitry Andric   } else {
5700b57cec5SDimitry Andric     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
5710b57cec5SDimitry Andric       .legalFor({S32})
5720b57cec5SDimitry Andric       .clampScalar(0, S32, S32)
5730b57cec5SDimitry Andric       .scalarize(0);
574e8d8bef9SDimitry Andric 
575e8d8bef9SDimitry Andric     if (ST.hasIntClamp()) {
576e8d8bef9SDimitry Andric       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
577e8d8bef9SDimitry Andric         .legalFor({S32}) // Clamp modifier.
578e8d8bef9SDimitry Andric         .scalarize(0)
579e8d8bef9SDimitry Andric         .minScalarOrElt(0, S32)
580e8d8bef9SDimitry Andric         .lower();
581e8d8bef9SDimitry Andric     } else {
582e8d8bef9SDimitry Andric       // Clamp bit support was added in VI, along with 16-bit operations.
583e8d8bef9SDimitry Andric       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
584e8d8bef9SDimitry Andric         .minScalar(0, S32)
585e8d8bef9SDimitry Andric         .scalarize(0)
586e8d8bef9SDimitry Andric         .lower();
5870b57cec5SDimitry Andric     }
5880b57cec5SDimitry Andric 
589e8d8bef9SDimitry Andric     // FIXME: DAG expansion gets better results. The widening uses the smaller
590e8d8bef9SDimitry Andric     // range values and goes for the min/max lowering directly.
591e8d8bef9SDimitry Andric     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
592e8d8bef9SDimitry Andric       .minScalar(0, S32)
593e8d8bef9SDimitry Andric       .scalarize(0)
594e8d8bef9SDimitry Andric       .lower();
595e8d8bef9SDimitry Andric   }
596e8d8bef9SDimitry Andric 
597*fe6060f1SDimitry Andric   getActionDefinitionsBuilder(
598*fe6060f1SDimitry Andric       {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
5995ffd83dbSDimitry Andric       .customFor({S32, S64})
600480093f4SDimitry Andric       .clampScalar(0, S32, S64)
601480093f4SDimitry Andric       .widenScalarToNextPow2(0, 32)
602480093f4SDimitry Andric       .scalarize(0);
603480093f4SDimitry Andric 
604e8d8bef9SDimitry Andric   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
6050b57cec5SDimitry Andric                    .legalFor({S32})
606e8d8bef9SDimitry Andric                    .maxScalarOrElt(0, S32);
607e8d8bef9SDimitry Andric 
608e8d8bef9SDimitry Andric   if (ST.hasVOP3PInsts()) {
609e8d8bef9SDimitry Andric     Mulh
610e8d8bef9SDimitry Andric       .clampMaxNumElements(0, S8, 2)
611e8d8bef9SDimitry Andric       .lowerFor({V2S8});
612e8d8bef9SDimitry Andric   }
613e8d8bef9SDimitry Andric 
614e8d8bef9SDimitry Andric   Mulh
615e8d8bef9SDimitry Andric     .scalarize(0)
616e8d8bef9SDimitry Andric     .lower();
6170b57cec5SDimitry Andric 
6180b57cec5SDimitry Andric   // Report legal for any types we can handle anywhere. For the cases only legal
6190b57cec5SDimitry Andric   // on the SALU, RegBankSelect will be able to re-legalize.
6200b57cec5SDimitry Andric   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
6210b57cec5SDimitry Andric     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
6220b57cec5SDimitry Andric     .clampScalar(0, S32, S64)
6230b57cec5SDimitry Andric     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
6248bcb0991SDimitry Andric     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
6250b57cec5SDimitry Andric     .widenScalarToNextPow2(0)
6260b57cec5SDimitry Andric     .scalarize(0);
6270b57cec5SDimitry Andric 
6288bcb0991SDimitry Andric   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
6290b57cec5SDimitry Andric                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
630480093f4SDimitry Andric     .legalFor({{S32, S1}, {S32, S32}})
6315ffd83dbSDimitry Andric     .minScalar(0, S32)
6325ffd83dbSDimitry Andric     // TODO: .scalarize(0)
6338bcb0991SDimitry Andric     .lower();
6340b57cec5SDimitry Andric 
6350b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_BITCAST)
6360b57cec5SDimitry Andric     // Don't worry about the size constraint.
6378bcb0991SDimitry Andric     .legalIf(all(isRegisterType(0), isRegisterType(1)))
6385ffd83dbSDimitry Andric     .lower();
6390b57cec5SDimitry Andric 
6400b57cec5SDimitry Andric 
6410b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_CONSTANT)
6428bcb0991SDimitry Andric     .legalFor({S1, S32, S64, S16, GlobalPtr,
6430b57cec5SDimitry Andric                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
644e8d8bef9SDimitry Andric     .legalIf(isPointer(0))
6450b57cec5SDimitry Andric     .clampScalar(0, S32, S64)
646e8d8bef9SDimitry Andric     .widenScalarToNextPow2(0);
6470b57cec5SDimitry Andric 
6485ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_FCONSTANT)
6495ffd83dbSDimitry Andric     .legalFor({S32, S64, S16})
6505ffd83dbSDimitry Andric     .clampScalar(0, S16, S64);
6518bcb0991SDimitry Andric 
6525ffd83dbSDimitry Andric   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
6535ffd83dbSDimitry Andric       .legalIf(isRegisterType(0))
6545ffd83dbSDimitry Andric       // s1 and s16 are special cases because they have legal operations on
6555ffd83dbSDimitry Andric       // them, but don't really occupy registers in the normal way.
6565ffd83dbSDimitry Andric       .legalFor({S1, S16})
6575ffd83dbSDimitry Andric       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
6585ffd83dbSDimitry Andric       .clampScalarOrElt(0, S32, MaxScalar)
6595ffd83dbSDimitry Andric       .widenScalarToNextPow2(0, 32)
6605ffd83dbSDimitry Andric       .clampMaxNumElements(0, S32, 16);
6615ffd83dbSDimitry Andric 
662*fe6060f1SDimitry Andric   getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
6635ffd83dbSDimitry Andric 
6645ffd83dbSDimitry Andric   // If the amount is divergent, we have to do a wave reduction to get the
6655ffd83dbSDimitry Andric   // maximum value, so this is expanded during RegBankSelect.
6665ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
6675ffd83dbSDimitry Andric     .legalFor({{PrivatePtr, S32}});
6685ffd83dbSDimitry Andric 
6695ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
670e8d8bef9SDimitry Andric     .customIf(typeIsNot(0, PrivatePtr));
671e8d8bef9SDimitry Andric 
672*fe6060f1SDimitry Andric   getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
6730b57cec5SDimitry Andric 
6740b57cec5SDimitry Andric   auto &FPOpActions = getActionDefinitionsBuilder(
6758bcb0991SDimitry Andric     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
6760b57cec5SDimitry Andric     .legalFor({S32, S64});
6778bcb0991SDimitry Andric   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
6788bcb0991SDimitry Andric     .customFor({S32, S64});
6798bcb0991SDimitry Andric   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
6808bcb0991SDimitry Andric     .customFor({S32, S64});
6810b57cec5SDimitry Andric 
6820b57cec5SDimitry Andric   if (ST.has16BitInsts()) {
6830b57cec5SDimitry Andric     if (ST.hasVOP3PInsts())
6840b57cec5SDimitry Andric       FPOpActions.legalFor({S16, V2S16});
6850b57cec5SDimitry Andric     else
6860b57cec5SDimitry Andric       FPOpActions.legalFor({S16});
6878bcb0991SDimitry Andric 
6888bcb0991SDimitry Andric     TrigActions.customFor({S16});
6898bcb0991SDimitry Andric     FDIVActions.customFor({S16});
6900b57cec5SDimitry Andric   }
6910b57cec5SDimitry Andric 
6920b57cec5SDimitry Andric   auto &MinNumMaxNum = getActionDefinitionsBuilder({
6930b57cec5SDimitry Andric       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
6940b57cec5SDimitry Andric 
6950b57cec5SDimitry Andric   if (ST.hasVOP3PInsts()) {
6960b57cec5SDimitry Andric     MinNumMaxNum.customFor(FPTypesPK16)
697480093f4SDimitry Andric       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
6980b57cec5SDimitry Andric       .clampMaxNumElements(0, S16, 2)
6990b57cec5SDimitry Andric       .clampScalar(0, S16, S64)
7000b57cec5SDimitry Andric       .scalarize(0);
7010b57cec5SDimitry Andric   } else if (ST.has16BitInsts()) {
7020b57cec5SDimitry Andric     MinNumMaxNum.customFor(FPTypes16)
7030b57cec5SDimitry Andric       .clampScalar(0, S16, S64)
7040b57cec5SDimitry Andric       .scalarize(0);
7050b57cec5SDimitry Andric   } else {
7060b57cec5SDimitry Andric     MinNumMaxNum.customFor(FPTypesBase)
7070b57cec5SDimitry Andric       .clampScalar(0, S32, S64)
7080b57cec5SDimitry Andric       .scalarize(0);
7090b57cec5SDimitry Andric   }
7100b57cec5SDimitry Andric 
7110b57cec5SDimitry Andric   if (ST.hasVOP3PInsts())
7120b57cec5SDimitry Andric     FPOpActions.clampMaxNumElements(0, S16, 2);
7138bcb0991SDimitry Andric 
7140b57cec5SDimitry Andric   FPOpActions
7150b57cec5SDimitry Andric     .scalarize(0)
7160b57cec5SDimitry Andric     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
7170b57cec5SDimitry Andric 
7188bcb0991SDimitry Andric   TrigActions
7198bcb0991SDimitry Andric     .scalarize(0)
7208bcb0991SDimitry Andric     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
7218bcb0991SDimitry Andric 
7228bcb0991SDimitry Andric   FDIVActions
7238bcb0991SDimitry Andric     .scalarize(0)
7248bcb0991SDimitry Andric     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
7258bcb0991SDimitry Andric 
7268bcb0991SDimitry Andric   getActionDefinitionsBuilder({G_FNEG, G_FABS})
7278bcb0991SDimitry Andric     .legalFor(FPTypesPK16)
7288bcb0991SDimitry Andric     .clampMaxNumElements(0, S16, 2)
7298bcb0991SDimitry Andric     .scalarize(0)
7308bcb0991SDimitry Andric     .clampScalar(0, S16, S64);
7318bcb0991SDimitry Andric 
7320b57cec5SDimitry Andric   if (ST.has16BitInsts()) {
7338bcb0991SDimitry Andric     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
7340b57cec5SDimitry Andric       .legalFor({S32, S64, S16})
7350b57cec5SDimitry Andric       .scalarize(0)
7360b57cec5SDimitry Andric       .clampScalar(0, S16, S64);
7370b57cec5SDimitry Andric   } else {
7385ffd83dbSDimitry Andric     getActionDefinitionsBuilder(G_FSQRT)
7395ffd83dbSDimitry Andric       .legalFor({S32, S64})
7405ffd83dbSDimitry Andric       .scalarize(0)
7415ffd83dbSDimitry Andric       .clampScalar(0, S32, S64);
7425ffd83dbSDimitry Andric 
7435ffd83dbSDimitry Andric     if (ST.hasFractBug()) {
7445ffd83dbSDimitry Andric       getActionDefinitionsBuilder(G_FFLOOR)
7455ffd83dbSDimitry Andric         .customFor({S64})
7465ffd83dbSDimitry Andric         .legalFor({S32, S64})
7475ffd83dbSDimitry Andric         .scalarize(0)
7485ffd83dbSDimitry Andric         .clampScalar(0, S32, S64);
7495ffd83dbSDimitry Andric     } else {
7505ffd83dbSDimitry Andric       getActionDefinitionsBuilder(G_FFLOOR)
7510b57cec5SDimitry Andric         .legalFor({S32, S64})
7520b57cec5SDimitry Andric         .scalarize(0)
7530b57cec5SDimitry Andric         .clampScalar(0, S32, S64);
7540b57cec5SDimitry Andric     }
7555ffd83dbSDimitry Andric   }
7560b57cec5SDimitry Andric 
7570b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_FPTRUNC)
7580b57cec5SDimitry Andric     .legalFor({{S32, S64}, {S16, S32}})
7595ffd83dbSDimitry Andric     .scalarize(0)
7605ffd83dbSDimitry Andric     .lower();
7610b57cec5SDimitry Andric 
7620b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_FPEXT)
7630b57cec5SDimitry Andric     .legalFor({{S64, S32}, {S32, S16}})
764e8d8bef9SDimitry Andric     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
7650b57cec5SDimitry Andric     .scalarize(0);
7660b57cec5SDimitry Andric 
7670b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_FSUB)
7680b57cec5SDimitry Andric       // Use actual fsub instruction
7690b57cec5SDimitry Andric       .legalFor({S32})
7700b57cec5SDimitry Andric       // Must use fadd + fneg
7710b57cec5SDimitry Andric       .lowerFor({S64, S16, V2S16})
7720b57cec5SDimitry Andric       .scalarize(0)
7730b57cec5SDimitry Andric       .clampScalar(0, S32, S64);
7740b57cec5SDimitry Andric 
7758bcb0991SDimitry Andric   // Whether this is legal depends on the floating point mode for the function.
7768bcb0991SDimitry Andric   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
7775ffd83dbSDimitry Andric   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
7788bcb0991SDimitry Andric     FMad.customFor({S32, S16});
7795ffd83dbSDimitry Andric   else if (ST.hasMadMacF32Insts())
7808bcb0991SDimitry Andric     FMad.customFor({S32});
7815ffd83dbSDimitry Andric   else if (ST.hasMadF16())
7825ffd83dbSDimitry Andric     FMad.customFor({S16});
7838bcb0991SDimitry Andric   FMad.scalarize(0)
7848bcb0991SDimitry Andric       .lower();
7858bcb0991SDimitry Andric 
786e8d8bef9SDimitry Andric   auto &FRem = getActionDefinitionsBuilder(G_FREM);
787e8d8bef9SDimitry Andric   if (ST.has16BitInsts()) {
788e8d8bef9SDimitry Andric     FRem.customFor({S16, S32, S64});
789e8d8bef9SDimitry Andric   } else {
790e8d8bef9SDimitry Andric     FRem.minScalar(0, S32)
791e8d8bef9SDimitry Andric         .customFor({S32, S64});
792e8d8bef9SDimitry Andric   }
793e8d8bef9SDimitry Andric   FRem.scalarize(0);
794e8d8bef9SDimitry Andric 
7955ffd83dbSDimitry Andric   // TODO: Do we need to clamp maximum bitwidth?
7965ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_TRUNC)
7975ffd83dbSDimitry Andric     .legalIf(isScalar(0))
7985ffd83dbSDimitry Andric     .legalFor({{V2S16, V2S32}})
7995ffd83dbSDimitry Andric     .clampMaxNumElements(0, S16, 2)
8005ffd83dbSDimitry Andric     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
8015ffd83dbSDimitry Andric     // situations (like an invalid implicit use), we don't want to infinite loop
8025ffd83dbSDimitry Andric     // in the legalizer.
8035ffd83dbSDimitry Andric     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
8045ffd83dbSDimitry Andric     .alwaysLegal();
8055ffd83dbSDimitry Andric 
8060b57cec5SDimitry Andric   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
8070b57cec5SDimitry Andric     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
8085ffd83dbSDimitry Andric                {S32, S1}, {S64, S1}, {S16, S1}})
809480093f4SDimitry Andric     .scalarize(0)
8105ffd83dbSDimitry Andric     .clampScalar(0, S32, S64)
8115ffd83dbSDimitry Andric     .widenScalarToNextPow2(1, 32);
8120b57cec5SDimitry Andric 
8138bcb0991SDimitry Andric   // TODO: Split s1->s64 during regbankselect for VALU.
8148bcb0991SDimitry Andric   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
815480093f4SDimitry Andric     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
8160b57cec5SDimitry Andric     .lowerFor({{S32, S64}})
817480093f4SDimitry Andric     .lowerIf(typeIs(1, S1))
8188bcb0991SDimitry Andric     .customFor({{S64, S64}});
8198bcb0991SDimitry Andric   if (ST.has16BitInsts())
8208bcb0991SDimitry Andric     IToFP.legalFor({{S16, S16}});
8218bcb0991SDimitry Andric   IToFP.clampScalar(1, S32, S64)
822e8d8bef9SDimitry Andric        .minScalar(0, S32)
8235ffd83dbSDimitry Andric        .scalarize(0)
8245ffd83dbSDimitry Andric        .widenScalarToNextPow2(1);
8250b57cec5SDimitry Andric 
8268bcb0991SDimitry Andric   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
8275ffd83dbSDimitry Andric     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
828*fe6060f1SDimitry Andric     .customFor({{S64, S32}, {S64, S64}})
829e8d8bef9SDimitry Andric     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
8308bcb0991SDimitry Andric   if (ST.has16BitInsts())
8318bcb0991SDimitry Andric     FPToI.legalFor({{S16, S16}});
8328bcb0991SDimitry Andric   else
8338bcb0991SDimitry Andric     FPToI.minScalar(1, S32);
8348bcb0991SDimitry Andric 
8358bcb0991SDimitry Andric   FPToI.minScalar(0, S32)
836*fe6060f1SDimitry Andric        .widenScalarToNextPow2(0, 32)
8375ffd83dbSDimitry Andric        .scalarize(0)
8385ffd83dbSDimitry Andric        .lower();
8390b57cec5SDimitry Andric 
840e8d8bef9SDimitry Andric   // Lower roundeven into G_FRINT
841e8d8bef9SDimitry Andric   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
842480093f4SDimitry Andric     .scalarize(0)
843480093f4SDimitry Andric     .lower();
8440b57cec5SDimitry Andric 
845480093f4SDimitry Andric   if (ST.has16BitInsts()) {
846480093f4SDimitry Andric     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
847480093f4SDimitry Andric       .legalFor({S16, S32, S64})
848480093f4SDimitry Andric       .clampScalar(0, S16, S64)
849480093f4SDimitry Andric       .scalarize(0);
850480093f4SDimitry Andric   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
8510b57cec5SDimitry Andric     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
8520b57cec5SDimitry Andric       .legalFor({S32, S64})
8530b57cec5SDimitry Andric       .clampScalar(0, S32, S64)
8540b57cec5SDimitry Andric       .scalarize(0);
8550b57cec5SDimitry Andric   } else {
8560b57cec5SDimitry Andric     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
8570b57cec5SDimitry Andric       .legalFor({S32})
8580b57cec5SDimitry Andric       .customFor({S64})
8590b57cec5SDimitry Andric       .clampScalar(0, S32, S64)
8600b57cec5SDimitry Andric       .scalarize(0);
8610b57cec5SDimitry Andric   }
8620b57cec5SDimitry Andric 
863480093f4SDimitry Andric   getActionDefinitionsBuilder(G_PTR_ADD)
864e8d8bef9SDimitry Andric     .legalIf(all(isPointer(0), sameSize(0, 1)))
865e8d8bef9SDimitry Andric     .scalarize(0)
866e8d8bef9SDimitry Andric     .scalarSameSizeAs(1, 0);
8670b57cec5SDimitry Andric 
8685ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_PTRMASK)
869e8d8bef9SDimitry Andric     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
870e8d8bef9SDimitry Andric     .scalarSameSizeAs(1, 0)
8715ffd83dbSDimitry Andric     .scalarize(0);
8720b57cec5SDimitry Andric 
8730b57cec5SDimitry Andric   auto &CmpBuilder =
8740b57cec5SDimitry Andric     getActionDefinitionsBuilder(G_ICMP)
875480093f4SDimitry Andric     // The compare output type differs based on the register bank of the output,
876480093f4SDimitry Andric     // so make both s1 and s32 legal.
877480093f4SDimitry Andric     //
878480093f4SDimitry Andric     // Scalar compares producing output in scc will be promoted to s32, as that
879480093f4SDimitry Andric     // is the allocatable register type that will be needed for the copy from
880480093f4SDimitry Andric     // scc. This will be promoted during RegBankSelect, and we assume something
881480093f4SDimitry Andric     // before that won't try to use s32 result types.
882480093f4SDimitry Andric     //
883480093f4SDimitry Andric     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
884480093f4SDimitry Andric     // bank.
8850b57cec5SDimitry Andric     .legalForCartesianProduct(
8860b57cec5SDimitry Andric       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
887480093f4SDimitry Andric     .legalForCartesianProduct(
888480093f4SDimitry Andric       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
8890b57cec5SDimitry Andric   if (ST.has16BitInsts()) {
8900b57cec5SDimitry Andric     CmpBuilder.legalFor({{S1, S16}});
8910b57cec5SDimitry Andric   }
8920b57cec5SDimitry Andric 
8930b57cec5SDimitry Andric   CmpBuilder
8940b57cec5SDimitry Andric     .widenScalarToNextPow2(1)
8950b57cec5SDimitry Andric     .clampScalar(1, S32, S64)
8960b57cec5SDimitry Andric     .scalarize(0)
897480093f4SDimitry Andric     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
8980b57cec5SDimitry Andric 
8990b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_FCMP)
9000b57cec5SDimitry Andric     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
9010b57cec5SDimitry Andric     .widenScalarToNextPow2(1)
9020b57cec5SDimitry Andric     .clampScalar(1, S32, S64)
9030b57cec5SDimitry Andric     .scalarize(0);
9040b57cec5SDimitry Andric 
9055ffd83dbSDimitry Andric   // FIXME: fpow has a selection pattern that should move to custom lowering.
9065ffd83dbSDimitry Andric   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
9075ffd83dbSDimitry Andric   if (ST.has16BitInsts())
9085ffd83dbSDimitry Andric     Exp2Ops.legalFor({S32, S16});
9095ffd83dbSDimitry Andric   else
9105ffd83dbSDimitry Andric     Exp2Ops.legalFor({S32});
9115ffd83dbSDimitry Andric   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
9125ffd83dbSDimitry Andric   Exp2Ops.scalarize(0);
9135ffd83dbSDimitry Andric 
9145ffd83dbSDimitry Andric   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
9155ffd83dbSDimitry Andric   if (ST.has16BitInsts())
9165ffd83dbSDimitry Andric     ExpOps.customFor({{S32}, {S16}});
9175ffd83dbSDimitry Andric   else
9185ffd83dbSDimitry Andric     ExpOps.customFor({S32});
9195ffd83dbSDimitry Andric   ExpOps.clampScalar(0, MinScalarFPTy, S32)
9200b57cec5SDimitry Andric         .scalarize(0);
9210b57cec5SDimitry Andric 
922e8d8bef9SDimitry Andric   getActionDefinitionsBuilder(G_FPOWI)
923e8d8bef9SDimitry Andric     .clampScalar(0, MinScalarFPTy, S32)
924e8d8bef9SDimitry Andric     .lower();
925e8d8bef9SDimitry Andric 
9260b57cec5SDimitry Andric   // The 64-bit versions produce 32-bit results, but only on the SALU.
9275ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_CTPOP)
9280b57cec5SDimitry Andric     .legalFor({{S32, S32}, {S32, S64}})
9290b57cec5SDimitry Andric     .clampScalar(0, S32, S32)
9300b57cec5SDimitry Andric     .clampScalar(1, S32, S64)
9310b57cec5SDimitry Andric     .scalarize(0)
9320b57cec5SDimitry Andric     .widenScalarToNextPow2(0, 32)
9330b57cec5SDimitry Andric     .widenScalarToNextPow2(1, 32);
9340b57cec5SDimitry Andric 
9355ffd83dbSDimitry Andric   // The hardware instructions return a different result on 0 than the generic
9365ffd83dbSDimitry Andric   // instructions expect. The hardware produces -1, but these produce the
9375ffd83dbSDimitry Andric   // bitwidth.
9385ffd83dbSDimitry Andric   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
9395ffd83dbSDimitry Andric     .scalarize(0)
9405ffd83dbSDimitry Andric     .clampScalar(0, S32, S32)
9415ffd83dbSDimitry Andric     .clampScalar(1, S32, S64)
9425ffd83dbSDimitry Andric     .widenScalarToNextPow2(0, 32)
9435ffd83dbSDimitry Andric     .widenScalarToNextPow2(1, 32)
9445ffd83dbSDimitry Andric     .lower();
9455ffd83dbSDimitry Andric 
9465ffd83dbSDimitry Andric   // The 64-bit versions produce 32-bit results, but only on the SALU.
9475ffd83dbSDimitry Andric   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
9485ffd83dbSDimitry Andric     .legalFor({{S32, S32}, {S32, S64}})
9495ffd83dbSDimitry Andric     .clampScalar(0, S32, S32)
9505ffd83dbSDimitry Andric     .clampScalar(1, S32, S64)
9515ffd83dbSDimitry Andric     .scalarize(0)
9525ffd83dbSDimitry Andric     .widenScalarToNextPow2(0, 32)
9535ffd83dbSDimitry Andric     .widenScalarToNextPow2(1, 32);
9545ffd83dbSDimitry Andric 
955*fe6060f1SDimitry Andric   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
956*fe6060f1SDimitry Andric   // RegBankSelect.
9575ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_BITREVERSE)
958*fe6060f1SDimitry Andric     .legalFor({S32, S64})
959*fe6060f1SDimitry Andric     .clampScalar(0, S32, S64)
960*fe6060f1SDimitry Andric     .scalarize(0)
961*fe6060f1SDimitry Andric     .widenScalarToNextPow2(0);
9620b57cec5SDimitry Andric 
9630b57cec5SDimitry Andric   if (ST.has16BitInsts()) {
9645ffd83dbSDimitry Andric     getActionDefinitionsBuilder(G_BSWAP)
9655ffd83dbSDimitry Andric       .legalFor({S16, S32, V2S16})
9665ffd83dbSDimitry Andric       .clampMaxNumElements(0, S16, 2)
9675ffd83dbSDimitry Andric       // FIXME: Fixing non-power-of-2 before clamp is workaround for
9685ffd83dbSDimitry Andric       // narrowScalar limitation.
9695ffd83dbSDimitry Andric       .widenScalarToNextPow2(0)
9705ffd83dbSDimitry Andric       .clampScalar(0, S16, S32)
9715ffd83dbSDimitry Andric       .scalarize(0);
9725ffd83dbSDimitry Andric 
9730b57cec5SDimitry Andric     if (ST.hasVOP3PInsts()) {
974*fe6060f1SDimitry Andric       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
9750b57cec5SDimitry Andric         .legalFor({S32, S16, V2S16})
9760b57cec5SDimitry Andric         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
9770b57cec5SDimitry Andric         .clampMaxNumElements(0, S16, 2)
9785ffd83dbSDimitry Andric         .minScalar(0, S16)
9790b57cec5SDimitry Andric         .widenScalarToNextPow2(0)
9805ffd83dbSDimitry Andric         .scalarize(0)
9815ffd83dbSDimitry Andric         .lower();
9820b57cec5SDimitry Andric     } else {
983*fe6060f1SDimitry Andric       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
9840b57cec5SDimitry Andric         .legalFor({S32, S16})
9850b57cec5SDimitry Andric         .widenScalarToNextPow2(0)
9865ffd83dbSDimitry Andric         .minScalar(0, S16)
9875ffd83dbSDimitry Andric         .scalarize(0)
9885ffd83dbSDimitry Andric         .lower();
9890b57cec5SDimitry Andric     }
9900b57cec5SDimitry Andric   } else {
9915ffd83dbSDimitry Andric     // TODO: Should have same legality without v_perm_b32
9925ffd83dbSDimitry Andric     getActionDefinitionsBuilder(G_BSWAP)
9935ffd83dbSDimitry Andric       .legalFor({S32})
9945ffd83dbSDimitry Andric       .lowerIf(scalarNarrowerThan(0, 32))
9955ffd83dbSDimitry Andric       // FIXME: Fixing non-power-of-2 before clamp is workaround for
9965ffd83dbSDimitry Andric       // narrowScalar limitation.
9975ffd83dbSDimitry Andric       .widenScalarToNextPow2(0)
9985ffd83dbSDimitry Andric       .maxScalar(0, S32)
9995ffd83dbSDimitry Andric       .scalarize(0)
10005ffd83dbSDimitry Andric       .lower();
10015ffd83dbSDimitry Andric 
1002*fe6060f1SDimitry Andric     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
10030b57cec5SDimitry Andric       .legalFor({S32})
10045ffd83dbSDimitry Andric       .minScalar(0, S32)
10050b57cec5SDimitry Andric       .widenScalarToNextPow2(0)
10065ffd83dbSDimitry Andric       .scalarize(0)
10075ffd83dbSDimitry Andric       .lower();
10080b57cec5SDimitry Andric   }
10090b57cec5SDimitry Andric 
10100b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_INTTOPTR)
10110b57cec5SDimitry Andric     // List the common cases
10120b57cec5SDimitry Andric     .legalForCartesianProduct(AddrSpaces64, {S64})
10130b57cec5SDimitry Andric     .legalForCartesianProduct(AddrSpaces32, {S32})
10140b57cec5SDimitry Andric     .scalarize(0)
10150b57cec5SDimitry Andric     // Accept any address space as long as the size matches
10160b57cec5SDimitry Andric     .legalIf(sameSize(0, 1))
10170b57cec5SDimitry Andric     .widenScalarIf(smallerThan(1, 0),
10180b57cec5SDimitry Andric       [](const LegalityQuery &Query) {
10190b57cec5SDimitry Andric         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
10200b57cec5SDimitry Andric       })
10215ffd83dbSDimitry Andric     .narrowScalarIf(largerThan(1, 0),
10220b57cec5SDimitry Andric       [](const LegalityQuery &Query) {
10230b57cec5SDimitry Andric         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
10240b57cec5SDimitry Andric       });
10250b57cec5SDimitry Andric 
10260b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_PTRTOINT)
10270b57cec5SDimitry Andric     // List the common cases
10280b57cec5SDimitry Andric     .legalForCartesianProduct(AddrSpaces64, {S64})
10290b57cec5SDimitry Andric     .legalForCartesianProduct(AddrSpaces32, {S32})
10300b57cec5SDimitry Andric     .scalarize(0)
10310b57cec5SDimitry Andric     // Accept any address space as long as the size matches
10320b57cec5SDimitry Andric     .legalIf(sameSize(0, 1))
10330b57cec5SDimitry Andric     .widenScalarIf(smallerThan(0, 1),
10340b57cec5SDimitry Andric       [](const LegalityQuery &Query) {
10350b57cec5SDimitry Andric         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
10360b57cec5SDimitry Andric       })
10370b57cec5SDimitry Andric     .narrowScalarIf(
10385ffd83dbSDimitry Andric       largerThan(0, 1),
10390b57cec5SDimitry Andric       [](const LegalityQuery &Query) {
10400b57cec5SDimitry Andric         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
10410b57cec5SDimitry Andric       });
10420b57cec5SDimitry Andric 
10430b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
10440b57cec5SDimitry Andric     .scalarize(0)
10450b57cec5SDimitry Andric     .custom();
10460b57cec5SDimitry Andric 
  // Predicate shared by the G_LOAD/G_STORE rules below: returns true when the
  // memory access must be broken into smaller pieces (narrowed scalar or
  // fewer vector elements) before it can be selected.
  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
    unsigned AlignBits = Query.MMODescrs[0].AlignInBits;

    // For extending accesses, treat the effective memory size as at least the
    // alignment — presumably so a sufficiently aligned extload can be widened
    // instead of split (TODO confirm intent).
    if (MemSize < DstTy.getSizeInBits())
      MemSize = std::max(MemSize, AlignBits);

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    // Too wide for a single access in this address space.
    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      // 96-bit (dwordx3) accesses only exist on some subtargets.
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    // Under-aligned access: split unless the target explicitly allows a
    // misaligned access of this size in this address space.
    if (AlignBits < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                      Align(AlignBits / 8));
    }

    return false;
  };
10868bcb0991SDimitry Andric 
  // Minimum alignment (in bits) required to make a global/constant access of
  // the given width legal; 0 (no requirement) when the subtarget supports
  // unaligned buffer accesses.
  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
10908bcb0991SDimitry Andric 
10918bcb0991SDimitry Andric   // TODO: Refine based on subtargets which support unaligned access or 128-bit
10928bcb0991SDimitry Andric   // LDS
10938bcb0991SDimitry Andric   // TODO: Unsupported flat for SI.
10948bcb0991SDimitry Andric 
10958bcb0991SDimitry Andric   for (unsigned Op : {G_LOAD, G_STORE}) {
10968bcb0991SDimitry Andric     const bool IsStore = Op == G_STORE;
10978bcb0991SDimitry Andric 
10988bcb0991SDimitry Andric     auto &Actions = getActionDefinitionsBuilder(Op);
10995ffd83dbSDimitry Andric     // Explicitly list some common cases.
11005ffd83dbSDimitry Andric     // TODO: Does this help compile time at all?
1101*fe6060f1SDimitry Andric     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1102*fe6060f1SDimitry Andric                                       {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1103*fe6060f1SDimitry Andric                                       {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1104*fe6060f1SDimitry Andric                                       {S64, GlobalPtr, S64, GlobalAlign32},
1105*fe6060f1SDimitry Andric                                       {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1106*fe6060f1SDimitry Andric                                       {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1107*fe6060f1SDimitry Andric                                       {S32, GlobalPtr, S8, GlobalAlign8},
1108*fe6060f1SDimitry Andric                                       {S32, GlobalPtr, S16, GlobalAlign16},
11098bcb0991SDimitry Andric 
1110*fe6060f1SDimitry Andric                                       {S32, LocalPtr, S32, 32},
1111*fe6060f1SDimitry Andric                                       {S64, LocalPtr, S64, 32},
1112*fe6060f1SDimitry Andric                                       {V2S32, LocalPtr, V2S32, 32},
1113*fe6060f1SDimitry Andric                                       {S32, LocalPtr, S8, 8},
1114*fe6060f1SDimitry Andric                                       {S32, LocalPtr, S16, 16},
1115*fe6060f1SDimitry Andric                                       {V2S16, LocalPtr, S32, 32},
11168bcb0991SDimitry Andric 
1117*fe6060f1SDimitry Andric                                       {S32, PrivatePtr, S32, 32},
1118*fe6060f1SDimitry Andric                                       {S32, PrivatePtr, S8, 8},
1119*fe6060f1SDimitry Andric                                       {S32, PrivatePtr, S16, 16},
1120*fe6060f1SDimitry Andric                                       {V2S16, PrivatePtr, S32, 32},
11218bcb0991SDimitry Andric 
1122*fe6060f1SDimitry Andric                                       {S32, ConstantPtr, S32, GlobalAlign32},
1123*fe6060f1SDimitry Andric                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1124*fe6060f1SDimitry Andric                                       {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1125*fe6060f1SDimitry Andric                                       {S64, ConstantPtr, S64, GlobalAlign32},
1126*fe6060f1SDimitry Andric                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
11275ffd83dbSDimitry Andric     Actions.legalIf(
11285ffd83dbSDimitry Andric       [=](const LegalityQuery &Query) -> bool {
1129*fe6060f1SDimitry Andric         return isLoadStoreLegal(ST, Query);
11305ffd83dbSDimitry Andric       });
11315ffd83dbSDimitry Andric 
    // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64-bits.
    //
    // TODO: Should generalize bitcast action into coerce, which will also cover
    // inserting addrspacecasts.
    Actions.customIf(typeIs(1, Constant32Ptr));

    // Turn any illegal element vectors into something easier to deal
    // with. These will ultimately produce 32-bit scalar shifts to extract the
    // parts anyway.
    //
    // For odd 16-bit element vectors, prefer to split those into pieces with
    // 16-bit vector parts.
    Actions.bitcastIf(
      [=](const LegalityQuery &Query) -> bool {
        return shouldBitcastLoadStoreType(ST, Query.Types[0],
                                          Query.MMODescrs[0].MemoryTy);
      }, bitcastToRegisterType(0));

    // Loads only: stores cannot write extra bytes, so widening is unsound
    // for them.
    if (!IsStore) {
      // Widen suitably aligned loads by loading extra bytes. The standard
      // legalization actions can't properly express widening memory operands.
      Actions.customIf([=](const LegalityQuery &Query) -> bool {
        return shouldWidenLoad(ST, Query, G_LOAD);
      });
    }
1158e8d8bef9SDimitry Andric 
    // FIXME: load/store narrowing should be moved to lower action
    //
    // Splitting rules: scalars that are too large/misaligned are narrowed;
    // vectors in the same situation get fewer elements. Both reuse the
    // needToSplitMemOp predicate defined above.
    Actions
        .narrowScalarIf(
            [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              // Pick the scalar type to narrow type index 0 to.
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              const unsigned DstSize = DstTy.getSizeInBits();
              unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

              // Split extloads.
              if (DstSize > MemSize)
                return std::make_pair(0, LLT::scalar(MemSize));

              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to split
                // to the widest type. TODO: Account for alignment. As-is it
                // should be OK, since the new parts will be further legalized.
                unsigned FloorSize = PowerOf2Floor(DstSize);
                return std::make_pair(0, LLT::scalar(FloorSize));
              }

              if (DstSize > 32 && (DstSize % 32 != 0)) {
                // FIXME: Need a way to specify non-extload of larger size if
                // suitably aligned.
                return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
              }

              // Clamp to the widest access the address space supports.
              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);
              if (MemSize > MaxSize)
                return std::make_pair(0, LLT::scalar(MaxSize));

              // Otherwise the split was forced by alignment: narrow down to
              // the aligned width.
              unsigned Align = Query.MMODescrs[0].AlignInBits;
              return std::make_pair(0, LLT::scalar(Align));
            })
        .fewerElementsIf(
            [=](const LegalityQuery &Query) -> bool {
              return Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              // Pick the reduced vector (or scalar) type for type index 0.
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              LLT EltTy = DstTy.getElementType();
              unsigned MaxSize = maxSizeForAddrSpace(ST,
                                                     PtrTy.getAddressSpace(),
                                                     Op == G_LOAD);

              // FIXME: Handle widened to power of 2 results better. This ends
              // up scalarizing.
              // FIXME: 3 element stores scalarized on SI

              // Split if it's too large for the address space.
              unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
              if (MemSize > MaxSize) {
                unsigned NumElts = DstTy.getNumElements();
                unsigned EltSize = EltTy.getSizeInBits();

                // Prefer the largest whole-element piece that fits.
                if (MaxSize % EltSize == 0) {
                  return std::make_pair(
                      0, LLT::scalarOrVector(
                             ElementCount::getFixed(MaxSize / EltSize), EltTy));
                }

                unsigned NumPieces = MemSize / MaxSize;

                // FIXME: Refine when odd breakdowns handled
                // The scalars will need to be re-legalized.
                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::make_pair(0, EltTy);

                return std::make_pair(
                    0, LLT::fixed_vector(NumElts / NumPieces, EltTy));
              }

              // FIXME: We could probably handle weird extending loads better.
              if (DstTy.getSizeInBits() > MemSize)
                return std::make_pair(0, EltTy);

              unsigned EltSize = EltTy.getSizeInBits();
              unsigned DstSize = DstTy.getSizeInBits();
              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to split
                // to the widest type. TODO: Account for alignment. As-is it
                // should be OK, since the new parts will be further legalized.
                unsigned FloorSize = PowerOf2Floor(DstSize);
                return std::make_pair(
                    0, LLT::scalarOrVector(
                           ElementCount::getFixed(FloorSize / EltSize), EltTy));
              }

              // Need to split because of alignment.
              unsigned Align = Query.MMODescrs[0].AlignInBits;
              if (EltSize > Align &&
                  (EltSize / Align < DstTy.getNumElements())) {
                return std::make_pair(
                    0, LLT::fixed_vector(EltSize / Align, EltTy));
              }

              // May need relegalization for the scalars.
              return std::make_pair(0, EltTy);
            })
    .lowerIfMemSizeNotPow2()
    .minScalar(0, S32)
    .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
    .widenScalarToNextPow2(0)
    .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
    .lower();
  }
12760b57cec5SDimitry Andric 
  // FIXME: Unaligned accesses not lowered.
  // Extending loads (G_SEXTLOAD/G_ZEXTLOAD): only 8- and 16-bit memory types
  // extended to s32 are natively legal, per address space.
  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
                                                  {S32, GlobalPtr, S16, 2 * 8},
                                                  {S32, LocalPtr, S8, 8},
                                                  {S32, LocalPtr, S16, 16},
                                                  {S32, PrivatePtr, S8, 8},
                                                  {S32, PrivatePtr, S16, 16},
                                                  {S32, ConstantPtr, S8, 8},
                                                  {S32, ConstantPtr, S16, 2 * 8}})
                       .legalIf(
                         [=](const LegalityQuery &Query) -> bool {
                           return isLoadStoreLegal(ST, Query);
                         });

  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
  }

  // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
  // 64-bits.
  //
  // TODO: Should generalize bitcast action into coerce, which will also cover
  // inserting addrspacecasts.
  ExtLoads.customIf(typeIs(1, Constant32Ptr));

  // Everything else is clamped to an s32 result and lowered.
  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .lower();
13070b57cec5SDimitry Andric 
  // Integer read-modify-write atomics: legal at s32 and s64 on global, local
  // and region pointers; flat additionally when the subtarget has a flat
  // address space.
  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  // FP add atomics are gated per subtarget feature: LDS/region f32 (plus LDS
  // f64 on gfx90a), and global f32 where the buffer fadd instructions exist.
  auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
  if (ST.hasLDSFPAtomics()) {
    Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
    if (ST.hasGFX90AInsts())
      Atomic.legalFor({{S64, LocalPtr}});
  }
  if (ST.hasAtomicFaddInsts())
    Atomic.legalFor({{S32, GlobalPtr}});
13288bcb0991SDimitry Andric 
  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
  // demarshalling
  // Hence global/flat go through the custom hook, while LDS/region compare
  // and swap is directly legal.
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
13360b57cec5SDimitry Andric   // TODO: Pointer types, any 32-bit or 64-bit vector
1337480093f4SDimitry Andric 
1338480093f4SDimitry Andric   // Condition should be s32 for scalar, s1 for vector.
  // G_SELECT: value (type index 0) may be scalar, short vector, or pointer;
  // the condition (type index 1) must be s1 or s32. The condition is
  // scalarized, so vector selects use a scalar condition per element.
  getActionDefinitionsBuilder(G_SELECT)
      .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
                                 LocalPtr, FlatPtr, PrivatePtr,
                                 LLT::fixed_vector(2, LocalPtr),
                                 LLT::fixed_vector(2, PrivatePtr)},
                                {S1, S32})
      .clampScalar(0, S16, S64)
      .scalarize(1)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .fewerElementsIf(numElementsNotEven(0), scalarize(0))
      .clampMaxNumElements(0, S32, 2)
      .clampMaxNumElements(0, LocalPtr, 2)
      .clampMaxNumElements(0, PrivatePtr, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0)
      // Selects of any pointer type are legal with an s1/s32 condition.
      .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
13550b57cec5SDimitry Andric 
  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
    .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    // 16-bit (and, with VOP3P, packed v2s16) shifts are natively available.
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S16, S64);
    Shifts.widenScalarToNextPow2(0, 16);

    // Saturating shifts have no native form; expand after clamping to the
    // smallest natively shiftable width.
    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.clampScalar(0, S32, S64);
    Shifts.widenScalarToNextPow2(0, 32);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }
  Shifts.scalarize(0);
14000b57cec5SDimitry Andric 
  // G_EXTRACT_VECTOR_ELT / G_INSERT_VECTOR_ELT share one rule set; only the
  // positions of the vector and element type indices differ between the two
  // opcodes.
  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      // 32/64-bit elements in a register-sized vector with an s32 index go to
      // the custom handler.
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          const unsigned EltSize = EltTy.getSizeInBits();
          return (EltSize == 32 || EltSize == 64) &&
                  VecTy.getSizeInBits() % 32 == 0 &&
                  VecTy.getSizeInBits() <= MaxRegisterSize &&
                  IdxTy.getSizeInBits() == 32;
        })
      // Sub-32-bit elements: bitcast to a vector of 32-bit elements.
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
                 bitcastToVectorElement32(VecTypeIdx))
      //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
      .bitcastIf(
        all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
        [=](const LegalityQuery &Query) {
          // For > 64-bit element types, try to turn this into a 64-bit
          // element vector since we may be able to do better indexing
          // if this is scalar. If not, fall back to 32.
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned DstEltSize = EltTy.getSizeInBits();
          const unsigned VecSize = VecTy.getSizeInBits();

          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
          return std::make_pair(
              VecTypeIdx,
              LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
        })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32)
      .clampMaxNumElements(VecTypeIdx, S32, 32)
      // TODO: Clamp elements for 64-bit vectors?
      // It should only be necessary with variable indexes.
      // As a last resort, lower to the stack
      .lower();
  }
14450b57cec5SDimitry Andric 
  // An extract whose result type differs from the vector's element type is
  // never supported.
  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });
14510b57cec5SDimitry Andric 
  // G_EXTRACT / G_INSERT share one rule set; the "big" (container) and "lit"
  // (piece) type indices are swapped between the two opcodes.
  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      // Widen both type indices to at least 16-bit (power-of-2) elements.
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);

  }
14820b57cec5SDimitry Andric 
  // G_BUILD_VECTOR: vectors of s32/s64 built from matching scalars are legal;
  // everything else is coerced toward a register type below.
  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
      .minScalar(1, S32);

    // With pack instructions, building v2s16 from s32 sources is legal.
    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
    BuildVector.minScalarOrElt(0, S32);
  } else {
    // No pack instructions: v2s16 build goes through the custom handler.
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));
15115ffd83dbSDimitry Andric 
15125ffd83dbSDimitry Andric   // FIXME: Clamp maximum size
  // Concatenation is legal whenever both source and result are register
  // types; element counts are clamped to register-sized limits.
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .clampMaxNumElements(0, S32, 32)
    .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
    .clampMaxNumElements(0, S16, 64);
15180b57cec5SDimitry Andric 
15195ffd83dbSDimitry Andric   // TODO: Don't fully scalarize v2s16 pieces? Or combine out thosse
15205ffd83dbSDimitry Andric   // pre-legalize.
15215ffd83dbSDimitry Andric   if (ST.hasVOP3PInsts()) {
15225ffd83dbSDimitry Andric     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
15235ffd83dbSDimitry Andric       .customFor({V2S16, V2S16})
15245ffd83dbSDimitry Andric       .lower();
15255ffd83dbSDimitry Andric   } else
15268bcb0991SDimitry Andric     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
15278bcb0991SDimitry Andric 
15280b57cec5SDimitry Andric   // Merge/Unmerge
15290b57cec5SDimitry Andric   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
15300b57cec5SDimitry Andric     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
15310b57cec5SDimitry Andric     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
15320b57cec5SDimitry Andric 
15330b57cec5SDimitry Andric     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
15345ffd83dbSDimitry Andric       const LLT Ty = Query.Types[TypeIdx];
15350b57cec5SDimitry Andric       if (Ty.isVector()) {
15360b57cec5SDimitry Andric         const LLT &EltTy = Ty.getElementType();
15375ffd83dbSDimitry Andric         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
15380b57cec5SDimitry Andric           return true;
15390b57cec5SDimitry Andric         if (!isPowerOf2_32(EltTy.getSizeInBits()))
15400b57cec5SDimitry Andric           return true;
15410b57cec5SDimitry Andric       }
15420b57cec5SDimitry Andric       return false;
15430b57cec5SDimitry Andric     };
15440b57cec5SDimitry Andric 
15458bcb0991SDimitry Andric     auto &Builder = getActionDefinitionsBuilder(Op)
1546e8d8bef9SDimitry Andric       .legalIf(all(isRegisterType(0), isRegisterType(1)))
15475ffd83dbSDimitry Andric       .lowerFor({{S16, V2S16}})
15485ffd83dbSDimitry Andric       .lowerIf([=](const LegalityQuery &Query) {
15495ffd83dbSDimitry Andric           const LLT BigTy = Query.Types[BigTyIdx];
15505ffd83dbSDimitry Andric           return BigTy.getSizeInBits() == 32;
15515ffd83dbSDimitry Andric         })
15525ffd83dbSDimitry Andric       // Try to widen to s16 first for small types.
15535ffd83dbSDimitry Andric       // TODO: Only do this on targets with legal s16 shifts
15545ffd83dbSDimitry Andric       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
15550b57cec5SDimitry Andric       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
15568bcb0991SDimitry Andric       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
15578bcb0991SDimitry Andric       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
15588bcb0991SDimitry Andric                            elementTypeIs(1, S16)),
15598bcb0991SDimitry Andric                        changeTo(1, V2S16))
15605ffd83dbSDimitry Andric       // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
15615ffd83dbSDimitry Andric       // worth considering the multiples of 64 since 2*192 and 2*384 are not
15625ffd83dbSDimitry Andric       // valid.
15635ffd83dbSDimitry Andric       .clampScalar(LitTyIdx, S32, S512)
15645ffd83dbSDimitry Andric       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
15650b57cec5SDimitry Andric       // Break up vectors with weird elements into scalars
15660b57cec5SDimitry Andric       .fewerElementsIf(
15675ffd83dbSDimitry Andric         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
15680b57cec5SDimitry Andric         scalarize(0))
15690b57cec5SDimitry Andric       .fewerElementsIf(
15705ffd83dbSDimitry Andric         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
15710b57cec5SDimitry Andric         scalarize(1))
15725ffd83dbSDimitry Andric       .clampScalar(BigTyIdx, S32, MaxScalar);
15738bcb0991SDimitry Andric 
15748bcb0991SDimitry Andric     if (Op == G_MERGE_VALUES) {
15758bcb0991SDimitry Andric       Builder.widenScalarIf(
15768bcb0991SDimitry Andric         // TODO: Use 16-bit shifts if legal for 8-bit values?
15770b57cec5SDimitry Andric         [=](const LegalityQuery &Query) {
15788bcb0991SDimitry Andric           const LLT Ty = Query.Types[LitTyIdx];
15798bcb0991SDimitry Andric           return Ty.getSizeInBits() < 32;
15808bcb0991SDimitry Andric         },
15818bcb0991SDimitry Andric         changeTo(LitTyIdx, S32));
15828bcb0991SDimitry Andric     }
15838bcb0991SDimitry Andric 
15848bcb0991SDimitry Andric     Builder.widenScalarIf(
15858bcb0991SDimitry Andric       [=](const LegalityQuery &Query) {
15868bcb0991SDimitry Andric         const LLT Ty = Query.Types[BigTyIdx];
15870b57cec5SDimitry Andric         return !isPowerOf2_32(Ty.getSizeInBits()) &&
15880b57cec5SDimitry Andric           Ty.getSizeInBits() % 16 != 0;
15890b57cec5SDimitry Andric       },
15900b57cec5SDimitry Andric       [=](const LegalityQuery &Query) {
15910b57cec5SDimitry Andric         // Pick the next power of 2, or a multiple of 64 over 128.
15920b57cec5SDimitry Andric         // Whichever is smaller.
15930b57cec5SDimitry Andric         const LLT &Ty = Query.Types[BigTyIdx];
15940b57cec5SDimitry Andric         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
15950b57cec5SDimitry Andric         if (NewSizeInBits >= 256) {
15960b57cec5SDimitry Andric           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
15970b57cec5SDimitry Andric           if (RoundedTo < NewSizeInBits)
15980b57cec5SDimitry Andric             NewSizeInBits = RoundedTo;
15990b57cec5SDimitry Andric         }
16000b57cec5SDimitry Andric         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
16010b57cec5SDimitry Andric       })
16020b57cec5SDimitry Andric       // Any vectors left are the wrong size. Scalarize them.
16030b57cec5SDimitry Andric       .scalarize(0)
16040b57cec5SDimitry Andric       .scalarize(1);
16050b57cec5SDimitry Andric   }
16060b57cec5SDimitry Andric 
16075ffd83dbSDimitry Andric   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
16085ffd83dbSDimitry Andric   // RegBankSelect.
16095ffd83dbSDimitry Andric   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
16105ffd83dbSDimitry Andric     .legalFor({{S32}, {S64}});
16118bcb0991SDimitry Andric 
16125ffd83dbSDimitry Andric   if (ST.hasVOP3PInsts()) {
16135ffd83dbSDimitry Andric     SextInReg.lowerFor({{V2S16}})
16145ffd83dbSDimitry Andric       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
16155ffd83dbSDimitry Andric       // get more vector shift opportunities, since we'll get those when
16165ffd83dbSDimitry Andric       // expanded.
16175ffd83dbSDimitry Andric       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
16185ffd83dbSDimitry Andric   } else if (ST.has16BitInsts()) {
16195ffd83dbSDimitry Andric     SextInReg.lowerFor({{S32}, {S64}, {S16}});
16205ffd83dbSDimitry Andric   } else {
16215ffd83dbSDimitry Andric     // Prefer to promote to s32 before lowering if we don't have 16-bit
16225ffd83dbSDimitry Andric     // shifts. This avoid a lot of intermediate truncate and extend operations.
16235ffd83dbSDimitry Andric     SextInReg.lowerFor({{S32}, {S64}});
16245ffd83dbSDimitry Andric   }
16255ffd83dbSDimitry Andric 
16265ffd83dbSDimitry Andric   SextInReg
16275ffd83dbSDimitry Andric     .scalarize(0)
16285ffd83dbSDimitry Andric     .clampScalar(0, S32, S64)
16295ffd83dbSDimitry Andric     .lower();
16305ffd83dbSDimitry Andric 
1631*fe6060f1SDimitry Andric   // TODO: Only Try to form v2s16 with legal packed instructions.
16325ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_FSHR)
16335ffd83dbSDimitry Andric     .legalFor({{S32, S32}})
1634*fe6060f1SDimitry Andric     .lowerFor({{V2S16, V2S16}})
1635*fe6060f1SDimitry Andric     .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
16365ffd83dbSDimitry Andric     .scalarize(0)
16375ffd83dbSDimitry Andric     .lower();
1638480093f4SDimitry Andric 
1639*fe6060f1SDimitry Andric   if (ST.hasVOP3PInsts()) {
1640*fe6060f1SDimitry Andric     getActionDefinitionsBuilder(G_FSHL)
1641*fe6060f1SDimitry Andric       .lowerFor({{V2S16, V2S16}})
1642*fe6060f1SDimitry Andric       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16))
1643*fe6060f1SDimitry Andric       .scalarize(0)
1644*fe6060f1SDimitry Andric       .lower();
1645*fe6060f1SDimitry Andric   } else {
1646*fe6060f1SDimitry Andric     getActionDefinitionsBuilder(G_FSHL)
1647*fe6060f1SDimitry Andric       .scalarize(0)
1648*fe6060f1SDimitry Andric       .lower();
1649*fe6060f1SDimitry Andric   }
1650*fe6060f1SDimitry Andric 
1651480093f4SDimitry Andric   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1652480093f4SDimitry Andric     .legalFor({S64});
1653480093f4SDimitry Andric 
1654e8d8bef9SDimitry Andric   getActionDefinitionsBuilder(G_FENCE)
1655e8d8bef9SDimitry Andric     .alwaysLegal();
1656e8d8bef9SDimitry Andric 
1657*fe6060f1SDimitry Andric   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1658*fe6060f1SDimitry Andric       .scalarize(0)
1659*fe6060f1SDimitry Andric       .minScalar(0, S32)
1660*fe6060f1SDimitry Andric       .lower();
1661*fe6060f1SDimitry Andric 
1662*fe6060f1SDimitry Andric   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1663*fe6060f1SDimitry Andric       .legalFor({{S32, S32}, {S64, S32}})
1664*fe6060f1SDimitry Andric       .clampScalar(1, S32, S32)
1665*fe6060f1SDimitry Andric       .clampScalar(0, S32, S64)
1666*fe6060f1SDimitry Andric       .widenScalarToNextPow2(0)
1667*fe6060f1SDimitry Andric       .scalarize(0);
1668*fe6060f1SDimitry Andric 
16695ffd83dbSDimitry Andric   getActionDefinitionsBuilder({
16705ffd83dbSDimitry Andric       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
16715ffd83dbSDimitry Andric       G_FCOPYSIGN,
16725ffd83dbSDimitry Andric 
16735ffd83dbSDimitry Andric       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1674e8d8bef9SDimitry Andric       G_ATOMICRMW_NAND,
1675e8d8bef9SDimitry Andric       G_ATOMICRMW_FSUB,
16765ffd83dbSDimitry Andric       G_READ_REGISTER,
16775ffd83dbSDimitry Andric       G_WRITE_REGISTER,
16785ffd83dbSDimitry Andric 
16795ffd83dbSDimitry Andric       G_SADDO, G_SSUBO,
16805ffd83dbSDimitry Andric 
16815ffd83dbSDimitry Andric        // TODO: Implement
1682*fe6060f1SDimitry Andric       G_FMINIMUM, G_FMAXIMUM}).lower();
16835ffd83dbSDimitry Andric 
1684480093f4SDimitry Andric   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
16855ffd83dbSDimitry Andric         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1686480093f4SDimitry Andric         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1687480093f4SDimitry Andric     .unsupported();
1688480093f4SDimitry Andric 
1689*fe6060f1SDimitry Andric   getLegacyLegalizerInfo().computeTables();
16900b57cec5SDimitry Andric   verify(*ST.getInstrInfo());
16910b57cec5SDimitry Andric }
16920b57cec5SDimitry Andric 
/// Entry point for operations this target marked as custom-legalized.
/// Dispatches on the opcode of \p MI to the matching legalize* helper.
///
/// \returns true if the instruction was legalized (the helper is responsible
/// for erasing or mutating \p MI itself), false for unhandled opcodes.
bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                         MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_FREM:
    return legalizeFrem(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);   // Signed = true
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);  // Signed = false
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);   // Signed = true
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);  // Signed = false
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(Helper, MI);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  // All load flavors share one handler.
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_ZEXTLOAD:
    return legalizeLoad(Helper, MI);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_UDIVREM:
    return legalizeUnsignedDIV_REM(MI, MRI, B);
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_SDIVREM:
    return legalizeSignedDIV_REM(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  // log/log10 are expanded as log2 scaled by a constant: log(x) = log2(x) *
  // ln(2) (respectively ln(2)/ln(10)).
  case TargetOpcode::G_FLOG:
    return legalizeFlog(MI, B, numbers::ln2f);
  case TargetOpcode::G_FLOG10:
    return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
  case TargetOpcode::G_FEXP:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FPOW:
    return legalizeFPow(MI, B);
  case TargetOpcode::G_FFLOOR:
    return legalizeFFloor(MI, MRI, B);
  case TargetOpcode::G_BUILD_VECTOR:
    return legalizeBuildVector(MI, MRI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}
17690b57cec5SDimitry Andric 
/// Return a 32-bit virtual register holding the aperture (the high word used
/// to form a 64-bit flat address) for the given segment address space, which
/// must be LOCAL_ADDRESS or PRIVATE_ADDRESS.
///
/// \returns an invalid Register if the queue pointer input could not be
/// loaded on subtargets without aperture registers.
Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // Fast path: read the aperture base directly from a hardware register.
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    // Pack the hwreg id, bit offset, and field width into the S_GETREG_B32
    // immediate encoding.
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    // S_GETREG_B32 is a target instruction; give its def a type so generic
    // instructions can use it.
    MRI.setType(GetReg, S32);

    // The hwreg field holds the upper bits of the aperture base; shift them
    // back into position to form the 32-bit aperture value.
    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
  }

  // Slow path: load the aperture from the queue descriptor in constant
  // memory, reached through the queue pointer kernel input.
  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // TODO: can we be smarter about machine pointer info?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo,
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      LLT::scalar(32), commonAlignment(Align(64), StructOffset));

  Register LoadAddr;

  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}
18280b57cec5SDimitry Andric 
/// Expand G_ADDRSPACE_CAST into target instructions.
///
/// Handles: no-op casts (bitcast), casts to/from the 32-bit constant address
/// space (truncate/extend with the known high bits), and casts between the
/// flat address space and the local/private segments (split or merge with
/// the segment aperture, guarded by a null-pointer check).
bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  // Casts that don't change the pointer value become plain bitcasts; the
  // instruction is mutated in place rather than erased.
  if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  // Casting into the 32-bit constant address space keeps only the low half
  // of the 64-bit pointer.
  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate.
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  // Casting out of the 32-bit constant address space reattaches the known
  // high 32 bits recorded in the machine function info.
  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another. Merge operands are required to be the same type, but creating an
    // extra ptrtoint would be kind of pointless.
    auto HighAddr = B.buildConstant(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    B.buildMerge(Dst, {Src, HighAddr});
    MI.eraseFromParent();
    return true;
  }

  // Flat -> local/private: drop the aperture half, but map a null flat
  // pointer to the segment's null value.
  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    // Extract low 32-bits of the pointer.
    auto PtrLo32 = B.buildExtract(DstTy, Src, 0);

    auto CmpRes =
        B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  // Remaining supported case: local/private -> flat.
  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
    return false;

  if (!ST.hasFlatAddressSpace())
    return false;

  auto SegmentNull =
      B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  // The aperture supplies the high half of the flat address.
  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
  if (!ApertureReg.isValid())
    return false;

  auto CmpRes =
      B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
  // A null segment pointer must become the null flat pointer.
  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);

  MI.eraseFromParent();
  return true;
}
19240b57cec5SDimitry Andric 
19250b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeFrint(
19260b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
19278bcb0991SDimitry Andric   MachineIRBuilder &B) const {
19280b57cec5SDimitry Andric   Register Src = MI.getOperand(1).getReg();
19290b57cec5SDimitry Andric   LLT Ty = MRI.getType(Src);
19300b57cec5SDimitry Andric   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
19310b57cec5SDimitry Andric 
19320b57cec5SDimitry Andric   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
19330b57cec5SDimitry Andric   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
19340b57cec5SDimitry Andric 
19358bcb0991SDimitry Andric   auto C1 = B.buildFConstant(Ty, C1Val);
19368bcb0991SDimitry Andric   auto CopySign = B.buildFCopysign(Ty, C1, Src);
19370b57cec5SDimitry Andric 
19380b57cec5SDimitry Andric   // TODO: Should this propagate fast-math-flags?
19398bcb0991SDimitry Andric   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
19408bcb0991SDimitry Andric   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
19410b57cec5SDimitry Andric 
19428bcb0991SDimitry Andric   auto C2 = B.buildFConstant(Ty, C2Val);
19438bcb0991SDimitry Andric   auto Fabs = B.buildFAbs(Ty, Src);
19440b57cec5SDimitry Andric 
19458bcb0991SDimitry Andric   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
19468bcb0991SDimitry Andric   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1947e8d8bef9SDimitry Andric   MI.eraseFromParent();
19480b57cec5SDimitry Andric   return true;
19490b57cec5SDimitry Andric }
19500b57cec5SDimitry Andric 
19510b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeFceil(
19520b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
19530b57cec5SDimitry Andric   MachineIRBuilder &B) const {
19540b57cec5SDimitry Andric 
19550b57cec5SDimitry Andric   const LLT S1 = LLT::scalar(1);
19560b57cec5SDimitry Andric   const LLT S64 = LLT::scalar(64);
19570b57cec5SDimitry Andric 
19580b57cec5SDimitry Andric   Register Src = MI.getOperand(1).getReg();
19590b57cec5SDimitry Andric   assert(MRI.getType(Src) == S64);
19600b57cec5SDimitry Andric 
19610b57cec5SDimitry Andric   // result = trunc(src)
19620b57cec5SDimitry Andric   // if (src > 0.0 && src != result)
19630b57cec5SDimitry Andric   //   result += 1.0
19640b57cec5SDimitry Andric 
19655ffd83dbSDimitry Andric   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
19660b57cec5SDimitry Andric 
19670b57cec5SDimitry Andric   const auto Zero = B.buildFConstant(S64, 0.0);
19680b57cec5SDimitry Andric   const auto One = B.buildFConstant(S64, 1.0);
19690b57cec5SDimitry Andric   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
19700b57cec5SDimitry Andric   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
19710b57cec5SDimitry Andric   auto And = B.buildAnd(S1, Lt0, NeTrunc);
19720b57cec5SDimitry Andric   auto Add = B.buildSelect(S64, And, One, Zero);
19730b57cec5SDimitry Andric 
19740b57cec5SDimitry Andric   // TODO: Should this propagate fast-math-flags?
19750b57cec5SDimitry Andric   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
19760b57cec5SDimitry Andric   return true;
19770b57cec5SDimitry Andric }
19780b57cec5SDimitry Andric 
1979e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeFrem(
1980e8d8bef9SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
1981e8d8bef9SDimitry Andric   MachineIRBuilder &B) const {
1982e8d8bef9SDimitry Andric     Register DstReg = MI.getOperand(0).getReg();
1983e8d8bef9SDimitry Andric     Register Src0Reg = MI.getOperand(1).getReg();
1984e8d8bef9SDimitry Andric     Register Src1Reg = MI.getOperand(2).getReg();
1985e8d8bef9SDimitry Andric     auto Flags = MI.getFlags();
1986e8d8bef9SDimitry Andric     LLT Ty = MRI.getType(DstReg);
1987e8d8bef9SDimitry Andric 
1988e8d8bef9SDimitry Andric     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
1989e8d8bef9SDimitry Andric     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
1990e8d8bef9SDimitry Andric     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
1991e8d8bef9SDimitry Andric     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
1992e8d8bef9SDimitry Andric     MI.eraseFromParent();
1993e8d8bef9SDimitry Andric     return true;
1994e8d8bef9SDimitry Andric }
1995e8d8bef9SDimitry Andric 
1996e8d8bef9SDimitry Andric static MachineInstrBuilder extractF64Exponent(Register Hi,
19970b57cec5SDimitry Andric                                               MachineIRBuilder &B) {
19980b57cec5SDimitry Andric   const unsigned FractBits = 52;
19990b57cec5SDimitry Andric   const unsigned ExpBits = 11;
20000b57cec5SDimitry Andric   LLT S32 = LLT::scalar(32);
20010b57cec5SDimitry Andric 
20020b57cec5SDimitry Andric   auto Const0 = B.buildConstant(S32, FractBits - 32);
20030b57cec5SDimitry Andric   auto Const1 = B.buildConstant(S32, ExpBits);
20040b57cec5SDimitry Andric 
20050b57cec5SDimitry Andric   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
2006e8d8bef9SDimitry Andric     .addUse(Hi)
20070b57cec5SDimitry Andric     .addUse(Const0.getReg(0))
20080b57cec5SDimitry Andric     .addUse(Const1.getReg(0));
20090b57cec5SDimitry Andric 
20100b57cec5SDimitry Andric   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
20110b57cec5SDimitry Andric }
20120b57cec5SDimitry Andric 
/// Expand s64 G_INTRINSIC_TRUNC by clearing the mantissa bits that sit below
/// the binary point, as determined by the value's unbiased exponent.
bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  // Mask covering all 52 fraction bits of the f64.
  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});

  // Shifting the fraction mask right by the exponent leaves set bits exactly
  // where the fractional part lives; inverting gives the keep-mask, and the
  // AND clears the fractional bits from the source.
  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  // Exponent < 0: |src| < 1, so the truncated result is a signed zero.
  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  // Exponent > 51: no fraction bits remain, src is already an integer.
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
}
20570b57cec5SDimitry Andric 
20580b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeITOFP(
20590b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
20600b57cec5SDimitry Andric   MachineIRBuilder &B, bool Signed) const {
20610b57cec5SDimitry Andric 
20620b57cec5SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
20630b57cec5SDimitry Andric   Register Src = MI.getOperand(1).getReg();
20640b57cec5SDimitry Andric 
20650b57cec5SDimitry Andric   const LLT S64 = LLT::scalar(64);
20660b57cec5SDimitry Andric   const LLT S32 = LLT::scalar(32);
20670b57cec5SDimitry Andric 
20680b57cec5SDimitry Andric   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
20690b57cec5SDimitry Andric 
20700b57cec5SDimitry Andric   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
20710b57cec5SDimitry Andric 
20720b57cec5SDimitry Andric   auto CvtHi = Signed ?
20730b57cec5SDimitry Andric     B.buildSITOFP(S64, Unmerge.getReg(1)) :
20740b57cec5SDimitry Andric     B.buildUITOFP(S64, Unmerge.getReg(1));
20750b57cec5SDimitry Andric 
20760b57cec5SDimitry Andric   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
20770b57cec5SDimitry Andric 
20780b57cec5SDimitry Andric   auto ThirtyTwo = B.buildConstant(S32, 32);
20790b57cec5SDimitry Andric   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
20800b57cec5SDimitry Andric     .addUse(CvtHi.getReg(0))
20810b57cec5SDimitry Andric     .addUse(ThirtyTwo.getReg(0));
20820b57cec5SDimitry Andric 
20830b57cec5SDimitry Andric   // TODO: Should this propagate fast-math-flags?
20840b57cec5SDimitry Andric   B.buildFAdd(Dst, LdExp, CvtLo);
20850b57cec5SDimitry Andric   MI.eraseFromParent();
20860b57cec5SDimitry Andric   return true;
20870b57cec5SDimitry Andric }
20880b57cec5SDimitry Andric 
20895ffd83dbSDimitry Andric // TODO: Copied from DAG implementation. Verify logic and document how this
20905ffd83dbSDimitry Andric // actually works.
// Lower G_FPTOSI/G_FPTOUI producing an s64 result from an s32 or s64 source,
// by splitting the truncated value into high/low 32-bit pieces and converting
// each separately.
bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
                                        MachineRegisterInfo &MRI,
                                        MachineIRBuilder &B,
                                        bool Signed) const {

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  const LLT SrcLT = MRI.getType(Src);
  assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);

  unsigned Flags = MI.getFlags();

  // The basic idea of converting a floating point number into a pair of 32-bit
  // integers is illustrated as follows:
  //
  //     tf := trunc(val);
  //    hif := floor(tf * 2^-32);
  //    lof := tf - hif * 2^32; // lof is always positive due to floor.
  //     hi := fptoi(hif);
  //     lo := fptoi(lof);
  //
  auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
  MachineInstrBuilder Sign;
  if (Signed && SrcLT == S32) {
    // However, a 32-bit floating point number has only 23 bits mantissa and
    // it's not enough to hold all the significant bits of `lof` if val is
    // negative. To avoid the loss of precision, We need to take the absolute
    // value after truncating and flip the result back based on the original
    // signedness.
    Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
    Trunc = B.buildFAbs(S32, Trunc, Flags);
  }
  // K0 = 2^-32 and K1 = -2^32, encoded in the source's floating-point format.
  MachineInstrBuilder K0, K1;
  if (SrcLT == S64) {
    K0 = B.buildFConstant(S64,
                          BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
    K1 = B.buildFConstant(S64,
                          BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
  } else {
    K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
    K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
  }

  // FloorMul == hif; Fma computes lof = fma(hif, -2^32, tf).
  auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
  auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
  auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);

  auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
                                     : B.buildFPTOUI(S32, FloorMul);
  auto Lo = B.buildFPTOUI(S32, Fma);

  if (Signed && SrcLT == S32) {
    // Flip the result based on the signedness, which is either all 0s or 1s.
    Sign = B.buildMerge(S64, {Sign, Sign});
    // r := xor({lo, hi}, sign) - sign;
    B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign);
  } else
    B.buildMerge(Dst, {Lo, Hi});
  MI.eraseFromParent();

  return true;
}
21575ffd83dbSDimitry Andric 
21585ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
21595ffd83dbSDimitry Andric                                                MachineInstr &MI) const {
21605ffd83dbSDimitry Andric   MachineFunction &MF = Helper.MIRBuilder.getMF();
21610b57cec5SDimitry Andric   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
21620b57cec5SDimitry Andric 
21630b57cec5SDimitry Andric   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
21640b57cec5SDimitry Andric                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
21650b57cec5SDimitry Andric 
21660b57cec5SDimitry Andric   // With ieee_mode disabled, the instructions have the correct behavior
21670b57cec5SDimitry Andric   // already for G_FMINNUM/G_FMAXNUM
21680b57cec5SDimitry Andric   if (!MFI->getMode().IEEE)
21690b57cec5SDimitry Andric     return !IsIEEEOp;
21700b57cec5SDimitry Andric 
21710b57cec5SDimitry Andric   if (IsIEEEOp)
21720b57cec5SDimitry Andric     return true;
21730b57cec5SDimitry Andric 
21740b57cec5SDimitry Andric   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
21750b57cec5SDimitry Andric }
21760b57cec5SDimitry Andric 
21770b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
21780b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
21790b57cec5SDimitry Andric   MachineIRBuilder &B) const {
21800b57cec5SDimitry Andric   // TODO: Should move some of this into LegalizerHelper.
21810b57cec5SDimitry Andric 
21820b57cec5SDimitry Andric   // TODO: Promote dynamic indexing of s16 to s32
21835ffd83dbSDimitry Andric 
21845ffd83dbSDimitry Andric   // FIXME: Artifact combiner probably should have replaced the truncated
21855ffd83dbSDimitry Andric   // constant before this, so we shouldn't need
21865ffd83dbSDimitry Andric   // getConstantVRegValWithLookThrough.
2187e8d8bef9SDimitry Andric   Optional<ValueAndVReg> MaybeIdxVal =
2188e8d8bef9SDimitry Andric       getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2189e8d8bef9SDimitry Andric   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
21900b57cec5SDimitry Andric     return true;
2191e8d8bef9SDimitry Andric   const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
21920b57cec5SDimitry Andric 
21930b57cec5SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
21940b57cec5SDimitry Andric   Register Vec = MI.getOperand(1).getReg();
21950b57cec5SDimitry Andric 
21960b57cec5SDimitry Andric   LLT VecTy = MRI.getType(Vec);
21970b57cec5SDimitry Andric   LLT EltTy = VecTy.getElementType();
21980b57cec5SDimitry Andric   assert(EltTy == MRI.getType(Dst));
21990b57cec5SDimitry Andric 
2200e8d8bef9SDimitry Andric   if (IdxVal < VecTy.getNumElements())
2201e8d8bef9SDimitry Andric     B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
22020b57cec5SDimitry Andric   else
22030b57cec5SDimitry Andric     B.buildUndef(Dst);
22040b57cec5SDimitry Andric 
22050b57cec5SDimitry Andric   MI.eraseFromParent();
22060b57cec5SDimitry Andric   return true;
22070b57cec5SDimitry Andric }
22080b57cec5SDimitry Andric 
22090b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
22100b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
22110b57cec5SDimitry Andric   MachineIRBuilder &B) const {
22120b57cec5SDimitry Andric   // TODO: Should move some of this into LegalizerHelper.
22130b57cec5SDimitry Andric 
22140b57cec5SDimitry Andric   // TODO: Promote dynamic indexing of s16 to s32
22155ffd83dbSDimitry Andric 
22165ffd83dbSDimitry Andric   // FIXME: Artifact combiner probably should have replaced the truncated
22175ffd83dbSDimitry Andric   // constant before this, so we shouldn't need
22185ffd83dbSDimitry Andric   // getConstantVRegValWithLookThrough.
2219e8d8bef9SDimitry Andric   Optional<ValueAndVReg> MaybeIdxVal =
2220e8d8bef9SDimitry Andric       getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2221e8d8bef9SDimitry Andric   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
22220b57cec5SDimitry Andric     return true;
22230b57cec5SDimitry Andric 
2224e8d8bef9SDimitry Andric   int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
22250b57cec5SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
22260b57cec5SDimitry Andric   Register Vec = MI.getOperand(1).getReg();
22270b57cec5SDimitry Andric   Register Ins = MI.getOperand(2).getReg();
22280b57cec5SDimitry Andric 
22290b57cec5SDimitry Andric   LLT VecTy = MRI.getType(Vec);
22300b57cec5SDimitry Andric   LLT EltTy = VecTy.getElementType();
22310b57cec5SDimitry Andric   assert(EltTy == MRI.getType(Ins));
22320b57cec5SDimitry Andric 
2233e8d8bef9SDimitry Andric   if (IdxVal < VecTy.getNumElements())
2234e8d8bef9SDimitry Andric     B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
22350b57cec5SDimitry Andric   else
22360b57cec5SDimitry Andric     B.buildUndef(Dst);
22370b57cec5SDimitry Andric 
22380b57cec5SDimitry Andric   MI.eraseFromParent();
22390b57cec5SDimitry Andric   return true;
22400b57cec5SDimitry Andric }
22410b57cec5SDimitry Andric 
22425ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeShuffleVector(
22435ffd83dbSDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
22445ffd83dbSDimitry Andric   MachineIRBuilder &B) const {
2245*fe6060f1SDimitry Andric   const LLT V2S16 = LLT::fixed_vector(2, 16);
22465ffd83dbSDimitry Andric 
22475ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
22485ffd83dbSDimitry Andric   Register Src0 = MI.getOperand(1).getReg();
22495ffd83dbSDimitry Andric   LLT DstTy = MRI.getType(Dst);
22505ffd83dbSDimitry Andric   LLT SrcTy = MRI.getType(Src0);
22515ffd83dbSDimitry Andric 
22525ffd83dbSDimitry Andric   if (SrcTy == V2S16 && DstTy == V2S16 &&
22535ffd83dbSDimitry Andric       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
22545ffd83dbSDimitry Andric     return true;
22555ffd83dbSDimitry Andric 
22565ffd83dbSDimitry Andric   MachineIRBuilder HelperBuilder(MI);
22575ffd83dbSDimitry Andric   GISelObserverWrapper DummyObserver;
22585ffd83dbSDimitry Andric   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
22595ffd83dbSDimitry Andric   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
22605ffd83dbSDimitry Andric }
22615ffd83dbSDimitry Andric 
22628bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeSinCos(
22638bcb0991SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
22648bcb0991SDimitry Andric   MachineIRBuilder &B) const {
22658bcb0991SDimitry Andric 
22668bcb0991SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
22678bcb0991SDimitry Andric   Register SrcReg = MI.getOperand(1).getReg();
22688bcb0991SDimitry Andric   LLT Ty = MRI.getType(DstReg);
22698bcb0991SDimitry Andric   unsigned Flags = MI.getFlags();
22708bcb0991SDimitry Andric 
22718bcb0991SDimitry Andric   Register TrigVal;
22725ffd83dbSDimitry Andric   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
22738bcb0991SDimitry Andric   if (ST.hasTrigReducedRange()) {
22748bcb0991SDimitry Andric     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
22758bcb0991SDimitry Andric     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
22768bcb0991SDimitry Andric       .addUse(MulVal.getReg(0))
22778bcb0991SDimitry Andric       .setMIFlags(Flags).getReg(0);
22788bcb0991SDimitry Andric   } else
22798bcb0991SDimitry Andric     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
22808bcb0991SDimitry Andric 
22818bcb0991SDimitry Andric   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
22828bcb0991SDimitry Andric     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
22838bcb0991SDimitry Andric   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
22848bcb0991SDimitry Andric     .addUse(TrigVal)
22858bcb0991SDimitry Andric     .setMIFlags(Flags);
22868bcb0991SDimitry Andric   MI.eraseFromParent();
22878bcb0991SDimitry Andric   return true;
22888bcb0991SDimitry Andric }
22898bcb0991SDimitry Andric 
// Emit a pc-relative address computation for \p GV + \p Offset into \p DstReg
// (of pointer type \p PtrTy), using relocation flavor \p GAFlags. For 32-bit
// destinations the 64-bit pc-relative value is computed in a scratch register
// and the low half extracted.
bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
                                                  MachineIRBuilder &B,
                                                  const GlobalValue *GV,
                                                  int64_t Offset,
                                                  unsigned GAFlags) const {
  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
  // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
  // to the following code sequence:
  //
  // For constant address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol
  //   s_addc_u32 s1, s1, 0
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   a fixup or relocation is emitted to replace $symbol with a literal
  //   constant, which is a pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // For global address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   fixups or relocations are emitted to replace $symbol@*@lo and
  //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
  //   which is a 64-bit pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // What we want here is an offset from the value returned by s_getpc
  // (which is the address of the s_add_u32 instruction) to the global
  // variable, but since the encoding of $symbol starts 4 bytes after the start
  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
  // small. This requires us to add 4 to the global variable offset in order to
  // compute the correct address. Similarly for the s_addc_u32 instruction, the
  // encoding of $symbol starts 12 bytes after the start of the s_add_u32
  // instruction.

  LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);

  // The pc-relative sequence always produces a 64-bit value; only reuse
  // DstReg directly when it is itself 64-bit.
  Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
    B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

  MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
    .addDef(PCReg);

  MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
  if (GAFlags == SIInstrInfo::MO_NONE)
    MIB.addImm(0);
  else
    // GAFlags + 1 is the matching @hi relocation flavor for the high half.
    MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);

  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

  // For a 32-bit pointer result, take the low 32 bits of the computed value.
  if (PtrTy.getSizeInBits() == 32)
    B.buildExtract(DstReg, PCReg, 0);
  return true;
 }
23498bcb0991SDimitry Andric 
// Lower G_GLOBAL_VALUE. LDS/region globals become absolute offsets (or the
// group-static-size intrinsic for dynamic LDS); other globals become
// pc-relative fixups, pc-relative relocations, or GOT loads depending on
// what the target lowering requests for \p GV.
bool AMDGPULegalizerInfo::legalizeGlobalValue(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned AS = Ty.getAddressSpace();

  const GlobalValue *GV = MI.getOperand(1).getGlobal();
  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    // The module LDS pseudo-global is exempt from the kernel-only check.
    if (!MFI->isModuleEntryFunction() &&
        !GV->getName().equals("llvm.amdgcn.module.lds")) {
      const Function &Fn = MF.getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
        Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
        DS_Warning);
      Fn.getContext().diagnose(BadLDSDecl);

      // We currently don't have a way to correctly allocate LDS objects that
      // aren't directly associated with a kernel. We do force inlining of
      // functions that use local objects. However, if these dead functions are
      // not eliminated, we don't want a compile time error. Just emit a warning
      // and a trap, since there should be no callable path here.
      B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
      B.buildUndef(DstReg);
      MI.eraseFromParent();
      return true;
    }

    // TODO: We could emit code to handle the initialization somewhere.
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      if (!TLI->shouldUseLDSConstAddress(GV)) {
        MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place;
      }

      if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
        Type *Ty = GV->getValueType();
        // HIP uses an unsized array `extern __shared__ T s[]` or similar
        // zero-sized type in other languages to declare the dynamic shared
        // memory which size is not known at the compile time. They will be
        // allocated by the runtime and placed directly after the static
        // allocated ones. They all share the same offset.
        if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
          // Adjust alignment for that dynamic shared memory array.
          MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
          LLT S32 = LLT::scalar(32);
          // The dynamic region starts right after the statically-sized LDS,
          // i.e. at the group static size.
          auto Sz =
              B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
          B.buildIntToPtr(DstReg, Sz);
          MI.eraseFromParent();
          return true;
        }
      }

      // Statically allocate this global in LDS and use its offset as the
      // address.
      B.buildConstant(
          DstReg,
          MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
      MI.eraseFromParent();
      return true;
    }

    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
      Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  // Otherwise load the address out of the GOT.
  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
      MachinePointerInfo::getGOT(MF),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      LoadTy, Align(8));

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
  } else
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  return true;
}
24588bcb0991SDimitry Andric 
2459e8d8bef9SDimitry Andric static LLT widenToNextPowerOf2(LLT Ty) {
2460e8d8bef9SDimitry Andric   if (Ty.isVector())
2461*fe6060f1SDimitry Andric     return Ty.changeElementCount(
2462*fe6060f1SDimitry Andric         ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2463e8d8bef9SDimitry Andric   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2464e8d8bef9SDimitry Andric }
2465e8d8bef9SDimitry Andric 
// Custom-legalize loads: cast 32-bit constant-address-space pointers to the
// 64-bit constant address space, and widen non-power-of-2 G_LOADs to the next
// power-of-2 memory size when the alignment allows it.
bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
                                       MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();
  GISelChangeObserver &Observer = Helper.Observer;

  Register PtrReg = MI.getOperand(1).getReg();
  LLT PtrTy = MRI.getType(PtrReg);
  unsigned AddrSpace = PtrTy.getAddressSpace();

  // Rewrite CONSTANT_ADDRESS_32BIT pointers as full 64-bit constant pointers
  // via an addrspacecast; the load instruction itself is kept.
  if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
    auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
    Observer.changingInstr(MI);
    MI.getOperand(1).setReg(Cast.getReg(0));
    Observer.changedInstr(MI);
    return true;
  }

  // The widening below only applies to plain G_LOAD (not extending loads).
  if (MI.getOpcode() != AMDGPU::G_LOAD)
    return false;

  Register ValReg = MI.getOperand(0).getReg();
  LLT ValTy = MRI.getType(ValReg);

  MachineMemOperand *MMO = *MI.memoperands_begin();
  const unsigned ValSize = ValTy.getSizeInBits();
  const LLT MemTy = MMO->getMemoryType();
  const Align MemAlign = MMO->getAlign();
  const unsigned MemSize = MemTy.getSizeInBits();
  const unsigned AlignInBits = 8 * MemAlign.value();

  // Widen non-power-of-2 loads to the alignment if needed
  if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
    const unsigned WideMemSize = PowerOf2Ceil(MemSize);

    // This was already the correct extending load result type, so just adjust
    // the memory type.
    if (WideMemSize == ValSize) {
      MachineFunction &MF = B.getMF();

      MachineMemOperand *WideMMO =
          MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
      Observer.changingInstr(MI);
      MI.setMemRefs(MF, {WideMMO});
      Observer.changedInstr(MI);
      return true;
    }

    // Don't bother handling edge case that should probably never be produced.
    if (ValSize > WideMemSize)
      return false;

    LLT WideTy = widenToNextPowerOf2(ValTy);

    // Load the widened type, then narrow back down to the requested result.
    Register WideLoad;
    if (!WideTy.isVector()) {
      WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
      B.buildTrunc(ValReg, WideLoad).getReg(0);
    } else {
      // Extract the subvector.

      if (isRegisterType(ValTy)) {
        // If this a case where G_EXTRACT is legal, use it.
        // (e.g. <3 x s32> -> <4 x s32>)
        WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
        B.buildExtract(ValReg, WideLoad, 0);
      } else {
        // For cases where the widened type isn't a nice register value, unmerge
        // from a widened register (e.g. <3 x s16> -> <4 x s16>)
        B.setInsertPt(B.getMBB(), ++B.getInsertPt());
        WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
        B.setInsertPt(B.getMBB(), MI.getIterator());
        B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
      }
    }

    MI.eraseFromParent();
    return true;
  }

  return false;
}
2549e8d8bef9SDimitry Andric 
25508bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFMad(
25518bcb0991SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
25528bcb0991SDimitry Andric   MachineIRBuilder &B) const {
25538bcb0991SDimitry Andric   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
25548bcb0991SDimitry Andric   assert(Ty.isScalar());
25558bcb0991SDimitry Andric 
2556480093f4SDimitry Andric   MachineFunction &MF = B.getMF();
2557480093f4SDimitry Andric   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2558480093f4SDimitry Andric 
25598bcb0991SDimitry Andric   // TODO: Always legal with future ftz flag.
25605ffd83dbSDimitry Andric   // FIXME: Do we need just output?
25615ffd83dbSDimitry Andric   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
25628bcb0991SDimitry Andric     return true;
25635ffd83dbSDimitry Andric   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
25648bcb0991SDimitry Andric     return true;
25658bcb0991SDimitry Andric 
25668bcb0991SDimitry Andric   MachineIRBuilder HelperBuilder(MI);
25678bcb0991SDimitry Andric   GISelObserverWrapper DummyObserver;
25688bcb0991SDimitry Andric   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
25698bcb0991SDimitry Andric   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
25708bcb0991SDimitry Andric }
25718bcb0991SDimitry Andric 
2572480093f4SDimitry Andric bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2573480093f4SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2574480093f4SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
2575480093f4SDimitry Andric   Register PtrReg = MI.getOperand(1).getReg();
2576480093f4SDimitry Andric   Register CmpVal = MI.getOperand(2).getReg();
2577480093f4SDimitry Andric   Register NewVal = MI.getOperand(3).getReg();
2578480093f4SDimitry Andric 
2579e8d8bef9SDimitry Andric   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2580480093f4SDimitry Andric          "this should not have been custom lowered");
2581480093f4SDimitry Andric 
2582480093f4SDimitry Andric   LLT ValTy = MRI.getType(CmpVal);
2583*fe6060f1SDimitry Andric   LLT VecTy = LLT::fixed_vector(2, ValTy);
2584480093f4SDimitry Andric 
2585480093f4SDimitry Andric   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2586480093f4SDimitry Andric 
2587480093f4SDimitry Andric   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2588480093f4SDimitry Andric     .addDef(DstReg)
2589480093f4SDimitry Andric     .addUse(PtrReg)
2590480093f4SDimitry Andric     .addUse(PackedVal)
2591480093f4SDimitry Andric     .setMemRefs(MI.memoperands());
2592480093f4SDimitry Andric 
2593480093f4SDimitry Andric   MI.eraseFromParent();
2594480093f4SDimitry Andric   return true;
2595480093f4SDimitry Andric }
2596480093f4SDimitry Andric 
25975ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeFlog(
25985ffd83dbSDimitry Andric   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
25995ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
26005ffd83dbSDimitry Andric   Register Src = MI.getOperand(1).getReg();
26015ffd83dbSDimitry Andric   LLT Ty = B.getMRI()->getType(Dst);
26025ffd83dbSDimitry Andric   unsigned Flags = MI.getFlags();
26035ffd83dbSDimitry Andric 
26045ffd83dbSDimitry Andric   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
26055ffd83dbSDimitry Andric   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
26065ffd83dbSDimitry Andric 
26075ffd83dbSDimitry Andric   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
26085ffd83dbSDimitry Andric   MI.eraseFromParent();
26095ffd83dbSDimitry Andric   return true;
26105ffd83dbSDimitry Andric }
26115ffd83dbSDimitry Andric 
26125ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
26135ffd83dbSDimitry Andric                                        MachineIRBuilder &B) const {
26145ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
26155ffd83dbSDimitry Andric   Register Src = MI.getOperand(1).getReg();
26165ffd83dbSDimitry Andric   unsigned Flags = MI.getFlags();
26175ffd83dbSDimitry Andric   LLT Ty = B.getMRI()->getType(Dst);
26185ffd83dbSDimitry Andric 
26195ffd83dbSDimitry Andric   auto K = B.buildFConstant(Ty, numbers::log2e);
26205ffd83dbSDimitry Andric   auto Mul = B.buildFMul(Ty, Src, K, Flags);
26215ffd83dbSDimitry Andric   B.buildFExp2(Dst, Mul, Flags);
26225ffd83dbSDimitry Andric   MI.eraseFromParent();
26235ffd83dbSDimitry Andric   return true;
26245ffd83dbSDimitry Andric }
26255ffd83dbSDimitry Andric 
26265ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
26275ffd83dbSDimitry Andric                                        MachineIRBuilder &B) const {
26285ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
26295ffd83dbSDimitry Andric   Register Src0 = MI.getOperand(1).getReg();
26305ffd83dbSDimitry Andric   Register Src1 = MI.getOperand(2).getReg();
26315ffd83dbSDimitry Andric   unsigned Flags = MI.getFlags();
26325ffd83dbSDimitry Andric   LLT Ty = B.getMRI()->getType(Dst);
26335ffd83dbSDimitry Andric   const LLT S16 = LLT::scalar(16);
26345ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
26355ffd83dbSDimitry Andric 
26365ffd83dbSDimitry Andric   if (Ty == S32) {
26375ffd83dbSDimitry Andric     auto Log = B.buildFLog2(S32, Src0, Flags);
26385ffd83dbSDimitry Andric     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
26395ffd83dbSDimitry Andric       .addUse(Log.getReg(0))
26405ffd83dbSDimitry Andric       .addUse(Src1)
26415ffd83dbSDimitry Andric       .setMIFlags(Flags);
26425ffd83dbSDimitry Andric     B.buildFExp2(Dst, Mul, Flags);
26435ffd83dbSDimitry Andric   } else if (Ty == S16) {
26445ffd83dbSDimitry Andric     // There's no f16 fmul_legacy, so we need to convert for it.
26455ffd83dbSDimitry Andric     auto Log = B.buildFLog2(S16, Src0, Flags);
26465ffd83dbSDimitry Andric     auto Ext0 = B.buildFPExt(S32, Log, Flags);
26475ffd83dbSDimitry Andric     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
26485ffd83dbSDimitry Andric     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
26495ffd83dbSDimitry Andric       .addUse(Ext0.getReg(0))
26505ffd83dbSDimitry Andric       .addUse(Ext1.getReg(0))
26515ffd83dbSDimitry Andric       .setMIFlags(Flags);
26525ffd83dbSDimitry Andric 
26535ffd83dbSDimitry Andric     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
26545ffd83dbSDimitry Andric   } else
26555ffd83dbSDimitry Andric     return false;
26565ffd83dbSDimitry Andric 
26575ffd83dbSDimitry Andric   MI.eraseFromParent();
26585ffd83dbSDimitry Andric   return true;
26595ffd83dbSDimitry Andric }
26605ffd83dbSDimitry Andric 
26615ffd83dbSDimitry Andric // Find a source register, ignoring any possible source modifiers.
26625ffd83dbSDimitry Andric static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
26635ffd83dbSDimitry Andric   Register ModSrc = OrigSrc;
26645ffd83dbSDimitry Andric   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
26655ffd83dbSDimitry Andric     ModSrc = SrcFNeg->getOperand(1).getReg();
26665ffd83dbSDimitry Andric     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
26675ffd83dbSDimitry Andric       ModSrc = SrcFAbs->getOperand(1).getReg();
26685ffd83dbSDimitry Andric   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
26695ffd83dbSDimitry Andric     ModSrc = SrcFAbs->getOperand(1).getReg();
26705ffd83dbSDimitry Andric   return ModSrc;
26715ffd83dbSDimitry Andric }
26725ffd83dbSDimitry Andric 
26735ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
26745ffd83dbSDimitry Andric                                          MachineRegisterInfo &MRI,
26755ffd83dbSDimitry Andric                                          MachineIRBuilder &B) const {
26765ffd83dbSDimitry Andric 
26775ffd83dbSDimitry Andric   const LLT S1 = LLT::scalar(1);
26785ffd83dbSDimitry Andric   const LLT S64 = LLT::scalar(64);
26795ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
26805ffd83dbSDimitry Andric   Register OrigSrc = MI.getOperand(1).getReg();
26815ffd83dbSDimitry Andric   unsigned Flags = MI.getFlags();
26825ffd83dbSDimitry Andric   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
26835ffd83dbSDimitry Andric          "this should not have been custom lowered");
26845ffd83dbSDimitry Andric 
26855ffd83dbSDimitry Andric   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
26865ffd83dbSDimitry Andric   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
26875ffd83dbSDimitry Andric   // efficient way to implement it is using V_FRACT_F64. The workaround for the
26885ffd83dbSDimitry Andric   // V_FRACT bug is:
26895ffd83dbSDimitry Andric   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
26905ffd83dbSDimitry Andric   //
26915ffd83dbSDimitry Andric   // Convert floor(x) to (x - fract(x))
26925ffd83dbSDimitry Andric 
26935ffd83dbSDimitry Andric   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
26945ffd83dbSDimitry Andric     .addUse(OrigSrc)
26955ffd83dbSDimitry Andric     .setMIFlags(Flags);
26965ffd83dbSDimitry Andric 
26975ffd83dbSDimitry Andric   // Give source modifier matching some assistance before obscuring a foldable
26985ffd83dbSDimitry Andric   // pattern.
26995ffd83dbSDimitry Andric 
27005ffd83dbSDimitry Andric   // TODO: We can avoid the neg on the fract? The input sign to fract
27015ffd83dbSDimitry Andric   // shouldn't matter?
27025ffd83dbSDimitry Andric   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
27035ffd83dbSDimitry Andric 
27045ffd83dbSDimitry Andric   auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
27055ffd83dbSDimitry Andric 
27065ffd83dbSDimitry Andric   Register Min = MRI.createGenericVirtualRegister(S64);
27075ffd83dbSDimitry Andric 
27085ffd83dbSDimitry Andric   // We don't need to concern ourselves with the snan handling difference, so
27095ffd83dbSDimitry Andric   // use the one which will directly select.
27105ffd83dbSDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
27115ffd83dbSDimitry Andric   if (MFI->getMode().IEEE)
27125ffd83dbSDimitry Andric     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
27135ffd83dbSDimitry Andric   else
27145ffd83dbSDimitry Andric     B.buildFMinNum(Min, Fract, Const, Flags);
27155ffd83dbSDimitry Andric 
27165ffd83dbSDimitry Andric   Register CorrectedFract = Min;
27175ffd83dbSDimitry Andric   if (!MI.getFlag(MachineInstr::FmNoNans)) {
27185ffd83dbSDimitry Andric     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
27195ffd83dbSDimitry Andric     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
27205ffd83dbSDimitry Andric   }
27215ffd83dbSDimitry Andric 
27225ffd83dbSDimitry Andric   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
27235ffd83dbSDimitry Andric   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
27245ffd83dbSDimitry Andric 
27255ffd83dbSDimitry Andric   MI.eraseFromParent();
27265ffd83dbSDimitry Andric   return true;
27275ffd83dbSDimitry Andric }
27285ffd83dbSDimitry Andric 
27295ffd83dbSDimitry Andric // Turn an illegal packed v2s16 build vector into bit operations.
27305ffd83dbSDimitry Andric // TODO: This should probably be a bitcast action in LegalizerHelper.
27315ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeBuildVector(
27325ffd83dbSDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
27335ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
27345ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
2735*fe6060f1SDimitry Andric   assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
27365ffd83dbSDimitry Andric 
27375ffd83dbSDimitry Andric   Register Src0 = MI.getOperand(1).getReg();
27385ffd83dbSDimitry Andric   Register Src1 = MI.getOperand(2).getReg();
27395ffd83dbSDimitry Andric   assert(MRI.getType(Src0) == LLT::scalar(16));
27405ffd83dbSDimitry Andric 
27415ffd83dbSDimitry Andric   auto Merge = B.buildMerge(S32, {Src0, Src1});
27425ffd83dbSDimitry Andric   B.buildBitcast(Dst, Merge);
27435ffd83dbSDimitry Andric 
27445ffd83dbSDimitry Andric   MI.eraseFromParent();
27455ffd83dbSDimitry Andric   return true;
27465ffd83dbSDimitry Andric }
27475ffd83dbSDimitry Andric 
2748e8d8bef9SDimitry Andric // Check that this is a G_XOR x, -1
2749e8d8bef9SDimitry Andric static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2750e8d8bef9SDimitry Andric   if (MI.getOpcode() != TargetOpcode::G_XOR)
2751e8d8bef9SDimitry Andric     return false;
2752e8d8bef9SDimitry Andric   auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2753e8d8bef9SDimitry Andric   return ConstVal && *ConstVal == -1;
2754e8d8bef9SDimitry Andric }
2755e8d8bef9SDimitry Andric 
27560b57cec5SDimitry Andric // Return the use branch instruction, otherwise null if the usage is invalid.
2757e8d8bef9SDimitry Andric static MachineInstr *
2758e8d8bef9SDimitry Andric verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
2759e8d8bef9SDimitry Andric                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
27600b57cec5SDimitry Andric   Register CondDef = MI.getOperand(0).getReg();
27610b57cec5SDimitry Andric   if (!MRI.hasOneNonDBGUse(CondDef))
27620b57cec5SDimitry Andric     return nullptr;
27630b57cec5SDimitry Andric 
27645ffd83dbSDimitry Andric   MachineBasicBlock *Parent = MI.getParent();
2765e8d8bef9SDimitry Andric   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
2766e8d8bef9SDimitry Andric 
2767e8d8bef9SDimitry Andric   if (isNot(MRI, *UseMI)) {
2768e8d8bef9SDimitry Andric     Register NegatedCond = UseMI->getOperand(0).getReg();
2769e8d8bef9SDimitry Andric     if (!MRI.hasOneNonDBGUse(NegatedCond))
2770e8d8bef9SDimitry Andric       return nullptr;
2771e8d8bef9SDimitry Andric 
2772e8d8bef9SDimitry Andric     // We're deleting the def of this value, so we need to remove it.
2773e8d8bef9SDimitry Andric     UseMI->eraseFromParent();
2774e8d8bef9SDimitry Andric 
2775e8d8bef9SDimitry Andric     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
2776e8d8bef9SDimitry Andric     Negated = true;
2777e8d8bef9SDimitry Andric   }
2778e8d8bef9SDimitry Andric 
2779e8d8bef9SDimitry Andric   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
2780480093f4SDimitry Andric     return nullptr;
2781480093f4SDimitry Andric 
27825ffd83dbSDimitry Andric   // Make sure the cond br is followed by a G_BR, or is the last instruction.
2783e8d8bef9SDimitry Andric   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
27845ffd83dbSDimitry Andric   if (Next == Parent->end()) {
27855ffd83dbSDimitry Andric     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
27865ffd83dbSDimitry Andric     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
27875ffd83dbSDimitry Andric       return nullptr;
27885ffd83dbSDimitry Andric     UncondBrTarget = &*NextMBB;
27895ffd83dbSDimitry Andric   } else {
2790480093f4SDimitry Andric     if (Next->getOpcode() != AMDGPU::G_BR)
2791480093f4SDimitry Andric       return nullptr;
2792480093f4SDimitry Andric     Br = &*Next;
27935ffd83dbSDimitry Andric     UncondBrTarget = Br->getOperand(0).getMBB();
2794480093f4SDimitry Andric   }
2795480093f4SDimitry Andric 
2796e8d8bef9SDimitry Andric   return UseMI;
27970b57cec5SDimitry Andric }
27980b57cec5SDimitry Andric 
27990b57cec5SDimitry Andric bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2800e8d8bef9SDimitry Andric                                          const ArgDescriptor *Arg,
2801e8d8bef9SDimitry Andric                                          const TargetRegisterClass *ArgRC,
2802e8d8bef9SDimitry Andric                                          LLT ArgTy) const {
2803e8d8bef9SDimitry Andric   MCRegister SrcReg = Arg->getRegister();
2804e8d8bef9SDimitry Andric   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
28055ffd83dbSDimitry Andric   assert(DstReg.isVirtual() && "Virtual register expected");
28060b57cec5SDimitry Andric 
2807e8d8bef9SDimitry Andric   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2808e8d8bef9SDimitry Andric                                              ArgTy);
28090b57cec5SDimitry Andric   if (Arg->isMasked()) {
28100b57cec5SDimitry Andric     // TODO: Should we try to emit this once in the entry block?
28110b57cec5SDimitry Andric     const LLT S32 = LLT::scalar(32);
28120b57cec5SDimitry Andric     const unsigned Mask = Arg->getMask();
28130b57cec5SDimitry Andric     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
28140b57cec5SDimitry Andric 
28158bcb0991SDimitry Andric     Register AndMaskSrc = LiveIn;
28168bcb0991SDimitry Andric 
28178bcb0991SDimitry Andric     if (Shift != 0) {
28180b57cec5SDimitry Andric       auto ShiftAmt = B.buildConstant(S32, Shift);
28198bcb0991SDimitry Andric       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
28208bcb0991SDimitry Andric     }
28218bcb0991SDimitry Andric 
28228bcb0991SDimitry Andric     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
28235ffd83dbSDimitry Andric   } else {
28240b57cec5SDimitry Andric     B.buildCopy(DstReg, LiveIn);
28250b57cec5SDimitry Andric   }
28260b57cec5SDimitry Andric 
28270b57cec5SDimitry Andric   return true;
28280b57cec5SDimitry Andric }
28290b57cec5SDimitry Andric 
2830e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::loadInputValue(
2831e8d8bef9SDimitry Andric     Register DstReg, MachineIRBuilder &B,
2832e8d8bef9SDimitry Andric     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2833e8d8bef9SDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2834e8d8bef9SDimitry Andric   const ArgDescriptor *Arg;
2835e8d8bef9SDimitry Andric   const TargetRegisterClass *ArgRC;
2836e8d8bef9SDimitry Andric   LLT ArgTy;
2837e8d8bef9SDimitry Andric   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2838e8d8bef9SDimitry Andric 
2839e8d8bef9SDimitry Andric   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2840e8d8bef9SDimitry Andric     return false; // TODO: Handle these
2841e8d8bef9SDimitry Andric   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2842e8d8bef9SDimitry Andric }
2843e8d8bef9SDimitry Andric 
28440b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
28455ffd83dbSDimitry Andric     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
28460b57cec5SDimitry Andric     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2847e8d8bef9SDimitry Andric   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
28485ffd83dbSDimitry Andric     return false;
28495ffd83dbSDimitry Andric 
28500b57cec5SDimitry Andric   MI.eraseFromParent();
28510b57cec5SDimitry Andric   return true;
28520b57cec5SDimitry Andric }
28530b57cec5SDimitry Andric 
28548bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
28558bcb0991SDimitry Andric                                        MachineRegisterInfo &MRI,
28568bcb0991SDimitry Andric                                        MachineIRBuilder &B) const {
2857480093f4SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
2858480093f4SDimitry Andric   LLT DstTy = MRI.getType(Dst);
2859480093f4SDimitry Andric   LLT S16 = LLT::scalar(16);
2860480093f4SDimitry Andric   LLT S32 = LLT::scalar(32);
2861480093f4SDimitry Andric   LLT S64 = LLT::scalar(64);
28628bcb0991SDimitry Andric 
2863480093f4SDimitry Andric   if (DstTy == S16)
2864480093f4SDimitry Andric     return legalizeFDIV16(MI, MRI, B);
2865480093f4SDimitry Andric   if (DstTy == S32)
2866480093f4SDimitry Andric     return legalizeFDIV32(MI, MRI, B);
2867480093f4SDimitry Andric   if (DstTy == S64)
2868480093f4SDimitry Andric     return legalizeFDIV64(MI, MRI, B);
2869480093f4SDimitry Andric 
28708bcb0991SDimitry Andric   return false;
28718bcb0991SDimitry Andric }
28728bcb0991SDimitry Andric 
2873*fe6060f1SDimitry Andric void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
2874*fe6060f1SDimitry Andric                                                         Register DstDivReg,
2875*fe6060f1SDimitry Andric                                                         Register DstRemReg,
28765ffd83dbSDimitry Andric                                                         Register X,
2877*fe6060f1SDimitry Andric                                                         Register Y) const {
28785ffd83dbSDimitry Andric   const LLT S1 = LLT::scalar(1);
28795ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
28805ffd83dbSDimitry Andric 
28815ffd83dbSDimitry Andric   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
28825ffd83dbSDimitry Andric   // algorithm used here.
28835ffd83dbSDimitry Andric 
28845ffd83dbSDimitry Andric   // Initial estimate of inv(y).
28855ffd83dbSDimitry Andric   auto FloatY = B.buildUITOFP(S32, Y);
28865ffd83dbSDimitry Andric   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
28875ffd83dbSDimitry Andric   auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
28885ffd83dbSDimitry Andric   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
28895ffd83dbSDimitry Andric   auto Z = B.buildFPTOUI(S32, ScaledY);
28905ffd83dbSDimitry Andric 
28915ffd83dbSDimitry Andric   // One round of UNR.
28925ffd83dbSDimitry Andric   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
28935ffd83dbSDimitry Andric   auto NegYZ = B.buildMul(S32, NegY, Z);
28945ffd83dbSDimitry Andric   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
28955ffd83dbSDimitry Andric 
28965ffd83dbSDimitry Andric   // Quotient/remainder estimate.
28975ffd83dbSDimitry Andric   auto Q = B.buildUMulH(S32, X, Z);
28985ffd83dbSDimitry Andric   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
28995ffd83dbSDimitry Andric 
29005ffd83dbSDimitry Andric   // First quotient/remainder refinement.
29015ffd83dbSDimitry Andric   auto One = B.buildConstant(S32, 1);
29025ffd83dbSDimitry Andric   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2903*fe6060f1SDimitry Andric   if (DstDivReg)
29045ffd83dbSDimitry Andric     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
29055ffd83dbSDimitry Andric   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
29065ffd83dbSDimitry Andric 
29075ffd83dbSDimitry Andric   // Second quotient/remainder refinement.
29085ffd83dbSDimitry Andric   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
2909*fe6060f1SDimitry Andric   if (DstDivReg)
2910*fe6060f1SDimitry Andric     B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
29115ffd83dbSDimitry Andric 
2912*fe6060f1SDimitry Andric   if (DstRemReg)
2913*fe6060f1SDimitry Andric     B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
29145ffd83dbSDimitry Andric }
29155ffd83dbSDimitry Andric 
29165ffd83dbSDimitry Andric // Build integer reciprocal sequence arounud V_RCP_IFLAG_F32
29175ffd83dbSDimitry Andric //
29185ffd83dbSDimitry Andric // Return lo, hi of result
29195ffd83dbSDimitry Andric //
29205ffd83dbSDimitry Andric // %cvt.lo = G_UITOFP Val.lo
29215ffd83dbSDimitry Andric // %cvt.hi = G_UITOFP Val.hi
29225ffd83dbSDimitry Andric // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
29235ffd83dbSDimitry Andric // %rcp = G_AMDGPU_RCP_IFLAG %mad
29245ffd83dbSDimitry Andric // %mul1 = G_FMUL %rcp, 0x5f7ffffc
29255ffd83dbSDimitry Andric // %mul2 = G_FMUL %mul1, 2**(-32)
29265ffd83dbSDimitry Andric // %trunc = G_INTRINSIC_TRUNC %mul2
29275ffd83dbSDimitry Andric // %mad2 = G_FMAD %trunc, -(2**32), %mul1
29285ffd83dbSDimitry Andric // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
29295ffd83dbSDimitry Andric static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
29305ffd83dbSDimitry Andric                                                        Register Val) {
29315ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
29325ffd83dbSDimitry Andric   auto Unmerge = B.buildUnmerge(S32, Val);
29335ffd83dbSDimitry Andric 
29345ffd83dbSDimitry Andric   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
29355ffd83dbSDimitry Andric   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
29365ffd83dbSDimitry Andric 
29375ffd83dbSDimitry Andric   auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
29385ffd83dbSDimitry Andric                          B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
29395ffd83dbSDimitry Andric 
29405ffd83dbSDimitry Andric   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
29415ffd83dbSDimitry Andric   auto Mul1 =
29425ffd83dbSDimitry Andric       B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
29435ffd83dbSDimitry Andric 
29445ffd83dbSDimitry Andric   // 2**(-32)
29455ffd83dbSDimitry Andric   auto Mul2 =
29465ffd83dbSDimitry Andric       B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
29475ffd83dbSDimitry Andric   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
29485ffd83dbSDimitry Andric 
29495ffd83dbSDimitry Andric   // -(2**32)
29505ffd83dbSDimitry Andric   auto Mad2 = B.buildFMAD(S32, Trunc,
29515ffd83dbSDimitry Andric                           B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
29525ffd83dbSDimitry Andric 
29535ffd83dbSDimitry Andric   auto ResultLo = B.buildFPTOUI(S32, Mad2);
29545ffd83dbSDimitry Andric   auto ResultHi = B.buildFPTOUI(S32, Trunc);
29555ffd83dbSDimitry Andric 
29565ffd83dbSDimitry Andric   return {ResultLo.getReg(0), ResultHi.getReg(0)};
29575ffd83dbSDimitry Andric }
29585ffd83dbSDimitry Andric 
2959*fe6060f1SDimitry Andric void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
2960*fe6060f1SDimitry Andric                                                         Register DstDivReg,
2961*fe6060f1SDimitry Andric                                                         Register DstRemReg,
29625ffd83dbSDimitry Andric                                                         Register Numer,
2963*fe6060f1SDimitry Andric                                                         Register Denom) const {
29645ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
29655ffd83dbSDimitry Andric   const LLT S64 = LLT::scalar(64);
29665ffd83dbSDimitry Andric   const LLT S1 = LLT::scalar(1);
29675ffd83dbSDimitry Andric   Register RcpLo, RcpHi;
29685ffd83dbSDimitry Andric 
29695ffd83dbSDimitry Andric   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
29705ffd83dbSDimitry Andric 
29715ffd83dbSDimitry Andric   auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
29725ffd83dbSDimitry Andric 
29735ffd83dbSDimitry Andric   auto Zero64 = B.buildConstant(S64, 0);
29745ffd83dbSDimitry Andric   auto NegDenom = B.buildSub(S64, Zero64, Denom);
29755ffd83dbSDimitry Andric 
29765ffd83dbSDimitry Andric   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
29775ffd83dbSDimitry Andric   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
29785ffd83dbSDimitry Andric 
29795ffd83dbSDimitry Andric   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
29805ffd83dbSDimitry Andric   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
29815ffd83dbSDimitry Andric   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
29825ffd83dbSDimitry Andric 
29835ffd83dbSDimitry Andric   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
29845ffd83dbSDimitry Andric   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
29855ffd83dbSDimitry Andric   auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
29865ffd83dbSDimitry Andric   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
29875ffd83dbSDimitry Andric 
29885ffd83dbSDimitry Andric   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
29895ffd83dbSDimitry Andric   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
29905ffd83dbSDimitry Andric   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
29915ffd83dbSDimitry Andric   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
29925ffd83dbSDimitry Andric   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
29935ffd83dbSDimitry Andric 
29945ffd83dbSDimitry Andric   auto Zero32 = B.buildConstant(S32, 0);
29955ffd83dbSDimitry Andric   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
29965ffd83dbSDimitry Andric   auto Add2_HiC =
29975ffd83dbSDimitry Andric       B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
29985ffd83dbSDimitry Andric   auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
29995ffd83dbSDimitry Andric   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
30005ffd83dbSDimitry Andric 
30015ffd83dbSDimitry Andric   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
30025ffd83dbSDimitry Andric   Register NumerLo = UnmergeNumer.getReg(0);
30035ffd83dbSDimitry Andric   Register NumerHi = UnmergeNumer.getReg(1);
30045ffd83dbSDimitry Andric 
30055ffd83dbSDimitry Andric   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
30065ffd83dbSDimitry Andric   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
30075ffd83dbSDimitry Andric   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
30085ffd83dbSDimitry Andric   Register Mul3_Lo = UnmergeMul3.getReg(0);
30095ffd83dbSDimitry Andric   Register Mul3_Hi = UnmergeMul3.getReg(1);
30105ffd83dbSDimitry Andric   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
30115ffd83dbSDimitry Andric   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
30125ffd83dbSDimitry Andric   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
30135ffd83dbSDimitry Andric   auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
30145ffd83dbSDimitry Andric 
30155ffd83dbSDimitry Andric   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
30165ffd83dbSDimitry Andric   Register DenomLo = UnmergeDenom.getReg(0);
30175ffd83dbSDimitry Andric   Register DenomHi = UnmergeDenom.getReg(1);
30185ffd83dbSDimitry Andric 
30195ffd83dbSDimitry Andric   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
30205ffd83dbSDimitry Andric   auto C1 = B.buildSExt(S32, CmpHi);
30215ffd83dbSDimitry Andric 
30225ffd83dbSDimitry Andric   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
30235ffd83dbSDimitry Andric   auto C2 = B.buildSExt(S32, CmpLo);
30245ffd83dbSDimitry Andric 
30255ffd83dbSDimitry Andric   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
30265ffd83dbSDimitry Andric   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
30275ffd83dbSDimitry Andric 
30285ffd83dbSDimitry Andric   // TODO: Here and below portions of the code can be enclosed into if/endif.
30295ffd83dbSDimitry Andric   // Currently control flow is unconditional and we have 4 selects after
30305ffd83dbSDimitry Andric   // potential endif to substitute PHIs.
30315ffd83dbSDimitry Andric 
30325ffd83dbSDimitry Andric   // if C3 != 0 ...
30335ffd83dbSDimitry Andric   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
30345ffd83dbSDimitry Andric   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
30355ffd83dbSDimitry Andric   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
30365ffd83dbSDimitry Andric   auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
30375ffd83dbSDimitry Andric 
30385ffd83dbSDimitry Andric   auto One64 = B.buildConstant(S64, 1);
30395ffd83dbSDimitry Andric   auto Add3 = B.buildAdd(S64, MulHi3, One64);
30405ffd83dbSDimitry Andric 
30415ffd83dbSDimitry Andric   auto C4 =
30425ffd83dbSDimitry Andric       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
30435ffd83dbSDimitry Andric   auto C5 =
30445ffd83dbSDimitry Andric       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
30455ffd83dbSDimitry Andric   auto C6 = B.buildSelect(
30465ffd83dbSDimitry Andric       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
30475ffd83dbSDimitry Andric 
30485ffd83dbSDimitry Andric   // if (C6 != 0)
30495ffd83dbSDimitry Andric   auto Add4 = B.buildAdd(S64, Add3, One64);
30505ffd83dbSDimitry Andric   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
30515ffd83dbSDimitry Andric 
30525ffd83dbSDimitry Andric   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
30535ffd83dbSDimitry Andric   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
30545ffd83dbSDimitry Andric   auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
30555ffd83dbSDimitry Andric 
30565ffd83dbSDimitry Andric   // endif C6
30575ffd83dbSDimitry Andric   // endif C3
30585ffd83dbSDimitry Andric 
3059*fe6060f1SDimitry Andric   if (DstDivReg) {
30605ffd83dbSDimitry Andric     auto Sel1 = B.buildSelect(
30615ffd83dbSDimitry Andric         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
3062*fe6060f1SDimitry Andric     B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3063*fe6060f1SDimitry Andric                   Sel1, MulHi3);
3064*fe6060f1SDimitry Andric   }
3065*fe6060f1SDimitry Andric 
3066*fe6060f1SDimitry Andric   if (DstRemReg) {
30675ffd83dbSDimitry Andric     auto Sel2 = B.buildSelect(
30685ffd83dbSDimitry Andric         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
3069*fe6060f1SDimitry Andric     B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
3070*fe6060f1SDimitry Andric                   Sel2, Sub1);
30715ffd83dbSDimitry Andric   }
30725ffd83dbSDimitry Andric }
30735ffd83dbSDimitry Andric 
3074*fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
30755ffd83dbSDimitry Andric                                                   MachineRegisterInfo &MRI,
30765ffd83dbSDimitry Andric                                                   MachineIRBuilder &B) const {
3077*fe6060f1SDimitry Andric   Register DstDivReg, DstRemReg;
3078*fe6060f1SDimitry Andric   switch (MI.getOpcode()) {
3079*fe6060f1SDimitry Andric   default:
3080*fe6060f1SDimitry Andric     llvm_unreachable("Unexpected opcode!");
3081*fe6060f1SDimitry Andric   case AMDGPU::G_UDIV: {
3082*fe6060f1SDimitry Andric     DstDivReg = MI.getOperand(0).getReg();
3083*fe6060f1SDimitry Andric     break;
3084*fe6060f1SDimitry Andric   }
3085*fe6060f1SDimitry Andric   case AMDGPU::G_UREM: {
3086*fe6060f1SDimitry Andric     DstRemReg = MI.getOperand(0).getReg();
3087*fe6060f1SDimitry Andric     break;
3088*fe6060f1SDimitry Andric   }
3089*fe6060f1SDimitry Andric   case AMDGPU::G_UDIVREM: {
3090*fe6060f1SDimitry Andric     DstDivReg = MI.getOperand(0).getReg();
3091*fe6060f1SDimitry Andric     DstRemReg = MI.getOperand(1).getReg();
3092*fe6060f1SDimitry Andric     break;
3093*fe6060f1SDimitry Andric   }
3094*fe6060f1SDimitry Andric   }
3095*fe6060f1SDimitry Andric 
30965ffd83dbSDimitry Andric   const LLT S64 = LLT::scalar(64);
30975ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
3098*fe6060f1SDimitry Andric   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
3099*fe6060f1SDimitry Andric   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
3100*fe6060f1SDimitry Andric   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
3101*fe6060f1SDimitry Andric   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
31025ffd83dbSDimitry Andric 
31035ffd83dbSDimitry Andric   if (Ty == S32)
3104*fe6060f1SDimitry Andric     legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
31055ffd83dbSDimitry Andric   else if (Ty == S64)
3106*fe6060f1SDimitry Andric     legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
31075ffd83dbSDimitry Andric   else
31085ffd83dbSDimitry Andric     return false;
31095ffd83dbSDimitry Andric 
31105ffd83dbSDimitry Andric   MI.eraseFromParent();
31115ffd83dbSDimitry Andric   return true;
31125ffd83dbSDimitry Andric }
31135ffd83dbSDimitry Andric 
bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
                                                MachineRegisterInfo &MRI,
                                                MachineIRBuilder &B) const {
  // Expand G_SDIV/G_SREM/G_SDIVREM by reducing to the unsigned expansion:
  // take absolute values of both operands, divide unsigned, then restore the
  // appropriate sign on each requested result.
  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  if (Ty != S32 && Ty != S64)
    return false;

  // Source operands follow the explicit defs (one def for DIV/REM, two for
  // DIVREM).
  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
  Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
  Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();

  // LHSign/RHSign are all-ones for negative inputs and zero otherwise
  // (arithmetic shift by bitwidth-1).
  auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
  auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
  auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);

  // |x| = (x + sign) ^ sign — branch-free absolute value.
  LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);

  LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);

  // Dst*Reg are the instruction's results; Tmp*Reg receive the unsigned
  // quotient/remainder before sign restoration. Registers not needed by the
  // opcode stay invalid, which the Impl helpers treat as "don't compute".
  Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unexpected opcode!");
  case AMDGPU::G_SDIV: {
    DstDivReg = MI.getOperand(0).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);
    break;
  }
  case AMDGPU::G_SREM: {
    DstRemReg = MI.getOperand(0).getReg();
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);
    break;
  }
  case AMDGPU::G_SDIVREM: {
    DstDivReg = MI.getOperand(0).getReg();
    DstRemReg = MI.getOperand(1).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);
    break;
  }
  }

  if (Ty == S32)
    legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
  else
    legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);

  if (DstDivReg) {
    // Quotient is negative iff exactly one operand was negative; re-apply the
    // sign with the inverse of the abs trick: (q ^ s) - s.
    auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
    auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
    B.buildSub(DstDivReg, SignXor, Sign);
  }

  if (DstRemReg) {
    auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
    auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
    B.buildSub(DstRemReg, SignXor, Sign);
  }

  MI.eraseFromParent();
  return true;
}
31815ffd83dbSDimitry Andric 
31828bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
31838bcb0991SDimitry Andric                                                  MachineRegisterInfo &MRI,
31848bcb0991SDimitry Andric                                                  MachineIRBuilder &B) const {
31858bcb0991SDimitry Andric   Register Res = MI.getOperand(0).getReg();
31868bcb0991SDimitry Andric   Register LHS = MI.getOperand(1).getReg();
31878bcb0991SDimitry Andric   Register RHS = MI.getOperand(2).getReg();
31888bcb0991SDimitry Andric   uint16_t Flags = MI.getFlags();
31898bcb0991SDimitry Andric   LLT ResTy = MRI.getType(Res);
31908bcb0991SDimitry Andric 
31918bcb0991SDimitry Andric   const MachineFunction &MF = B.getMF();
3192e8d8bef9SDimitry Andric   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3193e8d8bef9SDimitry Andric                             MI.getFlag(MachineInstr::FmAfn);
31948bcb0991SDimitry Andric 
3195e8d8bef9SDimitry Andric   if (!AllowInaccurateRcp)
31968bcb0991SDimitry Andric     return false;
31978bcb0991SDimitry Andric 
31988bcb0991SDimitry Andric   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
31998bcb0991SDimitry Andric     // 1 / x -> RCP(x)
32008bcb0991SDimitry Andric     if (CLHS->isExactlyValue(1.0)) {
32018bcb0991SDimitry Andric       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
32028bcb0991SDimitry Andric         .addUse(RHS)
32038bcb0991SDimitry Andric         .setMIFlags(Flags);
32048bcb0991SDimitry Andric 
32058bcb0991SDimitry Andric       MI.eraseFromParent();
32068bcb0991SDimitry Andric       return true;
32078bcb0991SDimitry Andric     }
32088bcb0991SDimitry Andric 
32098bcb0991SDimitry Andric     // -1 / x -> RCP( FNEG(x) )
32108bcb0991SDimitry Andric     if (CLHS->isExactlyValue(-1.0)) {
32118bcb0991SDimitry Andric       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
32128bcb0991SDimitry Andric       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
32138bcb0991SDimitry Andric         .addUse(FNeg.getReg(0))
32148bcb0991SDimitry Andric         .setMIFlags(Flags);
32158bcb0991SDimitry Andric 
32168bcb0991SDimitry Andric       MI.eraseFromParent();
32178bcb0991SDimitry Andric       return true;
32188bcb0991SDimitry Andric     }
32198bcb0991SDimitry Andric   }
32208bcb0991SDimitry Andric 
32218bcb0991SDimitry Andric   // x / y -> x * (1.0 / y)
32228bcb0991SDimitry Andric   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
32238bcb0991SDimitry Andric     .addUse(RHS)
32248bcb0991SDimitry Andric     .setMIFlags(Flags);
32258bcb0991SDimitry Andric   B.buildFMul(Res, LHS, RCP, Flags);
32268bcb0991SDimitry Andric 
32278bcb0991SDimitry Andric   MI.eraseFromParent();
32288bcb0991SDimitry Andric   return true;
32298bcb0991SDimitry Andric }
32308bcb0991SDimitry Andric 
bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
                                                   MachineRegisterInfo &MRI,
                                                   MachineIRBuilder &B) const {
  // Fast f64 division: rcp followed by two Newton-Raphson refinement steps
  // and a final residual correction. Only permitted under unsafe FP math or
  // the 'afn' flag; otherwise fail so the precise f64 expansion runs.
  Register Res = MI.getOperand(0).getReg();
  Register X = MI.getOperand(1).getReg();
  Register Y = MI.getOperand(2).getReg();
  uint16_t Flags = MI.getFlags();
  LLT ResTy = MRI.getType(Res);

  const MachineFunction &MF = B.getMF();
  bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
                            MI.getFlag(MachineInstr::FmAfn);

  if (!AllowInaccurateRcp)
    return false;

  auto NegY = B.buildFNeg(ResTy, Y);
  auto One = B.buildFConstant(ResTy, 1.0);

  // Initial approximation r0 ~= 1/y.
  auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
    .addUse(Y)
    .setMIFlags(Flags);

  // Newton-Raphson step: r' = r + r*(1 - y*r), written as two FMAs.
  auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp0, R, R);

  // Second refinement step.
  auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp1, R, R);

  // Quotient estimate q = x*r, then correct with the residual x - y*q.
  auto Ret = B.buildFMul(ResTy, X, R);
  auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);

  B.buildFMA(Res, Tmp2, R, Ret);
  MI.eraseFromParent();
  return true;
}
32678bcb0991SDimitry Andric 
3268480093f4SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3269480093f4SDimitry Andric                                          MachineRegisterInfo &MRI,
3270480093f4SDimitry Andric                                          MachineIRBuilder &B) const {
3271e8d8bef9SDimitry Andric   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3272e8d8bef9SDimitry Andric     return true;
3273e8d8bef9SDimitry Andric 
3274480093f4SDimitry Andric   Register Res = MI.getOperand(0).getReg();
3275480093f4SDimitry Andric   Register LHS = MI.getOperand(1).getReg();
3276480093f4SDimitry Andric   Register RHS = MI.getOperand(2).getReg();
3277480093f4SDimitry Andric 
3278480093f4SDimitry Andric   uint16_t Flags = MI.getFlags();
3279480093f4SDimitry Andric 
3280480093f4SDimitry Andric   LLT S16 = LLT::scalar(16);
3281480093f4SDimitry Andric   LLT S32 = LLT::scalar(32);
3282480093f4SDimitry Andric 
3283480093f4SDimitry Andric   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3284480093f4SDimitry Andric   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3285480093f4SDimitry Andric 
3286480093f4SDimitry Andric   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3287480093f4SDimitry Andric     .addUse(RHSExt.getReg(0))
3288480093f4SDimitry Andric     .setMIFlags(Flags);
3289480093f4SDimitry Andric 
3290480093f4SDimitry Andric   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3291480093f4SDimitry Andric   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3292480093f4SDimitry Andric 
3293480093f4SDimitry Andric   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3294480093f4SDimitry Andric     .addUse(RDst.getReg(0))
3295480093f4SDimitry Andric     .addUse(RHS)
3296480093f4SDimitry Andric     .addUse(LHS)
3297480093f4SDimitry Andric     .setMIFlags(Flags);
3298480093f4SDimitry Andric 
3299480093f4SDimitry Andric   MI.eraseFromParent();
3300480093f4SDimitry Andric   return true;
3301480093f4SDimitry Andric }
3302480093f4SDimitry Andric 
3303480093f4SDimitry Andric // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
3304480093f4SDimitry Andric // to enable denorm mode. When 'Enable' is false, disable denorm mode.
3305480093f4SDimitry Andric static void toggleSPDenormMode(bool Enable,
3306480093f4SDimitry Andric                                MachineIRBuilder &B,
3307480093f4SDimitry Andric                                const GCNSubtarget &ST,
3308480093f4SDimitry Andric                                AMDGPU::SIModeRegisterDefaults Mode) {
3309480093f4SDimitry Andric   // Set SP denorm mode to this value.
3310480093f4SDimitry Andric   unsigned SPDenormMode =
33115ffd83dbSDimitry Andric     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
3312480093f4SDimitry Andric 
3313480093f4SDimitry Andric   if (ST.hasDenormModeInst()) {
3314480093f4SDimitry Andric     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
33155ffd83dbSDimitry Andric     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
3316480093f4SDimitry Andric 
33175ffd83dbSDimitry Andric     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
3318480093f4SDimitry Andric     B.buildInstr(AMDGPU::S_DENORM_MODE)
3319480093f4SDimitry Andric       .addImm(NewDenormModeValue);
3320480093f4SDimitry Andric 
3321480093f4SDimitry Andric   } else {
3322480093f4SDimitry Andric     // Select FP32 bit field in mode register.
3323480093f4SDimitry Andric     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
3324480093f4SDimitry Andric                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
3325480093f4SDimitry Andric                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
3326480093f4SDimitry Andric 
3327480093f4SDimitry Andric     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
3328480093f4SDimitry Andric       .addImm(SPDenormMode)
3329480093f4SDimitry Andric       .addImm(SPDenormModeBitField);
3330480093f4SDimitry Andric   }
3331480093f4SDimitry Andric }
3332480093f4SDimitry Andric 
bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  // Precise f32 division using the div_scale/div_fmas/div_fixup sequence,
  // with FMA refinement performed with FP32 denormals temporarily enabled so
  // intermediate results are not flushed.
  if (legalizeFastUnsafeFDIV(MI, MRI, B))
    return true;

  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();

  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S32, 1.0f);

  // div_scale's last immediate selects which operand's scaled form is
  // produced: 0 -> denominator, 1 -> numerator. The S1 result of the
  // numerator form feeds div_fmas below.
  auto DenominatorScaled =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
      .addUse(LHS)
      .addUse(RHS)
      .addImm(0)
      .setMIFlags(Flags);
  auto NumeratorScaled =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
      .addUse(LHS)
      .addUse(RHS)
      .addImm(1)
      .setMIFlags(Flags);

  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(DenominatorScaled.getReg(0))
    .setMIFlags(Flags);
  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);

  // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
  // aren't modeled as reading it.
  //
  // Only toggle if the function's default mode flushes FP32 denormals; the
  // matching disable below must bracket exactly the FMA chain.
  if (!Mode.allFP32Denormals())
    toggleSPDenormMode(true, B, ST, Mode);

  // Refine the reciprocal and form the quotient estimate plus residual.
  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

  if (!Mode.allFP32Denormals())
    toggleSPDenormMode(false, B, ST, Mode);

  // div_fmas applies the final correction, predicated on the scale condition
  // produced by the numerator div_scale.
  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
    .addUse(Fma4.getReg(0))
    .addUse(Fma1.getReg(0))
    .addUse(Fma3.getReg(0))
    .addUse(NumeratorScaled.getReg(1))
    .setMIFlags(Flags);

  // div_fixup handles the special cases using the original operands.
  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
    .addUse(Fmas.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}
3401480093f4SDimitry Andric 
bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  // Precise f64 division using the div_scale/div_fmas/div_fixup sequence,
  // including a workaround for SI's unusable div_scale condition output.
  if (legalizeFastUnsafeFDIV64(MI, MRI, B))
    return true;

  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S64 = LLT::scalar(64);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S64, 1.0);

  // Scaled denominator (last immediate 0 selects the denominator form).
  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
    .addUse(LHS)
    .addUse(RHS)
    .addImm(0)
    .setMIFlags(Flags);

  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);

  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
    .addUse(DivScale0.getReg(0))
    .setMIFlags(Flags);

  // FMA-based refinement of the reciprocal approximation.
  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);

  // Scaled numerator (last immediate 1 selects the numerator form).
  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
    .addUse(LHS)
    .addUse(RHS)
    .addImm(1)
    .setMIFlags(Flags);

  // Quotient estimate and its residual.
  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);

  // The scale condition that div_fmas consumes below.
  Register Scale;
  if (!ST.hasUsableDivScaleConditionOutput()) {
    // Workaround a hardware bug on SI where the condition output from div_scale
    // is not usable.
    //
    // Recompute it manually: compare the high halves of the inputs against
    // the high halves of the scaled values to detect which operand was
    // rescaled, and combine the two tests with xor.
    LLT S32 = LLT::scalar(32);

    auto NumUnmerge = B.buildUnmerge(S32, LHS);
    auto DenUnmerge = B.buildUnmerge(S32, RHS);
    auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
    auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);

    auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
                              Scale1Unmerge.getReg(1));
    auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
                              Scale0Unmerge.getReg(1));
    Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
  } else {
    Scale = DivScale1.getReg(1);
  }

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
    .addUse(Fma4.getReg(0))
    .addUse(Fma3.getReg(0))
    .addUse(Mul.getReg(0))
    .addUse(Scale)
    .setMIFlags(Flags);

  // div_fixup patches up the special cases using the original operands.
  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
    .addUse(Fmas.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}
3482480093f4SDimitry Andric 
34838bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
34848bcb0991SDimitry Andric                                                  MachineRegisterInfo &MRI,
34858bcb0991SDimitry Andric                                                  MachineIRBuilder &B) const {
34868bcb0991SDimitry Andric   Register Res = MI.getOperand(0).getReg();
34878bcb0991SDimitry Andric   Register LHS = MI.getOperand(2).getReg();
34888bcb0991SDimitry Andric   Register RHS = MI.getOperand(3).getReg();
34898bcb0991SDimitry Andric   uint16_t Flags = MI.getFlags();
34908bcb0991SDimitry Andric 
34918bcb0991SDimitry Andric   LLT S32 = LLT::scalar(32);
34928bcb0991SDimitry Andric   LLT S1 = LLT::scalar(1);
34938bcb0991SDimitry Andric 
34948bcb0991SDimitry Andric   auto Abs = B.buildFAbs(S32, RHS, Flags);
34958bcb0991SDimitry Andric   const APFloat C0Val(1.0f);
34968bcb0991SDimitry Andric 
34978bcb0991SDimitry Andric   auto C0 = B.buildConstant(S32, 0x6f800000);
34988bcb0991SDimitry Andric   auto C1 = B.buildConstant(S32, 0x2f800000);
34998bcb0991SDimitry Andric   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
35008bcb0991SDimitry Andric 
35018bcb0991SDimitry Andric   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
35028bcb0991SDimitry Andric   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
35038bcb0991SDimitry Andric 
35048bcb0991SDimitry Andric   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
35058bcb0991SDimitry Andric 
35068bcb0991SDimitry Andric   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
35078bcb0991SDimitry Andric     .addUse(Mul0.getReg(0))
35088bcb0991SDimitry Andric     .setMIFlags(Flags);
35098bcb0991SDimitry Andric 
35108bcb0991SDimitry Andric   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
35118bcb0991SDimitry Andric 
35128bcb0991SDimitry Andric   B.buildFMul(Res, Sel, Mul1, Flags);
35138bcb0991SDimitry Andric 
35148bcb0991SDimitry Andric   MI.eraseFromParent();
35158bcb0991SDimitry Andric   return true;
35168bcb0991SDimitry Andric }
35178bcb0991SDimitry Andric 
3518e8d8bef9SDimitry Andric // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3519e8d8bef9SDimitry Andric // FIXME: Why do we handle this one but not other removed instructions?
3520e8d8bef9SDimitry Andric //
3521e8d8bef9SDimitry Andric // Reciprocal square root.  The clamp prevents infinite results, clamping
3522e8d8bef9SDimitry Andric // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3523e8d8bef9SDimitry Andric // +-max_float.
3524e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3525e8d8bef9SDimitry Andric                                                     MachineRegisterInfo &MRI,
3526e8d8bef9SDimitry Andric                                                     MachineIRBuilder &B) const {
3527e8d8bef9SDimitry Andric   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3528e8d8bef9SDimitry Andric     return true;
3529e8d8bef9SDimitry Andric 
3530e8d8bef9SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
3531e8d8bef9SDimitry Andric   Register Src = MI.getOperand(2).getReg();
3532e8d8bef9SDimitry Andric   auto Flags = MI.getFlags();
3533e8d8bef9SDimitry Andric 
3534e8d8bef9SDimitry Andric   LLT Ty = MRI.getType(Dst);
3535e8d8bef9SDimitry Andric 
3536e8d8bef9SDimitry Andric   const fltSemantics *FltSemantics;
3537e8d8bef9SDimitry Andric   if (Ty == LLT::scalar(32))
3538e8d8bef9SDimitry Andric     FltSemantics = &APFloat::IEEEsingle();
3539e8d8bef9SDimitry Andric   else if (Ty == LLT::scalar(64))
3540e8d8bef9SDimitry Andric     FltSemantics = &APFloat::IEEEdouble();
3541e8d8bef9SDimitry Andric   else
3542e8d8bef9SDimitry Andric     return false;
3543e8d8bef9SDimitry Andric 
3544e8d8bef9SDimitry Andric   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3545e8d8bef9SDimitry Andric     .addUse(Src)
3546e8d8bef9SDimitry Andric     .setMIFlags(Flags);
3547e8d8bef9SDimitry Andric 
3548e8d8bef9SDimitry Andric   // We don't need to concern ourselves with the snan handling difference, since
3549e8d8bef9SDimitry Andric   // the rsq quieted (or not) so use the one which will directly select.
3550e8d8bef9SDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3551e8d8bef9SDimitry Andric   const bool UseIEEE = MFI->getMode().IEEE;
3552e8d8bef9SDimitry Andric 
3553e8d8bef9SDimitry Andric   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3554e8d8bef9SDimitry Andric   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3555e8d8bef9SDimitry Andric                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3556e8d8bef9SDimitry Andric 
3557e8d8bef9SDimitry Andric   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3558e8d8bef9SDimitry Andric 
3559e8d8bef9SDimitry Andric   if (UseIEEE)
3560e8d8bef9SDimitry Andric     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3561e8d8bef9SDimitry Andric   else
3562e8d8bef9SDimitry Andric     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3563e8d8bef9SDimitry Andric   MI.eraseFromParent();
3564e8d8bef9SDimitry Andric   return true;
3565e8d8bef9SDimitry Andric }
3566e8d8bef9SDimitry Andric 
3567e8d8bef9SDimitry Andric static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3568e8d8bef9SDimitry Andric   switch (IID) {
3569e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_ds_fadd:
3570e8d8bef9SDimitry Andric     return AMDGPU::G_ATOMICRMW_FADD;
3571e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_ds_fmin:
3572e8d8bef9SDimitry Andric     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3573e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_ds_fmax:
3574e8d8bef9SDimitry Andric     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3575e8d8bef9SDimitry Andric   default:
3576e8d8bef9SDimitry Andric     llvm_unreachable("not a DS FP intrinsic");
3577e8d8bef9SDimitry Andric   }
3578e8d8bef9SDimitry Andric }
3579e8d8bef9SDimitry Andric 
3580e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
3581e8d8bef9SDimitry Andric                                                       MachineInstr &MI,
3582e8d8bef9SDimitry Andric                                                       Intrinsic::ID IID) const {
3583e8d8bef9SDimitry Andric   GISelChangeObserver &Observer = Helper.Observer;
3584e8d8bef9SDimitry Andric   Observer.changingInstr(MI);
3585e8d8bef9SDimitry Andric 
3586e8d8bef9SDimitry Andric   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
3587e8d8bef9SDimitry Andric 
3588e8d8bef9SDimitry Andric   // The remaining operands were used to set fields in the MemOperand on
3589e8d8bef9SDimitry Andric   // construction.
3590e8d8bef9SDimitry Andric   for (int I = 6; I > 3; --I)
3591e8d8bef9SDimitry Andric     MI.RemoveOperand(I);
3592e8d8bef9SDimitry Andric 
3593e8d8bef9SDimitry Andric   MI.RemoveOperand(1); // Remove the intrinsic ID.
3594e8d8bef9SDimitry Andric   Observer.changedInstr(MI);
3595e8d8bef9SDimitry Andric   return true;
3596e8d8bef9SDimitry Andric }
3597e8d8bef9SDimitry Andric 
3598e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3599e8d8bef9SDimitry Andric                                             MachineRegisterInfo &MRI,
3600e8d8bef9SDimitry Andric                                             MachineIRBuilder &B) const {
3601e8d8bef9SDimitry Andric   uint64_t Offset =
3602e8d8bef9SDimitry Andric     ST.getTargetLowering()->getImplicitParameterOffset(
3603e8d8bef9SDimitry Andric       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3604e8d8bef9SDimitry Andric   LLT DstTy = MRI.getType(DstReg);
3605e8d8bef9SDimitry Andric   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3606e8d8bef9SDimitry Andric 
3607e8d8bef9SDimitry Andric   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3608e8d8bef9SDimitry Andric   if (!loadInputValue(KernargPtrReg, B,
3609e8d8bef9SDimitry Andric                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3610e8d8bef9SDimitry Andric     return false;
3611e8d8bef9SDimitry Andric 
3612e8d8bef9SDimitry Andric   // FIXME: This should be nuw
3613e8d8bef9SDimitry Andric   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3614e8d8bef9SDimitry Andric   return true;
3615e8d8bef9SDimitry Andric }
3616e8d8bef9SDimitry Andric 
36170b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
36180b57cec5SDimitry Andric                                                  MachineRegisterInfo &MRI,
36190b57cec5SDimitry Andric                                                  MachineIRBuilder &B) const {
36200b57cec5SDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
36210b57cec5SDimitry Andric   if (!MFI->isEntryFunction()) {
36220b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
36230b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
36240b57cec5SDimitry Andric   }
36250b57cec5SDimitry Andric 
36260b57cec5SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
3627e8d8bef9SDimitry Andric   if (!getImplicitArgPtr(DstReg, MRI, B))
36280b57cec5SDimitry Andric     return false;
36290b57cec5SDimitry Andric 
36300b57cec5SDimitry Andric   MI.eraseFromParent();
36310b57cec5SDimitry Andric   return true;
36320b57cec5SDimitry Andric }
36330b57cec5SDimitry Andric 
36348bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
36358bcb0991SDimitry Andric                                               MachineRegisterInfo &MRI,
36368bcb0991SDimitry Andric                                               MachineIRBuilder &B,
36378bcb0991SDimitry Andric                                               unsigned AddrSpace) const {
36388bcb0991SDimitry Andric   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3639e8d8bef9SDimitry Andric   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3640e8d8bef9SDimitry Andric   Register Hi32 = Unmerge.getReg(1);
3641e8d8bef9SDimitry Andric 
36428bcb0991SDimitry Andric   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
36438bcb0991SDimitry Andric   MI.eraseFromParent();
36448bcb0991SDimitry Andric   return true;
36458bcb0991SDimitry Andric }
36468bcb0991SDimitry Andric 
36475ffd83dbSDimitry Andric // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
36485ffd83dbSDimitry Andric // offset (the offset that is included in bounds checking and swizzling, to be
36495ffd83dbSDimitry Andric // split between the instruction's voffset and immoffset fields) and soffset
36505ffd83dbSDimitry Andric // (the offset that is excluded from bounds checking and swizzling, to go in
36515ffd83dbSDimitry Andric // the instruction's soffset field).  This function takes the first kind of
36525ffd83dbSDimitry Andric // offset and figures out how to split it between voffset and immoffset.
3653*fe6060f1SDimitry Andric std::pair<Register, unsigned>
36545ffd83dbSDimitry Andric AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
36555ffd83dbSDimitry Andric                                         Register OrigOffset) const {
36565ffd83dbSDimitry Andric   const unsigned MaxImm = 4095;
36575ffd83dbSDimitry Andric   Register BaseReg;
3658*fe6060f1SDimitry Andric   unsigned ImmOffset;
36595ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
3660*fe6060f1SDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
36615ffd83dbSDimitry Andric 
3662*fe6060f1SDimitry Andric   std::tie(BaseReg, ImmOffset) =
3663*fe6060f1SDimitry Andric       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
36645ffd83dbSDimitry Andric 
3665*fe6060f1SDimitry Andric   // If BaseReg is a pointer, convert it to int.
3666*fe6060f1SDimitry Andric   if (MRI.getType(BaseReg).isPointer())
3667*fe6060f1SDimitry Andric     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
36685ffd83dbSDimitry Andric 
36695ffd83dbSDimitry Andric   // If the immediate value is too big for the immoffset field, put the value
36705ffd83dbSDimitry Andric   // and -4096 into the immoffset field so that the value that is copied/added
36715ffd83dbSDimitry Andric   // for the voffset field is a multiple of 4096, and it stands more chance
36725ffd83dbSDimitry Andric   // of being CSEd with the copy/add for another similar load/store.
36735ffd83dbSDimitry Andric   // However, do not do that rounding down to a multiple of 4096 if that is a
36745ffd83dbSDimitry Andric   // negative number, as it appears to be illegal to have a negative offset
36755ffd83dbSDimitry Andric   // in the vgpr, even if adding the immediate offset makes it positive.
36765ffd83dbSDimitry Andric   unsigned Overflow = ImmOffset & ~MaxImm;
36775ffd83dbSDimitry Andric   ImmOffset -= Overflow;
36785ffd83dbSDimitry Andric   if ((int32_t)Overflow < 0) {
36795ffd83dbSDimitry Andric     Overflow += ImmOffset;
36805ffd83dbSDimitry Andric     ImmOffset = 0;
36815ffd83dbSDimitry Andric   }
36825ffd83dbSDimitry Andric 
36835ffd83dbSDimitry Andric   if (Overflow != 0) {
36845ffd83dbSDimitry Andric     if (!BaseReg) {
36855ffd83dbSDimitry Andric       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
36865ffd83dbSDimitry Andric     } else {
36875ffd83dbSDimitry Andric       auto OverflowVal = B.buildConstant(S32, Overflow);
36885ffd83dbSDimitry Andric       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
36895ffd83dbSDimitry Andric     }
36905ffd83dbSDimitry Andric   }
36915ffd83dbSDimitry Andric 
36925ffd83dbSDimitry Andric   if (!BaseReg)
36935ffd83dbSDimitry Andric     BaseReg = B.buildConstant(S32, 0).getReg(0);
36945ffd83dbSDimitry Andric 
3695*fe6060f1SDimitry Andric   return std::make_pair(BaseReg, ImmOffset);
3696*fe6060f1SDimitry Andric }
3697*fe6060f1SDimitry Andric 
3698*fe6060f1SDimitry Andric /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
3699*fe6060f1SDimitry Andric void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
3700*fe6060f1SDimitry Andric                                           Register VOffset, Register SOffset,
3701*fe6060f1SDimitry Andric                                           unsigned ImmOffset, Register VIndex,
3702*fe6060f1SDimitry Andric                                           MachineRegisterInfo &MRI) const {
3703*fe6060f1SDimitry Andric   Optional<ValueAndVReg> MaybeVOffsetVal =
3704*fe6060f1SDimitry Andric       getConstantVRegValWithLookThrough(VOffset, MRI);
3705*fe6060f1SDimitry Andric   Optional<ValueAndVReg> MaybeSOffsetVal =
3706*fe6060f1SDimitry Andric       getConstantVRegValWithLookThrough(SOffset, MRI);
3707*fe6060f1SDimitry Andric   Optional<ValueAndVReg> MaybeVIndexVal =
3708*fe6060f1SDimitry Andric       getConstantVRegValWithLookThrough(VIndex, MRI);
3709*fe6060f1SDimitry Andric   // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
3710*fe6060f1SDimitry Andric   // update the MMO with that offset. The stride is unknown so we can only do
3711*fe6060f1SDimitry Andric   // this if VIndex is constant 0.
3712*fe6060f1SDimitry Andric   if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
3713*fe6060f1SDimitry Andric       MaybeVIndexVal->Value == 0) {
3714*fe6060f1SDimitry Andric     uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
3715*fe6060f1SDimitry Andric                            MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
3716*fe6060f1SDimitry Andric     MMO->setOffset(TotalOffset);
3717*fe6060f1SDimitry Andric   } else {
3718*fe6060f1SDimitry Andric     // We don't have a constant combined offset to use in the MMO. Give up.
3719*fe6060f1SDimitry Andric     MMO->setValue((Value *)nullptr);
3720*fe6060f1SDimitry Andric   }
37215ffd83dbSDimitry Andric }
37225ffd83dbSDimitry Andric 
37238bcb0991SDimitry Andric /// Handle register layout difference for f16 images for some subtargets.
37248bcb0991SDimitry Andric Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
37258bcb0991SDimitry Andric                                              MachineRegisterInfo &MRI,
3726e8d8bef9SDimitry Andric                                              Register Reg,
3727e8d8bef9SDimitry Andric                                              bool ImageStore) const {
37288bcb0991SDimitry Andric   const LLT S16 = LLT::scalar(16);
37298bcb0991SDimitry Andric   const LLT S32 = LLT::scalar(32);
37308bcb0991SDimitry Andric   LLT StoreVT = MRI.getType(Reg);
37318bcb0991SDimitry Andric   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
37328bcb0991SDimitry Andric 
3733e8d8bef9SDimitry Andric   if (ST.hasUnpackedD16VMem()) {
37348bcb0991SDimitry Andric     auto Unmerge = B.buildUnmerge(S16, Reg);
37358bcb0991SDimitry Andric 
37368bcb0991SDimitry Andric     SmallVector<Register, 4> WideRegs;
37378bcb0991SDimitry Andric     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
37388bcb0991SDimitry Andric       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
37398bcb0991SDimitry Andric 
37408bcb0991SDimitry Andric     int NumElts = StoreVT.getNumElements();
37418bcb0991SDimitry Andric 
3742*fe6060f1SDimitry Andric     return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
3743*fe6060f1SDimitry Andric         .getReg(0);
37448bcb0991SDimitry Andric   }
37458bcb0991SDimitry Andric 
3746e8d8bef9SDimitry Andric   if (ImageStore && ST.hasImageStoreD16Bug()) {
3747e8d8bef9SDimitry Andric     if (StoreVT.getNumElements() == 2) {
3748e8d8bef9SDimitry Andric       SmallVector<Register, 4> PackedRegs;
3749e8d8bef9SDimitry Andric       Reg = B.buildBitcast(S32, Reg).getReg(0);
3750e8d8bef9SDimitry Andric       PackedRegs.push_back(Reg);
3751e8d8bef9SDimitry Andric       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3752*fe6060f1SDimitry Andric       return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
3753*fe6060f1SDimitry Andric           .getReg(0);
3754e8d8bef9SDimitry Andric     }
3755e8d8bef9SDimitry Andric 
3756e8d8bef9SDimitry Andric     if (StoreVT.getNumElements() == 3) {
3757e8d8bef9SDimitry Andric       SmallVector<Register, 4> PackedRegs;
3758e8d8bef9SDimitry Andric       auto Unmerge = B.buildUnmerge(S16, Reg);
3759e8d8bef9SDimitry Andric       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3760e8d8bef9SDimitry Andric         PackedRegs.push_back(Unmerge.getReg(I));
3761e8d8bef9SDimitry Andric       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3762*fe6060f1SDimitry Andric       Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
3763*fe6060f1SDimitry Andric       return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
3764e8d8bef9SDimitry Andric     }
3765e8d8bef9SDimitry Andric 
3766e8d8bef9SDimitry Andric     if (StoreVT.getNumElements() == 4) {
3767e8d8bef9SDimitry Andric       SmallVector<Register, 4> PackedRegs;
3768*fe6060f1SDimitry Andric       Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
3769e8d8bef9SDimitry Andric       auto Unmerge = B.buildUnmerge(S32, Reg);
3770e8d8bef9SDimitry Andric       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3771e8d8bef9SDimitry Andric         PackedRegs.push_back(Unmerge.getReg(I));
3772e8d8bef9SDimitry Andric       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3773*fe6060f1SDimitry Andric       return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
3774*fe6060f1SDimitry Andric           .getReg(0);
3775e8d8bef9SDimitry Andric     }
3776e8d8bef9SDimitry Andric 
3777e8d8bef9SDimitry Andric     llvm_unreachable("invalid data type");
3778e8d8bef9SDimitry Andric   }
3779e8d8bef9SDimitry Andric 
3780e8d8bef9SDimitry Andric   return Reg;
3781e8d8bef9SDimitry Andric }
3782e8d8bef9SDimitry Andric 
37835ffd83dbSDimitry Andric Register AMDGPULegalizerInfo::fixStoreSourceType(
37845ffd83dbSDimitry Andric   MachineIRBuilder &B, Register VData, bool IsFormat) const {
37855ffd83dbSDimitry Andric   MachineRegisterInfo *MRI = B.getMRI();
37865ffd83dbSDimitry Andric   LLT Ty = MRI->getType(VData);
37878bcb0991SDimitry Andric 
37888bcb0991SDimitry Andric   const LLT S16 = LLT::scalar(16);
37898bcb0991SDimitry Andric 
37908bcb0991SDimitry Andric   // Fixup illegal register types for i8 stores.
37918bcb0991SDimitry Andric   if (Ty == LLT::scalar(8) || Ty == S16) {
37928bcb0991SDimitry Andric     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
37935ffd83dbSDimitry Andric     return AnyExt;
37948bcb0991SDimitry Andric   }
37958bcb0991SDimitry Andric 
37968bcb0991SDimitry Andric   if (Ty.isVector()) {
37978bcb0991SDimitry Andric     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
37988bcb0991SDimitry Andric       if (IsFormat)
37995ffd83dbSDimitry Andric         return handleD16VData(B, *MRI, VData);
38005ffd83dbSDimitry Andric     }
38015ffd83dbSDimitry Andric   }
38025ffd83dbSDimitry Andric 
38035ffd83dbSDimitry Andric   return VData;
38045ffd83dbSDimitry Andric }
38055ffd83dbSDimitry Andric 
38065ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
38075ffd83dbSDimitry Andric                                               MachineRegisterInfo &MRI,
38085ffd83dbSDimitry Andric                                               MachineIRBuilder &B,
38095ffd83dbSDimitry Andric                                               bool IsTyped,
38105ffd83dbSDimitry Andric                                               bool IsFormat) const {
38115ffd83dbSDimitry Andric   Register VData = MI.getOperand(1).getReg();
38125ffd83dbSDimitry Andric   LLT Ty = MRI.getType(VData);
38135ffd83dbSDimitry Andric   LLT EltTy = Ty.getScalarType();
38145ffd83dbSDimitry Andric   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
38155ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
38165ffd83dbSDimitry Andric 
38175ffd83dbSDimitry Andric   VData = fixStoreSourceType(B, VData, IsFormat);
38185ffd83dbSDimitry Andric   Register RSrc = MI.getOperand(2).getReg();
38195ffd83dbSDimitry Andric 
38205ffd83dbSDimitry Andric   MachineMemOperand *MMO = *MI.memoperands_begin();
38215ffd83dbSDimitry Andric   const int MemSize = MMO->getSize();
38225ffd83dbSDimitry Andric 
38235ffd83dbSDimitry Andric   unsigned ImmOffset;
38245ffd83dbSDimitry Andric 
38255ffd83dbSDimitry Andric   // The typed intrinsics add an immediate after the registers.
38265ffd83dbSDimitry Andric   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
38275ffd83dbSDimitry Andric 
38285ffd83dbSDimitry Andric   // The struct intrinsic variants add one additional operand over raw.
38295ffd83dbSDimitry Andric   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
38305ffd83dbSDimitry Andric   Register VIndex;
38315ffd83dbSDimitry Andric   int OpOffset = 0;
38325ffd83dbSDimitry Andric   if (HasVIndex) {
38335ffd83dbSDimitry Andric     VIndex = MI.getOperand(3).getReg();
38345ffd83dbSDimitry Andric     OpOffset = 1;
3835*fe6060f1SDimitry Andric   } else {
3836*fe6060f1SDimitry Andric     VIndex = B.buildConstant(S32, 0).getReg(0);
38375ffd83dbSDimitry Andric   }
38385ffd83dbSDimitry Andric 
38395ffd83dbSDimitry Andric   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
38405ffd83dbSDimitry Andric   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
38415ffd83dbSDimitry Andric 
38425ffd83dbSDimitry Andric   unsigned Format = 0;
38435ffd83dbSDimitry Andric   if (IsTyped) {
38445ffd83dbSDimitry Andric     Format = MI.getOperand(5 + OpOffset).getImm();
38455ffd83dbSDimitry Andric     ++OpOffset;
38465ffd83dbSDimitry Andric   }
38475ffd83dbSDimitry Andric 
38485ffd83dbSDimitry Andric   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
38495ffd83dbSDimitry Andric 
3850*fe6060f1SDimitry Andric   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
3851*fe6060f1SDimitry Andric   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
38525ffd83dbSDimitry Andric 
38535ffd83dbSDimitry Andric   unsigned Opc;
38545ffd83dbSDimitry Andric   if (IsTyped) {
38555ffd83dbSDimitry Andric     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
38565ffd83dbSDimitry Andric                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
38575ffd83dbSDimitry Andric   } else if (IsFormat) {
38585ffd83dbSDimitry Andric     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
38595ffd83dbSDimitry Andric                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
38605ffd83dbSDimitry Andric   } else {
38615ffd83dbSDimitry Andric     switch (MemSize) {
38625ffd83dbSDimitry Andric     case 1:
38635ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
38645ffd83dbSDimitry Andric       break;
38655ffd83dbSDimitry Andric     case 2:
38665ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
38675ffd83dbSDimitry Andric       break;
38685ffd83dbSDimitry Andric     default:
38695ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
38705ffd83dbSDimitry Andric       break;
38715ffd83dbSDimitry Andric     }
38725ffd83dbSDimitry Andric   }
38735ffd83dbSDimitry Andric 
38745ffd83dbSDimitry Andric   auto MIB = B.buildInstr(Opc)
38755ffd83dbSDimitry Andric     .addUse(VData)              // vdata
38765ffd83dbSDimitry Andric     .addUse(RSrc)               // rsrc
38775ffd83dbSDimitry Andric     .addUse(VIndex)             // vindex
38785ffd83dbSDimitry Andric     .addUse(VOffset)            // voffset
38795ffd83dbSDimitry Andric     .addUse(SOffset)            // soffset
38805ffd83dbSDimitry Andric     .addImm(ImmOffset);         // offset(imm)
38815ffd83dbSDimitry Andric 
38825ffd83dbSDimitry Andric   if (IsTyped)
38835ffd83dbSDimitry Andric     MIB.addImm(Format);
38845ffd83dbSDimitry Andric 
38855ffd83dbSDimitry Andric   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
38865ffd83dbSDimitry Andric      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
38875ffd83dbSDimitry Andric      .addMemOperand(MMO);
38885ffd83dbSDimitry Andric 
38895ffd83dbSDimitry Andric   MI.eraseFromParent();
38908bcb0991SDimitry Andric   return true;
38918bcb0991SDimitry Andric }
38928bcb0991SDimitry Andric 
38935ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
38945ffd83dbSDimitry Andric                                              MachineRegisterInfo &MRI,
38955ffd83dbSDimitry Andric                                              MachineIRBuilder &B,
38965ffd83dbSDimitry Andric                                              bool IsFormat,
38975ffd83dbSDimitry Andric                                              bool IsTyped) const {
38985ffd83dbSDimitry Andric   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
38995ffd83dbSDimitry Andric   MachineMemOperand *MMO = *MI.memoperands_begin();
3900*fe6060f1SDimitry Andric   const LLT MemTy = MMO->getMemoryType();
39015ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
39025ffd83dbSDimitry Andric 
39035ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
39045ffd83dbSDimitry Andric   Register RSrc = MI.getOperand(2).getReg();
39055ffd83dbSDimitry Andric 
39065ffd83dbSDimitry Andric   // The typed intrinsics add an immediate after the registers.
39075ffd83dbSDimitry Andric   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
39085ffd83dbSDimitry Andric 
39095ffd83dbSDimitry Andric   // The struct intrinsic variants add one additional operand over raw.
39105ffd83dbSDimitry Andric   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
39115ffd83dbSDimitry Andric   Register VIndex;
39125ffd83dbSDimitry Andric   int OpOffset = 0;
39135ffd83dbSDimitry Andric   if (HasVIndex) {
39145ffd83dbSDimitry Andric     VIndex = MI.getOperand(3).getReg();
39155ffd83dbSDimitry Andric     OpOffset = 1;
3916*fe6060f1SDimitry Andric   } else {
3917*fe6060f1SDimitry Andric     VIndex = B.buildConstant(S32, 0).getReg(0);
39188bcb0991SDimitry Andric   }
39198bcb0991SDimitry Andric 
39205ffd83dbSDimitry Andric   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
39215ffd83dbSDimitry Andric   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
39225ffd83dbSDimitry Andric 
39235ffd83dbSDimitry Andric   unsigned Format = 0;
39245ffd83dbSDimitry Andric   if (IsTyped) {
39255ffd83dbSDimitry Andric     Format = MI.getOperand(5 + OpOffset).getImm();
39265ffd83dbSDimitry Andric     ++OpOffset;
39278bcb0991SDimitry Andric   }
39288bcb0991SDimitry Andric 
39295ffd83dbSDimitry Andric   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
39305ffd83dbSDimitry Andric   unsigned ImmOffset;
39315ffd83dbSDimitry Andric 
39325ffd83dbSDimitry Andric   LLT Ty = MRI.getType(Dst);
39335ffd83dbSDimitry Andric   LLT EltTy = Ty.getScalarType();
39345ffd83dbSDimitry Andric   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
39355ffd83dbSDimitry Andric   const bool Unpacked = ST.hasUnpackedD16VMem();
39365ffd83dbSDimitry Andric 
3937*fe6060f1SDimitry Andric   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
3938*fe6060f1SDimitry Andric   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
39395ffd83dbSDimitry Andric 
39405ffd83dbSDimitry Andric   unsigned Opc;
39415ffd83dbSDimitry Andric 
39425ffd83dbSDimitry Andric   if (IsTyped) {
39435ffd83dbSDimitry Andric     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
39445ffd83dbSDimitry Andric                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
39455ffd83dbSDimitry Andric   } else if (IsFormat) {
39465ffd83dbSDimitry Andric     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
39475ffd83dbSDimitry Andric                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
39485ffd83dbSDimitry Andric   } else {
3949*fe6060f1SDimitry Andric     switch (MemTy.getSizeInBits()) {
3950*fe6060f1SDimitry Andric     case 8:
39515ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
39525ffd83dbSDimitry Andric       break;
3953*fe6060f1SDimitry Andric     case 16:
39545ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
39555ffd83dbSDimitry Andric       break;
39565ffd83dbSDimitry Andric     default:
39575ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
39585ffd83dbSDimitry Andric       break;
39595ffd83dbSDimitry Andric     }
39605ffd83dbSDimitry Andric   }
39615ffd83dbSDimitry Andric 
39625ffd83dbSDimitry Andric   Register LoadDstReg;
39635ffd83dbSDimitry Andric 
3964*fe6060f1SDimitry Andric   bool IsExtLoad =
3965*fe6060f1SDimitry Andric       (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
39665ffd83dbSDimitry Andric   LLT UnpackedTy = Ty.changeElementSize(32);
39675ffd83dbSDimitry Andric 
39685ffd83dbSDimitry Andric   if (IsExtLoad)
39695ffd83dbSDimitry Andric     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
39705ffd83dbSDimitry Andric   else if (Unpacked && IsD16 && Ty.isVector())
39715ffd83dbSDimitry Andric     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
39725ffd83dbSDimitry Andric   else
39735ffd83dbSDimitry Andric     LoadDstReg = Dst;
39745ffd83dbSDimitry Andric 
39755ffd83dbSDimitry Andric   auto MIB = B.buildInstr(Opc)
39765ffd83dbSDimitry Andric     .addDef(LoadDstReg)         // vdata
39775ffd83dbSDimitry Andric     .addUse(RSrc)               // rsrc
39785ffd83dbSDimitry Andric     .addUse(VIndex)             // vindex
39795ffd83dbSDimitry Andric     .addUse(VOffset)            // voffset
39805ffd83dbSDimitry Andric     .addUse(SOffset)            // soffset
39815ffd83dbSDimitry Andric     .addImm(ImmOffset);         // offset(imm)
39825ffd83dbSDimitry Andric 
39835ffd83dbSDimitry Andric   if (IsTyped)
39845ffd83dbSDimitry Andric     MIB.addImm(Format);
39855ffd83dbSDimitry Andric 
39865ffd83dbSDimitry Andric   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
39875ffd83dbSDimitry Andric      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
39885ffd83dbSDimitry Andric      .addMemOperand(MMO);
39895ffd83dbSDimitry Andric 
39905ffd83dbSDimitry Andric   if (LoadDstReg != Dst) {
39915ffd83dbSDimitry Andric     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
39925ffd83dbSDimitry Andric 
39935ffd83dbSDimitry Andric     // Widen result for extending loads was widened.
39945ffd83dbSDimitry Andric     if (IsExtLoad)
39955ffd83dbSDimitry Andric       B.buildTrunc(Dst, LoadDstReg);
39965ffd83dbSDimitry Andric     else {
39975ffd83dbSDimitry Andric       // Repack to original 16-bit vector result
39985ffd83dbSDimitry Andric       // FIXME: G_TRUNC should work, but legalization currently fails
39995ffd83dbSDimitry Andric       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
40005ffd83dbSDimitry Andric       SmallVector<Register, 4> Repack;
40015ffd83dbSDimitry Andric       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
40025ffd83dbSDimitry Andric         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
40035ffd83dbSDimitry Andric       B.buildMerge(Dst, Repack);
40045ffd83dbSDimitry Andric     }
40055ffd83dbSDimitry Andric   }
40065ffd83dbSDimitry Andric 
40075ffd83dbSDimitry Andric   MI.eraseFromParent();
40085ffd83dbSDimitry Andric   return true;
40095ffd83dbSDimitry Andric }
40105ffd83dbSDimitry Andric 
40115ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
40125ffd83dbSDimitry Andric                                                MachineIRBuilder &B,
40135ffd83dbSDimitry Andric                                                bool IsInc) const {
40145ffd83dbSDimitry Andric   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
40155ffd83dbSDimitry Andric                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
40165ffd83dbSDimitry Andric   B.buildInstr(Opc)
40175ffd83dbSDimitry Andric     .addDef(MI.getOperand(0).getReg())
40185ffd83dbSDimitry Andric     .addUse(MI.getOperand(2).getReg())
40195ffd83dbSDimitry Andric     .addUse(MI.getOperand(3).getReg())
40205ffd83dbSDimitry Andric     .cloneMemRefs(MI);
40215ffd83dbSDimitry Andric   MI.eraseFromParent();
40225ffd83dbSDimitry Andric   return true;
40235ffd83dbSDimitry Andric }
40245ffd83dbSDimitry Andric 
40255ffd83dbSDimitry Andric static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
40265ffd83dbSDimitry Andric   switch (IntrID) {
40275ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
40285ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
40295ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
40305ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_add:
40315ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_add:
40325ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
40335ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
40345ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
40355ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
40365ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
40375ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
40385ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
40395ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
40405ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
40415ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
40425ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
40435ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
40445ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
40455ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
40465ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
40475ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
40485ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_and:
40495ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_and:
40505ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
40515ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_or:
40525ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_or:
40535ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
40545ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
40555ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
40565ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
40575ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
40585ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
40595ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
40605ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
40615ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
40625ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
40635ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
40645ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
40655ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
4066*fe6060f1SDimitry Andric   case Intrinsic::amdgcn_buffer_atomic_fadd:
4067e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
4068e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
4069e8d8bef9SDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
4070*fe6060f1SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
4071*fe6060f1SDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
4072*fe6060f1SDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
4073*fe6060f1SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
4074*fe6060f1SDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
4075*fe6060f1SDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
40765ffd83dbSDimitry Andric   default:
40775ffd83dbSDimitry Andric     llvm_unreachable("unhandled atomic opcode");
40785ffd83dbSDimitry Andric   }
40795ffd83dbSDimitry Andric }
40805ffd83dbSDimitry Andric 
40815ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
40825ffd83dbSDimitry Andric                                                MachineIRBuilder &B,
40835ffd83dbSDimitry Andric                                                Intrinsic::ID IID) const {
40845ffd83dbSDimitry Andric   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
40855ffd83dbSDimitry Andric                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
4086e8d8bef9SDimitry Andric   const bool HasReturn = MI.getNumExplicitDefs() != 0;
40875ffd83dbSDimitry Andric 
4088e8d8bef9SDimitry Andric   Register Dst;
40895ffd83dbSDimitry Andric 
40905ffd83dbSDimitry Andric   int OpOffset = 0;
4091e8d8bef9SDimitry Andric   if (HasReturn) {
4092e8d8bef9SDimitry Andric     // A few FP atomics do not support return values.
4093e8d8bef9SDimitry Andric     Dst = MI.getOperand(0).getReg();
4094e8d8bef9SDimitry Andric   } else {
4095e8d8bef9SDimitry Andric     OpOffset = -1;
4096e8d8bef9SDimitry Andric   }
4097e8d8bef9SDimitry Andric 
4098e8d8bef9SDimitry Andric   Register VData = MI.getOperand(2 + OpOffset).getReg();
4099e8d8bef9SDimitry Andric   Register CmpVal;
41005ffd83dbSDimitry Andric 
41015ffd83dbSDimitry Andric   if (IsCmpSwap) {
41025ffd83dbSDimitry Andric     CmpVal = MI.getOperand(3 + OpOffset).getReg();
41035ffd83dbSDimitry Andric     ++OpOffset;
41045ffd83dbSDimitry Andric   }
41055ffd83dbSDimitry Andric 
41065ffd83dbSDimitry Andric   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
4107e8d8bef9SDimitry Andric   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
41085ffd83dbSDimitry Andric 
41095ffd83dbSDimitry Andric   // The struct intrinsic variants add one additional operand over raw.
41105ffd83dbSDimitry Andric   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
41115ffd83dbSDimitry Andric   Register VIndex;
41125ffd83dbSDimitry Andric   if (HasVIndex) {
41135ffd83dbSDimitry Andric     VIndex = MI.getOperand(4 + OpOffset).getReg();
41145ffd83dbSDimitry Andric     ++OpOffset;
4115*fe6060f1SDimitry Andric   } else {
4116*fe6060f1SDimitry Andric     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
41175ffd83dbSDimitry Andric   }
41185ffd83dbSDimitry Andric 
41195ffd83dbSDimitry Andric   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
41205ffd83dbSDimitry Andric   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
41215ffd83dbSDimitry Andric   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
41225ffd83dbSDimitry Andric 
41235ffd83dbSDimitry Andric   MachineMemOperand *MMO = *MI.memoperands_begin();
41245ffd83dbSDimitry Andric 
41255ffd83dbSDimitry Andric   unsigned ImmOffset;
4126*fe6060f1SDimitry Andric   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
4127*fe6060f1SDimitry Andric   updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());
41285ffd83dbSDimitry Andric 
4129e8d8bef9SDimitry Andric   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
4130e8d8bef9SDimitry Andric 
4131e8d8bef9SDimitry Andric   if (HasReturn)
4132e8d8bef9SDimitry Andric     MIB.addDef(Dst);
4133e8d8bef9SDimitry Andric 
4134e8d8bef9SDimitry Andric   MIB.addUse(VData); // vdata
41355ffd83dbSDimitry Andric 
41365ffd83dbSDimitry Andric   if (IsCmpSwap)
41375ffd83dbSDimitry Andric     MIB.addReg(CmpVal);
41385ffd83dbSDimitry Andric 
41395ffd83dbSDimitry Andric   MIB.addUse(RSrc)               // rsrc
41405ffd83dbSDimitry Andric      .addUse(VIndex)             // vindex
41415ffd83dbSDimitry Andric      .addUse(VOffset)            // voffset
41425ffd83dbSDimitry Andric      .addUse(SOffset)            // soffset
41435ffd83dbSDimitry Andric      .addImm(ImmOffset)          // offset(imm)
41445ffd83dbSDimitry Andric      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
41455ffd83dbSDimitry Andric      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
41465ffd83dbSDimitry Andric      .addMemOperand(MMO);
41475ffd83dbSDimitry Andric 
41485ffd83dbSDimitry Andric   MI.eraseFromParent();
41495ffd83dbSDimitry Andric   return true;
41505ffd83dbSDimitry Andric }
41515ffd83dbSDimitry Andric 
4152*fe6060f1SDimitry Andric /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
41535ffd83dbSDimitry Andric /// vector with s16 typed elements.
4154*fe6060f1SDimitry Andric static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
4155*fe6060f1SDimitry Andric                                       SmallVectorImpl<Register> &PackedAddrs,
4156*fe6060f1SDimitry Andric                                       unsigned ArgOffset,
4157*fe6060f1SDimitry Andric                                       const AMDGPU::ImageDimIntrinsicInfo *Intr,
4158*fe6060f1SDimitry Andric                                       bool IsA16, bool IsG16) {
41595ffd83dbSDimitry Andric   const LLT S16 = LLT::scalar(16);
4160*fe6060f1SDimitry Andric   const LLT V2S16 = LLT::fixed_vector(2, 16);
4161*fe6060f1SDimitry Andric   auto EndIdx = Intr->VAddrEnd;
41625ffd83dbSDimitry Andric 
4163e8d8bef9SDimitry Andric   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
4164e8d8bef9SDimitry Andric     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
41655ffd83dbSDimitry Andric     if (!SrcOp.isReg())
41665ffd83dbSDimitry Andric       continue; // _L to _LZ may have eliminated this.
41675ffd83dbSDimitry Andric 
41685ffd83dbSDimitry Andric     Register AddrReg = SrcOp.getReg();
41695ffd83dbSDimitry Andric 
4170*fe6060f1SDimitry Andric     if ((I < Intr->GradientStart) ||
4171*fe6060f1SDimitry Andric         (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
4172*fe6060f1SDimitry Andric         (I >= Intr->CoordStart && !IsA16)) {
4173*fe6060f1SDimitry Andric       // Handle any gradient or coordinate operands that should not be packed
41745ffd83dbSDimitry Andric       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
41755ffd83dbSDimitry Andric       PackedAddrs.push_back(AddrReg);
41765ffd83dbSDimitry Andric     } else {
41775ffd83dbSDimitry Andric       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
41785ffd83dbSDimitry Andric       // derivatives dx/dh and dx/dv are packed with undef.
41795ffd83dbSDimitry Andric       if (((I + 1) >= EndIdx) ||
4180e8d8bef9SDimitry Andric           ((Intr->NumGradients / 2) % 2 == 1 &&
4181e8d8bef9SDimitry Andric            (I == static_cast<unsigned>(Intr->GradientStart +
4182e8d8bef9SDimitry Andric                                        (Intr->NumGradients / 2) - 1) ||
4183e8d8bef9SDimitry Andric             I == static_cast<unsigned>(Intr->GradientStart +
4184e8d8bef9SDimitry Andric                                        Intr->NumGradients - 1))) ||
41855ffd83dbSDimitry Andric           // Check for _L to _LZ optimization
4186e8d8bef9SDimitry Andric           !MI.getOperand(ArgOffset + I + 1).isReg()) {
41875ffd83dbSDimitry Andric         PackedAddrs.push_back(
41885ffd83dbSDimitry Andric             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
41895ffd83dbSDimitry Andric                 .getReg(0));
41905ffd83dbSDimitry Andric       } else {
41915ffd83dbSDimitry Andric         PackedAddrs.push_back(
4192e8d8bef9SDimitry Andric             B.buildBuildVector(
4193e8d8bef9SDimitry Andric                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
41945ffd83dbSDimitry Andric                 .getReg(0));
41955ffd83dbSDimitry Andric         ++I;
41965ffd83dbSDimitry Andric       }
41975ffd83dbSDimitry Andric     }
41985ffd83dbSDimitry Andric   }
41995ffd83dbSDimitry Andric }
42005ffd83dbSDimitry Andric 
42015ffd83dbSDimitry Andric /// Convert from separate vaddr components to a single vector address register,
42025ffd83dbSDimitry Andric /// and replace the remaining operands with $noreg.
42035ffd83dbSDimitry Andric static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
42045ffd83dbSDimitry Andric                                      int DimIdx, int NumVAddrs) {
42055ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
42065ffd83dbSDimitry Andric 
42075ffd83dbSDimitry Andric   SmallVector<Register, 8> AddrRegs;
42085ffd83dbSDimitry Andric   for (int I = 0; I != NumVAddrs; ++I) {
42095ffd83dbSDimitry Andric     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
42105ffd83dbSDimitry Andric     if (SrcOp.isReg()) {
42115ffd83dbSDimitry Andric       AddrRegs.push_back(SrcOp.getReg());
42125ffd83dbSDimitry Andric       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
42135ffd83dbSDimitry Andric     }
42145ffd83dbSDimitry Andric   }
42155ffd83dbSDimitry Andric 
42165ffd83dbSDimitry Andric   int NumAddrRegs = AddrRegs.size();
42175ffd83dbSDimitry Andric   if (NumAddrRegs != 1) {
4218*fe6060f1SDimitry Andric     // Above 8 elements round up to next power of 2 (i.e. 16).
4219*fe6060f1SDimitry Andric     if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
42205ffd83dbSDimitry Andric       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
42215ffd83dbSDimitry Andric       auto Undef = B.buildUndef(S32);
42225ffd83dbSDimitry Andric       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
42235ffd83dbSDimitry Andric       NumAddrRegs = RoundedNumRegs;
42245ffd83dbSDimitry Andric     }
42255ffd83dbSDimitry Andric 
4226*fe6060f1SDimitry Andric     auto VAddr =
4227*fe6060f1SDimitry Andric         B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
42285ffd83dbSDimitry Andric     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
42295ffd83dbSDimitry Andric   }
42305ffd83dbSDimitry Andric 
42315ffd83dbSDimitry Andric   for (int I = 1; I != NumVAddrs; ++I) {
42325ffd83dbSDimitry Andric     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
42335ffd83dbSDimitry Andric     if (SrcOp.isReg())
42345ffd83dbSDimitry Andric       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
42355ffd83dbSDimitry Andric   }
42365ffd83dbSDimitry Andric }
42375ffd83dbSDimitry Andric 
42385ffd83dbSDimitry Andric /// Rewrite image intrinsics to use register layouts expected by the subtarget.
42395ffd83dbSDimitry Andric ///
42405ffd83dbSDimitry Andric /// Depending on the subtarget, load/store with 16-bit element data need to be
42415ffd83dbSDimitry Andric /// rewritten to use the low half of 32-bit registers, or directly use a packed
42425ffd83dbSDimitry Andric /// layout. 16-bit addresses should also sometimes be packed into 32-bit
42435ffd83dbSDimitry Andric /// registers.
42445ffd83dbSDimitry Andric ///
42455ffd83dbSDimitry Andric /// We don't want to directly select image instructions just yet, but also want
42465ffd83dbSDimitry Andric /// to exposes all register repacking to the legalizer/combiners. We also don't
42475ffd83dbSDimitry Andric /// want a selected instrution entering RegBankSelect. In order to avoid
42485ffd83dbSDimitry Andric /// defining a multitude of intermediate image instructions, directly hack on
42495ffd83dbSDimitry Andric /// the intrinsic's arguments. In cases like a16 addreses, this requires padding
42505ffd83dbSDimitry Andric /// now unnecessary arguments with $noreg.
42515ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
4252e8d8bef9SDimitry Andric     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
4253e8d8bef9SDimitry Andric     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
42545ffd83dbSDimitry Andric 
4255e8d8bef9SDimitry Andric   const unsigned NumDefs = MI.getNumExplicitDefs();
4256e8d8bef9SDimitry Andric   const unsigned ArgOffset = NumDefs + 1;
42575ffd83dbSDimitry Andric   bool IsTFE = NumDefs == 2;
42585ffd83dbSDimitry Andric   // We are only processing the operands of d16 image operations on subtargets
42595ffd83dbSDimitry Andric   // that use the unpacked register layout, or need to repack the TFE result.
42605ffd83dbSDimitry Andric 
42615ffd83dbSDimitry Andric   // TODO: Do we need to guard against already legalized intrinsics?
42625ffd83dbSDimitry Andric   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4263e8d8bef9SDimitry Andric       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
42645ffd83dbSDimitry Andric 
42655ffd83dbSDimitry Andric   MachineRegisterInfo *MRI = B.getMRI();
42665ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
42675ffd83dbSDimitry Andric   const LLT S16 = LLT::scalar(16);
4268*fe6060f1SDimitry Andric   const LLT V2S16 = LLT::fixed_vector(2, 16);
42695ffd83dbSDimitry Andric 
42705ffd83dbSDimitry Andric   unsigned DMask = 0;
42715ffd83dbSDimitry Andric 
42725ffd83dbSDimitry Andric   // Check for 16 bit addresses and pack if true.
4273e8d8bef9SDimitry Andric   LLT GradTy =
4274e8d8bef9SDimitry Andric       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
4275e8d8bef9SDimitry Andric   LLT AddrTy =
4276e8d8bef9SDimitry Andric       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
42775ffd83dbSDimitry Andric   const bool IsG16 = GradTy == S16;
42785ffd83dbSDimitry Andric   const bool IsA16 = AddrTy == S16;
42795ffd83dbSDimitry Andric 
42805ffd83dbSDimitry Andric   int DMaskLanes = 0;
42815ffd83dbSDimitry Andric   if (!BaseOpcode->Atomic) {
4282e8d8bef9SDimitry Andric     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
42835ffd83dbSDimitry Andric     if (BaseOpcode->Gather4) {
42845ffd83dbSDimitry Andric       DMaskLanes = 4;
42855ffd83dbSDimitry Andric     } else if (DMask != 0) {
42865ffd83dbSDimitry Andric       DMaskLanes = countPopulation(DMask);
42875ffd83dbSDimitry Andric     } else if (!IsTFE && !BaseOpcode->Store) {
42885ffd83dbSDimitry Andric       // If dmask is 0, this is a no-op load. This can be eliminated.
42895ffd83dbSDimitry Andric       B.buildUndef(MI.getOperand(0));
42905ffd83dbSDimitry Andric       MI.eraseFromParent();
42915ffd83dbSDimitry Andric       return true;
42925ffd83dbSDimitry Andric     }
42935ffd83dbSDimitry Andric   }
42945ffd83dbSDimitry Andric 
42955ffd83dbSDimitry Andric   Observer.changingInstr(MI);
42965ffd83dbSDimitry Andric   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
42975ffd83dbSDimitry Andric 
42985ffd83dbSDimitry Andric   unsigned NewOpcode = NumDefs == 0 ?
42995ffd83dbSDimitry Andric     AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
43005ffd83dbSDimitry Andric 
43015ffd83dbSDimitry Andric   // Track that we legalized this
43025ffd83dbSDimitry Andric   MI.setDesc(B.getTII().get(NewOpcode));
43035ffd83dbSDimitry Andric 
43045ffd83dbSDimitry Andric   // Expecting to get an error flag since TFC is on - and dmask is 0 Force
43055ffd83dbSDimitry Andric   // dmask to be at least 1 otherwise the instruction will fail
43065ffd83dbSDimitry Andric   if (IsTFE && DMask == 0) {
43075ffd83dbSDimitry Andric     DMask = 0x1;
43085ffd83dbSDimitry Andric     DMaskLanes = 1;
4309e8d8bef9SDimitry Andric     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
43105ffd83dbSDimitry Andric   }
43115ffd83dbSDimitry Andric 
43125ffd83dbSDimitry Andric   if (BaseOpcode->Atomic) {
43135ffd83dbSDimitry Andric     Register VData0 = MI.getOperand(2).getReg();
43145ffd83dbSDimitry Andric     LLT Ty = MRI->getType(VData0);
43155ffd83dbSDimitry Andric 
43165ffd83dbSDimitry Andric     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
43175ffd83dbSDimitry Andric     if (Ty.isVector())
43185ffd83dbSDimitry Andric       return false;
43195ffd83dbSDimitry Andric 
43205ffd83dbSDimitry Andric     if (BaseOpcode->AtomicX2) {
43215ffd83dbSDimitry Andric       Register VData1 = MI.getOperand(3).getReg();
43225ffd83dbSDimitry Andric       // The two values are packed in one register.
4323*fe6060f1SDimitry Andric       LLT PackedTy = LLT::fixed_vector(2, Ty);
43245ffd83dbSDimitry Andric       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
43255ffd83dbSDimitry Andric       MI.getOperand(2).setReg(Concat.getReg(0));
43265ffd83dbSDimitry Andric       MI.getOperand(3).setReg(AMDGPU::NoRegister);
43275ffd83dbSDimitry Andric     }
43285ffd83dbSDimitry Andric   }
43295ffd83dbSDimitry Andric 
4330e8d8bef9SDimitry Andric   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
43315ffd83dbSDimitry Andric 
43325ffd83dbSDimitry Andric   // Optimize _L to _LZ when _L is zero
43335ffd83dbSDimitry Andric   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4334e8d8bef9SDimitry Andric           AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
43355ffd83dbSDimitry Andric     const ConstantFP *ConstantLod;
43365ffd83dbSDimitry Andric 
4337e8d8bef9SDimitry Andric     if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
4338e8d8bef9SDimitry Andric                  m_GFCst(ConstantLod))) {
43395ffd83dbSDimitry Andric       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
43405ffd83dbSDimitry Andric         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
4341e8d8bef9SDimitry Andric         const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
4342e8d8bef9SDimitry Andric             AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ,
4343e8d8bef9SDimitry Andric                                                       Intr->Dim);
43445ffd83dbSDimitry Andric 
43455ffd83dbSDimitry Andric         // The starting indexes should remain in the same place.
43465ffd83dbSDimitry Andric         --CorrectedNumVAddrs;
43475ffd83dbSDimitry Andric 
4348e8d8bef9SDimitry Andric         MI.getOperand(MI.getNumExplicitDefs())
4349e8d8bef9SDimitry Andric             .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
4350e8d8bef9SDimitry Andric         MI.RemoveOperand(ArgOffset + Intr->LodIndex);
4351e8d8bef9SDimitry Andric         Intr = NewImageDimIntr;
43525ffd83dbSDimitry Andric       }
43535ffd83dbSDimitry Andric     }
43545ffd83dbSDimitry Andric   }
43555ffd83dbSDimitry Andric 
43565ffd83dbSDimitry Andric   // Optimize _mip away, when 'lod' is zero
4357e8d8bef9SDimitry Andric   if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
43585ffd83dbSDimitry Andric     int64_t ConstantLod;
4359e8d8bef9SDimitry Andric     if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
4360e8d8bef9SDimitry Andric                  m_ICst(ConstantLod))) {
43615ffd83dbSDimitry Andric       if (ConstantLod == 0) {
43625ffd83dbSDimitry Andric         // TODO: Change intrinsic opcode and remove operand instead or replacing
43635ffd83dbSDimitry Andric         // it with 0, as the _L to _LZ handling is done above.
4364e8d8bef9SDimitry Andric         MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
43655ffd83dbSDimitry Andric         --CorrectedNumVAddrs;
43665ffd83dbSDimitry Andric       }
43675ffd83dbSDimitry Andric     }
43685ffd83dbSDimitry Andric   }
43695ffd83dbSDimitry Andric 
43705ffd83dbSDimitry Andric   // Rewrite the addressing register layout before doing anything else.
4371*fe6060f1SDimitry Andric   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
4372*fe6060f1SDimitry Andric     // 16 bit gradients are supported, but are tied to the A16 control
4373*fe6060f1SDimitry Andric     // so both gradients and addresses must be 16 bit
43745ffd83dbSDimitry Andric     return false;
4375*fe6060f1SDimitry Andric   }
43765ffd83dbSDimitry Andric 
4377*fe6060f1SDimitry Andric   if (IsA16 && !ST.hasA16()) {
4378*fe6060f1SDimitry Andric     // A16 not supported
4379*fe6060f1SDimitry Andric     return false;
4380*fe6060f1SDimitry Andric   }
4381*fe6060f1SDimitry Andric 
4382*fe6060f1SDimitry Andric   if (IsA16 || IsG16) {
4383e8d8bef9SDimitry Andric     if (Intr->NumVAddrs > 1) {
43845ffd83dbSDimitry Andric       SmallVector<Register, 4> PackedRegs;
43855ffd83dbSDimitry Andric 
4386*fe6060f1SDimitry Andric       packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
4387*fe6060f1SDimitry Andric                                 IsG16);
43885ffd83dbSDimitry Andric 
43895ffd83dbSDimitry Andric       // See also below in the non-a16 branch
4390*fe6060f1SDimitry Andric       const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
4391*fe6060f1SDimitry Andric                           PackedRegs.size() <= ST.getNSAMaxSize();
43925ffd83dbSDimitry Andric 
43935ffd83dbSDimitry Andric       if (!UseNSA && PackedRegs.size() > 1) {
4394*fe6060f1SDimitry Andric         LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
43955ffd83dbSDimitry Andric         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
43965ffd83dbSDimitry Andric         PackedRegs[0] = Concat.getReg(0);
43975ffd83dbSDimitry Andric         PackedRegs.resize(1);
43985ffd83dbSDimitry Andric       }
43995ffd83dbSDimitry Andric 
4400e8d8bef9SDimitry Andric       const unsigned NumPacked = PackedRegs.size();
4401e8d8bef9SDimitry Andric       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
4402e8d8bef9SDimitry Andric         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
44035ffd83dbSDimitry Andric         if (!SrcOp.isReg()) {
44045ffd83dbSDimitry Andric           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
44055ffd83dbSDimitry Andric           continue;
44065ffd83dbSDimitry Andric         }
44075ffd83dbSDimitry Andric 
44085ffd83dbSDimitry Andric         assert(SrcOp.getReg() != AMDGPU::NoRegister);
44095ffd83dbSDimitry Andric 
4410e8d8bef9SDimitry Andric         if (I - Intr->VAddrStart < NumPacked)
4411e8d8bef9SDimitry Andric           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
44125ffd83dbSDimitry Andric         else
44135ffd83dbSDimitry Andric           SrcOp.setReg(AMDGPU::NoRegister);
44145ffd83dbSDimitry Andric       }
44155ffd83dbSDimitry Andric     }
44165ffd83dbSDimitry Andric   } else {
44175ffd83dbSDimitry Andric     // If the register allocator cannot place the address registers contiguously
44185ffd83dbSDimitry Andric     // without introducing moves, then using the non-sequential address encoding
44195ffd83dbSDimitry Andric     // is always preferable, since it saves VALU instructions and is usually a
44205ffd83dbSDimitry Andric     // wash in terms of code size or even better.
44215ffd83dbSDimitry Andric     //
44225ffd83dbSDimitry Andric     // However, we currently have no way of hinting to the register allocator
44235ffd83dbSDimitry Andric     // that MIMG addresses should be placed contiguously when it is possible to
44245ffd83dbSDimitry Andric     // do so, so force non-NSA for the common 2-address case as a heuristic.
44255ffd83dbSDimitry Andric     //
44265ffd83dbSDimitry Andric     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
44275ffd83dbSDimitry Andric     // allocation when possible.
4428*fe6060f1SDimitry Andric     const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
4429*fe6060f1SDimitry Andric                         CorrectedNumVAddrs <= ST.getNSAMaxSize();
44305ffd83dbSDimitry Andric 
4431e8d8bef9SDimitry Andric     if (!UseNSA && Intr->NumVAddrs > 1)
4432e8d8bef9SDimitry Andric       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
4433e8d8bef9SDimitry Andric                                Intr->NumVAddrs);
44345ffd83dbSDimitry Andric   }
44355ffd83dbSDimitry Andric 
44365ffd83dbSDimitry Andric   int Flags = 0;
44375ffd83dbSDimitry Andric   if (IsA16)
44385ffd83dbSDimitry Andric     Flags |= 1;
44395ffd83dbSDimitry Andric   if (IsG16)
44405ffd83dbSDimitry Andric     Flags |= 2;
44415ffd83dbSDimitry Andric   MI.addOperand(MachineOperand::CreateImm(Flags));
44425ffd83dbSDimitry Andric 
44435ffd83dbSDimitry Andric   if (BaseOpcode->Store) { // No TFE for stores?
44445ffd83dbSDimitry Andric     // TODO: Handle dmask trim
44455ffd83dbSDimitry Andric     Register VData = MI.getOperand(1).getReg();
44465ffd83dbSDimitry Andric     LLT Ty = MRI->getType(VData);
44475ffd83dbSDimitry Andric     if (!Ty.isVector() || Ty.getElementType() != S16)
44485ffd83dbSDimitry Andric       return true;
44495ffd83dbSDimitry Andric 
4450e8d8bef9SDimitry Andric     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
44515ffd83dbSDimitry Andric     if (RepackedReg != VData) {
44525ffd83dbSDimitry Andric       MI.getOperand(1).setReg(RepackedReg);
44535ffd83dbSDimitry Andric     }
44545ffd83dbSDimitry Andric 
44555ffd83dbSDimitry Andric     return true;
44565ffd83dbSDimitry Andric   }
44575ffd83dbSDimitry Andric 
44585ffd83dbSDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
44595ffd83dbSDimitry Andric   LLT Ty = MRI->getType(DstReg);
44605ffd83dbSDimitry Andric   const LLT EltTy = Ty.getScalarType();
44615ffd83dbSDimitry Andric   const bool IsD16 = Ty.getScalarType() == S16;
44625ffd83dbSDimitry Andric   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
44635ffd83dbSDimitry Andric 
44645ffd83dbSDimitry Andric   // Confirm that the return type is large enough for the dmask specified
44655ffd83dbSDimitry Andric   if (NumElts < DMaskLanes)
44665ffd83dbSDimitry Andric     return false;
44675ffd83dbSDimitry Andric 
44685ffd83dbSDimitry Andric   if (NumElts > 4 || DMaskLanes > 4)
44695ffd83dbSDimitry Andric     return false;
44705ffd83dbSDimitry Andric 
44715ffd83dbSDimitry Andric   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
4472*fe6060f1SDimitry Andric   const LLT AdjustedTy =
4473*fe6060f1SDimitry Andric       Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
44745ffd83dbSDimitry Andric 
44755ffd83dbSDimitry Andric   // The raw dword aligned data component of the load. The only legal cases
44765ffd83dbSDimitry Andric   // where this matters should be when using the packed D16 format, for
44775ffd83dbSDimitry Andric   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
44785ffd83dbSDimitry Andric   LLT RoundedTy;
44795ffd83dbSDimitry Andric 
44805ffd83dbSDimitry Andric   // S32 vector to to cover all data, plus TFE result element.
44815ffd83dbSDimitry Andric   LLT TFETy;
44825ffd83dbSDimitry Andric 
44835ffd83dbSDimitry Andric   // Register type to use for each loaded component. Will be S32 or V2S16.
44845ffd83dbSDimitry Andric   LLT RegTy;
44855ffd83dbSDimitry Andric 
44865ffd83dbSDimitry Andric   if (IsD16 && ST.hasUnpackedD16VMem()) {
4487*fe6060f1SDimitry Andric     RoundedTy =
4488*fe6060f1SDimitry Andric         LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
4489*fe6060f1SDimitry Andric     TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
44905ffd83dbSDimitry Andric     RegTy = S32;
44915ffd83dbSDimitry Andric   } else {
44925ffd83dbSDimitry Andric     unsigned EltSize = EltTy.getSizeInBits();
44935ffd83dbSDimitry Andric     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
44945ffd83dbSDimitry Andric     unsigned RoundedSize = 32 * RoundedElts;
4495*fe6060f1SDimitry Andric     RoundedTy = LLT::scalarOrVector(
4496*fe6060f1SDimitry Andric         ElementCount::getFixed(RoundedSize / EltSize), EltSize);
4497*fe6060f1SDimitry Andric     TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
44985ffd83dbSDimitry Andric     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
44995ffd83dbSDimitry Andric   }
45005ffd83dbSDimitry Andric 
45015ffd83dbSDimitry Andric   // The return type does not need adjustment.
45025ffd83dbSDimitry Andric   // TODO: Should we change s16 case to s32 or <2 x s16>?
45035ffd83dbSDimitry Andric   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
45045ffd83dbSDimitry Andric     return true;
45055ffd83dbSDimitry Andric 
45065ffd83dbSDimitry Andric   Register Dst1Reg;
45075ffd83dbSDimitry Andric 
45085ffd83dbSDimitry Andric   // Insert after the instruction.
45095ffd83dbSDimitry Andric   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
45105ffd83dbSDimitry Andric 
45115ffd83dbSDimitry Andric   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
45125ffd83dbSDimitry Andric   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
45135ffd83dbSDimitry Andric   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
45145ffd83dbSDimitry Andric   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
45155ffd83dbSDimitry Andric 
45165ffd83dbSDimitry Andric   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
45175ffd83dbSDimitry Andric 
45185ffd83dbSDimitry Andric   MI.getOperand(0).setReg(NewResultReg);
45195ffd83dbSDimitry Andric 
45205ffd83dbSDimitry Andric   // In the IR, TFE is supposed to be used with a 2 element struct return
45215ffd83dbSDimitry Andric   // type. The intruction really returns these two values in one contiguous
45225ffd83dbSDimitry Andric   // register, with one additional dword beyond the loaded data. Rewrite the
45235ffd83dbSDimitry Andric   // return type to use a single register result.
45245ffd83dbSDimitry Andric 
45255ffd83dbSDimitry Andric   if (IsTFE) {
45265ffd83dbSDimitry Andric     Dst1Reg = MI.getOperand(1).getReg();
45275ffd83dbSDimitry Andric     if (MRI->getType(Dst1Reg) != S32)
45285ffd83dbSDimitry Andric       return false;
45295ffd83dbSDimitry Andric 
45305ffd83dbSDimitry Andric     // TODO: Make sure the TFE operand bit is set.
45315ffd83dbSDimitry Andric     MI.RemoveOperand(1);
45325ffd83dbSDimitry Andric 
45335ffd83dbSDimitry Andric     // Handle the easy case that requires no repack instructions.
45345ffd83dbSDimitry Andric     if (Ty == S32) {
45355ffd83dbSDimitry Andric       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
45365ffd83dbSDimitry Andric       return true;
45375ffd83dbSDimitry Andric     }
45385ffd83dbSDimitry Andric   }
45395ffd83dbSDimitry Andric 
45405ffd83dbSDimitry Andric   // Now figure out how to copy the new result register back into the old
45415ffd83dbSDimitry Andric   // result.
45425ffd83dbSDimitry Andric   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
45435ffd83dbSDimitry Andric 
45445ffd83dbSDimitry Andric   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
45455ffd83dbSDimitry Andric 
45465ffd83dbSDimitry Andric   if (ResultNumRegs == 1) {
45475ffd83dbSDimitry Andric     assert(!IsTFE);
45485ffd83dbSDimitry Andric     ResultRegs[0] = NewResultReg;
45495ffd83dbSDimitry Andric   } else {
45505ffd83dbSDimitry Andric     // We have to repack into a new vector of some kind.
45515ffd83dbSDimitry Andric     for (int I = 0; I != NumDataRegs; ++I)
45525ffd83dbSDimitry Andric       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
45535ffd83dbSDimitry Andric     B.buildUnmerge(ResultRegs, NewResultReg);
45545ffd83dbSDimitry Andric 
45555ffd83dbSDimitry Andric     // Drop the final TFE element to get the data part. The TFE result is
45565ffd83dbSDimitry Andric     // directly written to the right place already.
45575ffd83dbSDimitry Andric     if (IsTFE)
45585ffd83dbSDimitry Andric       ResultRegs.resize(NumDataRegs);
45595ffd83dbSDimitry Andric   }
45605ffd83dbSDimitry Andric 
45615ffd83dbSDimitry Andric   // For an s16 scalar result, we form an s32 result with a truncate regardless
45625ffd83dbSDimitry Andric   // of packed vs. unpacked.
45635ffd83dbSDimitry Andric   if (IsD16 && !Ty.isVector()) {
45645ffd83dbSDimitry Andric     B.buildTrunc(DstReg, ResultRegs[0]);
45655ffd83dbSDimitry Andric     return true;
45665ffd83dbSDimitry Andric   }
45675ffd83dbSDimitry Andric 
45685ffd83dbSDimitry Andric   // Avoid a build/concat_vector of 1 entry.
45695ffd83dbSDimitry Andric   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
45705ffd83dbSDimitry Andric     B.buildBitcast(DstReg, ResultRegs[0]);
45715ffd83dbSDimitry Andric     return true;
45725ffd83dbSDimitry Andric   }
45735ffd83dbSDimitry Andric 
45745ffd83dbSDimitry Andric   assert(Ty.isVector());
45755ffd83dbSDimitry Andric 
45765ffd83dbSDimitry Andric   if (IsD16) {
45775ffd83dbSDimitry Andric     // For packed D16 results with TFE enabled, all the data components are
45785ffd83dbSDimitry Andric     // S32. Cast back to the expected type.
45795ffd83dbSDimitry Andric     //
45805ffd83dbSDimitry Andric     // TODO: We don't really need to use load s32 elements. We would only need one
45815ffd83dbSDimitry Andric     // cast for the TFE result if a multiple of v2s16 was used.
45825ffd83dbSDimitry Andric     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
45835ffd83dbSDimitry Andric       for (Register &Reg : ResultRegs)
45845ffd83dbSDimitry Andric         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
45855ffd83dbSDimitry Andric     } else if (ST.hasUnpackedD16VMem()) {
45865ffd83dbSDimitry Andric       for (Register &Reg : ResultRegs)
45875ffd83dbSDimitry Andric         Reg = B.buildTrunc(S16, Reg).getReg(0);
45885ffd83dbSDimitry Andric     }
45895ffd83dbSDimitry Andric   }
45905ffd83dbSDimitry Andric 
45915ffd83dbSDimitry Andric   auto padWithUndef = [&](LLT Ty, int NumElts) {
45925ffd83dbSDimitry Andric     if (NumElts == 0)
45935ffd83dbSDimitry Andric       return;
45945ffd83dbSDimitry Andric     Register Undef = B.buildUndef(Ty).getReg(0);
45955ffd83dbSDimitry Andric     for (int I = 0; I != NumElts; ++I)
45965ffd83dbSDimitry Andric       ResultRegs.push_back(Undef);
45975ffd83dbSDimitry Andric   };
45985ffd83dbSDimitry Andric 
45995ffd83dbSDimitry Andric   // Pad out any elements eliminated due to the dmask.
46005ffd83dbSDimitry Andric   LLT ResTy = MRI->getType(ResultRegs[0]);
46015ffd83dbSDimitry Andric   if (!ResTy.isVector()) {
46025ffd83dbSDimitry Andric     padWithUndef(ResTy, NumElts - ResultRegs.size());
46035ffd83dbSDimitry Andric     B.buildBuildVector(DstReg, ResultRegs);
46045ffd83dbSDimitry Andric     return true;
46055ffd83dbSDimitry Andric   }
46065ffd83dbSDimitry Andric 
46075ffd83dbSDimitry Andric   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
46085ffd83dbSDimitry Andric   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
46095ffd83dbSDimitry Andric 
46105ffd83dbSDimitry Andric   // Deal with the one annoying legal case.
4611*fe6060f1SDimitry Andric   const LLT V3S16 = LLT::fixed_vector(3, 16);
46125ffd83dbSDimitry Andric   if (Ty == V3S16) {
46135ffd83dbSDimitry Andric     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4614*fe6060f1SDimitry Andric     auto Concat = B.buildConcatVectors(LLT::fixed_vector(6, 16), ResultRegs);
46155ffd83dbSDimitry Andric     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
46165ffd83dbSDimitry Andric     return true;
46175ffd83dbSDimitry Andric   }
46185ffd83dbSDimitry Andric 
46195ffd83dbSDimitry Andric   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
46205ffd83dbSDimitry Andric   B.buildConcatVectors(DstReg, ResultRegs);
46215ffd83dbSDimitry Andric   return true;
46225ffd83dbSDimitry Andric }
46235ffd83dbSDimitry Andric 
46245ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4625e8d8bef9SDimitry Andric   LegalizerHelper &Helper, MachineInstr &MI) const {
4626e8d8bef9SDimitry Andric   MachineIRBuilder &B = Helper.MIRBuilder;
4627e8d8bef9SDimitry Andric   GISelChangeObserver &Observer = Helper.Observer;
4628e8d8bef9SDimitry Andric 
46295ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
46305ffd83dbSDimitry Andric   LLT Ty = B.getMRI()->getType(Dst);
46315ffd83dbSDimitry Andric   unsigned Size = Ty.getSizeInBits();
46325ffd83dbSDimitry Andric   MachineFunction &MF = B.getMF();
46335ffd83dbSDimitry Andric 
46345ffd83dbSDimitry Andric   Observer.changingInstr(MI);
46355ffd83dbSDimitry Andric 
4636*fe6060f1SDimitry Andric   if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
4637e8d8bef9SDimitry Andric     Ty = getBitcastRegisterType(Ty);
4638e8d8bef9SDimitry Andric     Helper.bitcastDst(MI, Ty, 0);
4639e8d8bef9SDimitry Andric     Dst = MI.getOperand(0).getReg();
4640e8d8bef9SDimitry Andric     B.setInsertPt(B.getMBB(), MI);
4641e8d8bef9SDimitry Andric   }
4642e8d8bef9SDimitry Andric 
46435ffd83dbSDimitry Andric   // FIXME: We don't really need this intermediate instruction. The intrinsic
46445ffd83dbSDimitry Andric   // should be fixed to have a memory operand. Since it's readnone, we're not
46455ffd83dbSDimitry Andric   // allowed to add one.
46465ffd83dbSDimitry Andric   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
46475ffd83dbSDimitry Andric   MI.RemoveOperand(1); // Remove intrinsic ID
46485ffd83dbSDimitry Andric 
46495ffd83dbSDimitry Andric   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
46505ffd83dbSDimitry Andric   // TODO: Should this use datalayout alignment?
46515ffd83dbSDimitry Andric   const unsigned MemSize = (Size + 7) / 8;
46525ffd83dbSDimitry Andric   const Align MemAlign(4);
46535ffd83dbSDimitry Andric   MachineMemOperand *MMO = MF.getMachineMemOperand(
46545ffd83dbSDimitry Andric       MachinePointerInfo(),
46555ffd83dbSDimitry Andric       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
46565ffd83dbSDimitry Andric           MachineMemOperand::MOInvariant,
46575ffd83dbSDimitry Andric       MemSize, MemAlign);
46585ffd83dbSDimitry Andric   MI.addMemOperand(MF, MMO);
46595ffd83dbSDimitry Andric 
46605ffd83dbSDimitry Andric   // There are no 96-bit result scalar loads, but widening to 128-bit should
46615ffd83dbSDimitry Andric   // always be legal. We may need to restore this to a 96-bit result if it turns
46625ffd83dbSDimitry Andric   // out this needs to be converted to a vector load during RegBankSelect.
46635ffd83dbSDimitry Andric   if (!isPowerOf2_32(Size)) {
46645ffd83dbSDimitry Andric     if (Ty.isVector())
46655ffd83dbSDimitry Andric       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
46665ffd83dbSDimitry Andric     else
46675ffd83dbSDimitry Andric       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
46685ffd83dbSDimitry Andric   }
46695ffd83dbSDimitry Andric 
46705ffd83dbSDimitry Andric   Observer.changedInstr(MI);
46715ffd83dbSDimitry Andric   return true;
46725ffd83dbSDimitry Andric }
46735ffd83dbSDimitry Andric 
4674e8d8bef9SDimitry Andric // TODO: Move to selection
46755ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
46760b57cec5SDimitry Andric                                                 MachineRegisterInfo &MRI,
46770b57cec5SDimitry Andric                                                 MachineIRBuilder &B) const {
4678*fe6060f1SDimitry Andric   if (!ST.isTrapHandlerEnabled() ||
4679*fe6060f1SDimitry Andric       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
4680*fe6060f1SDimitry Andric     return legalizeTrapEndpgm(MI, MRI, B);
4681*fe6060f1SDimitry Andric 
4682*fe6060f1SDimitry Andric   if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
4683*fe6060f1SDimitry Andric     switch (*HsaAbiVer) {
4684*fe6060f1SDimitry Andric     case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
4685*fe6060f1SDimitry Andric     case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
4686*fe6060f1SDimitry Andric       return legalizeTrapHsaQueuePtr(MI, MRI, B);
4687*fe6060f1SDimitry Andric     case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
4688*fe6060f1SDimitry Andric       return ST.supportsGetDoorbellID() ?
4689*fe6060f1SDimitry Andric           legalizeTrapHsa(MI, MRI, B) :
4690*fe6060f1SDimitry Andric           legalizeTrapHsaQueuePtr(MI, MRI, B);
4691*fe6060f1SDimitry Andric     }
4692*fe6060f1SDimitry Andric   }
4693*fe6060f1SDimitry Andric 
4694*fe6060f1SDimitry Andric   llvm_unreachable("Unknown trap handler");
4695*fe6060f1SDimitry Andric }
4696*fe6060f1SDimitry Andric 
4697*fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
4698*fe6060f1SDimitry Andric     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
46995ffd83dbSDimitry Andric   B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4700*fe6060f1SDimitry Andric   MI.eraseFromParent();
4701*fe6060f1SDimitry Andric   return true;
4702*fe6060f1SDimitry Andric }
4703*fe6060f1SDimitry Andric 
4704*fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
4705*fe6060f1SDimitry Andric     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
47065ffd83dbSDimitry Andric   // Pass queue pointer to trap handler as input, and insert trap instruction
47075ffd83dbSDimitry Andric   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4708e8d8bef9SDimitry Andric   Register LiveIn =
4709e8d8bef9SDimitry Andric     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4710e8d8bef9SDimitry Andric   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
47115ffd83dbSDimitry Andric     return false;
4712e8d8bef9SDimitry Andric 
4713e8d8bef9SDimitry Andric   Register SGPR01(AMDGPU::SGPR0_SGPR1);
47145ffd83dbSDimitry Andric   B.buildCopy(SGPR01, LiveIn);
47155ffd83dbSDimitry Andric   B.buildInstr(AMDGPU::S_TRAP)
4716*fe6060f1SDimitry Andric       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
47175ffd83dbSDimitry Andric       .addReg(SGPR01, RegState::Implicit);
4718*fe6060f1SDimitry Andric 
4719*fe6060f1SDimitry Andric   MI.eraseFromParent();
4720*fe6060f1SDimitry Andric   return true;
47215ffd83dbSDimitry Andric }
47225ffd83dbSDimitry Andric 
4723*fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeTrapHsa(
4724*fe6060f1SDimitry Andric     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4725*fe6060f1SDimitry Andric   B.buildInstr(AMDGPU::S_TRAP)
4726*fe6060f1SDimitry Andric       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
47275ffd83dbSDimitry Andric   MI.eraseFromParent();
47285ffd83dbSDimitry Andric   return true;
47295ffd83dbSDimitry Andric }
47305ffd83dbSDimitry Andric 
47315ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
47325ffd83dbSDimitry Andric     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
47335ffd83dbSDimitry Andric   // Is non-HSA path or trap-handler disabled? then, report a warning
47345ffd83dbSDimitry Andric   // accordingly
4735*fe6060f1SDimitry Andric   if (!ST.isTrapHandlerEnabled() ||
4736*fe6060f1SDimitry Andric       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
47375ffd83dbSDimitry Andric     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
47385ffd83dbSDimitry Andric                                      "debugtrap handler not supported",
47395ffd83dbSDimitry Andric                                      MI.getDebugLoc(), DS_Warning);
47405ffd83dbSDimitry Andric     LLVMContext &Ctx = B.getMF().getFunction().getContext();
47415ffd83dbSDimitry Andric     Ctx.diagnose(NoTrap);
47425ffd83dbSDimitry Andric   } else {
47435ffd83dbSDimitry Andric     // Insert debug-trap instruction
4744*fe6060f1SDimitry Andric     B.buildInstr(AMDGPU::S_TRAP)
4745*fe6060f1SDimitry Andric         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
47465ffd83dbSDimitry Andric   }
47475ffd83dbSDimitry Andric 
47485ffd83dbSDimitry Andric   MI.eraseFromParent();
47495ffd83dbSDimitry Andric   return true;
47505ffd83dbSDimitry Andric }
47515ffd83dbSDimitry Andric 
4752e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4753e8d8bef9SDimitry Andric                                                MachineIRBuilder &B) const {
4754e8d8bef9SDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
4755e8d8bef9SDimitry Andric   const LLT S16 = LLT::scalar(16);
4756e8d8bef9SDimitry Andric   const LLT S32 = LLT::scalar(32);
4757e8d8bef9SDimitry Andric 
4758e8d8bef9SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
4759e8d8bef9SDimitry Andric   Register NodePtr = MI.getOperand(2).getReg();
4760e8d8bef9SDimitry Andric   Register RayExtent = MI.getOperand(3).getReg();
4761e8d8bef9SDimitry Andric   Register RayOrigin = MI.getOperand(4).getReg();
4762e8d8bef9SDimitry Andric   Register RayDir = MI.getOperand(5).getReg();
4763e8d8bef9SDimitry Andric   Register RayInvDir = MI.getOperand(6).getReg();
4764e8d8bef9SDimitry Andric   Register TDescr = MI.getOperand(7).getReg();
4765e8d8bef9SDimitry Andric 
4766*fe6060f1SDimitry Andric   if (!ST.hasGFX10_AEncoding()) {
4767*fe6060f1SDimitry Andric     DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
4768*fe6060f1SDimitry Andric                                         "intrinsic not supported on subtarget",
4769*fe6060f1SDimitry Andric                                         MI.getDebugLoc());
4770*fe6060f1SDimitry Andric     B.getMF().getFunction().getContext().diagnose(BadIntrin);
4771*fe6060f1SDimitry Andric     return false;
4772*fe6060f1SDimitry Andric   }
4773*fe6060f1SDimitry Andric 
4774e8d8bef9SDimitry Andric   bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4775e8d8bef9SDimitry Andric   bool Is64 =  MRI.getType(NodePtr).getSizeInBits() == 64;
4776e8d8bef9SDimitry Andric   unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
4777e8d8bef9SDimitry Andric                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
4778e8d8bef9SDimitry Andric                           : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
4779e8d8bef9SDimitry Andric                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
4780e8d8bef9SDimitry Andric 
4781e8d8bef9SDimitry Andric   SmallVector<Register, 12> Ops;
4782e8d8bef9SDimitry Andric   if (Is64) {
4783e8d8bef9SDimitry Andric     auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4784e8d8bef9SDimitry Andric     Ops.push_back(Unmerge.getReg(0));
4785e8d8bef9SDimitry Andric     Ops.push_back(Unmerge.getReg(1));
4786e8d8bef9SDimitry Andric   } else {
4787e8d8bef9SDimitry Andric     Ops.push_back(NodePtr);
4788e8d8bef9SDimitry Andric   }
4789e8d8bef9SDimitry Andric   Ops.push_back(RayExtent);
4790e8d8bef9SDimitry Andric 
4791e8d8bef9SDimitry Andric   auto packLanes = [&Ops, &S32, &B] (Register Src) {
4792e8d8bef9SDimitry Andric     auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
4793e8d8bef9SDimitry Andric     Ops.push_back(Unmerge.getReg(0));
4794e8d8bef9SDimitry Andric     Ops.push_back(Unmerge.getReg(1));
4795e8d8bef9SDimitry Andric     Ops.push_back(Unmerge.getReg(2));
4796e8d8bef9SDimitry Andric   };
4797e8d8bef9SDimitry Andric 
4798e8d8bef9SDimitry Andric   packLanes(RayOrigin);
4799e8d8bef9SDimitry Andric   if (IsA16) {
4800e8d8bef9SDimitry Andric     auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
4801e8d8bef9SDimitry Andric     auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
4802e8d8bef9SDimitry Andric     Register R1 = MRI.createGenericVirtualRegister(S32);
4803e8d8bef9SDimitry Andric     Register R2 = MRI.createGenericVirtualRegister(S32);
4804e8d8bef9SDimitry Andric     Register R3 = MRI.createGenericVirtualRegister(S32);
4805e8d8bef9SDimitry Andric     B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4806e8d8bef9SDimitry Andric     B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4807e8d8bef9SDimitry Andric     B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4808e8d8bef9SDimitry Andric     Ops.push_back(R1);
4809e8d8bef9SDimitry Andric     Ops.push_back(R2);
4810e8d8bef9SDimitry Andric     Ops.push_back(R3);
4811e8d8bef9SDimitry Andric   } else {
4812e8d8bef9SDimitry Andric     packLanes(RayDir);
4813e8d8bef9SDimitry Andric     packLanes(RayInvDir);
4814e8d8bef9SDimitry Andric   }
4815e8d8bef9SDimitry Andric 
4816e8d8bef9SDimitry Andric   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4817e8d8bef9SDimitry Andric     .addDef(DstReg)
4818e8d8bef9SDimitry Andric     .addImm(Opcode);
4819e8d8bef9SDimitry Andric 
4820e8d8bef9SDimitry Andric   for (Register R : Ops) {
4821e8d8bef9SDimitry Andric     MIB.addUse(R);
4822e8d8bef9SDimitry Andric   }
4823e8d8bef9SDimitry Andric 
4824e8d8bef9SDimitry Andric   MIB.addUse(TDescr)
4825e8d8bef9SDimitry Andric      .addImm(IsA16 ? 1 : 0)
4826e8d8bef9SDimitry Andric      .cloneMemRefs(MI);
4827e8d8bef9SDimitry Andric 
4828e8d8bef9SDimitry Andric   MI.eraseFromParent();
4829e8d8bef9SDimitry Andric   return true;
4830e8d8bef9SDimitry Andric }
4831e8d8bef9SDimitry Andric 
48325ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
48335ffd83dbSDimitry Andric                                             MachineInstr &MI) const {
48345ffd83dbSDimitry Andric   MachineIRBuilder &B = Helper.MIRBuilder;
48355ffd83dbSDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
48365ffd83dbSDimitry Andric 
48370b57cec5SDimitry Andric   // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
4838480093f4SDimitry Andric   auto IntrID = MI.getIntrinsicID();
4839480093f4SDimitry Andric   switch (IntrID) {
4840480093f4SDimitry Andric   case Intrinsic::amdgcn_if:
4841480093f4SDimitry Andric   case Intrinsic::amdgcn_else: {
4842480093f4SDimitry Andric     MachineInstr *Br = nullptr;
48435ffd83dbSDimitry Andric     MachineBasicBlock *UncondBrTarget = nullptr;
4844e8d8bef9SDimitry Andric     bool Negated = false;
4845e8d8bef9SDimitry Andric     if (MachineInstr *BrCond =
4846e8d8bef9SDimitry Andric             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
48470b57cec5SDimitry Andric       const SIRegisterInfo *TRI
48480b57cec5SDimitry Andric         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
48490b57cec5SDimitry Andric 
48500b57cec5SDimitry Andric       Register Def = MI.getOperand(1).getReg();
48510b57cec5SDimitry Andric       Register Use = MI.getOperand(3).getReg();
4852480093f4SDimitry Andric 
48535ffd83dbSDimitry Andric       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4854e8d8bef9SDimitry Andric 
4855e8d8bef9SDimitry Andric       if (Negated)
4856e8d8bef9SDimitry Andric         std::swap(CondBrTarget, UncondBrTarget);
4857e8d8bef9SDimitry Andric 
48585ffd83dbSDimitry Andric       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4859480093f4SDimitry Andric       if (IntrID == Intrinsic::amdgcn_if) {
48600b57cec5SDimitry Andric         B.buildInstr(AMDGPU::SI_IF)
48610b57cec5SDimitry Andric           .addDef(Def)
48620b57cec5SDimitry Andric           .addUse(Use)
48635ffd83dbSDimitry Andric           .addMBB(UncondBrTarget);
4864480093f4SDimitry Andric       } else {
4865480093f4SDimitry Andric         B.buildInstr(AMDGPU::SI_ELSE)
4866480093f4SDimitry Andric             .addDef(Def)
4867480093f4SDimitry Andric             .addUse(Use)
4868e8d8bef9SDimitry Andric             .addMBB(UncondBrTarget);
4869480093f4SDimitry Andric       }
4870480093f4SDimitry Andric 
48715ffd83dbSDimitry Andric       if (Br) {
48725ffd83dbSDimitry Andric         Br->getOperand(0).setMBB(CondBrTarget);
48735ffd83dbSDimitry Andric       } else {
48745ffd83dbSDimitry Andric         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
48755ffd83dbSDimitry Andric         // since we're swapping branch targets it needs to be reinserted.
48765ffd83dbSDimitry Andric         // FIXME: IRTranslator should probably not do this
48775ffd83dbSDimitry Andric         B.buildBr(*CondBrTarget);
48785ffd83dbSDimitry Andric       }
48790b57cec5SDimitry Andric 
48800b57cec5SDimitry Andric       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
48810b57cec5SDimitry Andric       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
48820b57cec5SDimitry Andric       MI.eraseFromParent();
48830b57cec5SDimitry Andric       BrCond->eraseFromParent();
48840b57cec5SDimitry Andric       return true;
48850b57cec5SDimitry Andric     }
48860b57cec5SDimitry Andric 
48870b57cec5SDimitry Andric     return false;
48880b57cec5SDimitry Andric   }
48890b57cec5SDimitry Andric   case Intrinsic::amdgcn_loop: {
4890480093f4SDimitry Andric     MachineInstr *Br = nullptr;
48915ffd83dbSDimitry Andric     MachineBasicBlock *UncondBrTarget = nullptr;
4892e8d8bef9SDimitry Andric     bool Negated = false;
4893e8d8bef9SDimitry Andric     if (MachineInstr *BrCond =
4894e8d8bef9SDimitry Andric             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
48950b57cec5SDimitry Andric       const SIRegisterInfo *TRI
48960b57cec5SDimitry Andric         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
48970b57cec5SDimitry Andric 
48985ffd83dbSDimitry Andric       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
48990b57cec5SDimitry Andric       Register Reg = MI.getOperand(2).getReg();
49005ffd83dbSDimitry Andric 
4901e8d8bef9SDimitry Andric       if (Negated)
4902e8d8bef9SDimitry Andric         std::swap(CondBrTarget, UncondBrTarget);
4903e8d8bef9SDimitry Andric 
49045ffd83dbSDimitry Andric       B.setInsertPt(B.getMBB(), BrCond->getIterator());
49050b57cec5SDimitry Andric       B.buildInstr(AMDGPU::SI_LOOP)
49060b57cec5SDimitry Andric         .addUse(Reg)
49075ffd83dbSDimitry Andric         .addMBB(UncondBrTarget);
49085ffd83dbSDimitry Andric 
49095ffd83dbSDimitry Andric       if (Br)
49105ffd83dbSDimitry Andric         Br->getOperand(0).setMBB(CondBrTarget);
49115ffd83dbSDimitry Andric       else
49125ffd83dbSDimitry Andric         B.buildBr(*CondBrTarget);
49135ffd83dbSDimitry Andric 
49140b57cec5SDimitry Andric       MI.eraseFromParent();
49150b57cec5SDimitry Andric       BrCond->eraseFromParent();
49160b57cec5SDimitry Andric       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
49170b57cec5SDimitry Andric       return true;
49180b57cec5SDimitry Andric     }
49190b57cec5SDimitry Andric 
49200b57cec5SDimitry Andric     return false;
49210b57cec5SDimitry Andric   }
49220b57cec5SDimitry Andric   case Intrinsic::amdgcn_kernarg_segment_ptr:
49235ffd83dbSDimitry Andric     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
49245ffd83dbSDimitry Andric       // This only makes sense to call in a kernel, so just lower to null.
49255ffd83dbSDimitry Andric       B.buildConstant(MI.getOperand(0).getReg(), 0);
49265ffd83dbSDimitry Andric       MI.eraseFromParent();
49275ffd83dbSDimitry Andric       return true;
49285ffd83dbSDimitry Andric     }
49295ffd83dbSDimitry Andric 
49300b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(
49310b57cec5SDimitry Andric       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
49320b57cec5SDimitry Andric   case Intrinsic::amdgcn_implicitarg_ptr:
49330b57cec5SDimitry Andric     return legalizeImplicitArgPtr(MI, MRI, B);
49340b57cec5SDimitry Andric   case Intrinsic::amdgcn_workitem_id_x:
49350b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
49360b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
49370b57cec5SDimitry Andric   case Intrinsic::amdgcn_workitem_id_y:
49380b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
49390b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
49400b57cec5SDimitry Andric   case Intrinsic::amdgcn_workitem_id_z:
49410b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
49420b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
49430b57cec5SDimitry Andric   case Intrinsic::amdgcn_workgroup_id_x:
49440b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
49450b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
49460b57cec5SDimitry Andric   case Intrinsic::amdgcn_workgroup_id_y:
49470b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
49480b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
49490b57cec5SDimitry Andric   case Intrinsic::amdgcn_workgroup_id_z:
49500b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
49510b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
49520b57cec5SDimitry Andric   case Intrinsic::amdgcn_dispatch_ptr:
49530b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
49540b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
49550b57cec5SDimitry Andric   case Intrinsic::amdgcn_queue_ptr:
49560b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
49570b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
49580b57cec5SDimitry Andric   case Intrinsic::amdgcn_implicit_buffer_ptr:
49590b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(
49600b57cec5SDimitry Andric       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
49610b57cec5SDimitry Andric   case Intrinsic::amdgcn_dispatch_id:
49620b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
49630b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
49648bcb0991SDimitry Andric   case Intrinsic::amdgcn_fdiv_fast:
49658bcb0991SDimitry Andric     return legalizeFDIVFastIntrin(MI, MRI, B);
49668bcb0991SDimitry Andric   case Intrinsic::amdgcn_is_shared:
49678bcb0991SDimitry Andric     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
49688bcb0991SDimitry Andric   case Intrinsic::amdgcn_is_private:
49698bcb0991SDimitry Andric     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
49708bcb0991SDimitry Andric   case Intrinsic::amdgcn_wavefrontsize: {
49718bcb0991SDimitry Andric     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
49728bcb0991SDimitry Andric     MI.eraseFromParent();
49738bcb0991SDimitry Andric     return true;
49748bcb0991SDimitry Andric   }
49755ffd83dbSDimitry Andric   case Intrinsic::amdgcn_s_buffer_load:
4976e8d8bef9SDimitry Andric     return legalizeSBufferLoad(Helper, MI);
49778bcb0991SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_store:
49785ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_store:
49795ffd83dbSDimitry Andric     return legalizeBufferStore(MI, MRI, B, false, false);
49808bcb0991SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_store_format:
49815ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_store_format:
49825ffd83dbSDimitry Andric     return legalizeBufferStore(MI, MRI, B, false, true);
49835ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_tbuffer_store:
49845ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_tbuffer_store:
49855ffd83dbSDimitry Andric     return legalizeBufferStore(MI, MRI, B, true, true);
49865ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_load:
49875ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_load:
49885ffd83dbSDimitry Andric     return legalizeBufferLoad(MI, MRI, B, false, false);
49895ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_load_format:
49905ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_load_format:
49915ffd83dbSDimitry Andric     return legalizeBufferLoad(MI, MRI, B, true, false);
49925ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_tbuffer_load:
49935ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_tbuffer_load:
49945ffd83dbSDimitry Andric     return legalizeBufferLoad(MI, MRI, B, true, true);
49955ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
49965ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
49975ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_add:
49985ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_add:
49995ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
50005ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
50015ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
50025ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
50035ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
50045ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
50055ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
50065ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
50075ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
50085ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
50095ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_and:
50105ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_and:
50115ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_or:
50125ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_or:
50135ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
50145ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
50155ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
50165ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
50175ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
50185ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
5019e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
5020e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
50215ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
50225ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
5023*fe6060f1SDimitry Andric   case Intrinsic::amdgcn_buffer_atomic_fadd:
5024*fe6060f1SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
5025*fe6060f1SDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
5026*fe6060f1SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
5027*fe6060f1SDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
50285ffd83dbSDimitry Andric     return legalizeBufferAtomic(MI, B, IntrID);
50295ffd83dbSDimitry Andric   case Intrinsic::amdgcn_atomic_inc:
50305ffd83dbSDimitry Andric     return legalizeAtomicIncDec(MI, B, true);
50315ffd83dbSDimitry Andric   case Intrinsic::amdgcn_atomic_dec:
50325ffd83dbSDimitry Andric     return legalizeAtomicIncDec(MI, B, false);
50335ffd83dbSDimitry Andric   case Intrinsic::trap:
50345ffd83dbSDimitry Andric     return legalizeTrapIntrinsic(MI, MRI, B);
50355ffd83dbSDimitry Andric   case Intrinsic::debugtrap:
50365ffd83dbSDimitry Andric     return legalizeDebugTrapIntrinsic(MI, MRI, B);
5037e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_rsq_clamp:
5038e8d8bef9SDimitry Andric     return legalizeRsqClampIntrinsic(MI, MRI, B);
5039e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_ds_fadd:
5040e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_ds_fmin:
5041e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_ds_fmax:
5042e8d8bef9SDimitry Andric     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
5043e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_image_bvh_intersect_ray:
5044e8d8bef9SDimitry Andric     return legalizeBVHIntrinsic(MI, B);
50455ffd83dbSDimitry Andric   default: {
50465ffd83dbSDimitry Andric     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
50475ffd83dbSDimitry Andric             AMDGPU::getImageDimIntrinsicInfo(IntrID))
50485ffd83dbSDimitry Andric       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
50490b57cec5SDimitry Andric     return true;
50500b57cec5SDimitry Andric   }
50515ffd83dbSDimitry Andric   }
50520b57cec5SDimitry Andric 
50530b57cec5SDimitry Andric   return true;
50540b57cec5SDimitry Andric }
5055