xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (revision e8d8bef961a50d4dc22501cde4fb9fb0be1b2532)
10b57cec5SDimitry Andric //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
20b57cec5SDimitry Andric //
30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
60b57cec5SDimitry Andric //
70b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
80b57cec5SDimitry Andric /// \file
90b57cec5SDimitry Andric /// This file implements the targeting of the Machinelegalizer class for
100b57cec5SDimitry Andric /// AMDGPU.
110b57cec5SDimitry Andric /// \todo This should be generated by TableGen.
120b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
130b57cec5SDimitry Andric 
145ffd83dbSDimitry Andric #include "AMDGPULegalizerInfo.h"
158bcb0991SDimitry Andric 
160b57cec5SDimitry Andric #include "AMDGPU.h"
175ffd83dbSDimitry Andric #include "AMDGPUGlobalISelUtils.h"
18*e8d8bef9SDimitry Andric #include "AMDGPUInstrInfo.h"
190b57cec5SDimitry Andric #include "AMDGPUTargetMachine.h"
200b57cec5SDimitry Andric #include "SIMachineFunctionInfo.h"
215ffd83dbSDimitry Andric #include "llvm/ADT/ScopeExit.h"
220b57cec5SDimitry Andric #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
235ffd83dbSDimitry Andric #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24*e8d8bef9SDimitry Andric #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
258bcb0991SDimitry Andric #include "llvm/IR/DiagnosticInfo.h"
26*e8d8bef9SDimitry Andric #include "llvm/IR/IntrinsicsAMDGPU.h"
270b57cec5SDimitry Andric 
280b57cec5SDimitry Andric #define DEBUG_TYPE "amdgpu-legalinfo"
290b57cec5SDimitry Andric 
300b57cec5SDimitry Andric using namespace llvm;
310b57cec5SDimitry Andric using namespace LegalizeActions;
320b57cec5SDimitry Andric using namespace LegalizeMutations;
330b57cec5SDimitry Andric using namespace LegalityPredicates;
345ffd83dbSDimitry Andric using namespace MIPatternMatch;
350b57cec5SDimitry Andric 
365ffd83dbSDimitry Andric // Hack until load/store selection patterns support any tuple of legal types.
375ffd83dbSDimitry Andric static cl::opt<bool> EnableNewLegality(
385ffd83dbSDimitry Andric   "amdgpu-global-isel-new-legality",
395ffd83dbSDimitry Andric   cl::desc("Use GlobalISel desired legality, rather than try to use"
405ffd83dbSDimitry Andric            "rules compatible with selection patterns"),
415ffd83dbSDimitry Andric   cl::init(false),
425ffd83dbSDimitry Andric   cl::ReallyHidden);
430b57cec5SDimitry Andric 
445ffd83dbSDimitry Andric static constexpr unsigned MaxRegisterSize = 1024;
455ffd83dbSDimitry Andric 
465ffd83dbSDimitry Andric // Round the number of elements to the next power of two elements
475ffd83dbSDimitry Andric static LLT getPow2VectorType(LLT Ty) {
485ffd83dbSDimitry Andric   unsigned NElts = Ty.getNumElements();
495ffd83dbSDimitry Andric   unsigned Pow2NElts = 1 <<  Log2_32_Ceil(NElts);
505ffd83dbSDimitry Andric   return Ty.changeNumElements(Pow2NElts);
510b57cec5SDimitry Andric }
520b57cec5SDimitry Andric 
535ffd83dbSDimitry Andric // Round the number of bits to the next power of two bits
545ffd83dbSDimitry Andric static LLT getPow2ScalarType(LLT Ty) {
555ffd83dbSDimitry Andric   unsigned Bits = Ty.getSizeInBits();
565ffd83dbSDimitry Andric   unsigned Pow2Bits = 1 <<  Log2_32_Ceil(Bits);
575ffd83dbSDimitry Andric   return LLT::scalar(Pow2Bits);
588bcb0991SDimitry Andric }
598bcb0991SDimitry Andric 
/// \returns true if this is an odd sized vector which should widen by adding
/// an additional element. This is mostly to handle <3 x s16> -> <4 x s16>.
/// This excludes s1 vectors, which should always be scalarized.
630b57cec5SDimitry Andric static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
640b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
650b57cec5SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
66*e8d8bef9SDimitry Andric     if (!Ty.isVector())
67*e8d8bef9SDimitry Andric       return false;
68*e8d8bef9SDimitry Andric 
69*e8d8bef9SDimitry Andric     const LLT EltTy = Ty.getElementType();
70*e8d8bef9SDimitry Andric     const unsigned EltSize = EltTy.getSizeInBits();
71*e8d8bef9SDimitry Andric     return Ty.getNumElements() % 2 != 0 &&
72*e8d8bef9SDimitry Andric            EltSize > 1 && EltSize < 32 &&
738bcb0991SDimitry Andric            Ty.getSizeInBits() % 32 != 0;
748bcb0991SDimitry Andric   };
758bcb0991SDimitry Andric }
768bcb0991SDimitry Andric 
77*e8d8bef9SDimitry Andric static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
78*e8d8bef9SDimitry Andric   return [=](const LegalityQuery &Query) {
79*e8d8bef9SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
80*e8d8bef9SDimitry Andric     return Ty.getSizeInBits() % 32 == 0;
81*e8d8bef9SDimitry Andric   };
82*e8d8bef9SDimitry Andric }
83*e8d8bef9SDimitry Andric 
848bcb0991SDimitry Andric static LegalityPredicate isWideVec16(unsigned TypeIdx) {
858bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
868bcb0991SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
878bcb0991SDimitry Andric     const LLT EltTy = Ty.getScalarType();
888bcb0991SDimitry Andric     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
890b57cec5SDimitry Andric   };
900b57cec5SDimitry Andric }
910b57cec5SDimitry Andric 
920b57cec5SDimitry Andric static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
930b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
940b57cec5SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
950b57cec5SDimitry Andric     const LLT EltTy = Ty.getElementType();
960b57cec5SDimitry Andric     return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
970b57cec5SDimitry Andric   };
980b57cec5SDimitry Andric }
990b57cec5SDimitry Andric 
1000b57cec5SDimitry Andric static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
1010b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
1020b57cec5SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
1030b57cec5SDimitry Andric     const LLT EltTy = Ty.getElementType();
1040b57cec5SDimitry Andric     unsigned Size = Ty.getSizeInBits();
1050b57cec5SDimitry Andric     unsigned Pieces = (Size + 63) / 64;
1060b57cec5SDimitry Andric     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
1070b57cec5SDimitry Andric     return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
1080b57cec5SDimitry Andric   };
1090b57cec5SDimitry Andric }
1100b57cec5SDimitry Andric 
1118bcb0991SDimitry Andric // Increase the number of vector elements to reach the next multiple of 32-bit
1128bcb0991SDimitry Andric // type.
1138bcb0991SDimitry Andric static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
1148bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
1158bcb0991SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
1168bcb0991SDimitry Andric 
1178bcb0991SDimitry Andric     const LLT EltTy = Ty.getElementType();
1188bcb0991SDimitry Andric     const int Size = Ty.getSizeInBits();
1198bcb0991SDimitry Andric     const int EltSize = EltTy.getSizeInBits();
1208bcb0991SDimitry Andric     const int NextMul32 = (Size + 31) / 32;
1218bcb0991SDimitry Andric 
1228bcb0991SDimitry Andric     assert(EltSize < 32);
1238bcb0991SDimitry Andric 
1248bcb0991SDimitry Andric     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
1258bcb0991SDimitry Andric     return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
1268bcb0991SDimitry Andric   };
1278bcb0991SDimitry Andric }
1288bcb0991SDimitry Andric 
129*e8d8bef9SDimitry Andric static LLT getBitcastRegisterType(const LLT Ty) {
130*e8d8bef9SDimitry Andric   const unsigned Size = Ty.getSizeInBits();
1315ffd83dbSDimitry Andric 
1325ffd83dbSDimitry Andric   LLT CoercedTy;
1335ffd83dbSDimitry Andric   if (Size <= 32) {
1345ffd83dbSDimitry Andric     // <2 x s8> -> s16
1355ffd83dbSDimitry Andric     // <4 x s8> -> s32
136*e8d8bef9SDimitry Andric     return LLT::scalar(Size);
137*e8d8bef9SDimitry Andric   }
1385ffd83dbSDimitry Andric 
139*e8d8bef9SDimitry Andric   return LLT::scalarOrVector(Size / 32, 32);
140*e8d8bef9SDimitry Andric }
141*e8d8bef9SDimitry Andric 
142*e8d8bef9SDimitry Andric static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
143*e8d8bef9SDimitry Andric   return [=](const LegalityQuery &Query) {
144*e8d8bef9SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
145*e8d8bef9SDimitry Andric     return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
146*e8d8bef9SDimitry Andric   };
147*e8d8bef9SDimitry Andric }
148*e8d8bef9SDimitry Andric 
149*e8d8bef9SDimitry Andric static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
150*e8d8bef9SDimitry Andric   return [=](const LegalityQuery &Query) {
151*e8d8bef9SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
152*e8d8bef9SDimitry Andric     unsigned Size = Ty.getSizeInBits();
153*e8d8bef9SDimitry Andric     assert(Size % 32 == 0);
154*e8d8bef9SDimitry Andric     return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
1555ffd83dbSDimitry Andric   };
1565ffd83dbSDimitry Andric }
1575ffd83dbSDimitry Andric 
1588bcb0991SDimitry Andric static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
1598bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
1608bcb0991SDimitry Andric     const LLT QueryTy = Query.Types[TypeIdx];
1618bcb0991SDimitry Andric     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
1628bcb0991SDimitry Andric   };
1638bcb0991SDimitry Andric }
1648bcb0991SDimitry Andric 
1650b57cec5SDimitry Andric static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
1660b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
1670b57cec5SDimitry Andric     const LLT QueryTy = Query.Types[TypeIdx];
1680b57cec5SDimitry Andric     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
1690b57cec5SDimitry Andric   };
1700b57cec5SDimitry Andric }
1710b57cec5SDimitry Andric 
1720b57cec5SDimitry Andric static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
1730b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
1740b57cec5SDimitry Andric     const LLT QueryTy = Query.Types[TypeIdx];
1750b57cec5SDimitry Andric     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
1760b57cec5SDimitry Andric   };
1770b57cec5SDimitry Andric }
1780b57cec5SDimitry Andric 
1795ffd83dbSDimitry Andric static bool isRegisterSize(unsigned Size) {
1805ffd83dbSDimitry Andric   return Size % 32 == 0 && Size <= MaxRegisterSize;
1815ffd83dbSDimitry Andric }
1825ffd83dbSDimitry Andric 
1835ffd83dbSDimitry Andric static bool isRegisterVectorElementType(LLT EltTy) {
1845ffd83dbSDimitry Andric   const int EltSize = EltTy.getSizeInBits();
1855ffd83dbSDimitry Andric   return EltSize == 16 || EltSize % 32 == 0;
1865ffd83dbSDimitry Andric }
1875ffd83dbSDimitry Andric 
1885ffd83dbSDimitry Andric static bool isRegisterVectorType(LLT Ty) {
1890b57cec5SDimitry Andric   const int EltSize = Ty.getElementType().getSizeInBits();
1900b57cec5SDimitry Andric   return EltSize == 32 || EltSize == 64 ||
1910b57cec5SDimitry Andric          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
1920b57cec5SDimitry Andric          EltSize == 128 || EltSize == 256;
1930b57cec5SDimitry Andric }
1940b57cec5SDimitry Andric 
1955ffd83dbSDimitry Andric static bool isRegisterType(LLT Ty) {
1965ffd83dbSDimitry Andric   if (!isRegisterSize(Ty.getSizeInBits()))
1975ffd83dbSDimitry Andric     return false;
1985ffd83dbSDimitry Andric 
1995ffd83dbSDimitry Andric   if (Ty.isVector())
2005ffd83dbSDimitry Andric     return isRegisterVectorType(Ty);
2015ffd83dbSDimitry Andric 
2025ffd83dbSDimitry Andric   return true;
2035ffd83dbSDimitry Andric }
2045ffd83dbSDimitry Andric 
2055ffd83dbSDimitry Andric // Any combination of 32 or 64-bit elements up the maximum register size, and
2065ffd83dbSDimitry Andric // multiples of v2s16.
2075ffd83dbSDimitry Andric static LegalityPredicate isRegisterType(unsigned TypeIdx) {
2085ffd83dbSDimitry Andric   return [=](const LegalityQuery &Query) {
2095ffd83dbSDimitry Andric     return isRegisterType(Query.Types[TypeIdx]);
2108bcb0991SDimitry Andric   };
2118bcb0991SDimitry Andric }
2128bcb0991SDimitry Andric 
2135ffd83dbSDimitry Andric static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
2148bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
2155ffd83dbSDimitry Andric     const LLT QueryTy = Query.Types[TypeIdx];
2165ffd83dbSDimitry Andric     if (!QueryTy.isVector())
2175ffd83dbSDimitry Andric       return false;
2185ffd83dbSDimitry Andric     const LLT EltTy = QueryTy.getElementType();
2195ffd83dbSDimitry Andric     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
2208bcb0991SDimitry Andric   };
2218bcb0991SDimitry Andric }
2228bcb0991SDimitry Andric 
2238bcb0991SDimitry Andric static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
2248bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
2258bcb0991SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
2268bcb0991SDimitry Andric     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
2278bcb0991SDimitry Andric            Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
2280b57cec5SDimitry Andric   };
2290b57cec5SDimitry Andric }
2300b57cec5SDimitry Andric 
2315ffd83dbSDimitry Andric // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
2325ffd83dbSDimitry Andric // handle some operations by just promoting the register during
2335ffd83dbSDimitry Andric // selection. There are also d16 loads on GFX9+ which preserve the high bits.
2345ffd83dbSDimitry Andric static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
2355ffd83dbSDimitry Andric                                     bool IsLoad) {
2365ffd83dbSDimitry Andric   switch (AS) {
2375ffd83dbSDimitry Andric   case AMDGPUAS::PRIVATE_ADDRESS:
2385ffd83dbSDimitry Andric     // FIXME: Private element size.
239*e8d8bef9SDimitry Andric     return ST.enableFlatScratch() ? 128 : 32;
2405ffd83dbSDimitry Andric   case AMDGPUAS::LOCAL_ADDRESS:
2415ffd83dbSDimitry Andric     return ST.useDS128() ? 128 : 64;
2425ffd83dbSDimitry Andric   case AMDGPUAS::GLOBAL_ADDRESS:
2435ffd83dbSDimitry Andric   case AMDGPUAS::CONSTANT_ADDRESS:
2445ffd83dbSDimitry Andric   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
2455ffd83dbSDimitry Andric     // Treat constant and global as identical. SMRD loads are sometimes usable for
2465ffd83dbSDimitry Andric     // global loads (ideally constant address space should be eliminated)
2475ffd83dbSDimitry Andric     // depending on the context. Legality cannot be context dependent, but
2485ffd83dbSDimitry Andric     // RegBankSelect can split the load as necessary depending on the pointer
2495ffd83dbSDimitry Andric     // register bank/uniformity and if the memory is invariant or not written in a
2505ffd83dbSDimitry Andric     // kernel.
2515ffd83dbSDimitry Andric     return IsLoad ? 512 : 128;
2525ffd83dbSDimitry Andric   default:
2535ffd83dbSDimitry Andric     // Flat addresses may contextually need to be split to 32-bit parts if they
2545ffd83dbSDimitry Andric     // may alias scratch depending on the subtarget.
2555ffd83dbSDimitry Andric     return 128;
2565ffd83dbSDimitry Andric   }
2575ffd83dbSDimitry Andric }
2585ffd83dbSDimitry Andric 
// Returns true if a load/store with the queried result type, pointer type and
// memory size is directly legal (no lowering/splitting needed). Checks, in
// order: address-space casts that always need custom lowering, extload shape,
// per-address-space size limits, the whitelist of supported memory sizes, and
// finally target alignment support.
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query,
                                 unsigned Opcode) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
  unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough, but
  // we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  // Reject accesses wider than the address space supports in one operation.
  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    // Dword-x3 accesses only exist on some subtargets.
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  // Under-aligned accesses are only legal if the target tolerates the
  // misalignment for this size and address space.
  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}
3215ffd83dbSDimitry Andric 
3225ffd83dbSDimitry Andric // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
3235ffd83dbSDimitry Andric // workaround this. Eventually it should ignore the type for loads and only care
3245ffd83dbSDimitry Andric // about the size. Return true in cases where we will workaround this for now by
3255ffd83dbSDimitry Andric // bitcasting.
3265ffd83dbSDimitry Andric static bool loadStoreBitcastWorkaround(const LLT Ty) {
3275ffd83dbSDimitry Andric   if (EnableNewLegality)
3285ffd83dbSDimitry Andric     return false;
3295ffd83dbSDimitry Andric 
3305ffd83dbSDimitry Andric   const unsigned Size = Ty.getSizeInBits();
3315ffd83dbSDimitry Andric   if (Size <= 64)
3325ffd83dbSDimitry Andric     return false;
3335ffd83dbSDimitry Andric   if (!Ty.isVector())
3345ffd83dbSDimitry Andric     return true;
335*e8d8bef9SDimitry Andric 
336*e8d8bef9SDimitry Andric   LLT EltTy = Ty.getElementType();
337*e8d8bef9SDimitry Andric   if (EltTy.isPointer())
338*e8d8bef9SDimitry Andric     return true;
339*e8d8bef9SDimitry Andric 
340*e8d8bef9SDimitry Andric   unsigned EltSize = EltTy.getSizeInBits();
3415ffd83dbSDimitry Andric   return EltSize != 32 && EltSize != 64;
3425ffd83dbSDimitry Andric }
3435ffd83dbSDimitry Andric 
3445ffd83dbSDimitry Andric static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
3455ffd83dbSDimitry Andric                              unsigned Opcode) {
3465ffd83dbSDimitry Andric   const LLT Ty = Query.Types[0];
3475ffd83dbSDimitry Andric   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
3485ffd83dbSDimitry Andric          !loadStoreBitcastWorkaround(Ty);
3495ffd83dbSDimitry Andric }
3505ffd83dbSDimitry Andric 
351*e8d8bef9SDimitry Andric /// Return true if a load or store of the type should be lowered with a bitcast
352*e8d8bef9SDimitry Andric /// to a different type.
353*e8d8bef9SDimitry Andric static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
354*e8d8bef9SDimitry Andric                                        const unsigned MemSizeInBits) {
355*e8d8bef9SDimitry Andric   const unsigned Size = Ty.getSizeInBits();
356*e8d8bef9SDimitry Andric     if (Size != MemSizeInBits)
357*e8d8bef9SDimitry Andric       return Size <= 32 && Ty.isVector();
358*e8d8bef9SDimitry Andric 
359*e8d8bef9SDimitry Andric   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
360*e8d8bef9SDimitry Andric     return true;
361*e8d8bef9SDimitry Andric   return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
362*e8d8bef9SDimitry Andric          !isRegisterVectorElementType(Ty.getElementType());
363*e8d8bef9SDimitry Andric }
364*e8d8bef9SDimitry Andric 
/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this case when the memory access itself
/// changes, not the size of the result register.
///
/// \p SizeInBits is the memory access size, \p AlignInBits the known
/// alignment, both in bits.
static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
                            unsigned AlignInBits, unsigned AddrSpace,
                            unsigned Opcode) {
  // We don't want to widen cases that are naturally legal.
  if (isPowerOf2_32(SizeInBits))
    return false;

  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
  // end up widening these for a scalar load during RegBankSelect, since there
  // aren't 96-bit scalar loads.
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
    return false;

  // Don't widen past what a single access in this address space can do.
  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
    return false;

  // A load is known dereferenceable up to the alignment, so it's legal to widen
  // to it.
  //
  // TODO: Could check dereferenceable for less aligned cases.
  unsigned RoundedSize = NextPowerOf2(SizeInBits);
  if (AlignInBits < RoundedSize)
    return false;

  // Do not widen if it would introduce a slow unaligned load.
  const SITargetLowering *TLI = ST.getTargetLowering();
  bool Fast = false;
  return TLI->allowsMisalignedMemoryAccessesImpl(
             RoundedSize, AddrSpace, Align(AlignInBits / 8),
             MachineMemOperand::MOLoad, &Fast) &&
         Fast;
}
400*e8d8bef9SDimitry Andric 
401*e8d8bef9SDimitry Andric static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
402*e8d8bef9SDimitry Andric                             unsigned Opcode) {
403*e8d8bef9SDimitry Andric   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
404*e8d8bef9SDimitry Andric     return false;
405*e8d8bef9SDimitry Andric 
406*e8d8bef9SDimitry Andric   return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
407*e8d8bef9SDimitry Andric                          Query.MMODescrs[0].AlignInBits,
408*e8d8bef9SDimitry Andric                          Query.Types[1].getAddressSpace(), Opcode);
409*e8d8bef9SDimitry Andric }
410*e8d8bef9SDimitry Andric 
4110b57cec5SDimitry Andric AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
4120b57cec5SDimitry Andric                                          const GCNTargetMachine &TM)
4130b57cec5SDimitry Andric   :  ST(ST_) {
4140b57cec5SDimitry Andric   using namespace TargetOpcode;
4150b57cec5SDimitry Andric 
4160b57cec5SDimitry Andric   auto GetAddrSpacePtr = [&TM](unsigned AS) {
4170b57cec5SDimitry Andric     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
4180b57cec5SDimitry Andric   };
4190b57cec5SDimitry Andric 
4200b57cec5SDimitry Andric   const LLT S1 = LLT::scalar(1);
421*e8d8bef9SDimitry Andric   const LLT S8 = LLT::scalar(8);
4220b57cec5SDimitry Andric   const LLT S16 = LLT::scalar(16);
4230b57cec5SDimitry Andric   const LLT S32 = LLT::scalar(32);
4240b57cec5SDimitry Andric   const LLT S64 = LLT::scalar(64);
4250b57cec5SDimitry Andric   const LLT S128 = LLT::scalar(128);
4260b57cec5SDimitry Andric   const LLT S256 = LLT::scalar(256);
4275ffd83dbSDimitry Andric   const LLT S512 = LLT::scalar(512);
4285ffd83dbSDimitry Andric   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
4290b57cec5SDimitry Andric 
430*e8d8bef9SDimitry Andric   const LLT V2S8 = LLT::vector(2, 8);
4310b57cec5SDimitry Andric   const LLT V2S16 = LLT::vector(2, 16);
4320b57cec5SDimitry Andric   const LLT V4S16 = LLT::vector(4, 16);
4330b57cec5SDimitry Andric 
4340b57cec5SDimitry Andric   const LLT V2S32 = LLT::vector(2, 32);
4350b57cec5SDimitry Andric   const LLT V3S32 = LLT::vector(3, 32);
4360b57cec5SDimitry Andric   const LLT V4S32 = LLT::vector(4, 32);
4370b57cec5SDimitry Andric   const LLT V5S32 = LLT::vector(5, 32);
4380b57cec5SDimitry Andric   const LLT V6S32 = LLT::vector(6, 32);
4390b57cec5SDimitry Andric   const LLT V7S32 = LLT::vector(7, 32);
4400b57cec5SDimitry Andric   const LLT V8S32 = LLT::vector(8, 32);
4410b57cec5SDimitry Andric   const LLT V9S32 = LLT::vector(9, 32);
4420b57cec5SDimitry Andric   const LLT V10S32 = LLT::vector(10, 32);
4430b57cec5SDimitry Andric   const LLT V11S32 = LLT::vector(11, 32);
4440b57cec5SDimitry Andric   const LLT V12S32 = LLT::vector(12, 32);
4450b57cec5SDimitry Andric   const LLT V13S32 = LLT::vector(13, 32);
4460b57cec5SDimitry Andric   const LLT V14S32 = LLT::vector(14, 32);
4470b57cec5SDimitry Andric   const LLT V15S32 = LLT::vector(15, 32);
4480b57cec5SDimitry Andric   const LLT V16S32 = LLT::vector(16, 32);
4498bcb0991SDimitry Andric   const LLT V32S32 = LLT::vector(32, 32);
4500b57cec5SDimitry Andric 
4510b57cec5SDimitry Andric   const LLT V2S64 = LLT::vector(2, 64);
4520b57cec5SDimitry Andric   const LLT V3S64 = LLT::vector(3, 64);
4530b57cec5SDimitry Andric   const LLT V4S64 = LLT::vector(4, 64);
4540b57cec5SDimitry Andric   const LLT V5S64 = LLT::vector(5, 64);
4550b57cec5SDimitry Andric   const LLT V6S64 = LLT::vector(6, 64);
4560b57cec5SDimitry Andric   const LLT V7S64 = LLT::vector(7, 64);
4570b57cec5SDimitry Andric   const LLT V8S64 = LLT::vector(8, 64);
4588bcb0991SDimitry Andric   const LLT V16S64 = LLT::vector(16, 64);
4590b57cec5SDimitry Andric 
4600b57cec5SDimitry Andric   std::initializer_list<LLT> AllS32Vectors =
4610b57cec5SDimitry Andric     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
4628bcb0991SDimitry Andric      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
4630b57cec5SDimitry Andric   std::initializer_list<LLT> AllS64Vectors =
4648bcb0991SDimitry Andric     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
4650b57cec5SDimitry Andric 
4660b57cec5SDimitry Andric   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
4670b57cec5SDimitry Andric   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
4688bcb0991SDimitry Andric   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
4690b57cec5SDimitry Andric   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
4708bcb0991SDimitry Andric   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
4710b57cec5SDimitry Andric   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
4720b57cec5SDimitry Andric   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
4730b57cec5SDimitry Andric 
4740b57cec5SDimitry Andric   const LLT CodePtr = FlatPtr;
4750b57cec5SDimitry Andric 
4760b57cec5SDimitry Andric   const std::initializer_list<LLT> AddrSpaces64 = {
4770b57cec5SDimitry Andric     GlobalPtr, ConstantPtr, FlatPtr
4780b57cec5SDimitry Andric   };
4790b57cec5SDimitry Andric 
4800b57cec5SDimitry Andric   const std::initializer_list<LLT> AddrSpaces32 = {
4818bcb0991SDimitry Andric     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
4820b57cec5SDimitry Andric   };
4830b57cec5SDimitry Andric 
4840b57cec5SDimitry Andric   const std::initializer_list<LLT> FPTypesBase = {
4850b57cec5SDimitry Andric     S32, S64
4860b57cec5SDimitry Andric   };
4870b57cec5SDimitry Andric 
4880b57cec5SDimitry Andric   const std::initializer_list<LLT> FPTypes16 = {
4890b57cec5SDimitry Andric     S32, S64, S16
4900b57cec5SDimitry Andric   };
4910b57cec5SDimitry Andric 
4920b57cec5SDimitry Andric   const std::initializer_list<LLT> FPTypesPK16 = {
4930b57cec5SDimitry Andric     S32, S64, S16, V2S16
4940b57cec5SDimitry Andric   };
4950b57cec5SDimitry Andric 
4965ffd83dbSDimitry Andric   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
4975ffd83dbSDimitry Andric 
498480093f4SDimitry Andric   setAction({G_BRCOND, S1}, Legal); // VCC branches
499480093f4SDimitry Andric   setAction({G_BRCOND, S32}, Legal); // SCC branches
5000b57cec5SDimitry Andric 
5010b57cec5SDimitry Andric   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
5020b57cec5SDimitry Andric   // elements for v3s16
5030b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_PHI)
504*e8d8bef9SDimitry Andric     .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
5050b57cec5SDimitry Andric     .legalFor(AllS32Vectors)
5060b57cec5SDimitry Andric     .legalFor(AllS64Vectors)
5070b57cec5SDimitry Andric     .legalFor(AddrSpaces64)
5080b57cec5SDimitry Andric     .legalFor(AddrSpaces32)
509*e8d8bef9SDimitry Andric     .legalIf(isPointer(0))
510*e8d8bef9SDimitry Andric     .clampScalar(0, S16, S256)
5110b57cec5SDimitry Andric     .widenScalarToNextPow2(0, 32)
5120b57cec5SDimitry Andric     .clampMaxNumElements(0, S32, 16)
5130b57cec5SDimitry Andric     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
514*e8d8bef9SDimitry Andric     .scalarize(0);
5150b57cec5SDimitry Andric 
516*e8d8bef9SDimitry Andric   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
517*e8d8bef9SDimitry Andric     // Full set of gfx9 features.
5185ffd83dbSDimitry Andric     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
5195ffd83dbSDimitry Andric       .legalFor({S32, S16, V2S16})
5205ffd83dbSDimitry Andric       .clampScalar(0, S16, S32)
5215ffd83dbSDimitry Andric       .clampMaxNumElements(0, S16, 2)
5225ffd83dbSDimitry Andric       .scalarize(0)
5235ffd83dbSDimitry Andric       .widenScalarToNextPow2(0, 32);
524*e8d8bef9SDimitry Andric 
525*e8d8bef9SDimitry Andric     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
526*e8d8bef9SDimitry Andric       .legalFor({S32, S16, V2S16}) // Clamp modifier
527*e8d8bef9SDimitry Andric       .minScalarOrElt(0, S16)
528*e8d8bef9SDimitry Andric       .clampMaxNumElements(0, S16, 2)
529*e8d8bef9SDimitry Andric       .scalarize(0)
530*e8d8bef9SDimitry Andric       .widenScalarToNextPow2(0, 32)
531*e8d8bef9SDimitry Andric       .lower();
5325ffd83dbSDimitry Andric   } else if (ST.has16BitInsts()) {
5330b57cec5SDimitry Andric     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
5340b57cec5SDimitry Andric       .legalFor({S32, S16})
5350b57cec5SDimitry Andric       .clampScalar(0, S16, S32)
5365ffd83dbSDimitry Andric       .scalarize(0)
537*e8d8bef9SDimitry Andric       .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
538*e8d8bef9SDimitry Andric 
539*e8d8bef9SDimitry Andric     // Technically the saturating operations require clamp bit support, but this
540*e8d8bef9SDimitry Andric     // was introduced at the same time as 16-bit operations.
541*e8d8bef9SDimitry Andric     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
542*e8d8bef9SDimitry Andric       .legalFor({S32, S16}) // Clamp modifier
543*e8d8bef9SDimitry Andric       .minScalar(0, S16)
544*e8d8bef9SDimitry Andric       .scalarize(0)
545*e8d8bef9SDimitry Andric       .widenScalarToNextPow2(0, 16)
546*e8d8bef9SDimitry Andric       .lower();
547*e8d8bef9SDimitry Andric 
548*e8d8bef9SDimitry Andric     // We're just lowering this, but it helps get a better result to try to
549*e8d8bef9SDimitry Andric     // coerce to the desired type first.
550*e8d8bef9SDimitry Andric     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
551*e8d8bef9SDimitry Andric       .minScalar(0, S16)
552*e8d8bef9SDimitry Andric       .scalarize(0)
553*e8d8bef9SDimitry Andric       .lower();
5540b57cec5SDimitry Andric   } else {
5550b57cec5SDimitry Andric     getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
5560b57cec5SDimitry Andric       .legalFor({S32})
5570b57cec5SDimitry Andric       .clampScalar(0, S32, S32)
5580b57cec5SDimitry Andric       .scalarize(0);
559*e8d8bef9SDimitry Andric 
560*e8d8bef9SDimitry Andric     if (ST.hasIntClamp()) {
561*e8d8bef9SDimitry Andric       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
562*e8d8bef9SDimitry Andric         .legalFor({S32}) // Clamp modifier.
563*e8d8bef9SDimitry Andric         .scalarize(0)
564*e8d8bef9SDimitry Andric         .minScalarOrElt(0, S32)
565*e8d8bef9SDimitry Andric         .lower();
566*e8d8bef9SDimitry Andric     } else {
567*e8d8bef9SDimitry Andric       // Clamp bit support was added in VI, along with 16-bit operations.
568*e8d8bef9SDimitry Andric       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
569*e8d8bef9SDimitry Andric         .minScalar(0, S32)
570*e8d8bef9SDimitry Andric         .scalarize(0)
571*e8d8bef9SDimitry Andric         .lower();
5720b57cec5SDimitry Andric     }
5730b57cec5SDimitry Andric 
574*e8d8bef9SDimitry Andric     // FIXME: DAG expansion gets better results. The widening uses the smaller
575*e8d8bef9SDimitry Andric     // range values and goes for the min/max lowering directly.
576*e8d8bef9SDimitry Andric     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
577*e8d8bef9SDimitry Andric       .minScalar(0, S32)
578*e8d8bef9SDimitry Andric       .scalarize(0)
579*e8d8bef9SDimitry Andric       .lower();
580*e8d8bef9SDimitry Andric   }
581*e8d8bef9SDimitry Andric 
582480093f4SDimitry Andric   getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
5835ffd83dbSDimitry Andric     .customFor({S32, S64})
584480093f4SDimitry Andric     .clampScalar(0, S32, S64)
585480093f4SDimitry Andric     .widenScalarToNextPow2(0, 32)
586480093f4SDimitry Andric     .scalarize(0);
587480093f4SDimitry Andric 
588*e8d8bef9SDimitry Andric   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
5890b57cec5SDimitry Andric                    .legalFor({S32})
590*e8d8bef9SDimitry Andric                    .maxScalarOrElt(0, S32);
591*e8d8bef9SDimitry Andric 
592*e8d8bef9SDimitry Andric   if (ST.hasVOP3PInsts()) {
593*e8d8bef9SDimitry Andric     Mulh
594*e8d8bef9SDimitry Andric       .clampMaxNumElements(0, S8, 2)
595*e8d8bef9SDimitry Andric       .lowerFor({V2S8});
596*e8d8bef9SDimitry Andric   }
597*e8d8bef9SDimitry Andric 
598*e8d8bef9SDimitry Andric   Mulh
599*e8d8bef9SDimitry Andric     .scalarize(0)
600*e8d8bef9SDimitry Andric     .lower();
6010b57cec5SDimitry Andric 
6020b57cec5SDimitry Andric   // Report legal for any types we can handle anywhere. For the cases only legal
6030b57cec5SDimitry Andric   // on the SALU, RegBankSelect will be able to re-legalize.
6040b57cec5SDimitry Andric   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
6050b57cec5SDimitry Andric     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
6060b57cec5SDimitry Andric     .clampScalar(0, S32, S64)
6070b57cec5SDimitry Andric     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
6088bcb0991SDimitry Andric     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
6090b57cec5SDimitry Andric     .widenScalarToNextPow2(0)
6100b57cec5SDimitry Andric     .scalarize(0);
6110b57cec5SDimitry Andric 
6128bcb0991SDimitry Andric   getActionDefinitionsBuilder({G_UADDO, G_USUBO,
6130b57cec5SDimitry Andric                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
614480093f4SDimitry Andric     .legalFor({{S32, S1}, {S32, S32}})
6155ffd83dbSDimitry Andric     .minScalar(0, S32)
6165ffd83dbSDimitry Andric     // TODO: .scalarize(0)
6178bcb0991SDimitry Andric     .lower();
6180b57cec5SDimitry Andric 
6190b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_BITCAST)
6200b57cec5SDimitry Andric     // Don't worry about the size constraint.
6218bcb0991SDimitry Andric     .legalIf(all(isRegisterType(0), isRegisterType(1)))
6225ffd83dbSDimitry Andric     .lower();
6230b57cec5SDimitry Andric 
6240b57cec5SDimitry Andric 
6250b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_CONSTANT)
6268bcb0991SDimitry Andric     .legalFor({S1, S32, S64, S16, GlobalPtr,
6270b57cec5SDimitry Andric                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
628*e8d8bef9SDimitry Andric     .legalIf(isPointer(0))
6290b57cec5SDimitry Andric     .clampScalar(0, S32, S64)
630*e8d8bef9SDimitry Andric     .widenScalarToNextPow2(0);
6310b57cec5SDimitry Andric 
6325ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_FCONSTANT)
6335ffd83dbSDimitry Andric     .legalFor({S32, S64, S16})
6345ffd83dbSDimitry Andric     .clampScalar(0, S16, S64);
6358bcb0991SDimitry Andric 
6365ffd83dbSDimitry Andric   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
6375ffd83dbSDimitry Andric       .legalIf(isRegisterType(0))
6385ffd83dbSDimitry Andric       // s1 and s16 are special cases because they have legal operations on
6395ffd83dbSDimitry Andric       // them, but don't really occupy registers in the normal way.
6405ffd83dbSDimitry Andric       .legalFor({S1, S16})
6415ffd83dbSDimitry Andric       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
6425ffd83dbSDimitry Andric       .clampScalarOrElt(0, S32, MaxScalar)
6435ffd83dbSDimitry Andric       .widenScalarToNextPow2(0, 32)
6445ffd83dbSDimitry Andric       .clampMaxNumElements(0, S32, 16);
6455ffd83dbSDimitry Andric 
6465ffd83dbSDimitry Andric   setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
6475ffd83dbSDimitry Andric 
6485ffd83dbSDimitry Andric   // If the amount is divergent, we have to do a wave reduction to get the
6495ffd83dbSDimitry Andric   // maximum value, so this is expanded during RegBankSelect.
6505ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
6515ffd83dbSDimitry Andric     .legalFor({{PrivatePtr, S32}});
6525ffd83dbSDimitry Andric 
6535ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
654*e8d8bef9SDimitry Andric     .customIf(typeIsNot(0, PrivatePtr));
655*e8d8bef9SDimitry Andric 
6565ffd83dbSDimitry Andric   setAction({G_BLOCK_ADDR, CodePtr}, Legal);
6570b57cec5SDimitry Andric 
6580b57cec5SDimitry Andric   auto &FPOpActions = getActionDefinitionsBuilder(
6598bcb0991SDimitry Andric     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
6600b57cec5SDimitry Andric     .legalFor({S32, S64});
6618bcb0991SDimitry Andric   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
6628bcb0991SDimitry Andric     .customFor({S32, S64});
6638bcb0991SDimitry Andric   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
6648bcb0991SDimitry Andric     .customFor({S32, S64});
6650b57cec5SDimitry Andric 
6660b57cec5SDimitry Andric   if (ST.has16BitInsts()) {
6670b57cec5SDimitry Andric     if (ST.hasVOP3PInsts())
6680b57cec5SDimitry Andric       FPOpActions.legalFor({S16, V2S16});
6690b57cec5SDimitry Andric     else
6700b57cec5SDimitry Andric       FPOpActions.legalFor({S16});
6718bcb0991SDimitry Andric 
6728bcb0991SDimitry Andric     TrigActions.customFor({S16});
6738bcb0991SDimitry Andric     FDIVActions.customFor({S16});
6740b57cec5SDimitry Andric   }
6750b57cec5SDimitry Andric 
6760b57cec5SDimitry Andric   auto &MinNumMaxNum = getActionDefinitionsBuilder({
6770b57cec5SDimitry Andric       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
6780b57cec5SDimitry Andric 
6790b57cec5SDimitry Andric   if (ST.hasVOP3PInsts()) {
6800b57cec5SDimitry Andric     MinNumMaxNum.customFor(FPTypesPK16)
681480093f4SDimitry Andric       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
6820b57cec5SDimitry Andric       .clampMaxNumElements(0, S16, 2)
6830b57cec5SDimitry Andric       .clampScalar(0, S16, S64)
6840b57cec5SDimitry Andric       .scalarize(0);
6850b57cec5SDimitry Andric   } else if (ST.has16BitInsts()) {
6860b57cec5SDimitry Andric     MinNumMaxNum.customFor(FPTypes16)
6870b57cec5SDimitry Andric       .clampScalar(0, S16, S64)
6880b57cec5SDimitry Andric       .scalarize(0);
6890b57cec5SDimitry Andric   } else {
6900b57cec5SDimitry Andric     MinNumMaxNum.customFor(FPTypesBase)
6910b57cec5SDimitry Andric       .clampScalar(0, S32, S64)
6920b57cec5SDimitry Andric       .scalarize(0);
6930b57cec5SDimitry Andric   }
6940b57cec5SDimitry Andric 
6950b57cec5SDimitry Andric   if (ST.hasVOP3PInsts())
6960b57cec5SDimitry Andric     FPOpActions.clampMaxNumElements(0, S16, 2);
6978bcb0991SDimitry Andric 
6980b57cec5SDimitry Andric   FPOpActions
6990b57cec5SDimitry Andric     .scalarize(0)
7000b57cec5SDimitry Andric     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
7010b57cec5SDimitry Andric 
7028bcb0991SDimitry Andric   TrigActions
7038bcb0991SDimitry Andric     .scalarize(0)
7048bcb0991SDimitry Andric     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
7058bcb0991SDimitry Andric 
7068bcb0991SDimitry Andric   FDIVActions
7078bcb0991SDimitry Andric     .scalarize(0)
7088bcb0991SDimitry Andric     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
7098bcb0991SDimitry Andric 
7108bcb0991SDimitry Andric   getActionDefinitionsBuilder({G_FNEG, G_FABS})
7118bcb0991SDimitry Andric     .legalFor(FPTypesPK16)
7128bcb0991SDimitry Andric     .clampMaxNumElements(0, S16, 2)
7138bcb0991SDimitry Andric     .scalarize(0)
7148bcb0991SDimitry Andric     .clampScalar(0, S16, S64);
7158bcb0991SDimitry Andric 
7160b57cec5SDimitry Andric   if (ST.has16BitInsts()) {
7178bcb0991SDimitry Andric     getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
7180b57cec5SDimitry Andric       .legalFor({S32, S64, S16})
7190b57cec5SDimitry Andric       .scalarize(0)
7200b57cec5SDimitry Andric       .clampScalar(0, S16, S64);
7210b57cec5SDimitry Andric   } else {
7225ffd83dbSDimitry Andric     getActionDefinitionsBuilder(G_FSQRT)
7235ffd83dbSDimitry Andric       .legalFor({S32, S64})
7245ffd83dbSDimitry Andric       .scalarize(0)
7255ffd83dbSDimitry Andric       .clampScalar(0, S32, S64);
7265ffd83dbSDimitry Andric 
7275ffd83dbSDimitry Andric     if (ST.hasFractBug()) {
7285ffd83dbSDimitry Andric       getActionDefinitionsBuilder(G_FFLOOR)
7295ffd83dbSDimitry Andric         .customFor({S64})
7305ffd83dbSDimitry Andric         .legalFor({S32, S64})
7315ffd83dbSDimitry Andric         .scalarize(0)
7325ffd83dbSDimitry Andric         .clampScalar(0, S32, S64);
7335ffd83dbSDimitry Andric     } else {
7345ffd83dbSDimitry Andric       getActionDefinitionsBuilder(G_FFLOOR)
7350b57cec5SDimitry Andric         .legalFor({S32, S64})
7360b57cec5SDimitry Andric         .scalarize(0)
7370b57cec5SDimitry Andric         .clampScalar(0, S32, S64);
7380b57cec5SDimitry Andric     }
7395ffd83dbSDimitry Andric   }
7400b57cec5SDimitry Andric 
7410b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_FPTRUNC)
7420b57cec5SDimitry Andric     .legalFor({{S32, S64}, {S16, S32}})
7435ffd83dbSDimitry Andric     .scalarize(0)
7445ffd83dbSDimitry Andric     .lower();
7450b57cec5SDimitry Andric 
7460b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_FPEXT)
7470b57cec5SDimitry Andric     .legalFor({{S64, S32}, {S32, S16}})
748*e8d8bef9SDimitry Andric     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
7490b57cec5SDimitry Andric     .scalarize(0);
7500b57cec5SDimitry Andric 
7510b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_FSUB)
7520b57cec5SDimitry Andric       // Use actual fsub instruction
7530b57cec5SDimitry Andric       .legalFor({S32})
7540b57cec5SDimitry Andric       // Must use fadd + fneg
7550b57cec5SDimitry Andric       .lowerFor({S64, S16, V2S16})
7560b57cec5SDimitry Andric       .scalarize(0)
7570b57cec5SDimitry Andric       .clampScalar(0, S32, S64);
7580b57cec5SDimitry Andric 
7598bcb0991SDimitry Andric   // Whether this is legal depends on the floating point mode for the function.
7608bcb0991SDimitry Andric   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
7615ffd83dbSDimitry Andric   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
7628bcb0991SDimitry Andric     FMad.customFor({S32, S16});
7635ffd83dbSDimitry Andric   else if (ST.hasMadMacF32Insts())
7648bcb0991SDimitry Andric     FMad.customFor({S32});
7655ffd83dbSDimitry Andric   else if (ST.hasMadF16())
7665ffd83dbSDimitry Andric     FMad.customFor({S16});
7678bcb0991SDimitry Andric   FMad.scalarize(0)
7688bcb0991SDimitry Andric       .lower();
7698bcb0991SDimitry Andric 
770*e8d8bef9SDimitry Andric   auto &FRem = getActionDefinitionsBuilder(G_FREM);
771*e8d8bef9SDimitry Andric   if (ST.has16BitInsts()) {
772*e8d8bef9SDimitry Andric     FRem.customFor({S16, S32, S64});
773*e8d8bef9SDimitry Andric   } else {
774*e8d8bef9SDimitry Andric     FRem.minScalar(0, S32)
775*e8d8bef9SDimitry Andric         .customFor({S32, S64});
776*e8d8bef9SDimitry Andric   }
777*e8d8bef9SDimitry Andric   FRem.scalarize(0);
778*e8d8bef9SDimitry Andric 
7795ffd83dbSDimitry Andric   // TODO: Do we need to clamp maximum bitwidth?
7805ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_TRUNC)
7815ffd83dbSDimitry Andric     .legalIf(isScalar(0))
7825ffd83dbSDimitry Andric     .legalFor({{V2S16, V2S32}})
7835ffd83dbSDimitry Andric     .clampMaxNumElements(0, S16, 2)
7845ffd83dbSDimitry Andric     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
7855ffd83dbSDimitry Andric     // situations (like an invalid implicit use), we don't want to infinite loop
7865ffd83dbSDimitry Andric     // in the legalizer.
7875ffd83dbSDimitry Andric     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
7885ffd83dbSDimitry Andric     .alwaysLegal();
7895ffd83dbSDimitry Andric 
7900b57cec5SDimitry Andric   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
7910b57cec5SDimitry Andric     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
7925ffd83dbSDimitry Andric                {S32, S1}, {S64, S1}, {S16, S1}})
793480093f4SDimitry Andric     .scalarize(0)
7945ffd83dbSDimitry Andric     .clampScalar(0, S32, S64)
7955ffd83dbSDimitry Andric     .widenScalarToNextPow2(1, 32);
7960b57cec5SDimitry Andric 
7978bcb0991SDimitry Andric   // TODO: Split s1->s64 during regbankselect for VALU.
7988bcb0991SDimitry Andric   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
799480093f4SDimitry Andric     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
8000b57cec5SDimitry Andric     .lowerFor({{S32, S64}})
801480093f4SDimitry Andric     .lowerIf(typeIs(1, S1))
8028bcb0991SDimitry Andric     .customFor({{S64, S64}});
8038bcb0991SDimitry Andric   if (ST.has16BitInsts())
8048bcb0991SDimitry Andric     IToFP.legalFor({{S16, S16}});
8058bcb0991SDimitry Andric   IToFP.clampScalar(1, S32, S64)
806*e8d8bef9SDimitry Andric        .minScalar(0, S32)
8075ffd83dbSDimitry Andric        .scalarize(0)
8085ffd83dbSDimitry Andric        .widenScalarToNextPow2(1);
8090b57cec5SDimitry Andric 
8108bcb0991SDimitry Andric   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
8115ffd83dbSDimitry Andric     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
812*e8d8bef9SDimitry Andric     .customFor({{S64, S64}})
813*e8d8bef9SDimitry Andric     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
8148bcb0991SDimitry Andric   if (ST.has16BitInsts())
8158bcb0991SDimitry Andric     FPToI.legalFor({{S16, S16}});
8168bcb0991SDimitry Andric   else
8178bcb0991SDimitry Andric     FPToI.minScalar(1, S32);
8188bcb0991SDimitry Andric 
8198bcb0991SDimitry Andric   FPToI.minScalar(0, S32)
8205ffd83dbSDimitry Andric        .scalarize(0)
8215ffd83dbSDimitry Andric        .lower();
8220b57cec5SDimitry Andric 
823*e8d8bef9SDimitry Andric   // Lower roundeven into G_FRINT
824*e8d8bef9SDimitry Andric   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
825480093f4SDimitry Andric     .scalarize(0)
826480093f4SDimitry Andric     .lower();
8270b57cec5SDimitry Andric 
828480093f4SDimitry Andric   if (ST.has16BitInsts()) {
829480093f4SDimitry Andric     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
830480093f4SDimitry Andric       .legalFor({S16, S32, S64})
831480093f4SDimitry Andric       .clampScalar(0, S16, S64)
832480093f4SDimitry Andric       .scalarize(0);
833480093f4SDimitry Andric   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
8340b57cec5SDimitry Andric     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
8350b57cec5SDimitry Andric       .legalFor({S32, S64})
8360b57cec5SDimitry Andric       .clampScalar(0, S32, S64)
8370b57cec5SDimitry Andric       .scalarize(0);
8380b57cec5SDimitry Andric   } else {
8390b57cec5SDimitry Andric     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
8400b57cec5SDimitry Andric       .legalFor({S32})
8410b57cec5SDimitry Andric       .customFor({S64})
8420b57cec5SDimitry Andric       .clampScalar(0, S32, S64)
8430b57cec5SDimitry Andric       .scalarize(0);
8440b57cec5SDimitry Andric   }
8450b57cec5SDimitry Andric 
846480093f4SDimitry Andric   getActionDefinitionsBuilder(G_PTR_ADD)
847*e8d8bef9SDimitry Andric     .legalIf(all(isPointer(0), sameSize(0, 1)))
848*e8d8bef9SDimitry Andric     .scalarize(0)
849*e8d8bef9SDimitry Andric     .scalarSameSizeAs(1, 0);
8500b57cec5SDimitry Andric 
8515ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_PTRMASK)
852*e8d8bef9SDimitry Andric     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
853*e8d8bef9SDimitry Andric     .scalarSameSizeAs(1, 0)
8545ffd83dbSDimitry Andric     .scalarize(0);
8550b57cec5SDimitry Andric 
8560b57cec5SDimitry Andric   auto &CmpBuilder =
8570b57cec5SDimitry Andric     getActionDefinitionsBuilder(G_ICMP)
858480093f4SDimitry Andric     // The compare output type differs based on the register bank of the output,
859480093f4SDimitry Andric     // so make both s1 and s32 legal.
860480093f4SDimitry Andric     //
861480093f4SDimitry Andric     // Scalar compares producing output in scc will be promoted to s32, as that
862480093f4SDimitry Andric     // is the allocatable register type that will be needed for the copy from
863480093f4SDimitry Andric     // scc. This will be promoted during RegBankSelect, and we assume something
864480093f4SDimitry Andric     // before that won't try to use s32 result types.
865480093f4SDimitry Andric     //
866480093f4SDimitry Andric     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
867480093f4SDimitry Andric     // bank.
8680b57cec5SDimitry Andric     .legalForCartesianProduct(
8690b57cec5SDimitry Andric       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
870480093f4SDimitry Andric     .legalForCartesianProduct(
871480093f4SDimitry Andric       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
8720b57cec5SDimitry Andric   if (ST.has16BitInsts()) {
8730b57cec5SDimitry Andric     CmpBuilder.legalFor({{S1, S16}});
8740b57cec5SDimitry Andric   }
8750b57cec5SDimitry Andric 
8760b57cec5SDimitry Andric   CmpBuilder
8770b57cec5SDimitry Andric     .widenScalarToNextPow2(1)
8780b57cec5SDimitry Andric     .clampScalar(1, S32, S64)
8790b57cec5SDimitry Andric     .scalarize(0)
880480093f4SDimitry Andric     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
8810b57cec5SDimitry Andric 
8820b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_FCMP)
8830b57cec5SDimitry Andric     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
8840b57cec5SDimitry Andric     .widenScalarToNextPow2(1)
8850b57cec5SDimitry Andric     .clampScalar(1, S32, S64)
8860b57cec5SDimitry Andric     .scalarize(0);
8870b57cec5SDimitry Andric 
8885ffd83dbSDimitry Andric   // FIXME: fpow has a selection pattern that should move to custom lowering.
8895ffd83dbSDimitry Andric   auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
8905ffd83dbSDimitry Andric   if (ST.has16BitInsts())
8915ffd83dbSDimitry Andric     Exp2Ops.legalFor({S32, S16});
8925ffd83dbSDimitry Andric   else
8935ffd83dbSDimitry Andric     Exp2Ops.legalFor({S32});
8945ffd83dbSDimitry Andric   Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
8955ffd83dbSDimitry Andric   Exp2Ops.scalarize(0);
8965ffd83dbSDimitry Andric 
8975ffd83dbSDimitry Andric   auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
8985ffd83dbSDimitry Andric   if (ST.has16BitInsts())
8995ffd83dbSDimitry Andric     ExpOps.customFor({{S32}, {S16}});
9005ffd83dbSDimitry Andric   else
9015ffd83dbSDimitry Andric     ExpOps.customFor({S32});
9025ffd83dbSDimitry Andric   ExpOps.clampScalar(0, MinScalarFPTy, S32)
9030b57cec5SDimitry Andric         .scalarize(0);
9040b57cec5SDimitry Andric 
905*e8d8bef9SDimitry Andric   getActionDefinitionsBuilder(G_FPOWI)
906*e8d8bef9SDimitry Andric     .clampScalar(0, MinScalarFPTy, S32)
907*e8d8bef9SDimitry Andric     .lower();
908*e8d8bef9SDimitry Andric 
9090b57cec5SDimitry Andric   // The 64-bit versions produce 32-bit results, but only on the SALU.
9105ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_CTPOP)
9110b57cec5SDimitry Andric     .legalFor({{S32, S32}, {S32, S64}})
9120b57cec5SDimitry Andric     .clampScalar(0, S32, S32)
9130b57cec5SDimitry Andric     .clampScalar(1, S32, S64)
9140b57cec5SDimitry Andric     .scalarize(0)
9150b57cec5SDimitry Andric     .widenScalarToNextPow2(0, 32)
9160b57cec5SDimitry Andric     .widenScalarToNextPow2(1, 32);
9170b57cec5SDimitry Andric 
9185ffd83dbSDimitry Andric   // The hardware instructions return a different result on 0 than the generic
9195ffd83dbSDimitry Andric   // instructions expect. The hardware produces -1, but these produce the
9205ffd83dbSDimitry Andric   // bitwidth.
9215ffd83dbSDimitry Andric   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
9225ffd83dbSDimitry Andric     .scalarize(0)
9235ffd83dbSDimitry Andric     .clampScalar(0, S32, S32)
9245ffd83dbSDimitry Andric     .clampScalar(1, S32, S64)
9255ffd83dbSDimitry Andric     .widenScalarToNextPow2(0, 32)
9265ffd83dbSDimitry Andric     .widenScalarToNextPow2(1, 32)
9275ffd83dbSDimitry Andric     .lower();
9285ffd83dbSDimitry Andric 
9295ffd83dbSDimitry Andric   // The 64-bit versions produce 32-bit results, but only on the SALU.
9305ffd83dbSDimitry Andric   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
9315ffd83dbSDimitry Andric     .legalFor({{S32, S32}, {S32, S64}})
9325ffd83dbSDimitry Andric     .clampScalar(0, S32, S32)
9335ffd83dbSDimitry Andric     .clampScalar(1, S32, S64)
9345ffd83dbSDimitry Andric     .scalarize(0)
9355ffd83dbSDimitry Andric     .widenScalarToNextPow2(0, 32)
9365ffd83dbSDimitry Andric     .widenScalarToNextPow2(1, 32);
9375ffd83dbSDimitry Andric 
9385ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_BITREVERSE)
9390b57cec5SDimitry Andric     .legalFor({S32})
9400b57cec5SDimitry Andric     .clampScalar(0, S32, S32)
9410b57cec5SDimitry Andric     .scalarize(0);
9420b57cec5SDimitry Andric 
9430b57cec5SDimitry Andric   if (ST.has16BitInsts()) {
9445ffd83dbSDimitry Andric     getActionDefinitionsBuilder(G_BSWAP)
9455ffd83dbSDimitry Andric       .legalFor({S16, S32, V2S16})
9465ffd83dbSDimitry Andric       .clampMaxNumElements(0, S16, 2)
9475ffd83dbSDimitry Andric       // FIXME: Fixing non-power-of-2 before clamp is workaround for
9485ffd83dbSDimitry Andric       // narrowScalar limitation.
9495ffd83dbSDimitry Andric       .widenScalarToNextPow2(0)
9505ffd83dbSDimitry Andric       .clampScalar(0, S16, S32)
9515ffd83dbSDimitry Andric       .scalarize(0);
9525ffd83dbSDimitry Andric 
9530b57cec5SDimitry Andric     if (ST.hasVOP3PInsts()) {
9540b57cec5SDimitry Andric       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
9550b57cec5SDimitry Andric         .legalFor({S32, S16, V2S16})
9560b57cec5SDimitry Andric         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
9570b57cec5SDimitry Andric         .clampMaxNumElements(0, S16, 2)
9585ffd83dbSDimitry Andric         .minScalar(0, S16)
9590b57cec5SDimitry Andric         .widenScalarToNextPow2(0)
9605ffd83dbSDimitry Andric         .scalarize(0)
9615ffd83dbSDimitry Andric         .lower();
9620b57cec5SDimitry Andric     } else {
9630b57cec5SDimitry Andric       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
9640b57cec5SDimitry Andric         .legalFor({S32, S16})
9650b57cec5SDimitry Andric         .widenScalarToNextPow2(0)
9665ffd83dbSDimitry Andric         .minScalar(0, S16)
9675ffd83dbSDimitry Andric         .scalarize(0)
9685ffd83dbSDimitry Andric         .lower();
9690b57cec5SDimitry Andric     }
9700b57cec5SDimitry Andric   } else {
9715ffd83dbSDimitry Andric     // TODO: Should have same legality without v_perm_b32
9725ffd83dbSDimitry Andric     getActionDefinitionsBuilder(G_BSWAP)
9735ffd83dbSDimitry Andric       .legalFor({S32})
9745ffd83dbSDimitry Andric       .lowerIf(scalarNarrowerThan(0, 32))
9755ffd83dbSDimitry Andric       // FIXME: Fixing non-power-of-2 before clamp is workaround for
9765ffd83dbSDimitry Andric       // narrowScalar limitation.
9775ffd83dbSDimitry Andric       .widenScalarToNextPow2(0)
9785ffd83dbSDimitry Andric       .maxScalar(0, S32)
9795ffd83dbSDimitry Andric       .scalarize(0)
9805ffd83dbSDimitry Andric       .lower();
9815ffd83dbSDimitry Andric 
9820b57cec5SDimitry Andric     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
9830b57cec5SDimitry Andric       .legalFor({S32})
9845ffd83dbSDimitry Andric       .minScalar(0, S32)
9850b57cec5SDimitry Andric       .widenScalarToNextPow2(0)
9865ffd83dbSDimitry Andric       .scalarize(0)
9875ffd83dbSDimitry Andric       .lower();
9880b57cec5SDimitry Andric   }
9890b57cec5SDimitry Andric 
9900b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_INTTOPTR)
9910b57cec5SDimitry Andric     // List the common cases
9920b57cec5SDimitry Andric     .legalForCartesianProduct(AddrSpaces64, {S64})
9930b57cec5SDimitry Andric     .legalForCartesianProduct(AddrSpaces32, {S32})
9940b57cec5SDimitry Andric     .scalarize(0)
9950b57cec5SDimitry Andric     // Accept any address space as long as the size matches
9960b57cec5SDimitry Andric     .legalIf(sameSize(0, 1))
9970b57cec5SDimitry Andric     .widenScalarIf(smallerThan(1, 0),
9980b57cec5SDimitry Andric       [](const LegalityQuery &Query) {
9990b57cec5SDimitry Andric         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
10000b57cec5SDimitry Andric       })
10015ffd83dbSDimitry Andric     .narrowScalarIf(largerThan(1, 0),
10020b57cec5SDimitry Andric       [](const LegalityQuery &Query) {
10030b57cec5SDimitry Andric         return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
10040b57cec5SDimitry Andric       });
10050b57cec5SDimitry Andric 
10060b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_PTRTOINT)
10070b57cec5SDimitry Andric     // List the common cases
10080b57cec5SDimitry Andric     .legalForCartesianProduct(AddrSpaces64, {S64})
10090b57cec5SDimitry Andric     .legalForCartesianProduct(AddrSpaces32, {S32})
10100b57cec5SDimitry Andric     .scalarize(0)
10110b57cec5SDimitry Andric     // Accept any address space as long as the size matches
10120b57cec5SDimitry Andric     .legalIf(sameSize(0, 1))
10130b57cec5SDimitry Andric     .widenScalarIf(smallerThan(0, 1),
10140b57cec5SDimitry Andric       [](const LegalityQuery &Query) {
10150b57cec5SDimitry Andric         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
10160b57cec5SDimitry Andric       })
10170b57cec5SDimitry Andric     .narrowScalarIf(
10185ffd83dbSDimitry Andric       largerThan(0, 1),
10190b57cec5SDimitry Andric       [](const LegalityQuery &Query) {
10200b57cec5SDimitry Andric         return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
10210b57cec5SDimitry Andric       });
10220b57cec5SDimitry Andric 
10230b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
10240b57cec5SDimitry Andric     .scalarize(0)
10250b57cec5SDimitry Andric     .custom();
10260b57cec5SDimitry Andric 
  // Predicate deciding whether a G_LOAD/G_STORE must be split by the
  // legalizer: returns true when the queried memory access cannot be done as
  // a single operation on this subtarget (too wide for the address space,
  // an awkward register count, or insufficiently aligned).
  // NOTE(review): captures by value ([=]) — ST is presumably a reference to
  // the subtarget captured from the enclosing constructor; confirm lifetime.
  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    unsigned AlignBits = Query.MMODescrs[0].AlignInBits;

    // For extending loads (memory narrower than the result), treat the
    // access as at least as wide as its alignment when comparing sizes.
    if (MemSize < DstTy.getSizeInBits())
      MemSize = std::max(MemSize, AlignBits);

    // A vector result wider than what memory provides is a vector extload;
    // force a split.
    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    // Split accesses that exceed the maximum single-instruction size for
    // the pointer's address space.
    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;   // 32-bit registers needed
    if (NumRegs == 3) {
      // 96-bit accesses are only a single instruction with dwordx3 support.
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    // Under-aligned access: split unless the target reports that this
    // misaligned access is fast/allowed for this size and address space.
    if (AlignBits < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                      Align(AlignBits / 8));
    }

    return false;
  };
10668bcb0991SDimitry Andric 
1067*e8d8bef9SDimitry Andric   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1068*e8d8bef9SDimitry Andric   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1069*e8d8bef9SDimitry Andric   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
10708bcb0991SDimitry Andric 
10718bcb0991SDimitry Andric   // TODO: Refine based on subtargets which support unaligned access or 128-bit
10728bcb0991SDimitry Andric   // LDS
10738bcb0991SDimitry Andric   // TODO: Unsupported flat for SI.
10748bcb0991SDimitry Andric 
10758bcb0991SDimitry Andric   for (unsigned Op : {G_LOAD, G_STORE}) {
10768bcb0991SDimitry Andric     const bool IsStore = Op == G_STORE;
10778bcb0991SDimitry Andric 
10788bcb0991SDimitry Andric     auto &Actions = getActionDefinitionsBuilder(Op);
10795ffd83dbSDimitry Andric     // Explicitly list some common cases.
10805ffd83dbSDimitry Andric     // TODO: Does this help compile time at all?
                                // Each entry is {register type, pointer type, memory size in bits,
                                // minimum alignment in bits}.
10818bcb0991SDimitry Andric     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
10828bcb0991SDimitry Andric                                       {V2S32, GlobalPtr, 64, GlobalAlign32},
10838bcb0991SDimitry Andric                                       {V4S32, GlobalPtr, 128, GlobalAlign32},
10848bcb0991SDimitry Andric                                       {S64, GlobalPtr, 64, GlobalAlign32},
10858bcb0991SDimitry Andric                                       {V2S64, GlobalPtr, 128, GlobalAlign32},
10868bcb0991SDimitry Andric                                       {V2S16, GlobalPtr, 32, GlobalAlign32},
10878bcb0991SDimitry Andric                                       {S32, GlobalPtr, 8, GlobalAlign8},
10888bcb0991SDimitry Andric                                       {S32, GlobalPtr, 16, GlobalAlign16},
10898bcb0991SDimitry Andric 
10908bcb0991SDimitry Andric                                       {S32, LocalPtr, 32, 32},
10918bcb0991SDimitry Andric                                       {S64, LocalPtr, 64, 32},
10928bcb0991SDimitry Andric                                       {V2S32, LocalPtr, 64, 32},
10938bcb0991SDimitry Andric                                       {S32, LocalPtr, 8, 8},
10948bcb0991SDimitry Andric                                       {S32, LocalPtr, 16, 16},
10958bcb0991SDimitry Andric                                       {V2S16, LocalPtr, 32, 32},
10968bcb0991SDimitry Andric 
10978bcb0991SDimitry Andric                                       {S32, PrivatePtr, 32, 32},
10988bcb0991SDimitry Andric                                       {S32, PrivatePtr, 8, 8},
10998bcb0991SDimitry Andric                                       {S32, PrivatePtr, 16, 16},
11008bcb0991SDimitry Andric                                       {V2S16, PrivatePtr, 32, 32},
11018bcb0991SDimitry Andric 
11028bcb0991SDimitry Andric                                       {S32, ConstantPtr, 32, GlobalAlign32},
11038bcb0991SDimitry Andric                                       {V2S32, ConstantPtr, 64, GlobalAlign32},
11048bcb0991SDimitry Andric                                       {V4S32, ConstantPtr, 128, GlobalAlign32},
11058bcb0991SDimitry Andric                                       {S64, ConstantPtr, 64, GlobalAlign32},
11068bcb0991SDimitry Andric                                       {V2S32, ConstantPtr, 32, GlobalAlign32}});
                                // Beyond the explicit table, accept any combination the generic
                                // subtarget-aware predicate deems directly selectable.
11075ffd83dbSDimitry Andric     Actions.legalIf(
11085ffd83dbSDimitry Andric       [=](const LegalityQuery &Query) -> bool {
11095ffd83dbSDimitry Andric         return isLoadStoreLegal(ST, Query, Op);
11105ffd83dbSDimitry Andric       });
11115ffd83dbSDimitry Andric 
11125ffd83dbSDimitry Andric     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
11135ffd83dbSDimitry Andric     // 64-bits.
11145ffd83dbSDimitry Andric     //
11155ffd83dbSDimitry Andric     // TODO: Should generalize bitcast action into coerce, which will also cover
11165ffd83dbSDimitry Andric     // inserting addrspacecasts.
11175ffd83dbSDimitry Andric     Actions.customIf(typeIs(1, Constant32Ptr));
11185ffd83dbSDimitry Andric 
11195ffd83dbSDimitry Andric     // Turn any illegal element vectors into something easier to deal
11205ffd83dbSDimitry Andric     // with. These will ultimately produce 32-bit scalar shifts to extract the
11215ffd83dbSDimitry Andric     // parts anyway.
11225ffd83dbSDimitry Andric     //
11235ffd83dbSDimitry Andric     // For odd 16-bit element vectors, prefer to split those into pieces with
11245ffd83dbSDimitry Andric     // 16-bit vector parts.
11255ffd83dbSDimitry Andric     Actions.bitcastIf(
11265ffd83dbSDimitry Andric       [=](const LegalityQuery &Query) -> bool {
1127*e8d8bef9SDimitry Andric         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1128*e8d8bef9SDimitry Andric                                           Query.MMODescrs[0].SizeInBits);
11295ffd83dbSDimitry Andric       }, bitcastToRegisterType(0));
11305ffd83dbSDimitry Andric 
1131*e8d8bef9SDimitry Andric     if (!IsStore) {
1132*e8d8bef9SDimitry Andric       // Widen suitably aligned loads by loading extra bytes. The standard
1133*e8d8bef9SDimitry Andric       // legalization actions can't properly express widening memory operands.
1134*e8d8bef9SDimitry Andric       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1135*e8d8bef9SDimitry Andric         return shouldWidenLoad(ST, Query, G_LOAD);
1136*e8d8bef9SDimitry Andric       });
1137*e8d8bef9SDimitry Andric     }
1138*e8d8bef9SDimitry Andric 
1139*e8d8bef9SDimitry Andric     // FIXME: load/store narrowing should be moved to lower action
11408bcb0991SDimitry Andric     Actions
                                // Scalar case: pick the narrower scalar type to split to.
11418bcb0991SDimitry Andric         .narrowScalarIf(
11428bcb0991SDimitry Andric             [=](const LegalityQuery &Query) -> bool {
11435ffd83dbSDimitry Andric               return !Query.Types[0].isVector() &&
11445ffd83dbSDimitry Andric                      needToSplitMemOp(Query, Op == G_LOAD);
11458bcb0991SDimitry Andric             },
11468bcb0991SDimitry Andric             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
11478bcb0991SDimitry Andric               const LLT DstTy = Query.Types[0];
11488bcb0991SDimitry Andric               const LLT PtrTy = Query.Types[1];
11498bcb0991SDimitry Andric 
11508bcb0991SDimitry Andric               const unsigned DstSize = DstTy.getSizeInBits();
11518bcb0991SDimitry Andric               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
11528bcb0991SDimitry Andric 
11538bcb0991SDimitry Andric               // Split extloads.
11548bcb0991SDimitry Andric               if (DstSize > MemSize)
11558bcb0991SDimitry Andric                 return std::make_pair(0, LLT::scalar(MemSize));
11568bcb0991SDimitry Andric 
11575ffd83dbSDimitry Andric               if (!isPowerOf2_32(DstSize)) {
11585ffd83dbSDimitry Andric                 // We're probably decomposing an odd sized store. Try to split
11595ffd83dbSDimitry Andric                 // to the widest type. TODO: Account for alignment. As-is it
11605ffd83dbSDimitry Andric                 // should be OK, since the new parts will be further legalized.
11615ffd83dbSDimitry Andric                 unsigned FloorSize = PowerOf2Floor(DstSize);
11625ffd83dbSDimitry Andric                 return std::make_pair(0, LLT::scalar(FloorSize));
11635ffd83dbSDimitry Andric               }
11645ffd83dbSDimitry Andric 
11658bcb0991SDimitry Andric               if (DstSize > 32 && (DstSize % 32 != 0)) {
11668bcb0991SDimitry Andric                 // FIXME: Need a way to specify non-extload of larger size if
11678bcb0991SDimitry Andric                 // suitably aligned.
11688bcb0991SDimitry Andric                 return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
11698bcb0991SDimitry Andric               }
11708bcb0991SDimitry Andric 
11715ffd83dbSDimitry Andric               unsigned MaxSize = maxSizeForAddrSpace(ST,
11725ffd83dbSDimitry Andric                                                      PtrTy.getAddressSpace(),
11735ffd83dbSDimitry Andric                                                      Op == G_LOAD);
11748bcb0991SDimitry Andric               if (MemSize > MaxSize)
11758bcb0991SDimitry Andric                 return std::make_pair(0, LLT::scalar(MaxSize));
11768bcb0991SDimitry Andric 
                                          // Last resort: split down to the alignment, interpreted
                                          // as a size in bits. NOTE(review): assumes AlignInBits is
                                          // a usable scalar size here — confirm.
11778bcb0991SDimitry Andric               unsigned Align = Query.MMODescrs[0].AlignInBits;
11788bcb0991SDimitry Andric               return std::make_pair(0, LLT::scalar(Align));
11798bcb0991SDimitry Andric             })
                                // Vector case: reduce the element count instead.
11808bcb0991SDimitry Andric         .fewerElementsIf(
11818bcb0991SDimitry Andric             [=](const LegalityQuery &Query) -> bool {
11825ffd83dbSDimitry Andric               return Query.Types[0].isVector() &&
11835ffd83dbSDimitry Andric                      needToSplitMemOp(Query, Op == G_LOAD);
11848bcb0991SDimitry Andric             },
11858bcb0991SDimitry Andric             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
11868bcb0991SDimitry Andric               const LLT DstTy = Query.Types[0];
11878bcb0991SDimitry Andric               const LLT PtrTy = Query.Types[1];
11888bcb0991SDimitry Andric 
11898bcb0991SDimitry Andric               LLT EltTy = DstTy.getElementType();
11905ffd83dbSDimitry Andric               unsigned MaxSize = maxSizeForAddrSpace(ST,
11915ffd83dbSDimitry Andric                                                      PtrTy.getAddressSpace(),
11925ffd83dbSDimitry Andric                                                      Op == G_LOAD);
11935ffd83dbSDimitry Andric 
11945ffd83dbSDimitry Andric               // FIXME: Handle widened to power of 2 results better. This ends
11955ffd83dbSDimitry Andric               // up scalarizing.
11965ffd83dbSDimitry Andric               // FIXME: 3 element stores scalarized on SI
11978bcb0991SDimitry Andric 
11988bcb0991SDimitry Andric               // Split if it's too large for the address space.
11998bcb0991SDimitry Andric               if (Query.MMODescrs[0].SizeInBits > MaxSize) {
12008bcb0991SDimitry Andric                 unsigned NumElts = DstTy.getNumElements();
12015ffd83dbSDimitry Andric                 unsigned EltSize = EltTy.getSizeInBits();
12025ffd83dbSDimitry Andric 
12035ffd83dbSDimitry Andric                 if (MaxSize % EltSize == 0) {
12045ffd83dbSDimitry Andric                   return std::make_pair(
12055ffd83dbSDimitry Andric                     0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
12065ffd83dbSDimitry Andric                 }
12075ffd83dbSDimitry Andric 
12088bcb0991SDimitry Andric                 unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
12098bcb0991SDimitry Andric 
12108bcb0991SDimitry Andric                 // FIXME: Refine when odd breakdowns handled
12118bcb0991SDimitry Andric                 // The scalars will need to be re-legalized.
12128bcb0991SDimitry Andric                 if (NumPieces == 1 || NumPieces >= NumElts ||
12138bcb0991SDimitry Andric                     NumElts % NumPieces != 0)
12148bcb0991SDimitry Andric                   return std::make_pair(0, EltTy);
12158bcb0991SDimitry Andric 
12168bcb0991SDimitry Andric                 return std::make_pair(0,
12178bcb0991SDimitry Andric                                       LLT::vector(NumElts / NumPieces, EltTy));
12188bcb0991SDimitry Andric               }
12198bcb0991SDimitry Andric 
12205ffd83dbSDimitry Andric               // FIXME: We could probably handle weird extending loads better.
12215ffd83dbSDimitry Andric               unsigned MemSize = Query.MMODescrs[0].SizeInBits;
12225ffd83dbSDimitry Andric               if (DstTy.getSizeInBits() > MemSize)
12235ffd83dbSDimitry Andric                 return std::make_pair(0, EltTy);
12245ffd83dbSDimitry Andric 
12255ffd83dbSDimitry Andric               unsigned EltSize = EltTy.getSizeInBits();
12265ffd83dbSDimitry Andric               unsigned DstSize = DstTy.getSizeInBits();
12275ffd83dbSDimitry Andric               if (!isPowerOf2_32(DstSize)) {
12285ffd83dbSDimitry Andric                 // We're probably decomposing an odd sized store. Try to split
12295ffd83dbSDimitry Andric                 // to the widest type. TODO: Account for alignment. As-is it
12305ffd83dbSDimitry Andric                 // should be OK, since the new parts will be further legalized.
12315ffd83dbSDimitry Andric                 unsigned FloorSize = PowerOf2Floor(DstSize);
12325ffd83dbSDimitry Andric                 return std::make_pair(
12335ffd83dbSDimitry Andric                   0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
12345ffd83dbSDimitry Andric               }
12355ffd83dbSDimitry Andric 
12368bcb0991SDimitry Andric               // Need to split because of alignment.
12378bcb0991SDimitry Andric               unsigned Align = Query.MMODescrs[0].AlignInBits;
12388bcb0991SDimitry Andric               if (EltSize > Align &&
12398bcb0991SDimitry Andric                   (EltSize / Align < DstTy.getNumElements())) {
12408bcb0991SDimitry Andric                 return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
12418bcb0991SDimitry Andric               }
12428bcb0991SDimitry Andric 
12438bcb0991SDimitry Andric               // May need relegalization for the scalars.
12448bcb0991SDimitry Andric               return std::make_pair(0, EltTy);
12458bcb0991SDimitry Andric             })
1246*e8d8bef9SDimitry Andric     .lowerIfMemSizeNotPow2()
1247​8bcb0991SDimitry Andric     .minScalar(0, S32);
12488bcb0991SDimitry Andric 
                                // Wide scalar truncating stores are first narrowed to 32-bit.
12498bcb0991SDimitry Andric     if (IsStore)
12508bcb0991SDimitry Andric       Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
12518bcb0991SDimitry Andric 
12528bcb0991SDimitry Andric     Actions
12538bcb0991SDimitry Andric         .widenScalarToNextPow2(0)
1254*e8d8bef9SDimitry Andric         .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1255*e8d8bef9SDimitry Andric         .lower();
12568bcb0991SDimitry Andric   }
12570b57cec5SDimitry Andric 
                              // Sign/zero-extending loads: only sub-dword memory sizes extending
                              // into a 32-bit register are natively supported.
12580b57cec5SDimitry Andric   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
12598bcb0991SDimitry Andric                        .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
12608bcb0991SDimitry Andric                                                   {S32, GlobalPtr, 16, 2 * 8},
12610b57cec5SDimitry Andric                                                   {S32, LocalPtr, 8, 8},
12628bcb0991SDimitry Andric                                                   {S32, LocalPtr, 16, 16},
12630b57cec5SDimitry Andric                                                   {S32, PrivatePtr, 8, 8},
12648bcb0991SDimitry Andric                                                   {S32, PrivatePtr, 16, 16},
12658bcb0991SDimitry Andric                                                   {S32, ConstantPtr, 8, 8},
12668bcb0991SDimitry Andric                                                   {S32, ConstantPtr, 16, 2 * 8}});
12670b57cec5SDimitry Andric   if (ST.hasFlatAddressSpace()) {
12688bcb0991SDimitry Andric     ExtLoads.legalForTypesWithMemDesc(
12698bcb0991SDimitry Andric         {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
12700b57cec5SDimitry Andric   }
12710b57cec5SDimitry Andric 
12720b57cec5SDimitry Andric   ExtLoads.clampScalar(0, S32, S32)
12730b57cec5SDimitry Andric           .widenScalarToNextPow2(0)
12740b57cec5SDimitry Andric           .unsupportedIfMemSizeNotPow2()
12750b57cec5SDimitry Andric           .lower();
12760b57cec5SDimitry Andric 
                              // 32/64-bit integer read-modify-write atomics on global, LDS and
                              // region pointers; flat is added only when the subtarget has it.
12770b57cec5SDimitry Andric   auto &Atomics = getActionDefinitionsBuilder(
12780b57cec5SDimitry Andric     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
12790b57cec5SDimitry Andric      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
12800b57cec5SDimitry Andric      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1281480093f4SDimitry Andric      G_ATOMICRMW_UMIN})
12820b57cec5SDimitry Andric     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1283*e8d8bef9SDimitry Andric                {S64, GlobalPtr}, {S64, LocalPtr},
1284*e8d8bef9SDimitry Andric                {S32, RegionPtr}, {S64, RegionPtr}});
12850b57cec5SDimitry Andric   if (ST.hasFlatAddressSpace()) {
12860b57cec5SDimitry Andric     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
12870b57cec5SDimitry Andric   }
12880b57cec5SDimitry Andric 
12895ffd83dbSDimitry Andric   if (ST.hasLDSFPAtomics()) {
12908bcb0991SDimitry Andric     getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
1291*e8d8bef9SDimitry Andric       .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
12925ffd83dbSDimitry Andric   }
12938bcb0991SDimitry Andric 
1294480093f4SDimitry Andric   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1295480093f4SDimitry Andric   // demarshalling
1296480093f4SDimitry Andric   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1297480093f4SDimitry Andric     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1298480093f4SDimitry Andric                 {S32, FlatPtr}, {S64, FlatPtr}})
1299480093f4SDimitry Andric     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1300480093f4SDimitry Andric                {S32, RegionPtr}, {S64, RegionPtr}});
13010b57cec5SDimitry Andric   // TODO: Pointer types, any 32-bit or 64-bit vector
1302480093f4SDimitry Andric 
1303480093f4SDimitry Andric   // Condition should be s32 for scalar, s1 for vector.
13040b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_SELECT)
13050b57cec5SDimitry Andric     .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
13060b57cec5SDimitry Andric           GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
1307480093f4SDimitry Andric           LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
13080b57cec5SDimitry Andric     .clampScalar(0, S16, S64)
13095ffd83dbSDimitry Andric     .scalarize(1)
13100b57cec5SDimitry Andric     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
13110b57cec5SDimitry Andric     .fewerElementsIf(numElementsNotEven(0), scalarize(0))
13120b57cec5SDimitry Andric     .clampMaxNumElements(0, S32, 2)
13130b57cec5SDimitry Andric     .clampMaxNumElements(0, LocalPtr, 2)
13140b57cec5SDimitry Andric     .clampMaxNumElements(0, PrivatePtr, 2)
13150b57cec5SDimitry Andric     .scalarize(0)
13160b57cec5SDimitry Andric     .widenScalarToNextPow2(0)
1317480093f4SDimitry Andric     .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
13180b57cec5SDimitry Andric 
13190b57cec5SDimitry Andric   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
13200b57cec5SDimitry Andric   // be more flexible with the shift amount type.
13210b57cec5SDimitry Andric   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
13220b57cec5SDimitry Andric     .legalFor({{S32, S32}, {S64, S32}});
13230b57cec5SDimitry Andric   if (ST.has16BitInsts()) {
13240b57cec5SDimitry Andric     if (ST.hasVOP3PInsts()) {
13255ffd83dbSDimitry Andric       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
13260b57cec5SDimitry Andric             .clampMaxNumElements(0, S16, 2);
13270b57cec5SDimitry Andric     } else
13285ffd83dbSDimitry Andric       Shifts.legalFor({{S16, S16}});
13290b57cec5SDimitry Andric 
13305ffd83dbSDimitry Andric     // TODO: Support 16-bit shift amounts for all types
13315ffd83dbSDimitry Andric     Shifts.widenScalarIf(
13325ffd83dbSDimitry Andric       [=](const LegalityQuery &Query) {
13335ffd83dbSDimitry Andric         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
13345ffd83dbSDimitry Andric         // 32-bit amount.
13355ffd83dbSDimitry Andric         const LLT ValTy = Query.Types[0];
13365ffd83dbSDimitry Andric         const LLT AmountTy = Query.Types[1];
13375ffd83dbSDimitry Andric         return ValTy.getSizeInBits() <= 16 &&
13385ffd83dbSDimitry Andric                AmountTy.getSizeInBits() < 16;
13395ffd83dbSDimitry Andric       }, changeTo(1, S16));
13405ffd83dbSDimitry Andric     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1341480093f4SDimitry Andric     Shifts.clampScalar(1, S32, S32);
13420b57cec5SDimitry Andric     Shifts.clampScalar(0, S16, S64);
13430b57cec5SDimitry Andric     Shifts.widenScalarToNextPow2(0, 16);
1344*e8d8bef9SDimitry Andric 
                                // Saturating shifts have no native instruction; expand after
                                // clamping to the smallest natively shiftable width.
1345*e8d8bef9SDimitry Andric     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1346*e8d8bef9SDimitry Andric       .minScalar(0, S16)
1347*e8d8bef9SDimitry Andric       .scalarize(0)
1348*e8d8bef9SDimitry Andric       .lower();
13490b57cec5SDimitry Andric   } else {
13500b57cec5SDimitry Andric     // Make sure we legalize the shift amount type first, as the general
13510b57cec5SDimitry Andric     // expansion for the shifted type will produce much worse code if it hasn't
13520b57cec5SDimitry Andric     // been truncated already.
13530b57cec5SDimitry Andric     Shifts.clampScalar(1, S32, S32);
13540b57cec5SDimitry Andric     Shifts.clampScalar(0, S32, S64);
13550b57cec5SDimitry Andric     Shifts.widenScalarToNextPow2(0, 32);
1356*e8d8bef9SDimitry Andric 
1357*e8d8bef9SDimitry Andric     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1358*e8d8bef9SDimitry Andric       .minScalar(0, S32)
1359*e8d8bef9SDimitry Andric       .scalarize(0)
1360*e8d8bef9SDimitry Andric       .lower();
13610b57cec5SDimitry Andric   }
13620b57cec5SDimitry Andric   Shifts.scalarize(0);
13630b57cec5SDimitry Andric 
13640b57cec5SDimitry Andric   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
13650b57cec5SDimitry Andric     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
13660b57cec5SDimitry Andric     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
13670b57cec5SDimitry Andric     unsigned IdxTypeIdx = 2;
13680b57cec5SDimitry Andric 
13690b57cec5SDimitry Andric     getActionDefinitionsBuilder(Op)
                                  // 32/64-bit elements of dword-multiple vectors up to the max
                                  // register size, with a 32-bit index, are handled custom.
13700b57cec5SDimitry Andric       .customIf([=](const LegalityQuery &Query) {
13710b57cec5SDimitry Andric           const LLT EltTy = Query.Types[EltTypeIdx];
13720b57cec5SDimitry Andric           const LLT VecTy = Query.Types[VecTypeIdx];
13730b57cec5SDimitry Andric           const LLT IdxTy = Query.Types[IdxTypeIdx];
1374*e8d8bef9SDimitry Andric           const unsigned EltSize = EltTy.getSizeInBits();
1375*e8d8bef9SDimitry Andric           return (EltSize == 32 || EltSize == 64) &&
13760b57cec5SDimitry Andric                   VecTy.getSizeInBits() % 32 == 0 &&
13775ffd83dbSDimitry Andric                   VecTy.getSizeInBits() <= MaxRegisterSize &&
13780b57cec5SDimitry Andric                   IdxTy.getSizeInBits() == 32;
13790b57cec5SDimitry Andric         })
1380*e8d8bef9SDimitry Andric       .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1381*e8d8bef9SDimitry Andric                  bitcastToVectorElement32(VecTypeIdx))
1382*e8d8bef9SDimitry Andric       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1383*e8d8bef9SDimitry Andric       .bitcastIf(
1384*e8d8bef9SDimitry Andric         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1385*e8d8bef9SDimitry Andric         [=](const LegalityQuery &Query) {
1386*e8d8bef9SDimitry Andric           // For > 64-bit element types, try to turn this into a 64-bit
1387*e8d8bef9SDimitry Andric           // element vector since we may be able to do better indexing
1388*e8d8bef9SDimitry Andric           // if this is scalar. If not, fall back to 32.
1389*e8d8bef9SDimitry Andric           const LLT EltTy = Query.Types[EltTypeIdx];
1390*e8d8bef9SDimitry Andric           const LLT VecTy = Query.Types[VecTypeIdx];
1391*e8d8bef9SDimitry Andric           const unsigned DstEltSize = EltTy.getSizeInBits();
1392*e8d8bef9SDimitry Andric           const unsigned VecSize = VecTy.getSizeInBits();
1393*e8d8bef9SDimitry Andric 
1394*e8d8bef9SDimitry Andric           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1395*e8d8bef9SDimitry Andric           return std::make_pair(
1396*e8d8bef9SDimitry Andric             VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
1397*e8d8bef9SDimitry Andric         })
13980b57cec5SDimitry Andric       .clampScalar(EltTypeIdx, S32, S64)
13990b57cec5SDimitry Andric       .clampScalar(VecTypeIdx, S32, S64)
1400*e8d8bef9SDimitry Andric       .clampScalar(IdxTypeIdx, S32, S32)
1401*e8d8bef9SDimitry Andric       .clampMaxNumElements(VecTypeIdx, S32, 32)
1402*e8d8bef9SDimitry Andric       // TODO: Clamp elements for 64-bit vectors?
1403*e8d8bef9SDimitry Andric       // It should only be necessary with variable indexes.
1404*e8d8bef9SDimitry Andric       // As a last resort, lower to the stack
1405*e8d8bef9SDimitry Andric       .lower();
14060b57cec5SDimitry Andric   }
14070b57cec5SDimitry Andric 
                              // The extracted scalar must be exactly the vector's element type;
                              // mismatched element/result types are rejected outright.
14080b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
14090b57cec5SDimitry Andric     .unsupportedIf([=](const LegalityQuery &Query) {
14100b57cec5SDimitry Andric         const LLT &EltTy = Query.Types[1].getElementType();
14110b57cec5SDimitry Andric         return Query.Types[0] != EltTy;
14120b57cec5SDimitry Andric       });
14130b57cec5SDimitry Andric 
14140b57cec5SDimitry Andric   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
14150b57cec5SDimitry Andric     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
14160b57cec5SDimitry Andric     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
14170b57cec5SDimitry Andric 
14180b57cec5SDimitry Andric     // FIXME: Doesn't handle extract of illegal sizes.
14190b57cec5SDimitry Andric     getActionDefinitionsBuilder(Op)
14208bcb0991SDimitry Andric       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
14218bcb0991SDimitry Andric       // FIXME: Multiples of 16 should not be legal.
14220b57cec5SDimitry Andric       .legalIf([=](const LegalityQuery &Query) {
14230b57cec5SDimitry Andric           const LLT BigTy = Query.Types[BigTyIdx];
14240b57cec5SDimitry Andric           const LLT LitTy = Query.Types[LitTyIdx];
14250b57cec5SDimitry Andric           return (BigTy.getSizeInBits() % 32 == 0) &&
14260b57cec5SDimitry Andric                  (LitTy.getSizeInBits() % 16 == 0);
14270b57cec5SDimitry Andric         })
14280b57cec5SDimitry Andric       .widenScalarIf(
14290b57cec5SDimitry Andric         [=](const LegalityQuery &Query) {
14300b57cec5SDimitry Andric           const LLT BigTy = Query.Types[BigTyIdx];
14310b57cec5SDimitry Andric           return (BigTy.getScalarSizeInBits() < 16);
14320b57cec5SDimitry Andric         },
14330b57cec5SDimitry Andric         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
14340b57cec5SDimitry Andric       .widenScalarIf(
14350b57cec5SDimitry Andric         [=](const LegalityQuery &Query) {
14360b57cec5SDimitry Andric           const LLT LitTy = Query.Types[LitTyIdx];
14370b57cec5SDimitry Andric           return (LitTy.getScalarSizeInBits() < 16);
14380b57cec5SDimitry Andric         },
14390b57cec5SDimitry Andric         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
14400b57cec5SDimitry Andric       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
14410b57cec5SDimitry Andric       .widenScalarToNextPow2(BigTyIdx, 32);
14420b57cec5SDimitry Andric 
14430b57cec5SDimitry Andric   }
14440b57cec5SDimitry Andric 
14458bcb0991SDimitry Andric   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
14460b57cec5SDimitry Andric     .legalForCartesianProduct(AllS32Vectors, {S32})
14470b57cec5SDimitry Andric     .legalForCartesianProduct(AllS64Vectors, {S64})
14488bcb0991SDimitry Andric     .clampNumElements(0, V16S32, V32S32)
14498bcb0991SDimitry Andric     .clampNumElements(0, V2S64, V16S64)
14508bcb0991SDimitry Andric     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
14518bcb0991SDimitry Andric 
14528bcb0991SDimitry Andric   if (ST.hasScalarPackInsts()) {
14535ffd83dbSDimitry Andric     BuildVector
14545ffd83dbSDimitry Andric       // FIXME: Should probably widen s1 vectors straight to s32
14555ffd83dbSDimitry Andric       .minScalarOrElt(0, S16)
14565ffd83dbSDimitry Andric       // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
14575ffd83dbSDimitry Andric       .minScalar(1, S32);
14585ffd83dbSDimitry Andric 
14598bcb0991SDimitry Andric     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
14608bcb0991SDimitry Andric       .legalFor({V2S16, S32})
14618bcb0991SDimitry Andric       .lower();
14625ffd83dbSDimitry Andric     BuildVector.minScalarOrElt(0, S32);
14638bcb0991SDimitry Andric   } else {
                                // No pack instructions: v2s16 build_vector needs custom handling.
14645ffd83dbSDimitry Andric     BuildVector.customFor({V2S16, S16});
14655ffd83dbSDimitry Andric     BuildVector.minScalarOrElt(0, S32);
14665ffd83dbSDimitry Andric 
14678bcb0991SDimitry Andric     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
14685ffd83dbSDimitry Andric       .customFor({V2S16, S32})
14698bcb0991SDimitry Andric       .lower();
14708bcb0991SDimitry Andric   }
14718bcb0991SDimitry Andric 
14725ffd83dbSDimitry Andric   BuildVector.legalIf(isRegisterType(0));
14735ffd83dbSDimitry Andric 
14745ffd83dbSDimitry Andric   // FIXME: Clamp maximum size
14750b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1476*e8d8bef9SDimitry Andric     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1477*e8d8bef9SDimitry Andric     .clampMaxNumElements(0, S32, 32)
1478*e8d8bef9SDimitry Andric     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1479*e8d8bef9SDimitry Andric     .clampMaxNumElements(0, S16, 64);
14800b57cec5SDimitry Andric 
14815ffd83dbSDimitry Andric   // TODO: Don't fully scalarize v2s16 pieces? Or combine out thosse
14825ffd83dbSDimitry Andric   // pre-legalize.
14835ffd83dbSDimitry Andric   if (ST.hasVOP3PInsts()) {
14845ffd83dbSDimitry Andric     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
14855ffd83dbSDimitry Andric       .customFor({V2S16, V2S16})
14865ffd83dbSDimitry Andric       .lower();
14875ffd83dbSDimitry Andric   } else
14888bcb0991SDimitry Andric     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
14898bcb0991SDimitry Andric 
14900b57cec5SDimitry Andric   // Merge/Unmerge
14910b57cec5SDimitry Andric   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
14920b57cec5SDimitry Andric     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
14930b57cec5SDimitry Andric     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
14940b57cec5SDimitry Andric 
                                // A vector element type is "valid" only if it is a power-of-2 size
                                // in the [8, 512]-bit range.
14950b57cec5SDimitry Andric     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
14965ffd83dbSDimitry Andric       const LLT Ty = Query.Types[TypeIdx];
14970b57cec5SDimitry Andric       if (Ty.isVector()) {
14980b57cec5SDimitry Andric         const LLT &EltTy = Ty.getElementType();
14995ffd83dbSDimitry Andric         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
15000b57cec5SDimitry Andric           return true;
15010b57cec5SDimitry Andric         if (!isPowerOf2_32(EltTy.getSizeInBits()))
15020b57cec5SDimitry Andric           return true;
15030b57cec5SDimitry Andric       }
15040b57cec5SDimitry Andric       return false;
15050b57cec5SDimitry Andric     };
15060b57cec5SDimitry Andric 
15078bcb0991SDimitry Andric     auto &Builder = getActionDefinitionsBuilder(Op)
1508*e8d8bef9SDimitry Andric       .legalIf(all(isRegisterType(0), isRegisterType(1)))
15095ffd83dbSDimitry Andric       .lowerFor({{S16, V2S16}})
15105ffd83dbSDimitry Andric       .lowerIf([=](const LegalityQuery &Query) {
15115ffd83dbSDimitry Andric           const LLT BigTy = Query.Types[BigTyIdx];
15125ffd83dbSDimitry Andric           return BigTy.getSizeInBits() == 32;
15135ffd83dbSDimitry Andric         })
15145ffd83dbSDimitry Andric       // Try to widen to s16 first for small types.
15155ffd83dbSDimitry Andric       // TODO: Only do this on targets with legal s16 shifts
15165ffd83dbSDimitry Andric       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
15170b57cec5SDimitry Andric       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
15188bcb0991SDimitry Andric       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
15198bcb0991SDimitry Andric       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
15208bcb0991SDimitry Andric                            elementTypeIs(1, S16)),
15218bcb0991SDimitry Andric                        changeTo(1, V2S16))
15225ffd83dbSDimitry Andric       // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
15235ffd83dbSDimitry Andric       // worth considering the multiples of 64 since 2*192 and 2*384 are not
15245ffd83dbSDimitry Andric       // valid.
15255ffd83dbSDimitry Andric       .clampScalar(LitTyIdx, S32, S512)
15265ffd83dbSDimitry Andric       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
15270b57cec5SDimitry Andric       // Break up vectors with weird elements into scalars
15280b57cec5SDimitry Andric       .fewerElementsIf(
15295ffd83dbSDimitry Andric         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
15300b57cec5SDimitry Andric         scalarize(0))
15310b57cec5SDimitry Andric       .fewerElementsIf(
15325ffd83dbSDimitry Andric         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
15330b57cec5SDimitry Andric         scalarize(1))
15345ffd83dbSDimitry Andric       .clampScalar(BigTyIdx, S32, MaxScalar);
15358bcb0991SDimitry Andric 
15368bcb0991SDimitry Andric     if (Op == G_MERGE_VALUES) {
15378bcb0991SDimitry Andric       Builder.widenScalarIf(
15388bcb0991SDimitry Andric         // TODO: Use 16-bit shifts if legal for 8-bit values?
15390b57cec5SDimitry Andric         [=](const LegalityQuery &Query) {
15408bcb0991SDimitry Andric           const LLT Ty = Query.Types[LitTyIdx];
15418bcb0991SDimitry Andric           return Ty.getSizeInBits() < 32;
15428bcb0991SDimitry Andric         },
15438bcb0991SDimitry Andric         changeTo(LitTyIdx, S32));
15448bcb0991SDimitry Andric     }
15458bcb0991SDimitry Andric 
15468bcb0991SDimitry Andric     Builder.widenScalarIf(
15478bcb0991SDimitry Andric       [=](const LegalityQuery &Query) {
15488bcb0991SDimitry Andric         const LLT Ty = Query.Types[BigTyIdx];
15490b57cec5SDimitry Andric         return !isPowerOf2_32(Ty.getSizeInBits()) &&
15500b57cec5SDimitry Andric           Ty.getSizeInBits() % 16 != 0;
15510b57cec5SDimitry Andric       },
15520b57cec5SDimitry Andric       [=](const LegalityQuery &Query) {
15530b57cec5SDimitry Andric         // Pick the next power of 2, or a multiple of 64 over 128.
15540b57cec5SDimitry Andric         // Whichever is smaller.
15550b57cec5SDimitry Andric         const LLT &Ty = Query.Types[BigTyIdx];
15560b57cec5SDimitry Andric         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
15570b57cec5SDimitry Andric         if (NewSizeInBits >= 256) {
15580b57cec5SDimitry Andric           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
15590b57cec5SDimitry Andric           if (RoundedTo < NewSizeInBits)
15600b57cec5SDimitry Andric             NewSizeInBits = RoundedTo;
15610b57cec5SDimitry Andric         }
15620b57cec5SDimitry Andric         return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
15630b57cec5SDimitry Andric       })
15640b57cec5SDimitry Andric       // Any vectors left are the wrong size. Scalarize them.
15650b57cec5SDimitry Andric       .scalarize(0)
15660b57cec5SDimitry Andric       .scalarize(1);
15670b57cec5SDimitry Andric   }
15680b57cec5SDimitry Andric 
15695ffd83dbSDimitry Andric   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
15705ffd83dbSDimitry Andric   // RegBankSelect.
15715ffd83dbSDimitry Andric   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
15725ffd83dbSDimitry Andric     .legalFor({{S32}, {S64}});
15738bcb0991SDimitry Andric 
15745ffd83dbSDimitry Andric   if (ST.hasVOP3PInsts()) {
15755ffd83dbSDimitry Andric     SextInReg.lowerFor({{V2S16}})
15765ffd83dbSDimitry Andric       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
15775ffd83dbSDimitry Andric       // get more vector shift opportunities, since we'll get those when
15785ffd83dbSDimitry Andric       // expanded.
15795ffd83dbSDimitry Andric       .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
15805ffd83dbSDimitry Andric   } else if (ST.has16BitInsts()) {
15815ffd83dbSDimitry Andric     SextInReg.lowerFor({{S32}, {S64}, {S16}});
15825ffd83dbSDimitry Andric   } else {
15835ffd83dbSDimitry Andric     // Prefer to promote to s32 before lowering if we don't have 16-bit
15845ffd83dbSDimitry Andric     // shifts. This avoid a lot of intermediate truncate and extend operations.
15855ffd83dbSDimitry Andric     SextInReg.lowerFor({{S32}, {S64}});
15865ffd83dbSDimitry Andric   }
15875ffd83dbSDimitry Andric 
15885ffd83dbSDimitry Andric   SextInReg
15895ffd83dbSDimitry Andric     .scalarize(0)
15905ffd83dbSDimitry Andric     .clampScalar(0, S32, S64)
15915ffd83dbSDimitry Andric     .lower();
15925ffd83dbSDimitry Andric 
15935ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_FSHR)
15945ffd83dbSDimitry Andric     .legalFor({{S32, S32}})
15955ffd83dbSDimitry Andric     .scalarize(0)
15965ffd83dbSDimitry Andric     .lower();
1597480093f4SDimitry Andric 
1598480093f4SDimitry Andric   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1599480093f4SDimitry Andric     .legalFor({S64});
1600480093f4SDimitry Andric 
1601*e8d8bef9SDimitry Andric   getActionDefinitionsBuilder(G_FENCE)
1602*e8d8bef9SDimitry Andric     .alwaysLegal();
1603*e8d8bef9SDimitry Andric 
16045ffd83dbSDimitry Andric   getActionDefinitionsBuilder({
16055ffd83dbSDimitry Andric       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
16065ffd83dbSDimitry Andric       G_FCOPYSIGN,
16075ffd83dbSDimitry Andric 
16085ffd83dbSDimitry Andric       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1609*e8d8bef9SDimitry Andric       G_ATOMICRMW_NAND,
1610*e8d8bef9SDimitry Andric       G_ATOMICRMW_FSUB,
16115ffd83dbSDimitry Andric       G_READ_REGISTER,
16125ffd83dbSDimitry Andric       G_WRITE_REGISTER,
16135ffd83dbSDimitry Andric 
16145ffd83dbSDimitry Andric       G_SADDO, G_SSUBO,
16155ffd83dbSDimitry Andric 
16165ffd83dbSDimitry Andric        // TODO: Implement
16175ffd83dbSDimitry Andric       G_FMINIMUM, G_FMAXIMUM,
16185ffd83dbSDimitry Andric       G_FSHL
16195ffd83dbSDimitry Andric     }).lower();
16205ffd83dbSDimitry Andric 
1621480093f4SDimitry Andric   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
16225ffd83dbSDimitry Andric         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1623480093f4SDimitry Andric         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1624480093f4SDimitry Andric     .unsupported();
1625480093f4SDimitry Andric 
16260b57cec5SDimitry Andric   computeTables();
16270b57cec5SDimitry Andric   verify(*ST.getInstrInfo());
16280b57cec5SDimitry Andric }
16290b57cec5SDimitry Andric 
16305ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
16315ffd83dbSDimitry Andric                                          MachineInstr &MI) const {
16325ffd83dbSDimitry Andric   MachineIRBuilder &B = Helper.MIRBuilder;
16335ffd83dbSDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
16345ffd83dbSDimitry Andric 
16350b57cec5SDimitry Andric   switch (MI.getOpcode()) {
16360b57cec5SDimitry Andric   case TargetOpcode::G_ADDRSPACE_CAST:
16378bcb0991SDimitry Andric     return legalizeAddrSpaceCast(MI, MRI, B);
16380b57cec5SDimitry Andric   case TargetOpcode::G_FRINT:
16398bcb0991SDimitry Andric     return legalizeFrint(MI, MRI, B);
16400b57cec5SDimitry Andric   case TargetOpcode::G_FCEIL:
16418bcb0991SDimitry Andric     return legalizeFceil(MI, MRI, B);
1642*e8d8bef9SDimitry Andric   case TargetOpcode::G_FREM:
1643*e8d8bef9SDimitry Andric     return legalizeFrem(MI, MRI, B);
16440b57cec5SDimitry Andric   case TargetOpcode::G_INTRINSIC_TRUNC:
16458bcb0991SDimitry Andric     return legalizeIntrinsicTrunc(MI, MRI, B);
16460b57cec5SDimitry Andric   case TargetOpcode::G_SITOFP:
16478bcb0991SDimitry Andric     return legalizeITOFP(MI, MRI, B, true);
16480b57cec5SDimitry Andric   case TargetOpcode::G_UITOFP:
16498bcb0991SDimitry Andric     return legalizeITOFP(MI, MRI, B, false);
16505ffd83dbSDimitry Andric   case TargetOpcode::G_FPTOSI:
16515ffd83dbSDimitry Andric     return legalizeFPTOI(MI, MRI, B, true);
16525ffd83dbSDimitry Andric   case TargetOpcode::G_FPTOUI:
16535ffd83dbSDimitry Andric     return legalizeFPTOI(MI, MRI, B, false);
16540b57cec5SDimitry Andric   case TargetOpcode::G_FMINNUM:
16550b57cec5SDimitry Andric   case TargetOpcode::G_FMAXNUM:
16560b57cec5SDimitry Andric   case TargetOpcode::G_FMINNUM_IEEE:
16570b57cec5SDimitry Andric   case TargetOpcode::G_FMAXNUM_IEEE:
16585ffd83dbSDimitry Andric     return legalizeMinNumMaxNum(Helper, MI);
16590b57cec5SDimitry Andric   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
16608bcb0991SDimitry Andric     return legalizeExtractVectorElt(MI, MRI, B);
16610b57cec5SDimitry Andric   case TargetOpcode::G_INSERT_VECTOR_ELT:
16628bcb0991SDimitry Andric     return legalizeInsertVectorElt(MI, MRI, B);
16635ffd83dbSDimitry Andric   case TargetOpcode::G_SHUFFLE_VECTOR:
16645ffd83dbSDimitry Andric     return legalizeShuffleVector(MI, MRI, B);
16658bcb0991SDimitry Andric   case TargetOpcode::G_FSIN:
16668bcb0991SDimitry Andric   case TargetOpcode::G_FCOS:
16678bcb0991SDimitry Andric     return legalizeSinCos(MI, MRI, B);
16688bcb0991SDimitry Andric   case TargetOpcode::G_GLOBAL_VALUE:
16698bcb0991SDimitry Andric     return legalizeGlobalValue(MI, MRI, B);
16708bcb0991SDimitry Andric   case TargetOpcode::G_LOAD:
1671*e8d8bef9SDimitry Andric     return legalizeLoad(Helper, MI);
16728bcb0991SDimitry Andric   case TargetOpcode::G_FMAD:
16738bcb0991SDimitry Andric     return legalizeFMad(MI, MRI, B);
16748bcb0991SDimitry Andric   case TargetOpcode::G_FDIV:
16758bcb0991SDimitry Andric     return legalizeFDIV(MI, MRI, B);
16765ffd83dbSDimitry Andric   case TargetOpcode::G_UDIV:
16775ffd83dbSDimitry Andric   case TargetOpcode::G_UREM:
16785ffd83dbSDimitry Andric     return legalizeUDIV_UREM(MI, MRI, B);
16795ffd83dbSDimitry Andric   case TargetOpcode::G_SDIV:
16805ffd83dbSDimitry Andric   case TargetOpcode::G_SREM:
16815ffd83dbSDimitry Andric     return legalizeSDIV_SREM(MI, MRI, B);
1682480093f4SDimitry Andric   case TargetOpcode::G_ATOMIC_CMPXCHG:
1683480093f4SDimitry Andric     return legalizeAtomicCmpXChg(MI, MRI, B);
16845ffd83dbSDimitry Andric   case TargetOpcode::G_FLOG:
16855ffd83dbSDimitry Andric     return legalizeFlog(MI, B, numbers::ln2f);
16865ffd83dbSDimitry Andric   case TargetOpcode::G_FLOG10:
16875ffd83dbSDimitry Andric     return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
16885ffd83dbSDimitry Andric   case TargetOpcode::G_FEXP:
16895ffd83dbSDimitry Andric     return legalizeFExp(MI, B);
16905ffd83dbSDimitry Andric   case TargetOpcode::G_FPOW:
16915ffd83dbSDimitry Andric     return legalizeFPow(MI, B);
16925ffd83dbSDimitry Andric   case TargetOpcode::G_FFLOOR:
16935ffd83dbSDimitry Andric     return legalizeFFloor(MI, MRI, B);
16945ffd83dbSDimitry Andric   case TargetOpcode::G_BUILD_VECTOR:
16955ffd83dbSDimitry Andric     return legalizeBuildVector(MI, MRI, B);
16960b57cec5SDimitry Andric   default:
16970b57cec5SDimitry Andric     return false;
16980b57cec5SDimitry Andric   }
16990b57cec5SDimitry Andric 
17000b57cec5SDimitry Andric   llvm_unreachable("expected switch to return");
17010b57cec5SDimitry Andric }
17020b57cec5SDimitry Andric 
17030b57cec5SDimitry Andric Register AMDGPULegalizerInfo::getSegmentAperture(
17040b57cec5SDimitry Andric   unsigned AS,
17050b57cec5SDimitry Andric   MachineRegisterInfo &MRI,
17068bcb0991SDimitry Andric   MachineIRBuilder &B) const {
17078bcb0991SDimitry Andric   MachineFunction &MF = B.getMF();
17080b57cec5SDimitry Andric   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
17090b57cec5SDimitry Andric   const LLT S32 = LLT::scalar(32);
17100b57cec5SDimitry Andric 
17118bcb0991SDimitry Andric   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
17128bcb0991SDimitry Andric 
17130b57cec5SDimitry Andric   if (ST.hasApertureRegs()) {
17140b57cec5SDimitry Andric     // FIXME: Use inline constants (src_{shared, private}_base) instead of
17150b57cec5SDimitry Andric     // getreg.
17160b57cec5SDimitry Andric     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
17170b57cec5SDimitry Andric         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
17180b57cec5SDimitry Andric         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
17190b57cec5SDimitry Andric     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
17200b57cec5SDimitry Andric         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
17210b57cec5SDimitry Andric         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
17220b57cec5SDimitry Andric     unsigned Encoding =
17230b57cec5SDimitry Andric         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
17240b57cec5SDimitry Andric         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
17250b57cec5SDimitry Andric         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
17260b57cec5SDimitry Andric 
17270b57cec5SDimitry Andric     Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
17280b57cec5SDimitry Andric 
17298bcb0991SDimitry Andric     B.buildInstr(AMDGPU::S_GETREG_B32)
17300b57cec5SDimitry Andric       .addDef(GetReg)
17310b57cec5SDimitry Andric       .addImm(Encoding);
17320b57cec5SDimitry Andric     MRI.setType(GetReg, S32);
17330b57cec5SDimitry Andric 
17348bcb0991SDimitry Andric     auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
17355ffd83dbSDimitry Andric     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
17360b57cec5SDimitry Andric   }
17370b57cec5SDimitry Andric 
17380b57cec5SDimitry Andric   Register QueuePtr = MRI.createGenericVirtualRegister(
17390b57cec5SDimitry Andric     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
17400b57cec5SDimitry Andric 
1741*e8d8bef9SDimitry Andric   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
17428bcb0991SDimitry Andric     return Register();
17430b57cec5SDimitry Andric 
17440b57cec5SDimitry Andric   // Offset into amd_queue_t for group_segment_aperture_base_hi /
17450b57cec5SDimitry Andric   // private_segment_aperture_base_hi.
17460b57cec5SDimitry Andric   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
17470b57cec5SDimitry Andric 
1748480093f4SDimitry Andric   // TODO: can we be smarter about machine pointer info?
1749480093f4SDimitry Andric   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
17500b57cec5SDimitry Andric   MachineMemOperand *MMO = MF.getMachineMemOperand(
17510b57cec5SDimitry Andric       PtrInfo,
17525ffd83dbSDimitry Andric       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
17530b57cec5SDimitry Andric           MachineMemOperand::MOInvariant,
17545ffd83dbSDimitry Andric       4, commonAlignment(Align(64), StructOffset));
17550b57cec5SDimitry Andric 
17560b57cec5SDimitry Andric   Register LoadAddr;
17570b57cec5SDimitry Andric 
1758480093f4SDimitry Andric   B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
17595ffd83dbSDimitry Andric   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
17600b57cec5SDimitry Andric }
17610b57cec5SDimitry Andric 
17620b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
17630b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
17648bcb0991SDimitry Andric   MachineIRBuilder &B) const {
17658bcb0991SDimitry Andric   MachineFunction &MF = B.getMF();
17660b57cec5SDimitry Andric 
17678bcb0991SDimitry Andric   const LLT S32 = LLT::scalar(32);
17680b57cec5SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
17690b57cec5SDimitry Andric   Register Src = MI.getOperand(1).getReg();
17700b57cec5SDimitry Andric 
17710b57cec5SDimitry Andric   LLT DstTy = MRI.getType(Dst);
17720b57cec5SDimitry Andric   LLT SrcTy = MRI.getType(Src);
17730b57cec5SDimitry Andric   unsigned DestAS = DstTy.getAddressSpace();
17740b57cec5SDimitry Andric   unsigned SrcAS = SrcTy.getAddressSpace();
17750b57cec5SDimitry Andric 
17760b57cec5SDimitry Andric   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
17770b57cec5SDimitry Andric   // vector element.
17780b57cec5SDimitry Andric   assert(!DstTy.isVector());
17790b57cec5SDimitry Andric 
17800b57cec5SDimitry Andric   const AMDGPUTargetMachine &TM
17810b57cec5SDimitry Andric     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
17820b57cec5SDimitry Andric 
1783*e8d8bef9SDimitry Andric   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
17848bcb0991SDimitry Andric     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
17858bcb0991SDimitry Andric     return true;
17868bcb0991SDimitry Andric   }
17878bcb0991SDimitry Andric 
17888bcb0991SDimitry Andric   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
17898bcb0991SDimitry Andric     // Truncate.
17908bcb0991SDimitry Andric     B.buildExtract(Dst, Src, 0);
17918bcb0991SDimitry Andric     MI.eraseFromParent();
17928bcb0991SDimitry Andric     return true;
17938bcb0991SDimitry Andric   }
17948bcb0991SDimitry Andric 
17958bcb0991SDimitry Andric   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
17968bcb0991SDimitry Andric     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
17978bcb0991SDimitry Andric     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
17988bcb0991SDimitry Andric 
17998bcb0991SDimitry Andric     // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
18008bcb0991SDimitry Andric     // another. Merge operands are required to be the same type, but creating an
18018bcb0991SDimitry Andric     // extra ptrtoint would be kind of pointless.
18028bcb0991SDimitry Andric     auto HighAddr = B.buildConstant(
18038bcb0991SDimitry Andric       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
18045ffd83dbSDimitry Andric     B.buildMerge(Dst, {Src, HighAddr});
18058bcb0991SDimitry Andric     MI.eraseFromParent();
18060b57cec5SDimitry Andric     return true;
18070b57cec5SDimitry Andric   }
18080b57cec5SDimitry Andric 
18090b57cec5SDimitry Andric   if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
18100b57cec5SDimitry Andric     assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
18110b57cec5SDimitry Andric            DestAS == AMDGPUAS::PRIVATE_ADDRESS);
18120b57cec5SDimitry Andric     unsigned NullVal = TM.getNullPointerValue(DestAS);
18130b57cec5SDimitry Andric 
18148bcb0991SDimitry Andric     auto SegmentNull = B.buildConstant(DstTy, NullVal);
18158bcb0991SDimitry Andric     auto FlatNull = B.buildConstant(SrcTy, 0);
18160b57cec5SDimitry Andric 
18170b57cec5SDimitry Andric     // Extract low 32-bits of the pointer.
18185ffd83dbSDimitry Andric     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
18190b57cec5SDimitry Andric 
18205ffd83dbSDimitry Andric     auto CmpRes =
18215ffd83dbSDimitry Andric         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
18228bcb0991SDimitry Andric     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
18230b57cec5SDimitry Andric 
18240b57cec5SDimitry Andric     MI.eraseFromParent();
18250b57cec5SDimitry Andric     return true;
18260b57cec5SDimitry Andric   }
18270b57cec5SDimitry Andric 
18288bcb0991SDimitry Andric   if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
18298bcb0991SDimitry Andric     return false;
18308bcb0991SDimitry Andric 
18318bcb0991SDimitry Andric   if (!ST.hasFlatAddressSpace())
18328bcb0991SDimitry Andric     return false;
18330b57cec5SDimitry Andric 
18340b57cec5SDimitry Andric   auto SegmentNull =
18358bcb0991SDimitry Andric       B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
18360b57cec5SDimitry Andric   auto FlatNull =
18378bcb0991SDimitry Andric       B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
18380b57cec5SDimitry Andric 
18398bcb0991SDimitry Andric   Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
18408bcb0991SDimitry Andric   if (!ApertureReg.isValid())
18418bcb0991SDimitry Andric     return false;
18420b57cec5SDimitry Andric 
18435ffd83dbSDimitry Andric   auto CmpRes =
18445ffd83dbSDimitry Andric       B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
18450b57cec5SDimitry Andric 
18460b57cec5SDimitry Andric   // Coerce the type of the low half of the result so we can use merge_values.
18475ffd83dbSDimitry Andric   Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
18480b57cec5SDimitry Andric 
18490b57cec5SDimitry Andric   // TODO: Should we allow mismatched types but matching sizes in merges to
18500b57cec5SDimitry Andric   // avoid the ptrtoint?
18515ffd83dbSDimitry Andric   auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
18525ffd83dbSDimitry Andric   B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
18530b57cec5SDimitry Andric 
18540b57cec5SDimitry Andric   MI.eraseFromParent();
18550b57cec5SDimitry Andric   return true;
18560b57cec5SDimitry Andric }
18570b57cec5SDimitry Andric 
18580b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeFrint(
18590b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
18608bcb0991SDimitry Andric   MachineIRBuilder &B) const {
18610b57cec5SDimitry Andric   Register Src = MI.getOperand(1).getReg();
18620b57cec5SDimitry Andric   LLT Ty = MRI.getType(Src);
18630b57cec5SDimitry Andric   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
18640b57cec5SDimitry Andric 
18650b57cec5SDimitry Andric   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
18660b57cec5SDimitry Andric   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
18670b57cec5SDimitry Andric 
18688bcb0991SDimitry Andric   auto C1 = B.buildFConstant(Ty, C1Val);
18698bcb0991SDimitry Andric   auto CopySign = B.buildFCopysign(Ty, C1, Src);
18700b57cec5SDimitry Andric 
18710b57cec5SDimitry Andric   // TODO: Should this propagate fast-math-flags?
18728bcb0991SDimitry Andric   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
18738bcb0991SDimitry Andric   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
18740b57cec5SDimitry Andric 
18758bcb0991SDimitry Andric   auto C2 = B.buildFConstant(Ty, C2Val);
18768bcb0991SDimitry Andric   auto Fabs = B.buildFAbs(Ty, Src);
18770b57cec5SDimitry Andric 
18788bcb0991SDimitry Andric   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
18798bcb0991SDimitry Andric   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
1880*e8d8bef9SDimitry Andric   MI.eraseFromParent();
18810b57cec5SDimitry Andric   return true;
18820b57cec5SDimitry Andric }
18830b57cec5SDimitry Andric 
18840b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeFceil(
18850b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
18860b57cec5SDimitry Andric   MachineIRBuilder &B) const {
18870b57cec5SDimitry Andric 
18880b57cec5SDimitry Andric   const LLT S1 = LLT::scalar(1);
18890b57cec5SDimitry Andric   const LLT S64 = LLT::scalar(64);
18900b57cec5SDimitry Andric 
18910b57cec5SDimitry Andric   Register Src = MI.getOperand(1).getReg();
18920b57cec5SDimitry Andric   assert(MRI.getType(Src) == S64);
18930b57cec5SDimitry Andric 
18940b57cec5SDimitry Andric   // result = trunc(src)
18950b57cec5SDimitry Andric   // if (src > 0.0 && src != result)
18960b57cec5SDimitry Andric   //   result += 1.0
18970b57cec5SDimitry Andric 
18985ffd83dbSDimitry Andric   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
18990b57cec5SDimitry Andric 
19000b57cec5SDimitry Andric   const auto Zero = B.buildFConstant(S64, 0.0);
19010b57cec5SDimitry Andric   const auto One = B.buildFConstant(S64, 1.0);
19020b57cec5SDimitry Andric   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
19030b57cec5SDimitry Andric   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
19040b57cec5SDimitry Andric   auto And = B.buildAnd(S1, Lt0, NeTrunc);
19050b57cec5SDimitry Andric   auto Add = B.buildSelect(S64, And, One, Zero);
19060b57cec5SDimitry Andric 
19070b57cec5SDimitry Andric   // TODO: Should this propagate fast-math-flags?
19080b57cec5SDimitry Andric   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
19090b57cec5SDimitry Andric   return true;
19100b57cec5SDimitry Andric }
19110b57cec5SDimitry Andric 
1912*e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeFrem(
1913*e8d8bef9SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
1914*e8d8bef9SDimitry Andric   MachineIRBuilder &B) const {
1915*e8d8bef9SDimitry Andric     Register DstReg = MI.getOperand(0).getReg();
1916*e8d8bef9SDimitry Andric     Register Src0Reg = MI.getOperand(1).getReg();
1917*e8d8bef9SDimitry Andric     Register Src1Reg = MI.getOperand(2).getReg();
1918*e8d8bef9SDimitry Andric     auto Flags = MI.getFlags();
1919*e8d8bef9SDimitry Andric     LLT Ty = MRI.getType(DstReg);
1920*e8d8bef9SDimitry Andric 
1921*e8d8bef9SDimitry Andric     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
1922*e8d8bef9SDimitry Andric     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
1923*e8d8bef9SDimitry Andric     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
1924*e8d8bef9SDimitry Andric     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
1925*e8d8bef9SDimitry Andric     MI.eraseFromParent();
1926*e8d8bef9SDimitry Andric     return true;
1927*e8d8bef9SDimitry Andric }
1928*e8d8bef9SDimitry Andric 
1929*e8d8bef9SDimitry Andric static MachineInstrBuilder extractF64Exponent(Register Hi,
19300b57cec5SDimitry Andric                                               MachineIRBuilder &B) {
19310b57cec5SDimitry Andric   const unsigned FractBits = 52;
19320b57cec5SDimitry Andric   const unsigned ExpBits = 11;
19330b57cec5SDimitry Andric   LLT S32 = LLT::scalar(32);
19340b57cec5SDimitry Andric 
19350b57cec5SDimitry Andric   auto Const0 = B.buildConstant(S32, FractBits - 32);
19360b57cec5SDimitry Andric   auto Const1 = B.buildConstant(S32, ExpBits);
19370b57cec5SDimitry Andric 
19380b57cec5SDimitry Andric   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
1939*e8d8bef9SDimitry Andric     .addUse(Hi)
19400b57cec5SDimitry Andric     .addUse(Const0.getReg(0))
19410b57cec5SDimitry Andric     .addUse(Const1.getReg(0));
19420b57cec5SDimitry Andric 
19430b57cec5SDimitry Andric   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
19440b57cec5SDimitry Andric }
19450b57cec5SDimitry Andric 
19460b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
19470b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
19480b57cec5SDimitry Andric   MachineIRBuilder &B) const {
19490b57cec5SDimitry Andric   const LLT S1 = LLT::scalar(1);
19500b57cec5SDimitry Andric   const LLT S32 = LLT::scalar(32);
19510b57cec5SDimitry Andric   const LLT S64 = LLT::scalar(64);
19520b57cec5SDimitry Andric 
19530b57cec5SDimitry Andric   Register Src = MI.getOperand(1).getReg();
19540b57cec5SDimitry Andric   assert(MRI.getType(Src) == S64);
19550b57cec5SDimitry Andric 
19560b57cec5SDimitry Andric   // TODO: Should this use extract since the low half is unused?
19570b57cec5SDimitry Andric   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
19580b57cec5SDimitry Andric   Register Hi = Unmerge.getReg(1);
19590b57cec5SDimitry Andric 
19600b57cec5SDimitry Andric   // Extract the upper half, since this is where we will find the sign and
19610b57cec5SDimitry Andric   // exponent.
19620b57cec5SDimitry Andric   auto Exp = extractF64Exponent(Hi, B);
19630b57cec5SDimitry Andric 
19640b57cec5SDimitry Andric   const unsigned FractBits = 52;
19650b57cec5SDimitry Andric 
19660b57cec5SDimitry Andric   // Extract the sign bit.
19670b57cec5SDimitry Andric   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
19680b57cec5SDimitry Andric   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
19690b57cec5SDimitry Andric 
19700b57cec5SDimitry Andric   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
19710b57cec5SDimitry Andric 
19720b57cec5SDimitry Andric   const auto Zero32 = B.buildConstant(S32, 0);
19730b57cec5SDimitry Andric 
19740b57cec5SDimitry Andric   // Extend back to 64-bits.
19755ffd83dbSDimitry Andric   auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
19760b57cec5SDimitry Andric 
19770b57cec5SDimitry Andric   auto Shr = B.buildAShr(S64, FractMask, Exp);
19780b57cec5SDimitry Andric   auto Not = B.buildNot(S64, Shr);
19790b57cec5SDimitry Andric   auto Tmp0 = B.buildAnd(S64, Src, Not);
19800b57cec5SDimitry Andric   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
19810b57cec5SDimitry Andric 
19820b57cec5SDimitry Andric   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
19830b57cec5SDimitry Andric   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
19840b57cec5SDimitry Andric 
19850b57cec5SDimitry Andric   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
19860b57cec5SDimitry Andric   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
1987*e8d8bef9SDimitry Andric   MI.eraseFromParent();
19880b57cec5SDimitry Andric   return true;
19890b57cec5SDimitry Andric }
19900b57cec5SDimitry Andric 
19910b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeITOFP(
19920b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
19930b57cec5SDimitry Andric   MachineIRBuilder &B, bool Signed) const {
19940b57cec5SDimitry Andric 
19950b57cec5SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
19960b57cec5SDimitry Andric   Register Src = MI.getOperand(1).getReg();
19970b57cec5SDimitry Andric 
19980b57cec5SDimitry Andric   const LLT S64 = LLT::scalar(64);
19990b57cec5SDimitry Andric   const LLT S32 = LLT::scalar(32);
20000b57cec5SDimitry Andric 
20010b57cec5SDimitry Andric   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
20020b57cec5SDimitry Andric 
20030b57cec5SDimitry Andric   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
20040b57cec5SDimitry Andric 
20050b57cec5SDimitry Andric   auto CvtHi = Signed ?
20060b57cec5SDimitry Andric     B.buildSITOFP(S64, Unmerge.getReg(1)) :
20070b57cec5SDimitry Andric     B.buildUITOFP(S64, Unmerge.getReg(1));
20080b57cec5SDimitry Andric 
20090b57cec5SDimitry Andric   auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
20100b57cec5SDimitry Andric 
20110b57cec5SDimitry Andric   auto ThirtyTwo = B.buildConstant(S32, 32);
20120b57cec5SDimitry Andric   auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
20130b57cec5SDimitry Andric     .addUse(CvtHi.getReg(0))
20140b57cec5SDimitry Andric     .addUse(ThirtyTwo.getReg(0));
20150b57cec5SDimitry Andric 
20160b57cec5SDimitry Andric   // TODO: Should this propagate fast-math-flags?
20170b57cec5SDimitry Andric   B.buildFAdd(Dst, LdExp, CvtLo);
20180b57cec5SDimitry Andric   MI.eraseFromParent();
20190b57cec5SDimitry Andric   return true;
20200b57cec5SDimitry Andric }
20210b57cec5SDimitry Andric 
20225ffd83dbSDimitry Andric // TODO: Copied from DAG implementation. Verify logic and document how this
20235ffd83dbSDimitry Andric // actually works.
20245ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeFPTOI(
20250b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
20265ffd83dbSDimitry Andric   MachineIRBuilder &B, bool Signed) const {
20275ffd83dbSDimitry Andric 
20285ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
20295ffd83dbSDimitry Andric   Register Src = MI.getOperand(1).getReg();
20305ffd83dbSDimitry Andric 
20315ffd83dbSDimitry Andric   const LLT S64 = LLT::scalar(64);
20325ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
20335ffd83dbSDimitry Andric 
20345ffd83dbSDimitry Andric   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
20355ffd83dbSDimitry Andric 
20365ffd83dbSDimitry Andric   unsigned Flags = MI.getFlags();
20375ffd83dbSDimitry Andric 
20385ffd83dbSDimitry Andric   auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
20395ffd83dbSDimitry Andric   auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
20405ffd83dbSDimitry Andric   auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
20415ffd83dbSDimitry Andric 
20425ffd83dbSDimitry Andric   auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
20435ffd83dbSDimitry Andric   auto FloorMul = B.buildFFloor(S64, Mul, Flags);
20445ffd83dbSDimitry Andric   auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
20455ffd83dbSDimitry Andric 
20465ffd83dbSDimitry Andric   auto Hi = Signed ?
20475ffd83dbSDimitry Andric     B.buildFPTOSI(S32, FloorMul) :
20485ffd83dbSDimitry Andric     B.buildFPTOUI(S32, FloorMul);
20495ffd83dbSDimitry Andric   auto Lo = B.buildFPTOUI(S32, Fma);
20505ffd83dbSDimitry Andric 
20515ffd83dbSDimitry Andric   B.buildMerge(Dst, { Lo, Hi });
20525ffd83dbSDimitry Andric   MI.eraseFromParent();
20535ffd83dbSDimitry Andric 
20545ffd83dbSDimitry Andric   return true;
20555ffd83dbSDimitry Andric }
20565ffd83dbSDimitry Andric 
20575ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
20585ffd83dbSDimitry Andric                                                MachineInstr &MI) const {
20595ffd83dbSDimitry Andric   MachineFunction &MF = Helper.MIRBuilder.getMF();
20600b57cec5SDimitry Andric   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
20610b57cec5SDimitry Andric 
20620b57cec5SDimitry Andric   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
20630b57cec5SDimitry Andric                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
20640b57cec5SDimitry Andric 
20650b57cec5SDimitry Andric   // With ieee_mode disabled, the instructions have the correct behavior
20660b57cec5SDimitry Andric   // already for G_FMINNUM/G_FMAXNUM
20670b57cec5SDimitry Andric   if (!MFI->getMode().IEEE)
20680b57cec5SDimitry Andric     return !IsIEEEOp;
20690b57cec5SDimitry Andric 
20700b57cec5SDimitry Andric   if (IsIEEEOp)
20710b57cec5SDimitry Andric     return true;
20720b57cec5SDimitry Andric 
20730b57cec5SDimitry Andric   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
20740b57cec5SDimitry Andric }
20750b57cec5SDimitry Andric 
20760b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
20770b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
20780b57cec5SDimitry Andric   MachineIRBuilder &B) const {
20790b57cec5SDimitry Andric   // TODO: Should move some of this into LegalizerHelper.
20800b57cec5SDimitry Andric 
20810b57cec5SDimitry Andric   // TODO: Promote dynamic indexing of s16 to s32
20825ffd83dbSDimitry Andric 
20835ffd83dbSDimitry Andric   // FIXME: Artifact combiner probably should have replaced the truncated
20845ffd83dbSDimitry Andric   // constant before this, so we shouldn't need
20855ffd83dbSDimitry Andric   // getConstantVRegValWithLookThrough.
2086*e8d8bef9SDimitry Andric   Optional<ValueAndVReg> MaybeIdxVal =
2087*e8d8bef9SDimitry Andric       getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2088*e8d8bef9SDimitry Andric   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
20890b57cec5SDimitry Andric     return true;
2090*e8d8bef9SDimitry Andric   const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
20910b57cec5SDimitry Andric 
20920b57cec5SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
20930b57cec5SDimitry Andric   Register Vec = MI.getOperand(1).getReg();
20940b57cec5SDimitry Andric 
20950b57cec5SDimitry Andric   LLT VecTy = MRI.getType(Vec);
20960b57cec5SDimitry Andric   LLT EltTy = VecTy.getElementType();
20970b57cec5SDimitry Andric   assert(EltTy == MRI.getType(Dst));
20980b57cec5SDimitry Andric 
2099*e8d8bef9SDimitry Andric   if (IdxVal < VecTy.getNumElements())
2100*e8d8bef9SDimitry Andric     B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
21010b57cec5SDimitry Andric   else
21020b57cec5SDimitry Andric     B.buildUndef(Dst);
21030b57cec5SDimitry Andric 
21040b57cec5SDimitry Andric   MI.eraseFromParent();
21050b57cec5SDimitry Andric   return true;
21060b57cec5SDimitry Andric }
21070b57cec5SDimitry Andric 
21080b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
21090b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
21100b57cec5SDimitry Andric   MachineIRBuilder &B) const {
21110b57cec5SDimitry Andric   // TODO: Should move some of this into LegalizerHelper.
21120b57cec5SDimitry Andric 
21130b57cec5SDimitry Andric   // TODO: Promote dynamic indexing of s16 to s32
21145ffd83dbSDimitry Andric 
21155ffd83dbSDimitry Andric   // FIXME: Artifact combiner probably should have replaced the truncated
21165ffd83dbSDimitry Andric   // constant before this, so we shouldn't need
21175ffd83dbSDimitry Andric   // getConstantVRegValWithLookThrough.
2118*e8d8bef9SDimitry Andric   Optional<ValueAndVReg> MaybeIdxVal =
2119*e8d8bef9SDimitry Andric       getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2120*e8d8bef9SDimitry Andric   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
21210b57cec5SDimitry Andric     return true;
21220b57cec5SDimitry Andric 
2123*e8d8bef9SDimitry Andric   int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
21240b57cec5SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
21250b57cec5SDimitry Andric   Register Vec = MI.getOperand(1).getReg();
21260b57cec5SDimitry Andric   Register Ins = MI.getOperand(2).getReg();
21270b57cec5SDimitry Andric 
21280b57cec5SDimitry Andric   LLT VecTy = MRI.getType(Vec);
21290b57cec5SDimitry Andric   LLT EltTy = VecTy.getElementType();
21300b57cec5SDimitry Andric   assert(EltTy == MRI.getType(Ins));
21310b57cec5SDimitry Andric 
2132*e8d8bef9SDimitry Andric   if (IdxVal < VecTy.getNumElements())
2133*e8d8bef9SDimitry Andric     B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
21340b57cec5SDimitry Andric   else
21350b57cec5SDimitry Andric     B.buildUndef(Dst);
21360b57cec5SDimitry Andric 
21370b57cec5SDimitry Andric   MI.eraseFromParent();
21380b57cec5SDimitry Andric   return true;
21390b57cec5SDimitry Andric }
21400b57cec5SDimitry Andric 
21415ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeShuffleVector(
21425ffd83dbSDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
21435ffd83dbSDimitry Andric   MachineIRBuilder &B) const {
21445ffd83dbSDimitry Andric   const LLT V2S16 = LLT::vector(2, 16);
21455ffd83dbSDimitry Andric 
21465ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
21475ffd83dbSDimitry Andric   Register Src0 = MI.getOperand(1).getReg();
21485ffd83dbSDimitry Andric   LLT DstTy = MRI.getType(Dst);
21495ffd83dbSDimitry Andric   LLT SrcTy = MRI.getType(Src0);
21505ffd83dbSDimitry Andric 
21515ffd83dbSDimitry Andric   if (SrcTy == V2S16 && DstTy == V2S16 &&
21525ffd83dbSDimitry Andric       AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
21535ffd83dbSDimitry Andric     return true;
21545ffd83dbSDimitry Andric 
21555ffd83dbSDimitry Andric   MachineIRBuilder HelperBuilder(MI);
21565ffd83dbSDimitry Andric   GISelObserverWrapper DummyObserver;
21575ffd83dbSDimitry Andric   LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
21585ffd83dbSDimitry Andric   return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
21595ffd83dbSDimitry Andric }
21605ffd83dbSDimitry Andric 
21618bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeSinCos(
21628bcb0991SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
21638bcb0991SDimitry Andric   MachineIRBuilder &B) const {
21648bcb0991SDimitry Andric 
21658bcb0991SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
21668bcb0991SDimitry Andric   Register SrcReg = MI.getOperand(1).getReg();
21678bcb0991SDimitry Andric   LLT Ty = MRI.getType(DstReg);
21688bcb0991SDimitry Andric   unsigned Flags = MI.getFlags();
21698bcb0991SDimitry Andric 
21708bcb0991SDimitry Andric   Register TrigVal;
21715ffd83dbSDimitry Andric   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
21728bcb0991SDimitry Andric   if (ST.hasTrigReducedRange()) {
21738bcb0991SDimitry Andric     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
21748bcb0991SDimitry Andric     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
21758bcb0991SDimitry Andric       .addUse(MulVal.getReg(0))
21768bcb0991SDimitry Andric       .setMIFlags(Flags).getReg(0);
21778bcb0991SDimitry Andric   } else
21788bcb0991SDimitry Andric     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
21798bcb0991SDimitry Andric 
21808bcb0991SDimitry Andric   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
21818bcb0991SDimitry Andric     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
21828bcb0991SDimitry Andric   B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
21838bcb0991SDimitry Andric     .addUse(TrigVal)
21848bcb0991SDimitry Andric     .setMIFlags(Flags);
21858bcb0991SDimitry Andric   MI.eraseFromParent();
21868bcb0991SDimitry Andric   return true;
21878bcb0991SDimitry Andric }
21888bcb0991SDimitry Andric 
21895ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
21905ffd83dbSDimitry Andric                                                   MachineIRBuilder &B,
21915ffd83dbSDimitry Andric                                                   const GlobalValue *GV,
21925ffd83dbSDimitry Andric                                                   int64_t Offset,
21935ffd83dbSDimitry Andric                                                   unsigned GAFlags) const {
21945ffd83dbSDimitry Andric   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
21958bcb0991SDimitry Andric   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
21968bcb0991SDimitry Andric   // to the following code sequence:
21978bcb0991SDimitry Andric   //
21988bcb0991SDimitry Andric   // For constant address space:
21998bcb0991SDimitry Andric   //   s_getpc_b64 s[0:1]
22008bcb0991SDimitry Andric   //   s_add_u32 s0, s0, $symbol
22018bcb0991SDimitry Andric   //   s_addc_u32 s1, s1, 0
22028bcb0991SDimitry Andric   //
22038bcb0991SDimitry Andric   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
22048bcb0991SDimitry Andric   //   a fixup or relocation is emitted to replace $symbol with a literal
22058bcb0991SDimitry Andric   //   constant, which is a pc-relative offset from the encoding of the $symbol
22068bcb0991SDimitry Andric   //   operand to the global variable.
22078bcb0991SDimitry Andric   //
22088bcb0991SDimitry Andric   // For global address space:
22098bcb0991SDimitry Andric   //   s_getpc_b64 s[0:1]
22108bcb0991SDimitry Andric   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
22118bcb0991SDimitry Andric   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
22128bcb0991SDimitry Andric   //
22138bcb0991SDimitry Andric   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
22148bcb0991SDimitry Andric   //   fixups or relocations are emitted to replace $symbol@*@lo and
22158bcb0991SDimitry Andric   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
22168bcb0991SDimitry Andric   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
22178bcb0991SDimitry Andric   //   operand to the global variable.
22188bcb0991SDimitry Andric   //
22198bcb0991SDimitry Andric   // What we want here is an offset from the value returned by s_getpc
22208bcb0991SDimitry Andric   // (which is the address of the s_add_u32 instruction) to the global
22218bcb0991SDimitry Andric   // variable, but since the encoding of $symbol starts 4 bytes after the start
22228bcb0991SDimitry Andric   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
22238bcb0991SDimitry Andric   // small. This requires us to add 4 to the global variable offset in order to
2224*e8d8bef9SDimitry Andric   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2225*e8d8bef9SDimitry Andric   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2226*e8d8bef9SDimitry Andric   // instruction.
22278bcb0991SDimitry Andric 
22288bcb0991SDimitry Andric   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
22298bcb0991SDimitry Andric 
22308bcb0991SDimitry Andric   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
22318bcb0991SDimitry Andric     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
22328bcb0991SDimitry Andric 
22338bcb0991SDimitry Andric   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
22348bcb0991SDimitry Andric     .addDef(PCReg);
22358bcb0991SDimitry Andric 
22368bcb0991SDimitry Andric   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
22378bcb0991SDimitry Andric   if (GAFlags == SIInstrInfo::MO_NONE)
22388bcb0991SDimitry Andric     MIB.addImm(0);
22398bcb0991SDimitry Andric   else
2240*e8d8bef9SDimitry Andric     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
22418bcb0991SDimitry Andric 
22428bcb0991SDimitry Andric   B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
22438bcb0991SDimitry Andric 
22448bcb0991SDimitry Andric   if (PtrTy.getSizeInBits() == 32)
22458bcb0991SDimitry Andric     B.buildExtract(DstReg, PCReg, 0);
22468bcb0991SDimitry Andric   return true;
22478bcb0991SDimitry Andric  }
22488bcb0991SDimitry Andric 
22498bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeGlobalValue(
22508bcb0991SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
22518bcb0991SDimitry Andric   MachineIRBuilder &B) const {
22528bcb0991SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
22538bcb0991SDimitry Andric   LLT Ty = MRI.getType(DstReg);
22548bcb0991SDimitry Andric   unsigned AS = Ty.getAddressSpace();
22558bcb0991SDimitry Andric 
22568bcb0991SDimitry Andric   const GlobalValue *GV = MI.getOperand(1).getGlobal();
22578bcb0991SDimitry Andric   MachineFunction &MF = B.getMF();
22588bcb0991SDimitry Andric   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
22598bcb0991SDimitry Andric 
22608bcb0991SDimitry Andric   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2261*e8d8bef9SDimitry Andric     if (!MFI->isModuleEntryFunction()) {
22628bcb0991SDimitry Andric       const Function &Fn = MF.getFunction();
22638bcb0991SDimitry Andric       DiagnosticInfoUnsupported BadLDSDecl(
22645ffd83dbSDimitry Andric         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
22655ffd83dbSDimitry Andric         DS_Warning);
22668bcb0991SDimitry Andric       Fn.getContext().diagnose(BadLDSDecl);
22675ffd83dbSDimitry Andric 
22685ffd83dbSDimitry Andric       // We currently don't have a way to correctly allocate LDS objects that
22695ffd83dbSDimitry Andric       // aren't directly associated with a kernel. We do force inlining of
22705ffd83dbSDimitry Andric       // functions that use local objects. However, if these dead functions are
22715ffd83dbSDimitry Andric       // not eliminated, we don't want a compile time error. Just emit a warning
22725ffd83dbSDimitry Andric       // and a trap, since there should be no callable path here.
22735ffd83dbSDimitry Andric       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
22745ffd83dbSDimitry Andric       B.buildUndef(DstReg);
22755ffd83dbSDimitry Andric       MI.eraseFromParent();
22765ffd83dbSDimitry Andric       return true;
22778bcb0991SDimitry Andric     }
22788bcb0991SDimitry Andric 
22798bcb0991SDimitry Andric     // TODO: We could emit code to handle the initialization somewhere.
22808bcb0991SDimitry Andric     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
22815ffd83dbSDimitry Andric       const SITargetLowering *TLI = ST.getTargetLowering();
22825ffd83dbSDimitry Andric       if (!TLI->shouldUseLDSConstAddress(GV)) {
22835ffd83dbSDimitry Andric         MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
22845ffd83dbSDimitry Andric         return true; // Leave in place;
22855ffd83dbSDimitry Andric       }
22865ffd83dbSDimitry Andric 
2287*e8d8bef9SDimitry Andric       if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2288*e8d8bef9SDimitry Andric         Type *Ty = GV->getValueType();
2289*e8d8bef9SDimitry Andric         // HIP uses an unsized array `extern __shared__ T s[]` or similar
2290*e8d8bef9SDimitry Andric         // zero-sized type in other languages to declare the dynamic shared
2291*e8d8bef9SDimitry Andric         // memory which size is not known at the compile time. They will be
2292*e8d8bef9SDimitry Andric         // allocated by the runtime and placed directly after the static
2293*e8d8bef9SDimitry Andric         // allocated ones. They all share the same offset.
2294*e8d8bef9SDimitry Andric         if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2295*e8d8bef9SDimitry Andric           // Adjust alignment for that dynamic shared memory array.
2296*e8d8bef9SDimitry Andric           MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2297*e8d8bef9SDimitry Andric           LLT S32 = LLT::scalar(32);
2298*e8d8bef9SDimitry Andric           auto Sz =
2299*e8d8bef9SDimitry Andric               B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2300*e8d8bef9SDimitry Andric           B.buildIntToPtr(DstReg, Sz);
2301*e8d8bef9SDimitry Andric           MI.eraseFromParent();
2302*e8d8bef9SDimitry Andric           return true;
2303*e8d8bef9SDimitry Andric         }
2304*e8d8bef9SDimitry Andric       }
2305*e8d8bef9SDimitry Andric 
23065ffd83dbSDimitry Andric       B.buildConstant(
23075ffd83dbSDimitry Andric           DstReg,
23085ffd83dbSDimitry Andric           MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
23098bcb0991SDimitry Andric       MI.eraseFromParent();
23108bcb0991SDimitry Andric       return true;
23118bcb0991SDimitry Andric     }
23128bcb0991SDimitry Andric 
23138bcb0991SDimitry Andric     const Function &Fn = MF.getFunction();
23148bcb0991SDimitry Andric     DiagnosticInfoUnsupported BadInit(
23158bcb0991SDimitry Andric       Fn, "unsupported initializer for address space", MI.getDebugLoc());
23168bcb0991SDimitry Andric     Fn.getContext().diagnose(BadInit);
23178bcb0991SDimitry Andric     return true;
23188bcb0991SDimitry Andric   }
23198bcb0991SDimitry Andric 
23208bcb0991SDimitry Andric   const SITargetLowering *TLI = ST.getTargetLowering();
23218bcb0991SDimitry Andric 
23228bcb0991SDimitry Andric   if (TLI->shouldEmitFixup(GV)) {
23238bcb0991SDimitry Andric     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
23248bcb0991SDimitry Andric     MI.eraseFromParent();
23258bcb0991SDimitry Andric     return true;
23268bcb0991SDimitry Andric   }
23278bcb0991SDimitry Andric 
23288bcb0991SDimitry Andric   if (TLI->shouldEmitPCReloc(GV)) {
23298bcb0991SDimitry Andric     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
23308bcb0991SDimitry Andric     MI.eraseFromParent();
23318bcb0991SDimitry Andric     return true;
23328bcb0991SDimitry Andric   }
23338bcb0991SDimitry Andric 
23348bcb0991SDimitry Andric   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
23358bcb0991SDimitry Andric   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
23368bcb0991SDimitry Andric 
23378bcb0991SDimitry Andric   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
23388bcb0991SDimitry Andric       MachinePointerInfo::getGOT(MF),
23398bcb0991SDimitry Andric       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
23408bcb0991SDimitry Andric           MachineMemOperand::MOInvariant,
23415ffd83dbSDimitry Andric       8 /*Size*/, Align(8));
23428bcb0991SDimitry Andric 
23438bcb0991SDimitry Andric   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
23448bcb0991SDimitry Andric 
23458bcb0991SDimitry Andric   if (Ty.getSizeInBits() == 32) {
23468bcb0991SDimitry Andric     // Truncate if this is a 32-bit constant adrdess.
23478bcb0991SDimitry Andric     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
23488bcb0991SDimitry Andric     B.buildExtract(DstReg, Load, 0);
23498bcb0991SDimitry Andric   } else
23508bcb0991SDimitry Andric     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
23518bcb0991SDimitry Andric 
23528bcb0991SDimitry Andric   MI.eraseFromParent();
23538bcb0991SDimitry Andric   return true;
23548bcb0991SDimitry Andric }
23558bcb0991SDimitry Andric 
2356*e8d8bef9SDimitry Andric static LLT widenToNextPowerOf2(LLT Ty) {
2357*e8d8bef9SDimitry Andric   if (Ty.isVector())
2358*e8d8bef9SDimitry Andric     return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements()));
2359*e8d8bef9SDimitry Andric   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2360*e8d8bef9SDimitry Andric }
2361*e8d8bef9SDimitry Andric 
2362*e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2363*e8d8bef9SDimitry Andric                                        MachineInstr &MI) const {
2364*e8d8bef9SDimitry Andric   MachineIRBuilder &B = Helper.MIRBuilder;
2365*e8d8bef9SDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
2366*e8d8bef9SDimitry Andric   GISelChangeObserver &Observer = Helper.Observer;
2367*e8d8bef9SDimitry Andric 
2368*e8d8bef9SDimitry Andric   Register PtrReg = MI.getOperand(1).getReg();
2369*e8d8bef9SDimitry Andric   LLT PtrTy = MRI.getType(PtrReg);
2370*e8d8bef9SDimitry Andric   unsigned AddrSpace = PtrTy.getAddressSpace();
2371*e8d8bef9SDimitry Andric 
2372*e8d8bef9SDimitry Andric   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
23738bcb0991SDimitry Andric     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2374*e8d8bef9SDimitry Andric     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
23758bcb0991SDimitry Andric     Observer.changingInstr(MI);
23768bcb0991SDimitry Andric     MI.getOperand(1).setReg(Cast.getReg(0));
23778bcb0991SDimitry Andric     Observer.changedInstr(MI);
23788bcb0991SDimitry Andric     return true;
23798bcb0991SDimitry Andric   }
23808bcb0991SDimitry Andric 
2381*e8d8bef9SDimitry Andric   Register ValReg = MI.getOperand(0).getReg();
2382*e8d8bef9SDimitry Andric   LLT ValTy = MRI.getType(ValReg);
2383*e8d8bef9SDimitry Andric 
2384*e8d8bef9SDimitry Andric   MachineMemOperand *MMO = *MI.memoperands_begin();
2385*e8d8bef9SDimitry Andric   const unsigned ValSize = ValTy.getSizeInBits();
2386*e8d8bef9SDimitry Andric   const unsigned MemSize = 8 * MMO->getSize();
2387*e8d8bef9SDimitry Andric   const Align MemAlign = MMO->getAlign();
2388*e8d8bef9SDimitry Andric   const unsigned AlignInBits = 8 * MemAlign.value();
2389*e8d8bef9SDimitry Andric 
2390*e8d8bef9SDimitry Andric   // Widen non-power-of-2 loads to the alignment if needed
2391*e8d8bef9SDimitry Andric   if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) {
2392*e8d8bef9SDimitry Andric     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2393*e8d8bef9SDimitry Andric 
2394*e8d8bef9SDimitry Andric     // This was already the correct extending load result type, so just adjust
2395*e8d8bef9SDimitry Andric     // the memory type.
2396*e8d8bef9SDimitry Andric     if (WideMemSize == ValSize) {
2397*e8d8bef9SDimitry Andric       MachineFunction &MF = B.getMF();
2398*e8d8bef9SDimitry Andric 
2399*e8d8bef9SDimitry Andric       MachineMemOperand *WideMMO =
2400*e8d8bef9SDimitry Andric           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2401*e8d8bef9SDimitry Andric       Observer.changingInstr(MI);
2402*e8d8bef9SDimitry Andric       MI.setMemRefs(MF, {WideMMO});
2403*e8d8bef9SDimitry Andric       Observer.changedInstr(MI);
2404*e8d8bef9SDimitry Andric       return true;
2405*e8d8bef9SDimitry Andric     }
2406*e8d8bef9SDimitry Andric 
2407*e8d8bef9SDimitry Andric     // Don't bother handling edge case that should probably never be produced.
2408*e8d8bef9SDimitry Andric     if (ValSize > WideMemSize)
2409*e8d8bef9SDimitry Andric       return false;
2410*e8d8bef9SDimitry Andric 
2411*e8d8bef9SDimitry Andric     LLT WideTy = widenToNextPowerOf2(ValTy);
2412*e8d8bef9SDimitry Andric 
2413*e8d8bef9SDimitry Andric     Register WideLoad;
2414*e8d8bef9SDimitry Andric     if (!WideTy.isVector()) {
2415*e8d8bef9SDimitry Andric       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2416*e8d8bef9SDimitry Andric       B.buildTrunc(ValReg, WideLoad).getReg(0);
2417*e8d8bef9SDimitry Andric     } else {
2418*e8d8bef9SDimitry Andric       // Extract the subvector.
2419*e8d8bef9SDimitry Andric 
2420*e8d8bef9SDimitry Andric       if (isRegisterType(ValTy)) {
2421*e8d8bef9SDimitry Andric         // If this a case where G_EXTRACT is legal, use it.
2422*e8d8bef9SDimitry Andric         // (e.g. <3 x s32> -> <4 x s32>)
2423*e8d8bef9SDimitry Andric         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2424*e8d8bef9SDimitry Andric         B.buildExtract(ValReg, WideLoad, 0);
2425*e8d8bef9SDimitry Andric       } else {
2426*e8d8bef9SDimitry Andric         // For cases where the widened type isn't a nice register value, unmerge
2427*e8d8bef9SDimitry Andric         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
2428*e8d8bef9SDimitry Andric         B.setInsertPt(B.getMBB(), ++B.getInsertPt());
2429*e8d8bef9SDimitry Andric         WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
2430*e8d8bef9SDimitry Andric         B.setInsertPt(B.getMBB(), MI.getIterator());
2431*e8d8bef9SDimitry Andric         B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
2432*e8d8bef9SDimitry Andric       }
2433*e8d8bef9SDimitry Andric     }
2434*e8d8bef9SDimitry Andric 
2435*e8d8bef9SDimitry Andric     MI.eraseFromParent();
2436*e8d8bef9SDimitry Andric     return true;
2437*e8d8bef9SDimitry Andric   }
2438*e8d8bef9SDimitry Andric 
2439*e8d8bef9SDimitry Andric   return false;
2440*e8d8bef9SDimitry Andric }
2441*e8d8bef9SDimitry Andric 
24428bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFMad(
24438bcb0991SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
24448bcb0991SDimitry Andric   MachineIRBuilder &B) const {
24458bcb0991SDimitry Andric   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
24468bcb0991SDimitry Andric   assert(Ty.isScalar());
24478bcb0991SDimitry Andric 
2448480093f4SDimitry Andric   MachineFunction &MF = B.getMF();
2449480093f4SDimitry Andric   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2450480093f4SDimitry Andric 
24518bcb0991SDimitry Andric   // TODO: Always legal with future ftz flag.
24525ffd83dbSDimitry Andric   // FIXME: Do we need just output?
24535ffd83dbSDimitry Andric   if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
24548bcb0991SDimitry Andric     return true;
24555ffd83dbSDimitry Andric   if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
24568bcb0991SDimitry Andric     return true;
24578bcb0991SDimitry Andric 
24588bcb0991SDimitry Andric   MachineIRBuilder HelperBuilder(MI);
24598bcb0991SDimitry Andric   GISelObserverWrapper DummyObserver;
24608bcb0991SDimitry Andric   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
24618bcb0991SDimitry Andric   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
24628bcb0991SDimitry Andric }
24638bcb0991SDimitry Andric 
2464480093f4SDimitry Andric bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2465480093f4SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2466480093f4SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
2467480093f4SDimitry Andric   Register PtrReg = MI.getOperand(1).getReg();
2468480093f4SDimitry Andric   Register CmpVal = MI.getOperand(2).getReg();
2469480093f4SDimitry Andric   Register NewVal = MI.getOperand(3).getReg();
2470480093f4SDimitry Andric 
2471*e8d8bef9SDimitry Andric   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2472480093f4SDimitry Andric          "this should not have been custom lowered");
2473480093f4SDimitry Andric 
2474480093f4SDimitry Andric   LLT ValTy = MRI.getType(CmpVal);
2475480093f4SDimitry Andric   LLT VecTy = LLT::vector(2, ValTy);
2476480093f4SDimitry Andric 
2477480093f4SDimitry Andric   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
2478480093f4SDimitry Andric 
2479480093f4SDimitry Andric   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
2480480093f4SDimitry Andric     .addDef(DstReg)
2481480093f4SDimitry Andric     .addUse(PtrReg)
2482480093f4SDimitry Andric     .addUse(PackedVal)
2483480093f4SDimitry Andric     .setMemRefs(MI.memoperands());
2484480093f4SDimitry Andric 
2485480093f4SDimitry Andric   MI.eraseFromParent();
2486480093f4SDimitry Andric   return true;
2487480093f4SDimitry Andric }
2488480093f4SDimitry Andric 
24895ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeFlog(
24905ffd83dbSDimitry Andric   MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
24915ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
24925ffd83dbSDimitry Andric   Register Src = MI.getOperand(1).getReg();
24935ffd83dbSDimitry Andric   LLT Ty = B.getMRI()->getType(Dst);
24945ffd83dbSDimitry Andric   unsigned Flags = MI.getFlags();
24955ffd83dbSDimitry Andric 
24965ffd83dbSDimitry Andric   auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
24975ffd83dbSDimitry Andric   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
24985ffd83dbSDimitry Andric 
24995ffd83dbSDimitry Andric   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
25005ffd83dbSDimitry Andric   MI.eraseFromParent();
25015ffd83dbSDimitry Andric   return true;
25025ffd83dbSDimitry Andric }
25035ffd83dbSDimitry Andric 
25045ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
25055ffd83dbSDimitry Andric                                        MachineIRBuilder &B) const {
25065ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
25075ffd83dbSDimitry Andric   Register Src = MI.getOperand(1).getReg();
25085ffd83dbSDimitry Andric   unsigned Flags = MI.getFlags();
25095ffd83dbSDimitry Andric   LLT Ty = B.getMRI()->getType(Dst);
25105ffd83dbSDimitry Andric 
25115ffd83dbSDimitry Andric   auto K = B.buildFConstant(Ty, numbers::log2e);
25125ffd83dbSDimitry Andric   auto Mul = B.buildFMul(Ty, Src, K, Flags);
25135ffd83dbSDimitry Andric   B.buildFExp2(Dst, Mul, Flags);
25145ffd83dbSDimitry Andric   MI.eraseFromParent();
25155ffd83dbSDimitry Andric   return true;
25165ffd83dbSDimitry Andric }
25175ffd83dbSDimitry Andric 
25185ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
25195ffd83dbSDimitry Andric                                        MachineIRBuilder &B) const {
25205ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
25215ffd83dbSDimitry Andric   Register Src0 = MI.getOperand(1).getReg();
25225ffd83dbSDimitry Andric   Register Src1 = MI.getOperand(2).getReg();
25235ffd83dbSDimitry Andric   unsigned Flags = MI.getFlags();
25245ffd83dbSDimitry Andric   LLT Ty = B.getMRI()->getType(Dst);
25255ffd83dbSDimitry Andric   const LLT S16 = LLT::scalar(16);
25265ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
25275ffd83dbSDimitry Andric 
25285ffd83dbSDimitry Andric   if (Ty == S32) {
25295ffd83dbSDimitry Andric     auto Log = B.buildFLog2(S32, Src0, Flags);
25305ffd83dbSDimitry Andric     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
25315ffd83dbSDimitry Andric       .addUse(Log.getReg(0))
25325ffd83dbSDimitry Andric       .addUse(Src1)
25335ffd83dbSDimitry Andric       .setMIFlags(Flags);
25345ffd83dbSDimitry Andric     B.buildFExp2(Dst, Mul, Flags);
25355ffd83dbSDimitry Andric   } else if (Ty == S16) {
25365ffd83dbSDimitry Andric     // There's no f16 fmul_legacy, so we need to convert for it.
25375ffd83dbSDimitry Andric     auto Log = B.buildFLog2(S16, Src0, Flags);
25385ffd83dbSDimitry Andric     auto Ext0 = B.buildFPExt(S32, Log, Flags);
25395ffd83dbSDimitry Andric     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
25405ffd83dbSDimitry Andric     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
25415ffd83dbSDimitry Andric       .addUse(Ext0.getReg(0))
25425ffd83dbSDimitry Andric       .addUse(Ext1.getReg(0))
25435ffd83dbSDimitry Andric       .setMIFlags(Flags);
25445ffd83dbSDimitry Andric 
25455ffd83dbSDimitry Andric     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
25465ffd83dbSDimitry Andric   } else
25475ffd83dbSDimitry Andric     return false;
25485ffd83dbSDimitry Andric 
25495ffd83dbSDimitry Andric   MI.eraseFromParent();
25505ffd83dbSDimitry Andric   return true;
25515ffd83dbSDimitry Andric }
25525ffd83dbSDimitry Andric 
25535ffd83dbSDimitry Andric // Find a source register, ignoring any possible source modifiers.
25545ffd83dbSDimitry Andric static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
25555ffd83dbSDimitry Andric   Register ModSrc = OrigSrc;
25565ffd83dbSDimitry Andric   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
25575ffd83dbSDimitry Andric     ModSrc = SrcFNeg->getOperand(1).getReg();
25585ffd83dbSDimitry Andric     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
25595ffd83dbSDimitry Andric       ModSrc = SrcFAbs->getOperand(1).getReg();
25605ffd83dbSDimitry Andric   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
25615ffd83dbSDimitry Andric     ModSrc = SrcFAbs->getOperand(1).getReg();
25625ffd83dbSDimitry Andric   return ModSrc;
25635ffd83dbSDimitry Andric }
25645ffd83dbSDimitry Andric 
// Lower G_FFLOOR of f64 on subtargets where V_FRACT_F64 is buggy (SI).
// Emits x - fract(x), with the fract result clamped and NaN-corrected to work
// around the hardware bug. Always succeeds and erases MI.
bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);
  Register Dst = MI.getOperand(0).getReg();
  Register OrigSrc = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();
  assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
         "this should not have been custom lowered");

  // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
  // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
  // efficient way to implement it is using V_FRACT_F64. The workaround for the
  // V_FRACT bug is:
  //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
  //
  // Convert floor(x) to (x - fract(x))

  auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
    .addUse(OrigSrc)
    .setMIFlags(Flags);

  // Give source modifier matching some assistance before obscuring a foldable
  // pattern.

  // TODO: We can avoid the neg on the fract? The input sign to fract
  // shouldn't matter?
  Register ModSrc = stripAnySourceMods(OrigSrc, MRI);

  // 0x3fefffffffffffff is the largest double < 1.0, the clamp value above.
  auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));

  Register Min = MRI.createGenericVirtualRegister(S64);

  // We don't need to concern ourselves with the snan handling difference, so
  // use the one which will directly select.
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (MFI->getMode().IEEE)
    B.buildFMinNumIEEE(Min, Fract, Const, Flags);
  else
    B.buildFMinNum(Min, Fract, Const, Flags);

  Register CorrectedFract = Min;
  if (!MI.getFlag(MachineInstr::FmNoNans)) {
    // NOTE(review): FCMP_ORD yields true when ModSrc is *not* NaN, so despite
    // its name, IsNan is really "is ordered", and the select then returns
    // ModSrc for ordinary (non-NaN) inputs and Min for NaN inputs — the
    // opposite of the "isnan(x) ? x : min(...)" formula above. Confirm against
    // the SelectionDAG lowering whether the predicate should be FCMP_UNO (or
    // the select operands swapped).
    auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
    CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
  }

  // floor(x) = x + (-fract(x)); fneg+fadd instead of fsub helps source
  // modifier folding.
  auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
  B.buildFAdd(Dst, OrigSrc, NegFract, Flags);

  MI.eraseFromParent();
  return true;
}
26205ffd83dbSDimitry Andric 
26215ffd83dbSDimitry Andric // Turn an illegal packed v2s16 build vector into bit operations.
26225ffd83dbSDimitry Andric // TODO: This should probably be a bitcast action in LegalizerHelper.
26235ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeBuildVector(
26245ffd83dbSDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
26255ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
26265ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
26275ffd83dbSDimitry Andric   assert(MRI.getType(Dst) == LLT::vector(2, 16));
26285ffd83dbSDimitry Andric 
26295ffd83dbSDimitry Andric   Register Src0 = MI.getOperand(1).getReg();
26305ffd83dbSDimitry Andric   Register Src1 = MI.getOperand(2).getReg();
26315ffd83dbSDimitry Andric   assert(MRI.getType(Src0) == LLT::scalar(16));
26325ffd83dbSDimitry Andric 
26335ffd83dbSDimitry Andric   auto Merge = B.buildMerge(S32, {Src0, Src1});
26345ffd83dbSDimitry Andric   B.buildBitcast(Dst, Merge);
26355ffd83dbSDimitry Andric 
26365ffd83dbSDimitry Andric   MI.eraseFromParent();
26375ffd83dbSDimitry Andric   return true;
26385ffd83dbSDimitry Andric }
26395ffd83dbSDimitry Andric 
2640*e8d8bef9SDimitry Andric // Check that this is a G_XOR x, -1
2641*e8d8bef9SDimitry Andric static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
2642*e8d8bef9SDimitry Andric   if (MI.getOpcode() != TargetOpcode::G_XOR)
2643*e8d8bef9SDimitry Andric     return false;
2644*e8d8bef9SDimitry Andric   auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
2645*e8d8bef9SDimitry Andric   return ConstVal && *ConstVal == -1;
2646*e8d8bef9SDimitry Andric }
2647*e8d8bef9SDimitry Andric 
// Return the use branch instruction, otherwise null if the usage is invalid.
//
// The control-flow intrinsic defines a condition (operand 0) that must feed
// exactly one non-debug user: a G_BRCOND in the same block, optionally through
// a single one-use "not" (G_XOR x, -1). On success:
//  * the returned instruction is the G_BRCOND user;
//  * Br is set to the unconditional G_BR following the G_BRCOND, if any;
//  * UncondBrTarget is the fall-through destination — the G_BR's target, or
//    the next block in layout order when the G_BRCOND ends the block;
//  * Negated is set when the condition was inverted through a not.
// Note: when a not is matched, it is erased as a side effect — even if a
// later check fails and nullptr is returned.
static MachineInstr *
verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
                  MachineBasicBlock *&UncondBrTarget, bool &Negated) {
  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))
    return nullptr;

  MachineBasicBlock *Parent = MI.getParent();
  // The single non-debug user of the condition.
  MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);

  if (isNot(MRI, *UseMI)) {
    Register NegatedCond = UseMI->getOperand(0).getReg();
    if (!MRI.hasOneNonDBGUse(NegatedCond))
      return nullptr;

    // We're deleting the def of this value, so we need to remove it.
    UseMI->eraseFromParent();

    // Follow through the not to its single user.
    UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
    Negated = true;
  }

  if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
    return nullptr;

  // Make sure the cond br is followed by a G_BR, or is the last instruction.
  MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
  if (Next == Parent->end()) {
    // Block-ending G_BRCOND falls through to the next block in layout order.
    MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
    if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
      return nullptr;
    UncondBrTarget = &*NextMBB;
  } else {
    if (Next->getOpcode() != AMDGPU::G_BR)
      return nullptr;
    Br = &*Next;
    UncondBrTarget = Br->getOperand(0).getMBB();
  }

  return UseMI;
}
26900b57cec5SDimitry Andric 
26910b57cec5SDimitry Andric bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
2692*e8d8bef9SDimitry Andric                                          const ArgDescriptor *Arg,
2693*e8d8bef9SDimitry Andric                                          const TargetRegisterClass *ArgRC,
2694*e8d8bef9SDimitry Andric                                          LLT ArgTy) const {
2695*e8d8bef9SDimitry Andric   MCRegister SrcReg = Arg->getRegister();
2696*e8d8bef9SDimitry Andric   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
26975ffd83dbSDimitry Andric   assert(DstReg.isVirtual() && "Virtual register expected");
26980b57cec5SDimitry Andric 
2699*e8d8bef9SDimitry Andric   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
2700*e8d8bef9SDimitry Andric                                              ArgTy);
27010b57cec5SDimitry Andric   if (Arg->isMasked()) {
27020b57cec5SDimitry Andric     // TODO: Should we try to emit this once in the entry block?
27030b57cec5SDimitry Andric     const LLT S32 = LLT::scalar(32);
27040b57cec5SDimitry Andric     const unsigned Mask = Arg->getMask();
27050b57cec5SDimitry Andric     const unsigned Shift = countTrailingZeros<unsigned>(Mask);
27060b57cec5SDimitry Andric 
27078bcb0991SDimitry Andric     Register AndMaskSrc = LiveIn;
27088bcb0991SDimitry Andric 
27098bcb0991SDimitry Andric     if (Shift != 0) {
27100b57cec5SDimitry Andric       auto ShiftAmt = B.buildConstant(S32, Shift);
27118bcb0991SDimitry Andric       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
27128bcb0991SDimitry Andric     }
27138bcb0991SDimitry Andric 
27148bcb0991SDimitry Andric     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
27155ffd83dbSDimitry Andric   } else {
27160b57cec5SDimitry Andric     B.buildCopy(DstReg, LiveIn);
27170b57cec5SDimitry Andric   }
27180b57cec5SDimitry Andric 
27190b57cec5SDimitry Andric   return true;
27200b57cec5SDimitry Andric }
27210b57cec5SDimitry Andric 
2722*e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::loadInputValue(
2723*e8d8bef9SDimitry Andric     Register DstReg, MachineIRBuilder &B,
2724*e8d8bef9SDimitry Andric     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2725*e8d8bef9SDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
2726*e8d8bef9SDimitry Andric   const ArgDescriptor *Arg;
2727*e8d8bef9SDimitry Andric   const TargetRegisterClass *ArgRC;
2728*e8d8bef9SDimitry Andric   LLT ArgTy;
2729*e8d8bef9SDimitry Andric   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
2730*e8d8bef9SDimitry Andric 
2731*e8d8bef9SDimitry Andric   if (!Arg->isRegister() || !Arg->getRegister().isValid())
2732*e8d8bef9SDimitry Andric     return false; // TODO: Handle these
2733*e8d8bef9SDimitry Andric   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
2734*e8d8bef9SDimitry Andric }
2735*e8d8bef9SDimitry Andric 
27360b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
27375ffd83dbSDimitry Andric     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
27380b57cec5SDimitry Andric     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
2739*e8d8bef9SDimitry Andric   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
27405ffd83dbSDimitry Andric     return false;
27415ffd83dbSDimitry Andric 
27420b57cec5SDimitry Andric   MI.eraseFromParent();
27430b57cec5SDimitry Andric   return true;
27440b57cec5SDimitry Andric }
27450b57cec5SDimitry Andric 
27468bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
27478bcb0991SDimitry Andric                                        MachineRegisterInfo &MRI,
27488bcb0991SDimitry Andric                                        MachineIRBuilder &B) const {
2749480093f4SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
2750480093f4SDimitry Andric   LLT DstTy = MRI.getType(Dst);
2751480093f4SDimitry Andric   LLT S16 = LLT::scalar(16);
2752480093f4SDimitry Andric   LLT S32 = LLT::scalar(32);
2753480093f4SDimitry Andric   LLT S64 = LLT::scalar(64);
27548bcb0991SDimitry Andric 
2755480093f4SDimitry Andric   if (DstTy == S16)
2756480093f4SDimitry Andric     return legalizeFDIV16(MI, MRI, B);
2757480093f4SDimitry Andric   if (DstTy == S32)
2758480093f4SDimitry Andric     return legalizeFDIV32(MI, MRI, B);
2759480093f4SDimitry Andric   if (DstTy == S64)
2760480093f4SDimitry Andric     return legalizeFDIV64(MI, MRI, B);
2761480093f4SDimitry Andric 
27628bcb0991SDimitry Andric   return false;
27638bcb0991SDimitry Andric }
27648bcb0991SDimitry Andric 
// Expand 32-bit unsigned division (IsDiv) or remainder (!IsDiv) of X by Y
// into DstReg, using a float-reciprocal initial estimate followed by one
// Newton-Raphson iteration and two quotient/remainder refinement steps.
void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
                                                  Register DstReg,
                                                  Register X,
                                                  Register Y,
                                                  bool IsDiv) const {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);

  // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
  // algorithm used here.

  // Initial estimate of inv(y). 0x4f7ffffe is the bit pattern of a float just
  // below 2^32, scaling the reciprocal into fixed-point range.
  auto FloatY = B.buildUITOFP(S32, Y);
  auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
  auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
  auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
  auto Z = B.buildFPTOUI(S32, ScaledY);

  // One round of UNR (unsigned integer Newton-Raphson) to sharpen Z.
  auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
  auto NegYZ = B.buildMul(S32, NegY, Z);
  Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));

  // Quotient/remainder estimate: Q = umulh(X, Z), R = X - Q*Y.
  auto Q = B.buildUMulH(S32, X, Z);
  auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));

  // First quotient/remainder refinement: if R >= Y, bump Q and reduce R.
  auto One = B.buildConstant(S32, 1);
  auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
  if (IsDiv)
    Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
  R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);

  // Second quotient/remainder refinement; the final select writes DstReg.
  Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
  if (IsDiv)
    B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
  else
    B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
}
28065ffd83dbSDimitry Andric 
28075ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
28085ffd83dbSDimitry Andric                                               MachineRegisterInfo &MRI,
28095ffd83dbSDimitry Andric                                               MachineIRBuilder &B) const {
28105ffd83dbSDimitry Andric   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
28115ffd83dbSDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
28125ffd83dbSDimitry Andric   Register Num = MI.getOperand(1).getReg();
28135ffd83dbSDimitry Andric   Register Den = MI.getOperand(2).getReg();
28145ffd83dbSDimitry Andric   legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
28155ffd83dbSDimitry Andric   MI.eraseFromParent();
28165ffd83dbSDimitry Andric   return true;
28175ffd83dbSDimitry Andric }
28185ffd83dbSDimitry Andric 
// Build integer reciprocal sequence around V_RCP_IFLAG_F32
//
// Return lo, hi of result
//
// %cvt.lo = G_UITOFP Val.lo
// %cvt.hi = G_UITOFP Val.hi
// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
// %rcp = G_AMDGPU_RCP_IFLAG %mad
// %mul1 = G_FMUL %rcp, 0x5f7ffffc
// %mul2 = G_FMUL %mul1, 2**(-32)
// %trunc = G_INTRINSIC_TRUNC %mul2
// %mad2 = G_FMAD %trunc, -(2**32), %mul1
// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
//
// The magic float constants below are given as bit patterns; their values are
// noted in the inline comments.
static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
                                                       Register Val) {
  const LLT S32 = LLT::scalar(32);
  // Split the 64-bit value into its two 32-bit halves.
  auto Unmerge = B.buildUnmerge(S32, Val);

  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));

  auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
                         B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);

  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
  auto Mul1 =
      B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));

  // 2**(-32)
  auto Mul2 =
      B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
  auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);

  // -(2**32)
  auto Mad2 = B.buildFMAD(S32, Trunc,
                          B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);

  auto ResultLo = B.buildFPTOUI(S32, Mad2);
  auto ResultHi = B.buildFPTOUI(S32, Trunc);

  return {ResultLo.getReg(0), ResultHi.getReg(0)};
}
28615ffd83dbSDimitry Andric 
// Expand 64-bit unsigned division (IsDiv) or remainder (!IsDiv) of Numer by
// Denom into DstReg. Computes a fixed-point reciprocal of the denominator
// (refined with two Newton-Raphson-style rounds over 32-bit carry chains),
// derives a quotient estimate, then applies up to two conditional correction
// steps, materialized as selects rather than control flow.
void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
                                                  Register DstReg,
                                                  Register Numer,
                                                  Register Denom,
                                                  bool IsDiv) const {
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S1 = LLT::scalar(1);
  Register RcpLo, RcpHi;

  // Initial reciprocal estimate of the denominator, as two 32-bit halves.
  std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);

  auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});

  auto Zero64 = B.buildConstant(S64, 0);
  auto NegDenom = B.buildSub(S64, Zero64, Denom);

  // First refinement round: Rcp += Rcp * umulh(Rcp, -Denom * Rcp),
  // carried out on 32-bit halves with explicit carry propagation.
  auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
  auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);

  auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
  Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
  Register MulHi1_Hi = UnmergeMulHi1.getReg(1);

  auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
  auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
  // High half without the carry in, needed by the next round's carry chain.
  auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
  auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});

  // Second refinement round, same shape as the first.
  auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
  auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
  auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
  Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
  Register MulHi2_Hi = UnmergeMulHi2.getReg(1);

  auto Zero32 = B.buildConstant(S32, 0);
  auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
  auto Add2_HiC =
      B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
  auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
  auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});

  auto UnmergeNumer = B.buildUnmerge(S32, Numer);
  Register NumerLo = UnmergeNumer.getReg(0);
  Register NumerHi = UnmergeNumer.getReg(1);

  // Quotient estimate MulHi3 = umulh(Numer, Rcp'); Sub1 = Numer - MulHi3*Denom
  // is the corresponding remainder estimate.
  auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
  auto Mul3 = B.buildMul(S64, Denom, MulHi3);
  auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
  Register Mul3_Lo = UnmergeMul3.getReg(0);
  Register Mul3_Hi = UnmergeMul3.getReg(1);
  auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
  auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
  // Mid-value without the borrow, used to seed the next subtraction chain.
  auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
  auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});

  auto UnmergeDenom = B.buildUnmerge(S32, Denom);
  Register DenomLo = UnmergeDenom.getReg(0);
  Register DenomHi = UnmergeDenom.getReg(1);

  // C3 = -1 iff Sub1 >= Denom (64-bit compare assembled from two 32-bit
  // compares: high halves decide, low halves break the tie on equality).
  auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
  auto C1 = B.buildSExt(S32, CmpHi);

  auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
  auto C2 = B.buildSExt(S32, CmpLo);

  auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
  auto C3 = B.buildSelect(S32, CmpEq, C2, C1);

  // TODO: Here and below portions of the code can be enclosed into if/endif.
  // Currently control flow is unconditional and we have 4 selects after
  // potential endif to substitute PHIs.

  // if C3 != 0 ...
  // First correction: remainder -= Denom, quotient += 1.
  auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
  auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
  auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
  auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});

  auto One64 = B.buildConstant(S64, 1);
  auto Add3 = B.buildAdd(S64, MulHi3, One64);

  // C6 = -1 iff Sub2 >= Denom, same two-halves compare as C3.
  auto C4 =
      B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
  auto C5 =
      B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
  auto C6 = B.buildSelect(
      S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);

  // if (C6 != 0)
  // Second correction: another remainder -= Denom, quotient += 1.
  auto Add4 = B.buildAdd(S64, Add3, One64);
  auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);

  auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
  auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
  auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});

  // endif C6
  // endif C3

  // Materialize the final quotient (div) or remainder (rem) by selecting
  // among the zero-, one-, and two-correction results.
  if (IsDiv) {
    auto Sel1 = B.buildSelect(
        S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
    B.buildSelect(DstReg,
                  B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
  } else {
    auto Sel2 = B.buildSelect(
        S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
    B.buildSelect(DstReg,
                  B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
  }
}
29745ffd83dbSDimitry Andric 
29755ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
29765ffd83dbSDimitry Andric                                             MachineRegisterInfo &MRI,
29775ffd83dbSDimitry Andric                                             MachineIRBuilder &B) const {
29785ffd83dbSDimitry Andric   const LLT S64 = LLT::scalar(64);
29795ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
29805ffd83dbSDimitry Andric   const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
29815ffd83dbSDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
29825ffd83dbSDimitry Andric   Register Num = MI.getOperand(1).getReg();
29835ffd83dbSDimitry Andric   Register Den = MI.getOperand(2).getReg();
29845ffd83dbSDimitry Andric   LLT Ty = MRI.getType(DstReg);
29855ffd83dbSDimitry Andric 
29865ffd83dbSDimitry Andric   if (Ty == S32)
29875ffd83dbSDimitry Andric     legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
29885ffd83dbSDimitry Andric   else if (Ty == S64)
29895ffd83dbSDimitry Andric     legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
29905ffd83dbSDimitry Andric   else
29915ffd83dbSDimitry Andric     return false;
29925ffd83dbSDimitry Andric 
29935ffd83dbSDimitry Andric   MI.eraseFromParent();
29945ffd83dbSDimitry Andric   return true;
29955ffd83dbSDimitry Andric 
29965ffd83dbSDimitry Andric }
29975ffd83dbSDimitry Andric 
// Lower G_SDIV/G_SREM in terms of the unsigned expansion: take absolute
// values of both operands, do the unsigned divide/remainder, then restore the
// sign of the result. Only s32 and s64 are handled.
bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  Register DstReg = MI.getOperand(0).getReg();
  const LLT Ty = MRI.getType(DstReg);
  if (Ty != S32 && Ty != S64)
    return false;

  const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;

  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  // sign(x) = x >> (bitwidth-1): all-ones for negative x, zero otherwise.
  auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
  auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
  auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);

  // abs(x) = (x + sign) ^ sign — a branchless two's-complement negate when
  // the sign mask is all-ones, a no-op when it is zero.
  LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);

  LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);

  Register UDivRem = MRI.createGenericVirtualRegister(Ty);
  if (Ty == S32)
    legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
  else
    legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);

  // Quotient is negative iff operand signs differ; remainder takes the sign
  // of the dividend (LHS).
  Register Sign;
  if (IsDiv)
    Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
  else
    Sign = LHSign.getReg(0); // Remainder sign is the same as LHS

  // Apply the sign with the inverse of the abs trick: (x ^ sign) - sign.
  UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
  B.buildSub(DstReg, UDivRem, Sign);

  MI.eraseFromParent();
  return true;
}
30425ffd83dbSDimitry Andric 
30438bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
30448bcb0991SDimitry Andric                                                  MachineRegisterInfo &MRI,
30458bcb0991SDimitry Andric                                                  MachineIRBuilder &B) const {
30468bcb0991SDimitry Andric   Register Res = MI.getOperand(0).getReg();
30478bcb0991SDimitry Andric   Register LHS = MI.getOperand(1).getReg();
30488bcb0991SDimitry Andric   Register RHS = MI.getOperand(2).getReg();
30498bcb0991SDimitry Andric   uint16_t Flags = MI.getFlags();
30508bcb0991SDimitry Andric   LLT ResTy = MRI.getType(Res);
30518bcb0991SDimitry Andric 
30528bcb0991SDimitry Andric   const MachineFunction &MF = B.getMF();
3053*e8d8bef9SDimitry Andric   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
3054*e8d8bef9SDimitry Andric                             MI.getFlag(MachineInstr::FmAfn);
30558bcb0991SDimitry Andric 
3056*e8d8bef9SDimitry Andric   if (!AllowInaccurateRcp)
30578bcb0991SDimitry Andric     return false;
30588bcb0991SDimitry Andric 
30598bcb0991SDimitry Andric   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
30608bcb0991SDimitry Andric     // 1 / x -> RCP(x)
30618bcb0991SDimitry Andric     if (CLHS->isExactlyValue(1.0)) {
30628bcb0991SDimitry Andric       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
30638bcb0991SDimitry Andric         .addUse(RHS)
30648bcb0991SDimitry Andric         .setMIFlags(Flags);
30658bcb0991SDimitry Andric 
30668bcb0991SDimitry Andric       MI.eraseFromParent();
30678bcb0991SDimitry Andric       return true;
30688bcb0991SDimitry Andric     }
30698bcb0991SDimitry Andric 
30708bcb0991SDimitry Andric     // -1 / x -> RCP( FNEG(x) )
30718bcb0991SDimitry Andric     if (CLHS->isExactlyValue(-1.0)) {
30728bcb0991SDimitry Andric       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
30738bcb0991SDimitry Andric       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
30748bcb0991SDimitry Andric         .addUse(FNeg.getReg(0))
30758bcb0991SDimitry Andric         .setMIFlags(Flags);
30768bcb0991SDimitry Andric 
30778bcb0991SDimitry Andric       MI.eraseFromParent();
30788bcb0991SDimitry Andric       return true;
30798bcb0991SDimitry Andric     }
30808bcb0991SDimitry Andric   }
30818bcb0991SDimitry Andric 
30828bcb0991SDimitry Andric   // x / y -> x * (1.0 / y)
30838bcb0991SDimitry Andric   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
30848bcb0991SDimitry Andric     .addUse(RHS)
30858bcb0991SDimitry Andric     .setMIFlags(Flags);
30868bcb0991SDimitry Andric   B.buildFMul(Res, LHS, RCP, Flags);
30878bcb0991SDimitry Andric 
30888bcb0991SDimitry Andric   MI.eraseFromParent();
30898bcb0991SDimitry Andric   return true;
30908bcb0991SDimitry Andric }
30918bcb0991SDimitry Andric 
// Expand 64-bit G_FDIV into an rcp estimate refined by FMA steps. Only fires
// when inaccurate results are permitted; returns false otherwise.
bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
                                                   MachineRegisterInfo &MRI,
                                                   MachineIRBuilder &B) const {
  Register Res = MI.getOperand(0).getReg();
  Register X = MI.getOperand(1).getReg();  // numerator
  Register Y = MI.getOperand(2).getReg();  // denominator
  uint16_t Flags = MI.getFlags();
  LLT ResTy = MRI.getType(Res);

  const MachineFunction &MF = B.getMF();
  // The inaccurate expansion is only allowed under global unsafe-math or the
  // per-instruction afn ("approximate functions") flag.
  bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
                            MI.getFlag(MachineInstr::FmAfn);

  if (!AllowInaccurateRcp)
    return false;

  // x / y ~= x * rcp(y), with the reciprocal estimate R refined by two
  // Newton-Raphson iterations of the form R' = R + R * (1 - Y*R), then one
  // residual correction applied to the quotient.
  auto NegY = B.buildFNeg(ResTy, Y);
  auto One = B.buildFConstant(ResTy, 1.0);

  auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
    .addUse(Y)
    .setMIFlags(Flags);

  // First refinement step: Tmp0 = 1 - Y*R; R = Tmp0*R + R.
  auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp0, R, R);

  // Second refinement step.
  auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp1, R, R);

  // Quotient estimate Ret = X*R, residual Tmp2 = X - Y*Ret, and the final
  // result Res = Tmp2*R + Ret.
  auto Ret = B.buildFMul(ResTy, X, R);
  auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);

  B.buildFMA(Res, Tmp2, R, Ret);
  MI.eraseFromParent();
  return true;
}
31288bcb0991SDimitry Andric 
3129480093f4SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
3130480093f4SDimitry Andric                                          MachineRegisterInfo &MRI,
3131480093f4SDimitry Andric                                          MachineIRBuilder &B) const {
3132*e8d8bef9SDimitry Andric   if (legalizeFastUnsafeFDIV(MI, MRI, B))
3133*e8d8bef9SDimitry Andric     return true;
3134*e8d8bef9SDimitry Andric 
3135480093f4SDimitry Andric   Register Res = MI.getOperand(0).getReg();
3136480093f4SDimitry Andric   Register LHS = MI.getOperand(1).getReg();
3137480093f4SDimitry Andric   Register RHS = MI.getOperand(2).getReg();
3138480093f4SDimitry Andric 
3139480093f4SDimitry Andric   uint16_t Flags = MI.getFlags();
3140480093f4SDimitry Andric 
3141480093f4SDimitry Andric   LLT S16 = LLT::scalar(16);
3142480093f4SDimitry Andric   LLT S32 = LLT::scalar(32);
3143480093f4SDimitry Andric 
3144480093f4SDimitry Andric   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
3145480093f4SDimitry Andric   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
3146480093f4SDimitry Andric 
3147480093f4SDimitry Andric   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
3148480093f4SDimitry Andric     .addUse(RHSExt.getReg(0))
3149480093f4SDimitry Andric     .setMIFlags(Flags);
3150480093f4SDimitry Andric 
3151480093f4SDimitry Andric   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
3152480093f4SDimitry Andric   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
3153480093f4SDimitry Andric 
3154480093f4SDimitry Andric   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
3155480093f4SDimitry Andric     .addUse(RDst.getReg(0))
3156480093f4SDimitry Andric     .addUse(RHS)
3157480093f4SDimitry Andric     .addUse(LHS)
3158480093f4SDimitry Andric     .setMIFlags(Flags);
3159480093f4SDimitry Andric 
3160480093f4SDimitry Andric   MI.eraseFromParent();
3161480093f4SDimitry Andric   return true;
3162480093f4SDimitry Andric }
3163480093f4SDimitry Andric 
// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
// to enable denorm mode. When 'Enable' is false, disable denorm mode.
static void toggleSPDenormMode(bool Enable,
                               MachineIRBuilder &B,
                               const GCNSubtarget &ST,
                               AMDGPU::SIModeRegisterDefaults Mode) {
  // Set SP denorm mode to this value.
  unsigned SPDenormMode =
    Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();

  if (ST.hasDenormModeInst()) {
    // Preserve default FP64FP16 denorm mode while updating FP32 mode.
    uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();

    // S_DENORM_MODE takes a single immediate: the FP32 mode occupies the low
    // two bits; the DP/half mode goes in the next two (hence the << 2).
    uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
    B.buildInstr(AMDGPU::S_DENORM_MODE)
      .addImm(NewDenormModeValue);

  } else {
    // Older targets lack S_DENORM_MODE, so write the MODE hardware register
    // via S_SETREG instead.
    // Select FP32 bit field in mode register: hwreg ID, bit offset 4, and
    // field width 2 (encoded as width-minus-1 = 1).
    unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
                                    (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
                                    (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);

    B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
      .addImm(SPDenormMode)
      .addImm(SPDenormModeBitField);
  }
}
3193480093f4SDimitry Andric 
bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  // Prefer the cheap rcp-only lowering when fast/unsafe math permits it.
  if (legalizeFastUnsafeFDIV(MI, MRI, B))
    return true;

  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();  // numerator
  Register RHS = MI.getOperand(2).getReg();  // denominator
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();

  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S32, 1.0f);

  // Pre-scale the operands with div_scale; the trailing immediate selects
  // which operand's scaled value this instance produces (0 = denominator,
  // 1 = numerator). Each also yields an s1 condition output.
  auto DenominatorScaled =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
      .addUse(LHS)
      .addUse(RHS)
      .addImm(0)
      .setMIFlags(Flags);
  auto NumeratorScaled =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
      .addUse(LHS)
      .addUse(RHS)
      .addImm(1)
      .setMIFlags(Flags);

  // Initial reciprocal estimate of the scaled denominator, refined by the
  // FMA chain below.
  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(DenominatorScaled.getReg(0))
    .setMIFlags(Flags);
  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);

  // Temporarily enable FP32 denormals around the refinement FMAs if the
  // function's default mode flushes them.
  // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
  // aren't modeled as reading it.
  if (!Mode.allFP32Denormals())
    toggleSPDenormMode(true, B, ST, Mode);

  // Newton-Raphson style refinement of the reciprocal and the quotient.
  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

  // Restore the function's default FP32 denorm mode.
  if (!Mode.allFP32Denormals())
    toggleSPDenormMode(false, B, ST, Mode);

  // div_fmas consumes the s1 condition output of the numerator div_scale to
  // undo the pre-scaling.
  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
    .addUse(Fma4.getReg(0))
    .addUse(Fma1.getReg(0))
    .addUse(Fma3.getReg(0))
    .addUse(NumeratorScaled.getReg(1))
    .setMIFlags(Flags);

  // Final fixup against the original operands for special-case inputs.
  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
    .addUse(Fmas.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}
3262480093f4SDimitry Andric 
bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  // Prefer the cheap rcp-based lowering when fast/unsafe math permits it.
  if (legalizeFastUnsafeFDIV64(MI, MRI, B))
    return true;

  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();  // numerator
  Register RHS = MI.getOperand(2).getReg();  // denominator

  uint16_t Flags = MI.getFlags();

  LLT S64 = LLT::scalar(64);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S64, 1.0);

  // Pre-scale with div_scale; the trailing immediate selects which operand's
  // scaled value this instance produces (0 = denominator, 1 = numerator).
  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
    .addUse(LHS)
    .addUse(RHS)
    .addImm(0)
    .setMIFlags(Flags);

  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);

  // Initial reciprocal estimate of the scaled denominator, refined by the
  // FMA chain below.
  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
    .addUse(DivScale0.getReg(0))
    .setMIFlags(Flags);

  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);

  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
    .addUse(LHS)
    .addUse(RHS)
    .addImm(1)
    .setMIFlags(Flags);

  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);

  Register Scale;
  if (!ST.hasUsableDivScaleConditionOutput()) {
    // Workaround a hardware bug on SI where the condition output from div_scale
    // is not usable.

    LLT S32 = LLT::scalar(32);

    // Reconstruct the condition from the high 32-bit halves: compare each
    // input's high half against the corresponding scaled value's high half,
    // and xor the results so the flag is set when exactly one compare holds.
    auto NumUnmerge = B.buildUnmerge(S32, LHS);
    auto DenUnmerge = B.buildUnmerge(S32, RHS);
    auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
    auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);

    auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
                              Scale1Unmerge.getReg(1));
    auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
                              Scale0Unmerge.getReg(1));
    Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
  } else {
    Scale = DivScale1.getReg(1);
  }

  // div_fmas consumes the condition computed above to undo the pre-scaling.
  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
    .addUse(Fma4.getReg(0))
    .addUse(Fma3.getReg(0))
    .addUse(Mul.getReg(0))
    .addUse(Scale)
    .setMIFlags(Flags);

  // Final fixup against the original operands for special-case inputs.
  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
    .addUse(Fmas.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}
3343480093f4SDimitry Andric 
33448bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
33458bcb0991SDimitry Andric                                                  MachineRegisterInfo &MRI,
33468bcb0991SDimitry Andric                                                  MachineIRBuilder &B) const {
33478bcb0991SDimitry Andric   Register Res = MI.getOperand(0).getReg();
33488bcb0991SDimitry Andric   Register LHS = MI.getOperand(2).getReg();
33498bcb0991SDimitry Andric   Register RHS = MI.getOperand(3).getReg();
33508bcb0991SDimitry Andric   uint16_t Flags = MI.getFlags();
33518bcb0991SDimitry Andric 
33528bcb0991SDimitry Andric   LLT S32 = LLT::scalar(32);
33538bcb0991SDimitry Andric   LLT S1 = LLT::scalar(1);
33548bcb0991SDimitry Andric 
33558bcb0991SDimitry Andric   auto Abs = B.buildFAbs(S32, RHS, Flags);
33568bcb0991SDimitry Andric   const APFloat C0Val(1.0f);
33578bcb0991SDimitry Andric 
33588bcb0991SDimitry Andric   auto C0 = B.buildConstant(S32, 0x6f800000);
33598bcb0991SDimitry Andric   auto C1 = B.buildConstant(S32, 0x2f800000);
33608bcb0991SDimitry Andric   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
33618bcb0991SDimitry Andric 
33628bcb0991SDimitry Andric   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
33638bcb0991SDimitry Andric   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
33648bcb0991SDimitry Andric 
33658bcb0991SDimitry Andric   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
33668bcb0991SDimitry Andric 
33678bcb0991SDimitry Andric   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
33688bcb0991SDimitry Andric     .addUse(Mul0.getReg(0))
33698bcb0991SDimitry Andric     .setMIFlags(Flags);
33708bcb0991SDimitry Andric 
33718bcb0991SDimitry Andric   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
33728bcb0991SDimitry Andric 
33738bcb0991SDimitry Andric   B.buildFMul(Res, Sel, Mul1, Flags);
33748bcb0991SDimitry Andric 
33758bcb0991SDimitry Andric   MI.eraseFromParent();
33768bcb0991SDimitry Andric   return true;
33778bcb0991SDimitry Andric }
33788bcb0991SDimitry Andric 
3379*e8d8bef9SDimitry Andric // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
3380*e8d8bef9SDimitry Andric // FIXME: Why do we handle this one but not other removed instructions?
3381*e8d8bef9SDimitry Andric //
3382*e8d8bef9SDimitry Andric // Reciprocal square root.  The clamp prevents infinite results, clamping
3383*e8d8bef9SDimitry Andric // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
3384*e8d8bef9SDimitry Andric // +-max_float.
3385*e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
3386*e8d8bef9SDimitry Andric                                                     MachineRegisterInfo &MRI,
3387*e8d8bef9SDimitry Andric                                                     MachineIRBuilder &B) const {
3388*e8d8bef9SDimitry Andric   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
3389*e8d8bef9SDimitry Andric     return true;
3390*e8d8bef9SDimitry Andric 
3391*e8d8bef9SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
3392*e8d8bef9SDimitry Andric   Register Src = MI.getOperand(2).getReg();
3393*e8d8bef9SDimitry Andric   auto Flags = MI.getFlags();
3394*e8d8bef9SDimitry Andric 
3395*e8d8bef9SDimitry Andric   LLT Ty = MRI.getType(Dst);
3396*e8d8bef9SDimitry Andric 
3397*e8d8bef9SDimitry Andric   const fltSemantics *FltSemantics;
3398*e8d8bef9SDimitry Andric   if (Ty == LLT::scalar(32))
3399*e8d8bef9SDimitry Andric     FltSemantics = &APFloat::IEEEsingle();
3400*e8d8bef9SDimitry Andric   else if (Ty == LLT::scalar(64))
3401*e8d8bef9SDimitry Andric     FltSemantics = &APFloat::IEEEdouble();
3402*e8d8bef9SDimitry Andric   else
3403*e8d8bef9SDimitry Andric     return false;
3404*e8d8bef9SDimitry Andric 
3405*e8d8bef9SDimitry Andric   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
3406*e8d8bef9SDimitry Andric     .addUse(Src)
3407*e8d8bef9SDimitry Andric     .setMIFlags(Flags);
3408*e8d8bef9SDimitry Andric 
3409*e8d8bef9SDimitry Andric   // We don't need to concern ourselves with the snan handling difference, since
3410*e8d8bef9SDimitry Andric   // the rsq quieted (or not) so use the one which will directly select.
3411*e8d8bef9SDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
3412*e8d8bef9SDimitry Andric   const bool UseIEEE = MFI->getMode().IEEE;
3413*e8d8bef9SDimitry Andric 
3414*e8d8bef9SDimitry Andric   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
3415*e8d8bef9SDimitry Andric   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
3416*e8d8bef9SDimitry Andric                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
3417*e8d8bef9SDimitry Andric 
3418*e8d8bef9SDimitry Andric   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
3419*e8d8bef9SDimitry Andric 
3420*e8d8bef9SDimitry Andric   if (UseIEEE)
3421*e8d8bef9SDimitry Andric     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
3422*e8d8bef9SDimitry Andric   else
3423*e8d8bef9SDimitry Andric     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
3424*e8d8bef9SDimitry Andric   MI.eraseFromParent();
3425*e8d8bef9SDimitry Andric   return true;
3426*e8d8bef9SDimitry Andric }
3427*e8d8bef9SDimitry Andric 
3428*e8d8bef9SDimitry Andric static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
3429*e8d8bef9SDimitry Andric   switch (IID) {
3430*e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_ds_fadd:
3431*e8d8bef9SDimitry Andric     return AMDGPU::G_ATOMICRMW_FADD;
3432*e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_ds_fmin:
3433*e8d8bef9SDimitry Andric     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
3434*e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_ds_fmax:
3435*e8d8bef9SDimitry Andric     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
3436*e8d8bef9SDimitry Andric   default:
3437*e8d8bef9SDimitry Andric     llvm_unreachable("not a DS FP intrinsic");
3438*e8d8bef9SDimitry Andric   }
3439*e8d8bef9SDimitry Andric }
3440*e8d8bef9SDimitry Andric 
// Rewrite a DS FP atomic intrinsic call in place into the corresponding
// generic atomic opcode, dropping operands that only existed to build the
// MemOperand.
bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
                                                      MachineInstr &MI,
                                                      Intrinsic::ID IID) const {
  GISelChangeObserver &Observer = Helper.Observer;
  Observer.changingInstr(MI);

  MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));

  // The remaining operands were used to set fields in the MemOperand on
  // construction.
  // Remove from the highest index downward so the lower indices stay valid
  // while we erase.
  for (int I = 6; I > 3; --I)
    MI.RemoveOperand(I);

  MI.RemoveOperand(1); // Remove the intrinsic ID.
  Observer.changedInstr(MI);
  return true;
}
3458*e8d8bef9SDimitry Andric 
3459*e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
3460*e8d8bef9SDimitry Andric                                             MachineRegisterInfo &MRI,
3461*e8d8bef9SDimitry Andric                                             MachineIRBuilder &B) const {
3462*e8d8bef9SDimitry Andric   uint64_t Offset =
3463*e8d8bef9SDimitry Andric     ST.getTargetLowering()->getImplicitParameterOffset(
3464*e8d8bef9SDimitry Andric       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
3465*e8d8bef9SDimitry Andric   LLT DstTy = MRI.getType(DstReg);
3466*e8d8bef9SDimitry Andric   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
3467*e8d8bef9SDimitry Andric 
3468*e8d8bef9SDimitry Andric   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
3469*e8d8bef9SDimitry Andric   if (!loadInputValue(KernargPtrReg, B,
3470*e8d8bef9SDimitry Andric                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
3471*e8d8bef9SDimitry Andric     return false;
3472*e8d8bef9SDimitry Andric 
3473*e8d8bef9SDimitry Andric   // FIXME: This should be nuw
3474*e8d8bef9SDimitry Andric   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
3475*e8d8bef9SDimitry Andric   return true;
3476*e8d8bef9SDimitry Andric }
3477*e8d8bef9SDimitry Andric 
34780b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
34790b57cec5SDimitry Andric                                                  MachineRegisterInfo &MRI,
34800b57cec5SDimitry Andric                                                  MachineIRBuilder &B) const {
34810b57cec5SDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
34820b57cec5SDimitry Andric   if (!MFI->isEntryFunction()) {
34830b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
34840b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
34850b57cec5SDimitry Andric   }
34860b57cec5SDimitry Andric 
34870b57cec5SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
3488*e8d8bef9SDimitry Andric   if (!getImplicitArgPtr(DstReg, MRI, B))
34890b57cec5SDimitry Andric     return false;
34900b57cec5SDimitry Andric 
34910b57cec5SDimitry Andric   MI.eraseFromParent();
34920b57cec5SDimitry Andric   return true;
34930b57cec5SDimitry Andric }
34940b57cec5SDimitry Andric 
34958bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
34968bcb0991SDimitry Andric                                               MachineRegisterInfo &MRI,
34978bcb0991SDimitry Andric                                               MachineIRBuilder &B,
34988bcb0991SDimitry Andric                                               unsigned AddrSpace) const {
34998bcb0991SDimitry Andric   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
3500*e8d8bef9SDimitry Andric   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
3501*e8d8bef9SDimitry Andric   Register Hi32 = Unmerge.getReg(1);
3502*e8d8bef9SDimitry Andric 
35038bcb0991SDimitry Andric   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
35048bcb0991SDimitry Andric   MI.eraseFromParent();
35058bcb0991SDimitry Andric   return true;
35068bcb0991SDimitry Andric }
35078bcb0991SDimitry Andric 
// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
// offset (the offset that is included in bounds checking and swizzling, to be
// split between the instruction's voffset and immoffset fields) and soffset
// (the offset that is excluded from bounds checking and swizzling, to go in
// the instruction's soffset field).  This function takes the first kind of
// offset and figures out how to split it between voffset and immoffset.
//
// Returns {voffset base register, immoffset value, total constant offset
// peeled from OrigOffset}.
std::tuple<Register, unsigned, unsigned>
AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
                                        Register OrigOffset) const {
  // Largest value the 12-bit immoffset field can hold.
  const unsigned MaxImm = 4095;
  Register BaseReg;
  unsigned TotalConstOffset;
  const LLT S32 = LLT::scalar(32);

  // Peel any constant additions off the offset so they can be folded into
  // the immediate field.
  std::tie(BaseReg, TotalConstOffset) =
      AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);

  unsigned ImmOffset = TotalConstOffset;

  // If the immediate value is too big for the immoffset field, put the value
  // and -4096 into the immoffset field so that the value that is copied/added
  // for the voffset field is a multiple of 4096, and it stands more chance
  // of being CSEd with the copy/add for another similar load/store.
  // However, do not do that rounding down to a multiple of 4096 if that is a
  // negative number, as it appears to be illegal to have a negative offset
  // in the vgpr, even if adding the immediate offset makes it positive.
  unsigned Overflow = ImmOffset & ~MaxImm;  // bits above the 12-bit field
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    // Rounding down produced a negative multiple of 4096; fold everything
    // back into the register part instead.
    Overflow += ImmOffset;
    ImmOffset = 0;
  }

  // Materialize/add whatever could not fit in the immediate field.
  if (Overflow != 0) {
    if (!BaseReg) {
      BaseReg = B.buildConstant(S32, Overflow).getReg(0);
    } else {
      auto OverflowVal = B.buildConstant(S32, Overflow);
      BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
    }
  }

  // The caller always needs a voffset register; use zero when there is no
  // variable component.
  if (!BaseReg)
    BaseReg = B.buildConstant(S32, 0).getReg(0);

  return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
}
35555ffd83dbSDimitry Andric 
35568bcb0991SDimitry Andric /// Handle register layout difference for f16 images for some subtargets.
35578bcb0991SDimitry Andric Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
35588bcb0991SDimitry Andric                                              MachineRegisterInfo &MRI,
3559*e8d8bef9SDimitry Andric                                              Register Reg,
3560*e8d8bef9SDimitry Andric                                              bool ImageStore) const {
35618bcb0991SDimitry Andric   const LLT S16 = LLT::scalar(16);
35628bcb0991SDimitry Andric   const LLT S32 = LLT::scalar(32);
35638bcb0991SDimitry Andric   LLT StoreVT = MRI.getType(Reg);
35648bcb0991SDimitry Andric   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
35658bcb0991SDimitry Andric 
3566*e8d8bef9SDimitry Andric   if (ST.hasUnpackedD16VMem()) {
35678bcb0991SDimitry Andric     auto Unmerge = B.buildUnmerge(S16, Reg);
35688bcb0991SDimitry Andric 
35698bcb0991SDimitry Andric     SmallVector<Register, 4> WideRegs;
35708bcb0991SDimitry Andric     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
35718bcb0991SDimitry Andric       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
35728bcb0991SDimitry Andric 
35738bcb0991SDimitry Andric     int NumElts = StoreVT.getNumElements();
35748bcb0991SDimitry Andric 
35758bcb0991SDimitry Andric     return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
35768bcb0991SDimitry Andric   }
35778bcb0991SDimitry Andric 
3578*e8d8bef9SDimitry Andric   if (ImageStore && ST.hasImageStoreD16Bug()) {
3579*e8d8bef9SDimitry Andric     if (StoreVT.getNumElements() == 2) {
3580*e8d8bef9SDimitry Andric       SmallVector<Register, 4> PackedRegs;
3581*e8d8bef9SDimitry Andric       Reg = B.buildBitcast(S32, Reg).getReg(0);
3582*e8d8bef9SDimitry Andric       PackedRegs.push_back(Reg);
3583*e8d8bef9SDimitry Andric       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
3584*e8d8bef9SDimitry Andric       return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0);
3585*e8d8bef9SDimitry Andric     }
3586*e8d8bef9SDimitry Andric 
3587*e8d8bef9SDimitry Andric     if (StoreVT.getNumElements() == 3) {
3588*e8d8bef9SDimitry Andric       SmallVector<Register, 4> PackedRegs;
3589*e8d8bef9SDimitry Andric       auto Unmerge = B.buildUnmerge(S16, Reg);
3590*e8d8bef9SDimitry Andric       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3591*e8d8bef9SDimitry Andric         PackedRegs.push_back(Unmerge.getReg(I));
3592*e8d8bef9SDimitry Andric       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
3593*e8d8bef9SDimitry Andric       Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0);
3594*e8d8bef9SDimitry Andric       return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0);
3595*e8d8bef9SDimitry Andric     }
3596*e8d8bef9SDimitry Andric 
3597*e8d8bef9SDimitry Andric     if (StoreVT.getNumElements() == 4) {
3598*e8d8bef9SDimitry Andric       SmallVector<Register, 4> PackedRegs;
3599*e8d8bef9SDimitry Andric       Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0);
3600*e8d8bef9SDimitry Andric       auto Unmerge = B.buildUnmerge(S32, Reg);
3601*e8d8bef9SDimitry Andric       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
3602*e8d8bef9SDimitry Andric         PackedRegs.push_back(Unmerge.getReg(I));
3603*e8d8bef9SDimitry Andric       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
3604*e8d8bef9SDimitry Andric       return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0);
3605*e8d8bef9SDimitry Andric     }
3606*e8d8bef9SDimitry Andric 
3607*e8d8bef9SDimitry Andric     llvm_unreachable("invalid data type");
3608*e8d8bef9SDimitry Andric   }
3609*e8d8bef9SDimitry Andric 
3610*e8d8bef9SDimitry Andric   return Reg;
3611*e8d8bef9SDimitry Andric }
3612*e8d8bef9SDimitry Andric 
36135ffd83dbSDimitry Andric Register AMDGPULegalizerInfo::fixStoreSourceType(
36145ffd83dbSDimitry Andric   MachineIRBuilder &B, Register VData, bool IsFormat) const {
36155ffd83dbSDimitry Andric   MachineRegisterInfo *MRI = B.getMRI();
36165ffd83dbSDimitry Andric   LLT Ty = MRI->getType(VData);
36178bcb0991SDimitry Andric 
36188bcb0991SDimitry Andric   const LLT S16 = LLT::scalar(16);
36198bcb0991SDimitry Andric 
36208bcb0991SDimitry Andric   // Fixup illegal register types for i8 stores.
36218bcb0991SDimitry Andric   if (Ty == LLT::scalar(8) || Ty == S16) {
36228bcb0991SDimitry Andric     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
36235ffd83dbSDimitry Andric     return AnyExt;
36248bcb0991SDimitry Andric   }
36258bcb0991SDimitry Andric 
36268bcb0991SDimitry Andric   if (Ty.isVector()) {
36278bcb0991SDimitry Andric     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
36288bcb0991SDimitry Andric       if (IsFormat)
36295ffd83dbSDimitry Andric         return handleD16VData(B, *MRI, VData);
36305ffd83dbSDimitry Andric     }
36315ffd83dbSDimitry Andric   }
36325ffd83dbSDimitry Andric 
36335ffd83dbSDimitry Andric   return VData;
36345ffd83dbSDimitry Andric }
36355ffd83dbSDimitry Andric 
36365ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
36375ffd83dbSDimitry Andric                                               MachineRegisterInfo &MRI,
36385ffd83dbSDimitry Andric                                               MachineIRBuilder &B,
36395ffd83dbSDimitry Andric                                               bool IsTyped,
36405ffd83dbSDimitry Andric                                               bool IsFormat) const {
36415ffd83dbSDimitry Andric   Register VData = MI.getOperand(1).getReg();
36425ffd83dbSDimitry Andric   LLT Ty = MRI.getType(VData);
36435ffd83dbSDimitry Andric   LLT EltTy = Ty.getScalarType();
36445ffd83dbSDimitry Andric   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
36455ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
36465ffd83dbSDimitry Andric 
36475ffd83dbSDimitry Andric   VData = fixStoreSourceType(B, VData, IsFormat);
36485ffd83dbSDimitry Andric   Register RSrc = MI.getOperand(2).getReg();
36495ffd83dbSDimitry Andric 
36505ffd83dbSDimitry Andric   MachineMemOperand *MMO = *MI.memoperands_begin();
36515ffd83dbSDimitry Andric   const int MemSize = MMO->getSize();
36525ffd83dbSDimitry Andric 
36535ffd83dbSDimitry Andric   unsigned ImmOffset;
36545ffd83dbSDimitry Andric   unsigned TotalOffset;
36555ffd83dbSDimitry Andric 
36565ffd83dbSDimitry Andric   // The typed intrinsics add an immediate after the registers.
36575ffd83dbSDimitry Andric   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
36585ffd83dbSDimitry Andric 
36595ffd83dbSDimitry Andric   // The struct intrinsic variants add one additional operand over raw.
36605ffd83dbSDimitry Andric   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
36615ffd83dbSDimitry Andric   Register VIndex;
36625ffd83dbSDimitry Andric   int OpOffset = 0;
36635ffd83dbSDimitry Andric   if (HasVIndex) {
36645ffd83dbSDimitry Andric     VIndex = MI.getOperand(3).getReg();
36655ffd83dbSDimitry Andric     OpOffset = 1;
36665ffd83dbSDimitry Andric   }
36675ffd83dbSDimitry Andric 
36685ffd83dbSDimitry Andric   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
36695ffd83dbSDimitry Andric   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
36705ffd83dbSDimitry Andric 
36715ffd83dbSDimitry Andric   unsigned Format = 0;
36725ffd83dbSDimitry Andric   if (IsTyped) {
36735ffd83dbSDimitry Andric     Format = MI.getOperand(5 + OpOffset).getImm();
36745ffd83dbSDimitry Andric     ++OpOffset;
36755ffd83dbSDimitry Andric   }
36765ffd83dbSDimitry Andric 
36775ffd83dbSDimitry Andric   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
36785ffd83dbSDimitry Andric 
36795ffd83dbSDimitry Andric   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
36805ffd83dbSDimitry Andric   if (TotalOffset != 0)
36815ffd83dbSDimitry Andric     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
36825ffd83dbSDimitry Andric 
36835ffd83dbSDimitry Andric   unsigned Opc;
36845ffd83dbSDimitry Andric   if (IsTyped) {
36855ffd83dbSDimitry Andric     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
36865ffd83dbSDimitry Andric                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
36875ffd83dbSDimitry Andric   } else if (IsFormat) {
36885ffd83dbSDimitry Andric     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
36895ffd83dbSDimitry Andric                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
36905ffd83dbSDimitry Andric   } else {
36915ffd83dbSDimitry Andric     switch (MemSize) {
36925ffd83dbSDimitry Andric     case 1:
36935ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
36945ffd83dbSDimitry Andric       break;
36955ffd83dbSDimitry Andric     case 2:
36965ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
36975ffd83dbSDimitry Andric       break;
36985ffd83dbSDimitry Andric     default:
36995ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
37005ffd83dbSDimitry Andric       break;
37015ffd83dbSDimitry Andric     }
37025ffd83dbSDimitry Andric   }
37035ffd83dbSDimitry Andric 
37045ffd83dbSDimitry Andric   if (!VIndex)
37055ffd83dbSDimitry Andric     VIndex = B.buildConstant(S32, 0).getReg(0);
37065ffd83dbSDimitry Andric 
37075ffd83dbSDimitry Andric   auto MIB = B.buildInstr(Opc)
37085ffd83dbSDimitry Andric     .addUse(VData)              // vdata
37095ffd83dbSDimitry Andric     .addUse(RSrc)               // rsrc
37105ffd83dbSDimitry Andric     .addUse(VIndex)             // vindex
37115ffd83dbSDimitry Andric     .addUse(VOffset)            // voffset
37125ffd83dbSDimitry Andric     .addUse(SOffset)            // soffset
37135ffd83dbSDimitry Andric     .addImm(ImmOffset);         // offset(imm)
37145ffd83dbSDimitry Andric 
37155ffd83dbSDimitry Andric   if (IsTyped)
37165ffd83dbSDimitry Andric     MIB.addImm(Format);
37175ffd83dbSDimitry Andric 
37185ffd83dbSDimitry Andric   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
37195ffd83dbSDimitry Andric      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
37205ffd83dbSDimitry Andric      .addMemOperand(MMO);
37215ffd83dbSDimitry Andric 
37225ffd83dbSDimitry Andric   MI.eraseFromParent();
37238bcb0991SDimitry Andric   return true;
37248bcb0991SDimitry Andric }
37258bcb0991SDimitry Andric 
37265ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
37275ffd83dbSDimitry Andric                                              MachineRegisterInfo &MRI,
37285ffd83dbSDimitry Andric                                              MachineIRBuilder &B,
37295ffd83dbSDimitry Andric                                              bool IsFormat,
37305ffd83dbSDimitry Andric                                              bool IsTyped) const {
37315ffd83dbSDimitry Andric   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
37325ffd83dbSDimitry Andric   MachineMemOperand *MMO = *MI.memoperands_begin();
37335ffd83dbSDimitry Andric   const int MemSize = MMO->getSize();
37345ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
37355ffd83dbSDimitry Andric 
37365ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
37375ffd83dbSDimitry Andric   Register RSrc = MI.getOperand(2).getReg();
37385ffd83dbSDimitry Andric 
37395ffd83dbSDimitry Andric   // The typed intrinsics add an immediate after the registers.
37405ffd83dbSDimitry Andric   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
37415ffd83dbSDimitry Andric 
37425ffd83dbSDimitry Andric   // The struct intrinsic variants add one additional operand over raw.
37435ffd83dbSDimitry Andric   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
37445ffd83dbSDimitry Andric   Register VIndex;
37455ffd83dbSDimitry Andric   int OpOffset = 0;
37465ffd83dbSDimitry Andric   if (HasVIndex) {
37475ffd83dbSDimitry Andric     VIndex = MI.getOperand(3).getReg();
37485ffd83dbSDimitry Andric     OpOffset = 1;
37498bcb0991SDimitry Andric   }
37508bcb0991SDimitry Andric 
37515ffd83dbSDimitry Andric   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
37525ffd83dbSDimitry Andric   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
37535ffd83dbSDimitry Andric 
37545ffd83dbSDimitry Andric   unsigned Format = 0;
37555ffd83dbSDimitry Andric   if (IsTyped) {
37565ffd83dbSDimitry Andric     Format = MI.getOperand(5 + OpOffset).getImm();
37575ffd83dbSDimitry Andric     ++OpOffset;
37588bcb0991SDimitry Andric   }
37598bcb0991SDimitry Andric 
37605ffd83dbSDimitry Andric   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
37615ffd83dbSDimitry Andric   unsigned ImmOffset;
37625ffd83dbSDimitry Andric   unsigned TotalOffset;
37635ffd83dbSDimitry Andric 
37645ffd83dbSDimitry Andric   LLT Ty = MRI.getType(Dst);
37655ffd83dbSDimitry Andric   LLT EltTy = Ty.getScalarType();
37665ffd83dbSDimitry Andric   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
37675ffd83dbSDimitry Andric   const bool Unpacked = ST.hasUnpackedD16VMem();
37685ffd83dbSDimitry Andric 
37695ffd83dbSDimitry Andric   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
37705ffd83dbSDimitry Andric   if (TotalOffset != 0)
37715ffd83dbSDimitry Andric     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
37725ffd83dbSDimitry Andric 
37735ffd83dbSDimitry Andric   unsigned Opc;
37745ffd83dbSDimitry Andric 
37755ffd83dbSDimitry Andric   if (IsTyped) {
37765ffd83dbSDimitry Andric     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
37775ffd83dbSDimitry Andric                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
37785ffd83dbSDimitry Andric   } else if (IsFormat) {
37795ffd83dbSDimitry Andric     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
37805ffd83dbSDimitry Andric                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
37815ffd83dbSDimitry Andric   } else {
37825ffd83dbSDimitry Andric     switch (MemSize) {
37835ffd83dbSDimitry Andric     case 1:
37845ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
37855ffd83dbSDimitry Andric       break;
37865ffd83dbSDimitry Andric     case 2:
37875ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
37885ffd83dbSDimitry Andric       break;
37895ffd83dbSDimitry Andric     default:
37905ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
37915ffd83dbSDimitry Andric       break;
37925ffd83dbSDimitry Andric     }
37935ffd83dbSDimitry Andric   }
37945ffd83dbSDimitry Andric 
37955ffd83dbSDimitry Andric   Register LoadDstReg;
37965ffd83dbSDimitry Andric 
37975ffd83dbSDimitry Andric   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
37985ffd83dbSDimitry Andric   LLT UnpackedTy = Ty.changeElementSize(32);
37995ffd83dbSDimitry Andric 
38005ffd83dbSDimitry Andric   if (IsExtLoad)
38015ffd83dbSDimitry Andric     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
38025ffd83dbSDimitry Andric   else if (Unpacked && IsD16 && Ty.isVector())
38035ffd83dbSDimitry Andric     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
38045ffd83dbSDimitry Andric   else
38055ffd83dbSDimitry Andric     LoadDstReg = Dst;
38065ffd83dbSDimitry Andric 
38075ffd83dbSDimitry Andric   if (!VIndex)
38085ffd83dbSDimitry Andric     VIndex = B.buildConstant(S32, 0).getReg(0);
38095ffd83dbSDimitry Andric 
38105ffd83dbSDimitry Andric   auto MIB = B.buildInstr(Opc)
38115ffd83dbSDimitry Andric     .addDef(LoadDstReg)         // vdata
38125ffd83dbSDimitry Andric     .addUse(RSrc)               // rsrc
38135ffd83dbSDimitry Andric     .addUse(VIndex)             // vindex
38145ffd83dbSDimitry Andric     .addUse(VOffset)            // voffset
38155ffd83dbSDimitry Andric     .addUse(SOffset)            // soffset
38165ffd83dbSDimitry Andric     .addImm(ImmOffset);         // offset(imm)
38175ffd83dbSDimitry Andric 
38185ffd83dbSDimitry Andric   if (IsTyped)
38195ffd83dbSDimitry Andric     MIB.addImm(Format);
38205ffd83dbSDimitry Andric 
38215ffd83dbSDimitry Andric   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
38225ffd83dbSDimitry Andric      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
38235ffd83dbSDimitry Andric      .addMemOperand(MMO);
38245ffd83dbSDimitry Andric 
38255ffd83dbSDimitry Andric   if (LoadDstReg != Dst) {
38265ffd83dbSDimitry Andric     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
38275ffd83dbSDimitry Andric 
38285ffd83dbSDimitry Andric     // Widen result for extending loads was widened.
38295ffd83dbSDimitry Andric     if (IsExtLoad)
38305ffd83dbSDimitry Andric       B.buildTrunc(Dst, LoadDstReg);
38315ffd83dbSDimitry Andric     else {
38325ffd83dbSDimitry Andric       // Repack to original 16-bit vector result
38335ffd83dbSDimitry Andric       // FIXME: G_TRUNC should work, but legalization currently fails
38345ffd83dbSDimitry Andric       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
38355ffd83dbSDimitry Andric       SmallVector<Register, 4> Repack;
38365ffd83dbSDimitry Andric       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
38375ffd83dbSDimitry Andric         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
38385ffd83dbSDimitry Andric       B.buildMerge(Dst, Repack);
38395ffd83dbSDimitry Andric     }
38405ffd83dbSDimitry Andric   }
38415ffd83dbSDimitry Andric 
38425ffd83dbSDimitry Andric   MI.eraseFromParent();
38435ffd83dbSDimitry Andric   return true;
38445ffd83dbSDimitry Andric }
38455ffd83dbSDimitry Andric 
38465ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
38475ffd83dbSDimitry Andric                                                MachineIRBuilder &B,
38485ffd83dbSDimitry Andric                                                bool IsInc) const {
38495ffd83dbSDimitry Andric   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
38505ffd83dbSDimitry Andric                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
38515ffd83dbSDimitry Andric   B.buildInstr(Opc)
38525ffd83dbSDimitry Andric     .addDef(MI.getOperand(0).getReg())
38535ffd83dbSDimitry Andric     .addUse(MI.getOperand(2).getReg())
38545ffd83dbSDimitry Andric     .addUse(MI.getOperand(3).getReg())
38555ffd83dbSDimitry Andric     .cloneMemRefs(MI);
38565ffd83dbSDimitry Andric   MI.eraseFromParent();
38575ffd83dbSDimitry Andric   return true;
38585ffd83dbSDimitry Andric }
38595ffd83dbSDimitry Andric 
38605ffd83dbSDimitry Andric static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
38615ffd83dbSDimitry Andric   switch (IntrID) {
38625ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
38635ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
38645ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
38655ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_add:
38665ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_add:
38675ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
38685ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
38695ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
38705ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
38715ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
38725ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
38735ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
38745ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
38755ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
38765ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
38775ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
38785ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
38795ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
38805ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
38815ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
38825ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
38835ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_and:
38845ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_and:
38855ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
38865ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_or:
38875ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_or:
38885ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
38895ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
38905ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
38915ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
38925ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
38935ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
38945ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
38955ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
38965ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
38975ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
38985ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
38995ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
39005ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3901*e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
3902*e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
3903*e8d8bef9SDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
39045ffd83dbSDimitry Andric   default:
39055ffd83dbSDimitry Andric     llvm_unreachable("unhandled atomic opcode");
39065ffd83dbSDimitry Andric   }
39075ffd83dbSDimitry Andric }
39085ffd83dbSDimitry Andric 
39095ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
39105ffd83dbSDimitry Andric                                                MachineIRBuilder &B,
39115ffd83dbSDimitry Andric                                                Intrinsic::ID IID) const {
39125ffd83dbSDimitry Andric   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
39135ffd83dbSDimitry Andric                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3914*e8d8bef9SDimitry Andric   const bool HasReturn = MI.getNumExplicitDefs() != 0;
39155ffd83dbSDimitry Andric 
3916*e8d8bef9SDimitry Andric   Register Dst;
39175ffd83dbSDimitry Andric 
39185ffd83dbSDimitry Andric   int OpOffset = 0;
3919*e8d8bef9SDimitry Andric   if (HasReturn) {
3920*e8d8bef9SDimitry Andric     // A few FP atomics do not support return values.
3921*e8d8bef9SDimitry Andric     Dst = MI.getOperand(0).getReg();
3922*e8d8bef9SDimitry Andric   } else {
3923*e8d8bef9SDimitry Andric     OpOffset = -1;
3924*e8d8bef9SDimitry Andric   }
3925*e8d8bef9SDimitry Andric 
3926*e8d8bef9SDimitry Andric   Register VData = MI.getOperand(2 + OpOffset).getReg();
3927*e8d8bef9SDimitry Andric   Register CmpVal;
39285ffd83dbSDimitry Andric 
39295ffd83dbSDimitry Andric   if (IsCmpSwap) {
39305ffd83dbSDimitry Andric     CmpVal = MI.getOperand(3 + OpOffset).getReg();
39315ffd83dbSDimitry Andric     ++OpOffset;
39325ffd83dbSDimitry Andric   }
39335ffd83dbSDimitry Andric 
39345ffd83dbSDimitry Andric   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3935*e8d8bef9SDimitry Andric   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
39365ffd83dbSDimitry Andric 
39375ffd83dbSDimitry Andric   // The struct intrinsic variants add one additional operand over raw.
39385ffd83dbSDimitry Andric   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
39395ffd83dbSDimitry Andric   Register VIndex;
39405ffd83dbSDimitry Andric   if (HasVIndex) {
39415ffd83dbSDimitry Andric     VIndex = MI.getOperand(4 + OpOffset).getReg();
39425ffd83dbSDimitry Andric     ++OpOffset;
39435ffd83dbSDimitry Andric   }
39445ffd83dbSDimitry Andric 
39455ffd83dbSDimitry Andric   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
39465ffd83dbSDimitry Andric   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
39475ffd83dbSDimitry Andric   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
39485ffd83dbSDimitry Andric 
39495ffd83dbSDimitry Andric   MachineMemOperand *MMO = *MI.memoperands_begin();
39505ffd83dbSDimitry Andric 
39515ffd83dbSDimitry Andric   unsigned ImmOffset;
39525ffd83dbSDimitry Andric   unsigned TotalOffset;
39535ffd83dbSDimitry Andric   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
39545ffd83dbSDimitry Andric   if (TotalOffset != 0)
39555ffd83dbSDimitry Andric     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
39565ffd83dbSDimitry Andric 
39575ffd83dbSDimitry Andric   if (!VIndex)
39585ffd83dbSDimitry Andric     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
39595ffd83dbSDimitry Andric 
3960*e8d8bef9SDimitry Andric   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
3961*e8d8bef9SDimitry Andric 
3962*e8d8bef9SDimitry Andric   if (HasReturn)
3963*e8d8bef9SDimitry Andric     MIB.addDef(Dst);
3964*e8d8bef9SDimitry Andric 
3965*e8d8bef9SDimitry Andric   MIB.addUse(VData); // vdata
39665ffd83dbSDimitry Andric 
39675ffd83dbSDimitry Andric   if (IsCmpSwap)
39685ffd83dbSDimitry Andric     MIB.addReg(CmpVal);
39695ffd83dbSDimitry Andric 
39705ffd83dbSDimitry Andric   MIB.addUse(RSrc)               // rsrc
39715ffd83dbSDimitry Andric      .addUse(VIndex)             // vindex
39725ffd83dbSDimitry Andric      .addUse(VOffset)            // voffset
39735ffd83dbSDimitry Andric      .addUse(SOffset)            // soffset
39745ffd83dbSDimitry Andric      .addImm(ImmOffset)          // offset(imm)
39755ffd83dbSDimitry Andric      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
39765ffd83dbSDimitry Andric      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
39775ffd83dbSDimitry Andric      .addMemOperand(MMO);
39785ffd83dbSDimitry Andric 
39795ffd83dbSDimitry Andric   MI.eraseFromParent();
39805ffd83dbSDimitry Andric   return true;
39815ffd83dbSDimitry Andric }
39825ffd83dbSDimitry Andric 
39835ffd83dbSDimitry Andric /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized
39845ffd83dbSDimitry Andric /// vector with s16 typed elements.
3985*e8d8bef9SDimitry Andric static void packImageA16AddressToDwords(
3986*e8d8bef9SDimitry Andric     MachineIRBuilder &B, MachineInstr &MI,
3987*e8d8bef9SDimitry Andric     SmallVectorImpl<Register> &PackedAddrs, unsigned ArgOffset,
3988*e8d8bef9SDimitry Andric     const AMDGPU::ImageDimIntrinsicInfo *Intr, unsigned EndIdx) {
39895ffd83dbSDimitry Andric   const LLT S16 = LLT::scalar(16);
39905ffd83dbSDimitry Andric   const LLT V2S16 = LLT::vector(2, 16);
39915ffd83dbSDimitry Andric 
3992*e8d8bef9SDimitry Andric   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
3993*e8d8bef9SDimitry Andric     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
39945ffd83dbSDimitry Andric     if (!SrcOp.isReg())
39955ffd83dbSDimitry Andric       continue; // _L to _LZ may have eliminated this.
39965ffd83dbSDimitry Andric 
39975ffd83dbSDimitry Andric     Register AddrReg = SrcOp.getReg();
39985ffd83dbSDimitry Andric 
3999*e8d8bef9SDimitry Andric     if (I < Intr->GradientStart) {
40005ffd83dbSDimitry Andric       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
40015ffd83dbSDimitry Andric       PackedAddrs.push_back(AddrReg);
40025ffd83dbSDimitry Andric     } else {
40035ffd83dbSDimitry Andric       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
40045ffd83dbSDimitry Andric       // derivatives dx/dh and dx/dv are packed with undef.
40055ffd83dbSDimitry Andric       if (((I + 1) >= EndIdx) ||
4006*e8d8bef9SDimitry Andric           ((Intr->NumGradients / 2) % 2 == 1 &&
4007*e8d8bef9SDimitry Andric            (I == static_cast<unsigned>(Intr->GradientStart +
4008*e8d8bef9SDimitry Andric                                        (Intr->NumGradients / 2) - 1) ||
4009*e8d8bef9SDimitry Andric             I == static_cast<unsigned>(Intr->GradientStart +
4010*e8d8bef9SDimitry Andric                                        Intr->NumGradients - 1))) ||
40115ffd83dbSDimitry Andric           // Check for _L to _LZ optimization
4012*e8d8bef9SDimitry Andric           !MI.getOperand(ArgOffset + I + 1).isReg()) {
40135ffd83dbSDimitry Andric         PackedAddrs.push_back(
40145ffd83dbSDimitry Andric             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
40155ffd83dbSDimitry Andric                 .getReg(0));
40165ffd83dbSDimitry Andric       } else {
40175ffd83dbSDimitry Andric         PackedAddrs.push_back(
4018*e8d8bef9SDimitry Andric             B.buildBuildVector(
4019*e8d8bef9SDimitry Andric                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
40205ffd83dbSDimitry Andric                 .getReg(0));
40215ffd83dbSDimitry Andric         ++I;
40225ffd83dbSDimitry Andric       }
40235ffd83dbSDimitry Andric     }
40245ffd83dbSDimitry Andric   }
40255ffd83dbSDimitry Andric }
40265ffd83dbSDimitry Andric 
40275ffd83dbSDimitry Andric /// Convert from separate vaddr components to a single vector address register,
40285ffd83dbSDimitry Andric /// and replace the remaining operands with $noreg.
40295ffd83dbSDimitry Andric static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
40305ffd83dbSDimitry Andric                                      int DimIdx, int NumVAddrs) {
40315ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
40325ffd83dbSDimitry Andric 
40335ffd83dbSDimitry Andric   SmallVector<Register, 8> AddrRegs;
40345ffd83dbSDimitry Andric   for (int I = 0; I != NumVAddrs; ++I) {
40355ffd83dbSDimitry Andric     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
40365ffd83dbSDimitry Andric     if (SrcOp.isReg()) {
40375ffd83dbSDimitry Andric       AddrRegs.push_back(SrcOp.getReg());
40385ffd83dbSDimitry Andric       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
40395ffd83dbSDimitry Andric     }
40405ffd83dbSDimitry Andric   }
40415ffd83dbSDimitry Andric 
40425ffd83dbSDimitry Andric   int NumAddrRegs = AddrRegs.size();
40435ffd83dbSDimitry Andric   if (NumAddrRegs != 1) {
40445ffd83dbSDimitry Andric     // Round up to 8 elements for v5-v7
40455ffd83dbSDimitry Andric     // FIXME: Missing intermediate sized register classes and instructions.
40465ffd83dbSDimitry Andric     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
40475ffd83dbSDimitry Andric       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
40485ffd83dbSDimitry Andric       auto Undef = B.buildUndef(S32);
40495ffd83dbSDimitry Andric       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
40505ffd83dbSDimitry Andric       NumAddrRegs = RoundedNumRegs;
40515ffd83dbSDimitry Andric     }
40525ffd83dbSDimitry Andric 
40535ffd83dbSDimitry Andric     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
40545ffd83dbSDimitry Andric     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
40555ffd83dbSDimitry Andric   }
40565ffd83dbSDimitry Andric 
40575ffd83dbSDimitry Andric   for (int I = 1; I != NumVAddrs; ++I) {
40585ffd83dbSDimitry Andric     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
40595ffd83dbSDimitry Andric     if (SrcOp.isReg())
40605ffd83dbSDimitry Andric       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
40615ffd83dbSDimitry Andric   }
40625ffd83dbSDimitry Andric }
40635ffd83dbSDimitry Andric 
40645ffd83dbSDimitry Andric /// Rewrite image intrinsics to use register layouts expected by the subtarget.
40655ffd83dbSDimitry Andric ///
40665ffd83dbSDimitry Andric /// Depending on the subtarget, load/store with 16-bit element data need to be
40675ffd83dbSDimitry Andric /// rewritten to use the low half of 32-bit registers, or directly use a packed
40685ffd83dbSDimitry Andric /// layout. 16-bit addresses should also sometimes be packed into 32-bit
40695ffd83dbSDimitry Andric /// registers.
40705ffd83dbSDimitry Andric ///
40715ffd83dbSDimitry Andric /// We don't want to directly select image instructions just yet, but also want
40725ffd83dbSDimitry Andric /// to expose all register repacking to the legalizer/combiners. We also don't
40735ffd83dbSDimitry Andric /// want a selected instruction entering RegBankSelect. In order to avoid
40745ffd83dbSDimitry Andric /// defining a multitude of intermediate image instructions, directly hack on
40755ffd83dbSDimitry Andric /// the intrinsic's arguments. In cases like a16 addresses, this requires padding
40765ffd83dbSDimitry Andric /// now unnecessary arguments with $noreg.
bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
    MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
    const AMDGPU::ImageDimIntrinsicInfo *Intr) const {

  // A TFE load carries a second explicit def for the error/status dword.
  const unsigned NumDefs = MI.getNumExplicitDefs();
  // The intrinsic ID operand sits at index NumDefs; real arguments follow it.
  const unsigned ArgOffset = NumDefs + 1;
  bool IsTFE = NumDefs == 2;
  // We are only processing the operands of d16 image operations on subtargets
  // that use the unpacked register layout, or need to repack the TFE result.

  // TODO: Do we need to guard against already legalized intrinsics?
  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
      AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);

  MachineRegisterInfo *MRI = B.getMRI();
  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::vector(2, 16);

  unsigned DMask = 0;

  // Check for 16 bit addresses and pack if true.
  LLT GradTy =
      MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
  LLT AddrTy =
      MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
  const bool IsG16 = GradTy == S16;
  const bool IsA16 = AddrTy == S16;

  // Number of result components implied by the dmask (stays 0 for atomics).
  int DMaskLanes = 0;
  if (!BaseOpcode->Atomic) {
    DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
    if (BaseOpcode->Gather4) {
      // Gather4 produces all 4 components regardless of the dmask value.
      DMaskLanes = 4;
    } else if (DMask != 0) {
      DMaskLanes = countPopulation(DMask);
    } else if (!IsTFE && !BaseOpcode->Store) {
      // If dmask is 0, this is a no-op load. This can be eliminated.
      B.buildUndef(MI.getOperand(0));
      MI.eraseFromParent();
      return true;
    }
  }

  // From here on every exit path reports the mutation to the observer.
  Observer.changingInstr(MI);
  auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });

  unsigned NewOpcode = NumDefs == 0 ?
    AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;

  // Track that we legalized this
  MI.setDesc(B.getTII().get(NewOpcode));

  // An error flag result is expected since TFE is on, but dmask is 0. Force
  // dmask to be at least 1, otherwise the instruction will fail.
  if (IsTFE && DMask == 0) {
    DMask = 0x1;
    DMaskLanes = 1;
    MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
  }

  if (BaseOpcode->Atomic) {
    Register VData0 = MI.getOperand(2).getReg();
    LLT Ty = MRI->getType(VData0);

    // TODO: Allow atomic swap and bit ops for v2s16/v4s16
    if (Ty.isVector())
      return false;

    if (BaseOpcode->AtomicX2) {
      // X2 atomics (e.g. cmpswap) carry a second data operand.
      Register VData1 = MI.getOperand(3).getReg();
      // The two values are packed in one register.
      LLT PackedTy = LLT::vector(2, Ty);
      auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
      MI.getOperand(2).setReg(Concat.getReg(0));
      MI.getOperand(3).setReg(AMDGPU::NoRegister);
    }
  }

  // Address operand count after the _L/_mip folds below may shrink it.
  unsigned CorrectedNumVAddrs = Intr->NumVAddrs;

  // Optimize _L to _LZ when _L is zero
  if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
          AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
    const ConstantFP *ConstantLod;

    if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
                 m_GFCst(ConstantLod))) {
      if (ConstantLod->isZero() || ConstantLod->isNegative()) {
        // Set new opcode to _lz variant of _l, and change the intrinsic ID.
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ,
                                                      Intr->Dim);

        // The starting indexes should remain in the same place.
        --CorrectedNumVAddrs;

        MI.getOperand(MI.getNumExplicitDefs())
            .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
        MI.RemoveOperand(ArgOffset + Intr->LodIndex);
        Intr = NewImageDimIntr;
      }
    }
  }

  // Optimize _mip away, when 'lod' is zero
  if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
    int64_t ConstantLod;
    if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
                 m_ICst(ConstantLod))) {
      if (ConstantLod == 0) {
        // TODO: Change intrinsic opcode and remove operand instead of replacing
        // it with 0, as the _L to _LZ handling is done above.
        MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
        --CorrectedNumVAddrs;
      }
    }
  }

  // Rewrite the addressing register layout before doing anything else.
  if (IsA16 || IsG16) {
    if (IsA16) {
      // Target must support the feature and gradients need to be 16 bit too
      if (!ST.hasA16() || !IsG16)
        return false;
    } else if (!ST.hasG16())
      return false;

    if (Intr->NumVAddrs > 1) {
      SmallVector<Register, 4> PackedRegs;
      // Don't compress addresses for G16
      const int PackEndIdx = IsA16 ? Intr->VAddrEnd : Intr->CoordStart;
      packImageA16AddressToDwords(B, MI, PackedRegs, ArgOffset, Intr,
                                  PackEndIdx);

      if (!IsA16) {
        // Add uncompressed address
        for (unsigned I = Intr->CoordStart; I < Intr->VAddrEnd; I++) {
          int AddrReg = MI.getOperand(ArgOffset + I).getReg();
          assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
          PackedRegs.push_back(AddrReg);
        }
      }

      // See also below in the non-a16 branch
      const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();

      if (!UseNSA && PackedRegs.size() > 1) {
        // Without NSA, merge all packed parts into one contiguous vector reg.
        LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
        auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
        PackedRegs[0] = Concat.getReg(0);
        PackedRegs.resize(1);
      }

      // Rewrite address operands in place; leftover slots become $noreg.
      const unsigned NumPacked = PackedRegs.size();
      for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
        MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
        if (!SrcOp.isReg()) {
          assert(SrcOp.isImm() && SrcOp.getImm() == 0);
          continue;
        }

        assert(SrcOp.getReg() != AMDGPU::NoRegister);

        if (I - Intr->VAddrStart < NumPacked)
          SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
        else
          SrcOp.setReg(AMDGPU::NoRegister);
      }
    }
  } else {
    // If the register allocator cannot place the address registers contiguously
    // without introducing moves, then using the non-sequential address encoding
    // is always preferable, since it saves VALU instructions and is usually a
    // wash in terms of code size or even better.
    //
    // However, we currently have no way of hinting to the register allocator
    // that MIMG addresses should be placed contiguously when it is possible to
    // do so, so force non-NSA for the common 2-address case as a heuristic.
    //
    // SIShrinkInstructions will convert NSA encodings to non-NSA after register
    // allocation when possible.
    const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();

    if (!UseNSA && Intr->NumVAddrs > 1)
      convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
                               Intr->NumVAddrs);
  }

  // Append an immediate operand recording the a16/g16 state for selection.
  int Flags = 0;
  if (IsA16)
    Flags |= 1;
  if (IsG16)
    Flags |= 2;
  MI.addOperand(MachineOperand::CreateImm(Flags));

  if (BaseOpcode->Store) { // No TFE for stores?
    // TODO: Handle dmask trim
    Register VData = MI.getOperand(1).getReg();
    LLT Ty = MRI->getType(VData);
    if (!Ty.isVector() || Ty.getElementType() != S16)
      return true;

    // Repack the 16-bit store data into the layout the subtarget expects.
    Register RepackedReg = handleD16VData(B, *MRI, VData, true);
    if (RepackedReg != VData) {
      MI.getOperand(1).setReg(RepackedReg);
    }

    return true;
  }

  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI->getType(DstReg);
  const LLT EltTy = Ty.getScalarType();
  const bool IsD16 = Ty.getScalarType() == S16;
  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;

  // Confirm that the return type is large enough for the dmask specified
  if (NumElts < DMaskLanes)
    return false;

  if (NumElts > 4 || DMaskLanes > 4)
    return false;

  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
  const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);

  // The raw dword aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
  LLT RoundedTy;

  // S32 vector to cover all data, plus TFE result element.
  LLT TFETy;

  // Register type to use for each loaded component. Will be S32 or V2S16.
  LLT RegTy;

  if (IsD16 && ST.hasUnpackedD16VMem()) {
    RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
    TFETy = LLT::vector(AdjustedNumElts + 1, 32);
    RegTy = S32;
  } else {
    unsigned EltSize = EltTy.getSizeInBits();
    unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
    unsigned RoundedSize = 32 * RoundedElts;
    RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
    TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
    RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
  }

  // The return type does not need adjustment.
  // TODO: Should we change s16 case to s32 or <2 x s16>?
  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
    return true;

  Register Dst1Reg;

  // Insert after the instruction.
  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;

  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);

  MI.getOperand(0).setReg(NewResultReg);

  // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.

  if (IsTFE) {
    Dst1Reg = MI.getOperand(1).getReg();
    if (MRI->getType(Dst1Reg) != S32)
      return false;

    // TODO: Make sure the TFE operand bit is set.
    MI.RemoveOperand(1);

    // Handle the easy case that requires no repack instructions.
    if (Ty == S32) {
      B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
      return true;
    }
  }

  // Now figure out how to copy the new result register back into the old
  // result.
  SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);

  const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;

  if (ResultNumRegs == 1) {
    assert(!IsTFE);
    ResultRegs[0] = NewResultReg;
  } else {
    // We have to repack into a new vector of some kind.
    for (int I = 0; I != NumDataRegs; ++I)
      ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
    B.buildUnmerge(ResultRegs, NewResultReg);

    // Drop the final TFE element to get the data part. The TFE result is
    // directly written to the right place already.
    if (IsTFE)
      ResultRegs.resize(NumDataRegs);
  }

  // For an s16 scalar result, we form an s32 result with a truncate regardless
  // of packed vs. unpacked.
  if (IsD16 && !Ty.isVector()) {
    B.buildTrunc(DstReg, ResultRegs[0]);
    return true;
  }

  // Avoid a build/concat_vector of 1 entry.
  if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
    B.buildBitcast(DstReg, ResultRegs[0]);
    return true;
  }

  assert(Ty.isVector());

  if (IsD16) {
    // For packed D16 results with TFE enabled, all the data components are
    // S32. Cast back to the expected type.
    //
    // TODO: We don't really need to use load s32 elements. We would only need one
    // cast for the TFE result if a multiple of v2s16 was used.
    if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildBitcast(V2S16, Reg).getReg(0);
    } else if (ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildTrunc(S16, Reg).getReg(0);
    }
  }

  // Appends NumElts copies of a single undef value of type Ty to ResultRegs.
  auto padWithUndef = [&](LLT Ty, int NumElts) {
    if (NumElts == 0)
      return;
    Register Undef = B.buildUndef(Ty).getReg(0);
    for (int I = 0; I != NumElts; ++I)
      ResultRegs.push_back(Undef);
  };

  // Pad out any elements eliminated due to the dmask.
  LLT ResTy = MRI->getType(ResultRegs[0]);
  if (!ResTy.isVector()) {
    padWithUndef(ResTy, NumElts - ResultRegs.size());
    B.buildBuildVector(DstReg, ResultRegs);
    return true;
  }

  assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
  const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;

  // Deal with the one annoying legal case.
  const LLT V3S16 = LLT::vector(3, 16);
  if (Ty == V3S16) {
    // Build a <6 x s16> (with one extra undef v2s16 part), then unmerge it
    // into the real <3 x s16> result plus a dead <3 x s16>.
    padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
    auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
    B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
    return true;
  }

  padWithUndef(ResTy, RegsToCover - ResultRegs.size());
  B.buildConcatVectors(DstReg, ResultRegs);
  return true;
}
44505ffd83dbSDimitry Andric 
bool AMDGPULegalizerInfo::legalizeSBufferLoad(
  LegalizerHelper &Helper, MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  GISelChangeObserver &Observer = Helper.Observer;

  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Size = Ty.getSizeInBits();
  MachineFunction &MF = B.getMF();

  Observer.changingInstr(MI);

  // If the subtarget wants this result type bitcast for load/store, rewrite
  // the def now; bitcastDst replaces the def, so re-read Dst and reset the
  // builder's insert point to stay in front of MI.
  if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
    Ty = getBitcastRegisterType(Ty);
    Helper.bitcastDst(MI, Ty, 0);
    Dst = MI.getOperand(0).getReg();
    B.setInsertPt(B.getMBB(), MI);
  }

  // FIXME: We don't really need this intermediate instruction. The intrinsic
  // should be fixed to have a memory operand. Since it's readnone, we're not
  // allowed to add one.
  MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
  MI.RemoveOperand(1); // Remove intrinsic ID

  // FIXME: When intrinsic definition is fixed, this should have an MMO already.
  // TODO: Should this use datalayout alignment?
  const unsigned MemSize = (Size + 7) / 8;  // bits -> bytes, rounded up
  const Align MemAlign(4);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      MemSize, MemAlign);
  MI.addMemOperand(MF, MMO);

  // There are no 96-bit result scalar loads, but widening to 128-bit should
  // always be legal. We may need to restore this to a 96-bit result if it turns
  // out this needs to be converted to a vector load during RegBankSelect.
  if (!isPowerOf2_32(Size)) {
    if (Ty.isVector())
      Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
    else
      Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
  }

  Observer.changedInstr(MI);
  return true;
}
45005ffd83dbSDimitry Andric 
4501*e8d8bef9SDimitry Andric // TODO: Move to selection
45025ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
45030b57cec5SDimitry Andric                                                 MachineRegisterInfo &MRI,
45040b57cec5SDimitry Andric                                                 MachineIRBuilder &B) const {
45055ffd83dbSDimitry Andric   // Is non-HSA path or trap-handler disabled? then, insert s_endpgm instruction
45065ffd83dbSDimitry Andric   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
45075ffd83dbSDimitry Andric       !ST.isTrapHandlerEnabled()) {
45085ffd83dbSDimitry Andric     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
45095ffd83dbSDimitry Andric   } else {
45105ffd83dbSDimitry Andric     // Pass queue pointer to trap handler as input, and insert trap instruction
45115ffd83dbSDimitry Andric     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
45125ffd83dbSDimitry Andric     MachineRegisterInfo &MRI = *B.getMRI();
4513*e8d8bef9SDimitry Andric 
4514*e8d8bef9SDimitry Andric     Register LiveIn =
4515*e8d8bef9SDimitry Andric       MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
4516*e8d8bef9SDimitry Andric     if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
45175ffd83dbSDimitry Andric       return false;
4518*e8d8bef9SDimitry Andric 
4519*e8d8bef9SDimitry Andric     Register SGPR01(AMDGPU::SGPR0_SGPR1);
45205ffd83dbSDimitry Andric     B.buildCopy(SGPR01, LiveIn);
45215ffd83dbSDimitry Andric     B.buildInstr(AMDGPU::S_TRAP)
45225ffd83dbSDimitry Andric         .addImm(GCNSubtarget::TrapIDLLVMTrap)
45235ffd83dbSDimitry Andric         .addReg(SGPR01, RegState::Implicit);
45245ffd83dbSDimitry Andric   }
45255ffd83dbSDimitry Andric 
45265ffd83dbSDimitry Andric   MI.eraseFromParent();
45275ffd83dbSDimitry Andric   return true;
45285ffd83dbSDimitry Andric }
45295ffd83dbSDimitry Andric 
45305ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
45315ffd83dbSDimitry Andric     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
45325ffd83dbSDimitry Andric   // Is non-HSA path or trap-handler disabled? then, report a warning
45335ffd83dbSDimitry Andric   // accordingly
45345ffd83dbSDimitry Andric   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
45355ffd83dbSDimitry Andric       !ST.isTrapHandlerEnabled()) {
45365ffd83dbSDimitry Andric     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
45375ffd83dbSDimitry Andric                                      "debugtrap handler not supported",
45385ffd83dbSDimitry Andric                                      MI.getDebugLoc(), DS_Warning);
45395ffd83dbSDimitry Andric     LLVMContext &Ctx = B.getMF().getFunction().getContext();
45405ffd83dbSDimitry Andric     Ctx.diagnose(NoTrap);
45415ffd83dbSDimitry Andric   } else {
45425ffd83dbSDimitry Andric     // Insert debug-trap instruction
45435ffd83dbSDimitry Andric     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
45445ffd83dbSDimitry Andric   }
45455ffd83dbSDimitry Andric 
45465ffd83dbSDimitry Andric   MI.eraseFromParent();
45475ffd83dbSDimitry Andric   return true;
45485ffd83dbSDimitry Andric }
45495ffd83dbSDimitry Andric 
4550*e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
4551*e8d8bef9SDimitry Andric                                                MachineIRBuilder &B) const {
4552*e8d8bef9SDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
4553*e8d8bef9SDimitry Andric   const LLT S16 = LLT::scalar(16);
4554*e8d8bef9SDimitry Andric   const LLT S32 = LLT::scalar(32);
4555*e8d8bef9SDimitry Andric 
4556*e8d8bef9SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
4557*e8d8bef9SDimitry Andric   Register NodePtr = MI.getOperand(2).getReg();
4558*e8d8bef9SDimitry Andric   Register RayExtent = MI.getOperand(3).getReg();
4559*e8d8bef9SDimitry Andric   Register RayOrigin = MI.getOperand(4).getReg();
4560*e8d8bef9SDimitry Andric   Register RayDir = MI.getOperand(5).getReg();
4561*e8d8bef9SDimitry Andric   Register RayInvDir = MI.getOperand(6).getReg();
4562*e8d8bef9SDimitry Andric   Register TDescr = MI.getOperand(7).getReg();
4563*e8d8bef9SDimitry Andric 
4564*e8d8bef9SDimitry Andric   bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
4565*e8d8bef9SDimitry Andric   bool Is64 =  MRI.getType(NodePtr).getSizeInBits() == 64;
4566*e8d8bef9SDimitry Andric   unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
4567*e8d8bef9SDimitry Andric                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
4568*e8d8bef9SDimitry Andric                           : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
4569*e8d8bef9SDimitry Andric                                  : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
4570*e8d8bef9SDimitry Andric 
4571*e8d8bef9SDimitry Andric   SmallVector<Register, 12> Ops;
4572*e8d8bef9SDimitry Andric   if (Is64) {
4573*e8d8bef9SDimitry Andric     auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
4574*e8d8bef9SDimitry Andric     Ops.push_back(Unmerge.getReg(0));
4575*e8d8bef9SDimitry Andric     Ops.push_back(Unmerge.getReg(1));
4576*e8d8bef9SDimitry Andric   } else {
4577*e8d8bef9SDimitry Andric     Ops.push_back(NodePtr);
4578*e8d8bef9SDimitry Andric   }
4579*e8d8bef9SDimitry Andric   Ops.push_back(RayExtent);
4580*e8d8bef9SDimitry Andric 
4581*e8d8bef9SDimitry Andric   auto packLanes = [&Ops, &S32, &B] (Register Src) {
4582*e8d8bef9SDimitry Andric     auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
4583*e8d8bef9SDimitry Andric     Ops.push_back(Unmerge.getReg(0));
4584*e8d8bef9SDimitry Andric     Ops.push_back(Unmerge.getReg(1));
4585*e8d8bef9SDimitry Andric     Ops.push_back(Unmerge.getReg(2));
4586*e8d8bef9SDimitry Andric   };
4587*e8d8bef9SDimitry Andric 
4588*e8d8bef9SDimitry Andric   packLanes(RayOrigin);
4589*e8d8bef9SDimitry Andric   if (IsA16) {
4590*e8d8bef9SDimitry Andric     auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
4591*e8d8bef9SDimitry Andric     auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
4592*e8d8bef9SDimitry Andric     Register R1 = MRI.createGenericVirtualRegister(S32);
4593*e8d8bef9SDimitry Andric     Register R2 = MRI.createGenericVirtualRegister(S32);
4594*e8d8bef9SDimitry Andric     Register R3 = MRI.createGenericVirtualRegister(S32);
4595*e8d8bef9SDimitry Andric     B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
4596*e8d8bef9SDimitry Andric     B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
4597*e8d8bef9SDimitry Andric     B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
4598*e8d8bef9SDimitry Andric     Ops.push_back(R1);
4599*e8d8bef9SDimitry Andric     Ops.push_back(R2);
4600*e8d8bef9SDimitry Andric     Ops.push_back(R3);
4601*e8d8bef9SDimitry Andric   } else {
4602*e8d8bef9SDimitry Andric     packLanes(RayDir);
4603*e8d8bef9SDimitry Andric     packLanes(RayInvDir);
4604*e8d8bef9SDimitry Andric   }
4605*e8d8bef9SDimitry Andric 
4606*e8d8bef9SDimitry Andric   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
4607*e8d8bef9SDimitry Andric     .addDef(DstReg)
4608*e8d8bef9SDimitry Andric     .addImm(Opcode);
4609*e8d8bef9SDimitry Andric 
4610*e8d8bef9SDimitry Andric   for (Register R : Ops) {
4611*e8d8bef9SDimitry Andric     MIB.addUse(R);
4612*e8d8bef9SDimitry Andric   }
4613*e8d8bef9SDimitry Andric 
4614*e8d8bef9SDimitry Andric   MIB.addUse(TDescr)
4615*e8d8bef9SDimitry Andric      .addImm(IsA16 ? 1 : 0)
4616*e8d8bef9SDimitry Andric      .cloneMemRefs(MI);
4617*e8d8bef9SDimitry Andric 
4618*e8d8bef9SDimitry Andric   MI.eraseFromParent();
4619*e8d8bef9SDimitry Andric   return true;
4620*e8d8bef9SDimitry Andric }
4621*e8d8bef9SDimitry Andric 
// Custom-legalize a target intrinsic. Returns true if the intrinsic was
// handled (or needs no further legalization), false on failure (e.g. a
// control-flow intrinsic whose surrounding CFG shape could not be verified).
bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                            MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    // verifyCFIntrinsic locates the G_BRCOND consuming this intrinsic's
    // result (and the trailing G_BR / fallthrough target); bail out if the
    // expected pattern isn't present.
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      Register Def = MI.getOperand(1).getReg();   // exec-mask result
      Register Use = MI.getOperand(3).getReg();   // condition/source mask

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();

      // If the branch condition was the inverted intrinsic result, taking the
      // branch corresponds to the "false" edge, so exchange the two targets.
      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      // Build the SI_IF/SI_ELSE pseudo where the G_BRCOND was.
      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
          .addDef(Def)
          .addUse(Use)
          .addMBB(UncondBrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
            .addDef(Def)
            .addUse(Use)
            .addMBB(UncondBrTarget);
      }

      if (Br) {
        Br->getOperand(0).setMBB(CondBrTarget);
      } else {
        // The IRTranslator skips inserting the G_BR for fallthrough cases, but
        // since we're swapping branch targets it needs to be reinserted.
        // FIXME: IRTranslator should probably not do this
        B.buildBr(*CondBrTarget);
      }

      // The exec-mask values must live in wave-mask (SGPR) registers.
      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      Register Reg = MI.getOperand(2).getReg();   // loop exec mask

      // See the amdgcn_if case: inverted condition swaps the edge meanings.
      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      // SI_LOOP branches back to the loop header while lanes remain active.
      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  // The workitem/workgroup id and dispatch/queue pointer intrinsics all read
  // values preloaded into registers by the hardware/ABI; they share one
  // lowering parameterized by the argument kind.
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    // Wavefront size is a subtarget constant; fold it directly.
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
  // Buffer store/load variants share a lowering parameterized by
  // (IsTyped, IsFormat).
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  // All buffer atomics share one lowering; the intrinsic ID selects the
  // operation inside legalizeBufferAtomic.
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
  default: {
    // Image-dimension intrinsics are table-driven; anything else is already
    // legal as-is.
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}
4840