xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (revision 06c3fb2749bda94cb5201f81ffdb8fa6c3161b2e)
//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//
130b57cec5SDimitry Andric 
145ffd83dbSDimitry Andric #include "AMDGPULegalizerInfo.h"
158bcb0991SDimitry Andric 
160b57cec5SDimitry Andric #include "AMDGPU.h"
175ffd83dbSDimitry Andric #include "AMDGPUGlobalISelUtils.h"
18e8d8bef9SDimitry Andric #include "AMDGPUInstrInfo.h"
190b57cec5SDimitry Andric #include "AMDGPUTargetMachine.h"
200b57cec5SDimitry Andric #include "SIMachineFunctionInfo.h"
21fe6060f1SDimitry Andric #include "Utils/AMDGPUBaseInfo.h"
225ffd83dbSDimitry Andric #include "llvm/ADT/ScopeExit.h"
23fe6060f1SDimitry Andric #include "llvm/BinaryFormat/ELF.h"
240b57cec5SDimitry Andric #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
255ffd83dbSDimitry Andric #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26e8d8bef9SDimitry Andric #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
27*06c3fb27SDimitry Andric #include "llvm/CodeGen/GlobalISel/Utils.h"
288bcb0991SDimitry Andric #include "llvm/IR/DiagnosticInfo.h"
29e8d8bef9SDimitry Andric #include "llvm/IR/IntrinsicsAMDGPU.h"
3081ad6265SDimitry Andric #include "llvm/IR/IntrinsicsR600.h"
310b57cec5SDimitry Andric 
320b57cec5SDimitry Andric #define DEBUG_TYPE "amdgpu-legalinfo"
330b57cec5SDimitry Andric 
340b57cec5SDimitry Andric using namespace llvm;
350b57cec5SDimitry Andric using namespace LegalizeActions;
360b57cec5SDimitry Andric using namespace LegalizeMutations;
370b57cec5SDimitry Andric using namespace LegalityPredicates;
385ffd83dbSDimitry Andric using namespace MIPatternMatch;
390b57cec5SDimitry Andric 
405ffd83dbSDimitry Andric // Hack until load/store selection patterns support any tuple of legal types.
415ffd83dbSDimitry Andric static cl::opt<bool> EnableNewLegality(
425ffd83dbSDimitry Andric   "amdgpu-global-isel-new-legality",
435ffd83dbSDimitry Andric   cl::desc("Use GlobalISel desired legality, rather than try to use"
445ffd83dbSDimitry Andric            "rules compatible with selection patterns"),
455ffd83dbSDimitry Andric   cl::init(false),
465ffd83dbSDimitry Andric   cl::ReallyHidden);
470b57cec5SDimitry Andric 
/// Largest size, in bits, accepted by isRegisterSize(); also used as the bound
/// when searching for the next existing register class.
static constexpr unsigned MaxRegisterSize{1024};
495ffd83dbSDimitry Andric 
505ffd83dbSDimitry Andric // Round the number of elements to the next power of two elements
515ffd83dbSDimitry Andric static LLT getPow2VectorType(LLT Ty) {
525ffd83dbSDimitry Andric   unsigned NElts = Ty.getNumElements();
535ffd83dbSDimitry Andric   unsigned Pow2NElts = 1 <<  Log2_32_Ceil(NElts);
54fe6060f1SDimitry Andric   return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
550b57cec5SDimitry Andric }
560b57cec5SDimitry Andric 
575ffd83dbSDimitry Andric // Round the number of bits to the next power of two bits
585ffd83dbSDimitry Andric static LLT getPow2ScalarType(LLT Ty) {
595ffd83dbSDimitry Andric   unsigned Bits = Ty.getSizeInBits();
605ffd83dbSDimitry Andric   unsigned Pow2Bits = 1 <<  Log2_32_Ceil(Bits);
615ffd83dbSDimitry Andric   return LLT::scalar(Pow2Bits);
628bcb0991SDimitry Andric }
638bcb0991SDimitry Andric 
64349cc55cSDimitry Andric /// \returns true if this is an odd sized vector which should widen by adding an
65e8d8bef9SDimitry Andric /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
66e8d8bef9SDimitry Andric /// excludes s1 vectors, which should always be scalarized.
670b57cec5SDimitry Andric static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
680b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
690b57cec5SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
70e8d8bef9SDimitry Andric     if (!Ty.isVector())
71e8d8bef9SDimitry Andric       return false;
72e8d8bef9SDimitry Andric 
73e8d8bef9SDimitry Andric     const LLT EltTy = Ty.getElementType();
74e8d8bef9SDimitry Andric     const unsigned EltSize = EltTy.getSizeInBits();
75e8d8bef9SDimitry Andric     return Ty.getNumElements() % 2 != 0 &&
76e8d8bef9SDimitry Andric            EltSize > 1 && EltSize < 32 &&
778bcb0991SDimitry Andric            Ty.getSizeInBits() % 32 != 0;
788bcb0991SDimitry Andric   };
798bcb0991SDimitry Andric }
808bcb0991SDimitry Andric 
81e8d8bef9SDimitry Andric static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
82e8d8bef9SDimitry Andric   return [=](const LegalityQuery &Query) {
83e8d8bef9SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
84e8d8bef9SDimitry Andric     return Ty.getSizeInBits() % 32 == 0;
85e8d8bef9SDimitry Andric   };
86e8d8bef9SDimitry Andric }
87e8d8bef9SDimitry Andric 
888bcb0991SDimitry Andric static LegalityPredicate isWideVec16(unsigned TypeIdx) {
898bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
908bcb0991SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
918bcb0991SDimitry Andric     const LLT EltTy = Ty.getScalarType();
928bcb0991SDimitry Andric     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
930b57cec5SDimitry Andric   };
940b57cec5SDimitry Andric }
950b57cec5SDimitry Andric 
960b57cec5SDimitry Andric static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
970b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
980b57cec5SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
990b57cec5SDimitry Andric     const LLT EltTy = Ty.getElementType();
100bdd1243dSDimitry Andric     return std::pair(TypeIdx,
101fe6060f1SDimitry Andric                      LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
1020b57cec5SDimitry Andric   };
1030b57cec5SDimitry Andric }
1040b57cec5SDimitry Andric 
1050b57cec5SDimitry Andric static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
1060b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
1070b57cec5SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
1080b57cec5SDimitry Andric     const LLT EltTy = Ty.getElementType();
1090b57cec5SDimitry Andric     unsigned Size = Ty.getSizeInBits();
1100b57cec5SDimitry Andric     unsigned Pieces = (Size + 63) / 64;
1110b57cec5SDimitry Andric     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
112bdd1243dSDimitry Andric     return std::pair(TypeIdx, LLT::scalarOrVector(
113bdd1243dSDimitry Andric                                   ElementCount::getFixed(NewNumElts), EltTy));
1140b57cec5SDimitry Andric   };
1150b57cec5SDimitry Andric }
1160b57cec5SDimitry Andric 
1178bcb0991SDimitry Andric // Increase the number of vector elements to reach the next multiple of 32-bit
1188bcb0991SDimitry Andric // type.
1198bcb0991SDimitry Andric static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
1208bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
1218bcb0991SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
1228bcb0991SDimitry Andric 
1238bcb0991SDimitry Andric     const LLT EltTy = Ty.getElementType();
1248bcb0991SDimitry Andric     const int Size = Ty.getSizeInBits();
1258bcb0991SDimitry Andric     const int EltSize = EltTy.getSizeInBits();
1268bcb0991SDimitry Andric     const int NextMul32 = (Size + 31) / 32;
1278bcb0991SDimitry Andric 
1288bcb0991SDimitry Andric     assert(EltSize < 32);
1298bcb0991SDimitry Andric 
1308bcb0991SDimitry Andric     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
131bdd1243dSDimitry Andric     return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
1328bcb0991SDimitry Andric   };
1338bcb0991SDimitry Andric }
1348bcb0991SDimitry Andric 
135*06c3fb27SDimitry Andric // Increase the number of vector elements to reach the next legal RegClass.
136*06c3fb27SDimitry Andric static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
137*06c3fb27SDimitry Andric   return [=](const LegalityQuery &Query) {
138*06c3fb27SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
139*06c3fb27SDimitry Andric     const unsigned NumElts = Ty.getNumElements();
140*06c3fb27SDimitry Andric     const unsigned EltSize = Ty.getElementType().getSizeInBits();
141*06c3fb27SDimitry Andric     const unsigned MaxNumElts = MaxRegisterSize / EltSize;
142*06c3fb27SDimitry Andric 
143*06c3fb27SDimitry Andric     assert(EltSize == 32 || EltSize == 64);
144*06c3fb27SDimitry Andric     assert(Ty.getSizeInBits() < MaxRegisterSize);
145*06c3fb27SDimitry Andric 
146*06c3fb27SDimitry Andric     unsigned NewNumElts;
147*06c3fb27SDimitry Andric     // Find the nearest legal RegClass that is larger than the current type.
148*06c3fb27SDimitry Andric     for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
149*06c3fb27SDimitry Andric       if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize))
150*06c3fb27SDimitry Andric         break;
151*06c3fb27SDimitry Andric     }
152*06c3fb27SDimitry Andric 
153*06c3fb27SDimitry Andric     return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltSize));
154*06c3fb27SDimitry Andric   };
155*06c3fb27SDimitry Andric }
156*06c3fb27SDimitry Andric 
157*06c3fb27SDimitry Andric static LLT getBufferRsrcScalarType(const LLT Ty) {
158*06c3fb27SDimitry Andric   if (!Ty.isVector())
159*06c3fb27SDimitry Andric     return LLT::scalar(128);
160*06c3fb27SDimitry Andric   const ElementCount NumElems = Ty.getElementCount();
161*06c3fb27SDimitry Andric   return LLT::vector(NumElems, LLT::scalar(128));
162*06c3fb27SDimitry Andric }
163*06c3fb27SDimitry Andric 
164*06c3fb27SDimitry Andric static LLT getBufferRsrcRegisterType(const LLT Ty) {
165*06c3fb27SDimitry Andric   if (!Ty.isVector())
166*06c3fb27SDimitry Andric     return LLT::fixed_vector(4, LLT::scalar(32));
167*06c3fb27SDimitry Andric   const unsigned NumElems = Ty.getElementCount().getFixedValue();
168*06c3fb27SDimitry Andric   return LLT::fixed_vector(NumElems * 4, LLT::scalar(32));
169*06c3fb27SDimitry Andric }
170*06c3fb27SDimitry Andric 
171e8d8bef9SDimitry Andric static LLT getBitcastRegisterType(const LLT Ty) {
172e8d8bef9SDimitry Andric   const unsigned Size = Ty.getSizeInBits();
1735ffd83dbSDimitry Andric 
1745ffd83dbSDimitry Andric   if (Size <= 32) {
1755ffd83dbSDimitry Andric     // <2 x s8> -> s16
1765ffd83dbSDimitry Andric     // <4 x s8> -> s32
177e8d8bef9SDimitry Andric     return LLT::scalar(Size);
178e8d8bef9SDimitry Andric   }
1795ffd83dbSDimitry Andric 
180fe6060f1SDimitry Andric   return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
181e8d8bef9SDimitry Andric }
182e8d8bef9SDimitry Andric 
183e8d8bef9SDimitry Andric static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
184e8d8bef9SDimitry Andric   return [=](const LegalityQuery &Query) {
185e8d8bef9SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
186bdd1243dSDimitry Andric     return std::pair(TypeIdx, getBitcastRegisterType(Ty));
187e8d8bef9SDimitry Andric   };
188e8d8bef9SDimitry Andric }
189e8d8bef9SDimitry Andric 
190e8d8bef9SDimitry Andric static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
191e8d8bef9SDimitry Andric   return [=](const LegalityQuery &Query) {
192e8d8bef9SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
193e8d8bef9SDimitry Andric     unsigned Size = Ty.getSizeInBits();
194e8d8bef9SDimitry Andric     assert(Size % 32 == 0);
195bdd1243dSDimitry Andric     return std::pair(
196fe6060f1SDimitry Andric         TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
1975ffd83dbSDimitry Andric   };
1985ffd83dbSDimitry Andric }
1995ffd83dbSDimitry Andric 
2008bcb0991SDimitry Andric static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
2018bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
2028bcb0991SDimitry Andric     const LLT QueryTy = Query.Types[TypeIdx];
2038bcb0991SDimitry Andric     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
2048bcb0991SDimitry Andric   };
2058bcb0991SDimitry Andric }
2068bcb0991SDimitry Andric 
2070b57cec5SDimitry Andric static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
2080b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
2090b57cec5SDimitry Andric     const LLT QueryTy = Query.Types[TypeIdx];
2100b57cec5SDimitry Andric     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
2110b57cec5SDimitry Andric   };
2120b57cec5SDimitry Andric }
2130b57cec5SDimitry Andric 
2140b57cec5SDimitry Andric static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
2150b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
2160b57cec5SDimitry Andric     const LLT QueryTy = Query.Types[TypeIdx];
2170b57cec5SDimitry Andric     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
2180b57cec5SDimitry Andric   };
2190b57cec5SDimitry Andric }
2200b57cec5SDimitry Andric 
2215ffd83dbSDimitry Andric static bool isRegisterSize(unsigned Size) {
2225ffd83dbSDimitry Andric   return Size % 32 == 0 && Size <= MaxRegisterSize;
2235ffd83dbSDimitry Andric }
2245ffd83dbSDimitry Andric 
2255ffd83dbSDimitry Andric static bool isRegisterVectorElementType(LLT EltTy) {
2265ffd83dbSDimitry Andric   const int EltSize = EltTy.getSizeInBits();
2275ffd83dbSDimitry Andric   return EltSize == 16 || EltSize % 32 == 0;
2285ffd83dbSDimitry Andric }
2295ffd83dbSDimitry Andric 
2305ffd83dbSDimitry Andric static bool isRegisterVectorType(LLT Ty) {
2310b57cec5SDimitry Andric   const int EltSize = Ty.getElementType().getSizeInBits();
2320b57cec5SDimitry Andric   return EltSize == 32 || EltSize == 64 ||
2330b57cec5SDimitry Andric          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
2340b57cec5SDimitry Andric          EltSize == 128 || EltSize == 256;
2350b57cec5SDimitry Andric }
2360b57cec5SDimitry Andric 
2375ffd83dbSDimitry Andric static bool isRegisterType(LLT Ty) {
2385ffd83dbSDimitry Andric   if (!isRegisterSize(Ty.getSizeInBits()))
2395ffd83dbSDimitry Andric     return false;
2405ffd83dbSDimitry Andric 
2415ffd83dbSDimitry Andric   if (Ty.isVector())
2425ffd83dbSDimitry Andric     return isRegisterVectorType(Ty);
2435ffd83dbSDimitry Andric 
2445ffd83dbSDimitry Andric   return true;
2455ffd83dbSDimitry Andric }
2465ffd83dbSDimitry Andric 
2475ffd83dbSDimitry Andric // Any combination of 32 or 64-bit elements up the maximum register size, and
2485ffd83dbSDimitry Andric // multiples of v2s16.
2495ffd83dbSDimitry Andric static LegalityPredicate isRegisterType(unsigned TypeIdx) {
2505ffd83dbSDimitry Andric   return [=](const LegalityQuery &Query) {
2515ffd83dbSDimitry Andric     return isRegisterType(Query.Types[TypeIdx]);
2528bcb0991SDimitry Andric   };
2538bcb0991SDimitry Andric }
2548bcb0991SDimitry Andric 
255*06c3fb27SDimitry Andric // RegisterType that doesn't have a corresponding RegClass.
256*06c3fb27SDimitry Andric static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx) {
257*06c3fb27SDimitry Andric   return [=](const LegalityQuery &Query) {
258*06c3fb27SDimitry Andric     LLT Ty = Query.Types[TypeIdx];
259*06c3fb27SDimitry Andric     return isRegisterType(Ty) &&
260*06c3fb27SDimitry Andric            !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits());
261*06c3fb27SDimitry Andric   };
262*06c3fb27SDimitry Andric }
263*06c3fb27SDimitry Andric 
2645ffd83dbSDimitry Andric static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
2658bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
2665ffd83dbSDimitry Andric     const LLT QueryTy = Query.Types[TypeIdx];
2675ffd83dbSDimitry Andric     if (!QueryTy.isVector())
2685ffd83dbSDimitry Andric       return false;
2695ffd83dbSDimitry Andric     const LLT EltTy = QueryTy.getElementType();
2705ffd83dbSDimitry Andric     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
2718bcb0991SDimitry Andric   };
2728bcb0991SDimitry Andric }
2738bcb0991SDimitry Andric 
274fe6060f1SDimitry Andric // If we have a truncating store or an extending load with a data size larger
275fe6060f1SDimitry Andric // than 32-bits, we need to reduce to a 32-bit type.
276fe6060f1SDimitry Andric static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
2778bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
2788bcb0991SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
2798bcb0991SDimitry Andric     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
280fe6060f1SDimitry Andric            Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
2810b57cec5SDimitry Andric   };
2820b57cec5SDimitry Andric }
2830b57cec5SDimitry Andric 
2845ffd83dbSDimitry Andric // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
2855ffd83dbSDimitry Andric // handle some operations by just promoting the register during
2865ffd83dbSDimitry Andric // selection. There are also d16 loads on GFX9+ which preserve the high bits.
2875ffd83dbSDimitry Andric static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
288*06c3fb27SDimitry Andric                                     bool IsLoad, bool IsAtomic) {
2895ffd83dbSDimitry Andric   switch (AS) {
2905ffd83dbSDimitry Andric   case AMDGPUAS::PRIVATE_ADDRESS:
2915ffd83dbSDimitry Andric     // FIXME: Private element size.
292e8d8bef9SDimitry Andric     return ST.enableFlatScratch() ? 128 : 32;
2935ffd83dbSDimitry Andric   case AMDGPUAS::LOCAL_ADDRESS:
2945ffd83dbSDimitry Andric     return ST.useDS128() ? 128 : 64;
2955ffd83dbSDimitry Andric   case AMDGPUAS::GLOBAL_ADDRESS:
2965ffd83dbSDimitry Andric   case AMDGPUAS::CONSTANT_ADDRESS:
2975ffd83dbSDimitry Andric   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
298*06c3fb27SDimitry Andric   case AMDGPUAS::BUFFER_RESOURCE:
2995ffd83dbSDimitry Andric     // Treat constant and global as identical. SMRD loads are sometimes usable for
3005ffd83dbSDimitry Andric     // global loads (ideally constant address space should be eliminated)
3015ffd83dbSDimitry Andric     // depending on the context. Legality cannot be context dependent, but
3025ffd83dbSDimitry Andric     // RegBankSelect can split the load as necessary depending on the pointer
3035ffd83dbSDimitry Andric     // register bank/uniformity and if the memory is invariant or not written in a
3045ffd83dbSDimitry Andric     // kernel.
3055ffd83dbSDimitry Andric     return IsLoad ? 512 : 128;
3065ffd83dbSDimitry Andric   default:
307*06c3fb27SDimitry Andric     // FIXME: Flat addresses may contextually need to be split to 32-bit parts
308*06c3fb27SDimitry Andric     // if they may alias scratch depending on the subtarget.  This needs to be
309*06c3fb27SDimitry Andric     // moved to custom handling to use addressMayBeAccessedAsPrivate
310*06c3fb27SDimitry Andric     return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
3115ffd83dbSDimitry Andric   }
3125ffd83dbSDimitry Andric }
3135ffd83dbSDimitry Andric 
3145ffd83dbSDimitry Andric static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
315fe6060f1SDimitry Andric                                  const LegalityQuery &Query) {
3165ffd83dbSDimitry Andric   const LLT Ty = Query.Types[0];
3175ffd83dbSDimitry Andric 
3185ffd83dbSDimitry Andric   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
319fe6060f1SDimitry Andric   const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
3205ffd83dbSDimitry Andric 
3215ffd83dbSDimitry Andric   unsigned RegSize = Ty.getSizeInBits();
32204eeddc0SDimitry Andric   uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
32304eeddc0SDimitry Andric   uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
3245ffd83dbSDimitry Andric   unsigned AS = Query.Types[1].getAddressSpace();
3255ffd83dbSDimitry Andric 
3265ffd83dbSDimitry Andric   // All of these need to be custom lowered to cast the pointer operand.
3275ffd83dbSDimitry Andric   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
3285ffd83dbSDimitry Andric     return false;
3295ffd83dbSDimitry Andric 
330fe6060f1SDimitry Andric   // Do not handle extending vector loads.
331fe6060f1SDimitry Andric   if (Ty.isVector() && MemSize != RegSize)
332fe6060f1SDimitry Andric     return false;
333fe6060f1SDimitry Andric 
3345ffd83dbSDimitry Andric   // TODO: We should be able to widen loads if the alignment is high enough, but
3355ffd83dbSDimitry Andric   // we also need to modify the memory access size.
3365ffd83dbSDimitry Andric #if 0
3375ffd83dbSDimitry Andric   // Accept widening loads based on alignment.
3385ffd83dbSDimitry Andric   if (IsLoad && MemSize < Size)
3395ffd83dbSDimitry Andric     MemSize = std::max(MemSize, Align);
3405ffd83dbSDimitry Andric #endif
3415ffd83dbSDimitry Andric 
3425ffd83dbSDimitry Andric   // Only 1-byte and 2-byte to 32-bit extloads are valid.
3435ffd83dbSDimitry Andric   if (MemSize != RegSize && RegSize != 32)
3445ffd83dbSDimitry Andric     return false;
3455ffd83dbSDimitry Andric 
346*06c3fb27SDimitry Andric   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
347*06c3fb27SDimitry Andric                                     Query.MMODescrs[0].Ordering !=
348*06c3fb27SDimitry Andric                                         AtomicOrdering::NotAtomic))
3495ffd83dbSDimitry Andric     return false;
3505ffd83dbSDimitry Andric 
3515ffd83dbSDimitry Andric   switch (MemSize) {
3525ffd83dbSDimitry Andric   case 8:
3535ffd83dbSDimitry Andric   case 16:
3545ffd83dbSDimitry Andric   case 32:
3555ffd83dbSDimitry Andric   case 64:
3565ffd83dbSDimitry Andric   case 128:
3575ffd83dbSDimitry Andric     break;
3585ffd83dbSDimitry Andric   case 96:
3595ffd83dbSDimitry Andric     if (!ST.hasDwordx3LoadStores())
3605ffd83dbSDimitry Andric       return false;
3615ffd83dbSDimitry Andric     break;
3625ffd83dbSDimitry Andric   case 256:
3635ffd83dbSDimitry Andric   case 512:
3645ffd83dbSDimitry Andric     // These may contextually need to be broken down.
3655ffd83dbSDimitry Andric     break;
3665ffd83dbSDimitry Andric   default:
3675ffd83dbSDimitry Andric     return false;
3685ffd83dbSDimitry Andric   }
3695ffd83dbSDimitry Andric 
3705ffd83dbSDimitry Andric   assert(RegSize >= MemSize);
3715ffd83dbSDimitry Andric 
372e8d8bef9SDimitry Andric   if (AlignBits < MemSize) {
3735ffd83dbSDimitry Andric     const SITargetLowering *TLI = ST.getTargetLowering();
374e8d8bef9SDimitry Andric     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
375e8d8bef9SDimitry Andric                                                  Align(AlignBits / 8)))
3765ffd83dbSDimitry Andric       return false;
3775ffd83dbSDimitry Andric   }
3785ffd83dbSDimitry Andric 
3795ffd83dbSDimitry Andric   return true;
3805ffd83dbSDimitry Andric }
3815ffd83dbSDimitry Andric 
382*06c3fb27SDimitry Andric // The newer buffer intrinsic forms take their resource arguments as
383*06c3fb27SDimitry Andric // pointers in address space 8, aka s128 values. However, in order to not break
384*06c3fb27SDimitry Andric // SelectionDAG, the underlying operations have to continue to take v4i32
385*06c3fb27SDimitry Andric // arguments. Therefore, we convert resource pointers - or vectors of them
386*06c3fb27SDimitry Andric // to integer values here.
387*06c3fb27SDimitry Andric static bool hasBufferRsrcWorkaround(const LLT Ty) {
388*06c3fb27SDimitry Andric   if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
389*06c3fb27SDimitry Andric     return true;
390*06c3fb27SDimitry Andric   if (Ty.isVector()) {
391*06c3fb27SDimitry Andric     const LLT ElemTy = Ty.getElementType();
392*06c3fb27SDimitry Andric     return hasBufferRsrcWorkaround(ElemTy);
393*06c3fb27SDimitry Andric   }
394*06c3fb27SDimitry Andric   return false;
395*06c3fb27SDimitry Andric }
396*06c3fb27SDimitry Andric 
3975ffd83dbSDimitry Andric // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
3985ffd83dbSDimitry Andric // workaround this. Eventually it should ignore the type for loads and only care
3995ffd83dbSDimitry Andric // about the size. Return true in cases where we will workaround this for now by
4005ffd83dbSDimitry Andric // bitcasting.
4015ffd83dbSDimitry Andric static bool loadStoreBitcastWorkaround(const LLT Ty) {
4025ffd83dbSDimitry Andric   if (EnableNewLegality)
4035ffd83dbSDimitry Andric     return false;
4045ffd83dbSDimitry Andric 
4055ffd83dbSDimitry Andric   const unsigned Size = Ty.getSizeInBits();
4065ffd83dbSDimitry Andric   if (Size <= 64)
4075ffd83dbSDimitry Andric     return false;
408*06c3fb27SDimitry Andric   // Address space 8 pointers get their own workaround.
409*06c3fb27SDimitry Andric   if (hasBufferRsrcWorkaround(Ty))
410*06c3fb27SDimitry Andric     return false;
4115ffd83dbSDimitry Andric   if (!Ty.isVector())
4125ffd83dbSDimitry Andric     return true;
413e8d8bef9SDimitry Andric 
414e8d8bef9SDimitry Andric   LLT EltTy = Ty.getElementType();
415e8d8bef9SDimitry Andric   if (EltTy.isPointer())
416e8d8bef9SDimitry Andric     return true;
417e8d8bef9SDimitry Andric 
418e8d8bef9SDimitry Andric   unsigned EltSize = EltTy.getSizeInBits();
4195ffd83dbSDimitry Andric   return EltSize != 32 && EltSize != 64;
4205ffd83dbSDimitry Andric }
4215ffd83dbSDimitry Andric 
422fe6060f1SDimitry Andric static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
4235ffd83dbSDimitry Andric   const LLT Ty = Query.Types[0];
424fe6060f1SDimitry Andric   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
425*06c3fb27SDimitry Andric          !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
4265ffd83dbSDimitry Andric }
4275ffd83dbSDimitry Andric 
428e8d8bef9SDimitry Andric /// Return true if a load or store of the type should be lowered with a bitcast
429e8d8bef9SDimitry Andric /// to a different type.
430e8d8bef9SDimitry Andric static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
431fe6060f1SDimitry Andric                                        const LLT MemTy) {
432fe6060f1SDimitry Andric   const unsigned MemSizeInBits = MemTy.getSizeInBits();
433e8d8bef9SDimitry Andric   const unsigned Size = Ty.getSizeInBits();
434e8d8bef9SDimitry Andric   if (Size != MemSizeInBits)
435e8d8bef9SDimitry Andric     return Size <= 32 && Ty.isVector();
436e8d8bef9SDimitry Andric 
437e8d8bef9SDimitry Andric   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
438e8d8bef9SDimitry Andric     return true;
439fe6060f1SDimitry Andric 
440fe6060f1SDimitry Andric   // Don't try to handle bitcasting vector ext loads for now.
441fe6060f1SDimitry Andric   return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
442fe6060f1SDimitry Andric          (Size <= 32 || isRegisterSize(Size)) &&
443e8d8bef9SDimitry Andric          !isRegisterVectorElementType(Ty.getElementType());
444e8d8bef9SDimitry Andric }
445e8d8bef9SDimitry Andric 
446e8d8bef9SDimitry Andric /// Return true if we should legalize a load by widening an odd sized memory
447e8d8bef9SDimitry Andric /// access up to the alignment. Note this case when the memory access itself
448e8d8bef9SDimitry Andric /// changes, not the size of the result register.
449fe6060f1SDimitry Andric static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
45004eeddc0SDimitry Andric                             uint64_t AlignInBits, unsigned AddrSpace,
451e8d8bef9SDimitry Andric                             unsigned Opcode) {
452fe6060f1SDimitry Andric   unsigned SizeInBits = MemoryTy.getSizeInBits();
453e8d8bef9SDimitry Andric   // We don't want to widen cases that are naturally legal.
454e8d8bef9SDimitry Andric   if (isPowerOf2_32(SizeInBits))
455e8d8bef9SDimitry Andric     return false;
456e8d8bef9SDimitry Andric 
457e8d8bef9SDimitry Andric   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
458e8d8bef9SDimitry Andric   // end up widening these for a scalar load during RegBankSelect, since there
459e8d8bef9SDimitry Andric   // aren't 96-bit scalar loads.
460e8d8bef9SDimitry Andric   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
461e8d8bef9SDimitry Andric     return false;
462e8d8bef9SDimitry Andric 
463*06c3fb27SDimitry Andric   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
464e8d8bef9SDimitry Andric     return false;
465e8d8bef9SDimitry Andric 
466e8d8bef9SDimitry Andric   // A load is known dereferenceable up to the alignment, so it's legal to widen
467e8d8bef9SDimitry Andric   // to it.
468e8d8bef9SDimitry Andric   //
469e8d8bef9SDimitry Andric   // TODO: Could check dereferenceable for less aligned cases.
470e8d8bef9SDimitry Andric   unsigned RoundedSize = NextPowerOf2(SizeInBits);
471e8d8bef9SDimitry Andric   if (AlignInBits < RoundedSize)
472e8d8bef9SDimitry Andric     return false;
473e8d8bef9SDimitry Andric 
474e8d8bef9SDimitry Andric   // Do not widen if it would introduce a slow unaligned load.
475e8d8bef9SDimitry Andric   const SITargetLowering *TLI = ST.getTargetLowering();
476bdd1243dSDimitry Andric   unsigned Fast = 0;
477e8d8bef9SDimitry Andric   return TLI->allowsMisalignedMemoryAccessesImpl(
478e8d8bef9SDimitry Andric              RoundedSize, AddrSpace, Align(AlignInBits / 8),
479e8d8bef9SDimitry Andric              MachineMemOperand::MOLoad, &Fast) &&
480e8d8bef9SDimitry Andric          Fast;
481e8d8bef9SDimitry Andric }
482e8d8bef9SDimitry Andric 
483e8d8bef9SDimitry Andric static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
484e8d8bef9SDimitry Andric                             unsigned Opcode) {
485e8d8bef9SDimitry Andric   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
486e8d8bef9SDimitry Andric     return false;
487e8d8bef9SDimitry Andric 
488fe6060f1SDimitry Andric   return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
489e8d8bef9SDimitry Andric                          Query.MMODescrs[0].AlignInBits,
490e8d8bef9SDimitry Andric                          Query.Types[1].getAddressSpace(), Opcode);
491e8d8bef9SDimitry Andric }
492e8d8bef9SDimitry Andric 
493*06c3fb27SDimitry Andric /// Mutates IR (typicaly a load instruction) to use a <4 x s32> as the initial
494*06c3fb27SDimitry Andric /// type of the operand `idx` and then to transform it to a `p8` via bitcasts
495*06c3fb27SDimitry Andric /// and inttoptr. In addition, handle vectors of p8. Returns the new type.
496*06c3fb27SDimitry Andric static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
497*06c3fb27SDimitry Andric                                    MachineRegisterInfo &MRI, unsigned Idx) {
498*06c3fb27SDimitry Andric   MachineOperand &MO = MI.getOperand(Idx);
499*06c3fb27SDimitry Andric 
500*06c3fb27SDimitry Andric   const LLT PointerTy = MRI.getType(MO.getReg());
501*06c3fb27SDimitry Andric 
502*06c3fb27SDimitry Andric   // Paranoidly prevent us from doing this multiple times.
503*06c3fb27SDimitry Andric   if (!hasBufferRsrcWorkaround(PointerTy))
504*06c3fb27SDimitry Andric     return PointerTy;
505*06c3fb27SDimitry Andric 
506*06c3fb27SDimitry Andric   const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
507*06c3fb27SDimitry Andric   const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
508*06c3fb27SDimitry Andric   if (!PointerTy.isVector()) {
509*06c3fb27SDimitry Andric     // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
510*06c3fb27SDimitry Andric     const unsigned NumParts = PointerTy.getSizeInBits() / 32;
511*06c3fb27SDimitry Andric     const LLT S32 = LLT::scalar(32);
512*06c3fb27SDimitry Andric 
513*06c3fb27SDimitry Andric     Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
514*06c3fb27SDimitry Andric     std::array<Register, 4> VectorElems;
515*06c3fb27SDimitry Andric     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
516*06c3fb27SDimitry Andric     for (unsigned I = 0; I < NumParts; ++I)
517*06c3fb27SDimitry Andric       VectorElems[I] =
518*06c3fb27SDimitry Andric           B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
519*06c3fb27SDimitry Andric     B.buildMergeValues(MO, VectorElems);
520*06c3fb27SDimitry Andric     MO.setReg(VectorReg);
521*06c3fb27SDimitry Andric     return VectorTy;
522*06c3fb27SDimitry Andric   }
523*06c3fb27SDimitry Andric   Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
524*06c3fb27SDimitry Andric   B.setInsertPt(B.getMBB(), ++B.getInsertPt());
525*06c3fb27SDimitry Andric   auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
526*06c3fb27SDimitry Andric   B.buildIntToPtr(MO, Scalar);
527*06c3fb27SDimitry Andric   MO.setReg(BitcastReg);
528*06c3fb27SDimitry Andric 
529*06c3fb27SDimitry Andric   return VectorTy;
530*06c3fb27SDimitry Andric }
531*06c3fb27SDimitry Andric 
532*06c3fb27SDimitry Andric /// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
533*06c3fb27SDimitry Andric /// the form in which the value must be in order to be passed to the low-level
534*06c3fb27SDimitry Andric /// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
535*06c3fb27SDimitry Andric /// needed in order to account for the fact that we can't define a register
536*06c3fb27SDimitry Andric /// class for s128 without breaking SelectionDAG.
537*06c3fb27SDimitry Andric static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) {
538*06c3fb27SDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
539*06c3fb27SDimitry Andric   const LLT PointerTy = MRI.getType(Pointer);
540*06c3fb27SDimitry Andric   const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
541*06c3fb27SDimitry Andric   const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
542*06c3fb27SDimitry Andric 
543*06c3fb27SDimitry Andric   if (!PointerTy.isVector()) {
544*06c3fb27SDimitry Andric     // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
545*06c3fb27SDimitry Andric     SmallVector<Register, 4> PointerParts;
546*06c3fb27SDimitry Andric     const unsigned NumParts = PointerTy.getSizeInBits() / 32;
547*06c3fb27SDimitry Andric     auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
548*06c3fb27SDimitry Andric     for (unsigned I = 0; I < NumParts; ++I)
549*06c3fb27SDimitry Andric       PointerParts.push_back(Unmerged.getReg(I));
550*06c3fb27SDimitry Andric     return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
551*06c3fb27SDimitry Andric   }
552*06c3fb27SDimitry Andric   Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
553*06c3fb27SDimitry Andric   return B.buildBitcast(VectorTy, Scalar).getReg(0);
554*06c3fb27SDimitry Andric }
555*06c3fb27SDimitry Andric 
556*06c3fb27SDimitry Andric static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B,
557*06c3fb27SDimitry Andric                                      unsigned Idx) {
558*06c3fb27SDimitry Andric   MachineOperand &MO = MI.getOperand(Idx);
559*06c3fb27SDimitry Andric 
560*06c3fb27SDimitry Andric   const LLT PointerTy = B.getMRI()->getType(MO.getReg());
561*06c3fb27SDimitry Andric   // Paranoidly prevent us from doing this multiple times.
562*06c3fb27SDimitry Andric   if (!hasBufferRsrcWorkaround(PointerTy))
563*06c3fb27SDimitry Andric     return;
564*06c3fb27SDimitry Andric   MO.setReg(castBufferRsrcToV4I32(MO.getReg(), B));
565*06c3fb27SDimitry Andric }
566*06c3fb27SDimitry Andric 
5670b57cec5SDimitry Andric AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
5680b57cec5SDimitry Andric                                          const GCNTargetMachine &TM)
5690b57cec5SDimitry Andric   :  ST(ST_) {
5700b57cec5SDimitry Andric   using namespace TargetOpcode;
5710b57cec5SDimitry Andric 
5720b57cec5SDimitry Andric   auto GetAddrSpacePtr = [&TM](unsigned AS) {
5730b57cec5SDimitry Andric     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
5740b57cec5SDimitry Andric   };
5750b57cec5SDimitry Andric 
5760b57cec5SDimitry Andric   const LLT S1 = LLT::scalar(1);
577e8d8bef9SDimitry Andric   const LLT S8 = LLT::scalar(8);
5780b57cec5SDimitry Andric   const LLT S16 = LLT::scalar(16);
5790b57cec5SDimitry Andric   const LLT S32 = LLT::scalar(32);
5800b57cec5SDimitry Andric   const LLT S64 = LLT::scalar(64);
5810b57cec5SDimitry Andric   const LLT S128 = LLT::scalar(128);
5820b57cec5SDimitry Andric   const LLT S256 = LLT::scalar(256);
5835ffd83dbSDimitry Andric   const LLT S512 = LLT::scalar(512);
5845ffd83dbSDimitry Andric   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
5850b57cec5SDimitry Andric 
586fe6060f1SDimitry Andric   const LLT V2S8 = LLT::fixed_vector(2, 8);
587fe6060f1SDimitry Andric   const LLT V2S16 = LLT::fixed_vector(2, 16);
588fe6060f1SDimitry Andric   const LLT V4S16 = LLT::fixed_vector(4, 16);
5890b57cec5SDimitry Andric 
590fe6060f1SDimitry Andric   const LLT V2S32 = LLT::fixed_vector(2, 32);
591fe6060f1SDimitry Andric   const LLT V3S32 = LLT::fixed_vector(3, 32);
592fe6060f1SDimitry Andric   const LLT V4S32 = LLT::fixed_vector(4, 32);
593fe6060f1SDimitry Andric   const LLT V5S32 = LLT::fixed_vector(5, 32);
594fe6060f1SDimitry Andric   const LLT V6S32 = LLT::fixed_vector(6, 32);
595fe6060f1SDimitry Andric   const LLT V7S32 = LLT::fixed_vector(7, 32);
596fe6060f1SDimitry Andric   const LLT V8S32 = LLT::fixed_vector(8, 32);
597fe6060f1SDimitry Andric   const LLT V9S32 = LLT::fixed_vector(9, 32);
598fe6060f1SDimitry Andric   const LLT V10S32 = LLT::fixed_vector(10, 32);
599fe6060f1SDimitry Andric   const LLT V11S32 = LLT::fixed_vector(11, 32);
600fe6060f1SDimitry Andric   const LLT V12S32 = LLT::fixed_vector(12, 32);
601fe6060f1SDimitry Andric   const LLT V13S32 = LLT::fixed_vector(13, 32);
602fe6060f1SDimitry Andric   const LLT V14S32 = LLT::fixed_vector(14, 32);
603fe6060f1SDimitry Andric   const LLT V15S32 = LLT::fixed_vector(15, 32);
604fe6060f1SDimitry Andric   const LLT V16S32 = LLT::fixed_vector(16, 32);
605fe6060f1SDimitry Andric   const LLT V32S32 = LLT::fixed_vector(32, 32);
6060b57cec5SDimitry Andric 
607fe6060f1SDimitry Andric   const LLT V2S64 = LLT::fixed_vector(2, 64);
608fe6060f1SDimitry Andric   const LLT V3S64 = LLT::fixed_vector(3, 64);
609fe6060f1SDimitry Andric   const LLT V4S64 = LLT::fixed_vector(4, 64);
610fe6060f1SDimitry Andric   const LLT V5S64 = LLT::fixed_vector(5, 64);
611fe6060f1SDimitry Andric   const LLT V6S64 = LLT::fixed_vector(6, 64);
612fe6060f1SDimitry Andric   const LLT V7S64 = LLT::fixed_vector(7, 64);
613fe6060f1SDimitry Andric   const LLT V8S64 = LLT::fixed_vector(8, 64);
614fe6060f1SDimitry Andric   const LLT V16S64 = LLT::fixed_vector(16, 64);
6150b57cec5SDimitry Andric 
6160b57cec5SDimitry Andric   std::initializer_list<LLT> AllS32Vectors =
6170b57cec5SDimitry Andric     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
6188bcb0991SDimitry Andric      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
6190b57cec5SDimitry Andric   std::initializer_list<LLT> AllS64Vectors =
6208bcb0991SDimitry Andric     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
6210b57cec5SDimitry Andric 
6220b57cec5SDimitry Andric   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
6230b57cec5SDimitry Andric   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
6248bcb0991SDimitry Andric   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
6250b57cec5SDimitry Andric   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
6268bcb0991SDimitry Andric   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
6270b57cec5SDimitry Andric   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
6280b57cec5SDimitry Andric   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
629*06c3fb27SDimitry Andric   const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
630*06c3fb27SDimitry Andric   const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
6310b57cec5SDimitry Andric 
6320b57cec5SDimitry Andric   const LLT CodePtr = FlatPtr;
6330b57cec5SDimitry Andric 
6340b57cec5SDimitry Andric   const std::initializer_list<LLT> AddrSpaces64 = {
6350b57cec5SDimitry Andric     GlobalPtr, ConstantPtr, FlatPtr
6360b57cec5SDimitry Andric   };
6370b57cec5SDimitry Andric 
6380b57cec5SDimitry Andric   const std::initializer_list<LLT> AddrSpaces32 = {
6398bcb0991SDimitry Andric     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
6400b57cec5SDimitry Andric   };
6410b57cec5SDimitry Andric 
642*06c3fb27SDimitry Andric   const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
643*06c3fb27SDimitry Andric 
6440b57cec5SDimitry Andric   const std::initializer_list<LLT> FPTypesBase = {
6450b57cec5SDimitry Andric     S32, S64
6460b57cec5SDimitry Andric   };
6470b57cec5SDimitry Andric 
6480b57cec5SDimitry Andric   const std::initializer_list<LLT> FPTypes16 = {
6490b57cec5SDimitry Andric     S32, S64, S16
6500b57cec5SDimitry Andric   };
6510b57cec5SDimitry Andric 
6520b57cec5SDimitry Andric   const std::initializer_list<LLT> FPTypesPK16 = {
6530b57cec5SDimitry Andric     S32, S64, S16, V2S16
6540b57cec5SDimitry Andric   };
6550b57cec5SDimitry Andric 
6565ffd83dbSDimitry Andric   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
6575ffd83dbSDimitry Andric 
658fe6060f1SDimitry Andric   // s1 for VCC branches, s32 for SCC branches.
659fe6060f1SDimitry Andric   getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
6600b57cec5SDimitry Andric 
6610b57cec5SDimitry Andric   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
6620b57cec5SDimitry Andric   // elements for v3s16
6630b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_PHI)
664e8d8bef9SDimitry Andric       .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
6650b57cec5SDimitry Andric       .legalFor(AllS32Vectors)
6660b57cec5SDimitry Andric       .legalFor(AllS64Vectors)
6670b57cec5SDimitry Andric       .legalFor(AddrSpaces64)
6680b57cec5SDimitry Andric       .legalFor(AddrSpaces32)
669*06c3fb27SDimitry Andric       .legalFor(AddrSpaces128)
670e8d8bef9SDimitry Andric       .legalIf(isPointer(0))
671e8d8bef9SDimitry Andric       .clampScalar(0, S16, S256)
6720b57cec5SDimitry Andric       .widenScalarToNextPow2(0, 32)
6730b57cec5SDimitry Andric       .clampMaxNumElements(0, S32, 16)
6740b57cec5SDimitry Andric       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
675e8d8bef9SDimitry Andric       .scalarize(0);
6760b57cec5SDimitry Andric 
677e8d8bef9SDimitry Andric   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
678e8d8bef9SDimitry Andric     // Full set of gfx9 features.
67981ad6265SDimitry Andric     getActionDefinitionsBuilder({G_ADD, G_SUB})
6805ffd83dbSDimitry Andric       .legalFor({S32, S16, V2S16})
6810eae32dcSDimitry Andric       .clampMaxNumElementsStrict(0, S16, 2)
68281ad6265SDimitry Andric       .scalarize(0)
68381ad6265SDimitry Andric       .minScalar(0, S16)
684349cc55cSDimitry Andric       .widenScalarToNextMultipleOf(0, 32)
68581ad6265SDimitry Andric       .maxScalar(0, S32);
68681ad6265SDimitry Andric 
68781ad6265SDimitry Andric     getActionDefinitionsBuilder(G_MUL)
68881ad6265SDimitry Andric       .legalFor({S32, S16, V2S16})
68981ad6265SDimitry Andric       .clampMaxNumElementsStrict(0, S16, 2)
69081ad6265SDimitry Andric       .scalarize(0)
69181ad6265SDimitry Andric       .minScalar(0, S16)
69281ad6265SDimitry Andric       .widenScalarToNextMultipleOf(0, 32)
69381ad6265SDimitry Andric       .custom();
69481ad6265SDimitry Andric     assert(ST.hasMad64_32());
695e8d8bef9SDimitry Andric 
696e8d8bef9SDimitry Andric     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
697e8d8bef9SDimitry Andric       .legalFor({S32, S16, V2S16}) // Clamp modifier
698e8d8bef9SDimitry Andric       .minScalarOrElt(0, S16)
6990eae32dcSDimitry Andric       .clampMaxNumElementsStrict(0, S16, 2)
700e8d8bef9SDimitry Andric       .scalarize(0)
701e8d8bef9SDimitry Andric       .widenScalarToNextPow2(0, 32)
702e8d8bef9SDimitry Andric       .lower();
7035ffd83dbSDimitry Andric   } else if (ST.has16BitInsts()) {
70481ad6265SDimitry Andric     getActionDefinitionsBuilder({G_ADD, G_SUB})
7050b57cec5SDimitry Andric       .legalFor({S32, S16})
706349cc55cSDimitry Andric       .minScalar(0, S16)
707349cc55cSDimitry Andric       .widenScalarToNextMultipleOf(0, 32)
708349cc55cSDimitry Andric       .maxScalar(0, S32)
709349cc55cSDimitry Andric       .scalarize(0);
710e8d8bef9SDimitry Andric 
71181ad6265SDimitry Andric     getActionDefinitionsBuilder(G_MUL)
71281ad6265SDimitry Andric       .legalFor({S32, S16})
71381ad6265SDimitry Andric       .scalarize(0)
71481ad6265SDimitry Andric       .minScalar(0, S16)
71581ad6265SDimitry Andric       .widenScalarToNextMultipleOf(0, 32)
71681ad6265SDimitry Andric       .custom();
71781ad6265SDimitry Andric     assert(ST.hasMad64_32());
71881ad6265SDimitry Andric 
719e8d8bef9SDimitry Andric     // Technically the saturating operations require clamp bit support, but this
720e8d8bef9SDimitry Andric     // was introduced at the same time as 16-bit operations.
721e8d8bef9SDimitry Andric     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
722e8d8bef9SDimitry Andric       .legalFor({S32, S16}) // Clamp modifier
723e8d8bef9SDimitry Andric       .minScalar(0, S16)
724e8d8bef9SDimitry Andric       .scalarize(0)
725e8d8bef9SDimitry Andric       .widenScalarToNextPow2(0, 16)
726e8d8bef9SDimitry Andric       .lower();
727e8d8bef9SDimitry Andric 
728e8d8bef9SDimitry Andric     // We're just lowering this, but it helps get a better result to try to
729e8d8bef9SDimitry Andric     // coerce to the desired type first.
730e8d8bef9SDimitry Andric     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
731e8d8bef9SDimitry Andric       .minScalar(0, S16)
732e8d8bef9SDimitry Andric       .scalarize(0)
733e8d8bef9SDimitry Andric       .lower();
7340b57cec5SDimitry Andric   } else {
73581ad6265SDimitry Andric     getActionDefinitionsBuilder({G_ADD, G_SUB})
7360b57cec5SDimitry Andric       .legalFor({S32})
737349cc55cSDimitry Andric       .widenScalarToNextMultipleOf(0, 32)
7380b57cec5SDimitry Andric       .clampScalar(0, S32, S32)
7390b57cec5SDimitry Andric       .scalarize(0);
740e8d8bef9SDimitry Andric 
74181ad6265SDimitry Andric     auto &Mul = getActionDefinitionsBuilder(G_MUL)
74281ad6265SDimitry Andric       .legalFor({S32})
74381ad6265SDimitry Andric       .scalarize(0)
74481ad6265SDimitry Andric       .minScalar(0, S32)
74581ad6265SDimitry Andric       .widenScalarToNextMultipleOf(0, 32);
74681ad6265SDimitry Andric 
74781ad6265SDimitry Andric     if (ST.hasMad64_32())
74881ad6265SDimitry Andric       Mul.custom();
74981ad6265SDimitry Andric     else
75081ad6265SDimitry Andric       Mul.maxScalar(0, S32);
75181ad6265SDimitry Andric 
752e8d8bef9SDimitry Andric     if (ST.hasIntClamp()) {
753e8d8bef9SDimitry Andric       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
754e8d8bef9SDimitry Andric         .legalFor({S32}) // Clamp modifier.
755e8d8bef9SDimitry Andric         .scalarize(0)
756e8d8bef9SDimitry Andric         .minScalarOrElt(0, S32)
757e8d8bef9SDimitry Andric         .lower();
758e8d8bef9SDimitry Andric     } else {
759e8d8bef9SDimitry Andric       // Clamp bit support was added in VI, along with 16-bit operations.
760e8d8bef9SDimitry Andric       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
761e8d8bef9SDimitry Andric         .minScalar(0, S32)
762e8d8bef9SDimitry Andric         .scalarize(0)
763e8d8bef9SDimitry Andric         .lower();
7640b57cec5SDimitry Andric     }
7650b57cec5SDimitry Andric 
766e8d8bef9SDimitry Andric     // FIXME: DAG expansion gets better results. The widening uses the smaller
767e8d8bef9SDimitry Andric     // range values and goes for the min/max lowering directly.
768e8d8bef9SDimitry Andric     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
769e8d8bef9SDimitry Andric       .minScalar(0, S32)
770e8d8bef9SDimitry Andric       .scalarize(0)
771e8d8bef9SDimitry Andric       .lower();
772e8d8bef9SDimitry Andric   }
773e8d8bef9SDimitry Andric 
774fe6060f1SDimitry Andric   getActionDefinitionsBuilder(
775fe6060f1SDimitry Andric       {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
7765ffd83dbSDimitry Andric       .customFor({S32, S64})
777480093f4SDimitry Andric       .clampScalar(0, S32, S64)
778480093f4SDimitry Andric       .widenScalarToNextPow2(0, 32)
779480093f4SDimitry Andric       .scalarize(0);
780480093f4SDimitry Andric 
781e8d8bef9SDimitry Andric   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
7820b57cec5SDimitry Andric                    .legalFor({S32})
783349cc55cSDimitry Andric                    .maxScalar(0, S32);
784e8d8bef9SDimitry Andric 
785e8d8bef9SDimitry Andric   if (ST.hasVOP3PInsts()) {
786e8d8bef9SDimitry Andric     Mulh
787e8d8bef9SDimitry Andric       .clampMaxNumElements(0, S8, 2)
788e8d8bef9SDimitry Andric       .lowerFor({V2S8});
789e8d8bef9SDimitry Andric   }
790e8d8bef9SDimitry Andric 
791e8d8bef9SDimitry Andric   Mulh
792e8d8bef9SDimitry Andric     .scalarize(0)
793e8d8bef9SDimitry Andric     .lower();
7940b57cec5SDimitry Andric 
7950b57cec5SDimitry Andric   // Report legal for any types we can handle anywhere. For the cases only legal
7960b57cec5SDimitry Andric   // on the SALU, RegBankSelect will be able to re-legalize.
7970b57cec5SDimitry Andric   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
7980b57cec5SDimitry Andric     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
7990b57cec5SDimitry Andric     .clampScalar(0, S32, S64)
8000b57cec5SDimitry Andric     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
8018bcb0991SDimitry Andric     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
8020b57cec5SDimitry Andric     .widenScalarToNextPow2(0)
8030b57cec5SDimitry Andric     .scalarize(0);
8040b57cec5SDimitry Andric 
805bdd1243dSDimitry Andric   getActionDefinitionsBuilder(
806bdd1243dSDimitry Andric       {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
807480093f4SDimitry Andric       .legalFor({{S32, S1}, {S32, S32}})
808bdd1243dSDimitry Andric       .clampScalar(0, S32, S32)
809bdd1243dSDimitry Andric       .scalarize(0);
8100b57cec5SDimitry Andric 
8110b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_BITCAST)
8120b57cec5SDimitry Andric     // Don't worry about the size constraint.
8138bcb0991SDimitry Andric     .legalIf(all(isRegisterType(0), isRegisterType(1)))
8145ffd83dbSDimitry Andric     .lower();
8150b57cec5SDimitry Andric 
8160b57cec5SDimitry Andric 
8170b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_CONSTANT)
8188bcb0991SDimitry Andric     .legalFor({S1, S32, S64, S16, GlobalPtr,
8190b57cec5SDimitry Andric                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
820e8d8bef9SDimitry Andric     .legalIf(isPointer(0))
8210b57cec5SDimitry Andric     .clampScalar(0, S32, S64)
822e8d8bef9SDimitry Andric     .widenScalarToNextPow2(0);
8230b57cec5SDimitry Andric 
8245ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_FCONSTANT)
8255ffd83dbSDimitry Andric     .legalFor({S32, S64, S16})
8265ffd83dbSDimitry Andric     .clampScalar(0, S16, S64);
8278bcb0991SDimitry Andric 
8285ffd83dbSDimitry Andric   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
8295ffd83dbSDimitry Andric       .legalIf(isRegisterType(0))
8305ffd83dbSDimitry Andric       // s1 and s16 are special cases because they have legal operations on
8315ffd83dbSDimitry Andric       // them, but don't really occupy registers in the normal way.
8325ffd83dbSDimitry Andric       .legalFor({S1, S16})
8335ffd83dbSDimitry Andric       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
8345ffd83dbSDimitry Andric       .clampScalarOrElt(0, S32, MaxScalar)
8355ffd83dbSDimitry Andric       .widenScalarToNextPow2(0, 32)
8365ffd83dbSDimitry Andric       .clampMaxNumElements(0, S32, 16);
8375ffd83dbSDimitry Andric 
838fe6060f1SDimitry Andric   getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
8395ffd83dbSDimitry Andric 
8405ffd83dbSDimitry Andric   // If the amount is divergent, we have to do a wave reduction to get the
8415ffd83dbSDimitry Andric   // maximum value, so this is expanded during RegBankSelect.
8425ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
8435ffd83dbSDimitry Andric     .legalFor({{PrivatePtr, S32}});
8445ffd83dbSDimitry Andric 
8455ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
846e8d8bef9SDimitry Andric     .customIf(typeIsNot(0, PrivatePtr));
847e8d8bef9SDimitry Andric 
848fe6060f1SDimitry Andric   getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
8490b57cec5SDimitry Andric 
8500b57cec5SDimitry Andric   auto &FPOpActions = getActionDefinitionsBuilder(
851bdd1243dSDimitry Andric     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
852bdd1243dSDimitry Andric       G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
8530b57cec5SDimitry Andric     .legalFor({S32, S64});
8548bcb0991SDimitry Andric   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
8558bcb0991SDimitry Andric     .customFor({S32, S64});
8568bcb0991SDimitry Andric   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
8578bcb0991SDimitry Andric     .customFor({S32, S64});
8580b57cec5SDimitry Andric 
8590b57cec5SDimitry Andric   if (ST.has16BitInsts()) {
8600b57cec5SDimitry Andric     if (ST.hasVOP3PInsts())
8610b57cec5SDimitry Andric       FPOpActions.legalFor({S16, V2S16});
8620b57cec5SDimitry Andric     else
8630b57cec5SDimitry Andric       FPOpActions.legalFor({S16});
8648bcb0991SDimitry Andric 
8658bcb0991SDimitry Andric     TrigActions.customFor({S16});
8668bcb0991SDimitry Andric     FDIVActions.customFor({S16});
8670b57cec5SDimitry Andric   }
8680b57cec5SDimitry Andric 
8690b57cec5SDimitry Andric   auto &MinNumMaxNum = getActionDefinitionsBuilder({
8700b57cec5SDimitry Andric       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
8710b57cec5SDimitry Andric 
8720b57cec5SDimitry Andric   if (ST.hasVOP3PInsts()) {
8730b57cec5SDimitry Andric     MinNumMaxNum.customFor(FPTypesPK16)
874480093f4SDimitry Andric       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
8750b57cec5SDimitry Andric       .clampMaxNumElements(0, S16, 2)
8760b57cec5SDimitry Andric       .clampScalar(0, S16, S64)
8770b57cec5SDimitry Andric       .scalarize(0);
8780b57cec5SDimitry Andric   } else if (ST.has16BitInsts()) {
8790b57cec5SDimitry Andric     MinNumMaxNum.customFor(FPTypes16)
8800b57cec5SDimitry Andric       .clampScalar(0, S16, S64)
8810b57cec5SDimitry Andric       .scalarize(0);
8820b57cec5SDimitry Andric   } else {
8830b57cec5SDimitry Andric     MinNumMaxNum.customFor(FPTypesBase)
8840b57cec5SDimitry Andric       .clampScalar(0, S32, S64)
8850b57cec5SDimitry Andric       .scalarize(0);
8860b57cec5SDimitry Andric   }
8870b57cec5SDimitry Andric 
8880b57cec5SDimitry Andric   if (ST.hasVOP3PInsts())
8890eae32dcSDimitry Andric     FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
8908bcb0991SDimitry Andric 
8910b57cec5SDimitry Andric   FPOpActions
8920b57cec5SDimitry Andric     .scalarize(0)
8930b57cec5SDimitry Andric     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
8940b57cec5SDimitry Andric 
8958bcb0991SDimitry Andric   TrigActions
8968bcb0991SDimitry Andric     .scalarize(0)
8978bcb0991SDimitry Andric     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
8988bcb0991SDimitry Andric 
8998bcb0991SDimitry Andric   FDIVActions
9008bcb0991SDimitry Andric     .scalarize(0)
9018bcb0991SDimitry Andric     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
9028bcb0991SDimitry Andric 
9038bcb0991SDimitry Andric   getActionDefinitionsBuilder({G_FNEG, G_FABS})
9048bcb0991SDimitry Andric     .legalFor(FPTypesPK16)
9050eae32dcSDimitry Andric     .clampMaxNumElementsStrict(0, S16, 2)
9068bcb0991SDimitry Andric     .scalarize(0)
9078bcb0991SDimitry Andric     .clampScalar(0, S16, S64);
9088bcb0991SDimitry Andric 
9090b57cec5SDimitry Andric   if (ST.has16BitInsts()) {
910*06c3fb27SDimitry Andric     getActionDefinitionsBuilder(G_FSQRT)
911*06c3fb27SDimitry Andric       .legalFor({S32, S16})
912*06c3fb27SDimitry Andric       .customFor({S64})
913*06c3fb27SDimitry Andric       .scalarize(0)
914*06c3fb27SDimitry Andric       .clampScalar(0, S16, S64);
915*06c3fb27SDimitry Andric     getActionDefinitionsBuilder(G_FFLOOR)
9160b57cec5SDimitry Andric       .legalFor({S32, S64, S16})
9170b57cec5SDimitry Andric       .scalarize(0)
9180b57cec5SDimitry Andric       .clampScalar(0, S16, S64);
919*06c3fb27SDimitry Andric 
920*06c3fb27SDimitry Andric     getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
921*06c3fb27SDimitry Andric       .legalFor({{S32, S32}, {S64, S32}, {S16, S16}})
922*06c3fb27SDimitry Andric       .scalarize(0)
923*06c3fb27SDimitry Andric       .maxScalarIf(typeIs(0, S16), 1, S16)
924*06c3fb27SDimitry Andric       .clampScalar(1, S32, S32)
925*06c3fb27SDimitry Andric       .lower();
926*06c3fb27SDimitry Andric 
927*06c3fb27SDimitry Andric     getActionDefinitionsBuilder(G_FFREXP)
928*06c3fb27SDimitry Andric       .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
929*06c3fb27SDimitry Andric       .scalarize(0)
930*06c3fb27SDimitry Andric       .lower();
9310b57cec5SDimitry Andric   } else {
9325ffd83dbSDimitry Andric     getActionDefinitionsBuilder(G_FSQRT)
933*06c3fb27SDimitry Andric       .legalFor({S32})
934*06c3fb27SDimitry Andric       .customFor({S64})
9355ffd83dbSDimitry Andric       .scalarize(0)
9365ffd83dbSDimitry Andric       .clampScalar(0, S32, S64);
9375ffd83dbSDimitry Andric 
9385ffd83dbSDimitry Andric     if (ST.hasFractBug()) {
9395ffd83dbSDimitry Andric       getActionDefinitionsBuilder(G_FFLOOR)
9405ffd83dbSDimitry Andric         .customFor({S64})
9415ffd83dbSDimitry Andric         .legalFor({S32, S64})
9425ffd83dbSDimitry Andric         .scalarize(0)
9435ffd83dbSDimitry Andric         .clampScalar(0, S32, S64);
9445ffd83dbSDimitry Andric     } else {
9455ffd83dbSDimitry Andric       getActionDefinitionsBuilder(G_FFLOOR)
9460b57cec5SDimitry Andric         .legalFor({S32, S64})
9470b57cec5SDimitry Andric         .scalarize(0)
9480b57cec5SDimitry Andric         .clampScalar(0, S32, S64);
9490b57cec5SDimitry Andric     }
950*06c3fb27SDimitry Andric 
951*06c3fb27SDimitry Andric     getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
952*06c3fb27SDimitry Andric       .legalFor({{S32, S32}, {S64, S32}})
953*06c3fb27SDimitry Andric       .scalarize(0)
954*06c3fb27SDimitry Andric       .clampScalar(0, S32, S64)
955*06c3fb27SDimitry Andric       .clampScalar(1, S32, S32)
956*06c3fb27SDimitry Andric       .lower();
957*06c3fb27SDimitry Andric 
958*06c3fb27SDimitry Andric     getActionDefinitionsBuilder(G_FFREXP)
959*06c3fb27SDimitry Andric       .customFor({{S32, S32}, {S64, S32}})
960*06c3fb27SDimitry Andric       .scalarize(0)
961*06c3fb27SDimitry Andric       .minScalar(0, S32)
962*06c3fb27SDimitry Andric       .clampScalar(1, S32, S32)
963*06c3fb27SDimitry Andric       .lower();
9645ffd83dbSDimitry Andric   }
9650b57cec5SDimitry Andric 
9660b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_FPTRUNC)
9670b57cec5SDimitry Andric     .legalFor({{S32, S64}, {S16, S32}})
9685ffd83dbSDimitry Andric     .scalarize(0)
9695ffd83dbSDimitry Andric     .lower();
9700b57cec5SDimitry Andric 
9710b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_FPEXT)
9720b57cec5SDimitry Andric     .legalFor({{S64, S32}, {S32, S16}})
973e8d8bef9SDimitry Andric     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
9740b57cec5SDimitry Andric     .scalarize(0);
9750b57cec5SDimitry Andric 
976bdd1243dSDimitry Andric   auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
97781ad6265SDimitry Andric   if (ST.has16BitInsts()) {
97881ad6265SDimitry Andric     FSubActions
97981ad6265SDimitry Andric       // Use actual fsub instruction
98081ad6265SDimitry Andric       .legalFor({S32, S16})
98181ad6265SDimitry Andric       // Must use fadd + fneg
98281ad6265SDimitry Andric       .lowerFor({S64, V2S16});
98381ad6265SDimitry Andric   } else {
98481ad6265SDimitry Andric     FSubActions
9850b57cec5SDimitry Andric       // Use actual fsub instruction
9860b57cec5SDimitry Andric       .legalFor({S32})
9870b57cec5SDimitry Andric       // Must use fadd + fneg
98881ad6265SDimitry Andric       .lowerFor({S64, S16, V2S16});
98981ad6265SDimitry Andric   }
99081ad6265SDimitry Andric 
99181ad6265SDimitry Andric   FSubActions
9920b57cec5SDimitry Andric     .scalarize(0)
9930b57cec5SDimitry Andric     .clampScalar(0, S32, S64);
9940b57cec5SDimitry Andric 
9958bcb0991SDimitry Andric   // Whether this is legal depends on the floating point mode for the function.
9968bcb0991SDimitry Andric   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
9975ffd83dbSDimitry Andric   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
9988bcb0991SDimitry Andric     FMad.customFor({S32, S16});
9995ffd83dbSDimitry Andric   else if (ST.hasMadMacF32Insts())
10008bcb0991SDimitry Andric     FMad.customFor({S32});
10015ffd83dbSDimitry Andric   else if (ST.hasMadF16())
10025ffd83dbSDimitry Andric     FMad.customFor({S16});
10038bcb0991SDimitry Andric   FMad.scalarize(0)
10048bcb0991SDimitry Andric       .lower();
10058bcb0991SDimitry Andric 
1006e8d8bef9SDimitry Andric   auto &FRem = getActionDefinitionsBuilder(G_FREM);
1007e8d8bef9SDimitry Andric   if (ST.has16BitInsts()) {
1008e8d8bef9SDimitry Andric     FRem.customFor({S16, S32, S64});
1009e8d8bef9SDimitry Andric   } else {
1010e8d8bef9SDimitry Andric     FRem.minScalar(0, S32)
1011e8d8bef9SDimitry Andric         .customFor({S32, S64});
1012e8d8bef9SDimitry Andric   }
1013e8d8bef9SDimitry Andric   FRem.scalarize(0);
1014e8d8bef9SDimitry Andric 
10155ffd83dbSDimitry Andric   // TODO: Do we need to clamp maximum bitwidth?
10165ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_TRUNC)
10175ffd83dbSDimitry Andric     .legalIf(isScalar(0))
10185ffd83dbSDimitry Andric     .legalFor({{V2S16, V2S32}})
10195ffd83dbSDimitry Andric     .clampMaxNumElements(0, S16, 2)
10205ffd83dbSDimitry Andric     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
10215ffd83dbSDimitry Andric     // situations (like an invalid implicit use), we don't want to infinite loop
10225ffd83dbSDimitry Andric     // in the legalizer.
10235ffd83dbSDimitry Andric     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
10245ffd83dbSDimitry Andric     .alwaysLegal();
10255ffd83dbSDimitry Andric 
10260b57cec5SDimitry Andric   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
10270b57cec5SDimitry Andric     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
10285ffd83dbSDimitry Andric                {S32, S1}, {S64, S1}, {S16, S1}})
1029480093f4SDimitry Andric     .scalarize(0)
10305ffd83dbSDimitry Andric     .clampScalar(0, S32, S64)
10315ffd83dbSDimitry Andric     .widenScalarToNextPow2(1, 32);
10320b57cec5SDimitry Andric 
10338bcb0991SDimitry Andric   // TODO: Split s1->s64 during regbankselect for VALU.
10348bcb0991SDimitry Andric   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
1035480093f4SDimitry Andric                     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
1036480093f4SDimitry Andric                     .lowerIf(typeIs(1, S1))
1037349cc55cSDimitry Andric                     .customFor({{S32, S64}, {S64, S64}});
10388bcb0991SDimitry Andric   if (ST.has16BitInsts())
10398bcb0991SDimitry Andric     IToFP.legalFor({{S16, S16}});
10408bcb0991SDimitry Andric   IToFP.clampScalar(1, S32, S64)
1041e8d8bef9SDimitry Andric        .minScalar(0, S32)
10425ffd83dbSDimitry Andric        .scalarize(0)
10435ffd83dbSDimitry Andric        .widenScalarToNextPow2(1);
10440b57cec5SDimitry Andric 
10458bcb0991SDimitry Andric   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
10465ffd83dbSDimitry Andric     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
1047fe6060f1SDimitry Andric     .customFor({{S64, S32}, {S64, S64}})
1048e8d8bef9SDimitry Andric     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
10498bcb0991SDimitry Andric   if (ST.has16BitInsts())
10508bcb0991SDimitry Andric     FPToI.legalFor({{S16, S16}});
10518bcb0991SDimitry Andric   else
10528bcb0991SDimitry Andric     FPToI.minScalar(1, S32);
10538bcb0991SDimitry Andric 
10548bcb0991SDimitry Andric   FPToI.minScalar(0, S32)
1055fe6060f1SDimitry Andric        .widenScalarToNextPow2(0, 32)
10565ffd83dbSDimitry Andric        .scalarize(0)
10575ffd83dbSDimitry Andric        .lower();
10580b57cec5SDimitry Andric 
105981ad6265SDimitry Andric   getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
106081ad6265SDimitry Andric       .customFor({S16, S32})
106181ad6265SDimitry Andric       .scalarize(0)
106281ad6265SDimitry Andric       .lower();
106381ad6265SDimitry Andric 
1064e8d8bef9SDimitry Andric   // Lower roundeven into G_FRINT
1065e8d8bef9SDimitry Andric   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
1066480093f4SDimitry Andric     .scalarize(0)
1067480093f4SDimitry Andric     .lower();
10680b57cec5SDimitry Andric 
1069480093f4SDimitry Andric   if (ST.has16BitInsts()) {
1070480093f4SDimitry Andric     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
1071480093f4SDimitry Andric       .legalFor({S16, S32, S64})
1072480093f4SDimitry Andric       .clampScalar(0, S16, S64)
1073480093f4SDimitry Andric       .scalarize(0);
1074480093f4SDimitry Andric   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
10750b57cec5SDimitry Andric     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
10760b57cec5SDimitry Andric       .legalFor({S32, S64})
10770b57cec5SDimitry Andric       .clampScalar(0, S32, S64)
10780b57cec5SDimitry Andric       .scalarize(0);
10790b57cec5SDimitry Andric   } else {
10800b57cec5SDimitry Andric     getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
10810b57cec5SDimitry Andric       .legalFor({S32})
10820b57cec5SDimitry Andric       .customFor({S64})
10830b57cec5SDimitry Andric       .clampScalar(0, S32, S64)
10840b57cec5SDimitry Andric       .scalarize(0);
10850b57cec5SDimitry Andric   }
10860b57cec5SDimitry Andric 
1087480093f4SDimitry Andric   getActionDefinitionsBuilder(G_PTR_ADD)
1088*06c3fb27SDimitry Andric       .unsupportedFor({BufferFatPtr, RsrcPtr})
1089e8d8bef9SDimitry Andric       .legalIf(all(isPointer(0), sameSize(0, 1)))
1090e8d8bef9SDimitry Andric       .scalarize(0)
1091e8d8bef9SDimitry Andric       .scalarSameSizeAs(1, 0);
10920b57cec5SDimitry Andric 
10935ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_PTRMASK)
1094e8d8bef9SDimitry Andric     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
1095e8d8bef9SDimitry Andric     .scalarSameSizeAs(1, 0)
10965ffd83dbSDimitry Andric     .scalarize(0);
10970b57cec5SDimitry Andric 
10980b57cec5SDimitry Andric   auto &CmpBuilder =
10990b57cec5SDimitry Andric     getActionDefinitionsBuilder(G_ICMP)
1100480093f4SDimitry Andric     // The compare output type differs based on the register bank of the output,
1101480093f4SDimitry Andric     // so make both s1 and s32 legal.
1102480093f4SDimitry Andric     //
1103480093f4SDimitry Andric     // Scalar compares producing output in scc will be promoted to s32, as that
1104480093f4SDimitry Andric     // is the allocatable register type that will be needed for the copy from
1105480093f4SDimitry Andric     // scc. This will be promoted during RegBankSelect, and we assume something
1106480093f4SDimitry Andric     // before that won't try to use s32 result types.
1107480093f4SDimitry Andric     //
1108480093f4SDimitry Andric     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
1109480093f4SDimitry Andric     // bank.
11100b57cec5SDimitry Andric     .legalForCartesianProduct(
11110b57cec5SDimitry Andric       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1112480093f4SDimitry Andric     .legalForCartesianProduct(
1113480093f4SDimitry Andric       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
11140b57cec5SDimitry Andric   if (ST.has16BitInsts()) {
11150b57cec5SDimitry Andric     CmpBuilder.legalFor({{S1, S16}});
11160b57cec5SDimitry Andric   }
11170b57cec5SDimitry Andric 
11180b57cec5SDimitry Andric   CmpBuilder
11190b57cec5SDimitry Andric     .widenScalarToNextPow2(1)
11200b57cec5SDimitry Andric     .clampScalar(1, S32, S64)
11210b57cec5SDimitry Andric     .scalarize(0)
1122480093f4SDimitry Andric     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
11230b57cec5SDimitry Andric 
11240b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_FCMP)
11250b57cec5SDimitry Andric     .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
11260b57cec5SDimitry Andric     .widenScalarToNextPow2(1)
11270b57cec5SDimitry Andric     .clampScalar(1, S32, S64)
11280b57cec5SDimitry Andric     .scalarize(0);
11290b57cec5SDimitry Andric 
11305ffd83dbSDimitry Andric   // FIXME: fpow has a selection pattern that should move to custom lowering.
1131*06c3fb27SDimitry Andric   auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
11325ffd83dbSDimitry Andric   if (ST.has16BitInsts())
11335ffd83dbSDimitry Andric     ExpOps.customFor({{S32}, {S16}});
11345ffd83dbSDimitry Andric   else
11355ffd83dbSDimitry Andric     ExpOps.customFor({S32});
11365ffd83dbSDimitry Andric   ExpOps.clampScalar(0, MinScalarFPTy, S32)
11370b57cec5SDimitry Andric         .scalarize(0);
11380b57cec5SDimitry Andric 
1139e8d8bef9SDimitry Andric   getActionDefinitionsBuilder(G_FPOWI)
1140e8d8bef9SDimitry Andric     .clampScalar(0, MinScalarFPTy, S32)
1141e8d8bef9SDimitry Andric     .lower();
1142e8d8bef9SDimitry Andric 
1143*06c3fb27SDimitry Andric   auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2});
1144*06c3fb27SDimitry Andric   Log2Ops.customFor({S32});
1145*06c3fb27SDimitry Andric   if (ST.has16BitInsts())
1146*06c3fb27SDimitry Andric     Log2Ops.legalFor({S16});
1147*06c3fb27SDimitry Andric   else
1148*06c3fb27SDimitry Andric     Log2Ops.customFor({S16});
1149*06c3fb27SDimitry Andric   Log2Ops.scalarize(0)
1150*06c3fb27SDimitry Andric     .lower();
1151*06c3fb27SDimitry Andric 
1152*06c3fb27SDimitry Andric   auto &LogOps = getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP});
1153*06c3fb27SDimitry Andric   LogOps.customFor({S32, S16});
1154*06c3fb27SDimitry Andric   LogOps.clampScalar(0, MinScalarFPTy, S32)
1155*06c3fb27SDimitry Andric         .scalarize(0);
1156*06c3fb27SDimitry Andric 
11570b57cec5SDimitry Andric   // The 64-bit versions produce 32-bit results, but only on the SALU.
11585ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_CTPOP)
11590b57cec5SDimitry Andric     .legalFor({{S32, S32}, {S32, S64}})
11600b57cec5SDimitry Andric     .clampScalar(0, S32, S32)
116104eeddc0SDimitry Andric     .widenScalarToNextPow2(1, 32)
11620b57cec5SDimitry Andric     .clampScalar(1, S32, S64)
11630b57cec5SDimitry Andric     .scalarize(0)
116404eeddc0SDimitry Andric     .widenScalarToNextPow2(0, 32);
116504eeddc0SDimitry Andric 
1166bdd1243dSDimitry Andric   // If no 16 bit instr is available, lower into different instructions.
1167bdd1243dSDimitry Andric   if (ST.has16BitInsts())
1168bdd1243dSDimitry Andric     getActionDefinitionsBuilder(G_IS_FPCLASS)
1169bdd1243dSDimitry Andric         .legalForCartesianProduct({S1}, FPTypes16)
1170bdd1243dSDimitry Andric         .widenScalarToNextPow2(1)
1171bdd1243dSDimitry Andric         .scalarize(0)
1172bdd1243dSDimitry Andric         .lower();
1173bdd1243dSDimitry Andric   else
1174bdd1243dSDimitry Andric     getActionDefinitionsBuilder(G_IS_FPCLASS)
1175bdd1243dSDimitry Andric         .legalForCartesianProduct({S1}, FPTypesBase)
1176bdd1243dSDimitry Andric         .lowerFor({S1, S16})
1177bdd1243dSDimitry Andric         .widenScalarToNextPow2(1)
1178bdd1243dSDimitry Andric         .scalarize(0)
1179bdd1243dSDimitry Andric         .lower();
11800b57cec5SDimitry Andric 
11815ffd83dbSDimitry Andric   // The hardware instructions return a different result on 0 than the generic
11825ffd83dbSDimitry Andric   // instructions expect. The hardware produces -1, but these produce the
11835ffd83dbSDimitry Andric   // bitwidth.
11845ffd83dbSDimitry Andric   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
11855ffd83dbSDimitry Andric     .scalarize(0)
11865ffd83dbSDimitry Andric     .clampScalar(0, S32, S32)
11875ffd83dbSDimitry Andric     .clampScalar(1, S32, S64)
11885ffd83dbSDimitry Andric     .widenScalarToNextPow2(0, 32)
11895ffd83dbSDimitry Andric     .widenScalarToNextPow2(1, 32)
1190349cc55cSDimitry Andric     .custom();
11915ffd83dbSDimitry Andric 
11925ffd83dbSDimitry Andric   // The 64-bit versions produce 32-bit results, but only on the SALU.
11935ffd83dbSDimitry Andric   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
11945ffd83dbSDimitry Andric     .legalFor({{S32, S32}, {S32, S64}})
11955ffd83dbSDimitry Andric     .clampScalar(0, S32, S32)
11965ffd83dbSDimitry Andric     .clampScalar(1, S32, S64)
11975ffd83dbSDimitry Andric     .scalarize(0)
11985ffd83dbSDimitry Andric     .widenScalarToNextPow2(0, 32)
11995ffd83dbSDimitry Andric     .widenScalarToNextPow2(1, 32);
12005ffd83dbSDimitry Andric 
1201fe6060f1SDimitry Andric   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1202fe6060f1SDimitry Andric   // RegBankSelect.
12035ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_BITREVERSE)
1204fe6060f1SDimitry Andric     .legalFor({S32, S64})
1205fe6060f1SDimitry Andric     .clampScalar(0, S32, S64)
1206fe6060f1SDimitry Andric     .scalarize(0)
1207fe6060f1SDimitry Andric     .widenScalarToNextPow2(0);
12080b57cec5SDimitry Andric 
12090b57cec5SDimitry Andric   if (ST.has16BitInsts()) {
12105ffd83dbSDimitry Andric     getActionDefinitionsBuilder(G_BSWAP)
12115ffd83dbSDimitry Andric       .legalFor({S16, S32, V2S16})
12120eae32dcSDimitry Andric       .clampMaxNumElementsStrict(0, S16, 2)
12135ffd83dbSDimitry Andric       // FIXME: Fixing non-power-of-2 before clamp is workaround for
12145ffd83dbSDimitry Andric       // narrowScalar limitation.
12155ffd83dbSDimitry Andric       .widenScalarToNextPow2(0)
12165ffd83dbSDimitry Andric       .clampScalar(0, S16, S32)
12175ffd83dbSDimitry Andric       .scalarize(0);
12185ffd83dbSDimitry Andric 
12190b57cec5SDimitry Andric     if (ST.hasVOP3PInsts()) {
1220fe6060f1SDimitry Andric       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
12210b57cec5SDimitry Andric         .legalFor({S32, S16, V2S16})
12220b57cec5SDimitry Andric         .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
12230b57cec5SDimitry Andric         .clampMaxNumElements(0, S16, 2)
12245ffd83dbSDimitry Andric         .minScalar(0, S16)
12250b57cec5SDimitry Andric         .widenScalarToNextPow2(0)
12265ffd83dbSDimitry Andric         .scalarize(0)
12275ffd83dbSDimitry Andric         .lower();
12280b57cec5SDimitry Andric     } else {
1229fe6060f1SDimitry Andric       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
12300b57cec5SDimitry Andric         .legalFor({S32, S16})
12310b57cec5SDimitry Andric         .widenScalarToNextPow2(0)
12325ffd83dbSDimitry Andric         .minScalar(0, S16)
12335ffd83dbSDimitry Andric         .scalarize(0)
12345ffd83dbSDimitry Andric         .lower();
12350b57cec5SDimitry Andric     }
12360b57cec5SDimitry Andric   } else {
12375ffd83dbSDimitry Andric     // TODO: Should have same legality without v_perm_b32
12385ffd83dbSDimitry Andric     getActionDefinitionsBuilder(G_BSWAP)
12395ffd83dbSDimitry Andric       .legalFor({S32})
12405ffd83dbSDimitry Andric       .lowerIf(scalarNarrowerThan(0, 32))
12415ffd83dbSDimitry Andric       // FIXME: Fixing non-power-of-2 before clamp is workaround for
12425ffd83dbSDimitry Andric       // narrowScalar limitation.
12435ffd83dbSDimitry Andric       .widenScalarToNextPow2(0)
12445ffd83dbSDimitry Andric       .maxScalar(0, S32)
12455ffd83dbSDimitry Andric       .scalarize(0)
12465ffd83dbSDimitry Andric       .lower();
12475ffd83dbSDimitry Andric 
1248fe6060f1SDimitry Andric     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
12490b57cec5SDimitry Andric       .legalFor({S32})
12505ffd83dbSDimitry Andric       .minScalar(0, S32)
12510b57cec5SDimitry Andric       .widenScalarToNextPow2(0)
12525ffd83dbSDimitry Andric       .scalarize(0)
12535ffd83dbSDimitry Andric       .lower();
12540b57cec5SDimitry Andric   }
12550b57cec5SDimitry Andric 
12560b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_INTTOPTR)
12570b57cec5SDimitry Andric       // List the common cases
12580b57cec5SDimitry Andric       .legalForCartesianProduct(AddrSpaces64, {S64})
12590b57cec5SDimitry Andric       .legalForCartesianProduct(AddrSpaces32, {S32})
12600b57cec5SDimitry Andric       .scalarize(0)
12610b57cec5SDimitry Andric       // Accept any address space as long as the size matches
12620b57cec5SDimitry Andric       .legalIf(sameSize(0, 1))
12630b57cec5SDimitry Andric       .widenScalarIf(smallerThan(1, 0),
12640b57cec5SDimitry Andric                      [](const LegalityQuery &Query) {
1265bdd1243dSDimitry Andric                        return std::pair(
1266bdd1243dSDimitry Andric                            1, LLT::scalar(Query.Types[0].getSizeInBits()));
12670b57cec5SDimitry Andric                      })
1268bdd1243dSDimitry Andric       .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) {
1269bdd1243dSDimitry Andric         return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
12700b57cec5SDimitry Andric       });
12710b57cec5SDimitry Andric 
12720b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_PTRTOINT)
12730b57cec5SDimitry Andric       // List the common cases
12740b57cec5SDimitry Andric       .legalForCartesianProduct(AddrSpaces64, {S64})
12750b57cec5SDimitry Andric       .legalForCartesianProduct(AddrSpaces32, {S32})
12760b57cec5SDimitry Andric       .scalarize(0)
12770b57cec5SDimitry Andric       // Accept any address space as long as the size matches
12780b57cec5SDimitry Andric       .legalIf(sameSize(0, 1))
12790b57cec5SDimitry Andric       .widenScalarIf(smallerThan(0, 1),
12800b57cec5SDimitry Andric                      [](const LegalityQuery &Query) {
1281bdd1243dSDimitry Andric                        return std::pair(
1282bdd1243dSDimitry Andric                            0, LLT::scalar(Query.Types[1].getSizeInBits()));
12830b57cec5SDimitry Andric                      })
1284bdd1243dSDimitry Andric       .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
1285bdd1243dSDimitry Andric         return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
12860b57cec5SDimitry Andric       });
12870b57cec5SDimitry Andric 
12880b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
12890b57cec5SDimitry Andric     .scalarize(0)
12900b57cec5SDimitry Andric     .custom();
12910b57cec5SDimitry Andric 
// Predicate used by the G_LOAD/G_STORE rules: returns true when the memory
// access described by \p Query has to be broken into smaller pieces.
//
// \p Query   Legality query; Types[0] is read as the value type, Types[1] as
//            the pointer type, and MMODescrs[0] as the memory operand.
// \p IsLoad  Forwarded unchanged to maxSizeForAddrSpace().
//
// The access is reported as needing a split when any of these hold:
//   * the value type is a vector that is wider than the memory type,
//   * the memory size exceeds maxSizeForAddrSpace() for the pointer's address
//     space (the last argument tells it whether the access is atomic),
//   * the memory size rounded up to 32-bit units is 3 units and the subtarget
//     lacks dwordx3 load/stores, or is any other non-power-of-2 unit count.
//
// NOTE(review): maxSizeForAddrSpace() is defined outside this view; it is
// presumably a size in bits since it is compared against MemSize - confirm.
12925ffd83dbSDimitry Andric   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
12935ffd83dbSDimitry Andric                                     bool IsLoad) -> bool {
12948bcb0991SDimitry Andric     const LLT DstTy = Query.Types[0];
12958bcb0991SDimitry Andric 
12968bcb0991SDimitry Andric     // Split vector extloads, i.e. a vector result wider than the memory type.
1297fe6060f1SDimitry Andric     unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1298480093f4SDimitry Andric 
12998bcb0991SDimitry Andric     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
13008bcb0991SDimitry Andric       return true;
13018bcb0991SDimitry Andric 
13028bcb0991SDimitry Andric     const LLT PtrTy = Query.Types[1];
13038bcb0991SDimitry Andric     unsigned AS = PtrTy.getAddressSpace();
1304*06c3fb27SDimitry Andric     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
1305*06c3fb27SDimitry Andric                                       Query.MMODescrs[0].Ordering !=
1306*06c3fb27SDimitry Andric                                           AtomicOrdering::NotAtomic))
13078bcb0991SDimitry Andric       return true;
13088bcb0991SDimitry Andric 
13098bcb0991SDimitry Andric     // Catch weird sized loads that don't evenly divide into the access sizes
13108bcb0991SDimitry Andric     // TODO: May be able to widen depending on alignment etc.
13115ffd83dbSDimitry Andric     unsigned NumRegs = (MemSize + 31) / 32; // 32-bit units, rounded up.
13125ffd83dbSDimitry Andric     if (NumRegs == 3) {
13135ffd83dbSDimitry Andric       if (!ST.hasDwordx3LoadStores())
13148bcb0991SDimitry Andric         return true;
13155ffd83dbSDimitry Andric     } else {
13165ffd83dbSDimitry Andric       // If the alignment allows, these should have been widened.
13175ffd83dbSDimitry Andric       if (!isPowerOf2_32(NumRegs))
13185ffd83dbSDimitry Andric         return true;
13195ffd83dbSDimitry Andric     }
13208bcb0991SDimitry Andric 
13218bcb0991SDimitry Andric     return false; // None of the split conditions matched.
13228bcb0991SDimitry Andric   };
13238bcb0991SDimitry Andric 
1324e8d8bef9SDimitry Andric   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1325e8d8bef9SDimitry Andric   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1326e8d8bef9SDimitry Andric   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
13278bcb0991SDimitry Andric 
13288bcb0991SDimitry Andric   // TODO: Refine based on subtargets which support unaligned access or 128-bit
13298bcb0991SDimitry Andric   // LDS
13308bcb0991SDimitry Andric   // TODO: Unsupported flat for SI.
13318bcb0991SDimitry Andric 
13328bcb0991SDimitry Andric   for (unsigned Op : {G_LOAD, G_STORE}) {
13338bcb0991SDimitry Andric     const bool IsStore = Op == G_STORE;
13348bcb0991SDimitry Andric 
13358bcb0991SDimitry Andric     auto &Actions = getActionDefinitionsBuilder(Op);
13365ffd83dbSDimitry Andric     // Explicitly list some common cases.
13375ffd83dbSDimitry Andric     // TODO: Does this help compile time at all?
1338fe6060f1SDimitry Andric     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1339fe6060f1SDimitry Andric                                       {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1340fe6060f1SDimitry Andric                                       {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1341fe6060f1SDimitry Andric                                       {S64, GlobalPtr, S64, GlobalAlign32},
1342fe6060f1SDimitry Andric                                       {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1343fe6060f1SDimitry Andric                                       {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1344fe6060f1SDimitry Andric                                       {S32, GlobalPtr, S8, GlobalAlign8},
1345fe6060f1SDimitry Andric                                       {S32, GlobalPtr, S16, GlobalAlign16},
13468bcb0991SDimitry Andric 
1347fe6060f1SDimitry Andric                                       {S32, LocalPtr, S32, 32},
1348fe6060f1SDimitry Andric                                       {S64, LocalPtr, S64, 32},
1349fe6060f1SDimitry Andric                                       {V2S32, LocalPtr, V2S32, 32},
1350fe6060f1SDimitry Andric                                       {S32, LocalPtr, S8, 8},
1351fe6060f1SDimitry Andric                                       {S32, LocalPtr, S16, 16},
1352fe6060f1SDimitry Andric                                       {V2S16, LocalPtr, S32, 32},
13538bcb0991SDimitry Andric 
1354fe6060f1SDimitry Andric                                       {S32, PrivatePtr, S32, 32},
1355fe6060f1SDimitry Andric                                       {S32, PrivatePtr, S8, 8},
1356fe6060f1SDimitry Andric                                       {S32, PrivatePtr, S16, 16},
1357fe6060f1SDimitry Andric                                       {V2S16, PrivatePtr, S32, 32},
13588bcb0991SDimitry Andric 
1359fe6060f1SDimitry Andric                                       {S32, ConstantPtr, S32, GlobalAlign32},
1360fe6060f1SDimitry Andric                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1361fe6060f1SDimitry Andric                                       {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1362fe6060f1SDimitry Andric                                       {S64, ConstantPtr, S64, GlobalAlign32},
1363fe6060f1SDimitry Andric                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
13645ffd83dbSDimitry Andric     Actions.legalIf(
13655ffd83dbSDimitry Andric       [=](const LegalityQuery &Query) -> bool {
1366fe6060f1SDimitry Andric         return isLoadStoreLegal(ST, Query);
13675ffd83dbSDimitry Andric       });
13685ffd83dbSDimitry Andric 
1369*06c3fb27SDimitry Andric     // The custom pointers (fat pointers, buffer resources) don't work with load
1370*06c3fb27SDimitry Andric     // and store at this level. Fat pointers should have been lowered to
1371*06c3fb27SDimitry Andric     // intrinsics before the translation to MIR.
1372*06c3fb27SDimitry Andric     Actions.unsupportedIf(typeInSet(1, {BufferFatPtr, RsrcPtr}));
1373*06c3fb27SDimitry Andric 
1374*06c3fb27SDimitry Andric     // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
1375*06c3fb27SDimitry Andric     // ptrtoint. This is needed to account for the fact that we can't have i128
1376*06c3fb27SDimitry Andric     // as a register class for SelectionDAG reasons.
1377*06c3fb27SDimitry Andric     Actions.customIf([=](const LegalityQuery &Query) -> bool {
1378*06c3fb27SDimitry Andric       return hasBufferRsrcWorkaround(Query.Types[0]);
1379*06c3fb27SDimitry Andric     });
1380*06c3fb27SDimitry Andric 
13815ffd83dbSDimitry Andric     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
13825ffd83dbSDimitry Andric     // 64-bits.
13835ffd83dbSDimitry Andric     //
13845ffd83dbSDimitry Andric     // TODO: Should generalize bitcast action into coerce, which will also cover
13855ffd83dbSDimitry Andric     // inserting addrspacecasts.
13865ffd83dbSDimitry Andric     Actions.customIf(typeIs(1, Constant32Ptr));
13875ffd83dbSDimitry Andric 
13885ffd83dbSDimitry Andric     // Turn any illegal element vectors into something easier to deal
13895ffd83dbSDimitry Andric     // with. These will ultimately produce 32-bit scalar shifts to extract the
13905ffd83dbSDimitry Andric     // parts anyway.
13915ffd83dbSDimitry Andric     //
13925ffd83dbSDimitry Andric     // For odd 16-bit element vectors, prefer to split those into pieces with
13935ffd83dbSDimitry Andric     // 16-bit vector parts.
13945ffd83dbSDimitry Andric     Actions.bitcastIf(
13955ffd83dbSDimitry Andric       [=](const LegalityQuery &Query) -> bool {
1396e8d8bef9SDimitry Andric         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1397fe6060f1SDimitry Andric                                           Query.MMODescrs[0].MemoryTy);
13985ffd83dbSDimitry Andric       }, bitcastToRegisterType(0));
13995ffd83dbSDimitry Andric 
1400e8d8bef9SDimitry Andric     if (!IsStore) {
1401e8d8bef9SDimitry Andric       // Widen suitably aligned loads by loading extra bytes. The standard
1402e8d8bef9SDimitry Andric       // legalization actions can't properly express widening memory operands.
1403e8d8bef9SDimitry Andric       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1404e8d8bef9SDimitry Andric         return shouldWidenLoad(ST, Query, G_LOAD);
1405e8d8bef9SDimitry Andric       });
1406e8d8bef9SDimitry Andric     }
1407e8d8bef9SDimitry Andric 
1408e8d8bef9SDimitry Andric     // FIXME: load/store narrowing should be moved to lower action
14098bcb0991SDimitry Andric     Actions
14108bcb0991SDimitry Andric         .narrowScalarIf(
14118bcb0991SDimitry Andric             [=](const LegalityQuery &Query) -> bool {
14125ffd83dbSDimitry Andric               return !Query.Types[0].isVector() &&
14135ffd83dbSDimitry Andric                      needToSplitMemOp(Query, Op == G_LOAD);
14148bcb0991SDimitry Andric             },
14158bcb0991SDimitry Andric             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
14168bcb0991SDimitry Andric               const LLT DstTy = Query.Types[0];
14178bcb0991SDimitry Andric               const LLT PtrTy = Query.Types[1];
14188bcb0991SDimitry Andric 
14198bcb0991SDimitry Andric               const unsigned DstSize = DstTy.getSizeInBits();
1420fe6060f1SDimitry Andric               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
14218bcb0991SDimitry Andric 
14228bcb0991SDimitry Andric               // Split extloads.
14238bcb0991SDimitry Andric               if (DstSize > MemSize)
1424bdd1243dSDimitry Andric                 return std::pair(0, LLT::scalar(MemSize));
14258bcb0991SDimitry Andric 
1426*06c3fb27SDimitry Andric               unsigned MaxSize = maxSizeForAddrSpace(
1427*06c3fb27SDimitry Andric                   ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1428*06c3fb27SDimitry Andric                   Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
14298bcb0991SDimitry Andric               if (MemSize > MaxSize)
1430bdd1243dSDimitry Andric                 return std::pair(0, LLT::scalar(MaxSize));
14318bcb0991SDimitry Andric 
143204eeddc0SDimitry Andric               uint64_t Align = Query.MMODescrs[0].AlignInBits;
1433bdd1243dSDimitry Andric               return std::pair(0, LLT::scalar(Align));
14348bcb0991SDimitry Andric             })
14358bcb0991SDimitry Andric         .fewerElementsIf(
14368bcb0991SDimitry Andric             [=](const LegalityQuery &Query) -> bool {
14375ffd83dbSDimitry Andric               return Query.Types[0].isVector() &&
14385ffd83dbSDimitry Andric                      needToSplitMemOp(Query, Op == G_LOAD);
14398bcb0991SDimitry Andric             },
14408bcb0991SDimitry Andric             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
14418bcb0991SDimitry Andric               const LLT DstTy = Query.Types[0];
14428bcb0991SDimitry Andric               const LLT PtrTy = Query.Types[1];
14438bcb0991SDimitry Andric 
14448bcb0991SDimitry Andric               LLT EltTy = DstTy.getElementType();
1445*06c3fb27SDimitry Andric               unsigned MaxSize = maxSizeForAddrSpace(
1446*06c3fb27SDimitry Andric                   ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1447*06c3fb27SDimitry Andric                   Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
14485ffd83dbSDimitry Andric 
14495ffd83dbSDimitry Andric               // FIXME: Handle widened to power of 2 results better. This ends
14505ffd83dbSDimitry Andric               // up scalarizing.
14515ffd83dbSDimitry Andric               // FIXME: 3 element stores scalarized on SI
14528bcb0991SDimitry Andric 
14538bcb0991SDimitry Andric               // Split if it's too large for the address space.
1454fe6060f1SDimitry Andric               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1455fe6060f1SDimitry Andric               if (MemSize > MaxSize) {
14568bcb0991SDimitry Andric                 unsigned NumElts = DstTy.getNumElements();
14575ffd83dbSDimitry Andric                 unsigned EltSize = EltTy.getSizeInBits();
14585ffd83dbSDimitry Andric 
14595ffd83dbSDimitry Andric                 if (MaxSize % EltSize == 0) {
1460bdd1243dSDimitry Andric                   return std::pair(
1461fe6060f1SDimitry Andric                       0, LLT::scalarOrVector(
1462fe6060f1SDimitry Andric                              ElementCount::getFixed(MaxSize / EltSize), EltTy));
14635ffd83dbSDimitry Andric                 }
14645ffd83dbSDimitry Andric 
1465fe6060f1SDimitry Andric                 unsigned NumPieces = MemSize / MaxSize;
14668bcb0991SDimitry Andric 
14678bcb0991SDimitry Andric                 // FIXME: Refine when odd breakdowns handled
14688bcb0991SDimitry Andric                 // The scalars will need to be re-legalized.
14698bcb0991SDimitry Andric                 if (NumPieces == 1 || NumPieces >= NumElts ||
14708bcb0991SDimitry Andric                     NumElts % NumPieces != 0)
1471bdd1243dSDimitry Andric                   return std::pair(0, EltTy);
14728bcb0991SDimitry Andric 
1473bdd1243dSDimitry Andric                 return std::pair(0,
1474bdd1243dSDimitry Andric                                  LLT::fixed_vector(NumElts / NumPieces, EltTy));
14758bcb0991SDimitry Andric               }
14768bcb0991SDimitry Andric 
14775ffd83dbSDimitry Andric               // FIXME: We could probably handle weird extending loads better.
14785ffd83dbSDimitry Andric               if (DstTy.getSizeInBits() > MemSize)
1479bdd1243dSDimitry Andric                 return std::pair(0, EltTy);
14805ffd83dbSDimitry Andric 
14815ffd83dbSDimitry Andric               unsigned EltSize = EltTy.getSizeInBits();
14825ffd83dbSDimitry Andric               unsigned DstSize = DstTy.getSizeInBits();
14835ffd83dbSDimitry Andric               if (!isPowerOf2_32(DstSize)) {
14845ffd83dbSDimitry Andric                 // We're probably decomposing an odd sized store. Try to split
14855ffd83dbSDimitry Andric                 // to the widest type. TODO: Account for alignment. As-is it
14865ffd83dbSDimitry Andric                 // should be OK, since the new parts will be further legalized.
1487*06c3fb27SDimitry Andric                 unsigned FloorSize = llvm::bit_floor(DstSize);
1488bdd1243dSDimitry Andric                 return std::pair(
1489fe6060f1SDimitry Andric                     0, LLT::scalarOrVector(
1490fe6060f1SDimitry Andric                            ElementCount::getFixed(FloorSize / EltSize), EltTy));
14915ffd83dbSDimitry Andric               }
14925ffd83dbSDimitry Andric 
14938bcb0991SDimitry Andric               // May need relegalization for the scalars.
1494bdd1243dSDimitry Andric               return std::pair(0, EltTy);
14958bcb0991SDimitry Andric             })
1496fe6060f1SDimitry Andric     .minScalar(0, S32)
1497fe6060f1SDimitry Andric     .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
14988bcb0991SDimitry Andric     .widenScalarToNextPow2(0)
1499e8d8bef9SDimitry Andric     .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1500e8d8bef9SDimitry Andric     .lower();
15018bcb0991SDimitry Andric   }
15020b57cec5SDimitry Andric 
1503fe6060f1SDimitry Andric   // FIXME: Unaligned accesses not lowered.
15040b57cec5SDimitry Andric   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1505fe6060f1SDimitry Andric                        .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1506fe6060f1SDimitry Andric                                                   {S32, GlobalPtr, S16, 2 * 8},
1507fe6060f1SDimitry Andric                                                   {S32, LocalPtr, S8, 8},
1508fe6060f1SDimitry Andric                                                   {S32, LocalPtr, S16, 16},
1509fe6060f1SDimitry Andric                                                   {S32, PrivatePtr, S8, 8},
1510fe6060f1SDimitry Andric                                                   {S32, PrivatePtr, S16, 16},
1511fe6060f1SDimitry Andric                                                   {S32, ConstantPtr, S8, 8},
1512fe6060f1SDimitry Andric                                                   {S32, ConstantPtr, S16, 2 * 8}})
1513fe6060f1SDimitry Andric                        .legalIf(
1514fe6060f1SDimitry Andric                          [=](const LegalityQuery &Query) -> bool {
1515fe6060f1SDimitry Andric                            return isLoadStoreLegal(ST, Query);
1516fe6060f1SDimitry Andric                          });
1517fe6060f1SDimitry Andric 
15180b57cec5SDimitry Andric   if (ST.hasFlatAddressSpace()) {
15198bcb0991SDimitry Andric     ExtLoads.legalForTypesWithMemDesc(
1520fe6060f1SDimitry Andric         {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
15210b57cec5SDimitry Andric   }
15220b57cec5SDimitry Andric 
1523fe6060f1SDimitry Andric   // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1524fe6060f1SDimitry Andric   // 64-bits.
1525fe6060f1SDimitry Andric   //
1526fe6060f1SDimitry Andric   // TODO: Should generalize bitcast action into coerce, which will also cover
1527fe6060f1SDimitry Andric   // inserting addrspacecasts.
1528fe6060f1SDimitry Andric   ExtLoads.customIf(typeIs(1, Constant32Ptr));
1529fe6060f1SDimitry Andric 
15300b57cec5SDimitry Andric   ExtLoads.clampScalar(0, S32, S32)
15310b57cec5SDimitry Andric           .widenScalarToNextPow2(0)
15320b57cec5SDimitry Andric           .lower();
15330b57cec5SDimitry Andric 
15340b57cec5SDimitry Andric   auto &Atomics = getActionDefinitionsBuilder(
15350b57cec5SDimitry Andric     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
15360b57cec5SDimitry Andric      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
15370b57cec5SDimitry Andric      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1538*06c3fb27SDimitry Andric      G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
15390b57cec5SDimitry Andric     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1540e8d8bef9SDimitry Andric                {S64, GlobalPtr}, {S64, LocalPtr},
1541e8d8bef9SDimitry Andric                {S32, RegionPtr}, {S64, RegionPtr}});
15420b57cec5SDimitry Andric   if (ST.hasFlatAddressSpace()) {
15430b57cec5SDimitry Andric     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
15440b57cec5SDimitry Andric   }
15450b57cec5SDimitry Andric 
1546fe6060f1SDimitry Andric   auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1547349cc55cSDimitry Andric   if (ST.hasLDSFPAtomicAdd()) {
1548fe6060f1SDimitry Andric     Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1549fe6060f1SDimitry Andric     if (ST.hasGFX90AInsts())
1550fe6060f1SDimitry Andric       Atomic.legalFor({{S64, LocalPtr}});
1551*06c3fb27SDimitry Andric     if (ST.hasAtomicDsPkAdd16Insts())
155281ad6265SDimitry Andric       Atomic.legalFor({{V2S16, LocalPtr}});
15535ffd83dbSDimitry Andric   }
1554fe6060f1SDimitry Andric   if (ST.hasAtomicFaddInsts())
1555fe6060f1SDimitry Andric     Atomic.legalFor({{S32, GlobalPtr}});
1556bdd1243dSDimitry Andric   if (ST.hasFlatAtomicFaddF32Inst())
1557bdd1243dSDimitry Andric     Atomic.legalFor({{S32, FlatPtr}});
15588bcb0991SDimitry Andric 
155904eeddc0SDimitry Andric   if (ST.hasGFX90AInsts()) {
156004eeddc0SDimitry Andric     // These are legal with some caveats, and should have undergone expansion in
156104eeddc0SDimitry Andric     // the IR in most situations
156204eeddc0SDimitry Andric     // TODO: Move atomic expansion into legalizer
156304eeddc0SDimitry Andric     Atomic.legalFor({
156404eeddc0SDimitry Andric         {S32, GlobalPtr},
156504eeddc0SDimitry Andric         {S64, GlobalPtr},
156604eeddc0SDimitry Andric         {S64, FlatPtr}
156704eeddc0SDimitry Andric       });
156804eeddc0SDimitry Andric   }
156904eeddc0SDimitry Andric 
1570480093f4SDimitry Andric   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1571480093f4SDimitry Andric   // demarshalling
1572480093f4SDimitry Andric   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1573480093f4SDimitry Andric     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1574480093f4SDimitry Andric                 {S32, FlatPtr}, {S64, FlatPtr}})
1575480093f4SDimitry Andric     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1576480093f4SDimitry Andric                {S32, RegionPtr}, {S64, RegionPtr}});
15770b57cec5SDimitry Andric   // TODO: Pointer types, any 32-bit or 64-bit vector
1578480093f4SDimitry Andric 
1579480093f4SDimitry Andric   // Condition should be s32 for scalar, s1 for vector.
15800b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_SELECT)
1581fe6060f1SDimitry Andric       .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1582fe6060f1SDimitry Andric                                  LocalPtr, FlatPtr, PrivatePtr,
1583fe6060f1SDimitry Andric                                  LLT::fixed_vector(2, LocalPtr),
1584fe6060f1SDimitry Andric                                  LLT::fixed_vector(2, PrivatePtr)},
1585fe6060f1SDimitry Andric                                 {S1, S32})
15860b57cec5SDimitry Andric       .clampScalar(0, S16, S64)
15875ffd83dbSDimitry Andric       .scalarize(1)
15880b57cec5SDimitry Andric       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
15890b57cec5SDimitry Andric       .fewerElementsIf(numElementsNotEven(0), scalarize(0))
15900b57cec5SDimitry Andric       .clampMaxNumElements(0, S32, 2)
15910b57cec5SDimitry Andric       .clampMaxNumElements(0, LocalPtr, 2)
15920b57cec5SDimitry Andric       .clampMaxNumElements(0, PrivatePtr, 2)
15930b57cec5SDimitry Andric       .scalarize(0)
15940b57cec5SDimitry Andric       .widenScalarToNextPow2(0)
1595480093f4SDimitry Andric       .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
15960b57cec5SDimitry Andric 
15970b57cec5SDimitry Andric   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
15980b57cec5SDimitry Andric   // be more flexible with the shift amount type.
15990b57cec5SDimitry Andric   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
16000b57cec5SDimitry Andric     .legalFor({{S32, S32}, {S64, S32}});
16010b57cec5SDimitry Andric   if (ST.has16BitInsts()) {
16020b57cec5SDimitry Andric     if (ST.hasVOP3PInsts()) {
16035ffd83dbSDimitry Andric       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
16040b57cec5SDimitry Andric             .clampMaxNumElements(0, S16, 2);
16050b57cec5SDimitry Andric     } else
16065ffd83dbSDimitry Andric       Shifts.legalFor({{S16, S16}});
16070b57cec5SDimitry Andric 
16085ffd83dbSDimitry Andric     // TODO: Support 16-bit shift amounts for all types
16095ffd83dbSDimitry Andric     Shifts.widenScalarIf(
16105ffd83dbSDimitry Andric       [=](const LegalityQuery &Query) {
16115ffd83dbSDimitry Andric         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
16125ffd83dbSDimitry Andric         // 32-bit amount.
16135ffd83dbSDimitry Andric         const LLT ValTy = Query.Types[0];
16145ffd83dbSDimitry Andric         const LLT AmountTy = Query.Types[1];
16155ffd83dbSDimitry Andric         return ValTy.getSizeInBits() <= 16 &&
16165ffd83dbSDimitry Andric                AmountTy.getSizeInBits() < 16;
16175ffd83dbSDimitry Andric       }, changeTo(1, S16));
16185ffd83dbSDimitry Andric     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1619480093f4SDimitry Andric     Shifts.clampScalar(1, S32, S32);
16200b57cec5SDimitry Andric     Shifts.widenScalarToNextPow2(0, 16);
162104eeddc0SDimitry Andric     Shifts.clampScalar(0, S16, S64);
1622e8d8bef9SDimitry Andric 
1623e8d8bef9SDimitry Andric     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1624e8d8bef9SDimitry Andric       .minScalar(0, S16)
1625e8d8bef9SDimitry Andric       .scalarize(0)
1626e8d8bef9SDimitry Andric       .lower();
16270b57cec5SDimitry Andric   } else {
16280b57cec5SDimitry Andric     // Make sure we legalize the shift amount type first, as the general
16290b57cec5SDimitry Andric     // expansion for the shifted type will produce much worse code if it hasn't
16300b57cec5SDimitry Andric     // been truncated already.
16310b57cec5SDimitry Andric     Shifts.clampScalar(1, S32, S32);
16320b57cec5SDimitry Andric     Shifts.widenScalarToNextPow2(0, 32);
163304eeddc0SDimitry Andric     Shifts.clampScalar(0, S32, S64);
1634e8d8bef9SDimitry Andric 
1635e8d8bef9SDimitry Andric     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1636e8d8bef9SDimitry Andric       .minScalar(0, S32)
1637e8d8bef9SDimitry Andric       .scalarize(0)
1638e8d8bef9SDimitry Andric       .lower();
16390b57cec5SDimitry Andric   }
16400b57cec5SDimitry Andric   Shifts.scalarize(0);
16410b57cec5SDimitry Andric 
16420b57cec5SDimitry Andric   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
16430b57cec5SDimitry Andric     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
16440b57cec5SDimitry Andric     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
16450b57cec5SDimitry Andric     unsigned IdxTypeIdx = 2;
16460b57cec5SDimitry Andric 
16470b57cec5SDimitry Andric     getActionDefinitionsBuilder(Op)
16480b57cec5SDimitry Andric       .customIf([=](const LegalityQuery &Query) {
16490b57cec5SDimitry Andric           const LLT EltTy = Query.Types[EltTypeIdx];
16500b57cec5SDimitry Andric           const LLT VecTy = Query.Types[VecTypeIdx];
16510b57cec5SDimitry Andric           const LLT IdxTy = Query.Types[IdxTypeIdx];
1652e8d8bef9SDimitry Andric           const unsigned EltSize = EltTy.getSizeInBits();
1653*06c3fb27SDimitry Andric           const bool isLegalVecType =
1654*06c3fb27SDimitry Andric               !!SIRegisterInfo::getSGPRClassForBitWidth(VecTy.getSizeInBits());
1655*06c3fb27SDimitry Andric           // Address space 8 pointers are 128-bit wide values, but the logic
1656*06c3fb27SDimitry Andric           // below will try to bitcast them to 2N x s64, which will fail.
1657*06c3fb27SDimitry Andric           // Therefore, as an intermediate step, wrap extracts/insertions by
1658*06c3fb27SDimitry Andric           // ptrtoint-ing the vector and scalar arguments (or inttoptr-ing the
1659*06c3fb27SDimitry Andric           // extraction result) in order to produce a vector operation that can
1660*06c3fb27SDimitry Andric           // be handled by the logic below.
1661*06c3fb27SDimitry Andric           if (EltTy.isPointer() && EltSize > 64)
1662*06c3fb27SDimitry Andric             return true;
1663e8d8bef9SDimitry Andric           return (EltSize == 32 || EltSize == 64) &&
16640b57cec5SDimitry Andric                   VecTy.getSizeInBits() % 32 == 0 &&
16655ffd83dbSDimitry Andric                   VecTy.getSizeInBits() <= MaxRegisterSize &&
1666*06c3fb27SDimitry Andric                   IdxTy.getSizeInBits() == 32 &&
1667*06c3fb27SDimitry Andric                   isLegalVecType;
16680b57cec5SDimitry Andric         })
1669e8d8bef9SDimitry Andric       .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1670e8d8bef9SDimitry Andric                  bitcastToVectorElement32(VecTypeIdx))
1671e8d8bef9SDimitry Andric       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1672e8d8bef9SDimitry Andric       .bitcastIf(
1673e8d8bef9SDimitry Andric         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1674e8d8bef9SDimitry Andric         [=](const LegalityQuery &Query) {
1675e8d8bef9SDimitry Andric           // For > 64-bit element types, try to turn this into a 64-bit
1676e8d8bef9SDimitry Andric           // element vector since we may be able to do better indexing
1677e8d8bef9SDimitry Andric           // if this is scalar. If not, fall back to 32.
1678e8d8bef9SDimitry Andric           const LLT EltTy = Query.Types[EltTypeIdx];
1679e8d8bef9SDimitry Andric           const LLT VecTy = Query.Types[VecTypeIdx];
1680e8d8bef9SDimitry Andric           const unsigned DstEltSize = EltTy.getSizeInBits();
1681e8d8bef9SDimitry Andric           const unsigned VecSize = VecTy.getSizeInBits();
1682e8d8bef9SDimitry Andric 
1683e8d8bef9SDimitry Andric           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1684bdd1243dSDimitry Andric           return std::pair(
1685fe6060f1SDimitry Andric               VecTypeIdx,
1686fe6060f1SDimitry Andric               LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1687e8d8bef9SDimitry Andric         })
16880b57cec5SDimitry Andric       .clampScalar(EltTypeIdx, S32, S64)
16890b57cec5SDimitry Andric       .clampScalar(VecTypeIdx, S32, S64)
1690e8d8bef9SDimitry Andric       .clampScalar(IdxTypeIdx, S32, S32)
1691e8d8bef9SDimitry Andric       .clampMaxNumElements(VecTypeIdx, S32, 32)
1692e8d8bef9SDimitry Andric       // TODO: Clamp elements for 64-bit vectors?
1693*06c3fb27SDimitry Andric       .moreElementsIf(
1694*06c3fb27SDimitry Andric         isIllegalRegisterType(VecTypeIdx),
1695*06c3fb27SDimitry Andric         moreElementsToNextExistingRegClass(VecTypeIdx))
1696e8d8bef9SDimitry Andric       // It should only be necessary with variable indexes.
1697e8d8bef9SDimitry Andric       // As a last resort, lower to the stack
1698e8d8bef9SDimitry Andric       .lower();
16990b57cec5SDimitry Andric   }
17000b57cec5SDimitry Andric 
17010b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
17020b57cec5SDimitry Andric     .unsupportedIf([=](const LegalityQuery &Query) {
17030b57cec5SDimitry Andric         const LLT &EltTy = Query.Types[1].getElementType();
17040b57cec5SDimitry Andric         return Query.Types[0] != EltTy;
17050b57cec5SDimitry Andric       });
17060b57cec5SDimitry Andric 
17070b57cec5SDimitry Andric   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
17080b57cec5SDimitry Andric     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
17090b57cec5SDimitry Andric     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
17100b57cec5SDimitry Andric 
17110b57cec5SDimitry Andric     // FIXME: Doesn't handle extract of illegal sizes.
17120b57cec5SDimitry Andric     getActionDefinitionsBuilder(Op)
17138bcb0991SDimitry Andric       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
17140eae32dcSDimitry Andric       .lowerIf([=](const LegalityQuery &Query) {
17150eae32dcSDimitry Andric           // Sub-vector(or single element) insert and extract.
17160eae32dcSDimitry Andric           // TODO: verify immediate offset here since lower only works with
17170eae32dcSDimitry Andric           // whole elements.
17180eae32dcSDimitry Andric           const LLT BigTy = Query.Types[BigTyIdx];
17190eae32dcSDimitry Andric           return BigTy.isVector();
17200eae32dcSDimitry Andric         })
17218bcb0991SDimitry Andric       // FIXME: Multiples of 16 should not be legal.
17220b57cec5SDimitry Andric       .legalIf([=](const LegalityQuery &Query) {
17230b57cec5SDimitry Andric           const LLT BigTy = Query.Types[BigTyIdx];
17240b57cec5SDimitry Andric           const LLT LitTy = Query.Types[LitTyIdx];
17250b57cec5SDimitry Andric           return (BigTy.getSizeInBits() % 32 == 0) &&
17260b57cec5SDimitry Andric                  (LitTy.getSizeInBits() % 16 == 0);
17270b57cec5SDimitry Andric         })
17280b57cec5SDimitry Andric       .widenScalarIf(
17290b57cec5SDimitry Andric         [=](const LegalityQuery &Query) {
17300b57cec5SDimitry Andric           const LLT BigTy = Query.Types[BigTyIdx];
17310b57cec5SDimitry Andric           return (BigTy.getScalarSizeInBits() < 16);
17320b57cec5SDimitry Andric         },
17330b57cec5SDimitry Andric         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
17340b57cec5SDimitry Andric       .widenScalarIf(
17350b57cec5SDimitry Andric         [=](const LegalityQuery &Query) {
17360b57cec5SDimitry Andric           const LLT LitTy = Query.Types[LitTyIdx];
17370b57cec5SDimitry Andric           return (LitTy.getScalarSizeInBits() < 16);
17380b57cec5SDimitry Andric         },
17390b57cec5SDimitry Andric         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
17400b57cec5SDimitry Andric       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
17410b57cec5SDimitry Andric       .widenScalarToNextPow2(BigTyIdx, 32);
17420b57cec5SDimitry Andric 
17430b57cec5SDimitry Andric   }
17440b57cec5SDimitry Andric 
17458bcb0991SDimitry Andric   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
17460b57cec5SDimitry Andric     .legalForCartesianProduct(AllS32Vectors, {S32})
17470b57cec5SDimitry Andric     .legalForCartesianProduct(AllS64Vectors, {S64})
17488bcb0991SDimitry Andric     .clampNumElements(0, V16S32, V32S32)
17498bcb0991SDimitry Andric     .clampNumElements(0, V2S64, V16S64)
1750*06c3fb27SDimitry Andric     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16))
1751*06c3fb27SDimitry Andric     .moreElementsIf(
1752*06c3fb27SDimitry Andric       isIllegalRegisterType(0),
1753*06c3fb27SDimitry Andric       moreElementsToNextExistingRegClass(0));
17548bcb0991SDimitry Andric 
17558bcb0991SDimitry Andric   if (ST.hasScalarPackInsts()) {
17565ffd83dbSDimitry Andric     BuildVector
17575ffd83dbSDimitry Andric       // FIXME: Should probably widen s1 vectors straight to s32
17585ffd83dbSDimitry Andric       .minScalarOrElt(0, S16)
1759bdd1243dSDimitry Andric       .minScalar(1, S16);
17605ffd83dbSDimitry Andric 
17618bcb0991SDimitry Andric     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
17628bcb0991SDimitry Andric       .legalFor({V2S16, S32})
17638bcb0991SDimitry Andric       .lower();
17648bcb0991SDimitry Andric   } else {
17655ffd83dbSDimitry Andric     BuildVector.customFor({V2S16, S16});
17665ffd83dbSDimitry Andric     BuildVector.minScalarOrElt(0, S32);
17675ffd83dbSDimitry Andric 
17688bcb0991SDimitry Andric     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
17695ffd83dbSDimitry Andric       .customFor({V2S16, S32})
17708bcb0991SDimitry Andric       .lower();
17718bcb0991SDimitry Andric   }
17728bcb0991SDimitry Andric 
17735ffd83dbSDimitry Andric   BuildVector.legalIf(isRegisterType(0));
17745ffd83dbSDimitry Andric 
17755ffd83dbSDimitry Andric   // FIXME: Clamp maximum size
17760b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1777e8d8bef9SDimitry Andric     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1778e8d8bef9SDimitry Andric     .clampMaxNumElements(0, S32, 32)
1779e8d8bef9SDimitry Andric     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1780e8d8bef9SDimitry Andric     .clampMaxNumElements(0, S16, 64);
17810b57cec5SDimitry Andric 
17828bcb0991SDimitry Andric   getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
17838bcb0991SDimitry Andric 
17840b57cec5SDimitry Andric   // Merge/Unmerge
17850b57cec5SDimitry Andric   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
17860b57cec5SDimitry Andric     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
17870b57cec5SDimitry Andric     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
17880b57cec5SDimitry Andric 
17890b57cec5SDimitry Andric     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
17905ffd83dbSDimitry Andric       const LLT Ty = Query.Types[TypeIdx];
17910b57cec5SDimitry Andric       if (Ty.isVector()) {
17920b57cec5SDimitry Andric         const LLT &EltTy = Ty.getElementType();
17935ffd83dbSDimitry Andric         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
17940b57cec5SDimitry Andric           return true;
1795*06c3fb27SDimitry Andric         if (!llvm::has_single_bit<uint32_t>(EltTy.getSizeInBits()))
17960b57cec5SDimitry Andric           return true;
17970b57cec5SDimitry Andric       }
17980b57cec5SDimitry Andric       return false;
17990b57cec5SDimitry Andric     };
18000b57cec5SDimitry Andric 
18018bcb0991SDimitry Andric     auto &Builder = getActionDefinitionsBuilder(Op)
1802e8d8bef9SDimitry Andric       .legalIf(all(isRegisterType(0), isRegisterType(1)))
18035ffd83dbSDimitry Andric       .lowerFor({{S16, V2S16}})
18045ffd83dbSDimitry Andric       .lowerIf([=](const LegalityQuery &Query) {
18055ffd83dbSDimitry Andric           const LLT BigTy = Query.Types[BigTyIdx];
18065ffd83dbSDimitry Andric           return BigTy.getSizeInBits() == 32;
18075ffd83dbSDimitry Andric         })
18085ffd83dbSDimitry Andric       // Try to widen to s16 first for small types.
18095ffd83dbSDimitry Andric       // TODO: Only do this on targets with legal s16 shifts
18105ffd83dbSDimitry Andric       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
18110b57cec5SDimitry Andric       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
18128bcb0991SDimitry Andric       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
18138bcb0991SDimitry Andric       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
18148bcb0991SDimitry Andric                            elementTypeIs(1, S16)),
18158bcb0991SDimitry Andric                        changeTo(1, V2S16))
18165ffd83dbSDimitry Andric       // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
18175ffd83dbSDimitry Andric       // worth considering the multiples of 64 since 2*192 and 2*384 are not
18185ffd83dbSDimitry Andric       // valid.
18195ffd83dbSDimitry Andric       .clampScalar(LitTyIdx, S32, S512)
18205ffd83dbSDimitry Andric       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
18210b57cec5SDimitry Andric       // Break up vectors with weird elements into scalars
18220b57cec5SDimitry Andric       .fewerElementsIf(
18235ffd83dbSDimitry Andric         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
18240b57cec5SDimitry Andric         scalarize(0))
18250b57cec5SDimitry Andric       .fewerElementsIf(
18265ffd83dbSDimitry Andric         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
18270b57cec5SDimitry Andric         scalarize(1))
18285ffd83dbSDimitry Andric       .clampScalar(BigTyIdx, S32, MaxScalar);
18298bcb0991SDimitry Andric 
18308bcb0991SDimitry Andric     if (Op == G_MERGE_VALUES) {
18318bcb0991SDimitry Andric       Builder.widenScalarIf(
18328bcb0991SDimitry Andric         // TODO: Use 16-bit shifts if legal for 8-bit values?
18330b57cec5SDimitry Andric         [=](const LegalityQuery &Query) {
18348bcb0991SDimitry Andric           const LLT Ty = Query.Types[LitTyIdx];
18358bcb0991SDimitry Andric           return Ty.getSizeInBits() < 32;
18368bcb0991SDimitry Andric         },
18378bcb0991SDimitry Andric         changeTo(LitTyIdx, S32));
18388bcb0991SDimitry Andric     }
18398bcb0991SDimitry Andric 
18408bcb0991SDimitry Andric     Builder.widenScalarIf(
18418bcb0991SDimitry Andric       [=](const LegalityQuery &Query) {
18428bcb0991SDimitry Andric         const LLT Ty = Query.Types[BigTyIdx];
1843*06c3fb27SDimitry Andric         return Ty.getSizeInBits() % 16 != 0;
18440b57cec5SDimitry Andric       },
18450b57cec5SDimitry Andric       [=](const LegalityQuery &Query) {
18460b57cec5SDimitry Andric         // Pick the next power of 2, or a multiple of 64 over 128.
18470b57cec5SDimitry Andric         // Whichever is smaller.
18480b57cec5SDimitry Andric         const LLT &Ty = Query.Types[BigTyIdx];
18490b57cec5SDimitry Andric         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
18500b57cec5SDimitry Andric         if (NewSizeInBits >= 256) {
18510b57cec5SDimitry Andric           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
18520b57cec5SDimitry Andric           if (RoundedTo < NewSizeInBits)
18530b57cec5SDimitry Andric             NewSizeInBits = RoundedTo;
18540b57cec5SDimitry Andric         }
1855bdd1243dSDimitry Andric         return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
18560b57cec5SDimitry Andric       })
18570b57cec5SDimitry Andric       // Any vectors left are the wrong size. Scalarize them.
18580b57cec5SDimitry Andric       .scalarize(0)
18590b57cec5SDimitry Andric       .scalarize(1);
18600b57cec5SDimitry Andric   }
18610b57cec5SDimitry Andric 
18625ffd83dbSDimitry Andric   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
18635ffd83dbSDimitry Andric   // RegBankSelect.
18645ffd83dbSDimitry Andric   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
18655ffd83dbSDimitry Andric     .legalFor({{S32}, {S64}});
18668bcb0991SDimitry Andric 
18675ffd83dbSDimitry Andric   if (ST.hasVOP3PInsts()) {
18685ffd83dbSDimitry Andric     SextInReg.lowerFor({{V2S16}})
18695ffd83dbSDimitry Andric       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
18705ffd83dbSDimitry Andric       // get more vector shift opportunities, since we'll get those when
18715ffd83dbSDimitry Andric       // expanded.
18720eae32dcSDimitry Andric       .clampMaxNumElementsStrict(0, S16, 2);
18735ffd83dbSDimitry Andric   } else if (ST.has16BitInsts()) {
18745ffd83dbSDimitry Andric     SextInReg.lowerFor({{S32}, {S64}, {S16}});
18755ffd83dbSDimitry Andric   } else {
18765ffd83dbSDimitry Andric     // Prefer to promote to s32 before lowering if we don't have 16-bit
18775ffd83dbSDimitry Andric     // shifts. This avoids a lot of intermediate truncate and extend operations.
18785ffd83dbSDimitry Andric     SextInReg.lowerFor({{S32}, {S64}});
18795ffd83dbSDimitry Andric   }
18805ffd83dbSDimitry Andric 
18815ffd83dbSDimitry Andric   SextInReg
18825ffd83dbSDimitry Andric     .scalarize(0)
18835ffd83dbSDimitry Andric     .clampScalar(0, S32, S64)
18845ffd83dbSDimitry Andric     .lower();
18855ffd83dbSDimitry Andric 
1886349cc55cSDimitry Andric   getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1887349cc55cSDimitry Andric     .scalarize(0)
1888349cc55cSDimitry Andric     .lower();
1889349cc55cSDimitry Andric 
1890fe6060f1SDimitry Andric   // TODO: Only try to form v2s16 with legal packed instructions.
18915ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_FSHR)
18925ffd83dbSDimitry Andric     .legalFor({{S32, S32}})
1893fe6060f1SDimitry Andric     .lowerFor({{V2S16, V2S16}})
18940eae32dcSDimitry Andric     .clampMaxNumElementsStrict(0, S16, 2)
18955ffd83dbSDimitry Andric     .scalarize(0)
18965ffd83dbSDimitry Andric     .lower();
1897480093f4SDimitry Andric 
1898fe6060f1SDimitry Andric   if (ST.hasVOP3PInsts()) {
1899fe6060f1SDimitry Andric     getActionDefinitionsBuilder(G_FSHL)
1900fe6060f1SDimitry Andric       .lowerFor({{V2S16, V2S16}})
19010eae32dcSDimitry Andric       .clampMaxNumElementsStrict(0, S16, 2)
1902fe6060f1SDimitry Andric       .scalarize(0)
1903fe6060f1SDimitry Andric       .lower();
1904fe6060f1SDimitry Andric   } else {
1905fe6060f1SDimitry Andric     getActionDefinitionsBuilder(G_FSHL)
1906fe6060f1SDimitry Andric       .scalarize(0)
1907fe6060f1SDimitry Andric       .lower();
1908fe6060f1SDimitry Andric   }
1909fe6060f1SDimitry Andric 
1910480093f4SDimitry Andric   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1911480093f4SDimitry Andric     .legalFor({S64});
1912480093f4SDimitry Andric 
1913e8d8bef9SDimitry Andric   getActionDefinitionsBuilder(G_FENCE)
1914e8d8bef9SDimitry Andric     .alwaysLegal();
1915e8d8bef9SDimitry Andric 
1916fe6060f1SDimitry Andric   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1917fe6060f1SDimitry Andric       .scalarize(0)
1918fe6060f1SDimitry Andric       .minScalar(0, S32)
1919fe6060f1SDimitry Andric       .lower();
1920fe6060f1SDimitry Andric 
1921fe6060f1SDimitry Andric   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1922fe6060f1SDimitry Andric       .legalFor({{S32, S32}, {S64, S32}})
1923fe6060f1SDimitry Andric       .clampScalar(1, S32, S32)
1924fe6060f1SDimitry Andric       .clampScalar(0, S32, S64)
1925fe6060f1SDimitry Andric       .widenScalarToNextPow2(0)
1926fe6060f1SDimitry Andric       .scalarize(0);
1927fe6060f1SDimitry Andric 
19285ffd83dbSDimitry Andric   getActionDefinitionsBuilder({
19295ffd83dbSDimitry Andric       // TODO: Verify V_BFI_B32 is generated from expanded bit ops
19305ffd83dbSDimitry Andric       G_FCOPYSIGN,
19315ffd83dbSDimitry Andric 
19325ffd83dbSDimitry Andric       G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1933e8d8bef9SDimitry Andric       G_ATOMICRMW_NAND,
1934e8d8bef9SDimitry Andric       G_ATOMICRMW_FSUB,
19355ffd83dbSDimitry Andric       G_READ_REGISTER,
19365ffd83dbSDimitry Andric       G_WRITE_REGISTER,
19375ffd83dbSDimitry Andric 
19385ffd83dbSDimitry Andric       G_SADDO, G_SSUBO,
19395ffd83dbSDimitry Andric 
19405ffd83dbSDimitry Andric        // TODO: Implement
1941fe6060f1SDimitry Andric       G_FMINIMUM, G_FMAXIMUM}).lower();
19425ffd83dbSDimitry Andric 
1943349cc55cSDimitry Andric   getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
1944349cc55cSDimitry Andric       .lower();
1945349cc55cSDimitry Andric 
1946480093f4SDimitry Andric   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
19475ffd83dbSDimitry Andric         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1948480093f4SDimitry Andric         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1949480093f4SDimitry Andric     .unsupported();
1950480093f4SDimitry Andric 
1951fe6060f1SDimitry Andric   getLegacyLegalizerInfo().computeTables();
19520b57cec5SDimitry Andric   verify(*ST.getInstrInfo());
19530b57cec5SDimitry Andric }
19540b57cec5SDimitry Andric 
19555ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
19565ffd83dbSDimitry Andric                                          MachineInstr &MI) const {
19575ffd83dbSDimitry Andric   MachineIRBuilder &B = Helper.MIRBuilder;
19585ffd83dbSDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
19595ffd83dbSDimitry Andric 
19600b57cec5SDimitry Andric   switch (MI.getOpcode()) {
19610b57cec5SDimitry Andric   case TargetOpcode::G_ADDRSPACE_CAST:
19628bcb0991SDimitry Andric     return legalizeAddrSpaceCast(MI, MRI, B);
19630b57cec5SDimitry Andric   case TargetOpcode::G_FRINT:
19648bcb0991SDimitry Andric     return legalizeFrint(MI, MRI, B);
19650b57cec5SDimitry Andric   case TargetOpcode::G_FCEIL:
19668bcb0991SDimitry Andric     return legalizeFceil(MI, MRI, B);
1967e8d8bef9SDimitry Andric   case TargetOpcode::G_FREM:
1968e8d8bef9SDimitry Andric     return legalizeFrem(MI, MRI, B);
19690b57cec5SDimitry Andric   case TargetOpcode::G_INTRINSIC_TRUNC:
19708bcb0991SDimitry Andric     return legalizeIntrinsicTrunc(MI, MRI, B);
19710b57cec5SDimitry Andric   case TargetOpcode::G_SITOFP:
19728bcb0991SDimitry Andric     return legalizeITOFP(MI, MRI, B, true);
19730b57cec5SDimitry Andric   case TargetOpcode::G_UITOFP:
19748bcb0991SDimitry Andric     return legalizeITOFP(MI, MRI, B, false);
19755ffd83dbSDimitry Andric   case TargetOpcode::G_FPTOSI:
19765ffd83dbSDimitry Andric     return legalizeFPTOI(MI, MRI, B, true);
19775ffd83dbSDimitry Andric   case TargetOpcode::G_FPTOUI:
19785ffd83dbSDimitry Andric     return legalizeFPTOI(MI, MRI, B, false);
19790b57cec5SDimitry Andric   case TargetOpcode::G_FMINNUM:
19800b57cec5SDimitry Andric   case TargetOpcode::G_FMAXNUM:
19810b57cec5SDimitry Andric   case TargetOpcode::G_FMINNUM_IEEE:
19820b57cec5SDimitry Andric   case TargetOpcode::G_FMAXNUM_IEEE:
19835ffd83dbSDimitry Andric     return legalizeMinNumMaxNum(Helper, MI);
19840b57cec5SDimitry Andric   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
19858bcb0991SDimitry Andric     return legalizeExtractVectorElt(MI, MRI, B);
19860b57cec5SDimitry Andric   case TargetOpcode::G_INSERT_VECTOR_ELT:
19878bcb0991SDimitry Andric     return legalizeInsertVectorElt(MI, MRI, B);
19888bcb0991SDimitry Andric   case TargetOpcode::G_FSIN:
19898bcb0991SDimitry Andric   case TargetOpcode::G_FCOS:
19908bcb0991SDimitry Andric     return legalizeSinCos(MI, MRI, B);
19918bcb0991SDimitry Andric   case TargetOpcode::G_GLOBAL_VALUE:
19928bcb0991SDimitry Andric     return legalizeGlobalValue(MI, MRI, B);
19938bcb0991SDimitry Andric   case TargetOpcode::G_LOAD:
1994fe6060f1SDimitry Andric   case TargetOpcode::G_SEXTLOAD:
1995fe6060f1SDimitry Andric   case TargetOpcode::G_ZEXTLOAD:
1996e8d8bef9SDimitry Andric     return legalizeLoad(Helper, MI);
1997*06c3fb27SDimitry Andric   case TargetOpcode::G_STORE:
1998*06c3fb27SDimitry Andric     return legalizeStore(Helper, MI);
19998bcb0991SDimitry Andric   case TargetOpcode::G_FMAD:
20008bcb0991SDimitry Andric     return legalizeFMad(MI, MRI, B);
20018bcb0991SDimitry Andric   case TargetOpcode::G_FDIV:
20028bcb0991SDimitry Andric     return legalizeFDIV(MI, MRI, B);
2003*06c3fb27SDimitry Andric   case TargetOpcode::G_FFREXP:
2004*06c3fb27SDimitry Andric     return legalizeFFREXP(MI, MRI, B);
2005*06c3fb27SDimitry Andric   case TargetOpcode::G_FSQRT:
2006*06c3fb27SDimitry Andric     return legalizeFSQRT(MI, MRI, B);
20075ffd83dbSDimitry Andric   case TargetOpcode::G_UDIV:
20085ffd83dbSDimitry Andric   case TargetOpcode::G_UREM:
2009fe6060f1SDimitry Andric   case TargetOpcode::G_UDIVREM:
2010fe6060f1SDimitry Andric     return legalizeUnsignedDIV_REM(MI, MRI, B);
20115ffd83dbSDimitry Andric   case TargetOpcode::G_SDIV:
20125ffd83dbSDimitry Andric   case TargetOpcode::G_SREM:
2013fe6060f1SDimitry Andric   case TargetOpcode::G_SDIVREM:
2014fe6060f1SDimitry Andric     return legalizeSignedDIV_REM(MI, MRI, B);
2015480093f4SDimitry Andric   case TargetOpcode::G_ATOMIC_CMPXCHG:
2016480093f4SDimitry Andric     return legalizeAtomicCmpXChg(MI, MRI, B);
2017*06c3fb27SDimitry Andric   case TargetOpcode::G_FLOG2:
2018*06c3fb27SDimitry Andric     return legalizeFlog2(MI, B);
20195ffd83dbSDimitry Andric   case TargetOpcode::G_FLOG:
20205ffd83dbSDimitry Andric   case TargetOpcode::G_FLOG10:
2021*06c3fb27SDimitry Andric     return legalizeFlogCommon(MI, B);
2022*06c3fb27SDimitry Andric   case TargetOpcode::G_FEXP2:
2023*06c3fb27SDimitry Andric     return legalizeFExp2(MI, B);
20245ffd83dbSDimitry Andric   case TargetOpcode::G_FEXP:
20255ffd83dbSDimitry Andric     return legalizeFExp(MI, B);
20265ffd83dbSDimitry Andric   case TargetOpcode::G_FPOW:
20275ffd83dbSDimitry Andric     return legalizeFPow(MI, B);
20285ffd83dbSDimitry Andric   case TargetOpcode::G_FFLOOR:
20295ffd83dbSDimitry Andric     return legalizeFFloor(MI, MRI, B);
20305ffd83dbSDimitry Andric   case TargetOpcode::G_BUILD_VECTOR:
2031bdd1243dSDimitry Andric   case TargetOpcode::G_BUILD_VECTOR_TRUNC:
20325ffd83dbSDimitry Andric     return legalizeBuildVector(MI, MRI, B);
203381ad6265SDimitry Andric   case TargetOpcode::G_MUL:
203481ad6265SDimitry Andric     return legalizeMul(Helper, MI);
2035349cc55cSDimitry Andric   case TargetOpcode::G_CTLZ:
2036349cc55cSDimitry Andric   case TargetOpcode::G_CTTZ:
2037349cc55cSDimitry Andric     return legalizeCTLZ_CTTZ(MI, MRI, B);
203881ad6265SDimitry Andric   case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
203981ad6265SDimitry Andric     return legalizeFPTruncRound(MI, B);
20400b57cec5SDimitry Andric   default:
20410b57cec5SDimitry Andric     return false;
20420b57cec5SDimitry Andric   }
20430b57cec5SDimitry Andric 
20440b57cec5SDimitry Andric   llvm_unreachable("expected switch to return");
20450b57cec5SDimitry Andric }
20460b57cec5SDimitry Andric 
20470b57cec5SDimitry Andric Register AMDGPULegalizerInfo::getSegmentAperture(
20480b57cec5SDimitry Andric   unsigned AS,
20490b57cec5SDimitry Andric   MachineRegisterInfo &MRI,
20508bcb0991SDimitry Andric   MachineIRBuilder &B) const {
20518bcb0991SDimitry Andric   MachineFunction &MF = B.getMF();
20520b57cec5SDimitry Andric   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
20530b57cec5SDimitry Andric   const LLT S32 = LLT::scalar(32);
2054bdd1243dSDimitry Andric   const LLT S64 = LLT::scalar(64);
20550b57cec5SDimitry Andric 
20568bcb0991SDimitry Andric   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
20578bcb0991SDimitry Andric 
20580b57cec5SDimitry Andric   if (ST.hasApertureRegs()) {
2059bdd1243dSDimitry Andric     // Note: this register is somewhat broken. When used as a 32-bit operand,
2060bdd1243dSDimitry Andric     // it only returns zeroes. The real value is in the upper 32 bits.
2061bdd1243dSDimitry Andric     // Thus, we must emit extract the high 32 bits.
2062bdd1243dSDimitry Andric     const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
2063bdd1243dSDimitry Andric                                        ? AMDGPU::SRC_SHARED_BASE
2064bdd1243dSDimitry Andric                                        : AMDGPU::SRC_PRIVATE_BASE;
2065bdd1243dSDimitry Andric     // FIXME: It would be more natural to emit a COPY here, but then copy
2066bdd1243dSDimitry Andric     // coalescing would kick in and it would think it's okay to use the "HI"
2067bdd1243dSDimitry Andric     // subregister (instead of extracting the HI 32 bits) which is an artificial
2068bdd1243dSDimitry Andric     // (unusable) register.
2069bdd1243dSDimitry Andric     //  Register TableGen definitions would need an overhaul to get rid of the
2070bdd1243dSDimitry Andric     //  artificial "HI" aperture registers and prevent this kind of issue from
2071bdd1243dSDimitry Andric     //  happening.
2072bdd1243dSDimitry Andric     Register Dst = MRI.createGenericVirtualRegister(S64);
2073bdd1243dSDimitry Andric     MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
2074bdd1243dSDimitry Andric     B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
2075bdd1243dSDimitry Andric     return B.buildUnmerge(S32, Dst).getReg(1);
20760b57cec5SDimitry Andric   }
20770b57cec5SDimitry Andric 
207881ad6265SDimitry Andric   // TODO: can we be smarter about machine pointer info?
207981ad6265SDimitry Andric   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
208081ad6265SDimitry Andric   Register LoadAddr = MRI.createGenericVirtualRegister(
208181ad6265SDimitry Andric     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
208281ad6265SDimitry Andric   // For code object version 5, private_base and shared_base are passed through
208381ad6265SDimitry Andric   // implicit kernargs.
2084*06c3fb27SDimitry Andric   if (AMDGPU::getCodeObjectVersion(*MF.getFunction().getParent()) >=
2085*06c3fb27SDimitry Andric       AMDGPU::AMDHSA_COV5) {
208681ad6265SDimitry Andric     AMDGPUTargetLowering::ImplicitParameter Param =
208781ad6265SDimitry Andric         AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
208881ad6265SDimitry Andric                                       : AMDGPUTargetLowering::PRIVATE_BASE;
208981ad6265SDimitry Andric     uint64_t Offset =
209081ad6265SDimitry Andric         ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
209181ad6265SDimitry Andric 
209281ad6265SDimitry Andric     Register KernargPtrReg = MRI.createGenericVirtualRegister(
209381ad6265SDimitry Andric         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
209481ad6265SDimitry Andric 
209581ad6265SDimitry Andric     if (!loadInputValue(KernargPtrReg, B,
209681ad6265SDimitry Andric                         AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
209781ad6265SDimitry Andric       return Register();
209881ad6265SDimitry Andric 
209981ad6265SDimitry Andric     MachineMemOperand *MMO = MF.getMachineMemOperand(
210081ad6265SDimitry Andric         PtrInfo,
210181ad6265SDimitry Andric         MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
210281ad6265SDimitry Andric             MachineMemOperand::MOInvariant,
210381ad6265SDimitry Andric         LLT::scalar(32), commonAlignment(Align(64), Offset));
210481ad6265SDimitry Andric 
210581ad6265SDimitry Andric     // Pointer address
210681ad6265SDimitry Andric     B.buildPtrAdd(LoadAddr, KernargPtrReg,
210781ad6265SDimitry Andric                   B.buildConstant(LLT::scalar(64), Offset).getReg(0));
210881ad6265SDimitry Andric     // Load address
210981ad6265SDimitry Andric     return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
211081ad6265SDimitry Andric   }
211181ad6265SDimitry Andric 
21120b57cec5SDimitry Andric   Register QueuePtr = MRI.createGenericVirtualRegister(
21130b57cec5SDimitry Andric     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
21140b57cec5SDimitry Andric 
2115e8d8bef9SDimitry Andric   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
21168bcb0991SDimitry Andric     return Register();
21170b57cec5SDimitry Andric 
21180b57cec5SDimitry Andric   // Offset into amd_queue_t for group_segment_aperture_base_hi /
21190b57cec5SDimitry Andric   // private_segment_aperture_base_hi.
21200b57cec5SDimitry Andric   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
21210b57cec5SDimitry Andric 
21220b57cec5SDimitry Andric   MachineMemOperand *MMO = MF.getMachineMemOperand(
21230b57cec5SDimitry Andric       PtrInfo,
21245ffd83dbSDimitry Andric       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
21250b57cec5SDimitry Andric           MachineMemOperand::MOInvariant,
2126fe6060f1SDimitry Andric       LLT::scalar(32), commonAlignment(Align(64), StructOffset));
21270b57cec5SDimitry Andric 
212881ad6265SDimitry Andric   B.buildPtrAdd(LoadAddr, QueuePtr,
212981ad6265SDimitry Andric                 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
21305ffd83dbSDimitry Andric   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
21310b57cec5SDimitry Andric }
21320b57cec5SDimitry Andric 
213304eeddc0SDimitry Andric /// Return true if the value is a known valid address, such that a null check is
213404eeddc0SDimitry Andric /// not necessary.
213504eeddc0SDimitry Andric static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
213604eeddc0SDimitry Andric                            const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
213704eeddc0SDimitry Andric   MachineInstr *Def = MRI.getVRegDef(Val);
213804eeddc0SDimitry Andric   switch (Def->getOpcode()) {
213904eeddc0SDimitry Andric   case AMDGPU::G_FRAME_INDEX:
214004eeddc0SDimitry Andric   case AMDGPU::G_GLOBAL_VALUE:
214104eeddc0SDimitry Andric   case AMDGPU::G_BLOCK_ADDR:
214204eeddc0SDimitry Andric     return true;
214304eeddc0SDimitry Andric   case AMDGPU::G_CONSTANT: {
214404eeddc0SDimitry Andric     const ConstantInt *CI = Def->getOperand(1).getCImm();
214504eeddc0SDimitry Andric     return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
214604eeddc0SDimitry Andric   }
214704eeddc0SDimitry Andric   default:
214804eeddc0SDimitry Andric     return false;
214904eeddc0SDimitry Andric   }
215004eeddc0SDimitry Andric 
215104eeddc0SDimitry Andric   return false;
215204eeddc0SDimitry Andric }
215304eeddc0SDimitry Andric 
21540b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
21550b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
21568bcb0991SDimitry Andric   MachineIRBuilder &B) const {
21578bcb0991SDimitry Andric   MachineFunction &MF = B.getMF();
21580b57cec5SDimitry Andric 
21598bcb0991SDimitry Andric   const LLT S32 = LLT::scalar(32);
21600b57cec5SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
21610b57cec5SDimitry Andric   Register Src = MI.getOperand(1).getReg();
21620b57cec5SDimitry Andric 
21630b57cec5SDimitry Andric   LLT DstTy = MRI.getType(Dst);
21640b57cec5SDimitry Andric   LLT SrcTy = MRI.getType(Src);
21650b57cec5SDimitry Andric   unsigned DestAS = DstTy.getAddressSpace();
21660b57cec5SDimitry Andric   unsigned SrcAS = SrcTy.getAddressSpace();
21670b57cec5SDimitry Andric 
21680b57cec5SDimitry Andric   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
21690b57cec5SDimitry Andric   // vector element.
21700b57cec5SDimitry Andric   assert(!DstTy.isVector());
21710b57cec5SDimitry Andric 
21720b57cec5SDimitry Andric   const AMDGPUTargetMachine &TM
21730b57cec5SDimitry Andric     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
21740b57cec5SDimitry Andric 
2175e8d8bef9SDimitry Andric   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
21768bcb0991SDimitry Andric     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
21778bcb0991SDimitry Andric     return true;
21788bcb0991SDimitry Andric   }
21798bcb0991SDimitry Andric 
218081ad6265SDimitry Andric   if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
218181ad6265SDimitry Andric       (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
218281ad6265SDimitry Andric        DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
218304eeddc0SDimitry Andric     if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
218404eeddc0SDimitry Andric       // Extract low 32-bits of the pointer.
218504eeddc0SDimitry Andric       B.buildExtract(Dst, Src, 0);
218604eeddc0SDimitry Andric       MI.eraseFromParent();
218704eeddc0SDimitry Andric       return true;
218804eeddc0SDimitry Andric     }
218904eeddc0SDimitry Andric 
21900b57cec5SDimitry Andric     unsigned NullVal = TM.getNullPointerValue(DestAS);
21910b57cec5SDimitry Andric 
21928bcb0991SDimitry Andric     auto SegmentNull = B.buildConstant(DstTy, NullVal);
21938bcb0991SDimitry Andric     auto FlatNull = B.buildConstant(SrcTy, 0);
21940b57cec5SDimitry Andric 
21950b57cec5SDimitry Andric     // Extract low 32-bits of the pointer.
21965ffd83dbSDimitry Andric     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
21970b57cec5SDimitry Andric 
21985ffd83dbSDimitry Andric     auto CmpRes =
21995ffd83dbSDimitry Andric         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
22008bcb0991SDimitry Andric     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
22010b57cec5SDimitry Andric 
22020b57cec5SDimitry Andric     MI.eraseFromParent();
22030b57cec5SDimitry Andric     return true;
22040b57cec5SDimitry Andric   }
22050b57cec5SDimitry Andric 
220681ad6265SDimitry Andric   if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
220781ad6265SDimitry Andric       (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
220881ad6265SDimitry Andric        SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
22098bcb0991SDimitry Andric     Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
22108bcb0991SDimitry Andric     if (!ApertureReg.isValid())
22118bcb0991SDimitry Andric       return false;
22120b57cec5SDimitry Andric 
22130b57cec5SDimitry Andric     // Coerce the type of the low half of the result so we can use merge_values.
22145ffd83dbSDimitry Andric     Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
22150b57cec5SDimitry Andric 
22160b57cec5SDimitry Andric     // TODO: Should we allow mismatched types but matching sizes in merges to
22170b57cec5SDimitry Andric     // avoid the ptrtoint?
2218bdd1243dSDimitry Andric     auto BuildPtr = B.buildMergeLikeInstr(DstTy, {SrcAsInt, ApertureReg});
221904eeddc0SDimitry Andric 
222004eeddc0SDimitry Andric     if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
222104eeddc0SDimitry Andric       B.buildCopy(Dst, BuildPtr);
222204eeddc0SDimitry Andric       MI.eraseFromParent();
222304eeddc0SDimitry Andric       return true;
222404eeddc0SDimitry Andric     }
222504eeddc0SDimitry Andric 
222604eeddc0SDimitry Andric     auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
222704eeddc0SDimitry Andric     auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
222804eeddc0SDimitry Andric 
222981ad6265SDimitry Andric     auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
223081ad6265SDimitry Andric                               SegmentNull.getReg(0));
223104eeddc0SDimitry Andric 
22325ffd83dbSDimitry Andric     B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
22330b57cec5SDimitry Andric 
22340b57cec5SDimitry Andric     MI.eraseFromParent();
22350b57cec5SDimitry Andric     return true;
22360b57cec5SDimitry Andric   }
22370b57cec5SDimitry Andric 
223881ad6265SDimitry Andric   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
223981ad6265SDimitry Andric       SrcTy.getSizeInBits() == 64) {
224081ad6265SDimitry Andric     // Truncate.
224181ad6265SDimitry Andric     B.buildExtract(Dst, Src, 0);
224281ad6265SDimitry Andric     MI.eraseFromParent();
224381ad6265SDimitry Andric     return true;
224481ad6265SDimitry Andric   }
224581ad6265SDimitry Andric 
224681ad6265SDimitry Andric   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
224781ad6265SDimitry Andric       DstTy.getSizeInBits() == 64) {
224881ad6265SDimitry Andric     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
224981ad6265SDimitry Andric     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2250bdd1243dSDimitry Andric     auto PtrLo = B.buildPtrToInt(S32, Src);
2251bdd1243dSDimitry Andric     auto HighAddr = B.buildConstant(S32, AddrHiVal);
2252bdd1243dSDimitry Andric     B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
225381ad6265SDimitry Andric     MI.eraseFromParent();
225481ad6265SDimitry Andric     return true;
225581ad6265SDimitry Andric   }
225681ad6265SDimitry Andric 
225781ad6265SDimitry Andric   DiagnosticInfoUnsupported InvalidAddrSpaceCast(
225881ad6265SDimitry Andric       MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
225981ad6265SDimitry Andric 
226081ad6265SDimitry Andric   LLVMContext &Ctx = MF.getFunction().getContext();
226181ad6265SDimitry Andric   Ctx.diagnose(InvalidAddrSpaceCast);
226281ad6265SDimitry Andric   B.buildUndef(Dst);
226381ad6265SDimitry Andric   MI.eraseFromParent();
226481ad6265SDimitry Andric   return true;
226581ad6265SDimitry Andric }
226681ad6265SDimitry Andric 
22670b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeFrint(
22680b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
22698bcb0991SDimitry Andric   MachineIRBuilder &B) const {
22700b57cec5SDimitry Andric   Register Src = MI.getOperand(1).getReg();
22710b57cec5SDimitry Andric   LLT Ty = MRI.getType(Src);
22720b57cec5SDimitry Andric   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
22730b57cec5SDimitry Andric 
22740b57cec5SDimitry Andric   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
22750b57cec5SDimitry Andric   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
22760b57cec5SDimitry Andric 
22778bcb0991SDimitry Andric   auto C1 = B.buildFConstant(Ty, C1Val);
22788bcb0991SDimitry Andric   auto CopySign = B.buildFCopysign(Ty, C1, Src);
22790b57cec5SDimitry Andric 
22800b57cec5SDimitry Andric   // TODO: Should this propagate fast-math-flags?
22818bcb0991SDimitry Andric   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
22828bcb0991SDimitry Andric   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
22830b57cec5SDimitry Andric 
22848bcb0991SDimitry Andric   auto C2 = B.buildFConstant(Ty, C2Val);
22858bcb0991SDimitry Andric   auto Fabs = B.buildFAbs(Ty, Src);
22860b57cec5SDimitry Andric 
22878bcb0991SDimitry Andric   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
22888bcb0991SDimitry Andric   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2289e8d8bef9SDimitry Andric   MI.eraseFromParent();
22900b57cec5SDimitry Andric   return true;
22910b57cec5SDimitry Andric }
22920b57cec5SDimitry Andric 
22930b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeFceil(
22940b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
22950b57cec5SDimitry Andric   MachineIRBuilder &B) const {
22960b57cec5SDimitry Andric 
22970b57cec5SDimitry Andric   const LLT S1 = LLT::scalar(1);
22980b57cec5SDimitry Andric   const LLT S64 = LLT::scalar(64);
22990b57cec5SDimitry Andric 
23000b57cec5SDimitry Andric   Register Src = MI.getOperand(1).getReg();
23010b57cec5SDimitry Andric   assert(MRI.getType(Src) == S64);
23020b57cec5SDimitry Andric 
23030b57cec5SDimitry Andric   // result = trunc(src)
23040b57cec5SDimitry Andric   // if (src > 0.0 && src != result)
23050b57cec5SDimitry Andric   //   result += 1.0
23060b57cec5SDimitry Andric 
23075ffd83dbSDimitry Andric   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
23080b57cec5SDimitry Andric 
23090b57cec5SDimitry Andric   const auto Zero = B.buildFConstant(S64, 0.0);
23100b57cec5SDimitry Andric   const auto One = B.buildFConstant(S64, 1.0);
23110b57cec5SDimitry Andric   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
23120b57cec5SDimitry Andric   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
23130b57cec5SDimitry Andric   auto And = B.buildAnd(S1, Lt0, NeTrunc);
23140b57cec5SDimitry Andric   auto Add = B.buildSelect(S64, And, One, Zero);
23150b57cec5SDimitry Andric 
23160b57cec5SDimitry Andric   // TODO: Should this propagate fast-math-flags?
23170b57cec5SDimitry Andric   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
231804eeddc0SDimitry Andric   MI.eraseFromParent();
23190b57cec5SDimitry Andric   return true;
23200b57cec5SDimitry Andric }
23210b57cec5SDimitry Andric 
2322e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeFrem(
2323e8d8bef9SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
2324e8d8bef9SDimitry Andric   MachineIRBuilder &B) const {
2325e8d8bef9SDimitry Andric     Register DstReg = MI.getOperand(0).getReg();
2326e8d8bef9SDimitry Andric     Register Src0Reg = MI.getOperand(1).getReg();
2327e8d8bef9SDimitry Andric     Register Src1Reg = MI.getOperand(2).getReg();
2328e8d8bef9SDimitry Andric     auto Flags = MI.getFlags();
2329e8d8bef9SDimitry Andric     LLT Ty = MRI.getType(DstReg);
2330e8d8bef9SDimitry Andric 
2331e8d8bef9SDimitry Andric     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2332e8d8bef9SDimitry Andric     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2333e8d8bef9SDimitry Andric     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2334e8d8bef9SDimitry Andric     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2335e8d8bef9SDimitry Andric     MI.eraseFromParent();
2336e8d8bef9SDimitry Andric     return true;
2337e8d8bef9SDimitry Andric }
2338e8d8bef9SDimitry Andric 
2339e8d8bef9SDimitry Andric static MachineInstrBuilder extractF64Exponent(Register Hi,
23400b57cec5SDimitry Andric                                               MachineIRBuilder &B) {
23410b57cec5SDimitry Andric   const unsigned FractBits = 52;
23420b57cec5SDimitry Andric   const unsigned ExpBits = 11;
23430b57cec5SDimitry Andric   LLT S32 = LLT::scalar(32);
23440b57cec5SDimitry Andric 
23450b57cec5SDimitry Andric   auto Const0 = B.buildConstant(S32, FractBits - 32);
23460b57cec5SDimitry Andric   auto Const1 = B.buildConstant(S32, ExpBits);
23470b57cec5SDimitry Andric 
23480b57cec5SDimitry Andric   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
2349e8d8bef9SDimitry Andric     .addUse(Hi)
23500b57cec5SDimitry Andric     .addUse(Const0.getReg(0))
23510b57cec5SDimitry Andric     .addUse(Const1.getReg(0));
23520b57cec5SDimitry Andric 
23530b57cec5SDimitry Andric   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
23540b57cec5SDimitry Andric }
23550b57cec5SDimitry Andric 
23560b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
23570b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
23580b57cec5SDimitry Andric   MachineIRBuilder &B) const {
23590b57cec5SDimitry Andric   const LLT S1 = LLT::scalar(1);
23600b57cec5SDimitry Andric   const LLT S32 = LLT::scalar(32);
23610b57cec5SDimitry Andric   const LLT S64 = LLT::scalar(64);
23620b57cec5SDimitry Andric 
23630b57cec5SDimitry Andric   Register Src = MI.getOperand(1).getReg();
23640b57cec5SDimitry Andric   assert(MRI.getType(Src) == S64);
23650b57cec5SDimitry Andric 
23660b57cec5SDimitry Andric   // TODO: Should this use extract since the low half is unused?
23670b57cec5SDimitry Andric   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
23680b57cec5SDimitry Andric   Register Hi = Unmerge.getReg(1);
23690b57cec5SDimitry Andric 
23700b57cec5SDimitry Andric   // Extract the upper half, since this is where we will find the sign and
23710b57cec5SDimitry Andric   // exponent.
23720b57cec5SDimitry Andric   auto Exp = extractF64Exponent(Hi, B);
23730b57cec5SDimitry Andric 
23740b57cec5SDimitry Andric   const unsigned FractBits = 52;
23750b57cec5SDimitry Andric 
23760b57cec5SDimitry Andric   // Extract the sign bit.
23770b57cec5SDimitry Andric   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
23780b57cec5SDimitry Andric   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
23790b57cec5SDimitry Andric 
23800b57cec5SDimitry Andric   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
23810b57cec5SDimitry Andric 
23820b57cec5SDimitry Andric   const auto Zero32 = B.buildConstant(S32, 0);
23830b57cec5SDimitry Andric 
23840b57cec5SDimitry Andric   // Extend back to 64-bits.
2385bdd1243dSDimitry Andric   auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});
23860b57cec5SDimitry Andric 
23870b57cec5SDimitry Andric   auto Shr = B.buildAShr(S64, FractMask, Exp);
23880b57cec5SDimitry Andric   auto Not = B.buildNot(S64, Shr);
23890b57cec5SDimitry Andric   auto Tmp0 = B.buildAnd(S64, Src, Not);
23900b57cec5SDimitry Andric   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
23910b57cec5SDimitry Andric 
23920b57cec5SDimitry Andric   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
23930b57cec5SDimitry Andric   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
23940b57cec5SDimitry Andric 
23950b57cec5SDimitry Andric   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
23960b57cec5SDimitry Andric   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2397e8d8bef9SDimitry Andric   MI.eraseFromParent();
23980b57cec5SDimitry Andric   return true;
23990b57cec5SDimitry Andric }
24000b57cec5SDimitry Andric 
24010b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeITOFP(
24020b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
24030b57cec5SDimitry Andric   MachineIRBuilder &B, bool Signed) const {
24040b57cec5SDimitry Andric 
24050b57cec5SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
24060b57cec5SDimitry Andric   Register Src = MI.getOperand(1).getReg();
24070b57cec5SDimitry Andric 
24080b57cec5SDimitry Andric   const LLT S64 = LLT::scalar(64);
24090b57cec5SDimitry Andric   const LLT S32 = LLT::scalar(32);
24100b57cec5SDimitry Andric 
2411349cc55cSDimitry Andric   assert(MRI.getType(Src) == S64);
24120b57cec5SDimitry Andric 
24130b57cec5SDimitry Andric   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2414349cc55cSDimitry Andric   auto ThirtyTwo = B.buildConstant(S32, 32);
24150b57cec5SDimitry Andric 
2416349cc55cSDimitry Andric   if (MRI.getType(Dst) == S64) {
2417349cc55cSDimitry Andric     auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2418349cc55cSDimitry Andric                         : B.buildUITOFP(S64, Unmerge.getReg(1));
24190b57cec5SDimitry Andric 
24200b57cec5SDimitry Andric     auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2421*06c3fb27SDimitry Andric     auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);
24220b57cec5SDimitry Andric 
24230b57cec5SDimitry Andric     // TODO: Should this propagate fast-math-flags?
24240b57cec5SDimitry Andric     B.buildFAdd(Dst, LdExp, CvtLo);
24250b57cec5SDimitry Andric     MI.eraseFromParent();
24260b57cec5SDimitry Andric     return true;
24270b57cec5SDimitry Andric   }
24280b57cec5SDimitry Andric 
2429349cc55cSDimitry Andric   assert(MRI.getType(Dst) == S32);
2430349cc55cSDimitry Andric 
2431349cc55cSDimitry Andric   auto One = B.buildConstant(S32, 1);
2432349cc55cSDimitry Andric 
2433349cc55cSDimitry Andric   MachineInstrBuilder ShAmt;
2434349cc55cSDimitry Andric   if (Signed) {
2435349cc55cSDimitry Andric     auto ThirtyOne = B.buildConstant(S32, 31);
2436349cc55cSDimitry Andric     auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2437349cc55cSDimitry Andric     auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2438349cc55cSDimitry Andric     auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2439349cc55cSDimitry Andric     auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
2440349cc55cSDimitry Andric                                /*HasSideEffects=*/false)
2441349cc55cSDimitry Andric                   .addUse(Unmerge.getReg(1));
2442349cc55cSDimitry Andric     auto LS2 = B.buildSub(S32, LS, One);
2443349cc55cSDimitry Andric     ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2444349cc55cSDimitry Andric   } else
2445349cc55cSDimitry Andric     ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2446349cc55cSDimitry Andric   auto Norm = B.buildShl(S64, Src, ShAmt);
2447349cc55cSDimitry Andric   auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2448349cc55cSDimitry Andric   auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2449349cc55cSDimitry Andric   auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2450349cc55cSDimitry Andric   auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2451349cc55cSDimitry Andric   auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2452*06c3fb27SDimitry Andric   B.buildFLdexp(Dst, FVal, Scale);
2453349cc55cSDimitry Andric   MI.eraseFromParent();
2454349cc55cSDimitry Andric   return true;
2455349cc55cSDimitry Andric }
2456349cc55cSDimitry Andric 
24575ffd83dbSDimitry Andric // TODO: Copied from DAG implementation. Verify logic and document how this
24585ffd83dbSDimitry Andric // actually works.
2459fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2460fe6060f1SDimitry Andric                                         MachineRegisterInfo &MRI,
2461fe6060f1SDimitry Andric                                         MachineIRBuilder &B,
2462fe6060f1SDimitry Andric                                         bool Signed) const {
24635ffd83dbSDimitry Andric 
24645ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
24655ffd83dbSDimitry Andric   Register Src = MI.getOperand(1).getReg();
24665ffd83dbSDimitry Andric 
24675ffd83dbSDimitry Andric   const LLT S64 = LLT::scalar(64);
24685ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
24695ffd83dbSDimitry Andric 
2470fe6060f1SDimitry Andric   const LLT SrcLT = MRI.getType(Src);
2471fe6060f1SDimitry Andric   assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
24725ffd83dbSDimitry Andric 
24735ffd83dbSDimitry Andric   unsigned Flags = MI.getFlags();
24745ffd83dbSDimitry Andric 
2475fe6060f1SDimitry Andric   // The basic idea of converting a floating point number into a pair of 32-bit
2476fe6060f1SDimitry Andric   // integers is illustrated as follows:
2477fe6060f1SDimitry Andric   //
2478fe6060f1SDimitry Andric   //     tf := trunc(val);
2479fe6060f1SDimitry Andric   //    hif := floor(tf * 2^-32);
2480fe6060f1SDimitry Andric   //    lof := tf - hif * 2^32; // lof is always positive due to floor.
2481fe6060f1SDimitry Andric   //     hi := fptoi(hif);
2482fe6060f1SDimitry Andric   //     lo := fptoi(lof);
2483fe6060f1SDimitry Andric   //
2484fe6060f1SDimitry Andric   auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2485fe6060f1SDimitry Andric   MachineInstrBuilder Sign;
2486fe6060f1SDimitry Andric   if (Signed && SrcLT == S32) {
2487fe6060f1SDimitry Andric     // However, a 32-bit floating point number has only 23 bits mantissa and
2488fe6060f1SDimitry Andric     // it's not enough to hold all the significant bits of `lof` if val is
2489fe6060f1SDimitry Andric     // negative. To avoid the loss of precision, We need to take the absolute
2490fe6060f1SDimitry Andric     // value after truncating and flip the result back based on the original
2491fe6060f1SDimitry Andric     // signedness.
2492fe6060f1SDimitry Andric     Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2493fe6060f1SDimitry Andric     Trunc = B.buildFAbs(S32, Trunc, Flags);
2494fe6060f1SDimitry Andric   }
2495fe6060f1SDimitry Andric   MachineInstrBuilder K0, K1;
2496fe6060f1SDimitry Andric   if (SrcLT == S64) {
2497*06c3fb27SDimitry Andric     K0 = B.buildFConstant(
2498*06c3fb27SDimitry Andric         S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2499*06c3fb27SDimitry Andric     K1 = B.buildFConstant(
2500*06c3fb27SDimitry Andric         S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2501fe6060f1SDimitry Andric   } else {
2502*06c3fb27SDimitry Andric     K0 = B.buildFConstant(
2503*06c3fb27SDimitry Andric         S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
2504*06c3fb27SDimitry Andric     K1 = B.buildFConstant(
2505*06c3fb27SDimitry Andric         S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2506fe6060f1SDimitry Andric   }
25075ffd83dbSDimitry Andric 
2508fe6060f1SDimitry Andric   auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2509fe6060f1SDimitry Andric   auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2510fe6060f1SDimitry Andric   auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
25115ffd83dbSDimitry Andric 
2512fe6060f1SDimitry Andric   auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2513fe6060f1SDimitry Andric                                      : B.buildFPTOUI(S32, FloorMul);
25145ffd83dbSDimitry Andric   auto Lo = B.buildFPTOUI(S32, Fma);
25155ffd83dbSDimitry Andric 
2516fe6060f1SDimitry Andric   if (Signed && SrcLT == S32) {
2517fe6060f1SDimitry Andric     // Flip the result based on the signedness, which is either all 0s or 1s.
2518bdd1243dSDimitry Andric     Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
2519fe6060f1SDimitry Andric     // r := xor({lo, hi}, sign) - sign;
2520bdd1243dSDimitry Andric     B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
2521bdd1243dSDimitry Andric                Sign);
2522fe6060f1SDimitry Andric   } else
2523bdd1243dSDimitry Andric     B.buildMergeLikeInstr(Dst, {Lo, Hi});
25245ffd83dbSDimitry Andric   MI.eraseFromParent();
25255ffd83dbSDimitry Andric 
25265ffd83dbSDimitry Andric   return true;
25275ffd83dbSDimitry Andric }
25285ffd83dbSDimitry Andric 
25295ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
25305ffd83dbSDimitry Andric                                                MachineInstr &MI) const {
25315ffd83dbSDimitry Andric   MachineFunction &MF = Helper.MIRBuilder.getMF();
25320b57cec5SDimitry Andric   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
25330b57cec5SDimitry Andric 
25340b57cec5SDimitry Andric   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
25350b57cec5SDimitry Andric                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
25360b57cec5SDimitry Andric 
25370b57cec5SDimitry Andric   // With ieee_mode disabled, the instructions have the correct behavior
25380b57cec5SDimitry Andric   // already for G_FMINNUM/G_FMAXNUM
25390b57cec5SDimitry Andric   if (!MFI->getMode().IEEE)
25400b57cec5SDimitry Andric     return !IsIEEEOp;
25410b57cec5SDimitry Andric 
25420b57cec5SDimitry Andric   if (IsIEEEOp)
25430b57cec5SDimitry Andric     return true;
25440b57cec5SDimitry Andric 
25450b57cec5SDimitry Andric   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
25460b57cec5SDimitry Andric }
25470b57cec5SDimitry Andric 
25480b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
25490b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
25500b57cec5SDimitry Andric   MachineIRBuilder &B) const {
25510b57cec5SDimitry Andric   // TODO: Should move some of this into LegalizerHelper.
25520b57cec5SDimitry Andric 
25530b57cec5SDimitry Andric   // TODO: Promote dynamic indexing of s16 to s32
25545ffd83dbSDimitry Andric 
2555*06c3fb27SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
2556*06c3fb27SDimitry Andric   Register Vec = MI.getOperand(1).getReg();
2557*06c3fb27SDimitry Andric 
2558*06c3fb27SDimitry Andric   LLT VecTy = MRI.getType(Vec);
2559*06c3fb27SDimitry Andric   LLT EltTy = VecTy.getElementType();
2560*06c3fb27SDimitry Andric   assert(EltTy == MRI.getType(Dst));
2561*06c3fb27SDimitry Andric 
2562*06c3fb27SDimitry Andric   // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2563*06c3fb27SDimitry Andric   // but we can't go directly to that logic becasue you can't bitcast a vector
2564*06c3fb27SDimitry Andric   // of pointers to a vector of integers. Therefore, introduce an intermediate
2565*06c3fb27SDimitry Andric   // vector of integers using ptrtoint (and inttoptr on the output) in order to
2566*06c3fb27SDimitry Andric   // drive the legalization forward.
2567*06c3fb27SDimitry Andric   if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2568*06c3fb27SDimitry Andric     LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2569*06c3fb27SDimitry Andric     LLT IntVecTy = VecTy.changeElementType(IntTy);
2570*06c3fb27SDimitry Andric 
2571*06c3fb27SDimitry Andric     auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
2572*06c3fb27SDimitry Andric     auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
2573*06c3fb27SDimitry Andric     B.buildIntToPtr(Dst, IntElt);
2574*06c3fb27SDimitry Andric 
2575*06c3fb27SDimitry Andric     MI.eraseFromParent();
2576*06c3fb27SDimitry Andric     return true;
2577*06c3fb27SDimitry Andric   }
2578*06c3fb27SDimitry Andric 
25795ffd83dbSDimitry Andric   // FIXME: Artifact combiner probably should have replaced the truncated
25805ffd83dbSDimitry Andric   // constant before this, so we shouldn't need
2581349cc55cSDimitry Andric   // getIConstantVRegValWithLookThrough.
2582bdd1243dSDimitry Andric   std::optional<ValueAndVReg> MaybeIdxVal =
2583349cc55cSDimitry Andric       getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2584e8d8bef9SDimitry Andric   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
25850b57cec5SDimitry Andric     return true;
2586bdd1243dSDimitry Andric   const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
25870b57cec5SDimitry Andric 
258804eeddc0SDimitry Andric   if (IdxVal < VecTy.getNumElements()) {
258904eeddc0SDimitry Andric     auto Unmerge = B.buildUnmerge(EltTy, Vec);
259004eeddc0SDimitry Andric     B.buildCopy(Dst, Unmerge.getReg(IdxVal));
259104eeddc0SDimitry Andric   } else {
25920b57cec5SDimitry Andric     B.buildUndef(Dst);
259304eeddc0SDimitry Andric   }
25940b57cec5SDimitry Andric 
25950b57cec5SDimitry Andric   MI.eraseFromParent();
25960b57cec5SDimitry Andric   return true;
25970b57cec5SDimitry Andric }
25980b57cec5SDimitry Andric 
25990b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
26000b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
26010b57cec5SDimitry Andric   MachineIRBuilder &B) const {
26020b57cec5SDimitry Andric   // TODO: Should move some of this into LegalizerHelper.
26030b57cec5SDimitry Andric 
26040b57cec5SDimitry Andric   // TODO: Promote dynamic indexing of s16 to s32
26055ffd83dbSDimitry Andric 
2606*06c3fb27SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
2607*06c3fb27SDimitry Andric   Register Vec = MI.getOperand(1).getReg();
2608*06c3fb27SDimitry Andric   Register Ins = MI.getOperand(2).getReg();
2609*06c3fb27SDimitry Andric 
2610*06c3fb27SDimitry Andric   LLT VecTy = MRI.getType(Vec);
2611*06c3fb27SDimitry Andric   LLT EltTy = VecTy.getElementType();
2612*06c3fb27SDimitry Andric   assert(EltTy == MRI.getType(Ins));
2613*06c3fb27SDimitry Andric 
2614*06c3fb27SDimitry Andric   // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
2615*06c3fb27SDimitry Andric   // but we can't go directly to that logic becasue you can't bitcast a vector
2616*06c3fb27SDimitry Andric   // of pointers to a vector of integers. Therefore, make the pointer vector
2617*06c3fb27SDimitry Andric   // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
2618*06c3fb27SDimitry Andric   // new value, and then inttoptr the result vector back. This will then allow
2619*06c3fb27SDimitry Andric   // the rest of legalization to take over.
2620*06c3fb27SDimitry Andric   if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
2621*06c3fb27SDimitry Andric     LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
2622*06c3fb27SDimitry Andric     LLT IntVecTy = VecTy.changeElementType(IntTy);
2623*06c3fb27SDimitry Andric 
2624*06c3fb27SDimitry Andric     auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
2625*06c3fb27SDimitry Andric     auto IntIns = B.buildPtrToInt(IntTy, Ins);
2626*06c3fb27SDimitry Andric     auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
2627*06c3fb27SDimitry Andric                                                  MI.getOperand(3));
2628*06c3fb27SDimitry Andric     B.buildIntToPtr(Dst, IntVecDest);
2629*06c3fb27SDimitry Andric     MI.eraseFromParent();
2630*06c3fb27SDimitry Andric     return true;
2631*06c3fb27SDimitry Andric   }
2632*06c3fb27SDimitry Andric 
26335ffd83dbSDimitry Andric   // FIXME: Artifact combiner probably should have replaced the truncated
26345ffd83dbSDimitry Andric   // constant before this, so we shouldn't need
2635349cc55cSDimitry Andric   // getIConstantVRegValWithLookThrough.
2636bdd1243dSDimitry Andric   std::optional<ValueAndVReg> MaybeIdxVal =
2637349cc55cSDimitry Andric       getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2638e8d8bef9SDimitry Andric   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
26390b57cec5SDimitry Andric     return true;
26400b57cec5SDimitry Andric 
2641bdd1243dSDimitry Andric   const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
26420b57cec5SDimitry Andric 
264304eeddc0SDimitry Andric   unsigned NumElts = VecTy.getNumElements();
264404eeddc0SDimitry Andric   if (IdxVal < NumElts) {
264504eeddc0SDimitry Andric     SmallVector<Register, 8> SrcRegs;
264604eeddc0SDimitry Andric     for (unsigned i = 0; i < NumElts; ++i)
264704eeddc0SDimitry Andric       SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
264804eeddc0SDimitry Andric     B.buildUnmerge(SrcRegs, Vec);
264904eeddc0SDimitry Andric 
265004eeddc0SDimitry Andric     SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2651bdd1243dSDimitry Andric     B.buildMergeLikeInstr(Dst, SrcRegs);
265204eeddc0SDimitry Andric   } else {
26530b57cec5SDimitry Andric     B.buildUndef(Dst);
265404eeddc0SDimitry Andric   }
26550b57cec5SDimitry Andric 
26560b57cec5SDimitry Andric   MI.eraseFromParent();
26570b57cec5SDimitry Andric   return true;
26580b57cec5SDimitry Andric }
26590b57cec5SDimitry Andric 
26608bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeSinCos(
26618bcb0991SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
26628bcb0991SDimitry Andric   MachineIRBuilder &B) const {
26638bcb0991SDimitry Andric 
26648bcb0991SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
26658bcb0991SDimitry Andric   Register SrcReg = MI.getOperand(1).getReg();
26668bcb0991SDimitry Andric   LLT Ty = MRI.getType(DstReg);
26678bcb0991SDimitry Andric   unsigned Flags = MI.getFlags();
26688bcb0991SDimitry Andric 
26698bcb0991SDimitry Andric   Register TrigVal;
26705ffd83dbSDimitry Andric   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
26718bcb0991SDimitry Andric   if (ST.hasTrigReducedRange()) {
26728bcb0991SDimitry Andric     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
26738bcb0991SDimitry Andric     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
26748bcb0991SDimitry Andric       .addUse(MulVal.getReg(0))
26758bcb0991SDimitry Andric       .setMIFlags(Flags).getReg(0);
26768bcb0991SDimitry Andric   } else
26778bcb0991SDimitry Andric     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
26788bcb0991SDimitry Andric 
26798bcb0991SDimitry Andric   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
26808bcb0991SDimitry Andric     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2681bdd1243dSDimitry Andric   B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg), false)
26828bcb0991SDimitry Andric       .addUse(TrigVal)
26838bcb0991SDimitry Andric       .setMIFlags(Flags);
26848bcb0991SDimitry Andric   MI.eraseFromParent();
26858bcb0991SDimitry Andric   return true;
26868bcb0991SDimitry Andric }
26878bcb0991SDimitry Andric 
26885ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
26895ffd83dbSDimitry Andric                                                   MachineIRBuilder &B,
26905ffd83dbSDimitry Andric                                                   const GlobalValue *GV,
26915ffd83dbSDimitry Andric                                                   int64_t Offset,
26925ffd83dbSDimitry Andric                                                   unsigned GAFlags) const {
26935ffd83dbSDimitry Andric   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
26948bcb0991SDimitry Andric   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
26958bcb0991SDimitry Andric   // to the following code sequence:
26968bcb0991SDimitry Andric   //
26978bcb0991SDimitry Andric   // For constant address space:
26988bcb0991SDimitry Andric   //   s_getpc_b64 s[0:1]
26998bcb0991SDimitry Andric   //   s_add_u32 s0, s0, $symbol
27008bcb0991SDimitry Andric   //   s_addc_u32 s1, s1, 0
27018bcb0991SDimitry Andric   //
27028bcb0991SDimitry Andric   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
27038bcb0991SDimitry Andric   //   a fixup or relocation is emitted to replace $symbol with a literal
27048bcb0991SDimitry Andric   //   constant, which is a pc-relative offset from the encoding of the $symbol
27058bcb0991SDimitry Andric   //   operand to the global variable.
27068bcb0991SDimitry Andric   //
27078bcb0991SDimitry Andric   // For global address space:
27088bcb0991SDimitry Andric   //   s_getpc_b64 s[0:1]
27098bcb0991SDimitry Andric   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
27108bcb0991SDimitry Andric   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
27118bcb0991SDimitry Andric   //
27128bcb0991SDimitry Andric   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
27138bcb0991SDimitry Andric   //   fixups or relocations are emitted to replace $symbol@*@lo and
27148bcb0991SDimitry Andric   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
27158bcb0991SDimitry Andric   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
27168bcb0991SDimitry Andric   //   operand to the global variable.
27178bcb0991SDimitry Andric   //
27188bcb0991SDimitry Andric   // What we want here is an offset from the value returned by s_getpc
27198bcb0991SDimitry Andric   // (which is the address of the s_add_u32 instruction) to the global
27208bcb0991SDimitry Andric   // variable, but since the encoding of $symbol starts 4 bytes after the start
27218bcb0991SDimitry Andric   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
27228bcb0991SDimitry Andric   // small. This requires us to add 4 to the global variable offset in order to
2723e8d8bef9SDimitry Andric   // compute the correct address. Similarly for the s_addc_u32 instruction, the
2724e8d8bef9SDimitry Andric   // encoding of $symbol starts 12 bytes after the start of the s_add_u32
2725e8d8bef9SDimitry Andric   // instruction.
27268bcb0991SDimitry Andric 
27278bcb0991SDimitry Andric   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
27288bcb0991SDimitry Andric 
27298bcb0991SDimitry Andric   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
27308bcb0991SDimitry Andric     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
27318bcb0991SDimitry Andric 
27328bcb0991SDimitry Andric   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
27338bcb0991SDimitry Andric     .addDef(PCReg);
27348bcb0991SDimitry Andric 
27358bcb0991SDimitry Andric   MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
27368bcb0991SDimitry Andric   if (GAFlags == SIInstrInfo::MO_NONE)
27378bcb0991SDimitry Andric     MIB.addImm(0);
27388bcb0991SDimitry Andric   else
2739e8d8bef9SDimitry Andric     MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
27408bcb0991SDimitry Andric 
2741*06c3fb27SDimitry Andric   if (!B.getMRI()->getRegClassOrNull(PCReg))
27428bcb0991SDimitry Andric     B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
27438bcb0991SDimitry Andric 
27448bcb0991SDimitry Andric   if (PtrTy.getSizeInBits() == 32)
27458bcb0991SDimitry Andric     B.buildExtract(DstReg, PCReg, 0);
27468bcb0991SDimitry Andric   return true;
27478bcb0991SDimitry Andric  }
27488bcb0991SDimitry Andric 
27498bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeGlobalValue(
27508bcb0991SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
27518bcb0991SDimitry Andric   MachineIRBuilder &B) const {
27528bcb0991SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
27538bcb0991SDimitry Andric   LLT Ty = MRI.getType(DstReg);
27548bcb0991SDimitry Andric   unsigned AS = Ty.getAddressSpace();
27558bcb0991SDimitry Andric 
27568bcb0991SDimitry Andric   const GlobalValue *GV = MI.getOperand(1).getGlobal();
27578bcb0991SDimitry Andric   MachineFunction &MF = B.getMF();
27588bcb0991SDimitry Andric   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
27598bcb0991SDimitry Andric 
27608bcb0991SDimitry Andric   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2761fe6060f1SDimitry Andric     if (!MFI->isModuleEntryFunction() &&
2762fe6060f1SDimitry Andric         !GV->getName().equals("llvm.amdgcn.module.lds")) {
27638bcb0991SDimitry Andric       const Function &Fn = MF.getFunction();
27648bcb0991SDimitry Andric       DiagnosticInfoUnsupported BadLDSDecl(
27655ffd83dbSDimitry Andric         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
27665ffd83dbSDimitry Andric         DS_Warning);
27678bcb0991SDimitry Andric       Fn.getContext().diagnose(BadLDSDecl);
27685ffd83dbSDimitry Andric 
27695ffd83dbSDimitry Andric       // We currently don't have a way to correctly allocate LDS objects that
27705ffd83dbSDimitry Andric       // aren't directly associated with a kernel. We do force inlining of
27715ffd83dbSDimitry Andric       // functions that use local objects. However, if these dead functions are
27725ffd83dbSDimitry Andric       // not eliminated, we don't want a compile time error. Just emit a warning
27735ffd83dbSDimitry Andric       // and a trap, since there should be no callable path here.
27745ffd83dbSDimitry Andric       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
27755ffd83dbSDimitry Andric       B.buildUndef(DstReg);
27765ffd83dbSDimitry Andric       MI.eraseFromParent();
27775ffd83dbSDimitry Andric       return true;
27788bcb0991SDimitry Andric     }
27798bcb0991SDimitry Andric 
27808bcb0991SDimitry Andric     // TODO: We could emit code to handle the initialization somewhere.
2781349cc55cSDimitry Andric     // We ignore the initializer for now and legalize it to allow selection.
2782349cc55cSDimitry Andric     // The initializer will anyway get errored out during assembly emission.
27835ffd83dbSDimitry Andric     const SITargetLowering *TLI = ST.getTargetLowering();
27845ffd83dbSDimitry Andric     if (!TLI->shouldUseLDSConstAddress(GV)) {
27855ffd83dbSDimitry Andric       MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
27865ffd83dbSDimitry Andric       return true; // Leave in place;
27875ffd83dbSDimitry Andric     }
27885ffd83dbSDimitry Andric 
2789e8d8bef9SDimitry Andric     if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2790e8d8bef9SDimitry Andric       Type *Ty = GV->getValueType();
2791e8d8bef9SDimitry Andric       // HIP uses an unsized array `extern __shared__ T s[]` or similar
2792e8d8bef9SDimitry Andric       // zero-sized type in other languages to declare the dynamic shared
2793e8d8bef9SDimitry Andric       // memory which size is not known at the compile time. They will be
2794e8d8bef9SDimitry Andric       // allocated by the runtime and placed directly after the static
2795e8d8bef9SDimitry Andric       // allocated ones. They all share the same offset.
2796e8d8bef9SDimitry Andric       if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2797e8d8bef9SDimitry Andric         // Adjust alignment for that dynamic shared memory array.
2798*06c3fb27SDimitry Andric         MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV));
2799e8d8bef9SDimitry Andric         LLT S32 = LLT::scalar(32);
2800e8d8bef9SDimitry Andric         auto Sz =
2801e8d8bef9SDimitry Andric             B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2802e8d8bef9SDimitry Andric         B.buildIntToPtr(DstReg, Sz);
2803e8d8bef9SDimitry Andric         MI.eraseFromParent();
2804e8d8bef9SDimitry Andric         return true;
2805e8d8bef9SDimitry Andric       }
2806e8d8bef9SDimitry Andric     }
2807e8d8bef9SDimitry Andric 
2808349cc55cSDimitry Andric     B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
2809349cc55cSDimitry Andric                                                    *cast<GlobalVariable>(GV)));
28108bcb0991SDimitry Andric     MI.eraseFromParent();
28118bcb0991SDimitry Andric     return true;
28128bcb0991SDimitry Andric   }
28138bcb0991SDimitry Andric 
28148bcb0991SDimitry Andric   const SITargetLowering *TLI = ST.getTargetLowering();
28158bcb0991SDimitry Andric 
28168bcb0991SDimitry Andric   if (TLI->shouldEmitFixup(GV)) {
28178bcb0991SDimitry Andric     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
28188bcb0991SDimitry Andric     MI.eraseFromParent();
28198bcb0991SDimitry Andric     return true;
28208bcb0991SDimitry Andric   }
28218bcb0991SDimitry Andric 
28228bcb0991SDimitry Andric   if (TLI->shouldEmitPCReloc(GV)) {
28238bcb0991SDimitry Andric     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
28248bcb0991SDimitry Andric     MI.eraseFromParent();
28258bcb0991SDimitry Andric     return true;
28268bcb0991SDimitry Andric   }
28278bcb0991SDimitry Andric 
28288bcb0991SDimitry Andric   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
28298bcb0991SDimitry Andric   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
28308bcb0991SDimitry Andric 
2831fe6060f1SDimitry Andric   LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
28328bcb0991SDimitry Andric   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
28338bcb0991SDimitry Andric       MachinePointerInfo::getGOT(MF),
28348bcb0991SDimitry Andric       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
28358bcb0991SDimitry Andric           MachineMemOperand::MOInvariant,
2836fe6060f1SDimitry Andric       LoadTy, Align(8));
28378bcb0991SDimitry Andric 
28388bcb0991SDimitry Andric   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
28398bcb0991SDimitry Andric 
28408bcb0991SDimitry Andric   if (Ty.getSizeInBits() == 32) {
2841349cc55cSDimitry Andric     // Truncate if this is a 32-bit constant address.
28428bcb0991SDimitry Andric     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
28438bcb0991SDimitry Andric     B.buildExtract(DstReg, Load, 0);
28448bcb0991SDimitry Andric   } else
28458bcb0991SDimitry Andric     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
28468bcb0991SDimitry Andric 
28478bcb0991SDimitry Andric   MI.eraseFromParent();
28488bcb0991SDimitry Andric   return true;
28498bcb0991SDimitry Andric }
28508bcb0991SDimitry Andric 
2851e8d8bef9SDimitry Andric static LLT widenToNextPowerOf2(LLT Ty) {
2852e8d8bef9SDimitry Andric   if (Ty.isVector())
2853fe6060f1SDimitry Andric     return Ty.changeElementCount(
2854fe6060f1SDimitry Andric         ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2855e8d8bef9SDimitry Andric   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2856e8d8bef9SDimitry Andric }
2857e8d8bef9SDimitry Andric 
2858e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2859e8d8bef9SDimitry Andric                                        MachineInstr &MI) const {
2860e8d8bef9SDimitry Andric   MachineIRBuilder &B = Helper.MIRBuilder;
2861e8d8bef9SDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
2862e8d8bef9SDimitry Andric   GISelChangeObserver &Observer = Helper.Observer;
2863e8d8bef9SDimitry Andric 
2864e8d8bef9SDimitry Andric   Register PtrReg = MI.getOperand(1).getReg();
2865e8d8bef9SDimitry Andric   LLT PtrTy = MRI.getType(PtrReg);
2866e8d8bef9SDimitry Andric   unsigned AddrSpace = PtrTy.getAddressSpace();
2867e8d8bef9SDimitry Andric 
2868e8d8bef9SDimitry Andric   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
28698bcb0991SDimitry Andric     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2870e8d8bef9SDimitry Andric     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
28718bcb0991SDimitry Andric     Observer.changingInstr(MI);
28728bcb0991SDimitry Andric     MI.getOperand(1).setReg(Cast.getReg(0));
28738bcb0991SDimitry Andric     Observer.changedInstr(MI);
28748bcb0991SDimitry Andric     return true;
28758bcb0991SDimitry Andric   }
28768bcb0991SDimitry Andric 
2877fe6060f1SDimitry Andric   if (MI.getOpcode() != AMDGPU::G_LOAD)
2878fe6060f1SDimitry Andric     return false;
2879fe6060f1SDimitry Andric 
2880e8d8bef9SDimitry Andric   Register ValReg = MI.getOperand(0).getReg();
2881e8d8bef9SDimitry Andric   LLT ValTy = MRI.getType(ValReg);
2882e8d8bef9SDimitry Andric 
2883*06c3fb27SDimitry Andric   if (hasBufferRsrcWorkaround(ValTy)) {
2884*06c3fb27SDimitry Andric     Observer.changingInstr(MI);
2885*06c3fb27SDimitry Andric     castBufferRsrcFromV4I32(MI, B, MRI, 0);
2886*06c3fb27SDimitry Andric     Observer.changedInstr(MI);
2887*06c3fb27SDimitry Andric     return true;
2888*06c3fb27SDimitry Andric   }
2889*06c3fb27SDimitry Andric 
2890e8d8bef9SDimitry Andric   MachineMemOperand *MMO = *MI.memoperands_begin();
2891e8d8bef9SDimitry Andric   const unsigned ValSize = ValTy.getSizeInBits();
2892fe6060f1SDimitry Andric   const LLT MemTy = MMO->getMemoryType();
2893e8d8bef9SDimitry Andric   const Align MemAlign = MMO->getAlign();
2894fe6060f1SDimitry Andric   const unsigned MemSize = MemTy.getSizeInBits();
289504eeddc0SDimitry Andric   const uint64_t AlignInBits = 8 * MemAlign.value();
2896e8d8bef9SDimitry Andric 
2897e8d8bef9SDimitry Andric   // Widen non-power-of-2 loads to the alignment if needed
2898fe6060f1SDimitry Andric   if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
2899e8d8bef9SDimitry Andric     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2900e8d8bef9SDimitry Andric 
2901e8d8bef9SDimitry Andric     // This was already the correct extending load result type, so just adjust
2902e8d8bef9SDimitry Andric     // the memory type.
2903e8d8bef9SDimitry Andric     if (WideMemSize == ValSize) {
2904e8d8bef9SDimitry Andric       MachineFunction &MF = B.getMF();
2905e8d8bef9SDimitry Andric 
2906e8d8bef9SDimitry Andric       MachineMemOperand *WideMMO =
2907e8d8bef9SDimitry Andric           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
2908e8d8bef9SDimitry Andric       Observer.changingInstr(MI);
2909e8d8bef9SDimitry Andric       MI.setMemRefs(MF, {WideMMO});
2910e8d8bef9SDimitry Andric       Observer.changedInstr(MI);
2911e8d8bef9SDimitry Andric       return true;
2912e8d8bef9SDimitry Andric     }
2913e8d8bef9SDimitry Andric 
2914e8d8bef9SDimitry Andric     // Don't bother handling edge case that should probably never be produced.
2915e8d8bef9SDimitry Andric     if (ValSize > WideMemSize)
2916e8d8bef9SDimitry Andric       return false;
2917e8d8bef9SDimitry Andric 
2918e8d8bef9SDimitry Andric     LLT WideTy = widenToNextPowerOf2(ValTy);
2919e8d8bef9SDimitry Andric 
2920e8d8bef9SDimitry Andric     Register WideLoad;
2921e8d8bef9SDimitry Andric     if (!WideTy.isVector()) {
2922e8d8bef9SDimitry Andric       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2923e8d8bef9SDimitry Andric       B.buildTrunc(ValReg, WideLoad).getReg(0);
2924e8d8bef9SDimitry Andric     } else {
2925e8d8bef9SDimitry Andric       // Extract the subvector.
2926e8d8bef9SDimitry Andric 
2927e8d8bef9SDimitry Andric       if (isRegisterType(ValTy)) {
2928e8d8bef9SDimitry Andric         // If this a case where G_EXTRACT is legal, use it.
2929e8d8bef9SDimitry Andric         // (e.g. <3 x s32> -> <4 x s32>)
2930e8d8bef9SDimitry Andric         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
2931e8d8bef9SDimitry Andric         B.buildExtract(ValReg, WideLoad, 0);
2932e8d8bef9SDimitry Andric       } else {
2933e8d8bef9SDimitry Andric         // For cases where the widened type isn't a nice register value, unmerge
2934e8d8bef9SDimitry Andric         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
29350eae32dcSDimitry Andric         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
29360eae32dcSDimitry Andric         B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
2937e8d8bef9SDimitry Andric       }
2938e8d8bef9SDimitry Andric     }
2939e8d8bef9SDimitry Andric 
2940e8d8bef9SDimitry Andric     MI.eraseFromParent();
2941e8d8bef9SDimitry Andric     return true;
2942e8d8bef9SDimitry Andric   }
2943e8d8bef9SDimitry Andric 
2944e8d8bef9SDimitry Andric   return false;
2945e8d8bef9SDimitry Andric }
2946e8d8bef9SDimitry Andric 
2947*06c3fb27SDimitry Andric bool AMDGPULegalizerInfo::legalizeStore(LegalizerHelper &Helper,
2948*06c3fb27SDimitry Andric                                         MachineInstr &MI) const {
2949*06c3fb27SDimitry Andric   MachineIRBuilder &B = Helper.MIRBuilder;
2950*06c3fb27SDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
2951*06c3fb27SDimitry Andric   GISelChangeObserver &Observer = Helper.Observer;
2952*06c3fb27SDimitry Andric 
2953*06c3fb27SDimitry Andric   Register DataReg = MI.getOperand(0).getReg();
2954*06c3fb27SDimitry Andric   LLT DataTy = MRI.getType(DataReg);
2955*06c3fb27SDimitry Andric 
2956*06c3fb27SDimitry Andric   if (hasBufferRsrcWorkaround(DataTy)) {
2957*06c3fb27SDimitry Andric     Observer.changingInstr(MI);
2958*06c3fb27SDimitry Andric     castBufferRsrcArgToV4I32(MI, B, 0);
2959*06c3fb27SDimitry Andric     Observer.changedInstr(MI);
2960*06c3fb27SDimitry Andric     return true;
2961*06c3fb27SDimitry Andric   }
2962*06c3fb27SDimitry Andric   return false;
2963*06c3fb27SDimitry Andric }
2964*06c3fb27SDimitry Andric 
29658bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFMad(
29668bcb0991SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
29678bcb0991SDimitry Andric   MachineIRBuilder &B) const {
29688bcb0991SDimitry Andric   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
29698bcb0991SDimitry Andric   assert(Ty.isScalar());
29708bcb0991SDimitry Andric 
2971480093f4SDimitry Andric   MachineFunction &MF = B.getMF();
2972480093f4SDimitry Andric   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2973480093f4SDimitry Andric 
29748bcb0991SDimitry Andric   // TODO: Always legal with future ftz flag.
29755ffd83dbSDimitry Andric   // FIXME: Do we need just output?
2976*06c3fb27SDimitry Andric   if (Ty == LLT::scalar(32) &&
2977*06c3fb27SDimitry Andric       MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign())
29788bcb0991SDimitry Andric     return true;
2979*06c3fb27SDimitry Andric   if (Ty == LLT::scalar(16) &&
2980*06c3fb27SDimitry Andric       MFI->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign())
29818bcb0991SDimitry Andric     return true;
29828bcb0991SDimitry Andric 
29838bcb0991SDimitry Andric   MachineIRBuilder HelperBuilder(MI);
29848bcb0991SDimitry Andric   GISelObserverWrapper DummyObserver;
29858bcb0991SDimitry Andric   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
29868bcb0991SDimitry Andric   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
29878bcb0991SDimitry Andric }
29888bcb0991SDimitry Andric 
2989480093f4SDimitry Andric bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
2990480093f4SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
2991480093f4SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
2992480093f4SDimitry Andric   Register PtrReg = MI.getOperand(1).getReg();
2993480093f4SDimitry Andric   Register CmpVal = MI.getOperand(2).getReg();
2994480093f4SDimitry Andric   Register NewVal = MI.getOperand(3).getReg();
2995480093f4SDimitry Andric 
2996e8d8bef9SDimitry Andric   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
2997480093f4SDimitry Andric          "this should not have been custom lowered");
2998480093f4SDimitry Andric 
2999480093f4SDimitry Andric   LLT ValTy = MRI.getType(CmpVal);
3000fe6060f1SDimitry Andric   LLT VecTy = LLT::fixed_vector(2, ValTy);
3001480093f4SDimitry Andric 
3002480093f4SDimitry Andric   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
3003480093f4SDimitry Andric 
3004480093f4SDimitry Andric   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3005480093f4SDimitry Andric     .addDef(DstReg)
3006480093f4SDimitry Andric     .addUse(PtrReg)
3007480093f4SDimitry Andric     .addUse(PackedVal)
3008480093f4SDimitry Andric     .setMemRefs(MI.memoperands());
3009480093f4SDimitry Andric 
3010480093f4SDimitry Andric   MI.eraseFromParent();
3011480093f4SDimitry Andric   return true;
3012480093f4SDimitry Andric }
3013480093f4SDimitry Andric 
3014*06c3fb27SDimitry Andric /// Return true if it's known that \p Src can never be an f32 denormal value.
3015*06c3fb27SDimitry Andric static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI,
3016*06c3fb27SDimitry Andric                                        Register Src) {
3017*06c3fb27SDimitry Andric   Register ExtSrc;
3018*06c3fb27SDimitry Andric   if (mi_match(Src, MRI, m_GFPExt(m_Reg(ExtSrc))))
3019*06c3fb27SDimitry Andric     return MRI.getType(ExtSrc) == LLT::scalar(16);
3020*06c3fb27SDimitry Andric   return false;
3021*06c3fb27SDimitry Andric }
3022*06c3fb27SDimitry Andric 
3023*06c3fb27SDimitry Andric static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
3024*06c3fb27SDimitry Andric   if (Flags & MachineInstr::FmAfn)
3025*06c3fb27SDimitry Andric     return true;
3026*06c3fb27SDimitry Andric   const auto &Options = MF.getTarget().Options;
3027*06c3fb27SDimitry Andric   return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
3028*06c3fb27SDimitry Andric }
3029*06c3fb27SDimitry Andric 
3030*06c3fb27SDimitry Andric static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src,
3031*06c3fb27SDimitry Andric                                    unsigned Flags) {
3032*06c3fb27SDimitry Andric   return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src) &&
3033*06c3fb27SDimitry Andric          MF.getDenormalMode(APFloat::IEEEsingle()).Input !=
3034*06c3fb27SDimitry Andric              DenormalMode::PreserveSign;
3035*06c3fb27SDimitry Andric }
3036*06c3fb27SDimitry Andric 
3037*06c3fb27SDimitry Andric std::pair<Register, Register>
3038*06c3fb27SDimitry Andric AMDGPULegalizerInfo::getScaledLogInput(MachineIRBuilder &B, Register Src,
3039*06c3fb27SDimitry Andric                                        unsigned Flags) const {
3040*06c3fb27SDimitry Andric   if (allowApproxFunc(B.getMF(), Flags) ||
3041*06c3fb27SDimitry Andric       !needsDenormHandlingF32(B.getMF(), Src, Flags))
3042*06c3fb27SDimitry Andric     return {};
3043*06c3fb27SDimitry Andric 
3044*06c3fb27SDimitry Andric   const LLT F32 = LLT::scalar(32);
3045*06c3fb27SDimitry Andric   auto SmallestNormal = B.buildFConstant(
3046*06c3fb27SDimitry Andric       F32, APFloat::getSmallestNormalized(APFloat::IEEEsingle()));
3047*06c3fb27SDimitry Andric   auto IsLtSmallestNormal =
3048*06c3fb27SDimitry Andric       B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal);
3049*06c3fb27SDimitry Andric 
3050*06c3fb27SDimitry Andric   auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
3051*06c3fb27SDimitry Andric   auto One = B.buildFConstant(F32, 1.0);
3052*06c3fb27SDimitry Andric   auto ScaleFactor =
3053*06c3fb27SDimitry Andric       B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
3054*06c3fb27SDimitry Andric   auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);
3055*06c3fb27SDimitry Andric 
3056*06c3fb27SDimitry Andric   return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
3057*06c3fb27SDimitry Andric }
3058*06c3fb27SDimitry Andric 
3059*06c3fb27SDimitry Andric bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI,
3060*06c3fb27SDimitry Andric                                         MachineIRBuilder &B) const {
3061*06c3fb27SDimitry Andric   // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
3062*06c3fb27SDimitry Andric   // If we have to handle denormals, scale up the input and adjust the result.
3063*06c3fb27SDimitry Andric 
3064*06c3fb27SDimitry Andric   // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
3065*06c3fb27SDimitry Andric   // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
3066*06c3fb27SDimitry Andric 
30675ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
30685ffd83dbSDimitry Andric   Register Src = MI.getOperand(1).getReg();
30695ffd83dbSDimitry Andric   LLT Ty = B.getMRI()->getType(Dst);
30705ffd83dbSDimitry Andric   unsigned Flags = MI.getFlags();
30715ffd83dbSDimitry Andric 
3072*06c3fb27SDimitry Andric   if (Ty == LLT::scalar(16)) {
3073*06c3fb27SDimitry Andric     const LLT F32 = LLT::scalar(32);
3074*06c3fb27SDimitry Andric     // Nothing in half is a denormal when promoted to f32.
3075*06c3fb27SDimitry Andric     auto Ext = B.buildFPExt(F32, Src, Flags);
3076*06c3fb27SDimitry Andric     auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32}, false)
3077*06c3fb27SDimitry Andric       .addUse(Ext.getReg(0))
3078*06c3fb27SDimitry Andric       .setMIFlags(Flags);
3079*06c3fb27SDimitry Andric     B.buildFPTrunc(Dst, Log2, Flags);
30805ffd83dbSDimitry Andric     MI.eraseFromParent();
30815ffd83dbSDimitry Andric     return true;
30825ffd83dbSDimitry Andric   }
30835ffd83dbSDimitry Andric 
3084*06c3fb27SDimitry Andric   assert(Ty == LLT::scalar(32));
3085*06c3fb27SDimitry Andric 
3086*06c3fb27SDimitry Andric   auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
3087*06c3fb27SDimitry Andric   if (!ScaledInput) {
3088*06c3fb27SDimitry Andric     B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)}, false)
3089*06c3fb27SDimitry Andric         .addUse(Src)
3090*06c3fb27SDimitry Andric         .setMIFlags(Flags);
3091*06c3fb27SDimitry Andric     MI.eraseFromParent();
3092*06c3fb27SDimitry Andric     return true;
3093*06c3fb27SDimitry Andric   }
3094*06c3fb27SDimitry Andric 
3095*06c3fb27SDimitry Andric   auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}, false)
3096*06c3fb27SDimitry Andric                   .addUse(ScaledInput)
3097*06c3fb27SDimitry Andric                   .setMIFlags(Flags);
3098*06c3fb27SDimitry Andric 
3099*06c3fb27SDimitry Andric   auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
3100*06c3fb27SDimitry Andric   auto Zero = B.buildFConstant(Ty, 0.0);
3101*06c3fb27SDimitry Andric   auto ResultOffset =
3102*06c3fb27SDimitry Andric       B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
3103*06c3fb27SDimitry Andric   B.buildFSub(Dst, Log2, ResultOffset, Flags);
3104*06c3fb27SDimitry Andric 
3105*06c3fb27SDimitry Andric   MI.eraseFromParent();
3106*06c3fb27SDimitry Andric   return true;
3107*06c3fb27SDimitry Andric }
3108*06c3fb27SDimitry Andric 
3109*06c3fb27SDimitry Andric static Register getMad(MachineIRBuilder &B, LLT Ty, Register X, Register Y,
3110*06c3fb27SDimitry Andric                        Register Z, unsigned Flags) {
3111*06c3fb27SDimitry Andric   auto FMul = B.buildFMul(Ty, X, Y, Flags);
3112*06c3fb27SDimitry Andric   return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);
3113*06c3fb27SDimitry Andric }
3114*06c3fb27SDimitry Andric 
3115*06c3fb27SDimitry Andric bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
3116*06c3fb27SDimitry Andric                                              MachineIRBuilder &B) const {
3117*06c3fb27SDimitry Andric   const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
3118*06c3fb27SDimitry Andric   assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);
3119*06c3fb27SDimitry Andric 
3120*06c3fb27SDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
3121*06c3fb27SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
3122*06c3fb27SDimitry Andric   Register X = MI.getOperand(1).getReg();
3123*06c3fb27SDimitry Andric   unsigned Flags = MI.getFlags();
3124*06c3fb27SDimitry Andric   const LLT Ty = MRI.getType(X);
3125*06c3fb27SDimitry Andric   MachineFunction &MF = B.getMF();
3126*06c3fb27SDimitry Andric 
3127*06c3fb27SDimitry Andric   const LLT F32 = LLT::scalar(32);
3128*06c3fb27SDimitry Andric   const LLT F16 = LLT::scalar(16);
3129*06c3fb27SDimitry Andric 
3130*06c3fb27SDimitry Andric   const AMDGPUTargetMachine &TM =
3131*06c3fb27SDimitry Andric       static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
3132*06c3fb27SDimitry Andric 
3133*06c3fb27SDimitry Andric   if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn) ||
3134*06c3fb27SDimitry Andric       TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) {
3135*06c3fb27SDimitry Andric     const double Log2BaseInv =
3136*06c3fb27SDimitry Andric         IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
3137*06c3fb27SDimitry Andric 
3138*06c3fb27SDimitry Andric     if (Ty == F16 && !ST.has16BitInsts()) {
3139*06c3fb27SDimitry Andric       Register LogVal = MRI.createGenericVirtualRegister(F32);
3140*06c3fb27SDimitry Andric       auto PromoteSrc = B.buildFPExt(F32, X);
3141*06c3fb27SDimitry Andric       legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), Log2BaseInv, Flags);
3142*06c3fb27SDimitry Andric       B.buildFPTrunc(Dst, LogVal);
3143*06c3fb27SDimitry Andric     } else {
3144*06c3fb27SDimitry Andric       legalizeFlogUnsafe(B, Dst, X, Log2BaseInv, Flags);
3145*06c3fb27SDimitry Andric     }
3146*06c3fb27SDimitry Andric 
3147*06c3fb27SDimitry Andric     MI.eraseFromParent();
3148*06c3fb27SDimitry Andric     return true;
3149*06c3fb27SDimitry Andric   }
3150*06c3fb27SDimitry Andric 
3151*06c3fb27SDimitry Andric   auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags);
3152*06c3fb27SDimitry Andric   if (ScaledInput)
3153*06c3fb27SDimitry Andric     X = ScaledInput;
3154*06c3fb27SDimitry Andric 
3155*06c3fb27SDimitry Andric   auto Y = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}, false)
3156*06c3fb27SDimitry Andric     .addUse(X)
3157*06c3fb27SDimitry Andric     .setMIFlags(Flags);
3158*06c3fb27SDimitry Andric 
3159*06c3fb27SDimitry Andric   Register R;
3160*06c3fb27SDimitry Andric   if (ST.hasFastFMAF32()) {
3161*06c3fb27SDimitry Andric     // c+cc are ln(2)/ln(10) to more than 49 bits
3162*06c3fb27SDimitry Andric     const float c_log10 = 0x1.344134p-2f;
3163*06c3fb27SDimitry Andric     const float cc_log10 = 0x1.09f79ep-26f;
3164*06c3fb27SDimitry Andric 
3165*06c3fb27SDimitry Andric     // c + cc is ln(2) to more than 49 bits
3166*06c3fb27SDimitry Andric     const float c_log = 0x1.62e42ep-1f;
3167*06c3fb27SDimitry Andric     const float cc_log = 0x1.efa39ep-25f;
3168*06c3fb27SDimitry Andric 
3169*06c3fb27SDimitry Andric     auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
3170*06c3fb27SDimitry Andric     auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
3171*06c3fb27SDimitry Andric 
3172*06c3fb27SDimitry Andric     R = B.buildFMul(Ty, Y, C, Flags).getReg(0);
3173*06c3fb27SDimitry Andric     auto NegR = B.buildFNeg(Ty, R, Flags);
3174*06c3fb27SDimitry Andric     auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags);
3175*06c3fb27SDimitry Andric     auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags);
3176*06c3fb27SDimitry Andric     R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0);
3177*06c3fb27SDimitry Andric   } else {
3178*06c3fb27SDimitry Andric     // ch+ct is ln(2)/ln(10) to more than 36 bits
3179*06c3fb27SDimitry Andric     const float ch_log10 = 0x1.344000p-2f;
3180*06c3fb27SDimitry Andric     const float ct_log10 = 0x1.3509f6p-18f;
3181*06c3fb27SDimitry Andric 
3182*06c3fb27SDimitry Andric     // ch + ct is ln(2) to more than 36 bits
3183*06c3fb27SDimitry Andric     const float ch_log = 0x1.62e000p-1f;
3184*06c3fb27SDimitry Andric     const float ct_log = 0x1.0bfbe8p-15f;
3185*06c3fb27SDimitry Andric 
3186*06c3fb27SDimitry Andric     auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
3187*06c3fb27SDimitry Andric     auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
3188*06c3fb27SDimitry Andric 
3189*06c3fb27SDimitry Andric     auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3190*06c3fb27SDimitry Andric     auto YH = B.buildAnd(Ty, Y, MaskConst);
3191*06c3fb27SDimitry Andric     auto YT = B.buildFSub(Ty, Y, YH, Flags);
3192*06c3fb27SDimitry Andric     auto YTCT = B.buildFMul(Ty, YT, CT, Flags);
3193*06c3fb27SDimitry Andric 
3194*06c3fb27SDimitry Andric     Register Mad0 =
3195*06c3fb27SDimitry Andric         getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags);
3196*06c3fb27SDimitry Andric     Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, Flags);
3197*06c3fb27SDimitry Andric     R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags);
3198*06c3fb27SDimitry Andric   }
3199*06c3fb27SDimitry Andric 
3200*06c3fb27SDimitry Andric   const bool IsFiniteOnly =
3201*06c3fb27SDimitry Andric       (MI.getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) &&
3202*06c3fb27SDimitry Andric       (MI.getFlag(MachineInstr::FmNoInfs) || TM.Options.NoInfsFPMath);
3203*06c3fb27SDimitry Andric 
3204*06c3fb27SDimitry Andric   if (!IsFiniteOnly) {
3205*06c3fb27SDimitry Andric     // Expand isfinite(x) => fabs(x) < inf
3206*06c3fb27SDimitry Andric     auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3207*06c3fb27SDimitry Andric     auto Fabs = B.buildFAbs(Ty, Y);
3208*06c3fb27SDimitry Andric     auto IsFinite =
3209*06c3fb27SDimitry Andric         B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
3210*06c3fb27SDimitry Andric     R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);
3211*06c3fb27SDimitry Andric   }
3212*06c3fb27SDimitry Andric 
3213*06c3fb27SDimitry Andric   if (ScaledInput) {
3214*06c3fb27SDimitry Andric     auto Zero = B.buildFConstant(Ty, 0.0);
3215*06c3fb27SDimitry Andric     auto ShiftK =
3216*06c3fb27SDimitry Andric         B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3217*06c3fb27SDimitry Andric     auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
3218*06c3fb27SDimitry Andric     B.buildFSub(Dst, R, Shift, Flags);
3219*06c3fb27SDimitry Andric   } else {
3220*06c3fb27SDimitry Andric     B.buildCopy(Dst, R);
3221*06c3fb27SDimitry Andric   }
3222*06c3fb27SDimitry Andric 
3223*06c3fb27SDimitry Andric   MI.eraseFromParent();
3224*06c3fb27SDimitry Andric   return true;
3225*06c3fb27SDimitry Andric }
3226*06c3fb27SDimitry Andric 
3227*06c3fb27SDimitry Andric bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst,
3228*06c3fb27SDimitry Andric                                              Register Src,
3229*06c3fb27SDimitry Andric                                              double Log2BaseInverted,
3230*06c3fb27SDimitry Andric                                              unsigned Flags) const {
3231*06c3fb27SDimitry Andric   LLT Ty = B.getMRI()->getType(Dst);
3232*06c3fb27SDimitry Andric   auto Log2Operand = Ty == LLT::scalar(16)
3233*06c3fb27SDimitry Andric                          ? B.buildFLog2(Ty, Src, Flags)
3234*06c3fb27SDimitry Andric                          : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}, false)
3235*06c3fb27SDimitry Andric                                .addUse(Src)
3236*06c3fb27SDimitry Andric                                .setMIFlags(Flags);
3237*06c3fb27SDimitry Andric   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
3238*06c3fb27SDimitry Andric   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
3239*06c3fb27SDimitry Andric   return true;
3240*06c3fb27SDimitry Andric }
3241*06c3fb27SDimitry Andric 
3242*06c3fb27SDimitry Andric bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI,
3243*06c3fb27SDimitry Andric                                         MachineIRBuilder &B) const {
3244*06c3fb27SDimitry Andric   // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3245*06c3fb27SDimitry Andric   // If we have to handle denormals, scale up the input and adjust the result.
3246*06c3fb27SDimitry Andric 
3247*06c3fb27SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
3248*06c3fb27SDimitry Andric   Register Src = MI.getOperand(1).getReg();
3249*06c3fb27SDimitry Andric   unsigned Flags = MI.getFlags();
3250*06c3fb27SDimitry Andric   LLT Ty = B.getMRI()->getType(Dst);
3251*06c3fb27SDimitry Andric   const LLT F16 = LLT::scalar(16);
3252*06c3fb27SDimitry Andric   const LLT F32 = LLT::scalar(32);
3253*06c3fb27SDimitry Andric 
3254*06c3fb27SDimitry Andric   if (Ty == F16) {
3255*06c3fb27SDimitry Andric     // Nothing in half is a denormal when promoted to f32.
3256*06c3fb27SDimitry Andric     auto Ext = B.buildFPExt(F32, Src, Flags);
3257*06c3fb27SDimitry Andric     auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32}, false)
3258*06c3fb27SDimitry Andric       .addUse(Ext.getReg(0))
3259*06c3fb27SDimitry Andric       .setMIFlags(Flags);
3260*06c3fb27SDimitry Andric     B.buildFPTrunc(Dst, Log2, Flags);
3261*06c3fb27SDimitry Andric     MI.eraseFromParent();
3262*06c3fb27SDimitry Andric     return true;
3263*06c3fb27SDimitry Andric   }
3264*06c3fb27SDimitry Andric 
3265*06c3fb27SDimitry Andric   assert(Ty == F32);
3266*06c3fb27SDimitry Andric 
3267*06c3fb27SDimitry Andric   if (allowApproxFunc(B.getMF(), Flags) ||
3268*06c3fb27SDimitry Andric       !needsDenormHandlingF32(B.getMF(), Src, Flags)) {
3269*06c3fb27SDimitry Andric     B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst}, false)
3270*06c3fb27SDimitry Andric       .addUse(Src)
3271*06c3fb27SDimitry Andric       .setMIFlags(Flags);
3272*06c3fb27SDimitry Andric     MI.eraseFromParent();
3273*06c3fb27SDimitry Andric     return true;
3274*06c3fb27SDimitry Andric   }
3275*06c3fb27SDimitry Andric 
3276*06c3fb27SDimitry Andric   // bool needs_scaling = x < -0x1.f80000p+6f;
3277*06c3fb27SDimitry Andric   // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
3278*06c3fb27SDimitry Andric 
3279*06c3fb27SDimitry Andric   // -nextafter(128.0, -1)
3280*06c3fb27SDimitry Andric   auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
3281*06c3fb27SDimitry Andric   auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src,
3282*06c3fb27SDimitry Andric                                   RangeCheckConst, Flags);
3283*06c3fb27SDimitry Andric 
3284*06c3fb27SDimitry Andric   auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
3285*06c3fb27SDimitry Andric   auto Zero = B.buildFConstant(Ty, 0.0);
3286*06c3fb27SDimitry Andric   auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
3287*06c3fb27SDimitry Andric   auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);
3288*06c3fb27SDimitry Andric 
3289*06c3fb27SDimitry Andric   auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty}, false)
3290*06c3fb27SDimitry Andric                   .addUse(AddInput.getReg(0))
3291*06c3fb27SDimitry Andric                   .setMIFlags(Flags);
3292*06c3fb27SDimitry Andric 
3293*06c3fb27SDimitry Andric   auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
3294*06c3fb27SDimitry Andric   auto One = B.buildFConstant(Ty, 1.0);
3295*06c3fb27SDimitry Andric   auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
3296*06c3fb27SDimitry Andric   B.buildFMul(Dst, Exp2, ResultScale, Flags);
3297*06c3fb27SDimitry Andric   MI.eraseFromParent();
3298*06c3fb27SDimitry Andric   return true;
3299*06c3fb27SDimitry Andric }
3300*06c3fb27SDimitry Andric 
3301*06c3fb27SDimitry Andric bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
3302*06c3fb27SDimitry Andric                                              Register Src,
3303*06c3fb27SDimitry Andric                                              unsigned Flags) const {
3304*06c3fb27SDimitry Andric   LLT Ty = B.getMRI()->getType(Dst);
3305*06c3fb27SDimitry Andric   auto K = B.buildFConstant(Ty, numbers::log2e);
3306*06c3fb27SDimitry Andric   auto Mul = B.buildFMul(Ty, Src, K, Flags);
3307*06c3fb27SDimitry Andric 
3308*06c3fb27SDimitry Andric   if (Ty == LLT::scalar(32)) {
3309*06c3fb27SDimitry Andric     B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst}, false)
3310*06c3fb27SDimitry Andric       .addUse(Mul.getReg(0))
3311*06c3fb27SDimitry Andric       .setMIFlags(Flags);
3312*06c3fb27SDimitry Andric   } else {
3313*06c3fb27SDimitry Andric     B.buildFExp2(Dst, Mul.getReg(0), Flags);
3314*06c3fb27SDimitry Andric   }
3315*06c3fb27SDimitry Andric 
3316*06c3fb27SDimitry Andric   return true;
3317*06c3fb27SDimitry Andric }
3318*06c3fb27SDimitry Andric 
33195ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
33205ffd83dbSDimitry Andric                                        MachineIRBuilder &B) const {
33215ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
3322*06c3fb27SDimitry Andric   Register X = MI.getOperand(1).getReg();
3323*06c3fb27SDimitry Andric   const unsigned Flags = MI.getFlags();
3324*06c3fb27SDimitry Andric   MachineFunction &MF = B.getMF();
3325*06c3fb27SDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
3326*06c3fb27SDimitry Andric   LLT Ty = MRI.getType(Dst);
3327*06c3fb27SDimitry Andric   const LLT F16 = LLT::scalar(16);
3328*06c3fb27SDimitry Andric   const LLT F32 = LLT::scalar(32);
3329*06c3fb27SDimitry Andric   const bool IsExp10 = false; // TODO: For some reason exp10 is missing
33305ffd83dbSDimitry Andric 
3331*06c3fb27SDimitry Andric   if (Ty == F16) {
3332*06c3fb27SDimitry Andric     // v_exp_f16 (fmul x, log2e)
3333*06c3fb27SDimitry Andric     if (allowApproxFunc(MF, Flags)) {
3334*06c3fb27SDimitry Andric       // TODO: Does this really require fast?
3335*06c3fb27SDimitry Andric       legalizeFExpUnsafe(B, Dst, X, Flags);
3336*06c3fb27SDimitry Andric       MI.eraseFromParent();
3337*06c3fb27SDimitry Andric       return true;
3338*06c3fb27SDimitry Andric     }
3339*06c3fb27SDimitry Andric 
3340*06c3fb27SDimitry Andric     // exp(f16 x) ->
3341*06c3fb27SDimitry Andric     //   fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3342*06c3fb27SDimitry Andric 
3343*06c3fb27SDimitry Andric     // Nothing in half is a denormal when promoted to f32.
3344*06c3fb27SDimitry Andric     auto Ext = B.buildFPExt(F32, X, Flags);
3345*06c3fb27SDimitry Andric     Register Lowered = MRI.createGenericVirtualRegister(F32);
3346*06c3fb27SDimitry Andric     legalizeFExpUnsafe(B, Lowered, Ext.getReg(0), Flags);
3347*06c3fb27SDimitry Andric     B.buildFPTrunc(Dst, Lowered, Flags);
3348*06c3fb27SDimitry Andric     MI.eraseFromParent();
3349*06c3fb27SDimitry Andric     return true;
3350*06c3fb27SDimitry Andric   }
3351*06c3fb27SDimitry Andric 
3352*06c3fb27SDimitry Andric   assert(Ty == F32);
3353*06c3fb27SDimitry Andric 
3354*06c3fb27SDimitry Andric   // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3355*06c3fb27SDimitry Andric   // library behavior. Also, is known-not-daz source sufficient?
3356*06c3fb27SDimitry Andric   if (allowApproxFunc(MF, Flags) && !needsDenormHandlingF32(MF, X, Flags)) {
3357*06c3fb27SDimitry Andric     legalizeFExpUnsafe(B, Dst, X, Flags);
3358*06c3fb27SDimitry Andric     MI.eraseFromParent();
3359*06c3fb27SDimitry Andric     return true;
3360*06c3fb27SDimitry Andric   }
3361*06c3fb27SDimitry Andric 
3362*06c3fb27SDimitry Andric   //    Algorithm:
3363*06c3fb27SDimitry Andric   //
3364*06c3fb27SDimitry Andric   //    e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3365*06c3fb27SDimitry Andric   //
3366*06c3fb27SDimitry Andric   //    x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3367*06c3fb27SDimitry Andric   //    n = 64*m + j,   0 <= j < 64
3368*06c3fb27SDimitry Andric   //
3369*06c3fb27SDimitry Andric   //    e^x = 2^((64*m + j + f)/64)
3370*06c3fb27SDimitry Andric   //        = (2^m) * (2^(j/64)) * 2^(f/64)
3371*06c3fb27SDimitry Andric   //        = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3372*06c3fb27SDimitry Andric   //
3373*06c3fb27SDimitry Andric   //    f = x*(64/ln(2)) - n
3374*06c3fb27SDimitry Andric   //    r = f*(ln(2)/64) = x - n*(ln(2)/64)
3375*06c3fb27SDimitry Andric   //
3376*06c3fb27SDimitry Andric   //    e^x = (2^m) * (2^(j/64)) * e^r
3377*06c3fb27SDimitry Andric   //
3378*06c3fb27SDimitry Andric   //    (2^(j/64)) is precomputed
3379*06c3fb27SDimitry Andric   //
3380*06c3fb27SDimitry Andric   //    e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3381*06c3fb27SDimitry Andric   //    e^r = 1 + q
3382*06c3fb27SDimitry Andric   //
3383*06c3fb27SDimitry Andric   //    q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3384*06c3fb27SDimitry Andric   //
3385*06c3fb27SDimitry Andric   //    e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
3386*06c3fb27SDimitry Andric   const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
3387*06c3fb27SDimitry Andric   Register PH, PL;
3388*06c3fb27SDimitry Andric 
3389*06c3fb27SDimitry Andric   if (ST.hasFastFMAF32()) {
3390*06c3fb27SDimitry Andric     const float c_exp = numbers::log2ef;
3391*06c3fb27SDimitry Andric     const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3392*06c3fb27SDimitry Andric     const float c_exp10 = 0x1.a934f0p+1f;
3393*06c3fb27SDimitry Andric     const float cc_exp10 = 0x1.2f346ep-24f;
3394*06c3fb27SDimitry Andric 
3395*06c3fb27SDimitry Andric     auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
3396*06c3fb27SDimitry Andric     PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
3397*06c3fb27SDimitry Andric     auto NegPH = B.buildFNeg(Ty, PH, Flags);
3398*06c3fb27SDimitry Andric     auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);
3399*06c3fb27SDimitry Andric 
3400*06c3fb27SDimitry Andric     auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
3401*06c3fb27SDimitry Andric     PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);
3402*06c3fb27SDimitry Andric   } else {
3403*06c3fb27SDimitry Andric     const float ch_exp = 0x1.714000p+0f;
3404*06c3fb27SDimitry Andric     const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3405*06c3fb27SDimitry Andric 
3406*06c3fb27SDimitry Andric     const float ch_exp10 = 0x1.a92000p+1f;
3407*06c3fb27SDimitry Andric     const float cl_exp10 = 0x1.4f0978p-11f;
3408*06c3fb27SDimitry Andric 
3409*06c3fb27SDimitry Andric     auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3410*06c3fb27SDimitry Andric     auto XH = B.buildAnd(Ty, X, MaskConst);
3411*06c3fb27SDimitry Andric     auto XL = B.buildFSub(Ty, X, XH, Flags);
3412*06c3fb27SDimitry Andric 
3413*06c3fb27SDimitry Andric     auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
3414*06c3fb27SDimitry Andric     PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);
3415*06c3fb27SDimitry Andric 
3416*06c3fb27SDimitry Andric     auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
3417*06c3fb27SDimitry Andric     auto XLCL = B.buildFMul(Ty, XL, CL, Flags);
3418*06c3fb27SDimitry Andric 
3419*06c3fb27SDimitry Andric     Register Mad0 =
3420*06c3fb27SDimitry Andric         getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
3421*06c3fb27SDimitry Andric     PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
3422*06c3fb27SDimitry Andric   }
3423*06c3fb27SDimitry Andric 
3424*06c3fb27SDimitry Andric   auto E = B.buildFRint(Ty, PH, Flags);
3425*06c3fb27SDimitry Andric 
3426*06c3fb27SDimitry Andric   // It is unsafe to contract this fsub into the PH multiply.
3427*06c3fb27SDimitry Andric   auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
3428*06c3fb27SDimitry Andric   auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);
3429*06c3fb27SDimitry Andric   auto IntE = B.buildFPTOSI(LLT::scalar(32), E);
3430*06c3fb27SDimitry Andric 
3431*06c3fb27SDimitry Andric   auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty}, false)
3432*06c3fb27SDimitry Andric                   .addUse(A.getReg(0))
3433*06c3fb27SDimitry Andric                   .setMIFlags(Flags);
3434*06c3fb27SDimitry Andric   auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);
3435*06c3fb27SDimitry Andric 
3436*06c3fb27SDimitry Andric   auto UnderflowCheckConst =
3437*06c3fb27SDimitry Andric       B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
3438*06c3fb27SDimitry Andric   auto Zero = B.buildFConstant(Ty, 0.0);
3439*06c3fb27SDimitry Andric   auto Underflow =
3440*06c3fb27SDimitry Andric       B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst);
3441*06c3fb27SDimitry Andric 
3442*06c3fb27SDimitry Andric   R = B.buildSelect(Ty, Underflow, Zero, R);
3443*06c3fb27SDimitry Andric 
3444*06c3fb27SDimitry Andric   const auto &Options = MF.getTarget().Options;
3445*06c3fb27SDimitry Andric 
3446*06c3fb27SDimitry Andric   if (!(Flags & MachineInstr::FmNoInfs) && !Options.NoInfsFPMath) {
3447*06c3fb27SDimitry Andric     auto OverflowCheckConst =
3448*06c3fb27SDimitry Andric         B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
3449*06c3fb27SDimitry Andric 
3450*06c3fb27SDimitry Andric     auto Overflow =
3451*06c3fb27SDimitry Andric         B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst);
3452*06c3fb27SDimitry Andric     auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3453*06c3fb27SDimitry Andric     R = B.buildSelect(Ty, Overflow, Inf, R, Flags);
3454*06c3fb27SDimitry Andric   }
3455*06c3fb27SDimitry Andric 
3456*06c3fb27SDimitry Andric   B.buildCopy(Dst, R);
34575ffd83dbSDimitry Andric   MI.eraseFromParent();
34585ffd83dbSDimitry Andric   return true;
34595ffd83dbSDimitry Andric }
34605ffd83dbSDimitry Andric 
34615ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
34625ffd83dbSDimitry Andric                                        MachineIRBuilder &B) const {
34635ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
34645ffd83dbSDimitry Andric   Register Src0 = MI.getOperand(1).getReg();
34655ffd83dbSDimitry Andric   Register Src1 = MI.getOperand(2).getReg();
34665ffd83dbSDimitry Andric   unsigned Flags = MI.getFlags();
34675ffd83dbSDimitry Andric   LLT Ty = B.getMRI()->getType(Dst);
34685ffd83dbSDimitry Andric   const LLT S16 = LLT::scalar(16);
34695ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
34705ffd83dbSDimitry Andric 
34715ffd83dbSDimitry Andric   if (Ty == S32) {
34725ffd83dbSDimitry Andric     auto Log = B.buildFLog2(S32, Src0, Flags);
34735ffd83dbSDimitry Andric     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
34745ffd83dbSDimitry Andric       .addUse(Log.getReg(0))
34755ffd83dbSDimitry Andric       .addUse(Src1)
34765ffd83dbSDimitry Andric       .setMIFlags(Flags);
34775ffd83dbSDimitry Andric     B.buildFExp2(Dst, Mul, Flags);
34785ffd83dbSDimitry Andric   } else if (Ty == S16) {
34795ffd83dbSDimitry Andric     // There's no f16 fmul_legacy, so we need to convert for it.
34805ffd83dbSDimitry Andric     auto Log = B.buildFLog2(S16, Src0, Flags);
34815ffd83dbSDimitry Andric     auto Ext0 = B.buildFPExt(S32, Log, Flags);
34825ffd83dbSDimitry Andric     auto Ext1 = B.buildFPExt(S32, Src1, Flags);
34835ffd83dbSDimitry Andric     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
34845ffd83dbSDimitry Andric       .addUse(Ext0.getReg(0))
34855ffd83dbSDimitry Andric       .addUse(Ext1.getReg(0))
34865ffd83dbSDimitry Andric       .setMIFlags(Flags);
34875ffd83dbSDimitry Andric 
34885ffd83dbSDimitry Andric     B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
34895ffd83dbSDimitry Andric   } else
34905ffd83dbSDimitry Andric     return false;
34915ffd83dbSDimitry Andric 
34925ffd83dbSDimitry Andric   MI.eraseFromParent();
34935ffd83dbSDimitry Andric   return true;
34945ffd83dbSDimitry Andric }
34955ffd83dbSDimitry Andric 
34965ffd83dbSDimitry Andric // Find a source register, ignoring any possible source modifiers.
34975ffd83dbSDimitry Andric static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
34985ffd83dbSDimitry Andric   Register ModSrc = OrigSrc;
34995ffd83dbSDimitry Andric   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
35005ffd83dbSDimitry Andric     ModSrc = SrcFNeg->getOperand(1).getReg();
35015ffd83dbSDimitry Andric     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
35025ffd83dbSDimitry Andric       ModSrc = SrcFAbs->getOperand(1).getReg();
35035ffd83dbSDimitry Andric   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
35045ffd83dbSDimitry Andric     ModSrc = SrcFAbs->getOperand(1).getReg();
35055ffd83dbSDimitry Andric   return ModSrc;
35065ffd83dbSDimitry Andric }
35075ffd83dbSDimitry Andric 
35085ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
35095ffd83dbSDimitry Andric                                          MachineRegisterInfo &MRI,
35105ffd83dbSDimitry Andric                                          MachineIRBuilder &B) const {
35115ffd83dbSDimitry Andric 
35125ffd83dbSDimitry Andric   const LLT S1 = LLT::scalar(1);
35135ffd83dbSDimitry Andric   const LLT S64 = LLT::scalar(64);
35145ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
35155ffd83dbSDimitry Andric   Register OrigSrc = MI.getOperand(1).getReg();
35165ffd83dbSDimitry Andric   unsigned Flags = MI.getFlags();
35175ffd83dbSDimitry Andric   assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
35185ffd83dbSDimitry Andric          "this should not have been custom lowered");
35195ffd83dbSDimitry Andric 
35205ffd83dbSDimitry Andric   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
35215ffd83dbSDimitry Andric   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
35225ffd83dbSDimitry Andric   // efficient way to implement it is using V_FRACT_F64. The workaround for the
35235ffd83dbSDimitry Andric   // V_FRACT bug is:
35245ffd83dbSDimitry Andric   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
35255ffd83dbSDimitry Andric   //
35265ffd83dbSDimitry Andric   // Convert floor(x) to (x - fract(x))
35275ffd83dbSDimitry Andric 
35285ffd83dbSDimitry Andric   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
35295ffd83dbSDimitry Andric     .addUse(OrigSrc)
35305ffd83dbSDimitry Andric     .setMIFlags(Flags);
35315ffd83dbSDimitry Andric 
35325ffd83dbSDimitry Andric   // Give source modifier matching some assistance before obscuring a foldable
35335ffd83dbSDimitry Andric   // pattern.
35345ffd83dbSDimitry Andric 
35355ffd83dbSDimitry Andric   // TODO: We can avoid the neg on the fract? The input sign to fract
35365ffd83dbSDimitry Andric   // shouldn't matter?
35375ffd83dbSDimitry Andric   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
35385ffd83dbSDimitry Andric 
3539*06c3fb27SDimitry Andric   auto Const =
3540*06c3fb27SDimitry Andric       B.buildFConstant(S64, llvm::bit_cast<double>(0x3fefffffffffffff));
35415ffd83dbSDimitry Andric 
35425ffd83dbSDimitry Andric   Register Min = MRI.createGenericVirtualRegister(S64);
35435ffd83dbSDimitry Andric 
35445ffd83dbSDimitry Andric   // We don't need to concern ourselves with the snan handling difference, so
35455ffd83dbSDimitry Andric   // use the one which will directly select.
35465ffd83dbSDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
35475ffd83dbSDimitry Andric   if (MFI->getMode().IEEE)
35485ffd83dbSDimitry Andric     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
35495ffd83dbSDimitry Andric   else
35505ffd83dbSDimitry Andric     B.buildFMinNum(Min, Fract, Const, Flags);
35515ffd83dbSDimitry Andric 
35525ffd83dbSDimitry Andric   Register CorrectedFract = Min;
35535ffd83dbSDimitry Andric   if (!MI.getFlag(MachineInstr::FmNoNans)) {
35545ffd83dbSDimitry Andric     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
35555ffd83dbSDimitry Andric     CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
35565ffd83dbSDimitry Andric   }
35575ffd83dbSDimitry Andric 
35585ffd83dbSDimitry Andric   auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
35595ffd83dbSDimitry Andric   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
35605ffd83dbSDimitry Andric 
35615ffd83dbSDimitry Andric   MI.eraseFromParent();
35625ffd83dbSDimitry Andric   return true;
35635ffd83dbSDimitry Andric }
35645ffd83dbSDimitry Andric 
35655ffd83dbSDimitry Andric // Turn an illegal packed v2s16 build vector into bit operations.
35665ffd83dbSDimitry Andric // TODO: This should probably be a bitcast action in LegalizerHelper.
35675ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeBuildVector(
35685ffd83dbSDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
35695ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
35705ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
3571bdd1243dSDimitry Andric   const LLT S16 = LLT::scalar(16);
3572fe6060f1SDimitry Andric   assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
35735ffd83dbSDimitry Andric 
35745ffd83dbSDimitry Andric   Register Src0 = MI.getOperand(1).getReg();
35755ffd83dbSDimitry Andric   Register Src1 = MI.getOperand(2).getReg();
35765ffd83dbSDimitry Andric 
3577bdd1243dSDimitry Andric   if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
3578bdd1243dSDimitry Andric     assert(MRI.getType(Src0) == S32);
3579bdd1243dSDimitry Andric     Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
3580bdd1243dSDimitry Andric     Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
3581bdd1243dSDimitry Andric   }
3582bdd1243dSDimitry Andric 
3583bdd1243dSDimitry Andric   auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
35845ffd83dbSDimitry Andric   B.buildBitcast(Dst, Merge);
35855ffd83dbSDimitry Andric 
35865ffd83dbSDimitry Andric   MI.eraseFromParent();
35875ffd83dbSDimitry Andric   return true;
35885ffd83dbSDimitry Andric }
35895ffd83dbSDimitry Andric 
359081ad6265SDimitry Andric // Build a big integer multiply or multiply-add using MAD_64_32 instructions.
359181ad6265SDimitry Andric //
359281ad6265SDimitry Andric // Source and accumulation registers must all be 32-bits.
359381ad6265SDimitry Andric //
359481ad6265SDimitry Andric // TODO: When the multiply is uniform, we should produce a code sequence
359581ad6265SDimitry Andric // that is better suited to instruction selection on the SALU. Instead of
359681ad6265SDimitry Andric // the outer loop going over parts of the result, the outer loop should go
359781ad6265SDimitry Andric // over parts of one of the factors. This should result in instruction
359881ad6265SDimitry Andric // selection that makes full use of S_ADDC_U32 instructions.
//
// Parameters:
//  - Helper supplies the MachineIRBuilder used to emit instructions and the
//    known-bits analysis used to detect factor parts that are known zero.
//  - Accum holds the 32-bit result parts, least significant first. It is
//    updated in place; entries may be null registers on entry.
//  - Src0 / Src1 hold the 32-bit parts of the two factors.
//    NOTE(review): the known-zero scan below indexes both by Src0.size(), and
//    the chains index them up to Accum.size() - 1, so all three are presumably
//    expected to have the same length -- confirm against callers.
//  - UsePartialMad64_32: when false, the most significant result part is
//    always computed with plain 32-bit multiplies; when true, that is only
//    done while carry-ins are pending.
//  - SeparateOddAlignedProducts: when true, the odd-aligned partial product
//    sums are computed into temporaries and then added into Accum with
//    explicit add / add-with-carry instructions.
3599*06c3fb27SDimitry Andric void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
3600*06c3fb27SDimitry Andric                                         MutableArrayRef<Register> Accum,
3601*06c3fb27SDimitry Andric                                         ArrayRef<Register> Src0,
3602*06c3fb27SDimitry Andric                                         ArrayRef<Register> Src1,
3603*06c3fb27SDimitry Andric                                         bool UsePartialMad64_32,
3604*06c3fb27SDimitry Andric                                         bool SeparateOddAlignedProducts) const {
360581ad6265SDimitry Andric   // Use (possibly empty) vectors of S1 registers to represent the set of
360681ad6265SDimitry Andric   // carries from one pair of positions to the next.
360781ad6265SDimitry Andric   using Carry = SmallVector<Register, 2>;
360881ad6265SDimitry Andric 
360981ad6265SDimitry Andric   MachineIRBuilder &B = Helper.MIRBuilder;
3610*06c3fb27SDimitry Andric   GISelKnownBits &KB = *Helper.getKnownBits();
361181ad6265SDimitry Andric 
361281ad6265SDimitry Andric   const LLT S1 = LLT::scalar(1);
361381ad6265SDimitry Andric   const LLT S32 = LLT::scalar(32);
361481ad6265SDimitry Andric   const LLT S64 = LLT::scalar(64);
361581ad6265SDimitry Andric 
  // Zero constants are materialized lazily, at most once each, by the two
  // lambdas below.
361681ad6265SDimitry Andric   Register Zero32;
361781ad6265SDimitry Andric   Register Zero64;
361881ad6265SDimitry Andric 
361981ad6265SDimitry Andric   auto getZero32 = [&]() -> Register {
362081ad6265SDimitry Andric     if (!Zero32)
362181ad6265SDimitry Andric       Zero32 = B.buildConstant(S32, 0).getReg(0);
362281ad6265SDimitry Andric     return Zero32;
362381ad6265SDimitry Andric   };
362481ad6265SDimitry Andric   auto getZero64 = [&]() -> Register {
362581ad6265SDimitry Andric     if (!Zero64)
362681ad6265SDimitry Andric       Zero64 = B.buildConstant(S64, 0).getReg(0);
362781ad6265SDimitry Andric     return Zero64;
362881ad6265SDimitry Andric   };
362981ad6265SDimitry Andric 
  // Record, for every 32-bit factor part, whether known-bits analysis proves
  // it to be zero, so that the corresponding partial products can be skipped.
3630*06c3fb27SDimitry Andric   SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
3631*06c3fb27SDimitry Andric   for (unsigned i = 0; i < Src0.size(); ++i) {
3632*06c3fb27SDimitry Andric     Src0KnownZeros.push_back(KB.getKnownBits(Src0[i]).isZero());
3633*06c3fb27SDimitry Andric     Src1KnownZeros.push_back(KB.getKnownBits(Src1[i]).isZero());
3634*06c3fb27SDimitry Andric   }
3635*06c3fb27SDimitry Andric 
363681ad6265SDimitry Andric   // Merge the given carries into the 32-bit LocalAccum, which is modified
363781ad6265SDimitry Andric   // in-place.
363881ad6265SDimitry Andric   //
363981ad6265SDimitry Andric   // Returns the carry-out, which is a single S1 register or null.
364081ad6265SDimitry Andric   auto mergeCarry =
364181ad6265SDimitry Andric       [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
364281ad6265SDimitry Andric         if (CarryIn.empty())
364381ad6265SDimitry Andric           return Register();
364481ad6265SDimitry Andric 
364581ad6265SDimitry Andric         bool HaveCarryOut = true;
364681ad6265SDimitry Andric         Register CarryAccum;
364781ad6265SDimitry Andric         if (CarryIn.size() == 1) {
          // With no accumulator yet, the single carry bit simply becomes the
          // accumulator and there is no carry-out.
364881ad6265SDimitry Andric           if (!LocalAccum) {
364981ad6265SDimitry Andric             LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
365081ad6265SDimitry Andric             return Register();
365181ad6265SDimitry Andric           }
365281ad6265SDimitry Andric 
365381ad6265SDimitry Andric           CarryAccum = getZero32();
365481ad6265SDimitry Andric         } else {
          // Sum all carries except the last into CarryAccum; the last carry
          // is consumed as the carry-in of the final add below.
365581ad6265SDimitry Andric           CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
365681ad6265SDimitry Andric           for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
365781ad6265SDimitry Andric             CarryAccum =
365881ad6265SDimitry Andric                 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
365981ad6265SDimitry Andric                     .getReg(0);
366081ad6265SDimitry Andric           }
366181ad6265SDimitry Andric 
          // A null accumulator is treated as zero; in that case the carry-out
          // of the final add is not reported to the caller.
366281ad6265SDimitry Andric           if (!LocalAccum) {
366381ad6265SDimitry Andric             LocalAccum = getZero32();
366481ad6265SDimitry Andric             HaveCarryOut = false;
366581ad6265SDimitry Andric           }
366681ad6265SDimitry Andric         }
366781ad6265SDimitry Andric 
366881ad6265SDimitry Andric         auto Add =
366981ad6265SDimitry Andric             B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
367081ad6265SDimitry Andric         LocalAccum = Add.getReg(0);
367181ad6265SDimitry Andric         return HaveCarryOut ? Add.getReg(1) : Register();
367281ad6265SDimitry Andric       };
367381ad6265SDimitry Andric 
367481ad6265SDimitry Andric   // Build a multiply-add chain to compute
367581ad6265SDimitry Andric   //
367681ad6265SDimitry Andric   //   LocalAccum + (partial products at DstIndex)
367781ad6265SDimitry Andric   //       + (opportunistic subset of CarryIn)
367881ad6265SDimitry Andric   //
367981ad6265SDimitry Andric   // LocalAccum is an array of one or two 32-bit registers that are updated
368081ad6265SDimitry Andric   // in-place. The incoming registers may be null.
368181ad6265SDimitry Andric   //
368281ad6265SDimitry Andric   // In some edge cases, carry-ins can be consumed "for free". In that case,
368381ad6265SDimitry Andric   // the consumed carry bits are removed from CarryIn in-place.
  //
  // Returns the carry-out bits produced by the 64-bit multiply-adds of the
  // chain (possibly none).
368481ad6265SDimitry Andric   auto buildMadChain =
368581ad6265SDimitry Andric       [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
368681ad6265SDimitry Andric           -> Carry {
368781ad6265SDimitry Andric         assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
368881ad6265SDimitry Andric                (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
368981ad6265SDimitry Andric 
369081ad6265SDimitry Andric         Carry CarryOut;
        // j0 indexes Src0; the matching Src1 index is j1 = DstIndex - j0.
369181ad6265SDimitry Andric         unsigned j0 = 0;
369281ad6265SDimitry Andric 
369381ad6265SDimitry Andric         // Use plain 32-bit multiplication for the most significant part of the
369481ad6265SDimitry Andric         // result by default.
369581ad6265SDimitry Andric         if (LocalAccum.size() == 1 &&
369681ad6265SDimitry Andric             (!UsePartialMad64_32 || !CarryIn.empty())) {
369781ad6265SDimitry Andric           do {
3698*06c3fb27SDimitry Andric             // Skip multiplication if one of the operands is 0
            // ('continue' in a do/while jumps to the loop condition, so j0 is
            // advanced before continuing.)
369981ad6265SDimitry Andric             unsigned j1 = DstIndex - j0;
3700*06c3fb27SDimitry Andric             if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
3701*06c3fb27SDimitry Andric               ++j0;
3702*06c3fb27SDimitry Andric               continue;
3703*06c3fb27SDimitry Andric             }
370481ad6265SDimitry Andric             auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
            // A missing or known-zero accumulator is replaced by the product
            // directly; otherwise the product is added, consuming one pending
            // carry-in as the add's carry input when one is available.
3705*06c3fb27SDimitry Andric             if (!LocalAccum[0] || KB.getKnownBits(LocalAccum[0]).isZero()) {
370681ad6265SDimitry Andric               LocalAccum[0] = Mul.getReg(0);
370781ad6265SDimitry Andric             } else {
370881ad6265SDimitry Andric               if (CarryIn.empty()) {
370981ad6265SDimitry Andric                 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
371081ad6265SDimitry Andric               } else {
371181ad6265SDimitry Andric                 LocalAccum[0] =
371281ad6265SDimitry Andric                     B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
371381ad6265SDimitry Andric                         .getReg(0);
371481ad6265SDimitry Andric                 CarryIn.pop_back();
371581ad6265SDimitry Andric               }
371681ad6265SDimitry Andric             }
371781ad6265SDimitry Andric             ++j0;
371881ad6265SDimitry Andric           } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
371981ad6265SDimitry Andric         }
372081ad6265SDimitry Andric 
372181ad6265SDimitry Andric         // Build full 64-bit multiplies.
372281ad6265SDimitry Andric         if (j0 <= DstIndex) {
          // HaveSmallAccum is set when the 64-bit accumulator Tmp was formed
          // from at most a single 32-bit value (or is zero). The carry-out of
          // the first multiply-add onto such an accumulator is not recorded.
372381ad6265SDimitry Andric           bool HaveSmallAccum = false;
372481ad6265SDimitry Andric           Register Tmp;
372581ad6265SDimitry Andric 
372681ad6265SDimitry Andric           if (LocalAccum[0]) {
372781ad6265SDimitry Andric             if (LocalAccum.size() == 1) {
372881ad6265SDimitry Andric               Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
372981ad6265SDimitry Andric               HaveSmallAccum = true;
373081ad6265SDimitry Andric             } else if (LocalAccum[1]) {
3731bdd1243dSDimitry Andric               Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
373281ad6265SDimitry Andric               HaveSmallAccum = false;
373381ad6265SDimitry Andric             } else {
373481ad6265SDimitry Andric               Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
373581ad6265SDimitry Andric               HaveSmallAccum = true;
373681ad6265SDimitry Andric             }
373781ad6265SDimitry Andric           } else {
373881ad6265SDimitry Andric             assert(LocalAccum.size() == 1 || !LocalAccum[1]);
373981ad6265SDimitry Andric             Tmp = getZero64();
374081ad6265SDimitry Andric             HaveSmallAccum = true;
374181ad6265SDimitry Andric           }
374281ad6265SDimitry Andric 
374381ad6265SDimitry Andric           do {
374481ad6265SDimitry Andric             unsigned j1 = DstIndex - j0;
            // Skip partial products with a known-zero factor part.
3745*06c3fb27SDimitry Andric             if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
3746*06c3fb27SDimitry Andric               ++j0;
3747*06c3fb27SDimitry Andric               continue;
3748*06c3fb27SDimitry Andric             }
374981ad6265SDimitry Andric             auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
375081ad6265SDimitry Andric                                     {Src0[j0], Src1[j1], Tmp});
375181ad6265SDimitry Andric             Tmp = Mad.getReg(0);
375281ad6265SDimitry Andric             if (!HaveSmallAccum)
375381ad6265SDimitry Andric               CarryOut.push_back(Mad.getReg(1));
375481ad6265SDimitry Andric             HaveSmallAccum = false;
3755*06c3fb27SDimitry Andric 
375681ad6265SDimitry Andric             ++j0;
375781ad6265SDimitry Andric           } while (j0 <= DstIndex);
375881ad6265SDimitry Andric 
          // Split the 64-bit sum back into 32-bit parts; the high half is
          // only kept when LocalAccum has room for it.
375981ad6265SDimitry Andric           auto Unmerge = B.buildUnmerge(S32, Tmp);
376081ad6265SDimitry Andric           LocalAccum[0] = Unmerge.getReg(0);
376181ad6265SDimitry Andric           if (LocalAccum.size() > 1)
376281ad6265SDimitry Andric             LocalAccum[1] = Unmerge.getReg(1);
376381ad6265SDimitry Andric         }
376481ad6265SDimitry Andric 
376581ad6265SDimitry Andric         return CarryOut;
376681ad6265SDimitry Andric       };
376781ad6265SDimitry Andric 
376881ad6265SDimitry Andric   // Outer multiply loop, iterating over destination parts from least
376981ad6265SDimitry Andric   // significant to most significant parts.
377081ad6265SDimitry Andric   //
377181ad6265SDimitry Andric   // The columns of the following diagram correspond to the destination parts
377281ad6265SDimitry Andric   // affected by one iteration of the outer loop (ignoring boundary
377381ad6265SDimitry Andric   // conditions).
377481ad6265SDimitry Andric   //
377581ad6265SDimitry Andric   //   Dest index relative to 2 * i:      1 0 -1
377681ad6265SDimitry Andric   //                                      ------
377781ad6265SDimitry Andric   //   Carries from previous iteration:     e o
377881ad6265SDimitry Andric   //   Even-aligned partial product sum:  E E .
377981ad6265SDimitry Andric   //   Odd-aligned partial product sum:     O O
378081ad6265SDimitry Andric   //
378181ad6265SDimitry Andric   // 'o' is OddCarry, 'e' is EvenCarry.
378281ad6265SDimitry Andric   // EE and OO are computed from partial products via buildMadChain and use
378381ad6265SDimitry Andric   // accumulation where possible and appropriate.
378481ad6265SDimitry Andric   //
378581ad6265SDimitry Andric   Register SeparateOddCarry;
378681ad6265SDimitry Andric   Carry EvenCarry;
378781ad6265SDimitry Andric   Carry OddCarry;
378881ad6265SDimitry Andric 
  // Note the inclusive bound: in the last iteration there may be no
  // even-aligned part left (2 * i == Accum.size()), but index 2 * i - 1 and
  // the pending carries are still processed.
378981ad6265SDimitry Andric   for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
379081ad6265SDimitry Andric     Carry OddCarryIn = std::move(OddCarry);
379181ad6265SDimitry Andric     Carry EvenCarryIn = std::move(EvenCarry);
379281ad6265SDimitry Andric     OddCarry.clear();
379381ad6265SDimitry Andric     EvenCarry.clear();
379481ad6265SDimitry Andric 
379581ad6265SDimitry Andric     // Partial products at offset 2 * i.
379681ad6265SDimitry Andric     if (2 * i < Accum.size()) {
379781ad6265SDimitry Andric       auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
379881ad6265SDimitry Andric       EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
379981ad6265SDimitry Andric     }
380081ad6265SDimitry Andric 
380181ad6265SDimitry Andric     // Partial products at offset 2 * i - 1.
380281ad6265SDimitry Andric     if (i > 0) {
380381ad6265SDimitry Andric       if (!SeparateOddAlignedProducts) {
380481ad6265SDimitry Andric         auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
380581ad6265SDimitry Andric         OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
380681ad6265SDimitry Andric       } else {
        // Accumulate the odd-aligned sum into fresh temporaries, then add it
        // into Accum with an explicit add / add-with-carry chain.
380781ad6265SDimitry Andric         bool IsHighest = 2 * i >= Accum.size();
380881ad6265SDimitry Andric         Register SeparateOddOut[2];
3809bdd1243dSDimitry Andric         auto LocalAccum = MutableArrayRef(SeparateOddOut)
381081ad6265SDimitry Andric                               .take_front(IsHighest ? 1 : 2);
381181ad6265SDimitry Andric         OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
381281ad6265SDimitry Andric 
381381ad6265SDimitry Andric         MachineInstr *Lo;
381481ad6265SDimitry Andric 
        // The first odd position has no incoming SeparateOddCarry; later
        // positions chain on the carry produced by the previous Hi add.
381581ad6265SDimitry Andric         if (i == 1) {
381681ad6265SDimitry Andric           if (!IsHighest)
381781ad6265SDimitry Andric             Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
381881ad6265SDimitry Andric           else
381981ad6265SDimitry Andric             Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
382081ad6265SDimitry Andric         } else {
382181ad6265SDimitry Andric           Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
382281ad6265SDimitry Andric                             SeparateOddCarry);
382381ad6265SDimitry Andric         }
382481ad6265SDimitry Andric         Accum[2 * i - 1] = Lo->getOperand(0).getReg();
382581ad6265SDimitry Andric 
382681ad6265SDimitry Andric         if (!IsHighest) {
382781ad6265SDimitry Andric           auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
382881ad6265SDimitry Andric                                 Lo->getOperand(1).getReg());
382981ad6265SDimitry Andric           Accum[2 * i] = Hi.getReg(0);
383081ad6265SDimitry Andric           SeparateOddCarry = Hi.getReg(1);
383181ad6265SDimitry Andric         }
383281ad6265SDimitry Andric       }
383381ad6265SDimitry Andric     }
383481ad6265SDimitry Andric 
383581ad6265SDimitry Andric     // Add in the carries from the previous iteration
383681ad6265SDimitry Andric     if (i > 0) {
      // A carry out of position 2 * i - 1 feeds position 2 * i, and a carry
      // out of position 2 * i is handed to the next iteration via OddCarry.
383781ad6265SDimitry Andric       if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
383881ad6265SDimitry Andric         EvenCarryIn.push_back(CarryOut);
383981ad6265SDimitry Andric 
384081ad6265SDimitry Andric       if (2 * i < Accum.size()) {
384181ad6265SDimitry Andric         if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
384281ad6265SDimitry Andric           OddCarry.push_back(CarryOut);
384381ad6265SDimitry Andric       }
384481ad6265SDimitry Andric     }
384581ad6265SDimitry Andric   }
384681ad6265SDimitry Andric }
384781ad6265SDimitry Andric 
384881ad6265SDimitry Andric // Custom narrowing of wide multiplies using wide multiply-add instructions.
384981ad6265SDimitry Andric //
385081ad6265SDimitry Andric // TODO: If the multiply is followed by an addition, we should attempt to
385181ad6265SDimitry Andric // integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
385281ad6265SDimitry Andric bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
385381ad6265SDimitry Andric                                       MachineInstr &MI) const {
385481ad6265SDimitry Andric   assert(ST.hasMad64_32());
385581ad6265SDimitry Andric   assert(MI.getOpcode() == TargetOpcode::G_MUL);
385681ad6265SDimitry Andric 
385781ad6265SDimitry Andric   MachineIRBuilder &B = Helper.MIRBuilder;
385881ad6265SDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
385981ad6265SDimitry Andric 
386081ad6265SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
386181ad6265SDimitry Andric   Register Src0 = MI.getOperand(1).getReg();
386281ad6265SDimitry Andric   Register Src1 = MI.getOperand(2).getReg();
386381ad6265SDimitry Andric 
386481ad6265SDimitry Andric   LLT Ty = MRI.getType(DstReg);
386581ad6265SDimitry Andric   assert(Ty.isScalar());
386681ad6265SDimitry Andric 
386781ad6265SDimitry Andric   unsigned Size = Ty.getSizeInBits();
386881ad6265SDimitry Andric   unsigned NumParts = Size / 32;
386981ad6265SDimitry Andric   assert((Size % 32) == 0);
387081ad6265SDimitry Andric   assert(NumParts >= 2);
387181ad6265SDimitry Andric 
387281ad6265SDimitry Andric   // Whether to use MAD_64_32 for partial products whose high half is
387381ad6265SDimitry Andric   // discarded. This avoids some ADD instructions but risks false dependency
387481ad6265SDimitry Andric   // stalls on some subtargets in some cases.
387581ad6265SDimitry Andric   const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
387681ad6265SDimitry Andric 
387781ad6265SDimitry Andric   // Whether to compute odd-aligned partial products separately. This is
387881ad6265SDimitry Andric   // advisable on subtargets where the accumulator of MAD_64_32 must be placed
387981ad6265SDimitry Andric   // in an even-aligned VGPR.
388081ad6265SDimitry Andric   const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
388181ad6265SDimitry Andric 
388281ad6265SDimitry Andric   LLT S32 = LLT::scalar(32);
388381ad6265SDimitry Andric   SmallVector<Register, 2> Src0Parts, Src1Parts;
388481ad6265SDimitry Andric   for (unsigned i = 0; i < NumParts; ++i) {
388581ad6265SDimitry Andric     Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
388681ad6265SDimitry Andric     Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
388781ad6265SDimitry Andric   }
388881ad6265SDimitry Andric   B.buildUnmerge(Src0Parts, Src0);
388981ad6265SDimitry Andric   B.buildUnmerge(Src1Parts, Src1);
389081ad6265SDimitry Andric 
389181ad6265SDimitry Andric   SmallVector<Register, 2> AccumRegs(NumParts);
389281ad6265SDimitry Andric   buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
389381ad6265SDimitry Andric                 SeparateOddAlignedProducts);
389481ad6265SDimitry Andric 
3895bdd1243dSDimitry Andric   B.buildMergeLikeInstr(DstReg, AccumRegs);
389681ad6265SDimitry Andric   MI.eraseFromParent();
389781ad6265SDimitry Andric   return true;
389881ad6265SDimitry Andric }
389981ad6265SDimitry Andric 
3900349cc55cSDimitry Andric // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
3901349cc55cSDimitry Andric // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
3902349cc55cSDimitry Andric // case with a single min instruction instead of a compare+select.
3903349cc55cSDimitry Andric bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
3904349cc55cSDimitry Andric                                             MachineRegisterInfo &MRI,
3905349cc55cSDimitry Andric                                             MachineIRBuilder &B) const {
3906349cc55cSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
3907349cc55cSDimitry Andric   Register Src = MI.getOperand(1).getReg();
3908349cc55cSDimitry Andric   LLT DstTy = MRI.getType(Dst);
3909349cc55cSDimitry Andric   LLT SrcTy = MRI.getType(Src);
3910349cc55cSDimitry Andric 
3911349cc55cSDimitry Andric   unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
3912349cc55cSDimitry Andric                         ? AMDGPU::G_AMDGPU_FFBH_U32
3913349cc55cSDimitry Andric                         : AMDGPU::G_AMDGPU_FFBL_B32;
3914349cc55cSDimitry Andric   auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
3915349cc55cSDimitry Andric   B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
3916349cc55cSDimitry Andric 
3917349cc55cSDimitry Andric   MI.eraseFromParent();
3918349cc55cSDimitry Andric   return true;
3919349cc55cSDimitry Andric }
3920349cc55cSDimitry Andric 
3921e8d8bef9SDimitry Andric // Check that this is a G_XOR x, -1
3922e8d8bef9SDimitry Andric static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
3923e8d8bef9SDimitry Andric   if (MI.getOpcode() != TargetOpcode::G_XOR)
3924e8d8bef9SDimitry Andric     return false;
3925349cc55cSDimitry Andric   auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
3926e8d8bef9SDimitry Andric   return ConstVal && *ConstVal == -1;
3927e8d8bef9SDimitry Andric }
3928e8d8bef9SDimitry Andric 
39290b57cec5SDimitry Andric // Return the use branch instruction, otherwise null if the usage is invalid.
3930e8d8bef9SDimitry Andric static MachineInstr *
3931e8d8bef9SDimitry Andric verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
3932e8d8bef9SDimitry Andric                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
39330b57cec5SDimitry Andric   Register CondDef = MI.getOperand(0).getReg();
39340b57cec5SDimitry Andric   if (!MRI.hasOneNonDBGUse(CondDef))
39350b57cec5SDimitry Andric     return nullptr;
39360b57cec5SDimitry Andric 
39375ffd83dbSDimitry Andric   MachineBasicBlock *Parent = MI.getParent();
3938e8d8bef9SDimitry Andric   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
3939e8d8bef9SDimitry Andric 
3940e8d8bef9SDimitry Andric   if (isNot(MRI, *UseMI)) {
3941e8d8bef9SDimitry Andric     Register NegatedCond = UseMI->getOperand(0).getReg();
3942e8d8bef9SDimitry Andric     if (!MRI.hasOneNonDBGUse(NegatedCond))
3943e8d8bef9SDimitry Andric       return nullptr;
3944e8d8bef9SDimitry Andric 
3945e8d8bef9SDimitry Andric     // We're deleting the def of this value, so we need to remove it.
3946349cc55cSDimitry Andric     eraseInstr(*UseMI, MRI);
3947e8d8bef9SDimitry Andric 
3948e8d8bef9SDimitry Andric     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
3949e8d8bef9SDimitry Andric     Negated = true;
3950e8d8bef9SDimitry Andric   }
3951e8d8bef9SDimitry Andric 
3952e8d8bef9SDimitry Andric   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
3953480093f4SDimitry Andric     return nullptr;
3954480093f4SDimitry Andric 
39555ffd83dbSDimitry Andric   // Make sure the cond br is followed by a G_BR, or is the last instruction.
3956e8d8bef9SDimitry Andric   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
39575ffd83dbSDimitry Andric   if (Next == Parent->end()) {
39585ffd83dbSDimitry Andric     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
39595ffd83dbSDimitry Andric     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
39605ffd83dbSDimitry Andric       return nullptr;
39615ffd83dbSDimitry Andric     UncondBrTarget = &*NextMBB;
39625ffd83dbSDimitry Andric   } else {
3963480093f4SDimitry Andric     if (Next->getOpcode() != AMDGPU::G_BR)
3964480093f4SDimitry Andric       return nullptr;
3965480093f4SDimitry Andric     Br = &*Next;
39665ffd83dbSDimitry Andric     UncondBrTarget = Br->getOperand(0).getMBB();
3967480093f4SDimitry Andric   }
3968480093f4SDimitry Andric 
3969e8d8bef9SDimitry Andric   return UseMI;
39700b57cec5SDimitry Andric }
39710b57cec5SDimitry Andric 
39720b57cec5SDimitry Andric bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
3973e8d8bef9SDimitry Andric                                          const ArgDescriptor *Arg,
3974e8d8bef9SDimitry Andric                                          const TargetRegisterClass *ArgRC,
3975e8d8bef9SDimitry Andric                                          LLT ArgTy) const {
3976e8d8bef9SDimitry Andric   MCRegister SrcReg = Arg->getRegister();
3977e8d8bef9SDimitry Andric   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
39785ffd83dbSDimitry Andric   assert(DstReg.isVirtual() && "Virtual register expected");
39790b57cec5SDimitry Andric 
398004eeddc0SDimitry Andric   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
398104eeddc0SDimitry Andric                                              *ArgRC, B.getDebugLoc(), ArgTy);
39820b57cec5SDimitry Andric   if (Arg->isMasked()) {
39830b57cec5SDimitry Andric     // TODO: Should we try to emit this once in the entry block?
39840b57cec5SDimitry Andric     const LLT S32 = LLT::scalar(32);
39850b57cec5SDimitry Andric     const unsigned Mask = Arg->getMask();
3986*06c3fb27SDimitry Andric     const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
39870b57cec5SDimitry Andric 
39888bcb0991SDimitry Andric     Register AndMaskSrc = LiveIn;
39898bcb0991SDimitry Andric 
399004eeddc0SDimitry Andric     // TODO: Avoid clearing the high bits if we know workitem id y/z are always
399104eeddc0SDimitry Andric     // 0.
39928bcb0991SDimitry Andric     if (Shift != 0) {
39930b57cec5SDimitry Andric       auto ShiftAmt = B.buildConstant(S32, Shift);
39948bcb0991SDimitry Andric       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
39958bcb0991SDimitry Andric     }
39968bcb0991SDimitry Andric 
39978bcb0991SDimitry Andric     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
39985ffd83dbSDimitry Andric   } else {
39990b57cec5SDimitry Andric     B.buildCopy(DstReg, LiveIn);
40000b57cec5SDimitry Andric   }
40010b57cec5SDimitry Andric 
40020b57cec5SDimitry Andric   return true;
40030b57cec5SDimitry Andric }
40040b57cec5SDimitry Andric 
4005e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::loadInputValue(
4006e8d8bef9SDimitry Andric     Register DstReg, MachineIRBuilder &B,
4007e8d8bef9SDimitry Andric     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4008e8d8bef9SDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4009e8d8bef9SDimitry Andric   const ArgDescriptor *Arg;
4010e8d8bef9SDimitry Andric   const TargetRegisterClass *ArgRC;
4011e8d8bef9SDimitry Andric   LLT ArgTy;
4012e8d8bef9SDimitry Andric   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4013e8d8bef9SDimitry Andric 
4014349cc55cSDimitry Andric   if (!Arg) {
4015349cc55cSDimitry Andric     if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
4016349cc55cSDimitry Andric       // The intrinsic may appear when we have a 0 sized kernarg segment, in which
4017349cc55cSDimitry Andric       // case the pointer argument may be missing and we use null.
4018349cc55cSDimitry Andric       B.buildConstant(DstReg, 0);
4019349cc55cSDimitry Andric       return true;
4020349cc55cSDimitry Andric     }
4021349cc55cSDimitry Andric 
4022349cc55cSDimitry Andric     // It's undefined behavior if a function marked with the amdgpu-no-*
4023349cc55cSDimitry Andric     // attributes uses the corresponding intrinsic.
4024349cc55cSDimitry Andric     B.buildUndef(DstReg);
4025349cc55cSDimitry Andric     return true;
4026349cc55cSDimitry Andric   }
4027349cc55cSDimitry Andric 
4028e8d8bef9SDimitry Andric   if (!Arg->isRegister() || !Arg->getRegister().isValid())
4029e8d8bef9SDimitry Andric     return false; // TODO: Handle these
4030e8d8bef9SDimitry Andric   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
4031e8d8bef9SDimitry Andric }
4032e8d8bef9SDimitry Andric 
40330b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
40345ffd83dbSDimitry Andric     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
40350b57cec5SDimitry Andric     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4036e8d8bef9SDimitry Andric   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
40375ffd83dbSDimitry Andric     return false;
40385ffd83dbSDimitry Andric 
40390b57cec5SDimitry Andric   MI.eraseFromParent();
40400b57cec5SDimitry Andric   return true;
40410b57cec5SDimitry Andric }
40420b57cec5SDimitry Andric 
404381ad6265SDimitry Andric static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
404481ad6265SDimitry Andric                                 int64_t C) {
404581ad6265SDimitry Andric   B.buildConstant(MI.getOperand(0).getReg(), C);
404681ad6265SDimitry Andric   MI.eraseFromParent();
404781ad6265SDimitry Andric   return true;
404881ad6265SDimitry Andric }
404981ad6265SDimitry Andric 
405081ad6265SDimitry Andric bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
405181ad6265SDimitry Andric     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
405281ad6265SDimitry Andric     unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
405381ad6265SDimitry Andric   unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
405481ad6265SDimitry Andric   if (MaxID == 0)
405581ad6265SDimitry Andric     return replaceWithConstant(B, MI, 0);
405681ad6265SDimitry Andric 
405781ad6265SDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
405881ad6265SDimitry Andric   const ArgDescriptor *Arg;
405981ad6265SDimitry Andric   const TargetRegisterClass *ArgRC;
406081ad6265SDimitry Andric   LLT ArgTy;
406181ad6265SDimitry Andric   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
406281ad6265SDimitry Andric 
406381ad6265SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
406481ad6265SDimitry Andric   if (!Arg) {
406581ad6265SDimitry Andric     // It's undefined behavior if a function marked with the amdgpu-no-*
406681ad6265SDimitry Andric     // attributes uses the corresponding intrinsic.
406781ad6265SDimitry Andric     B.buildUndef(DstReg);
406881ad6265SDimitry Andric     MI.eraseFromParent();
406981ad6265SDimitry Andric     return true;
407081ad6265SDimitry Andric   }
407181ad6265SDimitry Andric 
407281ad6265SDimitry Andric   if (Arg->isMasked()) {
407381ad6265SDimitry Andric     // Don't bother inserting AssertZext for packed IDs since we're emitting the
407481ad6265SDimitry Andric     // masking operations anyway.
407581ad6265SDimitry Andric     //
407681ad6265SDimitry Andric     // TODO: We could assert the top bit is 0 for the source copy.
407781ad6265SDimitry Andric     if (!loadInputValue(DstReg, B, ArgType))
407881ad6265SDimitry Andric       return false;
407981ad6265SDimitry Andric   } else {
408081ad6265SDimitry Andric     Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
408181ad6265SDimitry Andric     if (!loadInputValue(TmpReg, B, ArgType))
408281ad6265SDimitry Andric       return false;
4083bdd1243dSDimitry Andric     B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID));
408481ad6265SDimitry Andric   }
408581ad6265SDimitry Andric 
408681ad6265SDimitry Andric   MI.eraseFromParent();
408781ad6265SDimitry Andric   return true;
408881ad6265SDimitry Andric }
408981ad6265SDimitry Andric 
409081ad6265SDimitry Andric Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
409181ad6265SDimitry Andric                                                      int64_t Offset) const {
409281ad6265SDimitry Andric   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
409381ad6265SDimitry Andric   Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
409481ad6265SDimitry Andric 
409581ad6265SDimitry Andric   // TODO: If we passed in the base kernel offset we could have a better
409681ad6265SDimitry Andric   // alignment than 4, but we don't really need it.
409781ad6265SDimitry Andric   if (!loadInputValue(KernArgReg, B,
409881ad6265SDimitry Andric                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
409981ad6265SDimitry Andric     llvm_unreachable("failed to find kernarg segment ptr");
410081ad6265SDimitry Andric 
410181ad6265SDimitry Andric   auto COffset = B.buildConstant(LLT::scalar(64), Offset);
410281ad6265SDimitry Andric   // TODO: Should get nuw
410381ad6265SDimitry Andric   return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
410481ad6265SDimitry Andric }
410581ad6265SDimitry Andric 
410681ad6265SDimitry Andric /// Legalize a value that's loaded from kernel arguments. This is only used by
410781ad6265SDimitry Andric /// legacy intrinsics.
410881ad6265SDimitry Andric bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
410981ad6265SDimitry Andric                                                       MachineIRBuilder &B,
411081ad6265SDimitry Andric                                                       uint64_t Offset,
411181ad6265SDimitry Andric                                                       Align Alignment) const {
411281ad6265SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
411381ad6265SDimitry Andric 
411481ad6265SDimitry Andric   assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
411581ad6265SDimitry Andric          "unexpected kernarg parameter type");
411681ad6265SDimitry Andric 
411781ad6265SDimitry Andric   Register Ptr = getKernargParameterPtr(B, Offset);
411881ad6265SDimitry Andric   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
411981ad6265SDimitry Andric   B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
412081ad6265SDimitry Andric               MachineMemOperand::MODereferenceable |
412181ad6265SDimitry Andric                   MachineMemOperand::MOInvariant);
412281ad6265SDimitry Andric   MI.eraseFromParent();
412381ad6265SDimitry Andric   return true;
412481ad6265SDimitry Andric }
412581ad6265SDimitry Andric 
41268bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
41278bcb0991SDimitry Andric                                        MachineRegisterInfo &MRI,
41288bcb0991SDimitry Andric                                        MachineIRBuilder &B) const {
4129480093f4SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
4130480093f4SDimitry Andric   LLT DstTy = MRI.getType(Dst);
4131480093f4SDimitry Andric   LLT S16 = LLT::scalar(16);
4132480093f4SDimitry Andric   LLT S32 = LLT::scalar(32);
4133480093f4SDimitry Andric   LLT S64 = LLT::scalar(64);
41348bcb0991SDimitry Andric 
4135480093f4SDimitry Andric   if (DstTy == S16)
4136480093f4SDimitry Andric     return legalizeFDIV16(MI, MRI, B);
4137480093f4SDimitry Andric   if (DstTy == S32)
4138480093f4SDimitry Andric     return legalizeFDIV32(MI, MRI, B);
4139480093f4SDimitry Andric   if (DstTy == S64)
4140480093f4SDimitry Andric     return legalizeFDIV64(MI, MRI, B);
4141480093f4SDimitry Andric 
41428bcb0991SDimitry Andric   return false;
41438bcb0991SDimitry Andric }
41448bcb0991SDimitry Andric 
4145fe6060f1SDimitry Andric void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
4146fe6060f1SDimitry Andric                                                         Register DstDivReg,
4147fe6060f1SDimitry Andric                                                         Register DstRemReg,
41485ffd83dbSDimitry Andric                                                         Register X,
4149fe6060f1SDimitry Andric                                                         Register Y) const {
41505ffd83dbSDimitry Andric   const LLT S1 = LLT::scalar(1);
41515ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
41525ffd83dbSDimitry Andric 
41535ffd83dbSDimitry Andric   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
41545ffd83dbSDimitry Andric   // algorithm used here.
41555ffd83dbSDimitry Andric 
41565ffd83dbSDimitry Andric   // Initial estimate of inv(y).
41575ffd83dbSDimitry Andric   auto FloatY = B.buildUITOFP(S32, Y);
41585ffd83dbSDimitry Andric   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
4159*06c3fb27SDimitry Andric   auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
41605ffd83dbSDimitry Andric   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
41615ffd83dbSDimitry Andric   auto Z = B.buildFPTOUI(S32, ScaledY);
41625ffd83dbSDimitry Andric 
41635ffd83dbSDimitry Andric   // One round of UNR.
41645ffd83dbSDimitry Andric   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
41655ffd83dbSDimitry Andric   auto NegYZ = B.buildMul(S32, NegY, Z);
41665ffd83dbSDimitry Andric   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
41675ffd83dbSDimitry Andric 
41685ffd83dbSDimitry Andric   // Quotient/remainder estimate.
41695ffd83dbSDimitry Andric   auto Q = B.buildUMulH(S32, X, Z);
41705ffd83dbSDimitry Andric   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
41715ffd83dbSDimitry Andric 
41725ffd83dbSDimitry Andric   // First quotient/remainder refinement.
41735ffd83dbSDimitry Andric   auto One = B.buildConstant(S32, 1);
41745ffd83dbSDimitry Andric   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4175fe6060f1SDimitry Andric   if (DstDivReg)
41765ffd83dbSDimitry Andric     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
41775ffd83dbSDimitry Andric   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
41785ffd83dbSDimitry Andric 
41795ffd83dbSDimitry Andric   // Second quotient/remainder refinement.
41805ffd83dbSDimitry Andric   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4181fe6060f1SDimitry Andric   if (DstDivReg)
4182fe6060f1SDimitry Andric     B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
41835ffd83dbSDimitry Andric 
4184fe6060f1SDimitry Andric   if (DstRemReg)
4185fe6060f1SDimitry Andric     B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
41865ffd83dbSDimitry Andric }
41875ffd83dbSDimitry Andric 
4188349cc55cSDimitry Andric // Build integer reciprocal sequence around V_RCP_IFLAG_F32
41895ffd83dbSDimitry Andric //
41905ffd83dbSDimitry Andric // Return lo, hi of result
41915ffd83dbSDimitry Andric //
41925ffd83dbSDimitry Andric // %cvt.lo = G_UITOFP Val.lo
41935ffd83dbSDimitry Andric // %cvt.hi = G_UITOFP Val.hi
41945ffd83dbSDimitry Andric // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
41955ffd83dbSDimitry Andric // %rcp = G_AMDGPU_RCP_IFLAG %mad
41965ffd83dbSDimitry Andric // %mul1 = G_FMUL %rcp, 0x5f7ffffc
41975ffd83dbSDimitry Andric // %mul2 = G_FMUL %mul1, 2**(-32)
41985ffd83dbSDimitry Andric // %trunc = G_INTRINSIC_TRUNC %mul2
41995ffd83dbSDimitry Andric // %mad2 = G_FMAD %trunc, -(2**32), %mul1
42005ffd83dbSDimitry Andric // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
42015ffd83dbSDimitry Andric static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
42025ffd83dbSDimitry Andric                                                        Register Val) {
42035ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
42045ffd83dbSDimitry Andric   auto Unmerge = B.buildUnmerge(S32, Val);
42055ffd83dbSDimitry Andric 
42065ffd83dbSDimitry Andric   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
42075ffd83dbSDimitry Andric   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
42085ffd83dbSDimitry Andric 
4209*06c3fb27SDimitry Andric   auto Mad = B.buildFMAD(
4210*06c3fb27SDimitry Andric       S32, CvtHi, // 2**32
4211*06c3fb27SDimitry Andric       B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);
42125ffd83dbSDimitry Andric 
42135ffd83dbSDimitry Andric   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
4214*06c3fb27SDimitry Andric   auto Mul1 = B.buildFMul(
4215*06c3fb27SDimitry Andric       S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc)));
42165ffd83dbSDimitry Andric 
42175ffd83dbSDimitry Andric   // 2**(-32)
4218*06c3fb27SDimitry Andric   auto Mul2 = B.buildFMul(
4219*06c3fb27SDimitry Andric       S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000)));
42205ffd83dbSDimitry Andric   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
42215ffd83dbSDimitry Andric 
42225ffd83dbSDimitry Andric   // -(2**32)
4223*06c3fb27SDimitry Andric   auto Mad2 = B.buildFMAD(
4224*06c3fb27SDimitry Andric       S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)),
4225*06c3fb27SDimitry Andric       Mul1);
42265ffd83dbSDimitry Andric 
42275ffd83dbSDimitry Andric   auto ResultLo = B.buildFPTOUI(S32, Mad2);
42285ffd83dbSDimitry Andric   auto ResultHi = B.buildFPTOUI(S32, Trunc);
42295ffd83dbSDimitry Andric 
42305ffd83dbSDimitry Andric   return {ResultLo.getReg(0), ResultHi.getReg(0)};
42315ffd83dbSDimitry Andric }
42325ffd83dbSDimitry Andric 
4233fe6060f1SDimitry Andric void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
4234fe6060f1SDimitry Andric                                                         Register DstDivReg,
4235fe6060f1SDimitry Andric                                                         Register DstRemReg,
42365ffd83dbSDimitry Andric                                                         Register Numer,
4237fe6060f1SDimitry Andric                                                         Register Denom) const {
42385ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
42395ffd83dbSDimitry Andric   const LLT S64 = LLT::scalar(64);
42405ffd83dbSDimitry Andric   const LLT S1 = LLT::scalar(1);
42415ffd83dbSDimitry Andric   Register RcpLo, RcpHi;
42425ffd83dbSDimitry Andric 
42435ffd83dbSDimitry Andric   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
42445ffd83dbSDimitry Andric 
4245bdd1243dSDimitry Andric   auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});
42465ffd83dbSDimitry Andric 
42475ffd83dbSDimitry Andric   auto Zero64 = B.buildConstant(S64, 0);
42485ffd83dbSDimitry Andric   auto NegDenom = B.buildSub(S64, Zero64, Denom);
42495ffd83dbSDimitry Andric 
42505ffd83dbSDimitry Andric   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
42515ffd83dbSDimitry Andric   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
42525ffd83dbSDimitry Andric 
42535ffd83dbSDimitry Andric   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
42545ffd83dbSDimitry Andric   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
42555ffd83dbSDimitry Andric   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
42565ffd83dbSDimitry Andric 
42575ffd83dbSDimitry Andric   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
42585ffd83dbSDimitry Andric   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
4259bdd1243dSDimitry Andric   auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});
42605ffd83dbSDimitry Andric 
42615ffd83dbSDimitry Andric   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
42625ffd83dbSDimitry Andric   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
42635ffd83dbSDimitry Andric   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
42645ffd83dbSDimitry Andric   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
42655ffd83dbSDimitry Andric   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
42665ffd83dbSDimitry Andric 
42675ffd83dbSDimitry Andric   auto Zero32 = B.buildConstant(S32, 0);
42685ffd83dbSDimitry Andric   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
4269349cc55cSDimitry Andric   auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
4270bdd1243dSDimitry Andric   auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});
42715ffd83dbSDimitry Andric 
42725ffd83dbSDimitry Andric   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
42735ffd83dbSDimitry Andric   Register NumerLo = UnmergeNumer.getReg(0);
42745ffd83dbSDimitry Andric   Register NumerHi = UnmergeNumer.getReg(1);
42755ffd83dbSDimitry Andric 
42765ffd83dbSDimitry Andric   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
42775ffd83dbSDimitry Andric   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
42785ffd83dbSDimitry Andric   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
42795ffd83dbSDimitry Andric   Register Mul3_Lo = UnmergeMul3.getReg(0);
42805ffd83dbSDimitry Andric   Register Mul3_Hi = UnmergeMul3.getReg(1);
42815ffd83dbSDimitry Andric   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
42825ffd83dbSDimitry Andric   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
42835ffd83dbSDimitry Andric   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
4284bdd1243dSDimitry Andric   auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});
42855ffd83dbSDimitry Andric 
42865ffd83dbSDimitry Andric   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
42875ffd83dbSDimitry Andric   Register DenomLo = UnmergeDenom.getReg(0);
42885ffd83dbSDimitry Andric   Register DenomHi = UnmergeDenom.getReg(1);
42895ffd83dbSDimitry Andric 
42905ffd83dbSDimitry Andric   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
42915ffd83dbSDimitry Andric   auto C1 = B.buildSExt(S32, CmpHi);
42925ffd83dbSDimitry Andric 
42935ffd83dbSDimitry Andric   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
42945ffd83dbSDimitry Andric   auto C2 = B.buildSExt(S32, CmpLo);
42955ffd83dbSDimitry Andric 
42965ffd83dbSDimitry Andric   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
42975ffd83dbSDimitry Andric   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
42985ffd83dbSDimitry Andric 
42995ffd83dbSDimitry Andric   // TODO: Here and below portions of the code can be enclosed into if/endif.
43005ffd83dbSDimitry Andric   // Currently control flow is unconditional and we have 4 selects after
43015ffd83dbSDimitry Andric   // potential endif to substitute PHIs.
43025ffd83dbSDimitry Andric 
43035ffd83dbSDimitry Andric   // if C3 != 0 ...
43045ffd83dbSDimitry Andric   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
43055ffd83dbSDimitry Andric   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
43065ffd83dbSDimitry Andric   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
4307bdd1243dSDimitry Andric   auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});
43085ffd83dbSDimitry Andric 
43095ffd83dbSDimitry Andric   auto One64 = B.buildConstant(S64, 1);
43105ffd83dbSDimitry Andric   auto Add3 = B.buildAdd(S64, MulHi3, One64);
43115ffd83dbSDimitry Andric 
43125ffd83dbSDimitry Andric   auto C4 =
43135ffd83dbSDimitry Andric       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
43145ffd83dbSDimitry Andric   auto C5 =
43155ffd83dbSDimitry Andric       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
43165ffd83dbSDimitry Andric   auto C6 = B.buildSelect(
43175ffd83dbSDimitry Andric       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
43185ffd83dbSDimitry Andric 
43195ffd83dbSDimitry Andric   // if (C6 != 0)
43205ffd83dbSDimitry Andric   auto Add4 = B.buildAdd(S64, Add3, One64);
43215ffd83dbSDimitry Andric   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
43225ffd83dbSDimitry Andric 
43235ffd83dbSDimitry Andric   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
43245ffd83dbSDimitry Andric   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
4325bdd1243dSDimitry Andric   auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});
43265ffd83dbSDimitry Andric 
43275ffd83dbSDimitry Andric   // endif C6
43285ffd83dbSDimitry Andric   // endif C3
43295ffd83dbSDimitry Andric 
4330fe6060f1SDimitry Andric   if (DstDivReg) {
43315ffd83dbSDimitry Andric     auto Sel1 = B.buildSelect(
43325ffd83dbSDimitry Andric         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
4333fe6060f1SDimitry Andric     B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
4334fe6060f1SDimitry Andric                   Sel1, MulHi3);
4335fe6060f1SDimitry Andric   }
4336fe6060f1SDimitry Andric 
4337fe6060f1SDimitry Andric   if (DstRemReg) {
43385ffd83dbSDimitry Andric     auto Sel2 = B.buildSelect(
43395ffd83dbSDimitry Andric         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
4340fe6060f1SDimitry Andric     B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
4341fe6060f1SDimitry Andric                   Sel2, Sub1);
43425ffd83dbSDimitry Andric   }
43435ffd83dbSDimitry Andric }
43445ffd83dbSDimitry Andric 
4345fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
43465ffd83dbSDimitry Andric                                                   MachineRegisterInfo &MRI,
43475ffd83dbSDimitry Andric                                                   MachineIRBuilder &B) const {
4348fe6060f1SDimitry Andric   Register DstDivReg, DstRemReg;
4349fe6060f1SDimitry Andric   switch (MI.getOpcode()) {
4350fe6060f1SDimitry Andric   default:
4351fe6060f1SDimitry Andric     llvm_unreachable("Unexpected opcode!");
4352fe6060f1SDimitry Andric   case AMDGPU::G_UDIV: {
4353fe6060f1SDimitry Andric     DstDivReg = MI.getOperand(0).getReg();
4354fe6060f1SDimitry Andric     break;
4355fe6060f1SDimitry Andric   }
4356fe6060f1SDimitry Andric   case AMDGPU::G_UREM: {
4357fe6060f1SDimitry Andric     DstRemReg = MI.getOperand(0).getReg();
4358fe6060f1SDimitry Andric     break;
4359fe6060f1SDimitry Andric   }
4360fe6060f1SDimitry Andric   case AMDGPU::G_UDIVREM: {
4361fe6060f1SDimitry Andric     DstDivReg = MI.getOperand(0).getReg();
4362fe6060f1SDimitry Andric     DstRemReg = MI.getOperand(1).getReg();
4363fe6060f1SDimitry Andric     break;
4364fe6060f1SDimitry Andric   }
4365fe6060f1SDimitry Andric   }
4366fe6060f1SDimitry Andric 
43675ffd83dbSDimitry Andric   const LLT S64 = LLT::scalar(64);
43685ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
4369fe6060f1SDimitry Andric   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4370fe6060f1SDimitry Andric   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
4371fe6060f1SDimitry Andric   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
4372fe6060f1SDimitry Andric   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
43735ffd83dbSDimitry Andric 
43745ffd83dbSDimitry Andric   if (Ty == S32)
4375fe6060f1SDimitry Andric     legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
43765ffd83dbSDimitry Andric   else if (Ty == S64)
4377fe6060f1SDimitry Andric     legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
43785ffd83dbSDimitry Andric   else
43795ffd83dbSDimitry Andric     return false;
43805ffd83dbSDimitry Andric 
43815ffd83dbSDimitry Andric   MI.eraseFromParent();
43825ffd83dbSDimitry Andric   return true;
43835ffd83dbSDimitry Andric }
43845ffd83dbSDimitry Andric 
4385fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
43865ffd83dbSDimitry Andric                                                 MachineRegisterInfo &MRI,
43875ffd83dbSDimitry Andric                                                 MachineIRBuilder &B) const {
43885ffd83dbSDimitry Andric   const LLT S64 = LLT::scalar(64);
43895ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
43905ffd83dbSDimitry Andric 
4391fe6060f1SDimitry Andric   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
43925ffd83dbSDimitry Andric   if (Ty != S32 && Ty != S64)
43935ffd83dbSDimitry Andric     return false;
43945ffd83dbSDimitry Andric 
4395fe6060f1SDimitry Andric   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4396fe6060f1SDimitry Andric   Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
4397fe6060f1SDimitry Andric   Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
43985ffd83dbSDimitry Andric 
43995ffd83dbSDimitry Andric   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
44005ffd83dbSDimitry Andric   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
44015ffd83dbSDimitry Andric   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
44025ffd83dbSDimitry Andric 
44035ffd83dbSDimitry Andric   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
44045ffd83dbSDimitry Andric   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
44055ffd83dbSDimitry Andric 
44065ffd83dbSDimitry Andric   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
44075ffd83dbSDimitry Andric   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
44085ffd83dbSDimitry Andric 
4409fe6060f1SDimitry Andric   Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
4410fe6060f1SDimitry Andric   switch (MI.getOpcode()) {
4411fe6060f1SDimitry Andric   default:
4412fe6060f1SDimitry Andric     llvm_unreachable("Unexpected opcode!");
4413fe6060f1SDimitry Andric   case AMDGPU::G_SDIV: {
4414fe6060f1SDimitry Andric     DstDivReg = MI.getOperand(0).getReg();
4415fe6060f1SDimitry Andric     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4416fe6060f1SDimitry Andric     break;
4417fe6060f1SDimitry Andric   }
4418fe6060f1SDimitry Andric   case AMDGPU::G_SREM: {
4419fe6060f1SDimitry Andric     DstRemReg = MI.getOperand(0).getReg();
4420fe6060f1SDimitry Andric     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4421fe6060f1SDimitry Andric     break;
4422fe6060f1SDimitry Andric   }
4423fe6060f1SDimitry Andric   case AMDGPU::G_SDIVREM: {
4424fe6060f1SDimitry Andric     DstDivReg = MI.getOperand(0).getReg();
4425fe6060f1SDimitry Andric     DstRemReg = MI.getOperand(1).getReg();
4426fe6060f1SDimitry Andric     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4427fe6060f1SDimitry Andric     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4428fe6060f1SDimitry Andric     break;
4429fe6060f1SDimitry Andric   }
4430fe6060f1SDimitry Andric   }
4431fe6060f1SDimitry Andric 
44325ffd83dbSDimitry Andric   if (Ty == S32)
4433fe6060f1SDimitry Andric     legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
44345ffd83dbSDimitry Andric   else
4435fe6060f1SDimitry Andric     legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
44365ffd83dbSDimitry Andric 
4437fe6060f1SDimitry Andric   if (DstDivReg) {
4438fe6060f1SDimitry Andric     auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
4439fe6060f1SDimitry Andric     auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
4440fe6060f1SDimitry Andric     B.buildSub(DstDivReg, SignXor, Sign);
4441fe6060f1SDimitry Andric   }
44425ffd83dbSDimitry Andric 
4443fe6060f1SDimitry Andric   if (DstRemReg) {
4444fe6060f1SDimitry Andric     auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
4445fe6060f1SDimitry Andric     auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
4446fe6060f1SDimitry Andric     B.buildSub(DstRemReg, SignXor, Sign);
4447fe6060f1SDimitry Andric   }
44485ffd83dbSDimitry Andric 
44495ffd83dbSDimitry Andric   MI.eraseFromParent();
44505ffd83dbSDimitry Andric   return true;
44515ffd83dbSDimitry Andric }
44525ffd83dbSDimitry Andric 
44538bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
44548bcb0991SDimitry Andric                                                  MachineRegisterInfo &MRI,
44558bcb0991SDimitry Andric                                                  MachineIRBuilder &B) const {
44568bcb0991SDimitry Andric   Register Res = MI.getOperand(0).getReg();
44578bcb0991SDimitry Andric   Register LHS = MI.getOperand(1).getReg();
44588bcb0991SDimitry Andric   Register RHS = MI.getOperand(2).getReg();
44598bcb0991SDimitry Andric   uint16_t Flags = MI.getFlags();
44608bcb0991SDimitry Andric   LLT ResTy = MRI.getType(Res);
44618bcb0991SDimitry Andric 
44628bcb0991SDimitry Andric   const MachineFunction &MF = B.getMF();
4463*06c3fb27SDimitry Andric   bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn) ||
4464*06c3fb27SDimitry Andric                             MF.getTarget().Options.UnsafeFPMath;
44658bcb0991SDimitry Andric 
44668bcb0991SDimitry Andric   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
4467*06c3fb27SDimitry Andric     if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
4468*06c3fb27SDimitry Andric       return false;
4469*06c3fb27SDimitry Andric 
4470*06c3fb27SDimitry Andric     // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
4471*06c3fb27SDimitry Andric     // the CI documentation has a worst case error of 1 ulp.
4472*06c3fb27SDimitry Andric     // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
4473*06c3fb27SDimitry Andric     // use it as long as we aren't trying to use denormals.
4474*06c3fb27SDimitry Andric     //
4475*06c3fb27SDimitry Andric     // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
4476*06c3fb27SDimitry Andric 
44778bcb0991SDimitry Andric     // 1 / x -> RCP(x)
44788bcb0991SDimitry Andric     if (CLHS->isExactlyValue(1.0)) {
44798bcb0991SDimitry Andric       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
44808bcb0991SDimitry Andric         .addUse(RHS)
44818bcb0991SDimitry Andric         .setMIFlags(Flags);
44828bcb0991SDimitry Andric 
44838bcb0991SDimitry Andric       MI.eraseFromParent();
44848bcb0991SDimitry Andric       return true;
44858bcb0991SDimitry Andric     }
44868bcb0991SDimitry Andric 
4487*06c3fb27SDimitry Andric     // TODO: Match rsq
4488*06c3fb27SDimitry Andric 
44898bcb0991SDimitry Andric     // -1 / x -> RCP( FNEG(x) )
44908bcb0991SDimitry Andric     if (CLHS->isExactlyValue(-1.0)) {
44918bcb0991SDimitry Andric       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
44928bcb0991SDimitry Andric       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
44938bcb0991SDimitry Andric         .addUse(FNeg.getReg(0))
44948bcb0991SDimitry Andric         .setMIFlags(Flags);
44958bcb0991SDimitry Andric 
44968bcb0991SDimitry Andric       MI.eraseFromParent();
44978bcb0991SDimitry Andric       return true;
44988bcb0991SDimitry Andric     }
44998bcb0991SDimitry Andric   }
45008bcb0991SDimitry Andric 
4501*06c3fb27SDimitry Andric   // For f16 require arcp only.
4502*06c3fb27SDimitry Andric   // For f32 require afn+arcp.
4503*06c3fb27SDimitry Andric   if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
4504*06c3fb27SDimitry Andric                               !MI.getFlag(MachineInstr::FmArcp)))
4505*06c3fb27SDimitry Andric     return false;
4506*06c3fb27SDimitry Andric 
45078bcb0991SDimitry Andric   // x / y -> x * (1.0 / y)
45088bcb0991SDimitry Andric   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
45098bcb0991SDimitry Andric     .addUse(RHS)
45108bcb0991SDimitry Andric     .setMIFlags(Flags);
45118bcb0991SDimitry Andric   B.buildFMul(Res, LHS, RCP, Flags);
45128bcb0991SDimitry Andric 
45138bcb0991SDimitry Andric   MI.eraseFromParent();
45148bcb0991SDimitry Andric   return true;
45158bcb0991SDimitry Andric }
45168bcb0991SDimitry Andric 
4517e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
4518e8d8bef9SDimitry Andric                                                    MachineRegisterInfo &MRI,
4519e8d8bef9SDimitry Andric                                                    MachineIRBuilder &B) const {
4520e8d8bef9SDimitry Andric   Register Res = MI.getOperand(0).getReg();
4521e8d8bef9SDimitry Andric   Register X = MI.getOperand(1).getReg();
4522e8d8bef9SDimitry Andric   Register Y = MI.getOperand(2).getReg();
4523e8d8bef9SDimitry Andric   uint16_t Flags = MI.getFlags();
4524e8d8bef9SDimitry Andric   LLT ResTy = MRI.getType(Res);
4525e8d8bef9SDimitry Andric 
4526e8d8bef9SDimitry Andric   const MachineFunction &MF = B.getMF();
4527e8d8bef9SDimitry Andric   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
4528e8d8bef9SDimitry Andric                             MI.getFlag(MachineInstr::FmAfn);
4529e8d8bef9SDimitry Andric 
4530e8d8bef9SDimitry Andric   if (!AllowInaccurateRcp)
45318bcb0991SDimitry Andric     return false;
4532e8d8bef9SDimitry Andric 
4533e8d8bef9SDimitry Andric   auto NegY = B.buildFNeg(ResTy, Y);
4534e8d8bef9SDimitry Andric   auto One = B.buildFConstant(ResTy, 1.0);
4535e8d8bef9SDimitry Andric 
4536e8d8bef9SDimitry Andric   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
4537e8d8bef9SDimitry Andric     .addUse(Y)
4538e8d8bef9SDimitry Andric     .setMIFlags(Flags);
4539e8d8bef9SDimitry Andric 
4540e8d8bef9SDimitry Andric   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
4541e8d8bef9SDimitry Andric   R = B.buildFMA(ResTy, Tmp0, R, R);
4542e8d8bef9SDimitry Andric 
4543e8d8bef9SDimitry Andric   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
4544e8d8bef9SDimitry Andric   R = B.buildFMA(ResTy, Tmp1, R, R);
4545e8d8bef9SDimitry Andric 
4546e8d8bef9SDimitry Andric   auto Ret = B.buildFMul(ResTy, X, R);
4547e8d8bef9SDimitry Andric   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
4548e8d8bef9SDimitry Andric 
4549e8d8bef9SDimitry Andric   B.buildFMA(Res, Tmp2, R, Ret);
4550e8d8bef9SDimitry Andric   MI.eraseFromParent();
4551e8d8bef9SDimitry Andric   return true;
45528bcb0991SDimitry Andric }
45538bcb0991SDimitry Andric 
4554480093f4SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
4555480093f4SDimitry Andric                                          MachineRegisterInfo &MRI,
4556480093f4SDimitry Andric                                          MachineIRBuilder &B) const {
4557e8d8bef9SDimitry Andric   if (legalizeFastUnsafeFDIV(MI, MRI, B))
4558e8d8bef9SDimitry Andric     return true;
4559e8d8bef9SDimitry Andric 
4560480093f4SDimitry Andric   Register Res = MI.getOperand(0).getReg();
4561480093f4SDimitry Andric   Register LHS = MI.getOperand(1).getReg();
4562480093f4SDimitry Andric   Register RHS = MI.getOperand(2).getReg();
4563480093f4SDimitry Andric 
4564480093f4SDimitry Andric   uint16_t Flags = MI.getFlags();
4565480093f4SDimitry Andric 
4566480093f4SDimitry Andric   LLT S16 = LLT::scalar(16);
4567480093f4SDimitry Andric   LLT S32 = LLT::scalar(32);
4568480093f4SDimitry Andric 
4569480093f4SDimitry Andric   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
4570480093f4SDimitry Andric   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
4571480093f4SDimitry Andric 
4572480093f4SDimitry Andric   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
4573480093f4SDimitry Andric     .addUse(RHSExt.getReg(0))
4574480093f4SDimitry Andric     .setMIFlags(Flags);
4575480093f4SDimitry Andric 
4576480093f4SDimitry Andric   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
4577480093f4SDimitry Andric   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
4578480093f4SDimitry Andric 
4579480093f4SDimitry Andric   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
4580480093f4SDimitry Andric     .addUse(RDst.getReg(0))
4581480093f4SDimitry Andric     .addUse(RHS)
4582480093f4SDimitry Andric     .addUse(LHS)
4583480093f4SDimitry Andric     .setMIFlags(Flags);
4584480093f4SDimitry Andric 
4585480093f4SDimitry Andric   MI.eraseFromParent();
4586480093f4SDimitry Andric   return true;
4587480093f4SDimitry Andric }
4588480093f4SDimitry Andric 
4589480093f4SDimitry Andric // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
4590480093f4SDimitry Andric // to enable denorm mode. When 'Enable' is false, disable denorm mode.
4591*06c3fb27SDimitry Andric static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B,
4592480093f4SDimitry Andric                                const GCNSubtarget &ST,
4593*06c3fb27SDimitry Andric                                SIModeRegisterDefaults Mode) {
4594480093f4SDimitry Andric   // Set SP denorm mode to this value.
4595480093f4SDimitry Andric   unsigned SPDenormMode =
45965ffd83dbSDimitry Andric     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
4597480093f4SDimitry Andric 
4598480093f4SDimitry Andric   if (ST.hasDenormModeInst()) {
4599480093f4SDimitry Andric     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
46005ffd83dbSDimitry Andric     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
4601480093f4SDimitry Andric 
46025ffd83dbSDimitry Andric     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
4603480093f4SDimitry Andric     B.buildInstr(AMDGPU::S_DENORM_MODE)
4604480093f4SDimitry Andric       .addImm(NewDenormModeValue);
4605480093f4SDimitry Andric 
4606480093f4SDimitry Andric   } else {
4607480093f4SDimitry Andric     // Select FP32 bit field in mode register.
4608480093f4SDimitry Andric     unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
4609480093f4SDimitry Andric                                     (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
4610480093f4SDimitry Andric                                     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
4611480093f4SDimitry Andric 
4612480093f4SDimitry Andric     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
4613480093f4SDimitry Andric       .addImm(SPDenormMode)
4614480093f4SDimitry Andric       .addImm(SPDenormModeBitField);
4615480093f4SDimitry Andric   }
4616480093f4SDimitry Andric }
4617480093f4SDimitry Andric 
4618480093f4SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
4619480093f4SDimitry Andric                                          MachineRegisterInfo &MRI,
4620480093f4SDimitry Andric                                          MachineIRBuilder &B) const {
4621e8d8bef9SDimitry Andric   if (legalizeFastUnsafeFDIV(MI, MRI, B))
4622e8d8bef9SDimitry Andric     return true;
4623e8d8bef9SDimitry Andric 
4624480093f4SDimitry Andric   Register Res = MI.getOperand(0).getReg();
4625480093f4SDimitry Andric   Register LHS = MI.getOperand(1).getReg();
4626480093f4SDimitry Andric   Register RHS = MI.getOperand(2).getReg();
4627480093f4SDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4628*06c3fb27SDimitry Andric   SIModeRegisterDefaults Mode = MFI->getMode();
4629480093f4SDimitry Andric 
4630480093f4SDimitry Andric   uint16_t Flags = MI.getFlags();
4631480093f4SDimitry Andric 
4632480093f4SDimitry Andric   LLT S32 = LLT::scalar(32);
4633480093f4SDimitry Andric   LLT S1 = LLT::scalar(1);
4634480093f4SDimitry Andric 
4635480093f4SDimitry Andric   auto One = B.buildFConstant(S32, 1.0f);
4636480093f4SDimitry Andric 
4637480093f4SDimitry Andric   auto DenominatorScaled =
4638480093f4SDimitry Andric     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
4639480093f4SDimitry Andric       .addUse(LHS)
46405ffd83dbSDimitry Andric       .addUse(RHS)
46415ffd83dbSDimitry Andric       .addImm(0)
4642480093f4SDimitry Andric       .setMIFlags(Flags);
4643480093f4SDimitry Andric   auto NumeratorScaled =
4644480093f4SDimitry Andric     B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
4645480093f4SDimitry Andric       .addUse(LHS)
4646480093f4SDimitry Andric       .addUse(RHS)
46475ffd83dbSDimitry Andric       .addImm(1)
4648480093f4SDimitry Andric       .setMIFlags(Flags);
4649480093f4SDimitry Andric 
4650480093f4SDimitry Andric   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
4651480093f4SDimitry Andric     .addUse(DenominatorScaled.getReg(0))
4652480093f4SDimitry Andric     .setMIFlags(Flags);
4653480093f4SDimitry Andric   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
4654480093f4SDimitry Andric 
4655480093f4SDimitry Andric   // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
4656480093f4SDimitry Andric   // aren't modeled as reading it.
4657*06c3fb27SDimitry Andric   if (Mode.FP32Denormals != DenormalMode::getIEEE())
4658480093f4SDimitry Andric     toggleSPDenormMode(true, B, ST, Mode);
4659480093f4SDimitry Andric 
4660480093f4SDimitry Andric   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
4661480093f4SDimitry Andric   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
4662480093f4SDimitry Andric   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
4663480093f4SDimitry Andric   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
4664480093f4SDimitry Andric   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
4665480093f4SDimitry Andric   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
4666480093f4SDimitry Andric 
4667*06c3fb27SDimitry Andric   // FIXME: This mishandles dynamic denormal mode. We need to query the
4668*06c3fb27SDimitry Andric   // current mode and restore the original.
4669*06c3fb27SDimitry Andric   if (Mode.FP32Denormals != DenormalMode::getIEEE())
4670480093f4SDimitry Andric     toggleSPDenormMode(false, B, ST, Mode);
4671480093f4SDimitry Andric 
4672480093f4SDimitry Andric   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
4673480093f4SDimitry Andric     .addUse(Fma4.getReg(0))
4674480093f4SDimitry Andric     .addUse(Fma1.getReg(0))
4675480093f4SDimitry Andric     .addUse(Fma3.getReg(0))
4676480093f4SDimitry Andric     .addUse(NumeratorScaled.getReg(1))
4677480093f4SDimitry Andric     .setMIFlags(Flags);
4678480093f4SDimitry Andric 
4679480093f4SDimitry Andric   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
4680480093f4SDimitry Andric     .addUse(Fmas.getReg(0))
4681480093f4SDimitry Andric     .addUse(RHS)
4682480093f4SDimitry Andric     .addUse(LHS)
4683480093f4SDimitry Andric     .setMIFlags(Flags);
4684480093f4SDimitry Andric 
4685480093f4SDimitry Andric   MI.eraseFromParent();
4686480093f4SDimitry Andric   return true;
4687480093f4SDimitry Andric }
4688480093f4SDimitry Andric 
4689480093f4SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
4690480093f4SDimitry Andric                                          MachineRegisterInfo &MRI,
4691480093f4SDimitry Andric                                          MachineIRBuilder &B) const {
4692e8d8bef9SDimitry Andric   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
4693e8d8bef9SDimitry Andric     return true;
4694e8d8bef9SDimitry Andric 
4695480093f4SDimitry Andric   Register Res = MI.getOperand(0).getReg();
4696480093f4SDimitry Andric   Register LHS = MI.getOperand(1).getReg();
4697480093f4SDimitry Andric   Register RHS = MI.getOperand(2).getReg();
4698480093f4SDimitry Andric 
4699480093f4SDimitry Andric   uint16_t Flags = MI.getFlags();
4700480093f4SDimitry Andric 
4701480093f4SDimitry Andric   LLT S64 = LLT::scalar(64);
4702480093f4SDimitry Andric   LLT S1 = LLT::scalar(1);
4703480093f4SDimitry Andric 
4704480093f4SDimitry Andric   auto One = B.buildFConstant(S64, 1.0);
4705480093f4SDimitry Andric 
4706480093f4SDimitry Andric   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
4707480093f4SDimitry Andric     .addUse(LHS)
4708480093f4SDimitry Andric     .addUse(RHS)
47095ffd83dbSDimitry Andric     .addImm(0)
4710480093f4SDimitry Andric     .setMIFlags(Flags);
4711480093f4SDimitry Andric 
4712480093f4SDimitry Andric   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
4713480093f4SDimitry Andric 
4714480093f4SDimitry Andric   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
4715480093f4SDimitry Andric     .addUse(DivScale0.getReg(0))
4716480093f4SDimitry Andric     .setMIFlags(Flags);
4717480093f4SDimitry Andric 
4718480093f4SDimitry Andric   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
4719480093f4SDimitry Andric   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
4720480093f4SDimitry Andric   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
4721480093f4SDimitry Andric 
4722480093f4SDimitry Andric   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
4723480093f4SDimitry Andric     .addUse(LHS)
4724480093f4SDimitry Andric     .addUse(RHS)
47255ffd83dbSDimitry Andric     .addImm(1)
4726480093f4SDimitry Andric     .setMIFlags(Flags);
4727480093f4SDimitry Andric 
4728480093f4SDimitry Andric   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
47295ffd83dbSDimitry Andric   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
4730480093f4SDimitry Andric   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
4731480093f4SDimitry Andric 
4732480093f4SDimitry Andric   Register Scale;
4733480093f4SDimitry Andric   if (!ST.hasUsableDivScaleConditionOutput()) {
4734480093f4SDimitry Andric     // Workaround a hardware bug on SI where the condition output from div_scale
4735480093f4SDimitry Andric     // is not usable.
4736480093f4SDimitry Andric 
4737480093f4SDimitry Andric     LLT S32 = LLT::scalar(32);
4738480093f4SDimitry Andric 
4739480093f4SDimitry Andric     auto NumUnmerge = B.buildUnmerge(S32, LHS);
4740480093f4SDimitry Andric     auto DenUnmerge = B.buildUnmerge(S32, RHS);
4741480093f4SDimitry Andric     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
4742480093f4SDimitry Andric     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
4743480093f4SDimitry Andric 
4744480093f4SDimitry Andric     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
4745480093f4SDimitry Andric                               Scale1Unmerge.getReg(1));
4746480093f4SDimitry Andric     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
4747480093f4SDimitry Andric                               Scale0Unmerge.getReg(1));
47485ffd83dbSDimitry Andric     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
4749480093f4SDimitry Andric   } else {
4750480093f4SDimitry Andric     Scale = DivScale1.getReg(1);
4751480093f4SDimitry Andric   }
4752480093f4SDimitry Andric 
4753480093f4SDimitry Andric   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
4754480093f4SDimitry Andric     .addUse(Fma4.getReg(0))
4755480093f4SDimitry Andric     .addUse(Fma3.getReg(0))
4756480093f4SDimitry Andric     .addUse(Mul.getReg(0))
4757480093f4SDimitry Andric     .addUse(Scale)
4758480093f4SDimitry Andric     .setMIFlags(Flags);
4759480093f4SDimitry Andric 
4760bdd1243dSDimitry Andric   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res), false)
4761480093f4SDimitry Andric       .addUse(Fmas.getReg(0))
4762480093f4SDimitry Andric       .addUse(RHS)
4763480093f4SDimitry Andric       .addUse(LHS)
4764480093f4SDimitry Andric       .setMIFlags(Flags);
4765480093f4SDimitry Andric 
4766480093f4SDimitry Andric   MI.eraseFromParent();
4767480093f4SDimitry Andric   return true;
4768480093f4SDimitry Andric }
4769480093f4SDimitry Andric 
4770*06c3fb27SDimitry Andric bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI,
4771*06c3fb27SDimitry Andric                                          MachineRegisterInfo &MRI,
4772*06c3fb27SDimitry Andric                                          MachineIRBuilder &B) const {
4773*06c3fb27SDimitry Andric   Register Res0 = MI.getOperand(0).getReg();
4774*06c3fb27SDimitry Andric   Register Res1 = MI.getOperand(1).getReg();
4775*06c3fb27SDimitry Andric   Register Val = MI.getOperand(2).getReg();
4776*06c3fb27SDimitry Andric   uint16_t Flags = MI.getFlags();
4777*06c3fb27SDimitry Andric 
4778*06c3fb27SDimitry Andric   LLT Ty = MRI.getType(Res0);
4779*06c3fb27SDimitry Andric   LLT InstrExpTy = Ty == LLT::scalar(16) ? LLT::scalar(16) : LLT::scalar(32);
4780*06c3fb27SDimitry Andric 
4781*06c3fb27SDimitry Andric   auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty}, false)
4782*06c3fb27SDimitry Andric                   .addUse(Val)
4783*06c3fb27SDimitry Andric                   .setMIFlags(Flags);
4784*06c3fb27SDimitry Andric   auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy}, false)
4785*06c3fb27SDimitry Andric                  .addUse(Val)
4786*06c3fb27SDimitry Andric                  .setMIFlags(Flags);
4787*06c3fb27SDimitry Andric 
4788*06c3fb27SDimitry Andric   if (ST.hasFractBug()) {
4789*06c3fb27SDimitry Andric     auto Fabs = B.buildFAbs(Ty, Val);
4790*06c3fb27SDimitry Andric     auto Inf = B.buildFConstant(Ty, APFloat::getInf(getFltSemanticForLLT(Ty)));
4791*06c3fb27SDimitry Andric     auto IsFinite =
4792*06c3fb27SDimitry Andric         B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
4793*06c3fb27SDimitry Andric     auto Zero = B.buildConstant(InstrExpTy, 0);
4794*06c3fb27SDimitry Andric     Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
4795*06c3fb27SDimitry Andric     Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
4796*06c3fb27SDimitry Andric   }
4797*06c3fb27SDimitry Andric 
4798*06c3fb27SDimitry Andric   B.buildCopy(Res0, Mant);
4799*06c3fb27SDimitry Andric   B.buildSExtOrTrunc(Res1, Exp);
4800*06c3fb27SDimitry Andric 
4801*06c3fb27SDimitry Andric   MI.eraseFromParent();
4802*06c3fb27SDimitry Andric   return true;
4803*06c3fb27SDimitry Andric }
4804*06c3fb27SDimitry Andric 
48058bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
48068bcb0991SDimitry Andric                                                  MachineRegisterInfo &MRI,
48078bcb0991SDimitry Andric                                                  MachineIRBuilder &B) const {
48088bcb0991SDimitry Andric   Register Res = MI.getOperand(0).getReg();
48098bcb0991SDimitry Andric   Register LHS = MI.getOperand(2).getReg();
48108bcb0991SDimitry Andric   Register RHS = MI.getOperand(3).getReg();
48118bcb0991SDimitry Andric   uint16_t Flags = MI.getFlags();
48128bcb0991SDimitry Andric 
48138bcb0991SDimitry Andric   LLT S32 = LLT::scalar(32);
48148bcb0991SDimitry Andric   LLT S1 = LLT::scalar(1);
48158bcb0991SDimitry Andric 
48168bcb0991SDimitry Andric   auto Abs = B.buildFAbs(S32, RHS, Flags);
48178bcb0991SDimitry Andric   const APFloat C0Val(1.0f);
48188bcb0991SDimitry Andric 
4819*06c3fb27SDimitry Andric   auto C0 = B.buildFConstant(S32, 0x1p+96f);
4820*06c3fb27SDimitry Andric   auto C1 = B.buildFConstant(S32, 0x1p-32f);
4821*06c3fb27SDimitry Andric   auto C2 = B.buildFConstant(S32, 1.0f);
48228bcb0991SDimitry Andric 
48238bcb0991SDimitry Andric   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
48248bcb0991SDimitry Andric   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
48258bcb0991SDimitry Andric 
48268bcb0991SDimitry Andric   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
48278bcb0991SDimitry Andric 
48288bcb0991SDimitry Andric   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
48298bcb0991SDimitry Andric     .addUse(Mul0.getReg(0))
48308bcb0991SDimitry Andric     .setMIFlags(Flags);
48318bcb0991SDimitry Andric 
48328bcb0991SDimitry Andric   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
48338bcb0991SDimitry Andric 
48348bcb0991SDimitry Andric   B.buildFMul(Res, Sel, Mul1, Flags);
48358bcb0991SDimitry Andric 
48368bcb0991SDimitry Andric   MI.eraseFromParent();
48378bcb0991SDimitry Andric   return true;
48388bcb0991SDimitry Andric }
48398bcb0991SDimitry Andric 
4840*06c3fb27SDimitry Andric bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI,
4841*06c3fb27SDimitry Andric                                         MachineRegisterInfo &MRI,
4842*06c3fb27SDimitry Andric                                         MachineIRBuilder &B) const {
4843*06c3fb27SDimitry Andric   // For double type, the SQRT and RSQ instructions don't have required
4844*06c3fb27SDimitry Andric   // precision, we apply Goldschmidt's algorithm to improve the result:
4845*06c3fb27SDimitry Andric   //
4846*06c3fb27SDimitry Andric   //   y0 = rsq(x)
4847*06c3fb27SDimitry Andric   //   g0 = x * y0
4848*06c3fb27SDimitry Andric   //   h0 = 0.5 * y0
4849*06c3fb27SDimitry Andric   //
4850*06c3fb27SDimitry Andric   //   r0 = 0.5 - h0 * g0
4851*06c3fb27SDimitry Andric   //   g1 = g0 * r0 + g0
4852*06c3fb27SDimitry Andric   //   h1 = h0 * r0 + h0
4853*06c3fb27SDimitry Andric   //
4854*06c3fb27SDimitry Andric   //   r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
4855*06c3fb27SDimitry Andric   //   g2 = g1 * r1 + g1     g2 = d0 * h1 + g1
4856*06c3fb27SDimitry Andric   //   h2 = h1 * r1 + h1
4857*06c3fb27SDimitry Andric   //
4858*06c3fb27SDimitry Andric   //   r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
4859*06c3fb27SDimitry Andric   //   g3 = g2 * r2 + g2     g3 = d1 * h1 + g2
4860*06c3fb27SDimitry Andric   //
4861*06c3fb27SDimitry Andric   //   sqrt(x) = g3
4862*06c3fb27SDimitry Andric 
4863*06c3fb27SDimitry Andric   const LLT S1 = LLT::scalar(1);
4864*06c3fb27SDimitry Andric   const LLT S32 = LLT::scalar(32);
4865*06c3fb27SDimitry Andric   const LLT F64 = LLT::scalar(64);
4866*06c3fb27SDimitry Andric 
4867*06c3fb27SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
4868*06c3fb27SDimitry Andric   assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");
4869*06c3fb27SDimitry Andric 
4870*06c3fb27SDimitry Andric   Register X = MI.getOperand(1).getReg();
4871*06c3fb27SDimitry Andric   unsigned Flags = MI.getFlags();
4872*06c3fb27SDimitry Andric 
4873*06c3fb27SDimitry Andric   auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);
4874*06c3fb27SDimitry Andric 
4875*06c3fb27SDimitry Andric   auto ZeroInt = B.buildConstant(S32, 0);
4876*06c3fb27SDimitry Andric   auto Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant);
4877*06c3fb27SDimitry Andric 
4878*06c3fb27SDimitry Andric   // Scale up input if it is too small.
4879*06c3fb27SDimitry Andric   auto ScaleUpFactor = B.buildConstant(S32, 256);
4880*06c3fb27SDimitry Andric   auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
4881*06c3fb27SDimitry Andric   auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags);
4882*06c3fb27SDimitry Andric 
4883*06c3fb27SDimitry Andric   auto SqrtY = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}, false)
4884*06c3fb27SDimitry Andric                    .addReg(SqrtX.getReg(0));
4885*06c3fb27SDimitry Andric 
4886*06c3fb27SDimitry Andric   auto Half = B.buildFConstant(F64, 0.5);
4887*06c3fb27SDimitry Andric   auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
4888*06c3fb27SDimitry Andric   auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);
4889*06c3fb27SDimitry Andric 
4890*06c3fb27SDimitry Andric   auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
4891*06c3fb27SDimitry Andric   auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);
4892*06c3fb27SDimitry Andric 
4893*06c3fb27SDimitry Andric   auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
4894*06c3fb27SDimitry Andric   auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);
4895*06c3fb27SDimitry Andric 
4896*06c3fb27SDimitry Andric   auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
4897*06c3fb27SDimitry Andric   auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);
4898*06c3fb27SDimitry Andric 
4899*06c3fb27SDimitry Andric   auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);
4900*06c3fb27SDimitry Andric 
4901*06c3fb27SDimitry Andric   auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
4902*06c3fb27SDimitry Andric   auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);
4903*06c3fb27SDimitry Andric 
4904*06c3fb27SDimitry Andric   auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);
4905*06c3fb27SDimitry Andric 
4906*06c3fb27SDimitry Andric   // Scale down the result.
4907*06c3fb27SDimitry Andric   auto ScaleDownFactor = B.buildConstant(S32, -128);
4908*06c3fb27SDimitry Andric   auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
4909*06c3fb27SDimitry Andric   SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags);
4910*06c3fb27SDimitry Andric 
4911*06c3fb27SDimitry Andric   // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
4912*06c3fb27SDimitry Andric   // with finite only or nsz because rsq(+/-0) = +/-inf
4913*06c3fb27SDimitry Andric 
4914*06c3fb27SDimitry Andric   // TODO: Check for DAZ and expand to subnormals
4915*06c3fb27SDimitry Andric   auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
4916*06c3fb27SDimitry Andric 
4917*06c3fb27SDimitry Andric   // If x is +INF, +0, or -0, use its original value
4918*06c3fb27SDimitry Andric   B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);
4919*06c3fb27SDimitry Andric 
4920*06c3fb27SDimitry Andric   MI.eraseFromParent();
4921*06c3fb27SDimitry Andric   return true;
4922*06c3fb27SDimitry Andric }
4923*06c3fb27SDimitry Andric 
4924e8d8bef9SDimitry Andric // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
4925e8d8bef9SDimitry Andric // FIXME: Why do we handle this one but not other removed instructions?
4926e8d8bef9SDimitry Andric //
4927e8d8bef9SDimitry Andric // Reciprocal square root.  The clamp prevents infinite results, clamping
4928e8d8bef9SDimitry Andric // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
4929e8d8bef9SDimitry Andric // +-max_float.
4930e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
4931e8d8bef9SDimitry Andric                                                     MachineRegisterInfo &MRI,
4932e8d8bef9SDimitry Andric                                                     MachineIRBuilder &B) const {
4933e8d8bef9SDimitry Andric   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
4934e8d8bef9SDimitry Andric     return true;
4935e8d8bef9SDimitry Andric 
4936e8d8bef9SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
4937e8d8bef9SDimitry Andric   Register Src = MI.getOperand(2).getReg();
4938e8d8bef9SDimitry Andric   auto Flags = MI.getFlags();
4939e8d8bef9SDimitry Andric 
4940e8d8bef9SDimitry Andric   LLT Ty = MRI.getType(Dst);
4941e8d8bef9SDimitry Andric 
4942e8d8bef9SDimitry Andric   const fltSemantics *FltSemantics;
4943e8d8bef9SDimitry Andric   if (Ty == LLT::scalar(32))
4944e8d8bef9SDimitry Andric     FltSemantics = &APFloat::IEEEsingle();
4945e8d8bef9SDimitry Andric   else if (Ty == LLT::scalar(64))
4946e8d8bef9SDimitry Andric     FltSemantics = &APFloat::IEEEdouble();
4947e8d8bef9SDimitry Andric   else
4948e8d8bef9SDimitry Andric     return false;
4949e8d8bef9SDimitry Andric 
4950e8d8bef9SDimitry Andric   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
4951e8d8bef9SDimitry Andric     .addUse(Src)
4952e8d8bef9SDimitry Andric     .setMIFlags(Flags);
4953e8d8bef9SDimitry Andric 
4954e8d8bef9SDimitry Andric   // We don't need to concern ourselves with the snan handling difference, since
4955e8d8bef9SDimitry Andric   // the rsq quieted (or not) so use the one which will directly select.
4956e8d8bef9SDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4957e8d8bef9SDimitry Andric   const bool UseIEEE = MFI->getMode().IEEE;
4958e8d8bef9SDimitry Andric 
4959e8d8bef9SDimitry Andric   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
4960e8d8bef9SDimitry Andric   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
4961e8d8bef9SDimitry Andric                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
4962e8d8bef9SDimitry Andric 
4963e8d8bef9SDimitry Andric   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
4964e8d8bef9SDimitry Andric 
4965e8d8bef9SDimitry Andric   if (UseIEEE)
4966e8d8bef9SDimitry Andric     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
4967e8d8bef9SDimitry Andric   else
4968e8d8bef9SDimitry Andric     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
4969e8d8bef9SDimitry Andric   MI.eraseFromParent();
4970e8d8bef9SDimitry Andric   return true;
4971e8d8bef9SDimitry Andric }
4972e8d8bef9SDimitry Andric 
4973e8d8bef9SDimitry Andric static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
4974e8d8bef9SDimitry Andric   switch (IID) {
4975e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_ds_fadd:
4976e8d8bef9SDimitry Andric     return AMDGPU::G_ATOMICRMW_FADD;
4977e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_ds_fmin:
4978e8d8bef9SDimitry Andric     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
4979e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_ds_fmax:
4980e8d8bef9SDimitry Andric     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
4981e8d8bef9SDimitry Andric   default:
4982e8d8bef9SDimitry Andric     llvm_unreachable("not a DS FP intrinsic");
4983e8d8bef9SDimitry Andric   }
4984e8d8bef9SDimitry Andric }
4985e8d8bef9SDimitry Andric 
4986e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
4987e8d8bef9SDimitry Andric                                                       MachineInstr &MI,
4988e8d8bef9SDimitry Andric                                                       Intrinsic::ID IID) const {
4989e8d8bef9SDimitry Andric   GISelChangeObserver &Observer = Helper.Observer;
4990e8d8bef9SDimitry Andric   Observer.changingInstr(MI);
4991e8d8bef9SDimitry Andric 
4992e8d8bef9SDimitry Andric   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
4993e8d8bef9SDimitry Andric 
4994e8d8bef9SDimitry Andric   // The remaining operands were used to set fields in the MemOperand on
4995e8d8bef9SDimitry Andric   // construction.
4996e8d8bef9SDimitry Andric   for (int I = 6; I > 3; --I)
499781ad6265SDimitry Andric     MI.removeOperand(I);
4998e8d8bef9SDimitry Andric 
499981ad6265SDimitry Andric   MI.removeOperand(1); // Remove the intrinsic ID.
5000e8d8bef9SDimitry Andric   Observer.changedInstr(MI);
5001e8d8bef9SDimitry Andric   return true;
5002e8d8bef9SDimitry Andric }
5003e8d8bef9SDimitry Andric 
5004e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
5005e8d8bef9SDimitry Andric                                             MachineRegisterInfo &MRI,
5006e8d8bef9SDimitry Andric                                             MachineIRBuilder &B) const {
5007e8d8bef9SDimitry Andric   uint64_t Offset =
5008e8d8bef9SDimitry Andric     ST.getTargetLowering()->getImplicitParameterOffset(
5009e8d8bef9SDimitry Andric       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
5010e8d8bef9SDimitry Andric   LLT DstTy = MRI.getType(DstReg);
5011e8d8bef9SDimitry Andric   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
5012e8d8bef9SDimitry Andric 
5013e8d8bef9SDimitry Andric   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
5014e8d8bef9SDimitry Andric   if (!loadInputValue(KernargPtrReg, B,
5015e8d8bef9SDimitry Andric                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
5016e8d8bef9SDimitry Andric     return false;
5017e8d8bef9SDimitry Andric 
5018e8d8bef9SDimitry Andric   // FIXME: This should be nuw
5019e8d8bef9SDimitry Andric   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
5020e8d8bef9SDimitry Andric   return true;
5021e8d8bef9SDimitry Andric }
5022e8d8bef9SDimitry Andric 
5023*06c3fb27SDimitry Andric /// To create a buffer resource from a 64-bit pointer, mask off the upper 32
5024*06c3fb27SDimitry Andric /// bits of the pointer and replace them with the stride argument, then
5025*06c3fb27SDimitry Andric /// merge_values everything together. In the common case of a raw buffer (the
5026*06c3fb27SDimitry Andric /// stride component is 0), we can just AND off the upper half.
5027*06c3fb27SDimitry Andric bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin(
5028*06c3fb27SDimitry Andric     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
5029*06c3fb27SDimitry Andric   Register Result = MI.getOperand(0).getReg();
5030*06c3fb27SDimitry Andric   Register Pointer = MI.getOperand(2).getReg();
5031*06c3fb27SDimitry Andric   Register Stride = MI.getOperand(3).getReg();
5032*06c3fb27SDimitry Andric   Register NumRecords = MI.getOperand(4).getReg();
5033*06c3fb27SDimitry Andric   Register Flags = MI.getOperand(5).getReg();
5034*06c3fb27SDimitry Andric 
5035*06c3fb27SDimitry Andric   LLT S32 = LLT::scalar(32);
5036*06c3fb27SDimitry Andric 
5037*06c3fb27SDimitry Andric   B.setInsertPt(B.getMBB(), ++B.getInsertPt());
5038*06c3fb27SDimitry Andric   auto Unmerge = B.buildUnmerge(S32, Pointer);
5039*06c3fb27SDimitry Andric   Register LowHalf = Unmerge.getReg(0);
5040*06c3fb27SDimitry Andric   Register HighHalf = Unmerge.getReg(1);
5041*06c3fb27SDimitry Andric 
5042*06c3fb27SDimitry Andric   auto AndMask = B.buildConstant(S32, 0x0000ffff);
5043*06c3fb27SDimitry Andric   auto Masked = B.buildAnd(S32, HighHalf, AndMask);
5044*06c3fb27SDimitry Andric 
5045*06c3fb27SDimitry Andric   MachineInstrBuilder NewHighHalf = Masked;
5046*06c3fb27SDimitry Andric   std::optional<ValueAndVReg> StrideConst =
5047*06c3fb27SDimitry Andric       getIConstantVRegValWithLookThrough(Stride, MRI);
5048*06c3fb27SDimitry Andric   if (!StrideConst || !StrideConst->Value.isZero()) {
5049*06c3fb27SDimitry Andric     MachineInstrBuilder ShiftedStride;
5050*06c3fb27SDimitry Andric     if (StrideConst) {
5051*06c3fb27SDimitry Andric       uint32_t StrideVal = StrideConst->Value.getZExtValue();
5052*06c3fb27SDimitry Andric       uint32_t ShiftedStrideVal = StrideVal << 16;
5053*06c3fb27SDimitry Andric       ShiftedStride = B.buildConstant(S32, ShiftedStrideVal);
5054*06c3fb27SDimitry Andric     } else {
5055*06c3fb27SDimitry Andric       auto ExtStride = B.buildAnyExt(S32, Stride);
5056*06c3fb27SDimitry Andric       auto ShiftConst = B.buildConstant(S32, 16);
5057*06c3fb27SDimitry Andric       ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
5058*06c3fb27SDimitry Andric     }
5059*06c3fb27SDimitry Andric     NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
5060*06c3fb27SDimitry Andric   }
5061*06c3fb27SDimitry Andric   Register NewHighHalfReg = NewHighHalf.getReg(0);
5062*06c3fb27SDimitry Andric   B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
5063*06c3fb27SDimitry Andric   MI.eraseFromParent();
5064*06c3fb27SDimitry Andric   return true;
5065*06c3fb27SDimitry Andric }
5066*06c3fb27SDimitry Andric 
50670b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
50680b57cec5SDimitry Andric                                                  MachineRegisterInfo &MRI,
50690b57cec5SDimitry Andric                                                  MachineIRBuilder &B) const {
50700b57cec5SDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
50710b57cec5SDimitry Andric   if (!MFI->isEntryFunction()) {
50720b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
50730b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
50740b57cec5SDimitry Andric   }
50750b57cec5SDimitry Andric 
50760b57cec5SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
5077e8d8bef9SDimitry Andric   if (!getImplicitArgPtr(DstReg, MRI, B))
50780b57cec5SDimitry Andric     return false;
50790b57cec5SDimitry Andric 
50800b57cec5SDimitry Andric   MI.eraseFromParent();
50810b57cec5SDimitry Andric   return true;
50820b57cec5SDimitry Andric }
50830b57cec5SDimitry Andric 
5084fcaf7f86SDimitry Andric bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg,
5085fcaf7f86SDimitry Andric                                          MachineRegisterInfo &MRI,
5086fcaf7f86SDimitry Andric                                          MachineIRBuilder &B) const {
5087fcaf7f86SDimitry Andric   Function &F = B.getMF().getFunction();
5088bdd1243dSDimitry Andric   std::optional<uint32_t> KnownSize =
5089fcaf7f86SDimitry Andric       AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
5090fcaf7f86SDimitry Andric   if (KnownSize.has_value())
5091bdd1243dSDimitry Andric     B.buildConstant(DstReg, *KnownSize);
5092fcaf7f86SDimitry Andric   return false;
5093fcaf7f86SDimitry Andric }
5094fcaf7f86SDimitry Andric 
5095fcaf7f86SDimitry Andric bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI,
5096fcaf7f86SDimitry Andric                                               MachineRegisterInfo &MRI,
5097fcaf7f86SDimitry Andric                                               MachineIRBuilder &B) const {
5098fcaf7f86SDimitry Andric 
5099fcaf7f86SDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5100fcaf7f86SDimitry Andric   if (!MFI->isEntryFunction()) {
5101fcaf7f86SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
5102fcaf7f86SDimitry Andric                                       AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
5103fcaf7f86SDimitry Andric   }
5104fcaf7f86SDimitry Andric 
5105fcaf7f86SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
5106fcaf7f86SDimitry Andric   if (!getLDSKernelId(DstReg, MRI, B))
5107fcaf7f86SDimitry Andric     return false;
5108fcaf7f86SDimitry Andric 
5109fcaf7f86SDimitry Andric   MI.eraseFromParent();
5110fcaf7f86SDimitry Andric   return true;
5111fcaf7f86SDimitry Andric }
5112fcaf7f86SDimitry Andric 
51138bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
51148bcb0991SDimitry Andric                                               MachineRegisterInfo &MRI,
51158bcb0991SDimitry Andric                                               MachineIRBuilder &B,
51168bcb0991SDimitry Andric                                               unsigned AddrSpace) const {
51178bcb0991SDimitry Andric   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
5118e8d8bef9SDimitry Andric   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
5119e8d8bef9SDimitry Andric   Register Hi32 = Unmerge.getReg(1);
5120e8d8bef9SDimitry Andric 
51218bcb0991SDimitry Andric   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
51228bcb0991SDimitry Andric   MI.eraseFromParent();
51238bcb0991SDimitry Andric   return true;
51248bcb0991SDimitry Andric }
51258bcb0991SDimitry Andric 
51265ffd83dbSDimitry Andric // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
51275ffd83dbSDimitry Andric // offset (the offset that is included in bounds checking and swizzling, to be
51285ffd83dbSDimitry Andric // split between the instruction's voffset and immoffset fields) and soffset
51295ffd83dbSDimitry Andric // (the offset that is excluded from bounds checking and swizzling, to go in
51305ffd83dbSDimitry Andric // the instruction's soffset field).  This function takes the first kind of
51315ffd83dbSDimitry Andric // offset and figures out how to split it between voffset and immoffset.
5132fe6060f1SDimitry Andric std::pair<Register, unsigned>
51335ffd83dbSDimitry Andric AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
51345ffd83dbSDimitry Andric                                         Register OrigOffset) const {
5135*06c3fb27SDimitry Andric   const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset();
51365ffd83dbSDimitry Andric   Register BaseReg;
5137fe6060f1SDimitry Andric   unsigned ImmOffset;
51385ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
5139fe6060f1SDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
51405ffd83dbSDimitry Andric 
5141fe6060f1SDimitry Andric   std::tie(BaseReg, ImmOffset) =
5142fe6060f1SDimitry Andric       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
51435ffd83dbSDimitry Andric 
5144fe6060f1SDimitry Andric   // If BaseReg is a pointer, convert it to int.
5145fe6060f1SDimitry Andric   if (MRI.getType(BaseReg).isPointer())
5146fe6060f1SDimitry Andric     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
51475ffd83dbSDimitry Andric 
5148*06c3fb27SDimitry Andric   // If the immediate value is too big for the immoffset field, put only bits
5149*06c3fb27SDimitry Andric   // that would normally fit in the immoffset field. The remaining value that
5150*06c3fb27SDimitry Andric   // is copied/added for the voffset field is a large power of 2, and it
5151*06c3fb27SDimitry Andric   // stands more chance of being CSEd with the copy/add for another similar
5152*06c3fb27SDimitry Andric   // load/store.
5153*06c3fb27SDimitry Andric   // However, do not do that rounding down if that is a negative
5154*06c3fb27SDimitry Andric   // number, as it appears to be illegal to have a negative offset in the
5155*06c3fb27SDimitry Andric   // vgpr, even if adding the immediate offset makes it positive.
51565ffd83dbSDimitry Andric   unsigned Overflow = ImmOffset & ~MaxImm;
51575ffd83dbSDimitry Andric   ImmOffset -= Overflow;
51585ffd83dbSDimitry Andric   if ((int32_t)Overflow < 0) {
51595ffd83dbSDimitry Andric     Overflow += ImmOffset;
51605ffd83dbSDimitry Andric     ImmOffset = 0;
51615ffd83dbSDimitry Andric   }
51625ffd83dbSDimitry Andric 
51635ffd83dbSDimitry Andric   if (Overflow != 0) {
51645ffd83dbSDimitry Andric     if (!BaseReg) {
51655ffd83dbSDimitry Andric       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
51665ffd83dbSDimitry Andric     } else {
51675ffd83dbSDimitry Andric       auto OverflowVal = B.buildConstant(S32, Overflow);
51685ffd83dbSDimitry Andric       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
51695ffd83dbSDimitry Andric     }
51705ffd83dbSDimitry Andric   }
51715ffd83dbSDimitry Andric 
51725ffd83dbSDimitry Andric   if (!BaseReg)
51735ffd83dbSDimitry Andric     BaseReg = B.buildConstant(S32, 0).getReg(0);
51745ffd83dbSDimitry Andric 
5175bdd1243dSDimitry Andric   return std::pair(BaseReg, ImmOffset);
5176fe6060f1SDimitry Andric }
5177fe6060f1SDimitry Andric 
51788bcb0991SDimitry Andric /// Handle register layout difference for f16 images for some subtargets.
51798bcb0991SDimitry Andric Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
51808bcb0991SDimitry Andric                                              MachineRegisterInfo &MRI,
5181e8d8bef9SDimitry Andric                                              Register Reg,
5182e8d8bef9SDimitry Andric                                              bool ImageStore) const {
51838bcb0991SDimitry Andric   const LLT S16 = LLT::scalar(16);
51848bcb0991SDimitry Andric   const LLT S32 = LLT::scalar(32);
51858bcb0991SDimitry Andric   LLT StoreVT = MRI.getType(Reg);
51868bcb0991SDimitry Andric   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
51878bcb0991SDimitry Andric 
5188e8d8bef9SDimitry Andric   if (ST.hasUnpackedD16VMem()) {
51898bcb0991SDimitry Andric     auto Unmerge = B.buildUnmerge(S16, Reg);
51908bcb0991SDimitry Andric 
51918bcb0991SDimitry Andric     SmallVector<Register, 4> WideRegs;
51928bcb0991SDimitry Andric     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
51938bcb0991SDimitry Andric       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
51948bcb0991SDimitry Andric 
51958bcb0991SDimitry Andric     int NumElts = StoreVT.getNumElements();
51968bcb0991SDimitry Andric 
5197fe6060f1SDimitry Andric     return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
5198fe6060f1SDimitry Andric         .getReg(0);
51998bcb0991SDimitry Andric   }
52008bcb0991SDimitry Andric 
5201e8d8bef9SDimitry Andric   if (ImageStore && ST.hasImageStoreD16Bug()) {
5202e8d8bef9SDimitry Andric     if (StoreVT.getNumElements() == 2) {
5203e8d8bef9SDimitry Andric       SmallVector<Register, 4> PackedRegs;
5204e8d8bef9SDimitry Andric       Reg = B.buildBitcast(S32, Reg).getReg(0);
5205e8d8bef9SDimitry Andric       PackedRegs.push_back(Reg);
5206e8d8bef9SDimitry Andric       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
5207fe6060f1SDimitry Andric       return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
5208fe6060f1SDimitry Andric           .getReg(0);
5209e8d8bef9SDimitry Andric     }
5210e8d8bef9SDimitry Andric 
5211e8d8bef9SDimitry Andric     if (StoreVT.getNumElements() == 3) {
5212e8d8bef9SDimitry Andric       SmallVector<Register, 4> PackedRegs;
5213e8d8bef9SDimitry Andric       auto Unmerge = B.buildUnmerge(S16, Reg);
5214e8d8bef9SDimitry Andric       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5215e8d8bef9SDimitry Andric         PackedRegs.push_back(Unmerge.getReg(I));
5216e8d8bef9SDimitry Andric       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
5217fe6060f1SDimitry Andric       Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
5218fe6060f1SDimitry Andric       return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
5219e8d8bef9SDimitry Andric     }
5220e8d8bef9SDimitry Andric 
5221e8d8bef9SDimitry Andric     if (StoreVT.getNumElements() == 4) {
5222e8d8bef9SDimitry Andric       SmallVector<Register, 4> PackedRegs;
5223fe6060f1SDimitry Andric       Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
5224e8d8bef9SDimitry Andric       auto Unmerge = B.buildUnmerge(S32, Reg);
5225e8d8bef9SDimitry Andric       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5226e8d8bef9SDimitry Andric         PackedRegs.push_back(Unmerge.getReg(I));
5227e8d8bef9SDimitry Andric       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
5228fe6060f1SDimitry Andric       return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
5229fe6060f1SDimitry Andric           .getReg(0);
5230e8d8bef9SDimitry Andric     }
5231e8d8bef9SDimitry Andric 
5232e8d8bef9SDimitry Andric     llvm_unreachable("invalid data type");
5233e8d8bef9SDimitry Andric   }
5234e8d8bef9SDimitry Andric 
52350eae32dcSDimitry Andric   if (StoreVT == LLT::fixed_vector(3, S16)) {
52360eae32dcSDimitry Andric     Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
52370eae32dcSDimitry Andric               .getReg(0);
52380eae32dcSDimitry Andric   }
5239e8d8bef9SDimitry Andric   return Reg;
5240e8d8bef9SDimitry Andric }
5241e8d8bef9SDimitry Andric 
52425ffd83dbSDimitry Andric Register AMDGPULegalizerInfo::fixStoreSourceType(
52435ffd83dbSDimitry Andric   MachineIRBuilder &B, Register VData, bool IsFormat) const {
52445ffd83dbSDimitry Andric   MachineRegisterInfo *MRI = B.getMRI();
52455ffd83dbSDimitry Andric   LLT Ty = MRI->getType(VData);
52468bcb0991SDimitry Andric 
52478bcb0991SDimitry Andric   const LLT S16 = LLT::scalar(16);
52488bcb0991SDimitry Andric 
5249*06c3fb27SDimitry Andric   // Fixup buffer resources themselves needing to be v4i128.
5250*06c3fb27SDimitry Andric   if (hasBufferRsrcWorkaround(Ty))
5251*06c3fb27SDimitry Andric     return castBufferRsrcToV4I32(VData, B);
5252*06c3fb27SDimitry Andric 
52538bcb0991SDimitry Andric   // Fixup illegal register types for i8 stores.
52548bcb0991SDimitry Andric   if (Ty == LLT::scalar(8) || Ty == S16) {
52558bcb0991SDimitry Andric     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
52565ffd83dbSDimitry Andric     return AnyExt;
52578bcb0991SDimitry Andric   }
52588bcb0991SDimitry Andric 
52598bcb0991SDimitry Andric   if (Ty.isVector()) {
52608bcb0991SDimitry Andric     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
52618bcb0991SDimitry Andric       if (IsFormat)
52625ffd83dbSDimitry Andric         return handleD16VData(B, *MRI, VData);
52635ffd83dbSDimitry Andric     }
52645ffd83dbSDimitry Andric   }
52655ffd83dbSDimitry Andric 
52665ffd83dbSDimitry Andric   return VData;
52675ffd83dbSDimitry Andric }
52685ffd83dbSDimitry Andric 
52695ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
52705ffd83dbSDimitry Andric                                               MachineRegisterInfo &MRI,
52715ffd83dbSDimitry Andric                                               MachineIRBuilder &B,
52725ffd83dbSDimitry Andric                                               bool IsTyped,
52735ffd83dbSDimitry Andric                                               bool IsFormat) const {
52745ffd83dbSDimitry Andric   Register VData = MI.getOperand(1).getReg();
52755ffd83dbSDimitry Andric   LLT Ty = MRI.getType(VData);
52765ffd83dbSDimitry Andric   LLT EltTy = Ty.getScalarType();
52775ffd83dbSDimitry Andric   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
52785ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
52795ffd83dbSDimitry Andric 
52805ffd83dbSDimitry Andric   VData = fixStoreSourceType(B, VData, IsFormat);
5281*06c3fb27SDimitry Andric   castBufferRsrcArgToV4I32(MI, B, 2);
52825ffd83dbSDimitry Andric   Register RSrc = MI.getOperand(2).getReg();
52835ffd83dbSDimitry Andric 
52845ffd83dbSDimitry Andric   MachineMemOperand *MMO = *MI.memoperands_begin();
52855ffd83dbSDimitry Andric   const int MemSize = MMO->getSize();
52865ffd83dbSDimitry Andric 
52875ffd83dbSDimitry Andric   unsigned ImmOffset;
52885ffd83dbSDimitry Andric 
52895ffd83dbSDimitry Andric   // The typed intrinsics add an immediate after the registers.
52905ffd83dbSDimitry Andric   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
52915ffd83dbSDimitry Andric 
52925ffd83dbSDimitry Andric   // The struct intrinsic variants add one additional operand over raw.
52935ffd83dbSDimitry Andric   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
52945ffd83dbSDimitry Andric   Register VIndex;
52955ffd83dbSDimitry Andric   int OpOffset = 0;
52965ffd83dbSDimitry Andric   if (HasVIndex) {
52975ffd83dbSDimitry Andric     VIndex = MI.getOperand(3).getReg();
52985ffd83dbSDimitry Andric     OpOffset = 1;
5299fe6060f1SDimitry Andric   } else {
5300fe6060f1SDimitry Andric     VIndex = B.buildConstant(S32, 0).getReg(0);
53015ffd83dbSDimitry Andric   }
53025ffd83dbSDimitry Andric 
53035ffd83dbSDimitry Andric   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
53045ffd83dbSDimitry Andric   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
53055ffd83dbSDimitry Andric 
53065ffd83dbSDimitry Andric   unsigned Format = 0;
53075ffd83dbSDimitry Andric   if (IsTyped) {
53085ffd83dbSDimitry Andric     Format = MI.getOperand(5 + OpOffset).getImm();
53095ffd83dbSDimitry Andric     ++OpOffset;
53105ffd83dbSDimitry Andric   }
53115ffd83dbSDimitry Andric 
53125ffd83dbSDimitry Andric   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
53135ffd83dbSDimitry Andric 
5314fe6060f1SDimitry Andric   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
53155ffd83dbSDimitry Andric 
53165ffd83dbSDimitry Andric   unsigned Opc;
53175ffd83dbSDimitry Andric   if (IsTyped) {
53185ffd83dbSDimitry Andric     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
53195ffd83dbSDimitry Andric                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
53205ffd83dbSDimitry Andric   } else if (IsFormat) {
53215ffd83dbSDimitry Andric     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
53225ffd83dbSDimitry Andric                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
53235ffd83dbSDimitry Andric   } else {
53245ffd83dbSDimitry Andric     switch (MemSize) {
53255ffd83dbSDimitry Andric     case 1:
53265ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
53275ffd83dbSDimitry Andric       break;
53285ffd83dbSDimitry Andric     case 2:
53295ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
53305ffd83dbSDimitry Andric       break;
53315ffd83dbSDimitry Andric     default:
53325ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
53335ffd83dbSDimitry Andric       break;
53345ffd83dbSDimitry Andric     }
53355ffd83dbSDimitry Andric   }
53365ffd83dbSDimitry Andric 
53375ffd83dbSDimitry Andric   auto MIB = B.buildInstr(Opc)
53385ffd83dbSDimitry Andric     .addUse(VData)              // vdata
53395ffd83dbSDimitry Andric     .addUse(RSrc)               // rsrc
53405ffd83dbSDimitry Andric     .addUse(VIndex)             // vindex
53415ffd83dbSDimitry Andric     .addUse(VOffset)            // voffset
53425ffd83dbSDimitry Andric     .addUse(SOffset)            // soffset
53435ffd83dbSDimitry Andric     .addImm(ImmOffset);         // offset(imm)
53445ffd83dbSDimitry Andric 
53455ffd83dbSDimitry Andric   if (IsTyped)
53465ffd83dbSDimitry Andric     MIB.addImm(Format);
53475ffd83dbSDimitry Andric 
53485ffd83dbSDimitry Andric   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
53495ffd83dbSDimitry Andric      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
53505ffd83dbSDimitry Andric      .addMemOperand(MMO);
53515ffd83dbSDimitry Andric 
53525ffd83dbSDimitry Andric   MI.eraseFromParent();
53538bcb0991SDimitry Andric   return true;
53548bcb0991SDimitry Andric }
53558bcb0991SDimitry Andric 
5356bdd1243dSDimitry Andric static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
5357bdd1243dSDimitry Andric                             Register VIndex, Register VOffset, Register SOffset,
5358bdd1243dSDimitry Andric                             unsigned ImmOffset, unsigned Format,
5359bdd1243dSDimitry Andric                             unsigned AuxiliaryData, MachineMemOperand *MMO,
5360bdd1243dSDimitry Andric                             bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
5361bdd1243dSDimitry Andric   auto MIB = B.buildInstr(Opc)
5362bdd1243dSDimitry Andric                  .addDef(LoadDstReg) // vdata
5363bdd1243dSDimitry Andric                  .addUse(RSrc)       // rsrc
5364bdd1243dSDimitry Andric                  .addUse(VIndex)     // vindex
5365bdd1243dSDimitry Andric                  .addUse(VOffset)    // voffset
5366bdd1243dSDimitry Andric                  .addUse(SOffset)    // soffset
5367bdd1243dSDimitry Andric                  .addImm(ImmOffset); // offset(imm)
5368bdd1243dSDimitry Andric 
5369bdd1243dSDimitry Andric   if (IsTyped)
5370bdd1243dSDimitry Andric     MIB.addImm(Format);
5371bdd1243dSDimitry Andric 
5372bdd1243dSDimitry Andric   MIB.addImm(AuxiliaryData)       // cachepolicy, swizzled buffer(imm)
5373bdd1243dSDimitry Andric       .addImm(HasVIndex ? -1 : 0) // idxen(imm)
5374bdd1243dSDimitry Andric       .addMemOperand(MMO);
5375bdd1243dSDimitry Andric }
5376bdd1243dSDimitry Andric 
53775ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
53785ffd83dbSDimitry Andric                                              MachineRegisterInfo &MRI,
53795ffd83dbSDimitry Andric                                              MachineIRBuilder &B,
53805ffd83dbSDimitry Andric                                              bool IsFormat,
53815ffd83dbSDimitry Andric                                              bool IsTyped) const {
53825ffd83dbSDimitry Andric   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
53835ffd83dbSDimitry Andric   MachineMemOperand *MMO = *MI.memoperands_begin();
5384fe6060f1SDimitry Andric   const LLT MemTy = MMO->getMemoryType();
53855ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
53865ffd83dbSDimitry Andric 
53875ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
5388bdd1243dSDimitry Andric 
5389bdd1243dSDimitry Andric   Register StatusDst;
5390bdd1243dSDimitry Andric   int OpOffset = 0;
5391bdd1243dSDimitry Andric   assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
5392bdd1243dSDimitry Andric   bool IsTFE = MI.getNumExplicitDefs() == 2;
5393bdd1243dSDimitry Andric   if (IsTFE) {
5394bdd1243dSDimitry Andric     StatusDst = MI.getOperand(1).getReg();
5395bdd1243dSDimitry Andric     ++OpOffset;
5396bdd1243dSDimitry Andric   }
5397bdd1243dSDimitry Andric 
5398*06c3fb27SDimitry Andric   castBufferRsrcArgToV4I32(MI, B, 2 + OpOffset);
5399bdd1243dSDimitry Andric   Register RSrc = MI.getOperand(2 + OpOffset).getReg();
54005ffd83dbSDimitry Andric 
54015ffd83dbSDimitry Andric   // The typed intrinsics add an immediate after the registers.
54025ffd83dbSDimitry Andric   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
54035ffd83dbSDimitry Andric 
54045ffd83dbSDimitry Andric   // The struct intrinsic variants add one additional operand over raw.
5405bdd1243dSDimitry Andric   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
54065ffd83dbSDimitry Andric   Register VIndex;
54075ffd83dbSDimitry Andric   if (HasVIndex) {
5408bdd1243dSDimitry Andric     VIndex = MI.getOperand(3 + OpOffset).getReg();
5409bdd1243dSDimitry Andric     ++OpOffset;
5410fe6060f1SDimitry Andric   } else {
5411fe6060f1SDimitry Andric     VIndex = B.buildConstant(S32, 0).getReg(0);
54128bcb0991SDimitry Andric   }
54138bcb0991SDimitry Andric 
54145ffd83dbSDimitry Andric   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
54155ffd83dbSDimitry Andric   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
54165ffd83dbSDimitry Andric 
54175ffd83dbSDimitry Andric   unsigned Format = 0;
54185ffd83dbSDimitry Andric   if (IsTyped) {
54195ffd83dbSDimitry Andric     Format = MI.getOperand(5 + OpOffset).getImm();
54205ffd83dbSDimitry Andric     ++OpOffset;
54218bcb0991SDimitry Andric   }
54228bcb0991SDimitry Andric 
54235ffd83dbSDimitry Andric   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
54245ffd83dbSDimitry Andric   unsigned ImmOffset;
54255ffd83dbSDimitry Andric 
54265ffd83dbSDimitry Andric   LLT Ty = MRI.getType(Dst);
5427*06c3fb27SDimitry Andric   // Make addrspace 8 pointers loads into 4xs32 loads here, so the rest of the
5428*06c3fb27SDimitry Andric   // logic doesn't have to handle that case.
5429*06c3fb27SDimitry Andric   if (hasBufferRsrcWorkaround(Ty)) {
5430*06c3fb27SDimitry Andric     Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0);
5431*06c3fb27SDimitry Andric     Dst = MI.getOperand(0).getReg();
5432*06c3fb27SDimitry Andric   }
54335ffd83dbSDimitry Andric   LLT EltTy = Ty.getScalarType();
54345ffd83dbSDimitry Andric   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
54355ffd83dbSDimitry Andric   const bool Unpacked = ST.hasUnpackedD16VMem();
54365ffd83dbSDimitry Andric 
5437fe6060f1SDimitry Andric   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
54385ffd83dbSDimitry Andric 
54395ffd83dbSDimitry Andric   unsigned Opc;
54405ffd83dbSDimitry Andric 
5441bdd1243dSDimitry Andric   // TODO: Support TFE for typed and narrow loads.
54425ffd83dbSDimitry Andric   if (IsTyped) {
5443bdd1243dSDimitry Andric     if (IsTFE)
5444bdd1243dSDimitry Andric       return false;
54455ffd83dbSDimitry Andric     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
54465ffd83dbSDimitry Andric                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
54475ffd83dbSDimitry Andric   } else if (IsFormat) {
5448bdd1243dSDimitry Andric     if (IsD16) {
5449bdd1243dSDimitry Andric       if (IsTFE)
5450bdd1243dSDimitry Andric         return false;
5451bdd1243dSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
54525ffd83dbSDimitry Andric     } else {
5453bdd1243dSDimitry Andric       Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
5454bdd1243dSDimitry Andric                   : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
5455bdd1243dSDimitry Andric     }
5456bdd1243dSDimitry Andric   } else {
5457bdd1243dSDimitry Andric     if (IsTFE)
5458bdd1243dSDimitry Andric       return false;
5459fe6060f1SDimitry Andric     switch (MemTy.getSizeInBits()) {
5460fe6060f1SDimitry Andric     case 8:
54615ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
54625ffd83dbSDimitry Andric       break;
5463fe6060f1SDimitry Andric     case 16:
54645ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
54655ffd83dbSDimitry Andric       break;
54665ffd83dbSDimitry Andric     default:
54675ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
54685ffd83dbSDimitry Andric       break;
54695ffd83dbSDimitry Andric     }
54705ffd83dbSDimitry Andric   }
54715ffd83dbSDimitry Andric 
5472bdd1243dSDimitry Andric   if (IsTFE) {
5473bdd1243dSDimitry Andric     unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
5474bdd1243dSDimitry Andric     unsigned NumLoadDWords = NumValueDWords + 1;
5475bdd1243dSDimitry Andric     LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32);
5476bdd1243dSDimitry Andric     Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
5477bdd1243dSDimitry Andric     buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
5478bdd1243dSDimitry Andric                     Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
5479bdd1243dSDimitry Andric     if (NumValueDWords == 1) {
5480bdd1243dSDimitry Andric       B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
5481bdd1243dSDimitry Andric     } else {
5482bdd1243dSDimitry Andric       SmallVector<Register, 5> LoadElts;
5483bdd1243dSDimitry Andric       for (unsigned I = 0; I != NumValueDWords; ++I)
5484bdd1243dSDimitry Andric         LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
5485bdd1243dSDimitry Andric       LoadElts.push_back(StatusDst);
5486bdd1243dSDimitry Andric       B.buildUnmerge(LoadElts, LoadDstReg);
5487bdd1243dSDimitry Andric       LoadElts.truncate(NumValueDWords);
5488bdd1243dSDimitry Andric       B.buildMergeLikeInstr(Dst, LoadElts);
5489bdd1243dSDimitry Andric     }
5490bdd1243dSDimitry Andric   } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
5491bdd1243dSDimitry Andric              (IsD16 && !Ty.isVector())) {
5492bdd1243dSDimitry Andric     Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
5493bdd1243dSDimitry Andric     buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
5494bdd1243dSDimitry Andric                     Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
54955ffd83dbSDimitry Andric     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
54965ffd83dbSDimitry Andric     B.buildTrunc(Dst, LoadDstReg);
5497bdd1243dSDimitry Andric   } else if (Unpacked && IsD16 && Ty.isVector()) {
5498bdd1243dSDimitry Andric     LLT UnpackedTy = Ty.changeElementSize(32);
5499bdd1243dSDimitry Andric     Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
5500bdd1243dSDimitry Andric     buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
5501bdd1243dSDimitry Andric                     Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
5502bdd1243dSDimitry Andric     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
55035ffd83dbSDimitry Andric     // FIXME: G_TRUNC should work, but legalization currently fails
55045ffd83dbSDimitry Andric     auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
55055ffd83dbSDimitry Andric     SmallVector<Register, 4> Repack;
55065ffd83dbSDimitry Andric     for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
55075ffd83dbSDimitry Andric       Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
5508bdd1243dSDimitry Andric     B.buildMergeLikeInstr(Dst, Repack);
5509bdd1243dSDimitry Andric   } else {
5510bdd1243dSDimitry Andric     buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
5511bdd1243dSDimitry Andric                     AuxiliaryData, MMO, IsTyped, HasVIndex, B);
55125ffd83dbSDimitry Andric   }
55135ffd83dbSDimitry Andric 
55145ffd83dbSDimitry Andric   MI.eraseFromParent();
55155ffd83dbSDimitry Andric   return true;
55165ffd83dbSDimitry Andric }
55175ffd83dbSDimitry Andric 
55185ffd83dbSDimitry Andric static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
55195ffd83dbSDimitry Andric   switch (IntrID) {
55205ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5521*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
55225ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5523*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
55245ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
55255ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_add:
5526*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
55275ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_add:
5528*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
55295ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
55305ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5531*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
55325ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5533*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
55345ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
55355ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5536*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
55375ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5538*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
55395ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
55405ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5541*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
55425ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5543*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
55445ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
55455ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5546*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
55475ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5548*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
55495ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
55505ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5551*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
55525ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5553*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
55545ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
55555ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_and:
5556*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
55575ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_and:
5558*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
55595ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
55605ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_or:
5561*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
55625ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_or:
5563*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
55645ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
55655ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5566*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
55675ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
5568*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
55695ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
55705ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
5571*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
55725ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
5573*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
55745ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
55755ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
5576*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
55775ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
5578*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
55795ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
55805ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
5581*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
55825ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
5583*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
55845ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
5585e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
5586*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
5587e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
5588*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
5589e8d8bef9SDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
5590fe6060f1SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
5591*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
5592fe6060f1SDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
5593*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
5594fe6060f1SDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
5595fe6060f1SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
5596*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
5597fe6060f1SDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
5598*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
5599fe6060f1SDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
56005ffd83dbSDimitry Andric   default:
56015ffd83dbSDimitry Andric     llvm_unreachable("unhandled atomic opcode");
56025ffd83dbSDimitry Andric   }
56035ffd83dbSDimitry Andric }
56045ffd83dbSDimitry Andric 
56055ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
56065ffd83dbSDimitry Andric                                                MachineIRBuilder &B,
56075ffd83dbSDimitry Andric                                                Intrinsic::ID IID) const {
5608*06c3fb27SDimitry Andric   const bool IsCmpSwap =
5609*06c3fb27SDimitry Andric       IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
5610*06c3fb27SDimitry Andric       IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
5611*06c3fb27SDimitry Andric       IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
5612*06c3fb27SDimitry Andric       IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
5613e8d8bef9SDimitry Andric   const bool HasReturn = MI.getNumExplicitDefs() != 0;
56145ffd83dbSDimitry Andric 
5615e8d8bef9SDimitry Andric   Register Dst;
56165ffd83dbSDimitry Andric 
56175ffd83dbSDimitry Andric   int OpOffset = 0;
5618e8d8bef9SDimitry Andric   if (HasReturn) {
5619e8d8bef9SDimitry Andric     // A few FP atomics do not support return values.
5620e8d8bef9SDimitry Andric     Dst = MI.getOperand(0).getReg();
5621e8d8bef9SDimitry Andric   } else {
5622e8d8bef9SDimitry Andric     OpOffset = -1;
5623e8d8bef9SDimitry Andric   }
5624e8d8bef9SDimitry Andric 
5625*06c3fb27SDimitry Andric   // Since we don't have 128-bit atomics, we don't need to handle the case of
5626*06c3fb27SDimitry Andric   // p8 argmunents to the atomic itself
5627e8d8bef9SDimitry Andric   Register VData = MI.getOperand(2 + OpOffset).getReg();
5628e8d8bef9SDimitry Andric   Register CmpVal;
56295ffd83dbSDimitry Andric 
56305ffd83dbSDimitry Andric   if (IsCmpSwap) {
56315ffd83dbSDimitry Andric     CmpVal = MI.getOperand(3 + OpOffset).getReg();
56325ffd83dbSDimitry Andric     ++OpOffset;
56335ffd83dbSDimitry Andric   }
56345ffd83dbSDimitry Andric 
5635*06c3fb27SDimitry Andric   castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset);
56365ffd83dbSDimitry Andric   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
5637e8d8bef9SDimitry Andric   const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
56385ffd83dbSDimitry Andric 
56395ffd83dbSDimitry Andric   // The struct intrinsic variants add one additional operand over raw.
56405ffd83dbSDimitry Andric   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
56415ffd83dbSDimitry Andric   Register VIndex;
56425ffd83dbSDimitry Andric   if (HasVIndex) {
56435ffd83dbSDimitry Andric     VIndex = MI.getOperand(4 + OpOffset).getReg();
56445ffd83dbSDimitry Andric     ++OpOffset;
5645fe6060f1SDimitry Andric   } else {
5646fe6060f1SDimitry Andric     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
56475ffd83dbSDimitry Andric   }
56485ffd83dbSDimitry Andric 
56495ffd83dbSDimitry Andric   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
56505ffd83dbSDimitry Andric   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
56515ffd83dbSDimitry Andric   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
56525ffd83dbSDimitry Andric 
56535ffd83dbSDimitry Andric   MachineMemOperand *MMO = *MI.memoperands_begin();
56545ffd83dbSDimitry Andric 
56555ffd83dbSDimitry Andric   unsigned ImmOffset;
5656fe6060f1SDimitry Andric   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
56575ffd83dbSDimitry Andric 
5658e8d8bef9SDimitry Andric   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
5659e8d8bef9SDimitry Andric 
5660e8d8bef9SDimitry Andric   if (HasReturn)
5661e8d8bef9SDimitry Andric     MIB.addDef(Dst);
5662e8d8bef9SDimitry Andric 
5663e8d8bef9SDimitry Andric   MIB.addUse(VData); // vdata
56645ffd83dbSDimitry Andric 
56655ffd83dbSDimitry Andric   if (IsCmpSwap)
56665ffd83dbSDimitry Andric     MIB.addReg(CmpVal);
56675ffd83dbSDimitry Andric 
56685ffd83dbSDimitry Andric   MIB.addUse(RSrc)               // rsrc
56695ffd83dbSDimitry Andric      .addUse(VIndex)             // vindex
56705ffd83dbSDimitry Andric      .addUse(VOffset)            // voffset
56715ffd83dbSDimitry Andric      .addUse(SOffset)            // soffset
56725ffd83dbSDimitry Andric      .addImm(ImmOffset)          // offset(imm)
56735ffd83dbSDimitry Andric      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
56745ffd83dbSDimitry Andric      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
56755ffd83dbSDimitry Andric      .addMemOperand(MMO);
56765ffd83dbSDimitry Andric 
56775ffd83dbSDimitry Andric   MI.eraseFromParent();
56785ffd83dbSDimitry Andric   return true;
56795ffd83dbSDimitry Andric }
56805ffd83dbSDimitry Andric 
5681fe6060f1SDimitry Andric /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
56825ffd83dbSDimitry Andric /// vector with s16 typed elements.
5683fe6060f1SDimitry Andric static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
5684fe6060f1SDimitry Andric                                       SmallVectorImpl<Register> &PackedAddrs,
5685fe6060f1SDimitry Andric                                       unsigned ArgOffset,
5686fe6060f1SDimitry Andric                                       const AMDGPU::ImageDimIntrinsicInfo *Intr,
5687fe6060f1SDimitry Andric                                       bool IsA16, bool IsG16) {
56885ffd83dbSDimitry Andric   const LLT S16 = LLT::scalar(16);
5689fe6060f1SDimitry Andric   const LLT V2S16 = LLT::fixed_vector(2, 16);
5690fe6060f1SDimitry Andric   auto EndIdx = Intr->VAddrEnd;
56915ffd83dbSDimitry Andric 
5692e8d8bef9SDimitry Andric   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
5693e8d8bef9SDimitry Andric     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
56945ffd83dbSDimitry Andric     if (!SrcOp.isReg())
56955ffd83dbSDimitry Andric       continue; // _L to _LZ may have eliminated this.
56965ffd83dbSDimitry Andric 
56975ffd83dbSDimitry Andric     Register AddrReg = SrcOp.getReg();
56985ffd83dbSDimitry Andric 
5699fe6060f1SDimitry Andric     if ((I < Intr->GradientStart) ||
5700fe6060f1SDimitry Andric         (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
5701fe6060f1SDimitry Andric         (I >= Intr->CoordStart && !IsA16)) {
57020eae32dcSDimitry Andric       if ((I < Intr->GradientStart) && IsA16 &&
57030eae32dcSDimitry Andric           (B.getMRI()->getType(AddrReg) == S16)) {
570404eeddc0SDimitry Andric         assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
57050eae32dcSDimitry Andric         // Special handling of bias when A16 is on. Bias is of type half but
57060eae32dcSDimitry Andric         // occupies full 32-bit.
57070eae32dcSDimitry Andric         PackedAddrs.push_back(
57080eae32dcSDimitry Andric             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
57090eae32dcSDimitry Andric                 .getReg(0));
57100eae32dcSDimitry Andric       } else {
571104eeddc0SDimitry Andric         assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
571204eeddc0SDimitry Andric                "Bias needs to be converted to 16 bit in A16 mode");
571304eeddc0SDimitry Andric         // Handle any gradient or coordinate operands that should not be packed
57145ffd83dbSDimitry Andric         AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
57155ffd83dbSDimitry Andric         PackedAddrs.push_back(AddrReg);
57160eae32dcSDimitry Andric       }
57175ffd83dbSDimitry Andric     } else {
57185ffd83dbSDimitry Andric       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
57195ffd83dbSDimitry Andric       // derivatives dx/dh and dx/dv are packed with undef.
57205ffd83dbSDimitry Andric       if (((I + 1) >= EndIdx) ||
5721e8d8bef9SDimitry Andric           ((Intr->NumGradients / 2) % 2 == 1 &&
5722e8d8bef9SDimitry Andric            (I == static_cast<unsigned>(Intr->GradientStart +
5723e8d8bef9SDimitry Andric                                        (Intr->NumGradients / 2) - 1) ||
5724e8d8bef9SDimitry Andric             I == static_cast<unsigned>(Intr->GradientStart +
5725e8d8bef9SDimitry Andric                                        Intr->NumGradients - 1))) ||
57265ffd83dbSDimitry Andric           // Check for _L to _LZ optimization
5727e8d8bef9SDimitry Andric           !MI.getOperand(ArgOffset + I + 1).isReg()) {
57285ffd83dbSDimitry Andric         PackedAddrs.push_back(
57295ffd83dbSDimitry Andric             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
57305ffd83dbSDimitry Andric                 .getReg(0));
57315ffd83dbSDimitry Andric       } else {
57325ffd83dbSDimitry Andric         PackedAddrs.push_back(
5733e8d8bef9SDimitry Andric             B.buildBuildVector(
5734e8d8bef9SDimitry Andric                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
57355ffd83dbSDimitry Andric                 .getReg(0));
57365ffd83dbSDimitry Andric         ++I;
57375ffd83dbSDimitry Andric       }
57385ffd83dbSDimitry Andric     }
57395ffd83dbSDimitry Andric   }
57405ffd83dbSDimitry Andric }
57415ffd83dbSDimitry Andric 
57425ffd83dbSDimitry Andric /// Convert from separate vaddr components to a single vector address register,
57435ffd83dbSDimitry Andric /// and replace the remaining operands with $noreg.
57445ffd83dbSDimitry Andric static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
57455ffd83dbSDimitry Andric                                      int DimIdx, int NumVAddrs) {
57465ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
5747bdd1243dSDimitry Andric   (void)S32;
57485ffd83dbSDimitry Andric   SmallVector<Register, 8> AddrRegs;
57495ffd83dbSDimitry Andric   for (int I = 0; I != NumVAddrs; ++I) {
57505ffd83dbSDimitry Andric     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
57515ffd83dbSDimitry Andric     if (SrcOp.isReg()) {
57525ffd83dbSDimitry Andric       AddrRegs.push_back(SrcOp.getReg());
57535ffd83dbSDimitry Andric       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
57545ffd83dbSDimitry Andric     }
57555ffd83dbSDimitry Andric   }
57565ffd83dbSDimitry Andric 
57575ffd83dbSDimitry Andric   int NumAddrRegs = AddrRegs.size();
57585ffd83dbSDimitry Andric   if (NumAddrRegs != 1) {
5759fe6060f1SDimitry Andric     auto VAddr =
5760fe6060f1SDimitry Andric         B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
57615ffd83dbSDimitry Andric     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
57625ffd83dbSDimitry Andric   }
57635ffd83dbSDimitry Andric 
57645ffd83dbSDimitry Andric   for (int I = 1; I != NumVAddrs; ++I) {
57655ffd83dbSDimitry Andric     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
57665ffd83dbSDimitry Andric     if (SrcOp.isReg())
57675ffd83dbSDimitry Andric       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
57685ffd83dbSDimitry Andric   }
57695ffd83dbSDimitry Andric }
57705ffd83dbSDimitry Andric 
57715ffd83dbSDimitry Andric /// Rewrite image intrinsics to use register layouts expected by the subtarget.
57725ffd83dbSDimitry Andric ///
57735ffd83dbSDimitry Andric /// Depending on the subtarget, load/store with 16-bit element data need to be
57745ffd83dbSDimitry Andric /// rewritten to use the low half of 32-bit registers, or directly use a packed
57755ffd83dbSDimitry Andric /// layout. 16-bit addresses should also sometimes be packed into 32-bit
57765ffd83dbSDimitry Andric /// registers.
57775ffd83dbSDimitry Andric ///
57785ffd83dbSDimitry Andric /// We don't want to directly select image instructions just yet, but also want
57795ffd83dbSDimitry Andric /// to expose all register repacking to the legalizer/combiners. We also don't
578081ad6265SDimitry Andric /// want a selected instruction entering RegBankSelect. In order to avoid
57815ffd83dbSDimitry Andric /// defining a multitude of intermediate image instructions, directly hack on
5782349cc55cSDimitry Andric /// the intrinsic's arguments. In cases like a16 addresses, this requires
5783349cc55cSDimitry Andric /// padding now unnecessary arguments with $noreg.
57845ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
5785e8d8bef9SDimitry Andric     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
5786e8d8bef9SDimitry Andric     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
57875ffd83dbSDimitry Andric 
5788bdd1243dSDimitry Andric   const MachineFunction &MF = *MI.getMF();
5789e8d8bef9SDimitry Andric   const unsigned NumDefs = MI.getNumExplicitDefs();
5790e8d8bef9SDimitry Andric   const unsigned ArgOffset = NumDefs + 1;
57915ffd83dbSDimitry Andric   bool IsTFE = NumDefs == 2;
57925ffd83dbSDimitry Andric   // We are only processing the operands of d16 image operations on subtargets
57935ffd83dbSDimitry Andric   // that use the unpacked register layout, or need to repack the TFE result.
57945ffd83dbSDimitry Andric 
57955ffd83dbSDimitry Andric   // TODO: Do we need to guard against already legalized intrinsics?
57965ffd83dbSDimitry Andric   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5797e8d8bef9SDimitry Andric       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
57985ffd83dbSDimitry Andric 
57995ffd83dbSDimitry Andric   MachineRegisterInfo *MRI = B.getMRI();
58005ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
58015ffd83dbSDimitry Andric   const LLT S16 = LLT::scalar(16);
5802fe6060f1SDimitry Andric   const LLT V2S16 = LLT::fixed_vector(2, 16);
58035ffd83dbSDimitry Andric 
58045ffd83dbSDimitry Andric   unsigned DMask = 0;
580504eeddc0SDimitry Andric   Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
580604eeddc0SDimitry Andric   LLT Ty = MRI->getType(VData);
58075ffd83dbSDimitry Andric 
58085ffd83dbSDimitry Andric   // Check for 16 bit addresses and pack if true.
5809e8d8bef9SDimitry Andric   LLT GradTy =
5810e8d8bef9SDimitry Andric       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
5811e8d8bef9SDimitry Andric   LLT AddrTy =
5812e8d8bef9SDimitry Andric       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
5813*06c3fb27SDimitry Andric   const bool IsG16 =
5814*06c3fb27SDimitry Andric       ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
58155ffd83dbSDimitry Andric   const bool IsA16 = AddrTy == S16;
581604eeddc0SDimitry Andric   const bool IsD16 = Ty.getScalarType() == S16;
58175ffd83dbSDimitry Andric 
58185ffd83dbSDimitry Andric   int DMaskLanes = 0;
58195ffd83dbSDimitry Andric   if (!BaseOpcode->Atomic) {
5820e8d8bef9SDimitry Andric     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
58215ffd83dbSDimitry Andric     if (BaseOpcode->Gather4) {
58225ffd83dbSDimitry Andric       DMaskLanes = 4;
58235ffd83dbSDimitry Andric     } else if (DMask != 0) {
5824bdd1243dSDimitry Andric       DMaskLanes = llvm::popcount(DMask);
58255ffd83dbSDimitry Andric     } else if (!IsTFE && !BaseOpcode->Store) {
58265ffd83dbSDimitry Andric       // If dmask is 0, this is a no-op load. This can be eliminated.
58275ffd83dbSDimitry Andric       B.buildUndef(MI.getOperand(0));
58285ffd83dbSDimitry Andric       MI.eraseFromParent();
58295ffd83dbSDimitry Andric       return true;
58305ffd83dbSDimitry Andric     }
58315ffd83dbSDimitry Andric   }
58325ffd83dbSDimitry Andric 
58335ffd83dbSDimitry Andric   Observer.changingInstr(MI);
58345ffd83dbSDimitry Andric   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
58355ffd83dbSDimitry Andric 
583604eeddc0SDimitry Andric   const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
583704eeddc0SDimitry Andric                                      : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
583804eeddc0SDimitry Andric   const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
583904eeddc0SDimitry Andric                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
584004eeddc0SDimitry Andric   unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;
58415ffd83dbSDimitry Andric 
58425ffd83dbSDimitry Andric   // Track that we legalized this
58435ffd83dbSDimitry Andric   MI.setDesc(B.getTII().get(NewOpcode));
58445ffd83dbSDimitry Andric 
58455ffd83dbSDimitry Andric   // Expecting to get an error flag since TFC is on and dmask is 0. Force
58465ffd83dbSDimitry Andric   // dmask to be at least 1, otherwise the instruction will fail.
58475ffd83dbSDimitry Andric   if (IsTFE && DMask == 0) {
58485ffd83dbSDimitry Andric     DMask = 0x1;
58495ffd83dbSDimitry Andric     DMaskLanes = 1;
5850e8d8bef9SDimitry Andric     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
58515ffd83dbSDimitry Andric   }
58525ffd83dbSDimitry Andric 
58535ffd83dbSDimitry Andric   if (BaseOpcode->Atomic) {
58545ffd83dbSDimitry Andric     Register VData0 = MI.getOperand(2).getReg();
58555ffd83dbSDimitry Andric     LLT Ty = MRI->getType(VData0);
58565ffd83dbSDimitry Andric 
58575ffd83dbSDimitry Andric     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
58585ffd83dbSDimitry Andric     if (Ty.isVector())
58595ffd83dbSDimitry Andric       return false;
58605ffd83dbSDimitry Andric 
58615ffd83dbSDimitry Andric     if (BaseOpcode->AtomicX2) {
58625ffd83dbSDimitry Andric       Register VData1 = MI.getOperand(3).getReg();
58635ffd83dbSDimitry Andric       // The two values are packed in one register.
5864fe6060f1SDimitry Andric       LLT PackedTy = LLT::fixed_vector(2, Ty);
58655ffd83dbSDimitry Andric       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
58665ffd83dbSDimitry Andric       MI.getOperand(2).setReg(Concat.getReg(0));
58675ffd83dbSDimitry Andric       MI.getOperand(3).setReg(AMDGPU::NoRegister);
58685ffd83dbSDimitry Andric     }
58695ffd83dbSDimitry Andric   }
58705ffd83dbSDimitry Andric 
5871e8d8bef9SDimitry Andric   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
58725ffd83dbSDimitry Andric 
58735ffd83dbSDimitry Andric   // Rewrite the addressing register layout before doing anything else.
5874fe6060f1SDimitry Andric   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
5875fe6060f1SDimitry Andric     // 16 bit gradients are supported, but are tied to the A16 control
5876fe6060f1SDimitry Andric     // so both gradients and addresses must be 16 bit
58775ffd83dbSDimitry Andric     return false;
5878fe6060f1SDimitry Andric   }
58795ffd83dbSDimitry Andric 
5880fe6060f1SDimitry Andric   if (IsA16 && !ST.hasA16()) {
5881fe6060f1SDimitry Andric     // A16 not supported
5882fe6060f1SDimitry Andric     return false;
5883fe6060f1SDimitry Andric   }
5884fe6060f1SDimitry Andric 
5885*06c3fb27SDimitry Andric   const unsigned NSAMaxSize = ST.getNSAMaxSize();
5886*06c3fb27SDimitry Andric   const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
5887*06c3fb27SDimitry Andric 
5888fe6060f1SDimitry Andric   if (IsA16 || IsG16) {
5889e8d8bef9SDimitry Andric     if (Intr->NumVAddrs > 1) {
58905ffd83dbSDimitry Andric       SmallVector<Register, 4> PackedRegs;
58915ffd83dbSDimitry Andric 
5892fe6060f1SDimitry Andric       packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
5893fe6060f1SDimitry Andric                                 IsG16);
58945ffd83dbSDimitry Andric 
58955ffd83dbSDimitry Andric       // See also below in the non-a16 branch
5896bdd1243dSDimitry Andric       const bool UseNSA = ST.hasNSAEncoding() &&
5897bdd1243dSDimitry Andric                           PackedRegs.size() >= ST.getNSAThreshold(MF) &&
5898*06c3fb27SDimitry Andric                           (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
5899*06c3fb27SDimitry Andric       const bool UsePartialNSA =
5900*06c3fb27SDimitry Andric           UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;
59015ffd83dbSDimitry Andric 
5902*06c3fb27SDimitry Andric       if (UsePartialNSA) {
5903*06c3fb27SDimitry Andric         // Pack registers that would go over NSAMaxSize into last VAddr register
5904*06c3fb27SDimitry Andric         LLT PackedAddrTy =
5905*06c3fb27SDimitry Andric             LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16);
5906*06c3fb27SDimitry Andric         auto Concat = B.buildConcatVectors(
5907*06c3fb27SDimitry Andric             PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
5908*06c3fb27SDimitry Andric         PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
5909*06c3fb27SDimitry Andric         PackedRegs.resize(NSAMaxSize);
5910*06c3fb27SDimitry Andric       } else if (!UseNSA && PackedRegs.size() > 1) {
5911fe6060f1SDimitry Andric         LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
59125ffd83dbSDimitry Andric         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
59135ffd83dbSDimitry Andric         PackedRegs[0] = Concat.getReg(0);
59145ffd83dbSDimitry Andric         PackedRegs.resize(1);
59155ffd83dbSDimitry Andric       }
59165ffd83dbSDimitry Andric 
5917e8d8bef9SDimitry Andric       const unsigned NumPacked = PackedRegs.size();
5918e8d8bef9SDimitry Andric       for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
5919e8d8bef9SDimitry Andric         MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
59205ffd83dbSDimitry Andric         if (!SrcOp.isReg()) {
59215ffd83dbSDimitry Andric           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
59225ffd83dbSDimitry Andric           continue;
59235ffd83dbSDimitry Andric         }
59245ffd83dbSDimitry Andric 
59255ffd83dbSDimitry Andric         assert(SrcOp.getReg() != AMDGPU::NoRegister);
59265ffd83dbSDimitry Andric 
5927e8d8bef9SDimitry Andric         if (I - Intr->VAddrStart < NumPacked)
5928e8d8bef9SDimitry Andric           SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
59295ffd83dbSDimitry Andric         else
59305ffd83dbSDimitry Andric           SrcOp.setReg(AMDGPU::NoRegister);
59315ffd83dbSDimitry Andric       }
59325ffd83dbSDimitry Andric     }
59335ffd83dbSDimitry Andric   } else {
59345ffd83dbSDimitry Andric     // If the register allocator cannot place the address registers contiguously
59355ffd83dbSDimitry Andric     // without introducing moves, then using the non-sequential address encoding
59365ffd83dbSDimitry Andric     // is always preferable, since it saves VALU instructions and is usually a
59375ffd83dbSDimitry Andric     // wash in terms of code size or even better.
59385ffd83dbSDimitry Andric     //
59395ffd83dbSDimitry Andric     // However, we currently have no way of hinting to the register allocator
59405ffd83dbSDimitry Andric     // that MIMG addresses should be placed contiguously when it is possible to
59415ffd83dbSDimitry Andric     // do so, so force non-NSA for the common 2-address case as a heuristic.
59425ffd83dbSDimitry Andric     //
59435ffd83dbSDimitry Andric     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
59445ffd83dbSDimitry Andric     // allocation when possible.
594581ad6265SDimitry Andric     //
5946*06c3fb27SDimitry Andric     // Partial NSA is allowed on GFX11 where the final register is a contiguous
5947*06c3fb27SDimitry Andric     // set of the remaining addresses.
5948bdd1243dSDimitry Andric     const bool UseNSA = ST.hasNSAEncoding() &&
5949bdd1243dSDimitry Andric                         CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
5950*06c3fb27SDimitry Andric                         (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
5951*06c3fb27SDimitry Andric     const bool UsePartialNSA =
5952*06c3fb27SDimitry Andric         UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
59535ffd83dbSDimitry Andric 
5954*06c3fb27SDimitry Andric     if (UsePartialNSA) {
5955*06c3fb27SDimitry Andric       convertImageAddrToPacked(B, MI,
5956*06c3fb27SDimitry Andric                                ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
5957*06c3fb27SDimitry Andric                                Intr->NumVAddrs - NSAMaxSize + 1);
5958*06c3fb27SDimitry Andric     } else if (!UseNSA && Intr->NumVAddrs > 1) {
5959e8d8bef9SDimitry Andric       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
5960e8d8bef9SDimitry Andric                                Intr->NumVAddrs);
59615ffd83dbSDimitry Andric     }
5962*06c3fb27SDimitry Andric   }
59635ffd83dbSDimitry Andric 
59645ffd83dbSDimitry Andric   int Flags = 0;
59655ffd83dbSDimitry Andric   if (IsA16)
59665ffd83dbSDimitry Andric     Flags |= 1;
59675ffd83dbSDimitry Andric   if (IsG16)
59685ffd83dbSDimitry Andric     Flags |= 2;
59695ffd83dbSDimitry Andric   MI.addOperand(MachineOperand::CreateImm(Flags));
59705ffd83dbSDimitry Andric 
59715ffd83dbSDimitry Andric   if (BaseOpcode->Store) { // No TFE for stores?
59725ffd83dbSDimitry Andric     // TODO: Handle dmask trim
597304eeddc0SDimitry Andric     if (!Ty.isVector() || !IsD16)
59745ffd83dbSDimitry Andric       return true;
59755ffd83dbSDimitry Andric 
5976e8d8bef9SDimitry Andric     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
59775ffd83dbSDimitry Andric     if (RepackedReg != VData) {
59785ffd83dbSDimitry Andric       MI.getOperand(1).setReg(RepackedReg);
59795ffd83dbSDimitry Andric     }
59805ffd83dbSDimitry Andric 
59815ffd83dbSDimitry Andric     return true;
59825ffd83dbSDimitry Andric   }
59835ffd83dbSDimitry Andric 
59845ffd83dbSDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
59855ffd83dbSDimitry Andric   const LLT EltTy = Ty.getScalarType();
59865ffd83dbSDimitry Andric   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
59875ffd83dbSDimitry Andric 
59885ffd83dbSDimitry Andric   // Confirm that the return type is large enough for the dmask specified
59895ffd83dbSDimitry Andric   if (NumElts < DMaskLanes)
59905ffd83dbSDimitry Andric     return false;
59915ffd83dbSDimitry Andric 
59925ffd83dbSDimitry Andric   if (NumElts > 4 || DMaskLanes > 4)
59935ffd83dbSDimitry Andric     return false;
59945ffd83dbSDimitry Andric 
59955ffd83dbSDimitry Andric   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
5996fe6060f1SDimitry Andric   const LLT AdjustedTy =
5997fe6060f1SDimitry Andric       Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
59985ffd83dbSDimitry Andric 
59995ffd83dbSDimitry Andric   // The raw dword aligned data component of the load. The only legal cases
60005ffd83dbSDimitry Andric   // where this matters should be when using the packed D16 format, for
60015ffd83dbSDimitry Andric   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
60025ffd83dbSDimitry Andric   LLT RoundedTy;
60035ffd83dbSDimitry Andric 
6004bdd1243dSDimitry Andric   // S32 vector to cover all data, plus TFE result element.
60055ffd83dbSDimitry Andric   LLT TFETy;
60065ffd83dbSDimitry Andric 
60075ffd83dbSDimitry Andric   // Register type to use for each loaded component. Will be S32 or V2S16.
60085ffd83dbSDimitry Andric   LLT RegTy;
60095ffd83dbSDimitry Andric 
60105ffd83dbSDimitry Andric   if (IsD16 && ST.hasUnpackedD16VMem()) {
6011fe6060f1SDimitry Andric     RoundedTy =
6012fe6060f1SDimitry Andric         LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
6013fe6060f1SDimitry Andric     TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
60145ffd83dbSDimitry Andric     RegTy = S32;
60155ffd83dbSDimitry Andric   } else {
60165ffd83dbSDimitry Andric     unsigned EltSize = EltTy.getSizeInBits();
60175ffd83dbSDimitry Andric     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
60185ffd83dbSDimitry Andric     unsigned RoundedSize = 32 * RoundedElts;
6019fe6060f1SDimitry Andric     RoundedTy = LLT::scalarOrVector(
6020fe6060f1SDimitry Andric         ElementCount::getFixed(RoundedSize / EltSize), EltSize);
6021fe6060f1SDimitry Andric     TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
60225ffd83dbSDimitry Andric     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
60235ffd83dbSDimitry Andric   }
60245ffd83dbSDimitry Andric 
60255ffd83dbSDimitry Andric   // The return type does not need adjustment.
60265ffd83dbSDimitry Andric   // TODO: Should we change s16 case to s32 or <2 x s16>?
60275ffd83dbSDimitry Andric   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
60285ffd83dbSDimitry Andric     return true;
60295ffd83dbSDimitry Andric 
60305ffd83dbSDimitry Andric   Register Dst1Reg;
60315ffd83dbSDimitry Andric 
60325ffd83dbSDimitry Andric   // Insert after the instruction.
60335ffd83dbSDimitry Andric   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
60345ffd83dbSDimitry Andric 
60355ffd83dbSDimitry Andric   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
60365ffd83dbSDimitry Andric   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
60375ffd83dbSDimitry Andric   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
60385ffd83dbSDimitry Andric   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
60395ffd83dbSDimitry Andric 
60405ffd83dbSDimitry Andric   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
60415ffd83dbSDimitry Andric 
60425ffd83dbSDimitry Andric   MI.getOperand(0).setReg(NewResultReg);
60435ffd83dbSDimitry Andric 
60445ffd83dbSDimitry Andric   // In the IR, TFE is supposed to be used with a 2 element struct return
6045349cc55cSDimitry Andric   // type. The instruction really returns these two values in one contiguous
60465ffd83dbSDimitry Andric   // register, with one additional dword beyond the loaded data. Rewrite the
60475ffd83dbSDimitry Andric   // return type to use a single register result.
60485ffd83dbSDimitry Andric 
60495ffd83dbSDimitry Andric   if (IsTFE) {
60505ffd83dbSDimitry Andric     Dst1Reg = MI.getOperand(1).getReg();
60515ffd83dbSDimitry Andric     if (MRI->getType(Dst1Reg) != S32)
60525ffd83dbSDimitry Andric       return false;
60535ffd83dbSDimitry Andric 
60545ffd83dbSDimitry Andric     // TODO: Make sure the TFE operand bit is set.
605581ad6265SDimitry Andric     MI.removeOperand(1);
60565ffd83dbSDimitry Andric 
60575ffd83dbSDimitry Andric     // Handle the easy case that requires no repack instructions.
60585ffd83dbSDimitry Andric     if (Ty == S32) {
60595ffd83dbSDimitry Andric       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
60605ffd83dbSDimitry Andric       return true;
60615ffd83dbSDimitry Andric     }
60625ffd83dbSDimitry Andric   }
60635ffd83dbSDimitry Andric 
60645ffd83dbSDimitry Andric   // Now figure out how to copy the new result register back into the old
60655ffd83dbSDimitry Andric   // result.
60665ffd83dbSDimitry Andric   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
60675ffd83dbSDimitry Andric 
60685ffd83dbSDimitry Andric   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
60695ffd83dbSDimitry Andric 
60705ffd83dbSDimitry Andric   if (ResultNumRegs == 1) {
60715ffd83dbSDimitry Andric     assert(!IsTFE);
60725ffd83dbSDimitry Andric     ResultRegs[0] = NewResultReg;
60735ffd83dbSDimitry Andric   } else {
60745ffd83dbSDimitry Andric     // We have to repack into a new vector of some kind.
60755ffd83dbSDimitry Andric     for (int I = 0; I != NumDataRegs; ++I)
60765ffd83dbSDimitry Andric       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
60775ffd83dbSDimitry Andric     B.buildUnmerge(ResultRegs, NewResultReg);
60785ffd83dbSDimitry Andric 
60795ffd83dbSDimitry Andric     // Drop the final TFE element to get the data part. The TFE result is
60805ffd83dbSDimitry Andric     // directly written to the right place already.
60815ffd83dbSDimitry Andric     if (IsTFE)
60825ffd83dbSDimitry Andric       ResultRegs.resize(NumDataRegs);
60835ffd83dbSDimitry Andric   }
60845ffd83dbSDimitry Andric 
60855ffd83dbSDimitry Andric   // For an s16 scalar result, we form an s32 result with a truncate regardless
60865ffd83dbSDimitry Andric   // of packed vs. unpacked.
60875ffd83dbSDimitry Andric   if (IsD16 && !Ty.isVector()) {
60885ffd83dbSDimitry Andric     B.buildTrunc(DstReg, ResultRegs[0]);
60895ffd83dbSDimitry Andric     return true;
60905ffd83dbSDimitry Andric   }
60915ffd83dbSDimitry Andric 
60925ffd83dbSDimitry Andric   // Avoid a build/concat_vector of 1 entry.
60935ffd83dbSDimitry Andric   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
60945ffd83dbSDimitry Andric     B.buildBitcast(DstReg, ResultRegs[0]);
60955ffd83dbSDimitry Andric     return true;
60965ffd83dbSDimitry Andric   }
60975ffd83dbSDimitry Andric 
60985ffd83dbSDimitry Andric   assert(Ty.isVector());
60995ffd83dbSDimitry Andric 
61005ffd83dbSDimitry Andric   if (IsD16) {
61015ffd83dbSDimitry Andric     // For packed D16 results with TFE enabled, all the data components are
61025ffd83dbSDimitry Andric     // S32. Cast back to the expected type.
61035ffd83dbSDimitry Andric     //
61045ffd83dbSDimitry Andric     // TODO: We don't really need to use load s32 elements. We would only need one
61055ffd83dbSDimitry Andric     // cast for the TFE result if a multiple of v2s16 was used.
61065ffd83dbSDimitry Andric     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
61075ffd83dbSDimitry Andric       for (Register &Reg : ResultRegs)
61085ffd83dbSDimitry Andric         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
61095ffd83dbSDimitry Andric     } else if (ST.hasUnpackedD16VMem()) {
61105ffd83dbSDimitry Andric       for (Register &Reg : ResultRegs)
61115ffd83dbSDimitry Andric         Reg = B.buildTrunc(S16, Reg).getReg(0);
61125ffd83dbSDimitry Andric     }
61135ffd83dbSDimitry Andric   }
61145ffd83dbSDimitry Andric 
61155ffd83dbSDimitry Andric   auto padWithUndef = [&](LLT Ty, int NumElts) {
61165ffd83dbSDimitry Andric     if (NumElts == 0)
61175ffd83dbSDimitry Andric       return;
61185ffd83dbSDimitry Andric     Register Undef = B.buildUndef(Ty).getReg(0);
61195ffd83dbSDimitry Andric     for (int I = 0; I != NumElts; ++I)
61205ffd83dbSDimitry Andric       ResultRegs.push_back(Undef);
61215ffd83dbSDimitry Andric   };
61225ffd83dbSDimitry Andric 
61235ffd83dbSDimitry Andric   // Pad out any elements eliminated due to the dmask.
61245ffd83dbSDimitry Andric   LLT ResTy = MRI->getType(ResultRegs[0]);
61255ffd83dbSDimitry Andric   if (!ResTy.isVector()) {
61265ffd83dbSDimitry Andric     padWithUndef(ResTy, NumElts - ResultRegs.size());
61275ffd83dbSDimitry Andric     B.buildBuildVector(DstReg, ResultRegs);
61285ffd83dbSDimitry Andric     return true;
61295ffd83dbSDimitry Andric   }
61305ffd83dbSDimitry Andric 
61315ffd83dbSDimitry Andric   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
61325ffd83dbSDimitry Andric   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
61335ffd83dbSDimitry Andric 
61345ffd83dbSDimitry Andric   // Deal with the one annoying legal case.
6135fe6060f1SDimitry Andric   const LLT V3S16 = LLT::fixed_vector(3, 16);
61365ffd83dbSDimitry Andric   if (Ty == V3S16) {
61370eae32dcSDimitry Andric     if (IsTFE) {
61380eae32dcSDimitry Andric       if (ResultRegs.size() == 1) {
61390eae32dcSDimitry Andric         NewResultReg = ResultRegs[0];
61400eae32dcSDimitry Andric       } else if (ResultRegs.size() == 2) {
61410eae32dcSDimitry Andric         LLT V4S16 = LLT::fixed_vector(4, 16);
61420eae32dcSDimitry Andric         NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
61430eae32dcSDimitry Andric       } else {
61440eae32dcSDimitry Andric         return false;
61450eae32dcSDimitry Andric       }
61460eae32dcSDimitry Andric     }
61470eae32dcSDimitry Andric 
61480eae32dcSDimitry Andric     if (MRI->getType(DstReg).getNumElements() <
61490eae32dcSDimitry Andric         MRI->getType(NewResultReg).getNumElements()) {
61500eae32dcSDimitry Andric       B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
61510eae32dcSDimitry Andric     } else {
61520eae32dcSDimitry Andric       B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
61530eae32dcSDimitry Andric     }
61545ffd83dbSDimitry Andric     return true;
61555ffd83dbSDimitry Andric   }
61565ffd83dbSDimitry Andric 
61575ffd83dbSDimitry Andric   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
61585ffd83dbSDimitry Andric   B.buildConcatVectors(DstReg, ResultRegs);
61595ffd83dbSDimitry Andric   return true;
61605ffd83dbSDimitry Andric }
61615ffd83dbSDimitry Andric 
61625ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeSBufferLoad(
6163e8d8bef9SDimitry Andric   LegalizerHelper &Helper, MachineInstr &MI) const {
6164e8d8bef9SDimitry Andric   MachineIRBuilder &B = Helper.MIRBuilder;
6165e8d8bef9SDimitry Andric   GISelChangeObserver &Observer = Helper.Observer;
6166e8d8bef9SDimitry Andric 
61675ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
61685ffd83dbSDimitry Andric   LLT Ty = B.getMRI()->getType(Dst);
61695ffd83dbSDimitry Andric   unsigned Size = Ty.getSizeInBits();
61705ffd83dbSDimitry Andric   MachineFunction &MF = B.getMF();
61715ffd83dbSDimitry Andric 
61725ffd83dbSDimitry Andric   Observer.changingInstr(MI);
61735ffd83dbSDimitry Andric 
6174*06c3fb27SDimitry Andric   // Handle needing to s.buffer.load() a p8 value.
6175*06c3fb27SDimitry Andric   if (hasBufferRsrcWorkaround(Ty)) {
6176*06c3fb27SDimitry Andric     Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0);
6177*06c3fb27SDimitry Andric     Dst = MI.getOperand(0).getReg();
6178*06c3fb27SDimitry Andric     B.setInsertPt(B.getMBB(), MI);
6179*06c3fb27SDimitry Andric   }
6180fe6060f1SDimitry Andric   if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
6181e8d8bef9SDimitry Andric     Ty = getBitcastRegisterType(Ty);
6182e8d8bef9SDimitry Andric     Helper.bitcastDst(MI, Ty, 0);
6183e8d8bef9SDimitry Andric     Dst = MI.getOperand(0).getReg();
6184e8d8bef9SDimitry Andric     B.setInsertPt(B.getMBB(), MI);
6185e8d8bef9SDimitry Andric   }
6186e8d8bef9SDimitry Andric 
61875ffd83dbSDimitry Andric   // FIXME: We don't really need this intermediate instruction. The intrinsic
61885ffd83dbSDimitry Andric   // should be fixed to have a memory operand. Since it's readnone, we're not
61895ffd83dbSDimitry Andric   // allowed to add one.
61905ffd83dbSDimitry Andric   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
619181ad6265SDimitry Andric   MI.removeOperand(1); // Remove intrinsic ID
61925ffd83dbSDimitry Andric 
61935ffd83dbSDimitry Andric   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
61945ffd83dbSDimitry Andric   // TODO: Should this use datalayout alignment?
61955ffd83dbSDimitry Andric   const unsigned MemSize = (Size + 7) / 8;
61965ffd83dbSDimitry Andric   const Align MemAlign(4);
61975ffd83dbSDimitry Andric   MachineMemOperand *MMO = MF.getMachineMemOperand(
61985ffd83dbSDimitry Andric       MachinePointerInfo(),
61995ffd83dbSDimitry Andric       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
62005ffd83dbSDimitry Andric           MachineMemOperand::MOInvariant,
62015ffd83dbSDimitry Andric       MemSize, MemAlign);
62025ffd83dbSDimitry Andric   MI.addMemOperand(MF, MMO);
62035ffd83dbSDimitry Andric 
62045ffd83dbSDimitry Andric   // There are no 96-bit result scalar loads, but widening to 128-bit should
62055ffd83dbSDimitry Andric   // always be legal. We may need to restore this to a 96-bit result if it turns
62065ffd83dbSDimitry Andric   // out this needs to be converted to a vector load during RegBankSelect.
62075ffd83dbSDimitry Andric   if (!isPowerOf2_32(Size)) {
62085ffd83dbSDimitry Andric     if (Ty.isVector())
62095ffd83dbSDimitry Andric       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
62105ffd83dbSDimitry Andric     else
62115ffd83dbSDimitry Andric       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
62125ffd83dbSDimitry Andric   }
62135ffd83dbSDimitry Andric 
62145ffd83dbSDimitry Andric   Observer.changedInstr(MI);
62155ffd83dbSDimitry Andric   return true;
62165ffd83dbSDimitry Andric }
62175ffd83dbSDimitry Andric 
6218e8d8bef9SDimitry Andric // TODO: Move to selection
62195ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
62200b57cec5SDimitry Andric                                                 MachineRegisterInfo &MRI,
62210b57cec5SDimitry Andric                                                 MachineIRBuilder &B) const {
6222fe6060f1SDimitry Andric   if (!ST.isTrapHandlerEnabled() ||
6223fe6060f1SDimitry Andric       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
6224fe6060f1SDimitry Andric     return legalizeTrapEndpgm(MI, MRI, B);
6225fe6060f1SDimitry Andric 
6226*06c3fb27SDimitry Andric   const Module *M = B.getMF().getFunction().getParent();
6227*06c3fb27SDimitry Andric   unsigned CodeObjectVersion = AMDGPU::getCodeObjectVersion(*M);
6228*06c3fb27SDimitry Andric   if (CodeObjectVersion <= AMDGPU::AMDHSA_COV3)
6229fe6060f1SDimitry Andric     return legalizeTrapHsaQueuePtr(MI, MRI, B);
6230fe6060f1SDimitry Andric 
6231*06c3fb27SDimitry Andric   return ST.supportsGetDoorbellID() ?
6232*06c3fb27SDimitry Andric          legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B);
6233fe6060f1SDimitry Andric }
6234fe6060f1SDimitry Andric 
6235fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
6236fe6060f1SDimitry Andric     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6237*06c3fb27SDimitry Andric   const DebugLoc &DL = MI.getDebugLoc();
6238*06c3fb27SDimitry Andric   MachineBasicBlock &BB = B.getMBB();
6239*06c3fb27SDimitry Andric   MachineFunction *MF = BB.getParent();
6240*06c3fb27SDimitry Andric 
6241*06c3fb27SDimitry Andric   if (BB.succ_empty() && std::next(MI.getIterator()) == BB.end()) {
6242*06c3fb27SDimitry Andric     BuildMI(BB, BB.end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
6243*06c3fb27SDimitry Andric       .addImm(0);
6244*06c3fb27SDimitry Andric     MI.eraseFromParent();
6245*06c3fb27SDimitry Andric     return true;
6246*06c3fb27SDimitry Andric   }
6247*06c3fb27SDimitry Andric 
6248*06c3fb27SDimitry Andric   // We need a block split to make the real endpgm a terminator. We also don't
6249*06c3fb27SDimitry Andric   // want to break phis in successor blocks, so we can't just delete to the
6250*06c3fb27SDimitry Andric   // end of the block.
6251*06c3fb27SDimitry Andric   BB.splitAt(MI, false /*UpdateLiveIns*/);
6252*06c3fb27SDimitry Andric   MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
6253*06c3fb27SDimitry Andric   MF->push_back(TrapBB);
6254*06c3fb27SDimitry Andric   BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
6255*06c3fb27SDimitry Andric     .addImm(0);
6256*06c3fb27SDimitry Andric   BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
6257*06c3fb27SDimitry Andric     .addMBB(TrapBB);
6258*06c3fb27SDimitry Andric 
6259*06c3fb27SDimitry Andric   BB.addSuccessor(TrapBB);
6260fe6060f1SDimitry Andric   MI.eraseFromParent();
6261fe6060f1SDimitry Andric   return true;
6262fe6060f1SDimitry Andric }
6263fe6060f1SDimitry Andric 
6264fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
6265fe6060f1SDimitry Andric     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
626681ad6265SDimitry Andric   MachineFunction &MF = B.getMF();
626781ad6265SDimitry Andric   const LLT S64 = LLT::scalar(64);
626881ad6265SDimitry Andric 
626981ad6265SDimitry Andric   Register SGPR01(AMDGPU::SGPR0_SGPR1);
627081ad6265SDimitry Andric   // For code object version 5, queue_ptr is passed through implicit kernarg.
6271*06c3fb27SDimitry Andric   if (AMDGPU::getCodeObjectVersion(*MF.getFunction().getParent()) >=
6272*06c3fb27SDimitry Andric       AMDGPU::AMDHSA_COV5) {
627381ad6265SDimitry Andric     AMDGPUTargetLowering::ImplicitParameter Param =
627481ad6265SDimitry Andric         AMDGPUTargetLowering::QUEUE_PTR;
627581ad6265SDimitry Andric     uint64_t Offset =
627681ad6265SDimitry Andric         ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
627781ad6265SDimitry Andric 
627881ad6265SDimitry Andric     Register KernargPtrReg = MRI.createGenericVirtualRegister(
627981ad6265SDimitry Andric         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
628081ad6265SDimitry Andric 
628181ad6265SDimitry Andric     if (!loadInputValue(KernargPtrReg, B,
628281ad6265SDimitry Andric                         AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
628381ad6265SDimitry Andric       return false;
628481ad6265SDimitry Andric 
628581ad6265SDimitry Andric     // TODO: can we be smarter about machine pointer info?
628681ad6265SDimitry Andric     MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
628781ad6265SDimitry Andric     MachineMemOperand *MMO = MF.getMachineMemOperand(
628881ad6265SDimitry Andric         PtrInfo,
628981ad6265SDimitry Andric         MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
629081ad6265SDimitry Andric             MachineMemOperand::MOInvariant,
629181ad6265SDimitry Andric         LLT::scalar(64), commonAlignment(Align(64), Offset));
629281ad6265SDimitry Andric 
629381ad6265SDimitry Andric     // Pointer address
629481ad6265SDimitry Andric     Register LoadAddr = MRI.createGenericVirtualRegister(
629581ad6265SDimitry Andric         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
629681ad6265SDimitry Andric     B.buildPtrAdd(LoadAddr, KernargPtrReg,
629781ad6265SDimitry Andric                   B.buildConstant(LLT::scalar(64), Offset).getReg(0));
629881ad6265SDimitry Andric     // Load address
629981ad6265SDimitry Andric     Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
630081ad6265SDimitry Andric     B.buildCopy(SGPR01, Temp);
630181ad6265SDimitry Andric     B.buildInstr(AMDGPU::S_TRAP)
630281ad6265SDimitry Andric         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
630381ad6265SDimitry Andric         .addReg(SGPR01, RegState::Implicit);
630481ad6265SDimitry Andric     MI.eraseFromParent();
630581ad6265SDimitry Andric     return true;
630681ad6265SDimitry Andric   }
630781ad6265SDimitry Andric 
63085ffd83dbSDimitry Andric   // Pass queue pointer to trap handler as input, and insert trap instruction
63095ffd83dbSDimitry Andric   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
6310e8d8bef9SDimitry Andric   Register LiveIn =
6311e8d8bef9SDimitry Andric     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
6312e8d8bef9SDimitry Andric   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
63135ffd83dbSDimitry Andric     return false;
6314e8d8bef9SDimitry Andric 
63155ffd83dbSDimitry Andric   B.buildCopy(SGPR01, LiveIn);
63165ffd83dbSDimitry Andric   B.buildInstr(AMDGPU::S_TRAP)
6317fe6060f1SDimitry Andric       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
63185ffd83dbSDimitry Andric       .addReg(SGPR01, RegState::Implicit);
6319fe6060f1SDimitry Andric 
6320fe6060f1SDimitry Andric   MI.eraseFromParent();
6321fe6060f1SDimitry Andric   return true;
63225ffd83dbSDimitry Andric }
63235ffd83dbSDimitry Andric 
6324fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeTrapHsa(
6325fe6060f1SDimitry Andric     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6326fe6060f1SDimitry Andric   B.buildInstr(AMDGPU::S_TRAP)
6327fe6060f1SDimitry Andric       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
63285ffd83dbSDimitry Andric   MI.eraseFromParent();
63295ffd83dbSDimitry Andric   return true;
63305ffd83dbSDimitry Andric }
63315ffd83dbSDimitry Andric 
63325ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
63335ffd83dbSDimitry Andric     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6334349cc55cSDimitry Andric   // Is non-HSA path or trap-handler disabled? Then, report a warning
63355ffd83dbSDimitry Andric   // accordingly
6336fe6060f1SDimitry Andric   if (!ST.isTrapHandlerEnabled() ||
6337fe6060f1SDimitry Andric       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
63385ffd83dbSDimitry Andric     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
63395ffd83dbSDimitry Andric                                      "debugtrap handler not supported",
63405ffd83dbSDimitry Andric                                      MI.getDebugLoc(), DS_Warning);
63415ffd83dbSDimitry Andric     LLVMContext &Ctx = B.getMF().getFunction().getContext();
63425ffd83dbSDimitry Andric     Ctx.diagnose(NoTrap);
63435ffd83dbSDimitry Andric   } else {
63445ffd83dbSDimitry Andric     // Insert debug-trap instruction
6345fe6060f1SDimitry Andric     B.buildInstr(AMDGPU::S_TRAP)
6346fe6060f1SDimitry Andric         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
63475ffd83dbSDimitry Andric   }
63485ffd83dbSDimitry Andric 
63495ffd83dbSDimitry Andric   MI.eraseFromParent();
63505ffd83dbSDimitry Andric   return true;
63515ffd83dbSDimitry Andric }
63525ffd83dbSDimitry Andric 
6353e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
6354e8d8bef9SDimitry Andric                                                MachineIRBuilder &B) const {
6355e8d8bef9SDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
6356e8d8bef9SDimitry Andric   const LLT S16 = LLT::scalar(16);
6357e8d8bef9SDimitry Andric   const LLT S32 = LLT::scalar(32);
635881ad6265SDimitry Andric   const LLT V2S16 = LLT::fixed_vector(2, 16);
635981ad6265SDimitry Andric   const LLT V3S32 = LLT::fixed_vector(3, 32);
6360e8d8bef9SDimitry Andric 
6361e8d8bef9SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
6362e8d8bef9SDimitry Andric   Register NodePtr = MI.getOperand(2).getReg();
6363e8d8bef9SDimitry Andric   Register RayExtent = MI.getOperand(3).getReg();
6364e8d8bef9SDimitry Andric   Register RayOrigin = MI.getOperand(4).getReg();
6365e8d8bef9SDimitry Andric   Register RayDir = MI.getOperand(5).getReg();
6366e8d8bef9SDimitry Andric   Register RayInvDir = MI.getOperand(6).getReg();
6367e8d8bef9SDimitry Andric   Register TDescr = MI.getOperand(7).getReg();
6368e8d8bef9SDimitry Andric 
6369fe6060f1SDimitry Andric   if (!ST.hasGFX10_AEncoding()) {
6370fe6060f1SDimitry Andric     DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
6371fe6060f1SDimitry Andric                                         "intrinsic not supported on subtarget",
6372fe6060f1SDimitry Andric                                         MI.getDebugLoc());
6373fe6060f1SDimitry Andric     B.getMF().getFunction().getContext().diagnose(BadIntrin);
6374fe6060f1SDimitry Andric     return false;
6375fe6060f1SDimitry Andric   }
6376fe6060f1SDimitry Andric 
637781ad6265SDimitry Andric   const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
6378349cc55cSDimitry Andric   const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
6379349cc55cSDimitry Andric   const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
6380349cc55cSDimitry Andric   const unsigned NumVDataDwords = 4;
6381349cc55cSDimitry Andric   const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
638281ad6265SDimitry Andric   const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
638381ad6265SDimitry Andric   const bool UseNSA = ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize();
6384349cc55cSDimitry Andric   const unsigned BaseOpcodes[2][2] = {
6385349cc55cSDimitry Andric       {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
6386349cc55cSDimitry Andric       {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
6387349cc55cSDimitry Andric        AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
6388349cc55cSDimitry Andric   int Opcode;
6389349cc55cSDimitry Andric   if (UseNSA) {
639081ad6265SDimitry Andric     Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
639181ad6265SDimitry Andric                                    IsGFX11Plus ? AMDGPU::MIMGEncGfx11NSA
639281ad6265SDimitry Andric                                                : AMDGPU::MIMGEncGfx10NSA,
6393349cc55cSDimitry Andric                                    NumVDataDwords, NumVAddrDwords);
6394349cc55cSDimitry Andric   } else {
639581ad6265SDimitry Andric     Opcode = AMDGPU::getMIMGOpcode(
639681ad6265SDimitry Andric         BaseOpcodes[Is64][IsA16],
639781ad6265SDimitry Andric         IsGFX11Plus ? AMDGPU::MIMGEncGfx11Default : AMDGPU::MIMGEncGfx10Default,
6398bdd1243dSDimitry Andric         NumVDataDwords, NumVAddrDwords);
6399349cc55cSDimitry Andric   }
6400349cc55cSDimitry Andric   assert(Opcode != -1);
6401e8d8bef9SDimitry Andric 
6402e8d8bef9SDimitry Andric   SmallVector<Register, 12> Ops;
640381ad6265SDimitry Andric   if (UseNSA && IsGFX11Plus) {
640481ad6265SDimitry Andric     auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
640581ad6265SDimitry Andric       auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
6406bdd1243dSDimitry Andric       auto Merged = B.buildMergeLikeInstr(
640781ad6265SDimitry Andric           V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
640881ad6265SDimitry Andric       Ops.push_back(Merged.getReg(0));
640981ad6265SDimitry Andric     };
641081ad6265SDimitry Andric 
641181ad6265SDimitry Andric     Ops.push_back(NodePtr);
641281ad6265SDimitry Andric     Ops.push_back(RayExtent);
641381ad6265SDimitry Andric     packLanes(RayOrigin);
641481ad6265SDimitry Andric 
641581ad6265SDimitry Andric     if (IsA16) {
641681ad6265SDimitry Andric       auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
641781ad6265SDimitry Andric       auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
6418bdd1243dSDimitry Andric       auto MergedDir = B.buildMergeLikeInstr(
641981ad6265SDimitry Andric           V3S32,
6420bdd1243dSDimitry Andric           {B.buildBitcast(
6421bdd1243dSDimitry Andric                 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
642281ad6265SDimitry Andric                                                    UnmergeRayDir.getReg(0)}))
642381ad6265SDimitry Andric                .getReg(0),
6424bdd1243dSDimitry Andric            B.buildBitcast(
6425bdd1243dSDimitry Andric                 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
642681ad6265SDimitry Andric                                                    UnmergeRayDir.getReg(1)}))
642781ad6265SDimitry Andric                .getReg(0),
6428bdd1243dSDimitry Andric            B.buildBitcast(
6429bdd1243dSDimitry Andric                 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
643081ad6265SDimitry Andric                                                    UnmergeRayDir.getReg(2)}))
643181ad6265SDimitry Andric                .getReg(0)});
643281ad6265SDimitry Andric       Ops.push_back(MergedDir.getReg(0));
643381ad6265SDimitry Andric     } else {
643481ad6265SDimitry Andric       packLanes(RayDir);
643581ad6265SDimitry Andric       packLanes(RayInvDir);
643681ad6265SDimitry Andric     }
643781ad6265SDimitry Andric   } else {
6438e8d8bef9SDimitry Andric     if (Is64) {
6439e8d8bef9SDimitry Andric       auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
6440e8d8bef9SDimitry Andric       Ops.push_back(Unmerge.getReg(0));
6441e8d8bef9SDimitry Andric       Ops.push_back(Unmerge.getReg(1));
6442e8d8bef9SDimitry Andric     } else {
6443e8d8bef9SDimitry Andric       Ops.push_back(NodePtr);
6444e8d8bef9SDimitry Andric     }
6445e8d8bef9SDimitry Andric     Ops.push_back(RayExtent);
6446e8d8bef9SDimitry Andric 
6447e8d8bef9SDimitry Andric     auto packLanes = [&Ops, &S32, &B](Register Src) {
64480eae32dcSDimitry Andric       auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
6449e8d8bef9SDimitry Andric       Ops.push_back(Unmerge.getReg(0));
6450e8d8bef9SDimitry Andric       Ops.push_back(Unmerge.getReg(1));
6451e8d8bef9SDimitry Andric       Ops.push_back(Unmerge.getReg(2));
6452e8d8bef9SDimitry Andric     };
6453e8d8bef9SDimitry Andric 
6454e8d8bef9SDimitry Andric     packLanes(RayOrigin);
6455e8d8bef9SDimitry Andric     if (IsA16) {
64560eae32dcSDimitry Andric       auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
64570eae32dcSDimitry Andric       auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
6458e8d8bef9SDimitry Andric       Register R1 = MRI.createGenericVirtualRegister(S32);
6459e8d8bef9SDimitry Andric       Register R2 = MRI.createGenericVirtualRegister(S32);
6460e8d8bef9SDimitry Andric       Register R3 = MRI.createGenericVirtualRegister(S32);
6461bdd1243dSDimitry Andric       B.buildMergeLikeInstr(R1,
6462bdd1243dSDimitry Andric                             {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
6463bdd1243dSDimitry Andric       B.buildMergeLikeInstr(
6464bdd1243dSDimitry Andric           R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
6465bdd1243dSDimitry Andric       B.buildMergeLikeInstr(
6466bdd1243dSDimitry Andric           R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
6467e8d8bef9SDimitry Andric       Ops.push_back(R1);
6468e8d8bef9SDimitry Andric       Ops.push_back(R2);
6469e8d8bef9SDimitry Andric       Ops.push_back(R3);
6470e8d8bef9SDimitry Andric     } else {
6471e8d8bef9SDimitry Andric       packLanes(RayDir);
6472e8d8bef9SDimitry Andric       packLanes(RayInvDir);
6473e8d8bef9SDimitry Andric     }
647481ad6265SDimitry Andric   }
6475e8d8bef9SDimitry Andric 
6476349cc55cSDimitry Andric   if (!UseNSA) {
6477349cc55cSDimitry Andric     // Build a single vector containing all the operands so far prepared.
6478349cc55cSDimitry Andric     LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
6479bdd1243dSDimitry Andric     Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
6480349cc55cSDimitry Andric     Ops.clear();
6481349cc55cSDimitry Andric     Ops.push_back(MergedOps);
6482349cc55cSDimitry Andric   }
6483349cc55cSDimitry Andric 
6484e8d8bef9SDimitry Andric   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
6485e8d8bef9SDimitry Andric     .addDef(DstReg)
6486e8d8bef9SDimitry Andric     .addImm(Opcode);
6487e8d8bef9SDimitry Andric 
6488e8d8bef9SDimitry Andric   for (Register R : Ops) {
6489e8d8bef9SDimitry Andric     MIB.addUse(R);
6490e8d8bef9SDimitry Andric   }
6491e8d8bef9SDimitry Andric 
6492e8d8bef9SDimitry Andric   MIB.addUse(TDescr)
6493e8d8bef9SDimitry Andric      .addImm(IsA16 ? 1 : 0)
6494e8d8bef9SDimitry Andric      .cloneMemRefs(MI);
6495e8d8bef9SDimitry Andric 
6496e8d8bef9SDimitry Andric   MI.eraseFromParent();
6497e8d8bef9SDimitry Andric   return true;
6498e8d8bef9SDimitry Andric }
6499e8d8bef9SDimitry Andric 
650081ad6265SDimitry Andric bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
650181ad6265SDimitry Andric                                                MachineIRBuilder &B) const {
650281ad6265SDimitry Andric   unsigned Opc;
650381ad6265SDimitry Andric   int RoundMode = MI.getOperand(2).getImm();
650481ad6265SDimitry Andric 
650581ad6265SDimitry Andric   if (RoundMode == (int)RoundingMode::TowardPositive)
650681ad6265SDimitry Andric     Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
650781ad6265SDimitry Andric   else if (RoundMode == (int)RoundingMode::TowardNegative)
650881ad6265SDimitry Andric     Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
650981ad6265SDimitry Andric   else
651081ad6265SDimitry Andric     return false;
651181ad6265SDimitry Andric 
651281ad6265SDimitry Andric   B.buildInstr(Opc)
651381ad6265SDimitry Andric       .addDef(MI.getOperand(0).getReg())
651481ad6265SDimitry Andric       .addUse(MI.getOperand(1).getReg());
651581ad6265SDimitry Andric 
651604eeddc0SDimitry Andric   MI.eraseFromParent();
651781ad6265SDimitry Andric 
651804eeddc0SDimitry Andric   return true;
651904eeddc0SDimitry Andric }
652004eeddc0SDimitry Andric 
65215ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
65225ffd83dbSDimitry Andric                                             MachineInstr &MI) const {
65235ffd83dbSDimitry Andric   MachineIRBuilder &B = Helper.MIRBuilder;
65245ffd83dbSDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
65255ffd83dbSDimitry Andric 
65260b57cec5SDimitry Andric   // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
6527480093f4SDimitry Andric   auto IntrID = MI.getIntrinsicID();
6528480093f4SDimitry Andric   switch (IntrID) {
6529480093f4SDimitry Andric   case Intrinsic::amdgcn_if:
6530480093f4SDimitry Andric   case Intrinsic::amdgcn_else: {
6531480093f4SDimitry Andric     MachineInstr *Br = nullptr;
65325ffd83dbSDimitry Andric     MachineBasicBlock *UncondBrTarget = nullptr;
6533e8d8bef9SDimitry Andric     bool Negated = false;
6534e8d8bef9SDimitry Andric     if (MachineInstr *BrCond =
6535e8d8bef9SDimitry Andric             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
65360b57cec5SDimitry Andric       const SIRegisterInfo *TRI
65370b57cec5SDimitry Andric         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
65380b57cec5SDimitry Andric 
65390b57cec5SDimitry Andric       Register Def = MI.getOperand(1).getReg();
65400b57cec5SDimitry Andric       Register Use = MI.getOperand(3).getReg();
6541480093f4SDimitry Andric 
65425ffd83dbSDimitry Andric       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
6543e8d8bef9SDimitry Andric 
6544e8d8bef9SDimitry Andric       if (Negated)
6545e8d8bef9SDimitry Andric         std::swap(CondBrTarget, UncondBrTarget);
6546e8d8bef9SDimitry Andric 
65475ffd83dbSDimitry Andric       B.setInsertPt(B.getMBB(), BrCond->getIterator());
6548480093f4SDimitry Andric       if (IntrID == Intrinsic::amdgcn_if) {
65490b57cec5SDimitry Andric         B.buildInstr(AMDGPU::SI_IF)
65500b57cec5SDimitry Andric           .addDef(Def)
65510b57cec5SDimitry Andric           .addUse(Use)
65525ffd83dbSDimitry Andric           .addMBB(UncondBrTarget);
6553480093f4SDimitry Andric       } else {
6554480093f4SDimitry Andric         B.buildInstr(AMDGPU::SI_ELSE)
6555480093f4SDimitry Andric             .addDef(Def)
6556480093f4SDimitry Andric             .addUse(Use)
6557e8d8bef9SDimitry Andric             .addMBB(UncondBrTarget);
6558480093f4SDimitry Andric       }
6559480093f4SDimitry Andric 
65605ffd83dbSDimitry Andric       if (Br) {
65615ffd83dbSDimitry Andric         Br->getOperand(0).setMBB(CondBrTarget);
65625ffd83dbSDimitry Andric       } else {
65635ffd83dbSDimitry Andric         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
65645ffd83dbSDimitry Andric         // since we're swapping branch targets it needs to be reinserted.
65655ffd83dbSDimitry Andric         // FIXME: IRTranslator should probably not do this
65665ffd83dbSDimitry Andric         B.buildBr(*CondBrTarget);
65675ffd83dbSDimitry Andric       }
65680b57cec5SDimitry Andric 
65690b57cec5SDimitry Andric       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
65700b57cec5SDimitry Andric       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
65710b57cec5SDimitry Andric       MI.eraseFromParent();
65720b57cec5SDimitry Andric       BrCond->eraseFromParent();
65730b57cec5SDimitry Andric       return true;
65740b57cec5SDimitry Andric     }
65750b57cec5SDimitry Andric 
65760b57cec5SDimitry Andric     return false;
65770b57cec5SDimitry Andric   }
65780b57cec5SDimitry Andric   case Intrinsic::amdgcn_loop: {
6579480093f4SDimitry Andric     MachineInstr *Br = nullptr;
65805ffd83dbSDimitry Andric     MachineBasicBlock *UncondBrTarget = nullptr;
6581e8d8bef9SDimitry Andric     bool Negated = false;
6582e8d8bef9SDimitry Andric     if (MachineInstr *BrCond =
6583e8d8bef9SDimitry Andric             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
65840b57cec5SDimitry Andric       const SIRegisterInfo *TRI
65850b57cec5SDimitry Andric         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
65860b57cec5SDimitry Andric 
65875ffd83dbSDimitry Andric       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
65880b57cec5SDimitry Andric       Register Reg = MI.getOperand(2).getReg();
65895ffd83dbSDimitry Andric 
6590e8d8bef9SDimitry Andric       if (Negated)
6591e8d8bef9SDimitry Andric         std::swap(CondBrTarget, UncondBrTarget);
6592e8d8bef9SDimitry Andric 
65935ffd83dbSDimitry Andric       B.setInsertPt(B.getMBB(), BrCond->getIterator());
65940b57cec5SDimitry Andric       B.buildInstr(AMDGPU::SI_LOOP)
65950b57cec5SDimitry Andric         .addUse(Reg)
65965ffd83dbSDimitry Andric         .addMBB(UncondBrTarget);
65975ffd83dbSDimitry Andric 
65985ffd83dbSDimitry Andric       if (Br)
65995ffd83dbSDimitry Andric         Br->getOperand(0).setMBB(CondBrTarget);
66005ffd83dbSDimitry Andric       else
66015ffd83dbSDimitry Andric         B.buildBr(*CondBrTarget);
66025ffd83dbSDimitry Andric 
66030b57cec5SDimitry Andric       MI.eraseFromParent();
66040b57cec5SDimitry Andric       BrCond->eraseFromParent();
66050b57cec5SDimitry Andric       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
66060b57cec5SDimitry Andric       return true;
66070b57cec5SDimitry Andric     }
66080b57cec5SDimitry Andric 
66090b57cec5SDimitry Andric     return false;
66100b57cec5SDimitry Andric   }
6611*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_make_buffer_rsrc:
6612*06c3fb27SDimitry Andric     return legalizePointerAsRsrcIntrin(MI, MRI, B);
66130b57cec5SDimitry Andric   case Intrinsic::amdgcn_kernarg_segment_ptr:
66145ffd83dbSDimitry Andric     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
66155ffd83dbSDimitry Andric       // This only makes sense to call in a kernel, so just lower to null.
66165ffd83dbSDimitry Andric       B.buildConstant(MI.getOperand(0).getReg(), 0);
66175ffd83dbSDimitry Andric       MI.eraseFromParent();
66185ffd83dbSDimitry Andric       return true;
66195ffd83dbSDimitry Andric     }
66205ffd83dbSDimitry Andric 
66210b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(
66220b57cec5SDimitry Andric       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
66230b57cec5SDimitry Andric   case Intrinsic::amdgcn_implicitarg_ptr:
66240b57cec5SDimitry Andric     return legalizeImplicitArgPtr(MI, MRI, B);
66250b57cec5SDimitry Andric   case Intrinsic::amdgcn_workitem_id_x:
662681ad6265SDimitry Andric     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
66270b57cec5SDimitry Andric                                        AMDGPUFunctionArgInfo::WORKITEM_ID_X);
66280b57cec5SDimitry Andric   case Intrinsic::amdgcn_workitem_id_y:
662981ad6265SDimitry Andric     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
66300b57cec5SDimitry Andric                                        AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
66310b57cec5SDimitry Andric   case Intrinsic::amdgcn_workitem_id_z:
663281ad6265SDimitry Andric     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
66330b57cec5SDimitry Andric                                        AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
66340b57cec5SDimitry Andric   case Intrinsic::amdgcn_workgroup_id_x:
66350b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
66360b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
66370b57cec5SDimitry Andric   case Intrinsic::amdgcn_workgroup_id_y:
66380b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
66390b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
66400b57cec5SDimitry Andric   case Intrinsic::amdgcn_workgroup_id_z:
66410b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
66420b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
6643fcaf7f86SDimitry Andric   case Intrinsic::amdgcn_lds_kernel_id:
6644fcaf7f86SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
6645fcaf7f86SDimitry Andric                                       AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
66460b57cec5SDimitry Andric   case Intrinsic::amdgcn_dispatch_ptr:
66470b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
66480b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
66490b57cec5SDimitry Andric   case Intrinsic::amdgcn_queue_ptr:
66500b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
66510b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
66520b57cec5SDimitry Andric   case Intrinsic::amdgcn_implicit_buffer_ptr:
66530b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(
66540b57cec5SDimitry Andric       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
66550b57cec5SDimitry Andric   case Intrinsic::amdgcn_dispatch_id:
66560b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
66570b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
665881ad6265SDimitry Andric   case Intrinsic::r600_read_ngroups_x:
665981ad6265SDimitry Andric     // TODO: Emit error for hsa
666081ad6265SDimitry Andric     return legalizeKernargMemParameter(MI, B,
666181ad6265SDimitry Andric                                        SI::KernelInputOffsets::NGROUPS_X);
666281ad6265SDimitry Andric   case Intrinsic::r600_read_ngroups_y:
666381ad6265SDimitry Andric     return legalizeKernargMemParameter(MI, B,
666481ad6265SDimitry Andric                                        SI::KernelInputOffsets::NGROUPS_Y);
666581ad6265SDimitry Andric   case Intrinsic::r600_read_ngroups_z:
666681ad6265SDimitry Andric     return legalizeKernargMemParameter(MI, B,
666781ad6265SDimitry Andric                                        SI::KernelInputOffsets::NGROUPS_Z);
666881ad6265SDimitry Andric   case Intrinsic::r600_read_local_size_x:
666981ad6265SDimitry Andric     // TODO: Could insert G_ASSERT_ZEXT from s16
667081ad6265SDimitry Andric     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X);
667181ad6265SDimitry Andric   case Intrinsic::r600_read_local_size_y:
667281ad6265SDimitry Andric     // TODO: Could insert G_ASSERT_ZEXT from s16
667381ad6265SDimitry Andric     return legalizeKernargMemParameter(MI, B,  SI::KernelInputOffsets::LOCAL_SIZE_Y);
667481ad6265SDimitry Andric     // TODO: Could insert G_ASSERT_ZEXT from s16
667581ad6265SDimitry Andric   case Intrinsic::r600_read_local_size_z:
667681ad6265SDimitry Andric     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z);
667781ad6265SDimitry Andric   case Intrinsic::r600_read_global_size_x:
667881ad6265SDimitry Andric     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X);
667981ad6265SDimitry Andric   case Intrinsic::r600_read_global_size_y:
668081ad6265SDimitry Andric     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y);
668181ad6265SDimitry Andric   case Intrinsic::r600_read_global_size_z:
668281ad6265SDimitry Andric     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z);
66838bcb0991SDimitry Andric   case Intrinsic::amdgcn_fdiv_fast:
66848bcb0991SDimitry Andric     return legalizeFDIVFastIntrin(MI, MRI, B);
66858bcb0991SDimitry Andric   case Intrinsic::amdgcn_is_shared:
66868bcb0991SDimitry Andric     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
66878bcb0991SDimitry Andric   case Intrinsic::amdgcn_is_private:
66888bcb0991SDimitry Andric     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
66898bcb0991SDimitry Andric   case Intrinsic::amdgcn_wavefrontsize: {
66908bcb0991SDimitry Andric     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
66918bcb0991SDimitry Andric     MI.eraseFromParent();
66928bcb0991SDimitry Andric     return true;
66938bcb0991SDimitry Andric   }
66945ffd83dbSDimitry Andric   case Intrinsic::amdgcn_s_buffer_load:
6695e8d8bef9SDimitry Andric     return legalizeSBufferLoad(Helper, MI);
66968bcb0991SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_store:
6697*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_store:
66985ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_store:
6699*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_store:
67005ffd83dbSDimitry Andric     return legalizeBufferStore(MI, MRI, B, false, false);
67018bcb0991SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_store_format:
6702*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
67035ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_store_format:
6704*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
67055ffd83dbSDimitry Andric     return legalizeBufferStore(MI, MRI, B, false, true);
67065ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_tbuffer_store:
6707*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
67085ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_tbuffer_store:
6709*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
67105ffd83dbSDimitry Andric     return legalizeBufferStore(MI, MRI, B, true, true);
67115ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_load:
6712*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_load:
67135ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_load:
6714*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_load:
67155ffd83dbSDimitry Andric     return legalizeBufferLoad(MI, MRI, B, false, false);
67165ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_load_format:
6717*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
67185ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_load_format:
6719*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
67205ffd83dbSDimitry Andric     return legalizeBufferLoad(MI, MRI, B, true, false);
67215ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_tbuffer_load:
6722*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
67235ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_tbuffer_load:
6724*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
67255ffd83dbSDimitry Andric     return legalizeBufferLoad(MI, MRI, B, true, true);
67265ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
6727*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
67285ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
6729*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
67305ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_add:
6731*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
67325ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_add:
6733*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
67345ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
6735*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
67365ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
6737*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
67385ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
6739*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
67405ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
6741*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
67425ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
6743*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
67445ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
6745*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
67465ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
6747*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
67485ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
6749*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
67505ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
6751*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
67525ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
6753*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
67545ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_and:
6755*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
67565ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_and:
6757*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
67585ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_or:
6759*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
67605ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_or:
6761*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
67625ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
6763*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
67645ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
6765*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
67665ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
6767*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
67685ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
6769*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
67705ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
6771*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
67725ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
6773*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
67745ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
6775*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
67765ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
6777*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
6778fe6060f1SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
6779*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
6780fe6060f1SDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
6781*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
6782fe6060f1SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
6783*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
6784fe6060f1SDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
6785*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
678604eeddc0SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
6787*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
6788bdd1243dSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
6789*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
679004eeddc0SDimitry Andric     return legalizeBufferAtomic(MI, B, IntrID);
67915ffd83dbSDimitry Andric   case Intrinsic::trap:
67925ffd83dbSDimitry Andric     return legalizeTrapIntrinsic(MI, MRI, B);
67935ffd83dbSDimitry Andric   case Intrinsic::debugtrap:
67945ffd83dbSDimitry Andric     return legalizeDebugTrapIntrinsic(MI, MRI, B);
6795e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_rsq_clamp:
6796e8d8bef9SDimitry Andric     return legalizeRsqClampIntrinsic(MI, MRI, B);
6797e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_ds_fadd:
6798e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_ds_fmin:
6799e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_ds_fmax:
6800e8d8bef9SDimitry Andric     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
6801e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_image_bvh_intersect_ray:
6802e8d8bef9SDimitry Andric     return legalizeBVHIntrinsic(MI, B);
6803*06c3fb27SDimitry Andric   case Intrinsic::amdgcn_fmed3: {
6804*06c3fb27SDimitry Andric     GISelChangeObserver &Observer = Helper.Observer;
6805*06c3fb27SDimitry Andric 
6806*06c3fb27SDimitry Andric     // FIXME: This is to workaround the inability of tablegen match combiners to
6807*06c3fb27SDimitry Andric     // match intrinsics in patterns.
6808*06c3fb27SDimitry Andric     Observer.changingInstr(MI);
6809*06c3fb27SDimitry Andric     MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
6810*06c3fb27SDimitry Andric     MI.removeOperand(1);
6811*06c3fb27SDimitry Andric     Observer.changedInstr(MI);
6812*06c3fb27SDimitry Andric     return true;
6813*06c3fb27SDimitry Andric   }
68145ffd83dbSDimitry Andric   default: {
68155ffd83dbSDimitry Andric     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
68165ffd83dbSDimitry Andric             AMDGPU::getImageDimIntrinsicInfo(IntrID))
68175ffd83dbSDimitry Andric       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
68180b57cec5SDimitry Andric     return true;
68190b57cec5SDimitry Andric   }
68205ffd83dbSDimitry Andric   }
68210b57cec5SDimitry Andric 
68220b57cec5SDimitry Andric   return true;
68230b57cec5SDimitry Andric }
6824