xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (revision 1db9f3b21e39176dd5b67cf8ac378633b172463e)
10b57cec5SDimitry Andric //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
20b57cec5SDimitry Andric //
30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
60b57cec5SDimitry Andric //
70b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
80b57cec5SDimitry Andric /// \file
90b57cec5SDimitry Andric /// This file implements the targeting of the Machinelegalizer class for
100b57cec5SDimitry Andric /// AMDGPU.
110b57cec5SDimitry Andric /// \todo This should be generated by TableGen.
120b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
130b57cec5SDimitry Andric 
145ffd83dbSDimitry Andric #include "AMDGPULegalizerInfo.h"
158bcb0991SDimitry Andric 
160b57cec5SDimitry Andric #include "AMDGPU.h"
175ffd83dbSDimitry Andric #include "AMDGPUGlobalISelUtils.h"
18e8d8bef9SDimitry Andric #include "AMDGPUInstrInfo.h"
190b57cec5SDimitry Andric #include "AMDGPUTargetMachine.h"
205f757f3fSDimitry Andric #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
215f757f3fSDimitry Andric #include "SIInstrInfo.h"
220b57cec5SDimitry Andric #include "SIMachineFunctionInfo.h"
235f757f3fSDimitry Andric #include "SIRegisterInfo.h"
24fe6060f1SDimitry Andric #include "Utils/AMDGPUBaseInfo.h"
255ffd83dbSDimitry Andric #include "llvm/ADT/ScopeExit.h"
26fe6060f1SDimitry Andric #include "llvm/BinaryFormat/ELF.h"
275f757f3fSDimitry Andric #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
280b57cec5SDimitry Andric #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
295ffd83dbSDimitry Andric #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
30e8d8bef9SDimitry Andric #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
3106c3fb27SDimitry Andric #include "llvm/CodeGen/GlobalISel/Utils.h"
325f757f3fSDimitry Andric #include "llvm/CodeGen/TargetOpcodes.h"
338bcb0991SDimitry Andric #include "llvm/IR/DiagnosticInfo.h"
34e8d8bef9SDimitry Andric #include "llvm/IR/IntrinsicsAMDGPU.h"
3581ad6265SDimitry Andric #include "llvm/IR/IntrinsicsR600.h"
360b57cec5SDimitry Andric 
370b57cec5SDimitry Andric #define DEBUG_TYPE "amdgpu-legalinfo"
380b57cec5SDimitry Andric 
390b57cec5SDimitry Andric using namespace llvm;
400b57cec5SDimitry Andric using namespace LegalizeActions;
410b57cec5SDimitry Andric using namespace LegalizeMutations;
420b57cec5SDimitry Andric using namespace LegalityPredicates;
435ffd83dbSDimitry Andric using namespace MIPatternMatch;
440b57cec5SDimitry Andric 
455ffd83dbSDimitry Andric // Hack until load/store selection patterns support any tuple of legal types.
465ffd83dbSDimitry Andric static cl::opt<bool> EnableNewLegality(
475ffd83dbSDimitry Andric   "amdgpu-global-isel-new-legality",
485ffd83dbSDimitry Andric   cl::desc("Use GlobalISel desired legality, rather than try to use"
495ffd83dbSDimitry Andric            "rules compatible with selection patterns"),
505ffd83dbSDimitry Andric   cl::init(false),
515ffd83dbSDimitry Andric   cl::ReallyHidden);
520b57cec5SDimitry Andric 
535ffd83dbSDimitry Andric static constexpr unsigned MaxRegisterSize = 1024;
545ffd83dbSDimitry Andric 
555ffd83dbSDimitry Andric // Round the number of elements to the next power of two elements
565ffd83dbSDimitry Andric static LLT getPow2VectorType(LLT Ty) {
575ffd83dbSDimitry Andric   unsigned NElts = Ty.getNumElements();
585ffd83dbSDimitry Andric   unsigned Pow2NElts = 1 <<  Log2_32_Ceil(NElts);
59fe6060f1SDimitry Andric   return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
600b57cec5SDimitry Andric }
610b57cec5SDimitry Andric 
625ffd83dbSDimitry Andric // Round the number of bits to the next power of two bits
635ffd83dbSDimitry Andric static LLT getPow2ScalarType(LLT Ty) {
645ffd83dbSDimitry Andric   unsigned Bits = Ty.getSizeInBits();
655ffd83dbSDimitry Andric   unsigned Pow2Bits = 1 <<  Log2_32_Ceil(Bits);
665ffd83dbSDimitry Andric   return LLT::scalar(Pow2Bits);
678bcb0991SDimitry Andric }
688bcb0991SDimitry Andric 
69349cc55cSDimitry Andric /// \returns true if this is an odd sized vector which should widen by adding an
70e8d8bef9SDimitry Andric /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
71e8d8bef9SDimitry Andric /// excludes s1 vectors, which should always be scalarized.
720b57cec5SDimitry Andric static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
730b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
740b57cec5SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
75e8d8bef9SDimitry Andric     if (!Ty.isVector())
76e8d8bef9SDimitry Andric       return false;
77e8d8bef9SDimitry Andric 
78e8d8bef9SDimitry Andric     const LLT EltTy = Ty.getElementType();
79e8d8bef9SDimitry Andric     const unsigned EltSize = EltTy.getSizeInBits();
80e8d8bef9SDimitry Andric     return Ty.getNumElements() % 2 != 0 &&
81e8d8bef9SDimitry Andric            EltSize > 1 && EltSize < 32 &&
828bcb0991SDimitry Andric            Ty.getSizeInBits() % 32 != 0;
838bcb0991SDimitry Andric   };
848bcb0991SDimitry Andric }
858bcb0991SDimitry Andric 
86e8d8bef9SDimitry Andric static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
87e8d8bef9SDimitry Andric   return [=](const LegalityQuery &Query) {
88e8d8bef9SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
89e8d8bef9SDimitry Andric     return Ty.getSizeInBits() % 32 == 0;
90e8d8bef9SDimitry Andric   };
91e8d8bef9SDimitry Andric }
92e8d8bef9SDimitry Andric 
938bcb0991SDimitry Andric static LegalityPredicate isWideVec16(unsigned TypeIdx) {
948bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
958bcb0991SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
968bcb0991SDimitry Andric     const LLT EltTy = Ty.getScalarType();
978bcb0991SDimitry Andric     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
980b57cec5SDimitry Andric   };
990b57cec5SDimitry Andric }
1000b57cec5SDimitry Andric 
1010b57cec5SDimitry Andric static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
1020b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
1030b57cec5SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
1040b57cec5SDimitry Andric     const LLT EltTy = Ty.getElementType();
105bdd1243dSDimitry Andric     return std::pair(TypeIdx,
106fe6060f1SDimitry Andric                      LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
1070b57cec5SDimitry Andric   };
1080b57cec5SDimitry Andric }
1090b57cec5SDimitry Andric 
1100b57cec5SDimitry Andric static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
1110b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
1120b57cec5SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
1130b57cec5SDimitry Andric     const LLT EltTy = Ty.getElementType();
1140b57cec5SDimitry Andric     unsigned Size = Ty.getSizeInBits();
1150b57cec5SDimitry Andric     unsigned Pieces = (Size + 63) / 64;
1160b57cec5SDimitry Andric     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
117bdd1243dSDimitry Andric     return std::pair(TypeIdx, LLT::scalarOrVector(
118bdd1243dSDimitry Andric                                   ElementCount::getFixed(NewNumElts), EltTy));
1190b57cec5SDimitry Andric   };
1200b57cec5SDimitry Andric }
1210b57cec5SDimitry Andric 
1228bcb0991SDimitry Andric // Increase the number of vector elements to reach the next multiple of 32-bit
1238bcb0991SDimitry Andric // type.
1248bcb0991SDimitry Andric static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
1258bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
1268bcb0991SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
1278bcb0991SDimitry Andric 
1288bcb0991SDimitry Andric     const LLT EltTy = Ty.getElementType();
1298bcb0991SDimitry Andric     const int Size = Ty.getSizeInBits();
1308bcb0991SDimitry Andric     const int EltSize = EltTy.getSizeInBits();
1318bcb0991SDimitry Andric     const int NextMul32 = (Size + 31) / 32;
1328bcb0991SDimitry Andric 
1338bcb0991SDimitry Andric     assert(EltSize < 32);
1348bcb0991SDimitry Andric 
1358bcb0991SDimitry Andric     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
136bdd1243dSDimitry Andric     return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
1378bcb0991SDimitry Andric   };
1388bcb0991SDimitry Andric }
1398bcb0991SDimitry Andric 
14006c3fb27SDimitry Andric // Increase the number of vector elements to reach the next legal RegClass.
14106c3fb27SDimitry Andric static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
14206c3fb27SDimitry Andric   return [=](const LegalityQuery &Query) {
14306c3fb27SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
14406c3fb27SDimitry Andric     const unsigned NumElts = Ty.getNumElements();
14506c3fb27SDimitry Andric     const unsigned EltSize = Ty.getElementType().getSizeInBits();
14606c3fb27SDimitry Andric     const unsigned MaxNumElts = MaxRegisterSize / EltSize;
14706c3fb27SDimitry Andric 
14806c3fb27SDimitry Andric     assert(EltSize == 32 || EltSize == 64);
14906c3fb27SDimitry Andric     assert(Ty.getSizeInBits() < MaxRegisterSize);
15006c3fb27SDimitry Andric 
15106c3fb27SDimitry Andric     unsigned NewNumElts;
15206c3fb27SDimitry Andric     // Find the nearest legal RegClass that is larger than the current type.
15306c3fb27SDimitry Andric     for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
15406c3fb27SDimitry Andric       if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize))
15506c3fb27SDimitry Andric         break;
15606c3fb27SDimitry Andric     }
15706c3fb27SDimitry Andric 
15806c3fb27SDimitry Andric     return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltSize));
15906c3fb27SDimitry Andric   };
16006c3fb27SDimitry Andric }
16106c3fb27SDimitry Andric 
16206c3fb27SDimitry Andric static LLT getBufferRsrcScalarType(const LLT Ty) {
16306c3fb27SDimitry Andric   if (!Ty.isVector())
16406c3fb27SDimitry Andric     return LLT::scalar(128);
16506c3fb27SDimitry Andric   const ElementCount NumElems = Ty.getElementCount();
16606c3fb27SDimitry Andric   return LLT::vector(NumElems, LLT::scalar(128));
16706c3fb27SDimitry Andric }
16806c3fb27SDimitry Andric 
16906c3fb27SDimitry Andric static LLT getBufferRsrcRegisterType(const LLT Ty) {
17006c3fb27SDimitry Andric   if (!Ty.isVector())
17106c3fb27SDimitry Andric     return LLT::fixed_vector(4, LLT::scalar(32));
17206c3fb27SDimitry Andric   const unsigned NumElems = Ty.getElementCount().getFixedValue();
17306c3fb27SDimitry Andric   return LLT::fixed_vector(NumElems * 4, LLT::scalar(32));
17406c3fb27SDimitry Andric }
17506c3fb27SDimitry Andric 
176e8d8bef9SDimitry Andric static LLT getBitcastRegisterType(const LLT Ty) {
177e8d8bef9SDimitry Andric   const unsigned Size = Ty.getSizeInBits();
1785ffd83dbSDimitry Andric 
1795ffd83dbSDimitry Andric   if (Size <= 32) {
1805ffd83dbSDimitry Andric     // <2 x s8> -> s16
1815ffd83dbSDimitry Andric     // <4 x s8> -> s32
182e8d8bef9SDimitry Andric     return LLT::scalar(Size);
183e8d8bef9SDimitry Andric   }
1845ffd83dbSDimitry Andric 
185fe6060f1SDimitry Andric   return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
186e8d8bef9SDimitry Andric }
187e8d8bef9SDimitry Andric 
188e8d8bef9SDimitry Andric static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
189e8d8bef9SDimitry Andric   return [=](const LegalityQuery &Query) {
190e8d8bef9SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
191bdd1243dSDimitry Andric     return std::pair(TypeIdx, getBitcastRegisterType(Ty));
192e8d8bef9SDimitry Andric   };
193e8d8bef9SDimitry Andric }
194e8d8bef9SDimitry Andric 
195e8d8bef9SDimitry Andric static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
196e8d8bef9SDimitry Andric   return [=](const LegalityQuery &Query) {
197e8d8bef9SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
198e8d8bef9SDimitry Andric     unsigned Size = Ty.getSizeInBits();
199e8d8bef9SDimitry Andric     assert(Size % 32 == 0);
200bdd1243dSDimitry Andric     return std::pair(
201fe6060f1SDimitry Andric         TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
2025ffd83dbSDimitry Andric   };
2035ffd83dbSDimitry Andric }
2045ffd83dbSDimitry Andric 
2058bcb0991SDimitry Andric static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
2068bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
2078bcb0991SDimitry Andric     const LLT QueryTy = Query.Types[TypeIdx];
2088bcb0991SDimitry Andric     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
2098bcb0991SDimitry Andric   };
2108bcb0991SDimitry Andric }
2118bcb0991SDimitry Andric 
2120b57cec5SDimitry Andric static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
2130b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
2140b57cec5SDimitry Andric     const LLT QueryTy = Query.Types[TypeIdx];
2150b57cec5SDimitry Andric     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
2160b57cec5SDimitry Andric   };
2170b57cec5SDimitry Andric }
2180b57cec5SDimitry Andric 
2190b57cec5SDimitry Andric static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
2200b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
2210b57cec5SDimitry Andric     const LLT QueryTy = Query.Types[TypeIdx];
2220b57cec5SDimitry Andric     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
2230b57cec5SDimitry Andric   };
2240b57cec5SDimitry Andric }
2250b57cec5SDimitry Andric 
2265ffd83dbSDimitry Andric static bool isRegisterSize(unsigned Size) {
2275ffd83dbSDimitry Andric   return Size % 32 == 0 && Size <= MaxRegisterSize;
2285ffd83dbSDimitry Andric }
2295ffd83dbSDimitry Andric 
2305ffd83dbSDimitry Andric static bool isRegisterVectorElementType(LLT EltTy) {
2315ffd83dbSDimitry Andric   const int EltSize = EltTy.getSizeInBits();
2325ffd83dbSDimitry Andric   return EltSize == 16 || EltSize % 32 == 0;
2335ffd83dbSDimitry Andric }
2345ffd83dbSDimitry Andric 
2355ffd83dbSDimitry Andric static bool isRegisterVectorType(LLT Ty) {
2360b57cec5SDimitry Andric   const int EltSize = Ty.getElementType().getSizeInBits();
2370b57cec5SDimitry Andric   return EltSize == 32 || EltSize == 64 ||
2380b57cec5SDimitry Andric          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
2390b57cec5SDimitry Andric          EltSize == 128 || EltSize == 256;
2400b57cec5SDimitry Andric }
2410b57cec5SDimitry Andric 
2425ffd83dbSDimitry Andric static bool isRegisterType(LLT Ty) {
2435ffd83dbSDimitry Andric   if (!isRegisterSize(Ty.getSizeInBits()))
2445ffd83dbSDimitry Andric     return false;
2455ffd83dbSDimitry Andric 
2465ffd83dbSDimitry Andric   if (Ty.isVector())
2475ffd83dbSDimitry Andric     return isRegisterVectorType(Ty);
2485ffd83dbSDimitry Andric 
2495ffd83dbSDimitry Andric   return true;
2505ffd83dbSDimitry Andric }
2515ffd83dbSDimitry Andric 
2525ffd83dbSDimitry Andric // Any combination of 32 or 64-bit elements up the maximum register size, and
2535ffd83dbSDimitry Andric // multiples of v2s16.
2545ffd83dbSDimitry Andric static LegalityPredicate isRegisterType(unsigned TypeIdx) {
2555ffd83dbSDimitry Andric   return [=](const LegalityQuery &Query) {
2565ffd83dbSDimitry Andric     return isRegisterType(Query.Types[TypeIdx]);
2578bcb0991SDimitry Andric   };
2588bcb0991SDimitry Andric }
2598bcb0991SDimitry Andric 
26006c3fb27SDimitry Andric // RegisterType that doesn't have a corresponding RegClass.
26106c3fb27SDimitry Andric static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx) {
26206c3fb27SDimitry Andric   return [=](const LegalityQuery &Query) {
26306c3fb27SDimitry Andric     LLT Ty = Query.Types[TypeIdx];
26406c3fb27SDimitry Andric     return isRegisterType(Ty) &&
26506c3fb27SDimitry Andric            !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits());
26606c3fb27SDimitry Andric   };
26706c3fb27SDimitry Andric }
26806c3fb27SDimitry Andric 
2695ffd83dbSDimitry Andric static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
2708bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
2715ffd83dbSDimitry Andric     const LLT QueryTy = Query.Types[TypeIdx];
2725ffd83dbSDimitry Andric     if (!QueryTy.isVector())
2735ffd83dbSDimitry Andric       return false;
2745ffd83dbSDimitry Andric     const LLT EltTy = QueryTy.getElementType();
2755ffd83dbSDimitry Andric     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
2768bcb0991SDimitry Andric   };
2778bcb0991SDimitry Andric }
2788bcb0991SDimitry Andric 
279fe6060f1SDimitry Andric // If we have a truncating store or an extending load with a data size larger
280fe6060f1SDimitry Andric // than 32-bits, we need to reduce to a 32-bit type.
281fe6060f1SDimitry Andric static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
2828bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
2838bcb0991SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
2848bcb0991SDimitry Andric     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
285fe6060f1SDimitry Andric            Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
2860b57cec5SDimitry Andric   };
2870b57cec5SDimitry Andric }
2880b57cec5SDimitry Andric 
2895ffd83dbSDimitry Andric // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
2905ffd83dbSDimitry Andric // handle some operations by just promoting the register during
2915ffd83dbSDimitry Andric // selection. There are also d16 loads on GFX9+ which preserve the high bits.
2925ffd83dbSDimitry Andric static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
29306c3fb27SDimitry Andric                                     bool IsLoad, bool IsAtomic) {
2945ffd83dbSDimitry Andric   switch (AS) {
2955ffd83dbSDimitry Andric   case AMDGPUAS::PRIVATE_ADDRESS:
2965ffd83dbSDimitry Andric     // FIXME: Private element size.
297e8d8bef9SDimitry Andric     return ST.enableFlatScratch() ? 128 : 32;
2985ffd83dbSDimitry Andric   case AMDGPUAS::LOCAL_ADDRESS:
2995ffd83dbSDimitry Andric     return ST.useDS128() ? 128 : 64;
3005ffd83dbSDimitry Andric   case AMDGPUAS::GLOBAL_ADDRESS:
3015ffd83dbSDimitry Andric   case AMDGPUAS::CONSTANT_ADDRESS:
3025ffd83dbSDimitry Andric   case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
30306c3fb27SDimitry Andric   case AMDGPUAS::BUFFER_RESOURCE:
3045ffd83dbSDimitry Andric     // Treat constant and global as identical. SMRD loads are sometimes usable for
3055ffd83dbSDimitry Andric     // global loads (ideally constant address space should be eliminated)
3065ffd83dbSDimitry Andric     // depending on the context. Legality cannot be context dependent, but
3075ffd83dbSDimitry Andric     // RegBankSelect can split the load as necessary depending on the pointer
3085ffd83dbSDimitry Andric     // register bank/uniformity and if the memory is invariant or not written in a
3095ffd83dbSDimitry Andric     // kernel.
3105ffd83dbSDimitry Andric     return IsLoad ? 512 : 128;
3115ffd83dbSDimitry Andric   default:
31206c3fb27SDimitry Andric     // FIXME: Flat addresses may contextually need to be split to 32-bit parts
31306c3fb27SDimitry Andric     // if they may alias scratch depending on the subtarget.  This needs to be
31406c3fb27SDimitry Andric     // moved to custom handling to use addressMayBeAccessedAsPrivate
31506c3fb27SDimitry Andric     return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
3165ffd83dbSDimitry Andric   }
3175ffd83dbSDimitry Andric }
3185ffd83dbSDimitry Andric 
3195ffd83dbSDimitry Andric static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
320fe6060f1SDimitry Andric                                  const LegalityQuery &Query) {
3215ffd83dbSDimitry Andric   const LLT Ty = Query.Types[0];
3225ffd83dbSDimitry Andric 
3235ffd83dbSDimitry Andric   // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
324fe6060f1SDimitry Andric   const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
3255ffd83dbSDimitry Andric 
3265ffd83dbSDimitry Andric   unsigned RegSize = Ty.getSizeInBits();
32704eeddc0SDimitry Andric   uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
32804eeddc0SDimitry Andric   uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
3295ffd83dbSDimitry Andric   unsigned AS = Query.Types[1].getAddressSpace();
3305ffd83dbSDimitry Andric 
3315ffd83dbSDimitry Andric   // All of these need to be custom lowered to cast the pointer operand.
3325ffd83dbSDimitry Andric   if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
3335ffd83dbSDimitry Andric     return false;
3345ffd83dbSDimitry Andric 
335fe6060f1SDimitry Andric   // Do not handle extending vector loads.
336fe6060f1SDimitry Andric   if (Ty.isVector() && MemSize != RegSize)
337fe6060f1SDimitry Andric     return false;
338fe6060f1SDimitry Andric 
3395ffd83dbSDimitry Andric   // TODO: We should be able to widen loads if the alignment is high enough, but
3405ffd83dbSDimitry Andric   // we also need to modify the memory access size.
3415ffd83dbSDimitry Andric #if 0
3425ffd83dbSDimitry Andric   // Accept widening loads based on alignment.
3435ffd83dbSDimitry Andric   if (IsLoad && MemSize < Size)
3445ffd83dbSDimitry Andric     MemSize = std::max(MemSize, Align);
3455ffd83dbSDimitry Andric #endif
3465ffd83dbSDimitry Andric 
3475ffd83dbSDimitry Andric   // Only 1-byte and 2-byte to 32-bit extloads are valid.
3485ffd83dbSDimitry Andric   if (MemSize != RegSize && RegSize != 32)
3495ffd83dbSDimitry Andric     return false;
3505ffd83dbSDimitry Andric 
35106c3fb27SDimitry Andric   if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
35206c3fb27SDimitry Andric                                     Query.MMODescrs[0].Ordering !=
35306c3fb27SDimitry Andric                                         AtomicOrdering::NotAtomic))
3545ffd83dbSDimitry Andric     return false;
3555ffd83dbSDimitry Andric 
3565ffd83dbSDimitry Andric   switch (MemSize) {
3575ffd83dbSDimitry Andric   case 8:
3585ffd83dbSDimitry Andric   case 16:
3595ffd83dbSDimitry Andric   case 32:
3605ffd83dbSDimitry Andric   case 64:
3615ffd83dbSDimitry Andric   case 128:
3625ffd83dbSDimitry Andric     break;
3635ffd83dbSDimitry Andric   case 96:
3645ffd83dbSDimitry Andric     if (!ST.hasDwordx3LoadStores())
3655ffd83dbSDimitry Andric       return false;
3665ffd83dbSDimitry Andric     break;
3675ffd83dbSDimitry Andric   case 256:
3685ffd83dbSDimitry Andric   case 512:
3695ffd83dbSDimitry Andric     // These may contextually need to be broken down.
3705ffd83dbSDimitry Andric     break;
3715ffd83dbSDimitry Andric   default:
3725ffd83dbSDimitry Andric     return false;
3735ffd83dbSDimitry Andric   }
3745ffd83dbSDimitry Andric 
3755ffd83dbSDimitry Andric   assert(RegSize >= MemSize);
3765ffd83dbSDimitry Andric 
377e8d8bef9SDimitry Andric   if (AlignBits < MemSize) {
3785ffd83dbSDimitry Andric     const SITargetLowering *TLI = ST.getTargetLowering();
379e8d8bef9SDimitry Andric     if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
380e8d8bef9SDimitry Andric                                                  Align(AlignBits / 8)))
3815ffd83dbSDimitry Andric       return false;
3825ffd83dbSDimitry Andric   }
3835ffd83dbSDimitry Andric 
3845ffd83dbSDimitry Andric   return true;
3855ffd83dbSDimitry Andric }
3865ffd83dbSDimitry Andric 
38706c3fb27SDimitry Andric // The newer buffer intrinsic forms take their resource arguments as
38806c3fb27SDimitry Andric // pointers in address space 8, aka s128 values. However, in order to not break
38906c3fb27SDimitry Andric // SelectionDAG, the underlying operations have to continue to take v4i32
39006c3fb27SDimitry Andric // arguments. Therefore, we convert resource pointers - or vectors of them
39106c3fb27SDimitry Andric // to integer values here.
39206c3fb27SDimitry Andric static bool hasBufferRsrcWorkaround(const LLT Ty) {
39306c3fb27SDimitry Andric   if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
39406c3fb27SDimitry Andric     return true;
39506c3fb27SDimitry Andric   if (Ty.isVector()) {
39606c3fb27SDimitry Andric     const LLT ElemTy = Ty.getElementType();
39706c3fb27SDimitry Andric     return hasBufferRsrcWorkaround(ElemTy);
39806c3fb27SDimitry Andric   }
39906c3fb27SDimitry Andric   return false;
40006c3fb27SDimitry Andric }
40106c3fb27SDimitry Andric 
4025ffd83dbSDimitry Andric // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
4035ffd83dbSDimitry Andric // workaround this. Eventually it should ignore the type for loads and only care
4045ffd83dbSDimitry Andric // about the size. Return true in cases where we will workaround this for now by
4055ffd83dbSDimitry Andric // bitcasting.
4065ffd83dbSDimitry Andric static bool loadStoreBitcastWorkaround(const LLT Ty) {
4075ffd83dbSDimitry Andric   if (EnableNewLegality)
4085ffd83dbSDimitry Andric     return false;
4095ffd83dbSDimitry Andric 
4105ffd83dbSDimitry Andric   const unsigned Size = Ty.getSizeInBits();
4115ffd83dbSDimitry Andric   if (Size <= 64)
4125ffd83dbSDimitry Andric     return false;
41306c3fb27SDimitry Andric   // Address space 8 pointers get their own workaround.
41406c3fb27SDimitry Andric   if (hasBufferRsrcWorkaround(Ty))
41506c3fb27SDimitry Andric     return false;
4165ffd83dbSDimitry Andric   if (!Ty.isVector())
4175ffd83dbSDimitry Andric     return true;
418e8d8bef9SDimitry Andric 
419e8d8bef9SDimitry Andric   LLT EltTy = Ty.getElementType();
420e8d8bef9SDimitry Andric   if (EltTy.isPointer())
421e8d8bef9SDimitry Andric     return true;
422e8d8bef9SDimitry Andric 
423e8d8bef9SDimitry Andric   unsigned EltSize = EltTy.getSizeInBits();
4245ffd83dbSDimitry Andric   return EltSize != 32 && EltSize != 64;
4255ffd83dbSDimitry Andric }
4265ffd83dbSDimitry Andric 
427fe6060f1SDimitry Andric static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
4285ffd83dbSDimitry Andric   const LLT Ty = Query.Types[0];
429fe6060f1SDimitry Andric   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
43006c3fb27SDimitry Andric          !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
4315ffd83dbSDimitry Andric }
4325ffd83dbSDimitry Andric 
433e8d8bef9SDimitry Andric /// Return true if a load or store of the type should be lowered with a bitcast
434e8d8bef9SDimitry Andric /// to a different type.
435e8d8bef9SDimitry Andric static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
436fe6060f1SDimitry Andric                                        const LLT MemTy) {
437fe6060f1SDimitry Andric   const unsigned MemSizeInBits = MemTy.getSizeInBits();
438e8d8bef9SDimitry Andric   const unsigned Size = Ty.getSizeInBits();
439e8d8bef9SDimitry Andric   if (Size != MemSizeInBits)
440e8d8bef9SDimitry Andric     return Size <= 32 && Ty.isVector();
441e8d8bef9SDimitry Andric 
442e8d8bef9SDimitry Andric   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
443e8d8bef9SDimitry Andric     return true;
444fe6060f1SDimitry Andric 
445fe6060f1SDimitry Andric   // Don't try to handle bitcasting vector ext loads for now.
446fe6060f1SDimitry Andric   return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
447fe6060f1SDimitry Andric          (Size <= 32 || isRegisterSize(Size)) &&
448e8d8bef9SDimitry Andric          !isRegisterVectorElementType(Ty.getElementType());
449e8d8bef9SDimitry Andric }
450e8d8bef9SDimitry Andric 
451e8d8bef9SDimitry Andric /// Return true if we should legalize a load by widening an odd sized memory
452e8d8bef9SDimitry Andric /// access up to the alignment. Note this case when the memory access itself
453e8d8bef9SDimitry Andric /// changes, not the size of the result register.
454fe6060f1SDimitry Andric static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
45504eeddc0SDimitry Andric                             uint64_t AlignInBits, unsigned AddrSpace,
456e8d8bef9SDimitry Andric                             unsigned Opcode) {
457fe6060f1SDimitry Andric   unsigned SizeInBits = MemoryTy.getSizeInBits();
458e8d8bef9SDimitry Andric   // We don't want to widen cases that are naturally legal.
459e8d8bef9SDimitry Andric   if (isPowerOf2_32(SizeInBits))
460e8d8bef9SDimitry Andric     return false;
461e8d8bef9SDimitry Andric 
462e8d8bef9SDimitry Andric   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
4635f757f3fSDimitry Andric   // end up widening these for a scalar load during RegBankSelect, if we don't
4645f757f3fSDimitry Andric   // have 96-bit scalar loads.
465e8d8bef9SDimitry Andric   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
466e8d8bef9SDimitry Andric     return false;
467e8d8bef9SDimitry Andric 
46806c3fb27SDimitry Andric   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
469e8d8bef9SDimitry Andric     return false;
470e8d8bef9SDimitry Andric 
471e8d8bef9SDimitry Andric   // A load is known dereferenceable up to the alignment, so it's legal to widen
472e8d8bef9SDimitry Andric   // to it.
473e8d8bef9SDimitry Andric   //
474e8d8bef9SDimitry Andric   // TODO: Could check dereferenceable for less aligned cases.
475e8d8bef9SDimitry Andric   unsigned RoundedSize = NextPowerOf2(SizeInBits);
476e8d8bef9SDimitry Andric   if (AlignInBits < RoundedSize)
477e8d8bef9SDimitry Andric     return false;
478e8d8bef9SDimitry Andric 
479e8d8bef9SDimitry Andric   // Do not widen if it would introduce a slow unaligned load.
480e8d8bef9SDimitry Andric   const SITargetLowering *TLI = ST.getTargetLowering();
481bdd1243dSDimitry Andric   unsigned Fast = 0;
482e8d8bef9SDimitry Andric   return TLI->allowsMisalignedMemoryAccessesImpl(
483e8d8bef9SDimitry Andric              RoundedSize, AddrSpace, Align(AlignInBits / 8),
484e8d8bef9SDimitry Andric              MachineMemOperand::MOLoad, &Fast) &&
485e8d8bef9SDimitry Andric          Fast;
486e8d8bef9SDimitry Andric }
487e8d8bef9SDimitry Andric 
488e8d8bef9SDimitry Andric static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
489e8d8bef9SDimitry Andric                             unsigned Opcode) {
490e8d8bef9SDimitry Andric   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
491e8d8bef9SDimitry Andric     return false;
492e8d8bef9SDimitry Andric 
493fe6060f1SDimitry Andric   return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
494e8d8bef9SDimitry Andric                          Query.MMODescrs[0].AlignInBits,
495e8d8bef9SDimitry Andric                          Query.Types[1].getAddressSpace(), Opcode);
496e8d8bef9SDimitry Andric }
497e8d8bef9SDimitry Andric 
49806c3fb27SDimitry Andric /// Mutates IR (typicaly a load instruction) to use a <4 x s32> as the initial
49906c3fb27SDimitry Andric /// type of the operand `idx` and then to transform it to a `p8` via bitcasts
50006c3fb27SDimitry Andric /// and inttoptr. In addition, handle vectors of p8. Returns the new type.
50106c3fb27SDimitry Andric static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
50206c3fb27SDimitry Andric                                    MachineRegisterInfo &MRI, unsigned Idx) {
50306c3fb27SDimitry Andric   MachineOperand &MO = MI.getOperand(Idx);
50406c3fb27SDimitry Andric 
50506c3fb27SDimitry Andric   const LLT PointerTy = MRI.getType(MO.getReg());
50606c3fb27SDimitry Andric 
50706c3fb27SDimitry Andric   // Paranoidly prevent us from doing this multiple times.
50806c3fb27SDimitry Andric   if (!hasBufferRsrcWorkaround(PointerTy))
50906c3fb27SDimitry Andric     return PointerTy;
51006c3fb27SDimitry Andric 
51106c3fb27SDimitry Andric   const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
51206c3fb27SDimitry Andric   const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
51306c3fb27SDimitry Andric   if (!PointerTy.isVector()) {
51406c3fb27SDimitry Andric     // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
51506c3fb27SDimitry Andric     const unsigned NumParts = PointerTy.getSizeInBits() / 32;
51606c3fb27SDimitry Andric     const LLT S32 = LLT::scalar(32);
51706c3fb27SDimitry Andric 
51806c3fb27SDimitry Andric     Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
51906c3fb27SDimitry Andric     std::array<Register, 4> VectorElems;
52006c3fb27SDimitry Andric     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
52106c3fb27SDimitry Andric     for (unsigned I = 0; I < NumParts; ++I)
52206c3fb27SDimitry Andric       VectorElems[I] =
52306c3fb27SDimitry Andric           B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
52406c3fb27SDimitry Andric     B.buildMergeValues(MO, VectorElems);
52506c3fb27SDimitry Andric     MO.setReg(VectorReg);
52606c3fb27SDimitry Andric     return VectorTy;
52706c3fb27SDimitry Andric   }
52806c3fb27SDimitry Andric   Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
52906c3fb27SDimitry Andric   B.setInsertPt(B.getMBB(), ++B.getInsertPt());
53006c3fb27SDimitry Andric   auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
53106c3fb27SDimitry Andric   B.buildIntToPtr(MO, Scalar);
53206c3fb27SDimitry Andric   MO.setReg(BitcastReg);
53306c3fb27SDimitry Andric 
53406c3fb27SDimitry Andric   return VectorTy;
53506c3fb27SDimitry Andric }
53606c3fb27SDimitry Andric 
53706c3fb27SDimitry Andric /// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
53806c3fb27SDimitry Andric /// the form in which the value must be in order to be passed to the low-level
53906c3fb27SDimitry Andric /// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
54006c3fb27SDimitry Andric /// needed in order to account for the fact that we can't define a register
54106c3fb27SDimitry Andric /// class for s128 without breaking SelectionDAG.
54206c3fb27SDimitry Andric static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) {
54306c3fb27SDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
54406c3fb27SDimitry Andric   const LLT PointerTy = MRI.getType(Pointer);
54506c3fb27SDimitry Andric   const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
54606c3fb27SDimitry Andric   const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
54706c3fb27SDimitry Andric 
54806c3fb27SDimitry Andric   if (!PointerTy.isVector()) {
54906c3fb27SDimitry Andric     // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
55006c3fb27SDimitry Andric     SmallVector<Register, 4> PointerParts;
55106c3fb27SDimitry Andric     const unsigned NumParts = PointerTy.getSizeInBits() / 32;
55206c3fb27SDimitry Andric     auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
55306c3fb27SDimitry Andric     for (unsigned I = 0; I < NumParts; ++I)
55406c3fb27SDimitry Andric       PointerParts.push_back(Unmerged.getReg(I));
55506c3fb27SDimitry Andric     return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
55606c3fb27SDimitry Andric   }
55706c3fb27SDimitry Andric   Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
55806c3fb27SDimitry Andric   return B.buildBitcast(VectorTy, Scalar).getReg(0);
55906c3fb27SDimitry Andric }
56006c3fb27SDimitry Andric 
56106c3fb27SDimitry Andric static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B,
56206c3fb27SDimitry Andric                                      unsigned Idx) {
56306c3fb27SDimitry Andric   MachineOperand &MO = MI.getOperand(Idx);
56406c3fb27SDimitry Andric 
56506c3fb27SDimitry Andric   const LLT PointerTy = B.getMRI()->getType(MO.getReg());
56606c3fb27SDimitry Andric   // Paranoidly prevent us from doing this multiple times.
56706c3fb27SDimitry Andric   if (!hasBufferRsrcWorkaround(PointerTy))
56806c3fb27SDimitry Andric     return;
56906c3fb27SDimitry Andric   MO.setReg(castBufferRsrcToV4I32(MO.getReg(), B));
57006c3fb27SDimitry Andric }
57106c3fb27SDimitry Andric 
5720b57cec5SDimitry Andric AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
5730b57cec5SDimitry Andric                                          const GCNTargetMachine &TM)
5740b57cec5SDimitry Andric   :  ST(ST_) {
5750b57cec5SDimitry Andric   using namespace TargetOpcode;
5760b57cec5SDimitry Andric 
5770b57cec5SDimitry Andric   auto GetAddrSpacePtr = [&TM](unsigned AS) {
5780b57cec5SDimitry Andric     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
5790b57cec5SDimitry Andric   };
5800b57cec5SDimitry Andric 
5810b57cec5SDimitry Andric   const LLT S1 = LLT::scalar(1);
582e8d8bef9SDimitry Andric   const LLT S8 = LLT::scalar(8);
5830b57cec5SDimitry Andric   const LLT S16 = LLT::scalar(16);
5840b57cec5SDimitry Andric   const LLT S32 = LLT::scalar(32);
5850b57cec5SDimitry Andric   const LLT S64 = LLT::scalar(64);
5860b57cec5SDimitry Andric   const LLT S128 = LLT::scalar(128);
5870b57cec5SDimitry Andric   const LLT S256 = LLT::scalar(256);
5885ffd83dbSDimitry Andric   const LLT S512 = LLT::scalar(512);
5895ffd83dbSDimitry Andric   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
5900b57cec5SDimitry Andric 
591fe6060f1SDimitry Andric   const LLT V2S8 = LLT::fixed_vector(2, 8);
592fe6060f1SDimitry Andric   const LLT V2S16 = LLT::fixed_vector(2, 16);
593fe6060f1SDimitry Andric   const LLT V4S16 = LLT::fixed_vector(4, 16);
5940b57cec5SDimitry Andric 
595fe6060f1SDimitry Andric   const LLT V2S32 = LLT::fixed_vector(2, 32);
596fe6060f1SDimitry Andric   const LLT V3S32 = LLT::fixed_vector(3, 32);
597fe6060f1SDimitry Andric   const LLT V4S32 = LLT::fixed_vector(4, 32);
598fe6060f1SDimitry Andric   const LLT V5S32 = LLT::fixed_vector(5, 32);
599fe6060f1SDimitry Andric   const LLT V6S32 = LLT::fixed_vector(6, 32);
600fe6060f1SDimitry Andric   const LLT V7S32 = LLT::fixed_vector(7, 32);
601fe6060f1SDimitry Andric   const LLT V8S32 = LLT::fixed_vector(8, 32);
602fe6060f1SDimitry Andric   const LLT V9S32 = LLT::fixed_vector(9, 32);
603fe6060f1SDimitry Andric   const LLT V10S32 = LLT::fixed_vector(10, 32);
604fe6060f1SDimitry Andric   const LLT V11S32 = LLT::fixed_vector(11, 32);
605fe6060f1SDimitry Andric   const LLT V12S32 = LLT::fixed_vector(12, 32);
606fe6060f1SDimitry Andric   const LLT V13S32 = LLT::fixed_vector(13, 32);
607fe6060f1SDimitry Andric   const LLT V14S32 = LLT::fixed_vector(14, 32);
608fe6060f1SDimitry Andric   const LLT V15S32 = LLT::fixed_vector(15, 32);
609fe6060f1SDimitry Andric   const LLT V16S32 = LLT::fixed_vector(16, 32);
610fe6060f1SDimitry Andric   const LLT V32S32 = LLT::fixed_vector(32, 32);
6110b57cec5SDimitry Andric 
612fe6060f1SDimitry Andric   const LLT V2S64 = LLT::fixed_vector(2, 64);
613fe6060f1SDimitry Andric   const LLT V3S64 = LLT::fixed_vector(3, 64);
614fe6060f1SDimitry Andric   const LLT V4S64 = LLT::fixed_vector(4, 64);
615fe6060f1SDimitry Andric   const LLT V5S64 = LLT::fixed_vector(5, 64);
616fe6060f1SDimitry Andric   const LLT V6S64 = LLT::fixed_vector(6, 64);
617fe6060f1SDimitry Andric   const LLT V7S64 = LLT::fixed_vector(7, 64);
618fe6060f1SDimitry Andric   const LLT V8S64 = LLT::fixed_vector(8, 64);
619fe6060f1SDimitry Andric   const LLT V16S64 = LLT::fixed_vector(16, 64);
6200b57cec5SDimitry Andric 
6210b57cec5SDimitry Andric   std::initializer_list<LLT> AllS32Vectors =
6220b57cec5SDimitry Andric     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
6238bcb0991SDimitry Andric      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
6240b57cec5SDimitry Andric   std::initializer_list<LLT> AllS64Vectors =
6258bcb0991SDimitry Andric     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
6260b57cec5SDimitry Andric 
6270b57cec5SDimitry Andric   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
6280b57cec5SDimitry Andric   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
6298bcb0991SDimitry Andric   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
6300b57cec5SDimitry Andric   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
6318bcb0991SDimitry Andric   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
6320b57cec5SDimitry Andric   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
6330b57cec5SDimitry Andric   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
63406c3fb27SDimitry Andric   const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
63506c3fb27SDimitry Andric   const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
6365f757f3fSDimitry Andric   const LLT BufferStridedPtr =
6375f757f3fSDimitry Andric       GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);
6380b57cec5SDimitry Andric 
6390b57cec5SDimitry Andric   const LLT CodePtr = FlatPtr;
6400b57cec5SDimitry Andric 
6410b57cec5SDimitry Andric   const std::initializer_list<LLT> AddrSpaces64 = {
6420b57cec5SDimitry Andric     GlobalPtr, ConstantPtr, FlatPtr
6430b57cec5SDimitry Andric   };
6440b57cec5SDimitry Andric 
6450b57cec5SDimitry Andric   const std::initializer_list<LLT> AddrSpaces32 = {
6468bcb0991SDimitry Andric     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
6470b57cec5SDimitry Andric   };
6480b57cec5SDimitry Andric 
64906c3fb27SDimitry Andric   const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
65006c3fb27SDimitry Andric 
6510b57cec5SDimitry Andric   const std::initializer_list<LLT> FPTypesBase = {
6520b57cec5SDimitry Andric     S32, S64
6530b57cec5SDimitry Andric   };
6540b57cec5SDimitry Andric 
6550b57cec5SDimitry Andric   const std::initializer_list<LLT> FPTypes16 = {
6560b57cec5SDimitry Andric     S32, S64, S16
6570b57cec5SDimitry Andric   };
6580b57cec5SDimitry Andric 
6590b57cec5SDimitry Andric   const std::initializer_list<LLT> FPTypesPK16 = {
6600b57cec5SDimitry Andric     S32, S64, S16, V2S16
6610b57cec5SDimitry Andric   };
6620b57cec5SDimitry Andric 
6635ffd83dbSDimitry Andric   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
6645ffd83dbSDimitry Andric 
665fe6060f1SDimitry Andric   // s1 for VCC branches, s32 for SCC branches.
666fe6060f1SDimitry Andric   getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
6670b57cec5SDimitry Andric 
6680b57cec5SDimitry Andric   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
6690b57cec5SDimitry Andric   // elements for v3s16
6700b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_PHI)
671e8d8bef9SDimitry Andric       .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
6720b57cec5SDimitry Andric       .legalFor(AllS32Vectors)
6730b57cec5SDimitry Andric       .legalFor(AllS64Vectors)
6740b57cec5SDimitry Andric       .legalFor(AddrSpaces64)
6750b57cec5SDimitry Andric       .legalFor(AddrSpaces32)
67606c3fb27SDimitry Andric       .legalFor(AddrSpaces128)
677e8d8bef9SDimitry Andric       .legalIf(isPointer(0))
678e8d8bef9SDimitry Andric       .clampScalar(0, S16, S256)
6790b57cec5SDimitry Andric       .widenScalarToNextPow2(0, 32)
6800b57cec5SDimitry Andric       .clampMaxNumElements(0, S32, 16)
6810b57cec5SDimitry Andric       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
682e8d8bef9SDimitry Andric       .scalarize(0);
6830b57cec5SDimitry Andric 
684e8d8bef9SDimitry Andric   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
685e8d8bef9SDimitry Andric     // Full set of gfx9 features.
6865f757f3fSDimitry Andric     if (ST.hasScalarAddSub64()) {
6875f757f3fSDimitry Andric       getActionDefinitionsBuilder({G_ADD, G_SUB})
6885f757f3fSDimitry Andric           .legalFor({S64, S32, S16, V2S16})
6895f757f3fSDimitry Andric           .clampMaxNumElementsStrict(0, S16, 2)
6905f757f3fSDimitry Andric           .scalarize(0)
6915f757f3fSDimitry Andric           .minScalar(0, S16)
6925f757f3fSDimitry Andric           .widenScalarToNextMultipleOf(0, 32)
6935f757f3fSDimitry Andric           .maxScalar(0, S32);
6945f757f3fSDimitry Andric     } else {
69581ad6265SDimitry Andric       getActionDefinitionsBuilder({G_ADD, G_SUB})
6965ffd83dbSDimitry Andric           .legalFor({S32, S16, V2S16})
6970eae32dcSDimitry Andric           .clampMaxNumElementsStrict(0, S16, 2)
69881ad6265SDimitry Andric           .scalarize(0)
69981ad6265SDimitry Andric           .minScalar(0, S16)
700349cc55cSDimitry Andric           .widenScalarToNextMultipleOf(0, 32)
70181ad6265SDimitry Andric           .maxScalar(0, S32);
7025f757f3fSDimitry Andric     }
70381ad6265SDimitry Andric 
704*1db9f3b2SDimitry Andric     if (ST.hasScalarSMulU64()) {
705*1db9f3b2SDimitry Andric       getActionDefinitionsBuilder(G_MUL)
706*1db9f3b2SDimitry Andric           .legalFor({S64, S32, S16, V2S16})
707*1db9f3b2SDimitry Andric           .clampMaxNumElementsStrict(0, S16, 2)
708*1db9f3b2SDimitry Andric           .scalarize(0)
709*1db9f3b2SDimitry Andric           .minScalar(0, S16)
710*1db9f3b2SDimitry Andric           .widenScalarToNextMultipleOf(0, 32)
711*1db9f3b2SDimitry Andric           .custom();
712*1db9f3b2SDimitry Andric     } else {
71381ad6265SDimitry Andric       getActionDefinitionsBuilder(G_MUL)
71481ad6265SDimitry Andric           .legalFor({S32, S16, V2S16})
71581ad6265SDimitry Andric           .clampMaxNumElementsStrict(0, S16, 2)
71681ad6265SDimitry Andric           .scalarize(0)
71781ad6265SDimitry Andric           .minScalar(0, S16)
71881ad6265SDimitry Andric           .widenScalarToNextMultipleOf(0, 32)
71981ad6265SDimitry Andric           .custom();
720*1db9f3b2SDimitry Andric     }
72181ad6265SDimitry Andric     assert(ST.hasMad64_32());
722e8d8bef9SDimitry Andric 
723e8d8bef9SDimitry Andric     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
724e8d8bef9SDimitry Andric       .legalFor({S32, S16, V2S16}) // Clamp modifier
725e8d8bef9SDimitry Andric       .minScalarOrElt(0, S16)
7260eae32dcSDimitry Andric       .clampMaxNumElementsStrict(0, S16, 2)
727e8d8bef9SDimitry Andric       .scalarize(0)
728e8d8bef9SDimitry Andric       .widenScalarToNextPow2(0, 32)
729e8d8bef9SDimitry Andric       .lower();
7305ffd83dbSDimitry Andric   } else if (ST.has16BitInsts()) {
73181ad6265SDimitry Andric     getActionDefinitionsBuilder({G_ADD, G_SUB})
7320b57cec5SDimitry Andric       .legalFor({S32, S16})
733349cc55cSDimitry Andric       .minScalar(0, S16)
734349cc55cSDimitry Andric       .widenScalarToNextMultipleOf(0, 32)
735349cc55cSDimitry Andric       .maxScalar(0, S32)
736349cc55cSDimitry Andric       .scalarize(0);
737e8d8bef9SDimitry Andric 
73881ad6265SDimitry Andric     getActionDefinitionsBuilder(G_MUL)
73981ad6265SDimitry Andric       .legalFor({S32, S16})
74081ad6265SDimitry Andric       .scalarize(0)
74181ad6265SDimitry Andric       .minScalar(0, S16)
74281ad6265SDimitry Andric       .widenScalarToNextMultipleOf(0, 32)
74381ad6265SDimitry Andric       .custom();
74481ad6265SDimitry Andric     assert(ST.hasMad64_32());
74581ad6265SDimitry Andric 
746e8d8bef9SDimitry Andric     // Technically the saturating operations require clamp bit support, but this
747e8d8bef9SDimitry Andric     // was introduced at the same time as 16-bit operations.
748e8d8bef9SDimitry Andric     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
749e8d8bef9SDimitry Andric       .legalFor({S32, S16}) // Clamp modifier
750e8d8bef9SDimitry Andric       .minScalar(0, S16)
751e8d8bef9SDimitry Andric       .scalarize(0)
752e8d8bef9SDimitry Andric       .widenScalarToNextPow2(0, 16)
753e8d8bef9SDimitry Andric       .lower();
754e8d8bef9SDimitry Andric 
755e8d8bef9SDimitry Andric     // We're just lowering this, but it helps get a better result to try to
756e8d8bef9SDimitry Andric     // coerce to the desired type first.
757e8d8bef9SDimitry Andric     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
758e8d8bef9SDimitry Andric       .minScalar(0, S16)
759e8d8bef9SDimitry Andric       .scalarize(0)
760e8d8bef9SDimitry Andric       .lower();
7610b57cec5SDimitry Andric   } else {
76281ad6265SDimitry Andric     getActionDefinitionsBuilder({G_ADD, G_SUB})
7630b57cec5SDimitry Andric       .legalFor({S32})
764349cc55cSDimitry Andric       .widenScalarToNextMultipleOf(0, 32)
7650b57cec5SDimitry Andric       .clampScalar(0, S32, S32)
7660b57cec5SDimitry Andric       .scalarize(0);
767e8d8bef9SDimitry Andric 
76881ad6265SDimitry Andric     auto &Mul = getActionDefinitionsBuilder(G_MUL)
76981ad6265SDimitry Andric       .legalFor({S32})
77081ad6265SDimitry Andric       .scalarize(0)
77181ad6265SDimitry Andric       .minScalar(0, S32)
77281ad6265SDimitry Andric       .widenScalarToNextMultipleOf(0, 32);
77381ad6265SDimitry Andric 
77481ad6265SDimitry Andric     if (ST.hasMad64_32())
77581ad6265SDimitry Andric       Mul.custom();
77681ad6265SDimitry Andric     else
77781ad6265SDimitry Andric       Mul.maxScalar(0, S32);
77881ad6265SDimitry Andric 
779e8d8bef9SDimitry Andric     if (ST.hasIntClamp()) {
780e8d8bef9SDimitry Andric       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
781e8d8bef9SDimitry Andric         .legalFor({S32}) // Clamp modifier.
782e8d8bef9SDimitry Andric         .scalarize(0)
783e8d8bef9SDimitry Andric         .minScalarOrElt(0, S32)
784e8d8bef9SDimitry Andric         .lower();
785e8d8bef9SDimitry Andric     } else {
786e8d8bef9SDimitry Andric       // Clamp bit support was added in VI, along with 16-bit operations.
787e8d8bef9SDimitry Andric       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
788e8d8bef9SDimitry Andric         .minScalar(0, S32)
789e8d8bef9SDimitry Andric         .scalarize(0)
790e8d8bef9SDimitry Andric         .lower();
7910b57cec5SDimitry Andric     }
7920b57cec5SDimitry Andric 
793e8d8bef9SDimitry Andric     // FIXME: DAG expansion gets better results. The widening uses the smaller
794e8d8bef9SDimitry Andric     // range values and goes for the min/max lowering directly.
795e8d8bef9SDimitry Andric     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
796e8d8bef9SDimitry Andric       .minScalar(0, S32)
797e8d8bef9SDimitry Andric       .scalarize(0)
798e8d8bef9SDimitry Andric       .lower();
799e8d8bef9SDimitry Andric   }
800e8d8bef9SDimitry Andric 
801fe6060f1SDimitry Andric   getActionDefinitionsBuilder(
802fe6060f1SDimitry Andric       {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
8035ffd83dbSDimitry Andric       .customFor({S32, S64})
804480093f4SDimitry Andric       .clampScalar(0, S32, S64)
805480093f4SDimitry Andric       .widenScalarToNextPow2(0, 32)
806480093f4SDimitry Andric       .scalarize(0);
807480093f4SDimitry Andric 
808e8d8bef9SDimitry Andric   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
8090b57cec5SDimitry Andric                    .legalFor({S32})
810349cc55cSDimitry Andric                    .maxScalar(0, S32);
811e8d8bef9SDimitry Andric 
812e8d8bef9SDimitry Andric   if (ST.hasVOP3PInsts()) {
813e8d8bef9SDimitry Andric     Mulh
814e8d8bef9SDimitry Andric       .clampMaxNumElements(0, S8, 2)
815e8d8bef9SDimitry Andric       .lowerFor({V2S8});
816e8d8bef9SDimitry Andric   }
817e8d8bef9SDimitry Andric 
818e8d8bef9SDimitry Andric   Mulh
819e8d8bef9SDimitry Andric     .scalarize(0)
820e8d8bef9SDimitry Andric     .lower();
8210b57cec5SDimitry Andric 
8220b57cec5SDimitry Andric   // Report legal for any types we can handle anywhere. For the cases only legal
8230b57cec5SDimitry Andric   // on the SALU, RegBankSelect will be able to re-legalize.
8240b57cec5SDimitry Andric   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
8250b57cec5SDimitry Andric     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
8260b57cec5SDimitry Andric     .clampScalar(0, S32, S64)
8270b57cec5SDimitry Andric     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
8288bcb0991SDimitry Andric     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
8290b57cec5SDimitry Andric     .widenScalarToNextPow2(0)
8300b57cec5SDimitry Andric     .scalarize(0);
8310b57cec5SDimitry Andric 
832bdd1243dSDimitry Andric   getActionDefinitionsBuilder(
833bdd1243dSDimitry Andric       {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
834480093f4SDimitry Andric       .legalFor({{S32, S1}, {S32, S32}})
835bdd1243dSDimitry Andric       .clampScalar(0, S32, S32)
836bdd1243dSDimitry Andric       .scalarize(0);
8370b57cec5SDimitry Andric 
8380b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_BITCAST)
8390b57cec5SDimitry Andric     // Don't worry about the size constraint.
8408bcb0991SDimitry Andric     .legalIf(all(isRegisterType(0), isRegisterType(1)))
8415ffd83dbSDimitry Andric     .lower();
8420b57cec5SDimitry Andric 
8430b57cec5SDimitry Andric 
8440b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_CONSTANT)
8458bcb0991SDimitry Andric     .legalFor({S1, S32, S64, S16, GlobalPtr,
8460b57cec5SDimitry Andric                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
847e8d8bef9SDimitry Andric     .legalIf(isPointer(0))
8480b57cec5SDimitry Andric     .clampScalar(0, S32, S64)
849e8d8bef9SDimitry Andric     .widenScalarToNextPow2(0);
8500b57cec5SDimitry Andric 
8515ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_FCONSTANT)
8525ffd83dbSDimitry Andric     .legalFor({S32, S64, S16})
8535ffd83dbSDimitry Andric     .clampScalar(0, S16, S64);
8548bcb0991SDimitry Andric 
8555ffd83dbSDimitry Andric   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
8565ffd83dbSDimitry Andric       .legalIf(isRegisterType(0))
8575ffd83dbSDimitry Andric       // s1 and s16 are special cases because they have legal operations on
8585ffd83dbSDimitry Andric       // them, but don't really occupy registers in the normal way.
8595ffd83dbSDimitry Andric       .legalFor({S1, S16})
8605ffd83dbSDimitry Andric       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
8615ffd83dbSDimitry Andric       .clampScalarOrElt(0, S32, MaxScalar)
8625ffd83dbSDimitry Andric       .widenScalarToNextPow2(0, 32)
8635ffd83dbSDimitry Andric       .clampMaxNumElements(0, S32, 16);
8645ffd83dbSDimitry Andric 
865fe6060f1SDimitry Andric   getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
8665ffd83dbSDimitry Andric 
8675ffd83dbSDimitry Andric   // If the amount is divergent, we have to do a wave reduction to get the
8685ffd83dbSDimitry Andric   // maximum value, so this is expanded during RegBankSelect.
8695ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
8705ffd83dbSDimitry Andric     .legalFor({{PrivatePtr, S32}});
8715ffd83dbSDimitry Andric 
8725f757f3fSDimitry Andric   getActionDefinitionsBuilder(G_STACKSAVE)
8735f757f3fSDimitry Andric     .customFor({PrivatePtr});
8745f757f3fSDimitry Andric   getActionDefinitionsBuilder(G_STACKRESTORE)
8755f757f3fSDimitry Andric     .legalFor({PrivatePtr});
8765f757f3fSDimitry Andric 
8775ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
878e8d8bef9SDimitry Andric     .customIf(typeIsNot(0, PrivatePtr));
879e8d8bef9SDimitry Andric 
880fe6060f1SDimitry Andric   getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
8810b57cec5SDimitry Andric 
8820b57cec5SDimitry Andric   auto &FPOpActions = getActionDefinitionsBuilder(
883bdd1243dSDimitry Andric     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
884bdd1243dSDimitry Andric       G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
8850b57cec5SDimitry Andric     .legalFor({S32, S64});
8868bcb0991SDimitry Andric   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
8878bcb0991SDimitry Andric     .customFor({S32, S64});
8888bcb0991SDimitry Andric   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
8898bcb0991SDimitry Andric     .customFor({S32, S64});
8900b57cec5SDimitry Andric 
8910b57cec5SDimitry Andric   if (ST.has16BitInsts()) {
8920b57cec5SDimitry Andric     if (ST.hasVOP3PInsts())
8930b57cec5SDimitry Andric       FPOpActions.legalFor({S16, V2S16});
8940b57cec5SDimitry Andric     else
8950b57cec5SDimitry Andric       FPOpActions.legalFor({S16});
8968bcb0991SDimitry Andric 
8978bcb0991SDimitry Andric     TrigActions.customFor({S16});
8988bcb0991SDimitry Andric     FDIVActions.customFor({S16});
8990b57cec5SDimitry Andric   }
9000b57cec5SDimitry Andric 
9015f757f3fSDimitry Andric   if (ST.hasPackedFP32Ops()) {
9025f757f3fSDimitry Andric     FPOpActions.legalFor({V2S32});
9035f757f3fSDimitry Andric     FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
9045f757f3fSDimitry Andric   }
9055f757f3fSDimitry Andric 
9060b57cec5SDimitry Andric   auto &MinNumMaxNum = getActionDefinitionsBuilder({
9070b57cec5SDimitry Andric       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
9080b57cec5SDimitry Andric 
9090b57cec5SDimitry Andric   if (ST.hasVOP3PInsts()) {
9100b57cec5SDimitry Andric     MinNumMaxNum.customFor(FPTypesPK16)
911480093f4SDimitry Andric       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
9120b57cec5SDimitry Andric       .clampMaxNumElements(0, S16, 2)
9130b57cec5SDimitry Andric       .clampScalar(0, S16, S64)
9140b57cec5SDimitry Andric       .scalarize(0);
9150b57cec5SDimitry Andric   } else if (ST.has16BitInsts()) {
9160b57cec5SDimitry Andric     MinNumMaxNum.customFor(FPTypes16)
9170b57cec5SDimitry Andric       .clampScalar(0, S16, S64)
9180b57cec5SDimitry Andric       .scalarize(0);
9190b57cec5SDimitry Andric   } else {
9200b57cec5SDimitry Andric     MinNumMaxNum.customFor(FPTypesBase)
9210b57cec5SDimitry Andric       .clampScalar(0, S32, S64)
9220b57cec5SDimitry Andric       .scalarize(0);
9230b57cec5SDimitry Andric   }
9240b57cec5SDimitry Andric 
9250b57cec5SDimitry Andric   if (ST.hasVOP3PInsts())
9260eae32dcSDimitry Andric     FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
9278bcb0991SDimitry Andric 
9280b57cec5SDimitry Andric   FPOpActions
9290b57cec5SDimitry Andric     .scalarize(0)
9300b57cec5SDimitry Andric     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
9310b57cec5SDimitry Andric 
9328bcb0991SDimitry Andric   TrigActions
9338bcb0991SDimitry Andric     .scalarize(0)
9348bcb0991SDimitry Andric     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
9358bcb0991SDimitry Andric 
9368bcb0991SDimitry Andric   FDIVActions
9378bcb0991SDimitry Andric     .scalarize(0)
9388bcb0991SDimitry Andric     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
9398bcb0991SDimitry Andric 
9408bcb0991SDimitry Andric   getActionDefinitionsBuilder({G_FNEG, G_FABS})
9418bcb0991SDimitry Andric     .legalFor(FPTypesPK16)
9420eae32dcSDimitry Andric     .clampMaxNumElementsStrict(0, S16, 2)
9438bcb0991SDimitry Andric     .scalarize(0)
9448bcb0991SDimitry Andric     .clampScalar(0, S16, S64);
9458bcb0991SDimitry Andric 
9460b57cec5SDimitry Andric   if (ST.has16BitInsts()) {
94706c3fb27SDimitry Andric     getActionDefinitionsBuilder(G_FSQRT)
9485f757f3fSDimitry Andric       .legalFor({S16})
9495f757f3fSDimitry Andric       .customFor({S32, S64})
95006c3fb27SDimitry Andric       .scalarize(0)
9515f757f3fSDimitry Andric       .unsupported();
95206c3fb27SDimitry Andric     getActionDefinitionsBuilder(G_FFLOOR)
9530b57cec5SDimitry Andric       .legalFor({S32, S64, S16})
9540b57cec5SDimitry Andric       .scalarize(0)
9550b57cec5SDimitry Andric       .clampScalar(0, S16, S64);
95606c3fb27SDimitry Andric 
95706c3fb27SDimitry Andric     getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
95806c3fb27SDimitry Andric       .legalFor({{S32, S32}, {S64, S32}, {S16, S16}})
95906c3fb27SDimitry Andric       .scalarize(0)
96006c3fb27SDimitry Andric       .maxScalarIf(typeIs(0, S16), 1, S16)
96106c3fb27SDimitry Andric       .clampScalar(1, S32, S32)
96206c3fb27SDimitry Andric       .lower();
96306c3fb27SDimitry Andric 
96406c3fb27SDimitry Andric     getActionDefinitionsBuilder(G_FFREXP)
96506c3fb27SDimitry Andric       .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
96606c3fb27SDimitry Andric       .scalarize(0)
96706c3fb27SDimitry Andric       .lower();
9680b57cec5SDimitry Andric   } else {
9695ffd83dbSDimitry Andric     getActionDefinitionsBuilder(G_FSQRT)
9705f757f3fSDimitry Andric       .customFor({S32, S64, S16})
9715ffd83dbSDimitry Andric       .scalarize(0)
9725f757f3fSDimitry Andric       .unsupported();
9735f757f3fSDimitry Andric 
9745ffd83dbSDimitry Andric 
9755ffd83dbSDimitry Andric     if (ST.hasFractBug()) {
9765ffd83dbSDimitry Andric       getActionDefinitionsBuilder(G_FFLOOR)
9775ffd83dbSDimitry Andric         .customFor({S64})
9785ffd83dbSDimitry Andric         .legalFor({S32, S64})
9795ffd83dbSDimitry Andric         .scalarize(0)
9805ffd83dbSDimitry Andric         .clampScalar(0, S32, S64);
9815ffd83dbSDimitry Andric     } else {
9825ffd83dbSDimitry Andric       getActionDefinitionsBuilder(G_FFLOOR)
9830b57cec5SDimitry Andric         .legalFor({S32, S64})
9840b57cec5SDimitry Andric         .scalarize(0)
9850b57cec5SDimitry Andric         .clampScalar(0, S32, S64);
9860b57cec5SDimitry Andric     }
98706c3fb27SDimitry Andric 
98806c3fb27SDimitry Andric     getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
98906c3fb27SDimitry Andric       .legalFor({{S32, S32}, {S64, S32}})
99006c3fb27SDimitry Andric       .scalarize(0)
99106c3fb27SDimitry Andric       .clampScalar(0, S32, S64)
99206c3fb27SDimitry Andric       .clampScalar(1, S32, S32)
99306c3fb27SDimitry Andric       .lower();
99406c3fb27SDimitry Andric 
99506c3fb27SDimitry Andric     getActionDefinitionsBuilder(G_FFREXP)
99606c3fb27SDimitry Andric       .customFor({{S32, S32}, {S64, S32}})
99706c3fb27SDimitry Andric       .scalarize(0)
99806c3fb27SDimitry Andric       .minScalar(0, S32)
99906c3fb27SDimitry Andric       .clampScalar(1, S32, S32)
100006c3fb27SDimitry Andric       .lower();
10015ffd83dbSDimitry Andric   }
10020b57cec5SDimitry Andric 
10030b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_FPTRUNC)
10040b57cec5SDimitry Andric     .legalFor({{S32, S64}, {S16, S32}})
10055ffd83dbSDimitry Andric     .scalarize(0)
10065ffd83dbSDimitry Andric     .lower();
10070b57cec5SDimitry Andric 
10080b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_FPEXT)
10090b57cec5SDimitry Andric     .legalFor({{S64, S32}, {S32, S16}})
1010e8d8bef9SDimitry Andric     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
10110b57cec5SDimitry Andric     .scalarize(0);
10120b57cec5SDimitry Andric 
1013bdd1243dSDimitry Andric   auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
101481ad6265SDimitry Andric   if (ST.has16BitInsts()) {
101581ad6265SDimitry Andric     FSubActions
101681ad6265SDimitry Andric       // Use actual fsub instruction
101781ad6265SDimitry Andric       .legalFor({S32, S16})
101881ad6265SDimitry Andric       // Must use fadd + fneg
101981ad6265SDimitry Andric       .lowerFor({S64, V2S16});
102081ad6265SDimitry Andric   } else {
102181ad6265SDimitry Andric     FSubActions
10220b57cec5SDimitry Andric       // Use actual fsub instruction
10230b57cec5SDimitry Andric       .legalFor({S32})
10240b57cec5SDimitry Andric       // Must use fadd + fneg
102581ad6265SDimitry Andric       .lowerFor({S64, S16, V2S16});
102681ad6265SDimitry Andric   }
102781ad6265SDimitry Andric 
102881ad6265SDimitry Andric   FSubActions
10290b57cec5SDimitry Andric     .scalarize(0)
10300b57cec5SDimitry Andric     .clampScalar(0, S32, S64);
10310b57cec5SDimitry Andric 
10328bcb0991SDimitry Andric   // Whether this is legal depends on the floating point mode for the function.
10338bcb0991SDimitry Andric   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
10345ffd83dbSDimitry Andric   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
10358bcb0991SDimitry Andric     FMad.customFor({S32, S16});
10365ffd83dbSDimitry Andric   else if (ST.hasMadMacF32Insts())
10378bcb0991SDimitry Andric     FMad.customFor({S32});
10385ffd83dbSDimitry Andric   else if (ST.hasMadF16())
10395ffd83dbSDimitry Andric     FMad.customFor({S16});
10408bcb0991SDimitry Andric   FMad.scalarize(0)
10418bcb0991SDimitry Andric       .lower();
10428bcb0991SDimitry Andric 
1043e8d8bef9SDimitry Andric   auto &FRem = getActionDefinitionsBuilder(G_FREM);
1044e8d8bef9SDimitry Andric   if (ST.has16BitInsts()) {
1045e8d8bef9SDimitry Andric     FRem.customFor({S16, S32, S64});
1046e8d8bef9SDimitry Andric   } else {
1047e8d8bef9SDimitry Andric     FRem.minScalar(0, S32)
1048e8d8bef9SDimitry Andric         .customFor({S32, S64});
1049e8d8bef9SDimitry Andric   }
1050e8d8bef9SDimitry Andric   FRem.scalarize(0);
1051e8d8bef9SDimitry Andric 
10525ffd83dbSDimitry Andric   // TODO: Do we need to clamp maximum bitwidth?
10535ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_TRUNC)
10545ffd83dbSDimitry Andric     .legalIf(isScalar(0))
10555ffd83dbSDimitry Andric     .legalFor({{V2S16, V2S32}})
10565ffd83dbSDimitry Andric     .clampMaxNumElements(0, S16, 2)
10575ffd83dbSDimitry Andric     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
10585ffd83dbSDimitry Andric     // situations (like an invalid implicit use), we don't want to infinite loop
10595ffd83dbSDimitry Andric     // in the legalizer.
10605ffd83dbSDimitry Andric     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
10615ffd83dbSDimitry Andric     .alwaysLegal();
10625ffd83dbSDimitry Andric 
10630b57cec5SDimitry Andric   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
10640b57cec5SDimitry Andric     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
10655ffd83dbSDimitry Andric                {S32, S1}, {S64, S1}, {S16, S1}})
1066480093f4SDimitry Andric     .scalarize(0)
10675ffd83dbSDimitry Andric     .clampScalar(0, S32, S64)
10685ffd83dbSDimitry Andric     .widenScalarToNextPow2(1, 32);
10690b57cec5SDimitry Andric 
10708bcb0991SDimitry Andric   // TODO: Split s1->s64 during regbankselect for VALU.
10718bcb0991SDimitry Andric   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
1072480093f4SDimitry Andric                     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
1073480093f4SDimitry Andric                     .lowerIf(typeIs(1, S1))
1074349cc55cSDimitry Andric                     .customFor({{S32, S64}, {S64, S64}});
10758bcb0991SDimitry Andric   if (ST.has16BitInsts())
10768bcb0991SDimitry Andric     IToFP.legalFor({{S16, S16}});
10778bcb0991SDimitry Andric   IToFP.clampScalar(1, S32, S64)
1078e8d8bef9SDimitry Andric        .minScalar(0, S32)
10795ffd83dbSDimitry Andric        .scalarize(0)
10805ffd83dbSDimitry Andric        .widenScalarToNextPow2(1);
10810b57cec5SDimitry Andric 
10828bcb0991SDimitry Andric   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
10835ffd83dbSDimitry Andric     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
1084fe6060f1SDimitry Andric     .customFor({{S64, S32}, {S64, S64}})
1085e8d8bef9SDimitry Andric     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
10868bcb0991SDimitry Andric   if (ST.has16BitInsts())
10878bcb0991SDimitry Andric     FPToI.legalFor({{S16, S16}});
10888bcb0991SDimitry Andric   else
10898bcb0991SDimitry Andric     FPToI.minScalar(1, S32);
10908bcb0991SDimitry Andric 
10918bcb0991SDimitry Andric   FPToI.minScalar(0, S32)
1092fe6060f1SDimitry Andric        .widenScalarToNextPow2(0, 32)
10935ffd83dbSDimitry Andric        .scalarize(0)
10945ffd83dbSDimitry Andric        .lower();
10950b57cec5SDimitry Andric 
109681ad6265SDimitry Andric   getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
109781ad6265SDimitry Andric       .customFor({S16, S32})
109881ad6265SDimitry Andric       .scalarize(0)
109981ad6265SDimitry Andric       .lower();
110081ad6265SDimitry Andric 
11015f757f3fSDimitry Andric   // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
11025f757f3fSDimitry Andric   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1103480093f4SDimitry Andric       .scalarize(0)
1104480093f4SDimitry Andric       .lower();
11050b57cec5SDimitry Andric 
1106480093f4SDimitry Andric   if (ST.has16BitInsts()) {
11075f757f3fSDimitry Andric     getActionDefinitionsBuilder(
11085f757f3fSDimitry Andric         {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1109480093f4SDimitry Andric         .legalFor({S16, S32, S64})
1110480093f4SDimitry Andric         .clampScalar(0, S16, S64)
1111480093f4SDimitry Andric         .scalarize(0);
1112480093f4SDimitry Andric   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
11135f757f3fSDimitry Andric     getActionDefinitionsBuilder(
11145f757f3fSDimitry Andric         {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
11150b57cec5SDimitry Andric         .legalFor({S32, S64})
11160b57cec5SDimitry Andric         .clampScalar(0, S32, S64)
11170b57cec5SDimitry Andric         .scalarize(0);
11180b57cec5SDimitry Andric   } else {
11195f757f3fSDimitry Andric     getActionDefinitionsBuilder(
11205f757f3fSDimitry Andric         {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
11210b57cec5SDimitry Andric         .legalFor({S32})
11220b57cec5SDimitry Andric         .customFor({S64})
11230b57cec5SDimitry Andric         .clampScalar(0, S32, S64)
11240b57cec5SDimitry Andric         .scalarize(0);
11250b57cec5SDimitry Andric   }
11260b57cec5SDimitry Andric 
1127480093f4SDimitry Andric   getActionDefinitionsBuilder(G_PTR_ADD)
11285f757f3fSDimitry Andric       .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr})
1129e8d8bef9SDimitry Andric       .legalIf(all(isPointer(0), sameSize(0, 1)))
1130e8d8bef9SDimitry Andric       .scalarize(0)
1131e8d8bef9SDimitry Andric       .scalarSameSizeAs(1, 0);
11320b57cec5SDimitry Andric 
11335ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_PTRMASK)
1134e8d8bef9SDimitry Andric     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
1135e8d8bef9SDimitry Andric     .scalarSameSizeAs(1, 0)
11365ffd83dbSDimitry Andric     .scalarize(0);
11370b57cec5SDimitry Andric 
11380b57cec5SDimitry Andric   auto &CmpBuilder =
11390b57cec5SDimitry Andric     getActionDefinitionsBuilder(G_ICMP)
1140480093f4SDimitry Andric     // The compare output type differs based on the register bank of the output,
1141480093f4SDimitry Andric     // so make both s1 and s32 legal.
1142480093f4SDimitry Andric     //
1143480093f4SDimitry Andric     // Scalar compares producing output in scc will be promoted to s32, as that
1144480093f4SDimitry Andric     // is the allocatable register type that will be needed for the copy from
1145480093f4SDimitry Andric     // scc. This will be promoted during RegBankSelect, and we assume something
1146480093f4SDimitry Andric     // before that won't try to use s32 result types.
1147480093f4SDimitry Andric     //
1148480093f4SDimitry Andric     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
1149480093f4SDimitry Andric     // bank.
11500b57cec5SDimitry Andric     .legalForCartesianProduct(
11510b57cec5SDimitry Andric       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1152480093f4SDimitry Andric     .legalForCartesianProduct(
1153480093f4SDimitry Andric       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
11540b57cec5SDimitry Andric   if (ST.has16BitInsts()) {
11550b57cec5SDimitry Andric     CmpBuilder.legalFor({{S1, S16}});
11560b57cec5SDimitry Andric   }
11570b57cec5SDimitry Andric 
11580b57cec5SDimitry Andric   CmpBuilder
11590b57cec5SDimitry Andric     .widenScalarToNextPow2(1)
11600b57cec5SDimitry Andric     .clampScalar(1, S32, S64)
11610b57cec5SDimitry Andric     .scalarize(0)
1162480093f4SDimitry Andric     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
11630b57cec5SDimitry Andric 
11645f757f3fSDimitry Andric   auto &FCmpBuilder =
11655f757f3fSDimitry Andric       getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct(
11665f757f3fSDimitry Andric           {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
11675f757f3fSDimitry Andric 
11685f757f3fSDimitry Andric   if (ST.hasSALUFloatInsts())
11695f757f3fSDimitry Andric     FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});
11705f757f3fSDimitry Andric 
11715f757f3fSDimitry Andric   FCmpBuilder
11720b57cec5SDimitry Andric     .widenScalarToNextPow2(1)
11730b57cec5SDimitry Andric     .clampScalar(1, S32, S64)
11740b57cec5SDimitry Andric     .scalarize(0);
11750b57cec5SDimitry Andric 
11765ffd83dbSDimitry Andric   // FIXME: fpow has a selection pattern that should move to custom lowering.
117706c3fb27SDimitry Andric   auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
11785ffd83dbSDimitry Andric   if (ST.has16BitInsts())
11795ffd83dbSDimitry Andric     ExpOps.customFor({{S32}, {S16}});
11805ffd83dbSDimitry Andric   else
11815ffd83dbSDimitry Andric     ExpOps.customFor({S32});
11825ffd83dbSDimitry Andric   ExpOps.clampScalar(0, MinScalarFPTy, S32)
11830b57cec5SDimitry Andric         .scalarize(0);
11840b57cec5SDimitry Andric 
1185e8d8bef9SDimitry Andric   getActionDefinitionsBuilder(G_FPOWI)
1186e8d8bef9SDimitry Andric     .clampScalar(0, MinScalarFPTy, S32)
1187e8d8bef9SDimitry Andric     .lower();
1188e8d8bef9SDimitry Andric 
118906c3fb27SDimitry Andric   auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2});
119006c3fb27SDimitry Andric   Log2Ops.customFor({S32});
119106c3fb27SDimitry Andric   if (ST.has16BitInsts())
119206c3fb27SDimitry Andric     Log2Ops.legalFor({S16});
119306c3fb27SDimitry Andric   else
119406c3fb27SDimitry Andric     Log2Ops.customFor({S16});
119506c3fb27SDimitry Andric   Log2Ops.scalarize(0)
119606c3fb27SDimitry Andric     .lower();
119706c3fb27SDimitry Andric 
11985f757f3fSDimitry Andric   auto &LogOps =
11995f757f3fSDimitry Andric       getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
120006c3fb27SDimitry Andric   LogOps.customFor({S32, S16});
120106c3fb27SDimitry Andric   LogOps.clampScalar(0, MinScalarFPTy, S32)
120206c3fb27SDimitry Andric         .scalarize(0);
120306c3fb27SDimitry Andric 
12040b57cec5SDimitry Andric   // The 64-bit versions produce 32-bit results, but only on the SALU.
12055ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_CTPOP)
12060b57cec5SDimitry Andric     .legalFor({{S32, S32}, {S32, S64}})
12070b57cec5SDimitry Andric     .clampScalar(0, S32, S32)
120804eeddc0SDimitry Andric     .widenScalarToNextPow2(1, 32)
12090b57cec5SDimitry Andric     .clampScalar(1, S32, S64)
12100b57cec5SDimitry Andric     .scalarize(0)
121104eeddc0SDimitry Andric     .widenScalarToNextPow2(0, 32);
121204eeddc0SDimitry Andric 
1213bdd1243dSDimitry Andric   // If no 16 bit instr is available, lower into different instructions.
1214bdd1243dSDimitry Andric   if (ST.has16BitInsts())
1215bdd1243dSDimitry Andric     getActionDefinitionsBuilder(G_IS_FPCLASS)
1216bdd1243dSDimitry Andric         .legalForCartesianProduct({S1}, FPTypes16)
1217bdd1243dSDimitry Andric         .widenScalarToNextPow2(1)
1218bdd1243dSDimitry Andric         .scalarize(0)
1219bdd1243dSDimitry Andric         .lower();
1220bdd1243dSDimitry Andric   else
1221bdd1243dSDimitry Andric     getActionDefinitionsBuilder(G_IS_FPCLASS)
1222bdd1243dSDimitry Andric         .legalForCartesianProduct({S1}, FPTypesBase)
1223bdd1243dSDimitry Andric         .lowerFor({S1, S16})
1224bdd1243dSDimitry Andric         .widenScalarToNextPow2(1)
1225bdd1243dSDimitry Andric         .scalarize(0)
1226bdd1243dSDimitry Andric         .lower();
12270b57cec5SDimitry Andric 
12285ffd83dbSDimitry Andric   // The hardware instructions return a different result on 0 than the generic
12295ffd83dbSDimitry Andric   // instructions expect. The hardware produces -1, but these produce the
12305ffd83dbSDimitry Andric   // bitwidth.
12315ffd83dbSDimitry Andric   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
12325ffd83dbSDimitry Andric     .scalarize(0)
12335ffd83dbSDimitry Andric     .clampScalar(0, S32, S32)
12345ffd83dbSDimitry Andric     .clampScalar(1, S32, S64)
12355ffd83dbSDimitry Andric     .widenScalarToNextPow2(0, 32)
12365ffd83dbSDimitry Andric     .widenScalarToNextPow2(1, 32)
1237349cc55cSDimitry Andric     .custom();
12385ffd83dbSDimitry Andric 
12395ffd83dbSDimitry Andric   // The 64-bit versions produce 32-bit results, but only on the SALU.
12405ffd83dbSDimitry Andric   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
12415ffd83dbSDimitry Andric     .legalFor({{S32, S32}, {S32, S64}})
12425ffd83dbSDimitry Andric     .clampScalar(0, S32, S32)
12435ffd83dbSDimitry Andric     .clampScalar(1, S32, S64)
12445ffd83dbSDimitry Andric     .scalarize(0)
12455ffd83dbSDimitry Andric     .widenScalarToNextPow2(0, 32)
12465ffd83dbSDimitry Andric     .widenScalarToNextPow2(1, 32);
12475ffd83dbSDimitry Andric 
1248fe6060f1SDimitry Andric   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1249fe6060f1SDimitry Andric   // RegBankSelect.
12505ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_BITREVERSE)
1251fe6060f1SDimitry Andric     .legalFor({S32, S64})
1252fe6060f1SDimitry Andric     .clampScalar(0, S32, S64)
1253fe6060f1SDimitry Andric     .scalarize(0)
1254fe6060f1SDimitry Andric     .widenScalarToNextPow2(0);
12550b57cec5SDimitry Andric 
12560b57cec5SDimitry Andric   if (ST.has16BitInsts()) {
12575ffd83dbSDimitry Andric     getActionDefinitionsBuilder(G_BSWAP)
12585ffd83dbSDimitry Andric       .legalFor({S16, S32, V2S16})
12590eae32dcSDimitry Andric       .clampMaxNumElementsStrict(0, S16, 2)
12605ffd83dbSDimitry Andric       // FIXME: Fixing non-power-of-2 before clamp is workaround for
12615ffd83dbSDimitry Andric       // narrowScalar limitation.
12625ffd83dbSDimitry Andric       .widenScalarToNextPow2(0)
12635ffd83dbSDimitry Andric       .clampScalar(0, S16, S32)
12645ffd83dbSDimitry Andric       .scalarize(0);
12655ffd83dbSDimitry Andric 
12660b57cec5SDimitry Andric     if (ST.hasVOP3PInsts()) {
1267fe6060f1SDimitry Andric       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
12680b57cec5SDimitry Andric         .legalFor({S32, S16, V2S16})
12690b57cec5SDimitry Andric         .clampMaxNumElements(0, S16, 2)
12705ffd83dbSDimitry Andric         .minScalar(0, S16)
12710b57cec5SDimitry Andric         .widenScalarToNextPow2(0)
12725ffd83dbSDimitry Andric         .scalarize(0)
12735ffd83dbSDimitry Andric         .lower();
12740b57cec5SDimitry Andric     } else {
1275fe6060f1SDimitry Andric       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
12760b57cec5SDimitry Andric         .legalFor({S32, S16})
12770b57cec5SDimitry Andric         .widenScalarToNextPow2(0)
12785ffd83dbSDimitry Andric         .minScalar(0, S16)
12795ffd83dbSDimitry Andric         .scalarize(0)
12805ffd83dbSDimitry Andric         .lower();
12810b57cec5SDimitry Andric     }
12820b57cec5SDimitry Andric   } else {
12835ffd83dbSDimitry Andric     // TODO: Should have same legality without v_perm_b32
12845ffd83dbSDimitry Andric     getActionDefinitionsBuilder(G_BSWAP)
12855ffd83dbSDimitry Andric       .legalFor({S32})
12865ffd83dbSDimitry Andric       .lowerIf(scalarNarrowerThan(0, 32))
12875ffd83dbSDimitry Andric       // FIXME: Fixing non-power-of-2 before clamp is workaround for
12885ffd83dbSDimitry Andric       // narrowScalar limitation.
12895ffd83dbSDimitry Andric       .widenScalarToNextPow2(0)
12905ffd83dbSDimitry Andric       .maxScalar(0, S32)
12915ffd83dbSDimitry Andric       .scalarize(0)
12925ffd83dbSDimitry Andric       .lower();
12935ffd83dbSDimitry Andric 
1294fe6060f1SDimitry Andric     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
12950b57cec5SDimitry Andric       .legalFor({S32})
12965ffd83dbSDimitry Andric       .minScalar(0, S32)
12970b57cec5SDimitry Andric       .widenScalarToNextPow2(0)
12985ffd83dbSDimitry Andric       .scalarize(0)
12995ffd83dbSDimitry Andric       .lower();
13000b57cec5SDimitry Andric   }
13010b57cec5SDimitry Andric 
13020b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_INTTOPTR)
13030b57cec5SDimitry Andric       // List the common cases
13040b57cec5SDimitry Andric       .legalForCartesianProduct(AddrSpaces64, {S64})
13050b57cec5SDimitry Andric       .legalForCartesianProduct(AddrSpaces32, {S32})
13060b57cec5SDimitry Andric       .scalarize(0)
13070b57cec5SDimitry Andric       // Accept any address space as long as the size matches
13080b57cec5SDimitry Andric       .legalIf(sameSize(0, 1))
13090b57cec5SDimitry Andric       .widenScalarIf(smallerThan(1, 0),
13100b57cec5SDimitry Andric                      [](const LegalityQuery &Query) {
1311bdd1243dSDimitry Andric                        return std::pair(
1312bdd1243dSDimitry Andric                            1, LLT::scalar(Query.Types[0].getSizeInBits()));
13130b57cec5SDimitry Andric                      })
1314bdd1243dSDimitry Andric       .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) {
1315bdd1243dSDimitry Andric         return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
13160b57cec5SDimitry Andric       });
13170b57cec5SDimitry Andric 
13180b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_PTRTOINT)
13190b57cec5SDimitry Andric       // List the common cases
13200b57cec5SDimitry Andric       .legalForCartesianProduct(AddrSpaces64, {S64})
13210b57cec5SDimitry Andric       .legalForCartesianProduct(AddrSpaces32, {S32})
13220b57cec5SDimitry Andric       .scalarize(0)
13230b57cec5SDimitry Andric       // Accept any address space as long as the size matches
13240b57cec5SDimitry Andric       .legalIf(sameSize(0, 1))
13250b57cec5SDimitry Andric       .widenScalarIf(smallerThan(0, 1),
13260b57cec5SDimitry Andric                      [](const LegalityQuery &Query) {
1327bdd1243dSDimitry Andric                        return std::pair(
1328bdd1243dSDimitry Andric                            0, LLT::scalar(Query.Types[1].getSizeInBits()));
13290b57cec5SDimitry Andric                      })
1330bdd1243dSDimitry Andric       .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
1331bdd1243dSDimitry Andric         return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
13320b57cec5SDimitry Andric       });
13330b57cec5SDimitry Andric 
13340b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
13350b57cec5SDimitry Andric     .scalarize(0)
13360b57cec5SDimitry Andric     .custom();
13370b57cec5SDimitry Andric 
13385ffd83dbSDimitry Andric   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
13395ffd83dbSDimitry Andric                                     bool IsLoad) -> bool {
13408bcb0991SDimitry Andric     const LLT DstTy = Query.Types[0];
13418bcb0991SDimitry Andric 
13428bcb0991SDimitry Andric     // Split vector extloads.
1343fe6060f1SDimitry Andric     unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1344480093f4SDimitry Andric 
13458bcb0991SDimitry Andric     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
13468bcb0991SDimitry Andric       return true;
13478bcb0991SDimitry Andric 
13488bcb0991SDimitry Andric     const LLT PtrTy = Query.Types[1];
13498bcb0991SDimitry Andric     unsigned AS = PtrTy.getAddressSpace();
135006c3fb27SDimitry Andric     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
135106c3fb27SDimitry Andric                                       Query.MMODescrs[0].Ordering !=
135206c3fb27SDimitry Andric                                           AtomicOrdering::NotAtomic))
13538bcb0991SDimitry Andric       return true;
13548bcb0991SDimitry Andric 
13558bcb0991SDimitry Andric     // Catch weird sized loads that don't evenly divide into the access sizes
13568bcb0991SDimitry Andric     // TODO: May be able to widen depending on alignment etc.
13575ffd83dbSDimitry Andric     unsigned NumRegs = (MemSize + 31) / 32;
13585ffd83dbSDimitry Andric     if (NumRegs == 3) {
13595ffd83dbSDimitry Andric       if (!ST.hasDwordx3LoadStores())
13608bcb0991SDimitry Andric         return true;
13615ffd83dbSDimitry Andric     } else {
13625ffd83dbSDimitry Andric       // If the alignment allows, these should have been widened.
13635ffd83dbSDimitry Andric       if (!isPowerOf2_32(NumRegs))
13645ffd83dbSDimitry Andric         return true;
13655ffd83dbSDimitry Andric     }
13668bcb0991SDimitry Andric 
13678bcb0991SDimitry Andric     return false;
13688bcb0991SDimitry Andric   };
13698bcb0991SDimitry Andric 
1370e8d8bef9SDimitry Andric   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1371e8d8bef9SDimitry Andric   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1372e8d8bef9SDimitry Andric   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
13738bcb0991SDimitry Andric 
13748bcb0991SDimitry Andric   // TODO: Refine based on subtargets which support unaligned access or 128-bit
13758bcb0991SDimitry Andric   // LDS
13768bcb0991SDimitry Andric   // TODO: Unsupported flat for SI.
13778bcb0991SDimitry Andric 
13788bcb0991SDimitry Andric   for (unsigned Op : {G_LOAD, G_STORE}) {
13798bcb0991SDimitry Andric     const bool IsStore = Op == G_STORE;
13808bcb0991SDimitry Andric 
13818bcb0991SDimitry Andric     auto &Actions = getActionDefinitionsBuilder(Op);
13825ffd83dbSDimitry Andric     // Explicitly list some common cases.
13835ffd83dbSDimitry Andric     // TODO: Does this help compile time at all?
1384fe6060f1SDimitry Andric     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1385fe6060f1SDimitry Andric                                       {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1386fe6060f1SDimitry Andric                                       {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1387fe6060f1SDimitry Andric                                       {S64, GlobalPtr, S64, GlobalAlign32},
1388fe6060f1SDimitry Andric                                       {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1389fe6060f1SDimitry Andric                                       {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1390fe6060f1SDimitry Andric                                       {S32, GlobalPtr, S8, GlobalAlign8},
1391fe6060f1SDimitry Andric                                       {S32, GlobalPtr, S16, GlobalAlign16},
13928bcb0991SDimitry Andric 
1393fe6060f1SDimitry Andric                                       {S32, LocalPtr, S32, 32},
1394fe6060f1SDimitry Andric                                       {S64, LocalPtr, S64, 32},
1395fe6060f1SDimitry Andric                                       {V2S32, LocalPtr, V2S32, 32},
1396fe6060f1SDimitry Andric                                       {S32, LocalPtr, S8, 8},
1397fe6060f1SDimitry Andric                                       {S32, LocalPtr, S16, 16},
1398fe6060f1SDimitry Andric                                       {V2S16, LocalPtr, S32, 32},
13998bcb0991SDimitry Andric 
1400fe6060f1SDimitry Andric                                       {S32, PrivatePtr, S32, 32},
1401fe6060f1SDimitry Andric                                       {S32, PrivatePtr, S8, 8},
1402fe6060f1SDimitry Andric                                       {S32, PrivatePtr, S16, 16},
1403fe6060f1SDimitry Andric                                       {V2S16, PrivatePtr, S32, 32},
14048bcb0991SDimitry Andric 
1405fe6060f1SDimitry Andric                                       {S32, ConstantPtr, S32, GlobalAlign32},
1406fe6060f1SDimitry Andric                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1407fe6060f1SDimitry Andric                                       {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1408fe6060f1SDimitry Andric                                       {S64, ConstantPtr, S64, GlobalAlign32},
1409fe6060f1SDimitry Andric                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
14105ffd83dbSDimitry Andric     Actions.legalIf(
14115ffd83dbSDimitry Andric       [=](const LegalityQuery &Query) -> bool {
1412fe6060f1SDimitry Andric         return isLoadStoreLegal(ST, Query);
14135ffd83dbSDimitry Andric       });
14145ffd83dbSDimitry Andric 
141506c3fb27SDimitry Andric     // The custom pointers (fat pointers, buffer resources) don't work with load
141606c3fb27SDimitry Andric     // and store at this level. Fat pointers should have been lowered to
141706c3fb27SDimitry Andric     // intrinsics before the translation to MIR.
14185f757f3fSDimitry Andric     Actions.unsupportedIf(
14195f757f3fSDimitry Andric         typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
142006c3fb27SDimitry Andric 
142106c3fb27SDimitry Andric     // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
142206c3fb27SDimitry Andric     // ptrtoint. This is needed to account for the fact that we can't have i128
142306c3fb27SDimitry Andric     // as a register class for SelectionDAG reasons.
142406c3fb27SDimitry Andric     Actions.customIf([=](const LegalityQuery &Query) -> bool {
142506c3fb27SDimitry Andric       return hasBufferRsrcWorkaround(Query.Types[0]);
142606c3fb27SDimitry Andric     });
142706c3fb27SDimitry Andric 
14285ffd83dbSDimitry Andric     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
14295ffd83dbSDimitry Andric     // 64-bits.
14305ffd83dbSDimitry Andric     //
14315ffd83dbSDimitry Andric     // TODO: Should generalize bitcast action into coerce, which will also cover
14325ffd83dbSDimitry Andric     // inserting addrspacecasts.
14335ffd83dbSDimitry Andric     Actions.customIf(typeIs(1, Constant32Ptr));
14345ffd83dbSDimitry Andric 
14355ffd83dbSDimitry Andric     // Turn any illegal element vectors into something easier to deal
14365ffd83dbSDimitry Andric     // with. These will ultimately produce 32-bit scalar shifts to extract the
14375ffd83dbSDimitry Andric     // parts anyway.
14385ffd83dbSDimitry Andric     //
14395ffd83dbSDimitry Andric     // For odd 16-bit element vectors, prefer to split those into pieces with
14405ffd83dbSDimitry Andric     // 16-bit vector parts.
14415ffd83dbSDimitry Andric     Actions.bitcastIf(
14425ffd83dbSDimitry Andric       [=](const LegalityQuery &Query) -> bool {
1443e8d8bef9SDimitry Andric         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1444fe6060f1SDimitry Andric                                           Query.MMODescrs[0].MemoryTy);
14455ffd83dbSDimitry Andric       }, bitcastToRegisterType(0));
14465ffd83dbSDimitry Andric 
1447e8d8bef9SDimitry Andric     if (!IsStore) {
1448e8d8bef9SDimitry Andric       // Widen suitably aligned loads by loading extra bytes. The standard
1449e8d8bef9SDimitry Andric       // legalization actions can't properly express widening memory operands.
1450e8d8bef9SDimitry Andric       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1451e8d8bef9SDimitry Andric         return shouldWidenLoad(ST, Query, G_LOAD);
1452e8d8bef9SDimitry Andric       });
1453e8d8bef9SDimitry Andric     }
1454e8d8bef9SDimitry Andric 
1455e8d8bef9SDimitry Andric     // FIXME: load/store narrowing should be moved to lower action
14568bcb0991SDimitry Andric     Actions
14578bcb0991SDimitry Andric         .narrowScalarIf(
14588bcb0991SDimitry Andric             [=](const LegalityQuery &Query) -> bool {
14595ffd83dbSDimitry Andric               return !Query.Types[0].isVector() &&
14605ffd83dbSDimitry Andric                      needToSplitMemOp(Query, Op == G_LOAD);
14618bcb0991SDimitry Andric             },
14628bcb0991SDimitry Andric             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
14638bcb0991SDimitry Andric               const LLT DstTy = Query.Types[0];
14648bcb0991SDimitry Andric               const LLT PtrTy = Query.Types[1];
14658bcb0991SDimitry Andric 
14668bcb0991SDimitry Andric               const unsigned DstSize = DstTy.getSizeInBits();
1467fe6060f1SDimitry Andric               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
14688bcb0991SDimitry Andric 
14698bcb0991SDimitry Andric               // Split extloads.
14708bcb0991SDimitry Andric               if (DstSize > MemSize)
1471bdd1243dSDimitry Andric                 return std::pair(0, LLT::scalar(MemSize));
14728bcb0991SDimitry Andric 
147306c3fb27SDimitry Andric               unsigned MaxSize = maxSizeForAddrSpace(
147406c3fb27SDimitry Andric                   ST, PtrTy.getAddressSpace(), Op == G_LOAD,
147506c3fb27SDimitry Andric                   Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
14768bcb0991SDimitry Andric               if (MemSize > MaxSize)
1477bdd1243dSDimitry Andric                 return std::pair(0, LLT::scalar(MaxSize));
14788bcb0991SDimitry Andric 
147904eeddc0SDimitry Andric               uint64_t Align = Query.MMODescrs[0].AlignInBits;
1480bdd1243dSDimitry Andric               return std::pair(0, LLT::scalar(Align));
14818bcb0991SDimitry Andric             })
14828bcb0991SDimitry Andric         .fewerElementsIf(
14838bcb0991SDimitry Andric             [=](const LegalityQuery &Query) -> bool {
14845ffd83dbSDimitry Andric               return Query.Types[0].isVector() &&
14855ffd83dbSDimitry Andric                      needToSplitMemOp(Query, Op == G_LOAD);
14868bcb0991SDimitry Andric             },
14878bcb0991SDimitry Andric             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
14888bcb0991SDimitry Andric               const LLT DstTy = Query.Types[0];
14898bcb0991SDimitry Andric               const LLT PtrTy = Query.Types[1];
14908bcb0991SDimitry Andric 
14918bcb0991SDimitry Andric               LLT EltTy = DstTy.getElementType();
149206c3fb27SDimitry Andric               unsigned MaxSize = maxSizeForAddrSpace(
149306c3fb27SDimitry Andric                   ST, PtrTy.getAddressSpace(), Op == G_LOAD,
149406c3fb27SDimitry Andric                   Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
14955ffd83dbSDimitry Andric 
14965ffd83dbSDimitry Andric               // FIXME: Handle widened to power of 2 results better. This ends
14975ffd83dbSDimitry Andric               // up scalarizing.
14985ffd83dbSDimitry Andric               // FIXME: 3 element stores scalarized on SI
14998bcb0991SDimitry Andric 
15008bcb0991SDimitry Andric               // Split if it's too large for the address space.
1501fe6060f1SDimitry Andric               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1502fe6060f1SDimitry Andric               if (MemSize > MaxSize) {
15038bcb0991SDimitry Andric                 unsigned NumElts = DstTy.getNumElements();
15045ffd83dbSDimitry Andric                 unsigned EltSize = EltTy.getSizeInBits();
15055ffd83dbSDimitry Andric 
15065ffd83dbSDimitry Andric                 if (MaxSize % EltSize == 0) {
1507bdd1243dSDimitry Andric                   return std::pair(
1508fe6060f1SDimitry Andric                       0, LLT::scalarOrVector(
1509fe6060f1SDimitry Andric                              ElementCount::getFixed(MaxSize / EltSize), EltTy));
15105ffd83dbSDimitry Andric                 }
15115ffd83dbSDimitry Andric 
1512fe6060f1SDimitry Andric                 unsigned NumPieces = MemSize / MaxSize;
15138bcb0991SDimitry Andric 
15148bcb0991SDimitry Andric                 // FIXME: Refine when odd breakdowns handled
15158bcb0991SDimitry Andric                 // The scalars will need to be re-legalized.
15168bcb0991SDimitry Andric                 if (NumPieces == 1 || NumPieces >= NumElts ||
15178bcb0991SDimitry Andric                     NumElts % NumPieces != 0)
1518bdd1243dSDimitry Andric                   return std::pair(0, EltTy);
15198bcb0991SDimitry Andric 
1520bdd1243dSDimitry Andric                 return std::pair(0,
1521bdd1243dSDimitry Andric                                  LLT::fixed_vector(NumElts / NumPieces, EltTy));
15228bcb0991SDimitry Andric               }
15238bcb0991SDimitry Andric 
15245ffd83dbSDimitry Andric               // FIXME: We could probably handle weird extending loads better.
15255ffd83dbSDimitry Andric               if (DstTy.getSizeInBits() > MemSize)
1526bdd1243dSDimitry Andric                 return std::pair(0, EltTy);
15275ffd83dbSDimitry Andric 
15285ffd83dbSDimitry Andric               unsigned EltSize = EltTy.getSizeInBits();
15295ffd83dbSDimitry Andric               unsigned DstSize = DstTy.getSizeInBits();
15305ffd83dbSDimitry Andric               if (!isPowerOf2_32(DstSize)) {
15315ffd83dbSDimitry Andric                 // We're probably decomposing an odd sized store. Try to split
15325ffd83dbSDimitry Andric                 // to the widest type. TODO: Account for alignment. As-is it
15335ffd83dbSDimitry Andric                 // should be OK, since the new parts will be further legalized.
153406c3fb27SDimitry Andric                 unsigned FloorSize = llvm::bit_floor(DstSize);
1535bdd1243dSDimitry Andric                 return std::pair(
1536fe6060f1SDimitry Andric                     0, LLT::scalarOrVector(
1537fe6060f1SDimitry Andric                            ElementCount::getFixed(FloorSize / EltSize), EltTy));
15385ffd83dbSDimitry Andric               }
15395ffd83dbSDimitry Andric 
15408bcb0991SDimitry Andric               // May need relegalization for the scalars.
1541bdd1243dSDimitry Andric               return std::pair(0, EltTy);
15428bcb0991SDimitry Andric             })
1543fe6060f1SDimitry Andric     .minScalar(0, S32)
1544fe6060f1SDimitry Andric     .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
15458bcb0991SDimitry Andric     .widenScalarToNextPow2(0)
1546e8d8bef9SDimitry Andric     .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1547e8d8bef9SDimitry Andric     .lower();
15488bcb0991SDimitry Andric   }
15490b57cec5SDimitry Andric 
1550fe6060f1SDimitry Andric   // FIXME: Unaligned accesses not lowered.
15510b57cec5SDimitry Andric   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1552fe6060f1SDimitry Andric                        .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1553fe6060f1SDimitry Andric                                                   {S32, GlobalPtr, S16, 2 * 8},
1554fe6060f1SDimitry Andric                                                   {S32, LocalPtr, S8, 8},
1555fe6060f1SDimitry Andric                                                   {S32, LocalPtr, S16, 16},
1556fe6060f1SDimitry Andric                                                   {S32, PrivatePtr, S8, 8},
1557fe6060f1SDimitry Andric                                                   {S32, PrivatePtr, S16, 16},
1558fe6060f1SDimitry Andric                                                   {S32, ConstantPtr, S8, 8},
1559fe6060f1SDimitry Andric                                                   {S32, ConstantPtr, S16, 2 * 8}})
1560fe6060f1SDimitry Andric                        .legalIf(
1561fe6060f1SDimitry Andric                          [=](const LegalityQuery &Query) -> bool {
1562fe6060f1SDimitry Andric                            return isLoadStoreLegal(ST, Query);
1563fe6060f1SDimitry Andric                          });
1564fe6060f1SDimitry Andric 
15650b57cec5SDimitry Andric   if (ST.hasFlatAddressSpace()) {
15668bcb0991SDimitry Andric     ExtLoads.legalForTypesWithMemDesc(
1567fe6060f1SDimitry Andric         {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
15680b57cec5SDimitry Andric   }
15690b57cec5SDimitry Andric 
1570fe6060f1SDimitry Andric   // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1571fe6060f1SDimitry Andric   // 64-bits.
1572fe6060f1SDimitry Andric   //
1573fe6060f1SDimitry Andric   // TODO: Should generalize bitcast action into coerce, which will also cover
1574fe6060f1SDimitry Andric   // inserting addrspacecasts.
1575fe6060f1SDimitry Andric   ExtLoads.customIf(typeIs(1, Constant32Ptr));
1576fe6060f1SDimitry Andric 
15770b57cec5SDimitry Andric   ExtLoads.clampScalar(0, S32, S32)
15780b57cec5SDimitry Andric           .widenScalarToNextPow2(0)
15790b57cec5SDimitry Andric           .lower();
15800b57cec5SDimitry Andric 
15810b57cec5SDimitry Andric   auto &Atomics = getActionDefinitionsBuilder(
15820b57cec5SDimitry Andric     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
15830b57cec5SDimitry Andric      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
15840b57cec5SDimitry Andric      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
158506c3fb27SDimitry Andric      G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
15860b57cec5SDimitry Andric     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1587e8d8bef9SDimitry Andric                {S64, GlobalPtr}, {S64, LocalPtr},
1588e8d8bef9SDimitry Andric                {S32, RegionPtr}, {S64, RegionPtr}});
15890b57cec5SDimitry Andric   if (ST.hasFlatAddressSpace()) {
15900b57cec5SDimitry Andric     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
15910b57cec5SDimitry Andric   }
15920b57cec5SDimitry Andric 
1593fe6060f1SDimitry Andric   auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1594349cc55cSDimitry Andric   if (ST.hasLDSFPAtomicAdd()) {
1595fe6060f1SDimitry Andric     Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1596fe6060f1SDimitry Andric     if (ST.hasGFX90AInsts())
1597fe6060f1SDimitry Andric       Atomic.legalFor({{S64, LocalPtr}});
159806c3fb27SDimitry Andric     if (ST.hasAtomicDsPkAdd16Insts())
159981ad6265SDimitry Andric       Atomic.legalFor({{V2S16, LocalPtr}});
16005ffd83dbSDimitry Andric   }
1601fe6060f1SDimitry Andric   if (ST.hasAtomicFaddInsts())
1602fe6060f1SDimitry Andric     Atomic.legalFor({{S32, GlobalPtr}});
1603bdd1243dSDimitry Andric   if (ST.hasFlatAtomicFaddF32Inst())
1604bdd1243dSDimitry Andric     Atomic.legalFor({{S32, FlatPtr}});
16058bcb0991SDimitry Andric 
160604eeddc0SDimitry Andric   if (ST.hasGFX90AInsts()) {
160704eeddc0SDimitry Andric     // These are legal with some caveats, and should have undergone expansion in
160804eeddc0SDimitry Andric     // the IR in most situations
160904eeddc0SDimitry Andric     // TODO: Move atomic expansion into legalizer
161004eeddc0SDimitry Andric     Atomic.legalFor({
161104eeddc0SDimitry Andric         {S32, GlobalPtr},
161204eeddc0SDimitry Andric         {S64, GlobalPtr},
161304eeddc0SDimitry Andric         {S64, FlatPtr}
161404eeddc0SDimitry Andric       });
161504eeddc0SDimitry Andric   }
161604eeddc0SDimitry Andric 
1617480093f4SDimitry Andric   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1618480093f4SDimitry Andric   // demarshalling
1619480093f4SDimitry Andric   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1620480093f4SDimitry Andric     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1621480093f4SDimitry Andric                 {S32, FlatPtr}, {S64, FlatPtr}})
1622480093f4SDimitry Andric     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1623480093f4SDimitry Andric                {S32, RegionPtr}, {S64, RegionPtr}});
16240b57cec5SDimitry Andric   // TODO: Pointer types, any 32-bit or 64-bit vector
1625480093f4SDimitry Andric 
1626480093f4SDimitry Andric   // Condition should be s32 for scalar, s1 for vector.
16270b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_SELECT)
1628fe6060f1SDimitry Andric       .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1629fe6060f1SDimitry Andric                                  LocalPtr, FlatPtr, PrivatePtr,
1630fe6060f1SDimitry Andric                                  LLT::fixed_vector(2, LocalPtr),
1631fe6060f1SDimitry Andric                                  LLT::fixed_vector(2, PrivatePtr)},
1632fe6060f1SDimitry Andric                                 {S1, S32})
16330b57cec5SDimitry Andric       .clampScalar(0, S16, S64)
16345ffd83dbSDimitry Andric       .scalarize(1)
16350b57cec5SDimitry Andric       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
16360b57cec5SDimitry Andric       .fewerElementsIf(numElementsNotEven(0), scalarize(0))
16370b57cec5SDimitry Andric       .clampMaxNumElements(0, S32, 2)
16380b57cec5SDimitry Andric       .clampMaxNumElements(0, LocalPtr, 2)
16390b57cec5SDimitry Andric       .clampMaxNumElements(0, PrivatePtr, 2)
16400b57cec5SDimitry Andric       .scalarize(0)
16410b57cec5SDimitry Andric       .widenScalarToNextPow2(0)
1642480093f4SDimitry Andric       .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
16430b57cec5SDimitry Andric 
16440b57cec5SDimitry Andric   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
16450b57cec5SDimitry Andric   // be more flexible with the shift amount type.
16460b57cec5SDimitry Andric   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
16470b57cec5SDimitry Andric     .legalFor({{S32, S32}, {S64, S32}});
16480b57cec5SDimitry Andric   if (ST.has16BitInsts()) {
16490b57cec5SDimitry Andric     if (ST.hasVOP3PInsts()) {
16505ffd83dbSDimitry Andric       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
16510b57cec5SDimitry Andric             .clampMaxNumElements(0, S16, 2);
16520b57cec5SDimitry Andric     } else
16535ffd83dbSDimitry Andric       Shifts.legalFor({{S16, S16}});
16540b57cec5SDimitry Andric 
16555ffd83dbSDimitry Andric     // TODO: Support 16-bit shift amounts for all types
16565ffd83dbSDimitry Andric     Shifts.widenScalarIf(
16575ffd83dbSDimitry Andric       [=](const LegalityQuery &Query) {
16585ffd83dbSDimitry Andric         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
16595ffd83dbSDimitry Andric         // 32-bit amount.
16605ffd83dbSDimitry Andric         const LLT ValTy = Query.Types[0];
16615ffd83dbSDimitry Andric         const LLT AmountTy = Query.Types[1];
16625ffd83dbSDimitry Andric         return ValTy.getSizeInBits() <= 16 &&
16635ffd83dbSDimitry Andric                AmountTy.getSizeInBits() < 16;
16645ffd83dbSDimitry Andric       }, changeTo(1, S16));
16655ffd83dbSDimitry Andric     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1666480093f4SDimitry Andric     Shifts.clampScalar(1, S32, S32);
16670b57cec5SDimitry Andric     Shifts.widenScalarToNextPow2(0, 16);
166804eeddc0SDimitry Andric     Shifts.clampScalar(0, S16, S64);
1669e8d8bef9SDimitry Andric 
1670e8d8bef9SDimitry Andric     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1671e8d8bef9SDimitry Andric       .minScalar(0, S16)
1672e8d8bef9SDimitry Andric       .scalarize(0)
1673e8d8bef9SDimitry Andric       .lower();
16740b57cec5SDimitry Andric   } else {
16750b57cec5SDimitry Andric     // Make sure we legalize the shift amount type first, as the general
16760b57cec5SDimitry Andric     // expansion for the shifted type will produce much worse code if it hasn't
16770b57cec5SDimitry Andric     // been truncated already.
16780b57cec5SDimitry Andric     Shifts.clampScalar(1, S32, S32);
16790b57cec5SDimitry Andric     Shifts.widenScalarToNextPow2(0, 32);
168004eeddc0SDimitry Andric     Shifts.clampScalar(0, S32, S64);
1681e8d8bef9SDimitry Andric 
1682e8d8bef9SDimitry Andric     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1683e8d8bef9SDimitry Andric       .minScalar(0, S32)
1684e8d8bef9SDimitry Andric       .scalarize(0)
1685e8d8bef9SDimitry Andric       .lower();
16860b57cec5SDimitry Andric   }
16870b57cec5SDimitry Andric   Shifts.scalarize(0);
16880b57cec5SDimitry Andric 
16890b57cec5SDimitry Andric   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
16900b57cec5SDimitry Andric     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
16910b57cec5SDimitry Andric     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
16920b57cec5SDimitry Andric     unsigned IdxTypeIdx = 2;
16930b57cec5SDimitry Andric 
16940b57cec5SDimitry Andric     getActionDefinitionsBuilder(Op)
16950b57cec5SDimitry Andric       .customIf([=](const LegalityQuery &Query) {
16960b57cec5SDimitry Andric           const LLT EltTy = Query.Types[EltTypeIdx];
16970b57cec5SDimitry Andric           const LLT VecTy = Query.Types[VecTypeIdx];
16980b57cec5SDimitry Andric           const LLT IdxTy = Query.Types[IdxTypeIdx];
1699e8d8bef9SDimitry Andric           const unsigned EltSize = EltTy.getSizeInBits();
170006c3fb27SDimitry Andric           const bool isLegalVecType =
170106c3fb27SDimitry Andric               !!SIRegisterInfo::getSGPRClassForBitWidth(VecTy.getSizeInBits());
170206c3fb27SDimitry Andric           // Address space 8 pointers are 128-bit wide values, but the logic
170306c3fb27SDimitry Andric           // below will try to bitcast them to 2N x s64, which will fail.
170406c3fb27SDimitry Andric           // Therefore, as an intermediate step, wrap extracts/insertions from a
170506c3fb27SDimitry Andric           // ptrtoint-ing the vector and scalar arguments (or inttoptring the
170606c3fb27SDimitry Andric           // extraction result) in order to produce a vector operation that can
170706c3fb27SDimitry Andric           // be handled by the logic below.
170806c3fb27SDimitry Andric           if (EltTy.isPointer() && EltSize > 64)
170906c3fb27SDimitry Andric             return true;
1710e8d8bef9SDimitry Andric           return (EltSize == 32 || EltSize == 64) &&
17110b57cec5SDimitry Andric                   VecTy.getSizeInBits() % 32 == 0 &&
17125ffd83dbSDimitry Andric                   VecTy.getSizeInBits() <= MaxRegisterSize &&
171306c3fb27SDimitry Andric                   IdxTy.getSizeInBits() == 32 &&
171406c3fb27SDimitry Andric                   isLegalVecType;
17150b57cec5SDimitry Andric         })
1716e8d8bef9SDimitry Andric       .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1717e8d8bef9SDimitry Andric                  bitcastToVectorElement32(VecTypeIdx))
1718e8d8bef9SDimitry Andric       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1719e8d8bef9SDimitry Andric       .bitcastIf(
1720e8d8bef9SDimitry Andric         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1721e8d8bef9SDimitry Andric         [=](const LegalityQuery &Query) {
1722e8d8bef9SDimitry Andric           // For > 64-bit element types, try to turn this into a 64-bit
1723e8d8bef9SDimitry Andric           // element vector since we may be able to do better indexing
1724e8d8bef9SDimitry Andric           // if this is scalar. If not, fall back to 32.
1725e8d8bef9SDimitry Andric           const LLT EltTy = Query.Types[EltTypeIdx];
1726e8d8bef9SDimitry Andric           const LLT VecTy = Query.Types[VecTypeIdx];
1727e8d8bef9SDimitry Andric           const unsigned DstEltSize = EltTy.getSizeInBits();
1728e8d8bef9SDimitry Andric           const unsigned VecSize = VecTy.getSizeInBits();
1729e8d8bef9SDimitry Andric 
1730e8d8bef9SDimitry Andric           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1731bdd1243dSDimitry Andric           return std::pair(
1732fe6060f1SDimitry Andric               VecTypeIdx,
1733fe6060f1SDimitry Andric               LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1734e8d8bef9SDimitry Andric         })
17350b57cec5SDimitry Andric       .clampScalar(EltTypeIdx, S32, S64)
17360b57cec5SDimitry Andric       .clampScalar(VecTypeIdx, S32, S64)
1737e8d8bef9SDimitry Andric       .clampScalar(IdxTypeIdx, S32, S32)
1738e8d8bef9SDimitry Andric       .clampMaxNumElements(VecTypeIdx, S32, 32)
1739e8d8bef9SDimitry Andric       // TODO: Clamp elements for 64-bit vectors?
174006c3fb27SDimitry Andric       .moreElementsIf(
174106c3fb27SDimitry Andric         isIllegalRegisterType(VecTypeIdx),
174206c3fb27SDimitry Andric         moreElementsToNextExistingRegClass(VecTypeIdx))
1743e8d8bef9SDimitry Andric       // It should only be necessary with variable indexes.
1744e8d8bef9SDimitry Andric       // As a last resort, lower to the stack
1745e8d8bef9SDimitry Andric       .lower();
17460b57cec5SDimitry Andric   }
17470b57cec5SDimitry Andric 
17480b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
17490b57cec5SDimitry Andric     .unsupportedIf([=](const LegalityQuery &Query) {
17500b57cec5SDimitry Andric         const LLT &EltTy = Query.Types[1].getElementType();
17510b57cec5SDimitry Andric         return Query.Types[0] != EltTy;
17520b57cec5SDimitry Andric       });
17530b57cec5SDimitry Andric 
17540b57cec5SDimitry Andric   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
17550b57cec5SDimitry Andric     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
17560b57cec5SDimitry Andric     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
17570b57cec5SDimitry Andric 
17580b57cec5SDimitry Andric     // FIXME: Doesn't handle extract of illegal sizes.
17590b57cec5SDimitry Andric     getActionDefinitionsBuilder(Op)
17608bcb0991SDimitry Andric       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
17610eae32dcSDimitry Andric       .lowerIf([=](const LegalityQuery &Query) {
17620eae32dcSDimitry Andric           // Sub-vector(or single element) insert and extract.
17630eae32dcSDimitry Andric           // TODO: verify immediate offset here since lower only works with
17640eae32dcSDimitry Andric           // whole elements.
17650eae32dcSDimitry Andric           const LLT BigTy = Query.Types[BigTyIdx];
17660eae32dcSDimitry Andric           return BigTy.isVector();
17670eae32dcSDimitry Andric         })
17688bcb0991SDimitry Andric       // FIXME: Multiples of 16 should not be legal.
17690b57cec5SDimitry Andric       .legalIf([=](const LegalityQuery &Query) {
17700b57cec5SDimitry Andric           const LLT BigTy = Query.Types[BigTyIdx];
17710b57cec5SDimitry Andric           const LLT LitTy = Query.Types[LitTyIdx];
17720b57cec5SDimitry Andric           return (BigTy.getSizeInBits() % 32 == 0) &&
17730b57cec5SDimitry Andric                  (LitTy.getSizeInBits() % 16 == 0);
17740b57cec5SDimitry Andric         })
17750b57cec5SDimitry Andric       .widenScalarIf(
17760b57cec5SDimitry Andric         [=](const LegalityQuery &Query) {
17770b57cec5SDimitry Andric           const LLT BigTy = Query.Types[BigTyIdx];
17780b57cec5SDimitry Andric           return (BigTy.getScalarSizeInBits() < 16);
17790b57cec5SDimitry Andric         },
17800b57cec5SDimitry Andric         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
17810b57cec5SDimitry Andric       .widenScalarIf(
17820b57cec5SDimitry Andric         [=](const LegalityQuery &Query) {
17830b57cec5SDimitry Andric           const LLT LitTy = Query.Types[LitTyIdx];
17840b57cec5SDimitry Andric           return (LitTy.getScalarSizeInBits() < 16);
17850b57cec5SDimitry Andric         },
17860b57cec5SDimitry Andric         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
17870b57cec5SDimitry Andric       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
17880b57cec5SDimitry Andric       .widenScalarToNextPow2(BigTyIdx, 32);
17890b57cec5SDimitry Andric 
17900b57cec5SDimitry Andric   }
17910b57cec5SDimitry Andric 
17928bcb0991SDimitry Andric   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
17930b57cec5SDimitry Andric     .legalForCartesianProduct(AllS32Vectors, {S32})
17940b57cec5SDimitry Andric     .legalForCartesianProduct(AllS64Vectors, {S64})
17958bcb0991SDimitry Andric     .clampNumElements(0, V16S32, V32S32)
17968bcb0991SDimitry Andric     .clampNumElements(0, V2S64, V16S64)
179706c3fb27SDimitry Andric     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16))
179806c3fb27SDimitry Andric     .moreElementsIf(
179906c3fb27SDimitry Andric       isIllegalRegisterType(0),
180006c3fb27SDimitry Andric       moreElementsToNextExistingRegClass(0));
18018bcb0991SDimitry Andric 
18028bcb0991SDimitry Andric   if (ST.hasScalarPackInsts()) {
18035ffd83dbSDimitry Andric     BuildVector
18045ffd83dbSDimitry Andric       // FIXME: Should probably widen s1 vectors straight to s32
18055ffd83dbSDimitry Andric       .minScalarOrElt(0, S16)
1806bdd1243dSDimitry Andric       .minScalar(1, S16);
18075ffd83dbSDimitry Andric 
18088bcb0991SDimitry Andric     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
18098bcb0991SDimitry Andric       .legalFor({V2S16, S32})
18108bcb0991SDimitry Andric       .lower();
18118bcb0991SDimitry Andric   } else {
18125ffd83dbSDimitry Andric     BuildVector.customFor({V2S16, S16});
18135ffd83dbSDimitry Andric     BuildVector.minScalarOrElt(0, S32);
18145ffd83dbSDimitry Andric 
18158bcb0991SDimitry Andric     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
18165ffd83dbSDimitry Andric       .customFor({V2S16, S32})
18178bcb0991SDimitry Andric       .lower();
18188bcb0991SDimitry Andric   }
18198bcb0991SDimitry Andric 
18205ffd83dbSDimitry Andric   BuildVector.legalIf(isRegisterType(0));
18215ffd83dbSDimitry Andric 
18225ffd83dbSDimitry Andric   // FIXME: Clamp maximum size
18230b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1824e8d8bef9SDimitry Andric     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1825e8d8bef9SDimitry Andric     .clampMaxNumElements(0, S32, 32)
1826e8d8bef9SDimitry Andric     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1827e8d8bef9SDimitry Andric     .clampMaxNumElements(0, S16, 64);
18280b57cec5SDimitry Andric 
18298bcb0991SDimitry Andric   getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
18308bcb0991SDimitry Andric 
18310b57cec5SDimitry Andric   // Merge/Unmerge
18320b57cec5SDimitry Andric   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
18330b57cec5SDimitry Andric     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
18340b57cec5SDimitry Andric     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
18350b57cec5SDimitry Andric 
18360b57cec5SDimitry Andric     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
18375ffd83dbSDimitry Andric       const LLT Ty = Query.Types[TypeIdx];
18380b57cec5SDimitry Andric       if (Ty.isVector()) {
18390b57cec5SDimitry Andric         const LLT &EltTy = Ty.getElementType();
18405ffd83dbSDimitry Andric         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
18410b57cec5SDimitry Andric           return true;
184206c3fb27SDimitry Andric         if (!llvm::has_single_bit<uint32_t>(EltTy.getSizeInBits()))
18430b57cec5SDimitry Andric           return true;
18440b57cec5SDimitry Andric       }
18450b57cec5SDimitry Andric       return false;
18460b57cec5SDimitry Andric     };
18470b57cec5SDimitry Andric 
18488bcb0991SDimitry Andric     auto &Builder = getActionDefinitionsBuilder(Op)
1849e8d8bef9SDimitry Andric       .legalIf(all(isRegisterType(0), isRegisterType(1)))
18505ffd83dbSDimitry Andric       .lowerFor({{S16, V2S16}})
18515ffd83dbSDimitry Andric       .lowerIf([=](const LegalityQuery &Query) {
18525ffd83dbSDimitry Andric           const LLT BigTy = Query.Types[BigTyIdx];
18535ffd83dbSDimitry Andric           return BigTy.getSizeInBits() == 32;
18545ffd83dbSDimitry Andric         })
18555ffd83dbSDimitry Andric       // Try to widen to s16 first for small types.
18565ffd83dbSDimitry Andric       // TODO: Only do this on targets with legal s16 shifts
18575ffd83dbSDimitry Andric       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
18580b57cec5SDimitry Andric       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
18598bcb0991SDimitry Andric       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
18608bcb0991SDimitry Andric       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
18618bcb0991SDimitry Andric                            elementTypeIs(1, S16)),
18628bcb0991SDimitry Andric                        changeTo(1, V2S16))
18635ffd83dbSDimitry Andric       // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
18645ffd83dbSDimitry Andric       // worth considering the multiples of 64 since 2*192 and 2*384 are not
18655ffd83dbSDimitry Andric       // valid.
18665ffd83dbSDimitry Andric       .clampScalar(LitTyIdx, S32, S512)
18675ffd83dbSDimitry Andric       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
18680b57cec5SDimitry Andric       // Break up vectors with weird elements into scalars
18690b57cec5SDimitry Andric       .fewerElementsIf(
18705ffd83dbSDimitry Andric         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
18710b57cec5SDimitry Andric         scalarize(0))
18720b57cec5SDimitry Andric       .fewerElementsIf(
18735ffd83dbSDimitry Andric         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
18740b57cec5SDimitry Andric         scalarize(1))
18755ffd83dbSDimitry Andric       .clampScalar(BigTyIdx, S32, MaxScalar);
18768bcb0991SDimitry Andric 
18778bcb0991SDimitry Andric     if (Op == G_MERGE_VALUES) {
18788bcb0991SDimitry Andric       Builder.widenScalarIf(
18798bcb0991SDimitry Andric         // TODO: Use 16-bit shifts if legal for 8-bit values?
18800b57cec5SDimitry Andric         [=](const LegalityQuery &Query) {
18818bcb0991SDimitry Andric           const LLT Ty = Query.Types[LitTyIdx];
18828bcb0991SDimitry Andric           return Ty.getSizeInBits() < 32;
18838bcb0991SDimitry Andric         },
18848bcb0991SDimitry Andric         changeTo(LitTyIdx, S32));
18858bcb0991SDimitry Andric     }
18868bcb0991SDimitry Andric 
18878bcb0991SDimitry Andric     Builder.widenScalarIf(
18888bcb0991SDimitry Andric       [=](const LegalityQuery &Query) {
18898bcb0991SDimitry Andric         const LLT Ty = Query.Types[BigTyIdx];
189006c3fb27SDimitry Andric         return Ty.getSizeInBits() % 16 != 0;
18910b57cec5SDimitry Andric       },
18920b57cec5SDimitry Andric       [=](const LegalityQuery &Query) {
18930b57cec5SDimitry Andric         // Pick the next power of 2, or a multiple of 64 over 128.
18940b57cec5SDimitry Andric         // Whichever is smaller.
18950b57cec5SDimitry Andric         const LLT &Ty = Query.Types[BigTyIdx];
18960b57cec5SDimitry Andric         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
18970b57cec5SDimitry Andric         if (NewSizeInBits >= 256) {
18980b57cec5SDimitry Andric           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
18990b57cec5SDimitry Andric           if (RoundedTo < NewSizeInBits)
19000b57cec5SDimitry Andric             NewSizeInBits = RoundedTo;
19010b57cec5SDimitry Andric         }
1902bdd1243dSDimitry Andric         return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
19030b57cec5SDimitry Andric       })
19040b57cec5SDimitry Andric       // Any vectors left are the wrong size. Scalarize them.
19050b57cec5SDimitry Andric       .scalarize(0)
19060b57cec5SDimitry Andric       .scalarize(1);
19070b57cec5SDimitry Andric   }
19080b57cec5SDimitry Andric 
19095ffd83dbSDimitry Andric   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
19105ffd83dbSDimitry Andric   // RegBankSelect.
19115ffd83dbSDimitry Andric   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
19125ffd83dbSDimitry Andric     .legalFor({{S32}, {S64}});
19138bcb0991SDimitry Andric 
19145ffd83dbSDimitry Andric   if (ST.hasVOP3PInsts()) {
19155ffd83dbSDimitry Andric     SextInReg.lowerFor({{V2S16}})
19165ffd83dbSDimitry Andric       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
19175ffd83dbSDimitry Andric       // get more vector shift opportunities, since we'll get those when
19185ffd83dbSDimitry Andric       // expanded.
19190eae32dcSDimitry Andric       .clampMaxNumElementsStrict(0, S16, 2);
19205ffd83dbSDimitry Andric   } else if (ST.has16BitInsts()) {
19215ffd83dbSDimitry Andric     SextInReg.lowerFor({{S32}, {S64}, {S16}});
19225ffd83dbSDimitry Andric   } else {
19235ffd83dbSDimitry Andric     // Prefer to promote to s32 before lowering if we don't have 16-bit
19245ffd83dbSDimitry Andric     // shifts. This avoid a lot of intermediate truncate and extend operations.
19255ffd83dbSDimitry Andric     SextInReg.lowerFor({{S32}, {S64}});
19265ffd83dbSDimitry Andric   }
19275ffd83dbSDimitry Andric 
19285ffd83dbSDimitry Andric   SextInReg
19295ffd83dbSDimitry Andric     .scalarize(0)
19305ffd83dbSDimitry Andric     .clampScalar(0, S32, S64)
19315ffd83dbSDimitry Andric     .lower();
19325ffd83dbSDimitry Andric 
1933349cc55cSDimitry Andric   getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1934349cc55cSDimitry Andric     .scalarize(0)
1935349cc55cSDimitry Andric     .lower();
1936349cc55cSDimitry Andric 
1937fe6060f1SDimitry Andric   // TODO: Only Try to form v2s16 with legal packed instructions.
19385ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_FSHR)
19395ffd83dbSDimitry Andric     .legalFor({{S32, S32}})
1940fe6060f1SDimitry Andric     .lowerFor({{V2S16, V2S16}})
19410eae32dcSDimitry Andric     .clampMaxNumElementsStrict(0, S16, 2)
19425ffd83dbSDimitry Andric     .scalarize(0)
19435ffd83dbSDimitry Andric     .lower();
1944480093f4SDimitry Andric 
1945fe6060f1SDimitry Andric   if (ST.hasVOP3PInsts()) {
1946fe6060f1SDimitry Andric     getActionDefinitionsBuilder(G_FSHL)
1947fe6060f1SDimitry Andric       .lowerFor({{V2S16, V2S16}})
19480eae32dcSDimitry Andric       .clampMaxNumElementsStrict(0, S16, 2)
1949fe6060f1SDimitry Andric       .scalarize(0)
1950fe6060f1SDimitry Andric       .lower();
1951fe6060f1SDimitry Andric   } else {
1952fe6060f1SDimitry Andric     getActionDefinitionsBuilder(G_FSHL)
1953fe6060f1SDimitry Andric       .scalarize(0)
1954fe6060f1SDimitry Andric       .lower();
1955fe6060f1SDimitry Andric   }
1956fe6060f1SDimitry Andric 
1957480093f4SDimitry Andric   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1958480093f4SDimitry Andric     .legalFor({S64});
1959480093f4SDimitry Andric 
1960e8d8bef9SDimitry Andric   getActionDefinitionsBuilder(G_FENCE)
1961e8d8bef9SDimitry Andric     .alwaysLegal();
1962e8d8bef9SDimitry Andric 
1963fe6060f1SDimitry Andric   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1964fe6060f1SDimitry Andric       .scalarize(0)
1965fe6060f1SDimitry Andric       .minScalar(0, S32)
1966fe6060f1SDimitry Andric       .lower();
1967fe6060f1SDimitry Andric 
1968fe6060f1SDimitry Andric   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1969fe6060f1SDimitry Andric       .legalFor({{S32, S32}, {S64, S32}})
1970fe6060f1SDimitry Andric       .clampScalar(1, S32, S32)
1971fe6060f1SDimitry Andric       .clampScalar(0, S32, S64)
1972fe6060f1SDimitry Andric       .widenScalarToNextPow2(0)
1973fe6060f1SDimitry Andric       .scalarize(0);
1974fe6060f1SDimitry Andric 
19755f757f3fSDimitry Andric   getActionDefinitionsBuilder(
19765f757f3fSDimitry Andric       {// TODO: Verify V_BFI_B32 is generated from expanded bit ops
19775ffd83dbSDimitry Andric        G_FCOPYSIGN,
19785ffd83dbSDimitry Andric 
19795f757f3fSDimitry Andric        G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
19805f757f3fSDimitry Andric        G_READ_REGISTER, G_WRITE_REGISTER,
19815ffd83dbSDimitry Andric 
19825f757f3fSDimitry Andric        G_SADDO, G_SSUBO})
19835f757f3fSDimitry Andric       .lower();
19845ffd83dbSDimitry Andric 
19855f757f3fSDimitry Andric   if (ST.hasIEEEMinMax()) {
19865f757f3fSDimitry Andric     getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
19875f757f3fSDimitry Andric         .legalFor(FPTypesPK16)
19885f757f3fSDimitry Andric         .clampMaxNumElements(0, S16, 2)
19895f757f3fSDimitry Andric         .scalarize(0);
19905f757f3fSDimitry Andric   } else {
19915ffd83dbSDimitry Andric     // TODO: Implement
19925f757f3fSDimitry Andric     getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
19935f757f3fSDimitry Andric   }
19945ffd83dbSDimitry Andric 
1995349cc55cSDimitry Andric   getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
1996349cc55cSDimitry Andric       .lower();
1997349cc55cSDimitry Andric 
1998480093f4SDimitry Andric   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
19995ffd83dbSDimitry Andric         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
2000480093f4SDimitry Andric         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
2001480093f4SDimitry Andric     .unsupported();
2002480093f4SDimitry Andric 
20035f757f3fSDimitry Andric   getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal();
20045f757f3fSDimitry Andric 
2005fe6060f1SDimitry Andric   getLegacyLegalizerInfo().computeTables();
20060b57cec5SDimitry Andric   verify(*ST.getInstrInfo());
20070b57cec5SDimitry Andric }
20080b57cec5SDimitry Andric 
2009*1db9f3b2SDimitry Andric bool AMDGPULegalizerInfo::legalizeCustom(
2010*1db9f3b2SDimitry Andric     LegalizerHelper &Helper, MachineInstr &MI,
2011*1db9f3b2SDimitry Andric     LostDebugLocObserver &LocObserver) const {
20125ffd83dbSDimitry Andric   MachineIRBuilder &B = Helper.MIRBuilder;
20135ffd83dbSDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
20145ffd83dbSDimitry Andric 
20150b57cec5SDimitry Andric   switch (MI.getOpcode()) {
20160b57cec5SDimitry Andric   case TargetOpcode::G_ADDRSPACE_CAST:
20178bcb0991SDimitry Andric     return legalizeAddrSpaceCast(MI, MRI, B);
20185f757f3fSDimitry Andric   case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
20195f757f3fSDimitry Andric     return legalizeFroundeven(MI, MRI, B);
20200b57cec5SDimitry Andric   case TargetOpcode::G_FCEIL:
20218bcb0991SDimitry Andric     return legalizeFceil(MI, MRI, B);
2022e8d8bef9SDimitry Andric   case TargetOpcode::G_FREM:
2023e8d8bef9SDimitry Andric     return legalizeFrem(MI, MRI, B);
20240b57cec5SDimitry Andric   case TargetOpcode::G_INTRINSIC_TRUNC:
20258bcb0991SDimitry Andric     return legalizeIntrinsicTrunc(MI, MRI, B);
20260b57cec5SDimitry Andric   case TargetOpcode::G_SITOFP:
20278bcb0991SDimitry Andric     return legalizeITOFP(MI, MRI, B, true);
20280b57cec5SDimitry Andric   case TargetOpcode::G_UITOFP:
20298bcb0991SDimitry Andric     return legalizeITOFP(MI, MRI, B, false);
20305ffd83dbSDimitry Andric   case TargetOpcode::G_FPTOSI:
20315ffd83dbSDimitry Andric     return legalizeFPTOI(MI, MRI, B, true);
20325ffd83dbSDimitry Andric   case TargetOpcode::G_FPTOUI:
20335ffd83dbSDimitry Andric     return legalizeFPTOI(MI, MRI, B, false);
20340b57cec5SDimitry Andric   case TargetOpcode::G_FMINNUM:
20350b57cec5SDimitry Andric   case TargetOpcode::G_FMAXNUM:
20360b57cec5SDimitry Andric   case TargetOpcode::G_FMINNUM_IEEE:
20370b57cec5SDimitry Andric   case TargetOpcode::G_FMAXNUM_IEEE:
20385ffd83dbSDimitry Andric     return legalizeMinNumMaxNum(Helper, MI);
20390b57cec5SDimitry Andric   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
20408bcb0991SDimitry Andric     return legalizeExtractVectorElt(MI, MRI, B);
20410b57cec5SDimitry Andric   case TargetOpcode::G_INSERT_VECTOR_ELT:
20428bcb0991SDimitry Andric     return legalizeInsertVectorElt(MI, MRI, B);
20438bcb0991SDimitry Andric   case TargetOpcode::G_FSIN:
20448bcb0991SDimitry Andric   case TargetOpcode::G_FCOS:
20458bcb0991SDimitry Andric     return legalizeSinCos(MI, MRI, B);
20468bcb0991SDimitry Andric   case TargetOpcode::G_GLOBAL_VALUE:
20478bcb0991SDimitry Andric     return legalizeGlobalValue(MI, MRI, B);
20488bcb0991SDimitry Andric   case TargetOpcode::G_LOAD:
2049fe6060f1SDimitry Andric   case TargetOpcode::G_SEXTLOAD:
2050fe6060f1SDimitry Andric   case TargetOpcode::G_ZEXTLOAD:
2051e8d8bef9SDimitry Andric     return legalizeLoad(Helper, MI);
205206c3fb27SDimitry Andric   case TargetOpcode::G_STORE:
205306c3fb27SDimitry Andric     return legalizeStore(Helper, MI);
20548bcb0991SDimitry Andric   case TargetOpcode::G_FMAD:
20558bcb0991SDimitry Andric     return legalizeFMad(MI, MRI, B);
20568bcb0991SDimitry Andric   case TargetOpcode::G_FDIV:
20578bcb0991SDimitry Andric     return legalizeFDIV(MI, MRI, B);
205806c3fb27SDimitry Andric   case TargetOpcode::G_FFREXP:
205906c3fb27SDimitry Andric     return legalizeFFREXP(MI, MRI, B);
206006c3fb27SDimitry Andric   case TargetOpcode::G_FSQRT:
206106c3fb27SDimitry Andric     return legalizeFSQRT(MI, MRI, B);
20625ffd83dbSDimitry Andric   case TargetOpcode::G_UDIV:
20635ffd83dbSDimitry Andric   case TargetOpcode::G_UREM:
2064fe6060f1SDimitry Andric   case TargetOpcode::G_UDIVREM:
2065fe6060f1SDimitry Andric     return legalizeUnsignedDIV_REM(MI, MRI, B);
20665ffd83dbSDimitry Andric   case TargetOpcode::G_SDIV:
20675ffd83dbSDimitry Andric   case TargetOpcode::G_SREM:
2068fe6060f1SDimitry Andric   case TargetOpcode::G_SDIVREM:
2069fe6060f1SDimitry Andric     return legalizeSignedDIV_REM(MI, MRI, B);
2070480093f4SDimitry Andric   case TargetOpcode::G_ATOMIC_CMPXCHG:
2071480093f4SDimitry Andric     return legalizeAtomicCmpXChg(MI, MRI, B);
207206c3fb27SDimitry Andric   case TargetOpcode::G_FLOG2:
207306c3fb27SDimitry Andric     return legalizeFlog2(MI, B);
20745ffd83dbSDimitry Andric   case TargetOpcode::G_FLOG:
20755ffd83dbSDimitry Andric   case TargetOpcode::G_FLOG10:
207606c3fb27SDimitry Andric     return legalizeFlogCommon(MI, B);
207706c3fb27SDimitry Andric   case TargetOpcode::G_FEXP2:
207806c3fb27SDimitry Andric     return legalizeFExp2(MI, B);
20795ffd83dbSDimitry Andric   case TargetOpcode::G_FEXP:
20805f757f3fSDimitry Andric   case TargetOpcode::G_FEXP10:
20815ffd83dbSDimitry Andric     return legalizeFExp(MI, B);
20825ffd83dbSDimitry Andric   case TargetOpcode::G_FPOW:
20835ffd83dbSDimitry Andric     return legalizeFPow(MI, B);
20845ffd83dbSDimitry Andric   case TargetOpcode::G_FFLOOR:
20855ffd83dbSDimitry Andric     return legalizeFFloor(MI, MRI, B);
20865ffd83dbSDimitry Andric   case TargetOpcode::G_BUILD_VECTOR:
2087bdd1243dSDimitry Andric   case TargetOpcode::G_BUILD_VECTOR_TRUNC:
20885ffd83dbSDimitry Andric     return legalizeBuildVector(MI, MRI, B);
208981ad6265SDimitry Andric   case TargetOpcode::G_MUL:
209081ad6265SDimitry Andric     return legalizeMul(Helper, MI);
2091349cc55cSDimitry Andric   case TargetOpcode::G_CTLZ:
2092349cc55cSDimitry Andric   case TargetOpcode::G_CTTZ:
2093349cc55cSDimitry Andric     return legalizeCTLZ_CTTZ(MI, MRI, B);
209481ad6265SDimitry Andric   case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
209581ad6265SDimitry Andric     return legalizeFPTruncRound(MI, B);
20965f757f3fSDimitry Andric   case TargetOpcode::G_STACKSAVE:
20975f757f3fSDimitry Andric     return legalizeStackSave(MI, B);
20980b57cec5SDimitry Andric   default:
20990b57cec5SDimitry Andric     return false;
21000b57cec5SDimitry Andric   }
21010b57cec5SDimitry Andric 
21020b57cec5SDimitry Andric   llvm_unreachable("expected switch to return");
21030b57cec5SDimitry Andric }
21040b57cec5SDimitry Andric 
21050b57cec5SDimitry Andric Register AMDGPULegalizerInfo::getSegmentAperture(
21060b57cec5SDimitry Andric   unsigned AS,
21070b57cec5SDimitry Andric   MachineRegisterInfo &MRI,
21088bcb0991SDimitry Andric   MachineIRBuilder &B) const {
21098bcb0991SDimitry Andric   MachineFunction &MF = B.getMF();
21100b57cec5SDimitry Andric   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
21110b57cec5SDimitry Andric   const LLT S32 = LLT::scalar(32);
2112bdd1243dSDimitry Andric   const LLT S64 = LLT::scalar(64);
21130b57cec5SDimitry Andric 
21148bcb0991SDimitry Andric   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
21158bcb0991SDimitry Andric 
21160b57cec5SDimitry Andric   if (ST.hasApertureRegs()) {
2117bdd1243dSDimitry Andric     // Note: this register is somewhat broken. When used as a 32-bit operand,
2118bdd1243dSDimitry Andric     // it only returns zeroes. The real value is in the upper 32 bits.
2119bdd1243dSDimitry Andric     // Thus, we must emit extract the high 32 bits.
2120bdd1243dSDimitry Andric     const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
2121bdd1243dSDimitry Andric                                        ? AMDGPU::SRC_SHARED_BASE
2122bdd1243dSDimitry Andric                                        : AMDGPU::SRC_PRIVATE_BASE;
2123bdd1243dSDimitry Andric     // FIXME: It would be more natural to emit a COPY here, but then copy
2124bdd1243dSDimitry Andric     // coalescing would kick in and it would think it's okay to use the "HI"
2125bdd1243dSDimitry Andric     // subregister (instead of extracting the HI 32 bits) which is an artificial
2126bdd1243dSDimitry Andric     // (unusable) register.
2127bdd1243dSDimitry Andric     //  Register TableGen definitions would need an overhaul to get rid of the
2128bdd1243dSDimitry Andric     //  artificial "HI" aperture registers and prevent this kind of issue from
2129bdd1243dSDimitry Andric     //  happening.
2130bdd1243dSDimitry Andric     Register Dst = MRI.createGenericVirtualRegister(S64);
2131bdd1243dSDimitry Andric     MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
2132bdd1243dSDimitry Andric     B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
2133bdd1243dSDimitry Andric     return B.buildUnmerge(S32, Dst).getReg(1);
21340b57cec5SDimitry Andric   }
21350b57cec5SDimitry Andric 
213681ad6265SDimitry Andric   // TODO: can we be smarter about machine pointer info?
213781ad6265SDimitry Andric   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
213881ad6265SDimitry Andric   Register LoadAddr = MRI.createGenericVirtualRegister(
213981ad6265SDimitry Andric     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
214081ad6265SDimitry Andric   // For code object version 5, private_base and shared_base are passed through
214181ad6265SDimitry Andric   // implicit kernargs.
214206c3fb27SDimitry Andric   if (AMDGPU::getCodeObjectVersion(*MF.getFunction().getParent()) >=
214306c3fb27SDimitry Andric       AMDGPU::AMDHSA_COV5) {
214481ad6265SDimitry Andric     AMDGPUTargetLowering::ImplicitParameter Param =
214581ad6265SDimitry Andric         AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
214681ad6265SDimitry Andric                                       : AMDGPUTargetLowering::PRIVATE_BASE;
214781ad6265SDimitry Andric     uint64_t Offset =
214881ad6265SDimitry Andric         ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
214981ad6265SDimitry Andric 
215081ad6265SDimitry Andric     Register KernargPtrReg = MRI.createGenericVirtualRegister(
215181ad6265SDimitry Andric         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
215281ad6265SDimitry Andric 
215381ad6265SDimitry Andric     if (!loadInputValue(KernargPtrReg, B,
215481ad6265SDimitry Andric                         AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
215581ad6265SDimitry Andric       return Register();
215681ad6265SDimitry Andric 
215781ad6265SDimitry Andric     MachineMemOperand *MMO = MF.getMachineMemOperand(
215881ad6265SDimitry Andric         PtrInfo,
215981ad6265SDimitry Andric         MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
216081ad6265SDimitry Andric             MachineMemOperand::MOInvariant,
216181ad6265SDimitry Andric         LLT::scalar(32), commonAlignment(Align(64), Offset));
216281ad6265SDimitry Andric 
216381ad6265SDimitry Andric     // Pointer address
216481ad6265SDimitry Andric     B.buildPtrAdd(LoadAddr, KernargPtrReg,
216581ad6265SDimitry Andric                   B.buildConstant(LLT::scalar(64), Offset).getReg(0));
216681ad6265SDimitry Andric     // Load address
216781ad6265SDimitry Andric     return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
216881ad6265SDimitry Andric   }
216981ad6265SDimitry Andric 
21700b57cec5SDimitry Andric   Register QueuePtr = MRI.createGenericVirtualRegister(
21710b57cec5SDimitry Andric     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
21720b57cec5SDimitry Andric 
2173e8d8bef9SDimitry Andric   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
21748bcb0991SDimitry Andric     return Register();
21750b57cec5SDimitry Andric 
21760b57cec5SDimitry Andric   // Offset into amd_queue_t for group_segment_aperture_base_hi /
21770b57cec5SDimitry Andric   // private_segment_aperture_base_hi.
21780b57cec5SDimitry Andric   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
21790b57cec5SDimitry Andric 
21800b57cec5SDimitry Andric   MachineMemOperand *MMO = MF.getMachineMemOperand(
21810b57cec5SDimitry Andric       PtrInfo,
21825ffd83dbSDimitry Andric       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
21830b57cec5SDimitry Andric           MachineMemOperand::MOInvariant,
2184fe6060f1SDimitry Andric       LLT::scalar(32), commonAlignment(Align(64), StructOffset));
21850b57cec5SDimitry Andric 
218681ad6265SDimitry Andric   B.buildPtrAdd(LoadAddr, QueuePtr,
218781ad6265SDimitry Andric                 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
21885ffd83dbSDimitry Andric   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
21890b57cec5SDimitry Andric }
21900b57cec5SDimitry Andric 
219104eeddc0SDimitry Andric /// Return true if the value is a known valid address, such that a null check is
219204eeddc0SDimitry Andric /// not necessary.
219304eeddc0SDimitry Andric static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
219404eeddc0SDimitry Andric                            const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
219504eeddc0SDimitry Andric   MachineInstr *Def = MRI.getVRegDef(Val);
219604eeddc0SDimitry Andric   switch (Def->getOpcode()) {
219704eeddc0SDimitry Andric   case AMDGPU::G_FRAME_INDEX:
219804eeddc0SDimitry Andric   case AMDGPU::G_GLOBAL_VALUE:
219904eeddc0SDimitry Andric   case AMDGPU::G_BLOCK_ADDR:
220004eeddc0SDimitry Andric     return true;
220104eeddc0SDimitry Andric   case AMDGPU::G_CONSTANT: {
220204eeddc0SDimitry Andric     const ConstantInt *CI = Def->getOperand(1).getCImm();
220304eeddc0SDimitry Andric     return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
220404eeddc0SDimitry Andric   }
220504eeddc0SDimitry Andric   default:
220604eeddc0SDimitry Andric     return false;
220704eeddc0SDimitry Andric   }
220804eeddc0SDimitry Andric 
220904eeddc0SDimitry Andric   return false;
221004eeddc0SDimitry Andric }
221104eeddc0SDimitry Andric 
22120b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
22130b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
22148bcb0991SDimitry Andric   MachineIRBuilder &B) const {
22158bcb0991SDimitry Andric   MachineFunction &MF = B.getMF();
22160b57cec5SDimitry Andric 
22178bcb0991SDimitry Andric   const LLT S32 = LLT::scalar(32);
22180b57cec5SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
22190b57cec5SDimitry Andric   Register Src = MI.getOperand(1).getReg();
22200b57cec5SDimitry Andric 
22210b57cec5SDimitry Andric   LLT DstTy = MRI.getType(Dst);
22220b57cec5SDimitry Andric   LLT SrcTy = MRI.getType(Src);
22230b57cec5SDimitry Andric   unsigned DestAS = DstTy.getAddressSpace();
22240b57cec5SDimitry Andric   unsigned SrcAS = SrcTy.getAddressSpace();
22250b57cec5SDimitry Andric 
22260b57cec5SDimitry Andric   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
22270b57cec5SDimitry Andric   // vector element.
22280b57cec5SDimitry Andric   assert(!DstTy.isVector());
22290b57cec5SDimitry Andric 
22300b57cec5SDimitry Andric   const AMDGPUTargetMachine &TM
22310b57cec5SDimitry Andric     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
22320b57cec5SDimitry Andric 
2233e8d8bef9SDimitry Andric   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
22348bcb0991SDimitry Andric     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
22358bcb0991SDimitry Andric     return true;
22368bcb0991SDimitry Andric   }
22378bcb0991SDimitry Andric 
223881ad6265SDimitry Andric   if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
223981ad6265SDimitry Andric       (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
224081ad6265SDimitry Andric        DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
224104eeddc0SDimitry Andric     if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
224204eeddc0SDimitry Andric       // Extract low 32-bits of the pointer.
224304eeddc0SDimitry Andric       B.buildExtract(Dst, Src, 0);
224404eeddc0SDimitry Andric       MI.eraseFromParent();
224504eeddc0SDimitry Andric       return true;
224604eeddc0SDimitry Andric     }
224704eeddc0SDimitry Andric 
22480b57cec5SDimitry Andric     unsigned NullVal = TM.getNullPointerValue(DestAS);
22490b57cec5SDimitry Andric 
22508bcb0991SDimitry Andric     auto SegmentNull = B.buildConstant(DstTy, NullVal);
22518bcb0991SDimitry Andric     auto FlatNull = B.buildConstant(SrcTy, 0);
22520b57cec5SDimitry Andric 
22530b57cec5SDimitry Andric     // Extract low 32-bits of the pointer.
22545ffd83dbSDimitry Andric     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
22550b57cec5SDimitry Andric 
22565ffd83dbSDimitry Andric     auto CmpRes =
22575ffd83dbSDimitry Andric         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
22588bcb0991SDimitry Andric     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
22590b57cec5SDimitry Andric 
22600b57cec5SDimitry Andric     MI.eraseFromParent();
22610b57cec5SDimitry Andric     return true;
22620b57cec5SDimitry Andric   }
22630b57cec5SDimitry Andric 
226481ad6265SDimitry Andric   if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
226581ad6265SDimitry Andric       (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
226681ad6265SDimitry Andric        SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
22678bcb0991SDimitry Andric     Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
22688bcb0991SDimitry Andric     if (!ApertureReg.isValid())
22698bcb0991SDimitry Andric       return false;
22700b57cec5SDimitry Andric 
22710b57cec5SDimitry Andric     // Coerce the type of the low half of the result so we can use merge_values.
22725ffd83dbSDimitry Andric     Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
22730b57cec5SDimitry Andric 
22740b57cec5SDimitry Andric     // TODO: Should we allow mismatched types but matching sizes in merges to
22750b57cec5SDimitry Andric     // avoid the ptrtoint?
2276bdd1243dSDimitry Andric     auto BuildPtr = B.buildMergeLikeInstr(DstTy, {SrcAsInt, ApertureReg});
227704eeddc0SDimitry Andric 
227804eeddc0SDimitry Andric     if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
227904eeddc0SDimitry Andric       B.buildCopy(Dst, BuildPtr);
228004eeddc0SDimitry Andric       MI.eraseFromParent();
228104eeddc0SDimitry Andric       return true;
228204eeddc0SDimitry Andric     }
228304eeddc0SDimitry Andric 
228404eeddc0SDimitry Andric     auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
228504eeddc0SDimitry Andric     auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
228604eeddc0SDimitry Andric 
228781ad6265SDimitry Andric     auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
228881ad6265SDimitry Andric                               SegmentNull.getReg(0));
228904eeddc0SDimitry Andric 
22905ffd83dbSDimitry Andric     B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
22910b57cec5SDimitry Andric 
22920b57cec5SDimitry Andric     MI.eraseFromParent();
22930b57cec5SDimitry Andric     return true;
22940b57cec5SDimitry Andric   }
22950b57cec5SDimitry Andric 
229681ad6265SDimitry Andric   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
229781ad6265SDimitry Andric       SrcTy.getSizeInBits() == 64) {
229881ad6265SDimitry Andric     // Truncate.
229981ad6265SDimitry Andric     B.buildExtract(Dst, Src, 0);
230081ad6265SDimitry Andric     MI.eraseFromParent();
230181ad6265SDimitry Andric     return true;
230281ad6265SDimitry Andric   }
230381ad6265SDimitry Andric 
230481ad6265SDimitry Andric   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
230581ad6265SDimitry Andric       DstTy.getSizeInBits() == 64) {
230681ad6265SDimitry Andric     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
230781ad6265SDimitry Andric     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2308bdd1243dSDimitry Andric     auto PtrLo = B.buildPtrToInt(S32, Src);
2309bdd1243dSDimitry Andric     auto HighAddr = B.buildConstant(S32, AddrHiVal);
2310bdd1243dSDimitry Andric     B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
231181ad6265SDimitry Andric     MI.eraseFromParent();
231281ad6265SDimitry Andric     return true;
231381ad6265SDimitry Andric   }
231481ad6265SDimitry Andric 
231581ad6265SDimitry Andric   DiagnosticInfoUnsupported InvalidAddrSpaceCast(
231681ad6265SDimitry Andric       MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
231781ad6265SDimitry Andric 
231881ad6265SDimitry Andric   LLVMContext &Ctx = MF.getFunction().getContext();
231981ad6265SDimitry Andric   Ctx.diagnose(InvalidAddrSpaceCast);
232081ad6265SDimitry Andric   B.buildUndef(Dst);
232181ad6265SDimitry Andric   MI.eraseFromParent();
232281ad6265SDimitry Andric   return true;
232381ad6265SDimitry Andric }
232481ad6265SDimitry Andric 
23255f757f3fSDimitry Andric bool AMDGPULegalizerInfo::legalizeFroundeven(MachineInstr &MI,
23265f757f3fSDimitry Andric                                              MachineRegisterInfo &MRI,
23278bcb0991SDimitry Andric                                              MachineIRBuilder &B) const {
23280b57cec5SDimitry Andric   Register Src = MI.getOperand(1).getReg();
23290b57cec5SDimitry Andric   LLT Ty = MRI.getType(Src);
23300b57cec5SDimitry Andric   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
23310b57cec5SDimitry Andric 
23320b57cec5SDimitry Andric   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
23330b57cec5SDimitry Andric   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
23340b57cec5SDimitry Andric 
23358bcb0991SDimitry Andric   auto C1 = B.buildFConstant(Ty, C1Val);
23368bcb0991SDimitry Andric   auto CopySign = B.buildFCopysign(Ty, C1, Src);
23370b57cec5SDimitry Andric 
23380b57cec5SDimitry Andric   // TODO: Should this propagate fast-math-flags?
23398bcb0991SDimitry Andric   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
23408bcb0991SDimitry Andric   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
23410b57cec5SDimitry Andric 
23428bcb0991SDimitry Andric   auto C2 = B.buildFConstant(Ty, C2Val);
23438bcb0991SDimitry Andric   auto Fabs = B.buildFAbs(Ty, Src);
23440b57cec5SDimitry Andric 
23458bcb0991SDimitry Andric   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
23468bcb0991SDimitry Andric   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2347e8d8bef9SDimitry Andric   MI.eraseFromParent();
23480b57cec5SDimitry Andric   return true;
23490b57cec5SDimitry Andric }
23500b57cec5SDimitry Andric 
23510b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeFceil(
23520b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
23530b57cec5SDimitry Andric   MachineIRBuilder &B) const {
23540b57cec5SDimitry Andric 
23550b57cec5SDimitry Andric   const LLT S1 = LLT::scalar(1);
23560b57cec5SDimitry Andric   const LLT S64 = LLT::scalar(64);
23570b57cec5SDimitry Andric 
23580b57cec5SDimitry Andric   Register Src = MI.getOperand(1).getReg();
23590b57cec5SDimitry Andric   assert(MRI.getType(Src) == S64);
23600b57cec5SDimitry Andric 
23610b57cec5SDimitry Andric   // result = trunc(src)
23620b57cec5SDimitry Andric   // if (src > 0.0 && src != result)
23630b57cec5SDimitry Andric   //   result += 1.0
23640b57cec5SDimitry Andric 
23655ffd83dbSDimitry Andric   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
23660b57cec5SDimitry Andric 
23670b57cec5SDimitry Andric   const auto Zero = B.buildFConstant(S64, 0.0);
23680b57cec5SDimitry Andric   const auto One = B.buildFConstant(S64, 1.0);
23690b57cec5SDimitry Andric   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
23700b57cec5SDimitry Andric   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
23710b57cec5SDimitry Andric   auto And = B.buildAnd(S1, Lt0, NeTrunc);
23720b57cec5SDimitry Andric   auto Add = B.buildSelect(S64, And, One, Zero);
23730b57cec5SDimitry Andric 
23740b57cec5SDimitry Andric   // TODO: Should this propagate fast-math-flags?
23750b57cec5SDimitry Andric   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
237604eeddc0SDimitry Andric   MI.eraseFromParent();
23770b57cec5SDimitry Andric   return true;
23780b57cec5SDimitry Andric }
23790b57cec5SDimitry Andric 
2380e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeFrem(
2381e8d8bef9SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
2382e8d8bef9SDimitry Andric   MachineIRBuilder &B) const {
2383e8d8bef9SDimitry Andric     Register DstReg = MI.getOperand(0).getReg();
2384e8d8bef9SDimitry Andric     Register Src0Reg = MI.getOperand(1).getReg();
2385e8d8bef9SDimitry Andric     Register Src1Reg = MI.getOperand(2).getReg();
2386e8d8bef9SDimitry Andric     auto Flags = MI.getFlags();
2387e8d8bef9SDimitry Andric     LLT Ty = MRI.getType(DstReg);
2388e8d8bef9SDimitry Andric 
2389e8d8bef9SDimitry Andric     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2390e8d8bef9SDimitry Andric     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2391e8d8bef9SDimitry Andric     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2392e8d8bef9SDimitry Andric     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2393e8d8bef9SDimitry Andric     MI.eraseFromParent();
2394e8d8bef9SDimitry Andric     return true;
2395e8d8bef9SDimitry Andric }
2396e8d8bef9SDimitry Andric 
2397e8d8bef9SDimitry Andric static MachineInstrBuilder extractF64Exponent(Register Hi,
23980b57cec5SDimitry Andric                                               MachineIRBuilder &B) {
23990b57cec5SDimitry Andric   const unsigned FractBits = 52;
24000b57cec5SDimitry Andric   const unsigned ExpBits = 11;
24010b57cec5SDimitry Andric   LLT S32 = LLT::scalar(32);
24020b57cec5SDimitry Andric 
24030b57cec5SDimitry Andric   auto Const0 = B.buildConstant(S32, FractBits - 32);
24040b57cec5SDimitry Andric   auto Const1 = B.buildConstant(S32, ExpBits);
24050b57cec5SDimitry Andric 
24065f757f3fSDimitry Andric   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
2407e8d8bef9SDimitry Andric                      .addUse(Hi)
24080b57cec5SDimitry Andric                      .addUse(Const0.getReg(0))
24090b57cec5SDimitry Andric                      .addUse(Const1.getReg(0));
24100b57cec5SDimitry Andric 
24110b57cec5SDimitry Andric   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
24120b57cec5SDimitry Andric }
24130b57cec5SDimitry Andric 
24140b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
24150b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
24160b57cec5SDimitry Andric   MachineIRBuilder &B) const {
24170b57cec5SDimitry Andric   const LLT S1 = LLT::scalar(1);
24180b57cec5SDimitry Andric   const LLT S32 = LLT::scalar(32);
24190b57cec5SDimitry Andric   const LLT S64 = LLT::scalar(64);
24200b57cec5SDimitry Andric 
24210b57cec5SDimitry Andric   Register Src = MI.getOperand(1).getReg();
24220b57cec5SDimitry Andric   assert(MRI.getType(Src) == S64);
24230b57cec5SDimitry Andric 
24240b57cec5SDimitry Andric   // TODO: Should this use extract since the low half is unused?
24250b57cec5SDimitry Andric   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
24260b57cec5SDimitry Andric   Register Hi = Unmerge.getReg(1);
24270b57cec5SDimitry Andric 
24280b57cec5SDimitry Andric   // Extract the upper half, since this is where we will find the sign and
24290b57cec5SDimitry Andric   // exponent.
24300b57cec5SDimitry Andric   auto Exp = extractF64Exponent(Hi, B);
24310b57cec5SDimitry Andric 
24320b57cec5SDimitry Andric   const unsigned FractBits = 52;
24330b57cec5SDimitry Andric 
24340b57cec5SDimitry Andric   // Extract the sign bit.
24350b57cec5SDimitry Andric   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
24360b57cec5SDimitry Andric   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
24370b57cec5SDimitry Andric 
24380b57cec5SDimitry Andric   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
24390b57cec5SDimitry Andric 
24400b57cec5SDimitry Andric   const auto Zero32 = B.buildConstant(S32, 0);
24410b57cec5SDimitry Andric 
24420b57cec5SDimitry Andric   // Extend back to 64-bits.
2443bdd1243dSDimitry Andric   auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});
24440b57cec5SDimitry Andric 
24450b57cec5SDimitry Andric   auto Shr = B.buildAShr(S64, FractMask, Exp);
24460b57cec5SDimitry Andric   auto Not = B.buildNot(S64, Shr);
24470b57cec5SDimitry Andric   auto Tmp0 = B.buildAnd(S64, Src, Not);
24480b57cec5SDimitry Andric   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
24490b57cec5SDimitry Andric 
24500b57cec5SDimitry Andric   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
24510b57cec5SDimitry Andric   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
24520b57cec5SDimitry Andric 
24530b57cec5SDimitry Andric   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
24540b57cec5SDimitry Andric   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2455e8d8bef9SDimitry Andric   MI.eraseFromParent();
24560b57cec5SDimitry Andric   return true;
24570b57cec5SDimitry Andric }
24580b57cec5SDimitry Andric 
24590b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeITOFP(
24600b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
24610b57cec5SDimitry Andric   MachineIRBuilder &B, bool Signed) const {
24620b57cec5SDimitry Andric 
24630b57cec5SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
24640b57cec5SDimitry Andric   Register Src = MI.getOperand(1).getReg();
24650b57cec5SDimitry Andric 
24660b57cec5SDimitry Andric   const LLT S64 = LLT::scalar(64);
24670b57cec5SDimitry Andric   const LLT S32 = LLT::scalar(32);
24680b57cec5SDimitry Andric 
2469349cc55cSDimitry Andric   assert(MRI.getType(Src) == S64);
24700b57cec5SDimitry Andric 
24710b57cec5SDimitry Andric   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2472349cc55cSDimitry Andric   auto ThirtyTwo = B.buildConstant(S32, 32);
24730b57cec5SDimitry Andric 
2474349cc55cSDimitry Andric   if (MRI.getType(Dst) == S64) {
2475349cc55cSDimitry Andric     auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2476349cc55cSDimitry Andric                         : B.buildUITOFP(S64, Unmerge.getReg(1));
24770b57cec5SDimitry Andric 
24780b57cec5SDimitry Andric     auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
247906c3fb27SDimitry Andric     auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);
24800b57cec5SDimitry Andric 
24810b57cec5SDimitry Andric     // TODO: Should this propagate fast-math-flags?
24820b57cec5SDimitry Andric     B.buildFAdd(Dst, LdExp, CvtLo);
24830b57cec5SDimitry Andric     MI.eraseFromParent();
24840b57cec5SDimitry Andric     return true;
24850b57cec5SDimitry Andric   }
24860b57cec5SDimitry Andric 
2487349cc55cSDimitry Andric   assert(MRI.getType(Dst) == S32);
2488349cc55cSDimitry Andric 
2489349cc55cSDimitry Andric   auto One = B.buildConstant(S32, 1);
2490349cc55cSDimitry Andric 
2491349cc55cSDimitry Andric   MachineInstrBuilder ShAmt;
2492349cc55cSDimitry Andric   if (Signed) {
2493349cc55cSDimitry Andric     auto ThirtyOne = B.buildConstant(S32, 31);
2494349cc55cSDimitry Andric     auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2495349cc55cSDimitry Andric     auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2496349cc55cSDimitry Andric     auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
24975f757f3fSDimitry Andric     auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
2498349cc55cSDimitry Andric                   .addUse(Unmerge.getReg(1));
2499349cc55cSDimitry Andric     auto LS2 = B.buildSub(S32, LS, One);
2500349cc55cSDimitry Andric     ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2501349cc55cSDimitry Andric   } else
2502349cc55cSDimitry Andric     ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2503349cc55cSDimitry Andric   auto Norm = B.buildShl(S64, Src, ShAmt);
2504349cc55cSDimitry Andric   auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2505349cc55cSDimitry Andric   auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2506349cc55cSDimitry Andric   auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2507349cc55cSDimitry Andric   auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2508349cc55cSDimitry Andric   auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
250906c3fb27SDimitry Andric   B.buildFLdexp(Dst, FVal, Scale);
2510349cc55cSDimitry Andric   MI.eraseFromParent();
2511349cc55cSDimitry Andric   return true;
2512349cc55cSDimitry Andric }
2513349cc55cSDimitry Andric 
25145ffd83dbSDimitry Andric // TODO: Copied from DAG implementation. Verify logic and document how this
25155ffd83dbSDimitry Andric // actually works.
2516fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2517fe6060f1SDimitry Andric                                         MachineRegisterInfo &MRI,
2518fe6060f1SDimitry Andric                                         MachineIRBuilder &B,
2519fe6060f1SDimitry Andric                                         bool Signed) const {
25205ffd83dbSDimitry Andric 
25215ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
25225ffd83dbSDimitry Andric   Register Src = MI.getOperand(1).getReg();
25235ffd83dbSDimitry Andric 
25245ffd83dbSDimitry Andric   const LLT S64 = LLT::scalar(64);
25255ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
25265ffd83dbSDimitry Andric 
2527fe6060f1SDimitry Andric   const LLT SrcLT = MRI.getType(Src);
2528fe6060f1SDimitry Andric   assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
25295ffd83dbSDimitry Andric 
25305ffd83dbSDimitry Andric   unsigned Flags = MI.getFlags();
25315ffd83dbSDimitry Andric 
2532fe6060f1SDimitry Andric   // The basic idea of converting a floating point number into a pair of 32-bit
2533fe6060f1SDimitry Andric   // integers is illustrated as follows:
2534fe6060f1SDimitry Andric   //
2535fe6060f1SDimitry Andric   //     tf := trunc(val);
2536fe6060f1SDimitry Andric   //    hif := floor(tf * 2^-32);
2537fe6060f1SDimitry Andric   //    lof := tf - hif * 2^32; // lof is always positive due to floor.
2538fe6060f1SDimitry Andric   //     hi := fptoi(hif);
2539fe6060f1SDimitry Andric   //     lo := fptoi(lof);
2540fe6060f1SDimitry Andric   //
2541fe6060f1SDimitry Andric   auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2542fe6060f1SDimitry Andric   MachineInstrBuilder Sign;
2543fe6060f1SDimitry Andric   if (Signed && SrcLT == S32) {
2544fe6060f1SDimitry Andric     // However, a 32-bit floating point number has only 23 bits mantissa and
2545fe6060f1SDimitry Andric     // it's not enough to hold all the significant bits of `lof` if val is
2546fe6060f1SDimitry Andric     // negative. To avoid the loss of precision, We need to take the absolute
2547fe6060f1SDimitry Andric     // value after truncating and flip the result back based on the original
2548fe6060f1SDimitry Andric     // signedness.
2549fe6060f1SDimitry Andric     Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2550fe6060f1SDimitry Andric     Trunc = B.buildFAbs(S32, Trunc, Flags);
2551fe6060f1SDimitry Andric   }
2552fe6060f1SDimitry Andric   MachineInstrBuilder K0, K1;
2553fe6060f1SDimitry Andric   if (SrcLT == S64) {
255406c3fb27SDimitry Andric     K0 = B.buildFConstant(
255506c3fb27SDimitry Andric         S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
255606c3fb27SDimitry Andric     K1 = B.buildFConstant(
255706c3fb27SDimitry Andric         S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2558fe6060f1SDimitry Andric   } else {
255906c3fb27SDimitry Andric     K0 = B.buildFConstant(
256006c3fb27SDimitry Andric         S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
256106c3fb27SDimitry Andric     K1 = B.buildFConstant(
256206c3fb27SDimitry Andric         S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2563fe6060f1SDimitry Andric   }
25645ffd83dbSDimitry Andric 
2565fe6060f1SDimitry Andric   auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2566fe6060f1SDimitry Andric   auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2567fe6060f1SDimitry Andric   auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
25685ffd83dbSDimitry Andric 
2569fe6060f1SDimitry Andric   auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2570fe6060f1SDimitry Andric                                      : B.buildFPTOUI(S32, FloorMul);
25715ffd83dbSDimitry Andric   auto Lo = B.buildFPTOUI(S32, Fma);
25725ffd83dbSDimitry Andric 
2573fe6060f1SDimitry Andric   if (Signed && SrcLT == S32) {
2574fe6060f1SDimitry Andric     // Flip the result based on the signedness, which is either all 0s or 1s.
2575bdd1243dSDimitry Andric     Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
2576fe6060f1SDimitry Andric     // r := xor({lo, hi}, sign) - sign;
2577bdd1243dSDimitry Andric     B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
2578bdd1243dSDimitry Andric                Sign);
2579fe6060f1SDimitry Andric   } else
2580bdd1243dSDimitry Andric     B.buildMergeLikeInstr(Dst, {Lo, Hi});
25815ffd83dbSDimitry Andric   MI.eraseFromParent();
25825ffd83dbSDimitry Andric 
25835ffd83dbSDimitry Andric   return true;
25845ffd83dbSDimitry Andric }
25855ffd83dbSDimitry Andric 
25865ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
25875ffd83dbSDimitry Andric                                                MachineInstr &MI) const {
25885ffd83dbSDimitry Andric   MachineFunction &MF = Helper.MIRBuilder.getMF();
25890b57cec5SDimitry Andric   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
25900b57cec5SDimitry Andric 
25910b57cec5SDimitry Andric   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
25920b57cec5SDimitry Andric                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
25930b57cec5SDimitry Andric 
25940b57cec5SDimitry Andric   // With ieee_mode disabled, the instructions have the correct behavior
25950b57cec5SDimitry Andric   // already for G_FMINNUM/G_FMAXNUM
25960b57cec5SDimitry Andric   if (!MFI->getMode().IEEE)
25970b57cec5SDimitry Andric     return !IsIEEEOp;
25980b57cec5SDimitry Andric 
25990b57cec5SDimitry Andric   if (IsIEEEOp)
26000b57cec5SDimitry Andric     return true;
26010b57cec5SDimitry Andric 
26020b57cec5SDimitry Andric   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
26030b57cec5SDimitry Andric }
26040b57cec5SDimitry Andric 
26050b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
26060b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
26070b57cec5SDimitry Andric   MachineIRBuilder &B) const {
26080b57cec5SDimitry Andric   // TODO: Should move some of this into LegalizerHelper.
26090b57cec5SDimitry Andric 
26100b57cec5SDimitry Andric   // TODO: Promote dynamic indexing of s16 to s32
26115ffd83dbSDimitry Andric 
261206c3fb27SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
261306c3fb27SDimitry Andric   Register Vec = MI.getOperand(1).getReg();
261406c3fb27SDimitry Andric 
261506c3fb27SDimitry Andric   LLT VecTy = MRI.getType(Vec);
261606c3fb27SDimitry Andric   LLT EltTy = VecTy.getElementType();
261706c3fb27SDimitry Andric   assert(EltTy == MRI.getType(Dst));
261806c3fb27SDimitry Andric 
261906c3fb27SDimitry Andric   // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
262006c3fb27SDimitry Andric   // but we can't go directly to that logic becasue you can't bitcast a vector
262106c3fb27SDimitry Andric   // of pointers to a vector of integers. Therefore, introduce an intermediate
262206c3fb27SDimitry Andric   // vector of integers using ptrtoint (and inttoptr on the output) in order to
262306c3fb27SDimitry Andric   // drive the legalization forward.
262406c3fb27SDimitry Andric   if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
262506c3fb27SDimitry Andric     LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
262606c3fb27SDimitry Andric     LLT IntVecTy = VecTy.changeElementType(IntTy);
262706c3fb27SDimitry Andric 
262806c3fb27SDimitry Andric     auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
262906c3fb27SDimitry Andric     auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
263006c3fb27SDimitry Andric     B.buildIntToPtr(Dst, IntElt);
263106c3fb27SDimitry Andric 
263206c3fb27SDimitry Andric     MI.eraseFromParent();
263306c3fb27SDimitry Andric     return true;
263406c3fb27SDimitry Andric   }
263506c3fb27SDimitry Andric 
26365ffd83dbSDimitry Andric   // FIXME: Artifact combiner probably should have replaced the truncated
26375ffd83dbSDimitry Andric   // constant before this, so we shouldn't need
2638349cc55cSDimitry Andric   // getIConstantVRegValWithLookThrough.
2639bdd1243dSDimitry Andric   std::optional<ValueAndVReg> MaybeIdxVal =
2640349cc55cSDimitry Andric       getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2641e8d8bef9SDimitry Andric   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
26420b57cec5SDimitry Andric     return true;
2643bdd1243dSDimitry Andric   const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
26440b57cec5SDimitry Andric 
264504eeddc0SDimitry Andric   if (IdxVal < VecTy.getNumElements()) {
264604eeddc0SDimitry Andric     auto Unmerge = B.buildUnmerge(EltTy, Vec);
264704eeddc0SDimitry Andric     B.buildCopy(Dst, Unmerge.getReg(IdxVal));
264804eeddc0SDimitry Andric   } else {
26490b57cec5SDimitry Andric     B.buildUndef(Dst);
265004eeddc0SDimitry Andric   }
26510b57cec5SDimitry Andric 
26520b57cec5SDimitry Andric   MI.eraseFromParent();
26530b57cec5SDimitry Andric   return true;
26540b57cec5SDimitry Andric }
26550b57cec5SDimitry Andric 
26560b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
26570b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
26580b57cec5SDimitry Andric   MachineIRBuilder &B) const {
26590b57cec5SDimitry Andric   // TODO: Should move some of this into LegalizerHelper.
26600b57cec5SDimitry Andric 
26610b57cec5SDimitry Andric   // TODO: Promote dynamic indexing of s16 to s32
26625ffd83dbSDimitry Andric 
266306c3fb27SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
266406c3fb27SDimitry Andric   Register Vec = MI.getOperand(1).getReg();
266506c3fb27SDimitry Andric   Register Ins = MI.getOperand(2).getReg();
266606c3fb27SDimitry Andric 
266706c3fb27SDimitry Andric   LLT VecTy = MRI.getType(Vec);
266806c3fb27SDimitry Andric   LLT EltTy = VecTy.getElementType();
266906c3fb27SDimitry Andric   assert(EltTy == MRI.getType(Ins));
267006c3fb27SDimitry Andric 
267106c3fb27SDimitry Andric   // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
267206c3fb27SDimitry Andric   // but we can't go directly to that logic becasue you can't bitcast a vector
267306c3fb27SDimitry Andric   // of pointers to a vector of integers. Therefore, make the pointer vector
267406c3fb27SDimitry Andric   // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
267506c3fb27SDimitry Andric   // new value, and then inttoptr the result vector back. This will then allow
267606c3fb27SDimitry Andric   // the rest of legalization to take over.
267706c3fb27SDimitry Andric   if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
267806c3fb27SDimitry Andric     LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
267906c3fb27SDimitry Andric     LLT IntVecTy = VecTy.changeElementType(IntTy);
268006c3fb27SDimitry Andric 
268106c3fb27SDimitry Andric     auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
268206c3fb27SDimitry Andric     auto IntIns = B.buildPtrToInt(IntTy, Ins);
268306c3fb27SDimitry Andric     auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
268406c3fb27SDimitry Andric                                                  MI.getOperand(3));
268506c3fb27SDimitry Andric     B.buildIntToPtr(Dst, IntVecDest);
268606c3fb27SDimitry Andric     MI.eraseFromParent();
268706c3fb27SDimitry Andric     return true;
268806c3fb27SDimitry Andric   }
268906c3fb27SDimitry Andric 
26905ffd83dbSDimitry Andric   // FIXME: Artifact combiner probably should have replaced the truncated
26915ffd83dbSDimitry Andric   // constant before this, so we shouldn't need
2692349cc55cSDimitry Andric   // getIConstantVRegValWithLookThrough.
2693bdd1243dSDimitry Andric   std::optional<ValueAndVReg> MaybeIdxVal =
2694349cc55cSDimitry Andric       getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2695e8d8bef9SDimitry Andric   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
26960b57cec5SDimitry Andric     return true;
26970b57cec5SDimitry Andric 
2698bdd1243dSDimitry Andric   const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
26990b57cec5SDimitry Andric 
270004eeddc0SDimitry Andric   unsigned NumElts = VecTy.getNumElements();
270104eeddc0SDimitry Andric   if (IdxVal < NumElts) {
270204eeddc0SDimitry Andric     SmallVector<Register, 8> SrcRegs;
270304eeddc0SDimitry Andric     for (unsigned i = 0; i < NumElts; ++i)
270404eeddc0SDimitry Andric       SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
270504eeddc0SDimitry Andric     B.buildUnmerge(SrcRegs, Vec);
270604eeddc0SDimitry Andric 
270704eeddc0SDimitry Andric     SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2708bdd1243dSDimitry Andric     B.buildMergeLikeInstr(Dst, SrcRegs);
270904eeddc0SDimitry Andric   } else {
27100b57cec5SDimitry Andric     B.buildUndef(Dst);
271104eeddc0SDimitry Andric   }
27120b57cec5SDimitry Andric 
27130b57cec5SDimitry Andric   MI.eraseFromParent();
27140b57cec5SDimitry Andric   return true;
27150b57cec5SDimitry Andric }
27160b57cec5SDimitry Andric 
27178bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeSinCos(
27188bcb0991SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
27198bcb0991SDimitry Andric   MachineIRBuilder &B) const {
27208bcb0991SDimitry Andric 
27218bcb0991SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
27228bcb0991SDimitry Andric   Register SrcReg = MI.getOperand(1).getReg();
27238bcb0991SDimitry Andric   LLT Ty = MRI.getType(DstReg);
27248bcb0991SDimitry Andric   unsigned Flags = MI.getFlags();
27258bcb0991SDimitry Andric 
27268bcb0991SDimitry Andric   Register TrigVal;
27275ffd83dbSDimitry Andric   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
27288bcb0991SDimitry Andric   if (ST.hasTrigReducedRange()) {
27298bcb0991SDimitry Andric     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
27305f757f3fSDimitry Andric     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
27318bcb0991SDimitry Andric                   .addUse(MulVal.getReg(0))
27325f757f3fSDimitry Andric                   .setMIFlags(Flags)
27335f757f3fSDimitry Andric                   .getReg(0);
27348bcb0991SDimitry Andric   } else
27358bcb0991SDimitry Andric     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
27368bcb0991SDimitry Andric 
27378bcb0991SDimitry Andric   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
27388bcb0991SDimitry Andric     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
27395f757f3fSDimitry Andric   B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg))
27408bcb0991SDimitry Andric       .addUse(TrigVal)
27418bcb0991SDimitry Andric       .setMIFlags(Flags);
27428bcb0991SDimitry Andric   MI.eraseFromParent();
27438bcb0991SDimitry Andric   return true;
27448bcb0991SDimitry Andric }
27458bcb0991SDimitry Andric 
27465ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
27475ffd83dbSDimitry Andric                                                   MachineIRBuilder &B,
27485ffd83dbSDimitry Andric                                                   const GlobalValue *GV,
27495ffd83dbSDimitry Andric                                                   int64_t Offset,
27505ffd83dbSDimitry Andric                                                   unsigned GAFlags) const {
27515ffd83dbSDimitry Andric   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
27528bcb0991SDimitry Andric   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
27538bcb0991SDimitry Andric   // to the following code sequence:
27548bcb0991SDimitry Andric   //
27558bcb0991SDimitry Andric   // For constant address space:
27568bcb0991SDimitry Andric   //   s_getpc_b64 s[0:1]
27578bcb0991SDimitry Andric   //   s_add_u32 s0, s0, $symbol
27588bcb0991SDimitry Andric   //   s_addc_u32 s1, s1, 0
27598bcb0991SDimitry Andric   //
27608bcb0991SDimitry Andric   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
27618bcb0991SDimitry Andric   //   a fixup or relocation is emitted to replace $symbol with a literal
27628bcb0991SDimitry Andric   //   constant, which is a pc-relative offset from the encoding of the $symbol
27638bcb0991SDimitry Andric   //   operand to the global variable.
27648bcb0991SDimitry Andric   //
27658bcb0991SDimitry Andric   // For global address space:
27668bcb0991SDimitry Andric   //   s_getpc_b64 s[0:1]
27678bcb0991SDimitry Andric   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
27688bcb0991SDimitry Andric   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
27698bcb0991SDimitry Andric   //
27708bcb0991SDimitry Andric   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
27718bcb0991SDimitry Andric   //   fixups or relocations are emitted to replace $symbol@*@lo and
27728bcb0991SDimitry Andric   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
27738bcb0991SDimitry Andric   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
27748bcb0991SDimitry Andric   //   operand to the global variable.
27758bcb0991SDimitry Andric 
27768bcb0991SDimitry Andric   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
27778bcb0991SDimitry Andric 
27788bcb0991SDimitry Andric   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
27798bcb0991SDimitry Andric     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
27808bcb0991SDimitry Andric 
27818bcb0991SDimitry Andric   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
27828bcb0991SDimitry Andric     .addDef(PCReg);
27838bcb0991SDimitry Andric 
27845f757f3fSDimitry Andric   MIB.addGlobalAddress(GV, Offset, GAFlags);
27858bcb0991SDimitry Andric   if (GAFlags == SIInstrInfo::MO_NONE)
27868bcb0991SDimitry Andric     MIB.addImm(0);
27878bcb0991SDimitry Andric   else
27885f757f3fSDimitry Andric     MIB.addGlobalAddress(GV, Offset, GAFlags + 1);
27898bcb0991SDimitry Andric 
279006c3fb27SDimitry Andric   if (!B.getMRI()->getRegClassOrNull(PCReg))
27918bcb0991SDimitry Andric     B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
27928bcb0991SDimitry Andric 
27938bcb0991SDimitry Andric   if (PtrTy.getSizeInBits() == 32)
27948bcb0991SDimitry Andric     B.buildExtract(DstReg, PCReg, 0);
27958bcb0991SDimitry Andric   return true;
27968bcb0991SDimitry Andric }
27978bcb0991SDimitry Andric 
27985f757f3fSDimitry Andric // Emit a ABS32_LO / ABS32_HI relocation stub.
27995f757f3fSDimitry Andric void AMDGPULegalizerInfo::buildAbsGlobalAddress(
28005f757f3fSDimitry Andric     Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
28015f757f3fSDimitry Andric     MachineRegisterInfo &MRI) const {
28025f757f3fSDimitry Andric   bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;
28035f757f3fSDimitry Andric 
28045f757f3fSDimitry Andric   LLT S32 = LLT::scalar(32);
28055f757f3fSDimitry Andric 
28065f757f3fSDimitry Andric   // Use the destination directly, if and only if we store the lower address
28075f757f3fSDimitry Andric   // part only and we don't have a register class being set.
28085f757f3fSDimitry Andric   Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
28095f757f3fSDimitry Andric                         ? DstReg
28105f757f3fSDimitry Andric                         : MRI.createGenericVirtualRegister(S32);
28115f757f3fSDimitry Andric 
28125f757f3fSDimitry Andric   if (!MRI.getRegClassOrNull(AddrLo))
28135f757f3fSDimitry Andric     MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
28145f757f3fSDimitry Andric 
28155f757f3fSDimitry Andric   // Write the lower half.
28165f757f3fSDimitry Andric   B.buildInstr(AMDGPU::S_MOV_B32)
28175f757f3fSDimitry Andric       .addDef(AddrLo)
28185f757f3fSDimitry Andric       .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
28195f757f3fSDimitry Andric 
28205f757f3fSDimitry Andric   // If required, write the upper half as well.
28215f757f3fSDimitry Andric   if (RequiresHighHalf) {
28225f757f3fSDimitry Andric     assert(PtrTy.getSizeInBits() == 64 &&
28235f757f3fSDimitry Andric            "Must provide a 64-bit pointer type!");
28245f757f3fSDimitry Andric 
28255f757f3fSDimitry Andric     Register AddrHi = MRI.createGenericVirtualRegister(S32);
28265f757f3fSDimitry Andric     MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
28275f757f3fSDimitry Andric 
28285f757f3fSDimitry Andric     B.buildInstr(AMDGPU::S_MOV_B32)
28295f757f3fSDimitry Andric         .addDef(AddrHi)
28305f757f3fSDimitry Andric         .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_HI);
28315f757f3fSDimitry Andric 
28325f757f3fSDimitry Andric     // Use the destination directly, if and only if we don't have a register
28335f757f3fSDimitry Andric     // class being set.
28345f757f3fSDimitry Andric     Register AddrDst = !MRI.getRegClassOrNull(DstReg)
28355f757f3fSDimitry Andric                            ? DstReg
28365f757f3fSDimitry Andric                            : MRI.createGenericVirtualRegister(LLT::scalar(64));
28375f757f3fSDimitry Andric 
28385f757f3fSDimitry Andric     if (!MRI.getRegClassOrNull(AddrDst))
28395f757f3fSDimitry Andric       MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
28405f757f3fSDimitry Andric 
28415f757f3fSDimitry Andric     B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
28425f757f3fSDimitry Andric 
28435f757f3fSDimitry Andric     // If we created a new register for the destination, cast the result into
28445f757f3fSDimitry Andric     // the final output.
28455f757f3fSDimitry Andric     if (AddrDst != DstReg)
28465f757f3fSDimitry Andric       B.buildCast(DstReg, AddrDst);
28475f757f3fSDimitry Andric   } else if (AddrLo != DstReg) {
28485f757f3fSDimitry Andric     // If we created a new register for the destination, cast the result into
28495f757f3fSDimitry Andric     // the final output.
28505f757f3fSDimitry Andric     B.buildCast(DstReg, AddrLo);
28515f757f3fSDimitry Andric   }
28525f757f3fSDimitry Andric }
28535f757f3fSDimitry Andric 
28548bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeGlobalValue(
28558bcb0991SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
28568bcb0991SDimitry Andric   MachineIRBuilder &B) const {
28578bcb0991SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
28588bcb0991SDimitry Andric   LLT Ty = MRI.getType(DstReg);
28598bcb0991SDimitry Andric   unsigned AS = Ty.getAddressSpace();
28608bcb0991SDimitry Andric 
28618bcb0991SDimitry Andric   const GlobalValue *GV = MI.getOperand(1).getGlobal();
28628bcb0991SDimitry Andric   MachineFunction &MF = B.getMF();
28638bcb0991SDimitry Andric   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
28648bcb0991SDimitry Andric 
28658bcb0991SDimitry Andric   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2866fe6060f1SDimitry Andric     if (!MFI->isModuleEntryFunction() &&
2867fe6060f1SDimitry Andric         !GV->getName().equals("llvm.amdgcn.module.lds")) {
28688bcb0991SDimitry Andric       const Function &Fn = MF.getFunction();
28698bcb0991SDimitry Andric       DiagnosticInfoUnsupported BadLDSDecl(
28705ffd83dbSDimitry Andric         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
28715ffd83dbSDimitry Andric         DS_Warning);
28728bcb0991SDimitry Andric       Fn.getContext().diagnose(BadLDSDecl);
28735ffd83dbSDimitry Andric 
28745ffd83dbSDimitry Andric       // We currently don't have a way to correctly allocate LDS objects that
28755ffd83dbSDimitry Andric       // aren't directly associated with a kernel. We do force inlining of
28765ffd83dbSDimitry Andric       // functions that use local objects. However, if these dead functions are
28775ffd83dbSDimitry Andric       // not eliminated, we don't want a compile time error. Just emit a warning
28785ffd83dbSDimitry Andric       // and a trap, since there should be no callable path here.
28795f757f3fSDimitry Andric       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>());
28805ffd83dbSDimitry Andric       B.buildUndef(DstReg);
28815ffd83dbSDimitry Andric       MI.eraseFromParent();
28825ffd83dbSDimitry Andric       return true;
28838bcb0991SDimitry Andric     }
28848bcb0991SDimitry Andric 
28858bcb0991SDimitry Andric     // TODO: We could emit code to handle the initialization somewhere.
2886349cc55cSDimitry Andric     // We ignore the initializer for now and legalize it to allow selection.
2887349cc55cSDimitry Andric     // The initializer will anyway get errored out during assembly emission.
28885ffd83dbSDimitry Andric     const SITargetLowering *TLI = ST.getTargetLowering();
28895ffd83dbSDimitry Andric     if (!TLI->shouldUseLDSConstAddress(GV)) {
28905ffd83dbSDimitry Andric       MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
28915ffd83dbSDimitry Andric       return true; // Leave in place;
28925ffd83dbSDimitry Andric     }
28935ffd83dbSDimitry Andric 
2894e8d8bef9SDimitry Andric     if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2895e8d8bef9SDimitry Andric       Type *Ty = GV->getValueType();
2896e8d8bef9SDimitry Andric       // HIP uses an unsized array `extern __shared__ T s[]` or similar
2897e8d8bef9SDimitry Andric       // zero-sized type in other languages to declare the dynamic shared
2898e8d8bef9SDimitry Andric       // memory which size is not known at the compile time. They will be
2899e8d8bef9SDimitry Andric       // allocated by the runtime and placed directly after the static
2900e8d8bef9SDimitry Andric       // allocated ones. They all share the same offset.
2901e8d8bef9SDimitry Andric       if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2902e8d8bef9SDimitry Andric         // Adjust alignment for that dynamic shared memory array.
290306c3fb27SDimitry Andric         MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV));
2904e8d8bef9SDimitry Andric         LLT S32 = LLT::scalar(32);
29055f757f3fSDimitry Andric         auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
2906e8d8bef9SDimitry Andric         B.buildIntToPtr(DstReg, Sz);
2907e8d8bef9SDimitry Andric         MI.eraseFromParent();
2908e8d8bef9SDimitry Andric         return true;
2909e8d8bef9SDimitry Andric       }
2910e8d8bef9SDimitry Andric     }
2911e8d8bef9SDimitry Andric 
2912349cc55cSDimitry Andric     B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
2913349cc55cSDimitry Andric                                                    *cast<GlobalVariable>(GV)));
29148bcb0991SDimitry Andric     MI.eraseFromParent();
29158bcb0991SDimitry Andric     return true;
29168bcb0991SDimitry Andric   }
29178bcb0991SDimitry Andric 
29185f757f3fSDimitry Andric   if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
29195f757f3fSDimitry Andric     buildAbsGlobalAddress(DstReg, Ty, B, GV, MRI);
29205f757f3fSDimitry Andric     MI.eraseFromParent();
29215f757f3fSDimitry Andric     return true;
29225f757f3fSDimitry Andric   }
29235f757f3fSDimitry Andric 
29248bcb0991SDimitry Andric   const SITargetLowering *TLI = ST.getTargetLowering();
29258bcb0991SDimitry Andric 
29268bcb0991SDimitry Andric   if (TLI->shouldEmitFixup(GV)) {
29278bcb0991SDimitry Andric     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
29288bcb0991SDimitry Andric     MI.eraseFromParent();
29298bcb0991SDimitry Andric     return true;
29308bcb0991SDimitry Andric   }
29318bcb0991SDimitry Andric 
29328bcb0991SDimitry Andric   if (TLI->shouldEmitPCReloc(GV)) {
29338bcb0991SDimitry Andric     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
29348bcb0991SDimitry Andric     MI.eraseFromParent();
29358bcb0991SDimitry Andric     return true;
29368bcb0991SDimitry Andric   }
29378bcb0991SDimitry Andric 
29388bcb0991SDimitry Andric   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
29398bcb0991SDimitry Andric   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
29408bcb0991SDimitry Andric 
2941fe6060f1SDimitry Andric   LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
29428bcb0991SDimitry Andric   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
29438bcb0991SDimitry Andric       MachinePointerInfo::getGOT(MF),
29448bcb0991SDimitry Andric       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
29458bcb0991SDimitry Andric           MachineMemOperand::MOInvariant,
2946fe6060f1SDimitry Andric       LoadTy, Align(8));
29478bcb0991SDimitry Andric 
29488bcb0991SDimitry Andric   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
29498bcb0991SDimitry Andric 
29508bcb0991SDimitry Andric   if (Ty.getSizeInBits() == 32) {
2951349cc55cSDimitry Andric     // Truncate if this is a 32-bit constant address.
29528bcb0991SDimitry Andric     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
29538bcb0991SDimitry Andric     B.buildExtract(DstReg, Load, 0);
29548bcb0991SDimitry Andric   } else
29558bcb0991SDimitry Andric     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
29568bcb0991SDimitry Andric 
29578bcb0991SDimitry Andric   MI.eraseFromParent();
29588bcb0991SDimitry Andric   return true;
29598bcb0991SDimitry Andric }
29608bcb0991SDimitry Andric 
2961e8d8bef9SDimitry Andric static LLT widenToNextPowerOf2(LLT Ty) {
2962e8d8bef9SDimitry Andric   if (Ty.isVector())
2963fe6060f1SDimitry Andric     return Ty.changeElementCount(
2964fe6060f1SDimitry Andric         ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2965e8d8bef9SDimitry Andric   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2966e8d8bef9SDimitry Andric }
2967e8d8bef9SDimitry Andric 
2968e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2969e8d8bef9SDimitry Andric                                        MachineInstr &MI) const {
2970e8d8bef9SDimitry Andric   MachineIRBuilder &B = Helper.MIRBuilder;
2971e8d8bef9SDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
2972e8d8bef9SDimitry Andric   GISelChangeObserver &Observer = Helper.Observer;
2973e8d8bef9SDimitry Andric 
2974e8d8bef9SDimitry Andric   Register PtrReg = MI.getOperand(1).getReg();
2975e8d8bef9SDimitry Andric   LLT PtrTy = MRI.getType(PtrReg);
2976e8d8bef9SDimitry Andric   unsigned AddrSpace = PtrTy.getAddressSpace();
2977e8d8bef9SDimitry Andric 
2978e8d8bef9SDimitry Andric   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
29798bcb0991SDimitry Andric     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2980e8d8bef9SDimitry Andric     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
29818bcb0991SDimitry Andric     Observer.changingInstr(MI);
29828bcb0991SDimitry Andric     MI.getOperand(1).setReg(Cast.getReg(0));
29838bcb0991SDimitry Andric     Observer.changedInstr(MI);
29848bcb0991SDimitry Andric     return true;
29858bcb0991SDimitry Andric   }
29868bcb0991SDimitry Andric 
2987fe6060f1SDimitry Andric   if (MI.getOpcode() != AMDGPU::G_LOAD)
2988fe6060f1SDimitry Andric     return false;
2989fe6060f1SDimitry Andric 
2990e8d8bef9SDimitry Andric   Register ValReg = MI.getOperand(0).getReg();
2991e8d8bef9SDimitry Andric   LLT ValTy = MRI.getType(ValReg);
2992e8d8bef9SDimitry Andric 
299306c3fb27SDimitry Andric   if (hasBufferRsrcWorkaround(ValTy)) {
299406c3fb27SDimitry Andric     Observer.changingInstr(MI);
299506c3fb27SDimitry Andric     castBufferRsrcFromV4I32(MI, B, MRI, 0);
299606c3fb27SDimitry Andric     Observer.changedInstr(MI);
299706c3fb27SDimitry Andric     return true;
299806c3fb27SDimitry Andric   }
299906c3fb27SDimitry Andric 
3000e8d8bef9SDimitry Andric   MachineMemOperand *MMO = *MI.memoperands_begin();
3001e8d8bef9SDimitry Andric   const unsigned ValSize = ValTy.getSizeInBits();
3002fe6060f1SDimitry Andric   const LLT MemTy = MMO->getMemoryType();
3003e8d8bef9SDimitry Andric   const Align MemAlign = MMO->getAlign();
3004fe6060f1SDimitry Andric   const unsigned MemSize = MemTy.getSizeInBits();
300504eeddc0SDimitry Andric   const uint64_t AlignInBits = 8 * MemAlign.value();
3006e8d8bef9SDimitry Andric 
3007e8d8bef9SDimitry Andric   // Widen non-power-of-2 loads to the alignment if needed
3008fe6060f1SDimitry Andric   if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
3009e8d8bef9SDimitry Andric     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
3010e8d8bef9SDimitry Andric 
3011e8d8bef9SDimitry Andric     // This was already the correct extending load result type, so just adjust
3012e8d8bef9SDimitry Andric     // the memory type.
3013e8d8bef9SDimitry Andric     if (WideMemSize == ValSize) {
3014e8d8bef9SDimitry Andric       MachineFunction &MF = B.getMF();
3015e8d8bef9SDimitry Andric 
3016e8d8bef9SDimitry Andric       MachineMemOperand *WideMMO =
3017e8d8bef9SDimitry Andric           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
3018e8d8bef9SDimitry Andric       Observer.changingInstr(MI);
3019e8d8bef9SDimitry Andric       MI.setMemRefs(MF, {WideMMO});
3020e8d8bef9SDimitry Andric       Observer.changedInstr(MI);
3021e8d8bef9SDimitry Andric       return true;
3022e8d8bef9SDimitry Andric     }
3023e8d8bef9SDimitry Andric 
3024e8d8bef9SDimitry Andric     // Don't bother handling edge case that should probably never be produced.
3025e8d8bef9SDimitry Andric     if (ValSize > WideMemSize)
3026e8d8bef9SDimitry Andric       return false;
3027e8d8bef9SDimitry Andric 
3028e8d8bef9SDimitry Andric     LLT WideTy = widenToNextPowerOf2(ValTy);
3029e8d8bef9SDimitry Andric 
3030e8d8bef9SDimitry Andric     Register WideLoad;
3031e8d8bef9SDimitry Andric     if (!WideTy.isVector()) {
3032e8d8bef9SDimitry Andric       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3033e8d8bef9SDimitry Andric       B.buildTrunc(ValReg, WideLoad).getReg(0);
3034e8d8bef9SDimitry Andric     } else {
3035e8d8bef9SDimitry Andric       // Extract the subvector.
3036e8d8bef9SDimitry Andric 
3037e8d8bef9SDimitry Andric       if (isRegisterType(ValTy)) {
3038e8d8bef9SDimitry Andric         // If this a case where G_EXTRACT is legal, use it.
3039e8d8bef9SDimitry Andric         // (e.g. <3 x s32> -> <4 x s32>)
3040e8d8bef9SDimitry Andric         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3041e8d8bef9SDimitry Andric         B.buildExtract(ValReg, WideLoad, 0);
3042e8d8bef9SDimitry Andric       } else {
3043e8d8bef9SDimitry Andric         // For cases where the widened type isn't a nice register value, unmerge
3044e8d8bef9SDimitry Andric         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
30450eae32dcSDimitry Andric         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
30460eae32dcSDimitry Andric         B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
3047e8d8bef9SDimitry Andric       }
3048e8d8bef9SDimitry Andric     }
3049e8d8bef9SDimitry Andric 
3050e8d8bef9SDimitry Andric     MI.eraseFromParent();
3051e8d8bef9SDimitry Andric     return true;
3052e8d8bef9SDimitry Andric   }
3053e8d8bef9SDimitry Andric 
3054e8d8bef9SDimitry Andric   return false;
3055e8d8bef9SDimitry Andric }
3056e8d8bef9SDimitry Andric 
305706c3fb27SDimitry Andric bool AMDGPULegalizerInfo::legalizeStore(LegalizerHelper &Helper,
305806c3fb27SDimitry Andric                                         MachineInstr &MI) const {
305906c3fb27SDimitry Andric   MachineIRBuilder &B = Helper.MIRBuilder;
306006c3fb27SDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
306106c3fb27SDimitry Andric   GISelChangeObserver &Observer = Helper.Observer;
306206c3fb27SDimitry Andric 
306306c3fb27SDimitry Andric   Register DataReg = MI.getOperand(0).getReg();
306406c3fb27SDimitry Andric   LLT DataTy = MRI.getType(DataReg);
306506c3fb27SDimitry Andric 
306606c3fb27SDimitry Andric   if (hasBufferRsrcWorkaround(DataTy)) {
306706c3fb27SDimitry Andric     Observer.changingInstr(MI);
306806c3fb27SDimitry Andric     castBufferRsrcArgToV4I32(MI, B, 0);
306906c3fb27SDimitry Andric     Observer.changedInstr(MI);
307006c3fb27SDimitry Andric     return true;
307106c3fb27SDimitry Andric   }
307206c3fb27SDimitry Andric   return false;
307306c3fb27SDimitry Andric }
307406c3fb27SDimitry Andric 
30758bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFMad(
30768bcb0991SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
30778bcb0991SDimitry Andric   MachineIRBuilder &B) const {
30788bcb0991SDimitry Andric   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
30798bcb0991SDimitry Andric   assert(Ty.isScalar());
30808bcb0991SDimitry Andric 
3081480093f4SDimitry Andric   MachineFunction &MF = B.getMF();
3082480093f4SDimitry Andric   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
3083480093f4SDimitry Andric 
30848bcb0991SDimitry Andric   // TODO: Always legal with future ftz flag.
30855ffd83dbSDimitry Andric   // FIXME: Do we need just output?
30865f757f3fSDimitry Andric   if (Ty == LLT::float32() &&
308706c3fb27SDimitry Andric       MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign())
30888bcb0991SDimitry Andric     return true;
30895f757f3fSDimitry Andric   if (Ty == LLT::float16() &&
309006c3fb27SDimitry Andric       MFI->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign())
30918bcb0991SDimitry Andric     return true;
30928bcb0991SDimitry Andric 
30938bcb0991SDimitry Andric   MachineIRBuilder HelperBuilder(MI);
30948bcb0991SDimitry Andric   GISelObserverWrapper DummyObserver;
30958bcb0991SDimitry Andric   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
30968bcb0991SDimitry Andric   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
30978bcb0991SDimitry Andric }
30988bcb0991SDimitry Andric 
3099480093f4SDimitry Andric bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
3100480093f4SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3101480093f4SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
3102480093f4SDimitry Andric   Register PtrReg = MI.getOperand(1).getReg();
3103480093f4SDimitry Andric   Register CmpVal = MI.getOperand(2).getReg();
3104480093f4SDimitry Andric   Register NewVal = MI.getOperand(3).getReg();
3105480093f4SDimitry Andric 
3106e8d8bef9SDimitry Andric   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
3107480093f4SDimitry Andric          "this should not have been custom lowered");
3108480093f4SDimitry Andric 
3109480093f4SDimitry Andric   LLT ValTy = MRI.getType(CmpVal);
3110fe6060f1SDimitry Andric   LLT VecTy = LLT::fixed_vector(2, ValTy);
3111480093f4SDimitry Andric 
3112480093f4SDimitry Andric   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
3113480093f4SDimitry Andric 
3114480093f4SDimitry Andric   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3115480093f4SDimitry Andric     .addDef(DstReg)
3116480093f4SDimitry Andric     .addUse(PtrReg)
3117480093f4SDimitry Andric     .addUse(PackedVal)
3118480093f4SDimitry Andric     .setMemRefs(MI.memoperands());
3119480093f4SDimitry Andric 
3120480093f4SDimitry Andric   MI.eraseFromParent();
3121480093f4SDimitry Andric   return true;
3122480093f4SDimitry Andric }
3123480093f4SDimitry Andric 
312406c3fb27SDimitry Andric /// Return true if it's known that \p Src can never be an f32 denormal value.
312506c3fb27SDimitry Andric static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI,
312606c3fb27SDimitry Andric                                        Register Src) {
31275f757f3fSDimitry Andric   const MachineInstr *DefMI = MRI.getVRegDef(Src);
31285f757f3fSDimitry Andric   switch (DefMI->getOpcode()) {
31295f757f3fSDimitry Andric   case TargetOpcode::G_INTRINSIC: {
31305f757f3fSDimitry Andric     switch (cast<GIntrinsic>(DefMI)->getIntrinsicID()) {
31315f757f3fSDimitry Andric     case Intrinsic::amdgcn_frexp_mant:
31325f757f3fSDimitry Andric       return true;
31335f757f3fSDimitry Andric     default:
31345f757f3fSDimitry Andric       break;
31355f757f3fSDimitry Andric     }
31365f757f3fSDimitry Andric 
31375f757f3fSDimitry Andric     break;
31385f757f3fSDimitry Andric   }
31395f757f3fSDimitry Andric   case TargetOpcode::G_FFREXP: {
31405f757f3fSDimitry Andric     if (DefMI->getOperand(0).getReg() == Src)
31415f757f3fSDimitry Andric       return true;
31425f757f3fSDimitry Andric     break;
31435f757f3fSDimitry Andric   }
31445f757f3fSDimitry Andric   case TargetOpcode::G_FPEXT: {
31455f757f3fSDimitry Andric     return MRI.getType(DefMI->getOperand(1).getReg()) == LLT::scalar(16);
31465f757f3fSDimitry Andric   }
31475f757f3fSDimitry Andric   default:
31485f757f3fSDimitry Andric     return false;
31495f757f3fSDimitry Andric   }
31505f757f3fSDimitry Andric 
315106c3fb27SDimitry Andric   return false;
315206c3fb27SDimitry Andric }
315306c3fb27SDimitry Andric 
315406c3fb27SDimitry Andric static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
315506c3fb27SDimitry Andric   if (Flags & MachineInstr::FmAfn)
315606c3fb27SDimitry Andric     return true;
315706c3fb27SDimitry Andric   const auto &Options = MF.getTarget().Options;
315806c3fb27SDimitry Andric   return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
315906c3fb27SDimitry Andric }
316006c3fb27SDimitry Andric 
316106c3fb27SDimitry Andric static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src,
316206c3fb27SDimitry Andric                                    unsigned Flags) {
316306c3fb27SDimitry Andric   return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src) &&
316406c3fb27SDimitry Andric          MF.getDenormalMode(APFloat::IEEEsingle()).Input !=
316506c3fb27SDimitry Andric              DenormalMode::PreserveSign;
316606c3fb27SDimitry Andric }
316706c3fb27SDimitry Andric 
316806c3fb27SDimitry Andric std::pair<Register, Register>
316906c3fb27SDimitry Andric AMDGPULegalizerInfo::getScaledLogInput(MachineIRBuilder &B, Register Src,
317006c3fb27SDimitry Andric                                        unsigned Flags) const {
31718a4dda33SDimitry Andric   if (!needsDenormHandlingF32(B.getMF(), Src, Flags))
317206c3fb27SDimitry Andric     return {};
317306c3fb27SDimitry Andric 
317406c3fb27SDimitry Andric   const LLT F32 = LLT::scalar(32);
317506c3fb27SDimitry Andric   auto SmallestNormal = B.buildFConstant(
317606c3fb27SDimitry Andric       F32, APFloat::getSmallestNormalized(APFloat::IEEEsingle()));
317706c3fb27SDimitry Andric   auto IsLtSmallestNormal =
317806c3fb27SDimitry Andric       B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal);
317906c3fb27SDimitry Andric 
318006c3fb27SDimitry Andric   auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
318106c3fb27SDimitry Andric   auto One = B.buildFConstant(F32, 1.0);
318206c3fb27SDimitry Andric   auto ScaleFactor =
318306c3fb27SDimitry Andric       B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
318406c3fb27SDimitry Andric   auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);
318506c3fb27SDimitry Andric 
318606c3fb27SDimitry Andric   return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
318706c3fb27SDimitry Andric }
318806c3fb27SDimitry Andric 
bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI,
                                        MachineIRBuilder &B) const {
  // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
  // If we have to handle denormals, scale up the input and adjust the result.

  // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
  // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Flags = MI.getFlags();

  if (Ty == LLT::scalar(16)) {
    const LLT F32 = LLT::scalar(32);
    // Nothing in half is a denormal when promoted to f32.
    // Extend to f32, use the hardware log, and truncate back.
    auto Ext = B.buildFPExt(F32, Src, Flags);
    auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
                    .addUse(Ext.getReg(0))
                    .setMIFlags(Flags);
    B.buildFPTrunc(Dst, Log2, Flags);
    MI.eraseFromParent();
    return true;
  }

  assert(Ty == LLT::scalar(32));

  // Empty result means no scaling was needed, so the hardware instruction
  // can replace G_FLOG2 directly.
  auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
  if (!ScaledInput) {
    B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
        .addUse(Src)
        .setMIFlags(Flags);
    MI.eraseFromParent();
    return true;
  }

  auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
                  .addUse(ScaledInput)
                  .setMIFlags(Flags);

  // log2(x * 2^32) == log2(x) + 32, so subtract 32 when the input was
  // scaled (select folds the condition), leaving the unscaled case alone.
  auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
  auto Zero = B.buildFConstant(Ty, 0.0);
  auto ResultOffset =
      B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
  B.buildFSub(Dst, Log2, ResultOffset, Flags);

  MI.eraseFromParent();
  return true;
}
323806c3fb27SDimitry Andric 
323906c3fb27SDimitry Andric static Register getMad(MachineIRBuilder &B, LLT Ty, Register X, Register Y,
324006c3fb27SDimitry Andric                        Register Z, unsigned Flags) {
324106c3fb27SDimitry Andric   auto FMul = B.buildFMul(Ty, X, Y, Flags);
324206c3fb27SDimitry Andric   return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);
324306c3fb27SDimitry Andric }
324406c3fb27SDimitry Andric 
// Shared expansion for G_FLOG and G_FLOG10:
//   log(x) = log2(x) * (ln(2) / ln(base))
// computed with an extended-precision constant multiply so the result is
// more accurate than a single fmul by the rounded constant.
bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
                                             MachineIRBuilder &B) const {
  const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
  assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);

  MachineRegisterInfo &MRI = *B.getMRI();
  Register Dst = MI.getOperand(0).getReg();
  Register X = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();
  const LLT Ty = MRI.getType(X);
  MachineFunction &MF = B.getMF();

  const LLT F32 = LLT::scalar(32);
  const LLT F16 = LLT::scalar(16);

  const AMDGPUTargetMachine &TM =
      static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  // f16 always takes the fast path; f32 takes it only when approximate
  // expansions are permitted (afn flag or global options).
  if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn) ||
      TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) {
    if (Ty == F16 && !ST.has16BitInsts()) {
      // No f16 instructions: promote to f32, expand there, truncate back.
      Register LogVal = MRI.createGenericVirtualRegister(F32);
      auto PromoteSrc = B.buildFPExt(F32, X);
      legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), IsLog10, Flags);
      B.buildFPTrunc(Dst, LogVal);
    } else {
      legalizeFlogUnsafe(B, Dst, X, IsLog10, Flags);
    }

    MI.eraseFromParent();
    return true;
  }

  // Scale denormal inputs into the normal range before the hardware log;
  // the 2^32 factor is compensated at the end.
  auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags);
  if (ScaledInput)
    X = ScaledInput;

  auto Y =
      B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);

  // Multiply the hardware log2 result by ln(2) (or ln(2)/ln(10)) using an
  // extended-precision constant split into a high and a correction part.
  Register R;
  if (ST.hasFastFMAF32()) {
    // c+cc are ln(2)/ln(10) to more than 49 bits
    const float c_log10 = 0x1.344134p-2f;
    const float cc_log10 = 0x1.09f79ep-26f;

    // c + cc is ln(2) to more than 49 bits
    const float c_log = 0x1.62e42ep-1f;
    const float cc_log = 0x1.efa39ep-25f;

    auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
    auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);

    // FMA0 recovers the rounding error of R = Y*C; FMA1 adds the low part
    // of the constant; the final add folds the corrections into R.
    R = B.buildFMul(Ty, Y, C, Flags).getReg(0);
    auto NegR = B.buildFNeg(Ty, R, Flags);
    auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags);
    auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags);
    R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0);
  } else {
    // Without fast FMA, split Y itself into a high part (low mantissa bits
    // masked off) and a tail, then accumulate partial products exactly.
    // ch+ct is ln(2)/ln(10) to more than 36 bits
    const float ch_log10 = 0x1.344000p-2f;
    const float ct_log10 = 0x1.3509f6p-18f;

    // ch + ct is ln(2) to more than 36 bits
    const float ch_log = 0x1.62e000p-1f;
    const float ct_log = 0x1.0bfbe8p-15f;

    auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
    auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);

    auto MaskConst = B.buildConstant(Ty, 0xfffff000);
    auto YH = B.buildAnd(Ty, Y, MaskConst);
    auto YT = B.buildFSub(Ty, Y, YH, Flags);
    auto YTCT = B.buildFMul(Ty, YT, CT, Flags);

    Register Mad0 =
        getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags);
    Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, Flags);
    R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags);
  }

  const bool IsFiniteOnly =
      (MI.getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) &&
      (MI.getFlag(MachineInstr::FmNoInfs) || TM.Options.NoInfsFPMath);

  if (!IsFiniteOnly) {
    // Expand isfinite(x) => fabs(x) < inf
    // For inf/nan inputs, pass through the raw hardware result Y instead of
    // the polynomial-corrected R.
    auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
    auto Fabs = B.buildFAbs(Ty, Y);
    auto IsFinite =
        B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
    R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);
  }

  if (ScaledInput) {
    // Undo the 2^32 input scaling: subtract 32*log10(2) or 32*log(2) when
    // the input was actually scaled.
    auto Zero = B.buildFConstant(Ty, 0.0);
    auto ShiftK =
        B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
    auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
    B.buildFSub(Dst, R, Shift, Flags);
  } else {
    B.buildCopy(Dst, R);
  }

  MI.eraseFromParent();
  return true;
}
335206c3fb27SDimitry Andric 
335306c3fb27SDimitry Andric bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst,
33548a4dda33SDimitry Andric                                              Register Src, bool IsLog10,
335506c3fb27SDimitry Andric                                              unsigned Flags) const {
33568a4dda33SDimitry Andric   const double Log2BaseInverted =
33578a4dda33SDimitry Andric       IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
33588a4dda33SDimitry Andric 
335906c3fb27SDimitry Andric   LLT Ty = B.getMRI()->getType(Dst);
33608a4dda33SDimitry Andric 
33618a4dda33SDimitry Andric   if (Ty == LLT::scalar(32)) {
33628a4dda33SDimitry Andric     auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
33638a4dda33SDimitry Andric     if (ScaledInput) {
33645f757f3fSDimitry Andric       auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
33658a4dda33SDimitry Andric                         .addUse(Src)
33668a4dda33SDimitry Andric                         .setMIFlags(Flags);
33678a4dda33SDimitry Andric       auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
33688a4dda33SDimitry Andric       auto Zero = B.buildFConstant(Ty, 0.0);
33698a4dda33SDimitry Andric       auto ResultOffset =
33708a4dda33SDimitry Andric           B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
33718a4dda33SDimitry Andric       auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);
33728a4dda33SDimitry Andric 
33738a4dda33SDimitry Andric       if (ST.hasFastFMAF32())
33748a4dda33SDimitry Andric         B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
33758a4dda33SDimitry Andric       else {
33768a4dda33SDimitry Andric         auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
33778a4dda33SDimitry Andric         B.buildFAdd(Dst, Mul, ResultOffset, Flags);
33788a4dda33SDimitry Andric       }
33798a4dda33SDimitry Andric 
33808a4dda33SDimitry Andric       return true;
33818a4dda33SDimitry Andric     }
33828a4dda33SDimitry Andric   }
33838a4dda33SDimitry Andric 
338406c3fb27SDimitry Andric   auto Log2Operand = Ty == LLT::scalar(16)
338506c3fb27SDimitry Andric                          ? B.buildFLog2(Ty, Src, Flags)
33865f757f3fSDimitry Andric                          : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
338706c3fb27SDimitry Andric                                .addUse(Src)
338806c3fb27SDimitry Andric                                .setMIFlags(Flags);
338906c3fb27SDimitry Andric   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
339006c3fb27SDimitry Andric   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
339106c3fb27SDimitry Andric   return true;
339206c3fb27SDimitry Andric }
339306c3fb27SDimitry Andric 
339406c3fb27SDimitry Andric bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI,
339506c3fb27SDimitry Andric                                         MachineIRBuilder &B) const {
339606c3fb27SDimitry Andric   // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
339706c3fb27SDimitry Andric   // If we have to handle denormals, scale up the input and adjust the result.
339806c3fb27SDimitry Andric 
339906c3fb27SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
340006c3fb27SDimitry Andric   Register Src = MI.getOperand(1).getReg();
340106c3fb27SDimitry Andric   unsigned Flags = MI.getFlags();
340206c3fb27SDimitry Andric   LLT Ty = B.getMRI()->getType(Dst);
340306c3fb27SDimitry Andric   const LLT F16 = LLT::scalar(16);
340406c3fb27SDimitry Andric   const LLT F32 = LLT::scalar(32);
340506c3fb27SDimitry Andric 
340606c3fb27SDimitry Andric   if (Ty == F16) {
340706c3fb27SDimitry Andric     // Nothing in half is a denormal when promoted to f32.
340806c3fb27SDimitry Andric     auto Ext = B.buildFPExt(F32, Src, Flags);
34095f757f3fSDimitry Andric     auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
341006c3fb27SDimitry Andric                     .addUse(Ext.getReg(0))
341106c3fb27SDimitry Andric                     .setMIFlags(Flags);
341206c3fb27SDimitry Andric     B.buildFPTrunc(Dst, Log2, Flags);
341306c3fb27SDimitry Andric     MI.eraseFromParent();
341406c3fb27SDimitry Andric     return true;
341506c3fb27SDimitry Andric   }
341606c3fb27SDimitry Andric 
341706c3fb27SDimitry Andric   assert(Ty == F32);
341806c3fb27SDimitry Andric 
34198a4dda33SDimitry Andric   if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) {
34205f757f3fSDimitry Andric     B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
342106c3fb27SDimitry Andric         .addUse(Src)
342206c3fb27SDimitry Andric         .setMIFlags(Flags);
342306c3fb27SDimitry Andric     MI.eraseFromParent();
342406c3fb27SDimitry Andric     return true;
342506c3fb27SDimitry Andric   }
342606c3fb27SDimitry Andric 
342706c3fb27SDimitry Andric   // bool needs_scaling = x < -0x1.f80000p+6f;
342806c3fb27SDimitry Andric   // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
342906c3fb27SDimitry Andric 
343006c3fb27SDimitry Andric   // -nextafter(128.0, -1)
343106c3fb27SDimitry Andric   auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
343206c3fb27SDimitry Andric   auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src,
343306c3fb27SDimitry Andric                                   RangeCheckConst, Flags);
343406c3fb27SDimitry Andric 
343506c3fb27SDimitry Andric   auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
343606c3fb27SDimitry Andric   auto Zero = B.buildFConstant(Ty, 0.0);
343706c3fb27SDimitry Andric   auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
343806c3fb27SDimitry Andric   auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);
343906c3fb27SDimitry Andric 
34405f757f3fSDimitry Andric   auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
344106c3fb27SDimitry Andric                   .addUse(AddInput.getReg(0))
344206c3fb27SDimitry Andric                   .setMIFlags(Flags);
344306c3fb27SDimitry Andric 
344406c3fb27SDimitry Andric   auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
344506c3fb27SDimitry Andric   auto One = B.buildFConstant(Ty, 1.0);
344606c3fb27SDimitry Andric   auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
344706c3fb27SDimitry Andric   B.buildFMul(Dst, Exp2, ResultScale, Flags);
344806c3fb27SDimitry Andric   MI.eraseFromParent();
344906c3fb27SDimitry Andric   return true;
345006c3fb27SDimitry Andric }
345106c3fb27SDimitry Andric 
// Fast (approximate) expansion: exp(x) = exp2(x * log2(e)).
// For f32 inputs where a denormal result is possible, the input is biased
// upward before the hardware exp2 and the result rescaled afterwards.
bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
                                             Register X, unsigned Flags) const {
  LLT Ty = B.getMRI()->getType(Dst);
  LLT F32 = LLT::scalar(32);

  if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
    // Simple case: exp2(x * log2(e)) with no denormal compensation.
    auto Log2E = B.buildFConstant(Ty, numbers::log2e);
    auto Mul = B.buildFMul(Ty, X, Log2E, Flags);

    if (Ty == F32) {
      B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
        .addUse(Mul.getReg(0))
        .setMIFlags(Flags);
    } else {
      // Non-f32 scalars use the generic G_FEXP2 lowering instead of the
      // AMDGPU intrinsic.
      B.buildFExp2(Dst, Mul.getReg(0), Flags);
    }

    return true;
  }

  // Inputs below the threshold would produce a denormal/underflowed result;
  // add 64 to such inputs before the exp so the hardware computes
  // exp(x + 64), then rescale by exp(-64)*2^-32 below.
  auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
  auto NeedsScaling =
      B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags);
  auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
  auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
  auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);

  auto Log2E = B.buildFConstant(Ty, numbers::log2e);
  auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);

  auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
    .addUse(ExpInput.getReg(0))
    .setMIFlags(Flags);

  // Compensation factor for the +64 input bias; selected only on the
  // scaled path.
  auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
  auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
  B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
  return true;
}
34915f757f3fSDimitry Andric 
34925ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
34935ffd83dbSDimitry Andric                                        MachineIRBuilder &B) const {
34945ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
349506c3fb27SDimitry Andric   Register X = MI.getOperand(1).getReg();
349606c3fb27SDimitry Andric   const unsigned Flags = MI.getFlags();
349706c3fb27SDimitry Andric   MachineFunction &MF = B.getMF();
349806c3fb27SDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
349906c3fb27SDimitry Andric   LLT Ty = MRI.getType(Dst);
350006c3fb27SDimitry Andric   const LLT F16 = LLT::scalar(16);
350106c3fb27SDimitry Andric   const LLT F32 = LLT::scalar(32);
35025f757f3fSDimitry Andric   const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
35035ffd83dbSDimitry Andric 
350406c3fb27SDimitry Andric   if (Ty == F16) {
350506c3fb27SDimitry Andric     // v_exp_f16 (fmul x, log2e)
350606c3fb27SDimitry Andric     if (allowApproxFunc(MF, Flags)) {
350706c3fb27SDimitry Andric       // TODO: Does this really require fast?
350806c3fb27SDimitry Andric       legalizeFExpUnsafe(B, Dst, X, Flags);
350906c3fb27SDimitry Andric       MI.eraseFromParent();
351006c3fb27SDimitry Andric       return true;
351106c3fb27SDimitry Andric     }
351206c3fb27SDimitry Andric 
351306c3fb27SDimitry Andric     // exp(f16 x) ->
351406c3fb27SDimitry Andric     //   fptrunc (v_exp_f32 (fmul (fpext x), log2e))
351506c3fb27SDimitry Andric 
351606c3fb27SDimitry Andric     // Nothing in half is a denormal when promoted to f32.
351706c3fb27SDimitry Andric     auto Ext = B.buildFPExt(F32, X, Flags);
351806c3fb27SDimitry Andric     Register Lowered = MRI.createGenericVirtualRegister(F32);
351906c3fb27SDimitry Andric     legalizeFExpUnsafe(B, Lowered, Ext.getReg(0), Flags);
352006c3fb27SDimitry Andric     B.buildFPTrunc(Dst, Lowered, Flags);
352106c3fb27SDimitry Andric     MI.eraseFromParent();
352206c3fb27SDimitry Andric     return true;
352306c3fb27SDimitry Andric   }
352406c3fb27SDimitry Andric 
352506c3fb27SDimitry Andric   assert(Ty == F32);
352606c3fb27SDimitry Andric 
352706c3fb27SDimitry Andric   // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
352806c3fb27SDimitry Andric   // library behavior. Also, is known-not-daz source sufficient?
35295f757f3fSDimitry Andric   if (allowApproxFunc(MF, Flags)) {
353006c3fb27SDimitry Andric     legalizeFExpUnsafe(B, Dst, X, Flags);
353106c3fb27SDimitry Andric     MI.eraseFromParent();
353206c3fb27SDimitry Andric     return true;
353306c3fb27SDimitry Andric   }
353406c3fb27SDimitry Andric 
353506c3fb27SDimitry Andric   //    Algorithm:
353606c3fb27SDimitry Andric   //
353706c3fb27SDimitry Andric   //    e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
353806c3fb27SDimitry Andric   //
353906c3fb27SDimitry Andric   //    x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
354006c3fb27SDimitry Andric   //    n = 64*m + j,   0 <= j < 64
354106c3fb27SDimitry Andric   //
354206c3fb27SDimitry Andric   //    e^x = 2^((64*m + j + f)/64)
354306c3fb27SDimitry Andric   //        = (2^m) * (2^(j/64)) * 2^(f/64)
354406c3fb27SDimitry Andric   //        = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
354506c3fb27SDimitry Andric   //
354606c3fb27SDimitry Andric   //    f = x*(64/ln(2)) - n
354706c3fb27SDimitry Andric   //    r = f*(ln(2)/64) = x - n*(ln(2)/64)
354806c3fb27SDimitry Andric   //
354906c3fb27SDimitry Andric   //    e^x = (2^m) * (2^(j/64)) * e^r
355006c3fb27SDimitry Andric   //
355106c3fb27SDimitry Andric   //    (2^(j/64)) is precomputed
355206c3fb27SDimitry Andric   //
355306c3fb27SDimitry Andric   //    e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
355406c3fb27SDimitry Andric   //    e^r = 1 + q
355506c3fb27SDimitry Andric   //
355606c3fb27SDimitry Andric   //    q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
355706c3fb27SDimitry Andric   //
355806c3fb27SDimitry Andric   //    e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
355906c3fb27SDimitry Andric   const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
356006c3fb27SDimitry Andric   Register PH, PL;
356106c3fb27SDimitry Andric 
356206c3fb27SDimitry Andric   if (ST.hasFastFMAF32()) {
356306c3fb27SDimitry Andric     const float c_exp = numbers::log2ef;
356406c3fb27SDimitry Andric     const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
356506c3fb27SDimitry Andric     const float c_exp10 = 0x1.a934f0p+1f;
356606c3fb27SDimitry Andric     const float cc_exp10 = 0x1.2f346ep-24f;
356706c3fb27SDimitry Andric 
356806c3fb27SDimitry Andric     auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
356906c3fb27SDimitry Andric     PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
357006c3fb27SDimitry Andric     auto NegPH = B.buildFNeg(Ty, PH, Flags);
357106c3fb27SDimitry Andric     auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);
357206c3fb27SDimitry Andric 
357306c3fb27SDimitry Andric     auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
357406c3fb27SDimitry Andric     PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);
357506c3fb27SDimitry Andric   } else {
357606c3fb27SDimitry Andric     const float ch_exp = 0x1.714000p+0f;
357706c3fb27SDimitry Andric     const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
357806c3fb27SDimitry Andric 
357906c3fb27SDimitry Andric     const float ch_exp10 = 0x1.a92000p+1f;
358006c3fb27SDimitry Andric     const float cl_exp10 = 0x1.4f0978p-11f;
358106c3fb27SDimitry Andric 
358206c3fb27SDimitry Andric     auto MaskConst = B.buildConstant(Ty, 0xfffff000);
358306c3fb27SDimitry Andric     auto XH = B.buildAnd(Ty, X, MaskConst);
358406c3fb27SDimitry Andric     auto XL = B.buildFSub(Ty, X, XH, Flags);
358506c3fb27SDimitry Andric 
358606c3fb27SDimitry Andric     auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
358706c3fb27SDimitry Andric     PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);
358806c3fb27SDimitry Andric 
358906c3fb27SDimitry Andric     auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
359006c3fb27SDimitry Andric     auto XLCL = B.buildFMul(Ty, XL, CL, Flags);
359106c3fb27SDimitry Andric 
359206c3fb27SDimitry Andric     Register Mad0 =
359306c3fb27SDimitry Andric         getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
359406c3fb27SDimitry Andric     PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
359506c3fb27SDimitry Andric   }
359606c3fb27SDimitry Andric 
35975f757f3fSDimitry Andric   auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);
359806c3fb27SDimitry Andric 
359906c3fb27SDimitry Andric   // It is unsafe to contract this fsub into the PH multiply.
360006c3fb27SDimitry Andric   auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
360106c3fb27SDimitry Andric   auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);
360206c3fb27SDimitry Andric   auto IntE = B.buildFPTOSI(LLT::scalar(32), E);
360306c3fb27SDimitry Andric 
36045f757f3fSDimitry Andric   auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
360506c3fb27SDimitry Andric                   .addUse(A.getReg(0))
360606c3fb27SDimitry Andric                   .setMIFlags(Flags);
360706c3fb27SDimitry Andric   auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);
360806c3fb27SDimitry Andric 
360906c3fb27SDimitry Andric   auto UnderflowCheckConst =
361006c3fb27SDimitry Andric       B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
361106c3fb27SDimitry Andric   auto Zero = B.buildFConstant(Ty, 0.0);
361206c3fb27SDimitry Andric   auto Underflow =
361306c3fb27SDimitry Andric       B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst);
361406c3fb27SDimitry Andric 
361506c3fb27SDimitry Andric   R = B.buildSelect(Ty, Underflow, Zero, R);
361606c3fb27SDimitry Andric 
361706c3fb27SDimitry Andric   const auto &Options = MF.getTarget().Options;
361806c3fb27SDimitry Andric 
361906c3fb27SDimitry Andric   if (!(Flags & MachineInstr::FmNoInfs) && !Options.NoInfsFPMath) {
362006c3fb27SDimitry Andric     auto OverflowCheckConst =
362106c3fb27SDimitry Andric         B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
362206c3fb27SDimitry Andric 
362306c3fb27SDimitry Andric     auto Overflow =
362406c3fb27SDimitry Andric         B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst);
362506c3fb27SDimitry Andric     auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
362606c3fb27SDimitry Andric     R = B.buildSelect(Ty, Overflow, Inf, R, Flags);
362706c3fb27SDimitry Andric   }
362806c3fb27SDimitry Andric 
362906c3fb27SDimitry Andric   B.buildCopy(Dst, R);
36305ffd83dbSDimitry Andric   MI.eraseFromParent();
36315ffd83dbSDimitry Andric   return true;
36325ffd83dbSDimitry Andric }
36335ffd83dbSDimitry Andric 
36345ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
36355ffd83dbSDimitry Andric                                        MachineIRBuilder &B) const {
36365ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
36375ffd83dbSDimitry Andric   Register Src0 = MI.getOperand(1).getReg();
36385ffd83dbSDimitry Andric   Register Src1 = MI.getOperand(2).getReg();
36395ffd83dbSDimitry Andric   unsigned Flags = MI.getFlags();
36405ffd83dbSDimitry Andric   LLT Ty = B.getMRI()->getType(Dst);
36415f757f3fSDimitry Andric   const LLT F16 = LLT::float16();
36425f757f3fSDimitry Andric   const LLT F32 = LLT::float32();
36435ffd83dbSDimitry Andric 
36445f757f3fSDimitry Andric   if (Ty == F32) {
36455f757f3fSDimitry Andric     auto Log = B.buildFLog2(F32, Src0, Flags);
36465f757f3fSDimitry Andric     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
36475ffd83dbSDimitry Andric                    .addUse(Log.getReg(0))
36485ffd83dbSDimitry Andric                    .addUse(Src1)
36495ffd83dbSDimitry Andric                    .setMIFlags(Flags);
36505ffd83dbSDimitry Andric     B.buildFExp2(Dst, Mul, Flags);
36515f757f3fSDimitry Andric   } else if (Ty == F16) {
36525ffd83dbSDimitry Andric     // There's no f16 fmul_legacy, so we need to convert for it.
36535f757f3fSDimitry Andric     auto Log = B.buildFLog2(F16, Src0, Flags);
36545f757f3fSDimitry Andric     auto Ext0 = B.buildFPExt(F32, Log, Flags);
36555f757f3fSDimitry Andric     auto Ext1 = B.buildFPExt(F32, Src1, Flags);
36565f757f3fSDimitry Andric     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
36575ffd83dbSDimitry Andric                    .addUse(Ext0.getReg(0))
36585ffd83dbSDimitry Andric                    .addUse(Ext1.getReg(0))
36595ffd83dbSDimitry Andric                    .setMIFlags(Flags);
36605f757f3fSDimitry Andric     B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);
36615ffd83dbSDimitry Andric   } else
36625ffd83dbSDimitry Andric     return false;
36635ffd83dbSDimitry Andric 
36645ffd83dbSDimitry Andric   MI.eraseFromParent();
36655ffd83dbSDimitry Andric   return true;
36665ffd83dbSDimitry Andric }
36675ffd83dbSDimitry Andric 
36685ffd83dbSDimitry Andric // Find a source register, ignoring any possible source modifiers.
36695ffd83dbSDimitry Andric static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
36705ffd83dbSDimitry Andric   Register ModSrc = OrigSrc;
36715ffd83dbSDimitry Andric   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
36725ffd83dbSDimitry Andric     ModSrc = SrcFNeg->getOperand(1).getReg();
36735ffd83dbSDimitry Andric     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
36745ffd83dbSDimitry Andric       ModSrc = SrcFAbs->getOperand(1).getReg();
36755ffd83dbSDimitry Andric   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
36765ffd83dbSDimitry Andric     ModSrc = SrcFAbs->getOperand(1).getReg();
36775ffd83dbSDimitry Andric   return ModSrc;
36785ffd83dbSDimitry Andric }
36795ffd83dbSDimitry Andric 
// Custom lowering for f64 G_FFLOOR on subtargets with the V_FRACT bug:
// floor(x) is rewritten as x - fract(x), where fract comes from the
// amdgcn.fract intrinsic and is clamped to work around the hardware bug.
bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {

  const LLT S1 = LLT::scalar(1);
  const LLT F64 = LLT::float64();
  Register Dst = MI.getOperand(0).getReg();
  Register OrigSrc = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();
  assert(ST.hasFractBug() && MRI.getType(Dst) == F64 &&
         "this should not have been custom lowered");

  // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
  // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
  // efficient way to implement it is using V_FRACT_F64. The workaround for the
  // V_FRACT bug is:
  //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
  //
  // Convert floor(x) to (x - fract(x))

  auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})
                   .addUse(OrigSrc)
                   .setMIFlags(Flags);

  // Give source modifier matching some assistance before obscuring a foldable
  // pattern.

  // TODO: We can avoid the neg on the fract? The input sign to fract
  // shouldn't matter?
  Register ModSrc = stripAnySourceMods(OrigSrc, MRI);

  // 0x3fefffffffffffff is the largest f64 strictly below 1.0 (the
  // 0.99999999999999999 in the formula above); clamping keeps the buggy
  // V_FRACT result inside [0, 1).
  auto Const =
      B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff));

  Register Min = MRI.createGenericVirtualRegister(F64);

  // We don't need to concern ourselves with the snan handling difference, so
  // use the one which will directly select.
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (MFI->getMode().IEEE)
    B.buildFMinNumIEEE(Min, Fract, Const, Flags);
  else
    B.buildFMinNum(Min, Fract, Const, Flags);

  Register CorrectedFract = Min;
  if (!MI.getFlag(MachineInstr::FmNoNans)) {
    // NOTE(review): FCMP_ORD %x, %x is true when %x is *not* NaN, and the
    // select below returns ModSrc (the unclamped source) when the compare is
    // true. Read literally this is the inverse of the
    // "isnan(x) ? x : min(...)" formula documented above; confirm the
    // intended predicate/operand order against the SelectionDAG lowering of
    // FFLOOR (which uses an unordered compare) and the legalize-ffloor MIR
    // tests before relying on this NaN path.
    auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
    CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);
  }

  // floor(x) = x - fract(x), emitted as x + (-fract(x)).
  auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
  B.buildFAdd(Dst, OrigSrc, NegFract, Flags);

  MI.eraseFromParent();
  return true;
}
37365ffd83dbSDimitry Andric 
37375ffd83dbSDimitry Andric // Turn an illegal packed v2s16 build vector into bit operations.
37385ffd83dbSDimitry Andric // TODO: This should probably be a bitcast action in LegalizerHelper.
37395ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeBuildVector(
37405ffd83dbSDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
37415ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
37425ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
3743bdd1243dSDimitry Andric   const LLT S16 = LLT::scalar(16);
3744fe6060f1SDimitry Andric   assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
37455ffd83dbSDimitry Andric 
37465ffd83dbSDimitry Andric   Register Src0 = MI.getOperand(1).getReg();
37475ffd83dbSDimitry Andric   Register Src1 = MI.getOperand(2).getReg();
37485ffd83dbSDimitry Andric 
3749bdd1243dSDimitry Andric   if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
3750bdd1243dSDimitry Andric     assert(MRI.getType(Src0) == S32);
3751bdd1243dSDimitry Andric     Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
3752bdd1243dSDimitry Andric     Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
3753bdd1243dSDimitry Andric   }
3754bdd1243dSDimitry Andric 
3755bdd1243dSDimitry Andric   auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
37565ffd83dbSDimitry Andric   B.buildBitcast(Dst, Merge);
37575ffd83dbSDimitry Andric 
37585ffd83dbSDimitry Andric   MI.eraseFromParent();
37595ffd83dbSDimitry Andric   return true;
37605ffd83dbSDimitry Andric }
37615ffd83dbSDimitry Andric 
// Build a big integer multiply or multiply-add using MAD_64_32 instructions.
//
// Source and accumulation registers must all be 32-bits.
//
// TODO: When the multiply is uniform, we should produce a code sequence
// that is better suited to instruction selection on the SALU. Instead of
// the outer loop going over parts of the result, the outer loop should go
// over parts of one of the factors. This should result in instruction
// selection that makes full use of S_ADDC_U32 instructions.
void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
                                        MutableArrayRef<Register> Accum,
                                        ArrayRef<Register> Src0,
                                        ArrayRef<Register> Src1,
                                        bool UsePartialMad64_32,
                                        bool SeparateOddAlignedProducts) const {
  // Use (possibly empty) vectors of S1 registers to represent the set of
  // carries from one pair of positions to the next.
  using Carry = SmallVector<Register, 2>;

  MachineIRBuilder &B = Helper.MIRBuilder;
  GISelKnownBits &KB = *Helper.getKnownBits();

  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Zero32;
  Register Zero64;

  // Lazily materialize the zero constants so each is emitted at most once,
  // and only if actually needed.
  auto getZero32 = [&]() -> Register {
    if (!Zero32)
      Zero32 = B.buildConstant(S32, 0).getReg(0);
    return Zero32;
  };
  auto getZero64 = [&]() -> Register {
    if (!Zero64)
      Zero64 = B.buildConstant(S64, 0).getReg(0);
    return Zero64;
  };

  // Precompute which 32-bit source parts are known to be zero; partial
  // products involving them are skipped in buildMadChain below.
  SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
  for (unsigned i = 0; i < Src0.size(); ++i) {
    Src0KnownZeros.push_back(KB.getKnownBits(Src0[i]).isZero());
    Src1KnownZeros.push_back(KB.getKnownBits(Src1[i]).isZero());
  }

  // Merge the given carries into the 32-bit LocalAccum, which is modified
  // in-place.
  //
  // Returns the carry-out, which is a single S1 register or null.
  auto mergeCarry =
      [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
        if (CarryIn.empty())
          return Register();

        bool HaveCarryOut = true;
        Register CarryAccum;
        if (CarryIn.size() == 1) {
          if (!LocalAccum) {
            // A single carry into an empty accumulator is just a zext; it
            // cannot itself overflow, so there is no carry-out.
            LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
            return Register();
          }

          CarryAccum = getZero32();
        } else {
          // Sum all but the last carry bit into CarryAccum first.
          CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
          for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
            CarryAccum =
                B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
                    .getReg(0);
          }

          if (!LocalAccum) {
            // With a zero accumulator the final add below cannot overflow
            // (the summed carry bits are far below 2^32), so suppress the
            // carry-out.
            LocalAccum = getZero32();
            HaveCarryOut = false;
          }
        }

        // Fold CarryAccum, LocalAccum and the final carry bit together.
        auto Add =
            B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
        LocalAccum = Add.getReg(0);
        return HaveCarryOut ? Add.getReg(1) : Register();
      };

  // Build a multiply-add chain to compute
  //
  //   LocalAccum + (partial products at DstIndex)
  //       + (opportunistic subset of CarryIn)
  //
  // LocalAccum is an array of one or two 32-bit registers that are updated
  // in-place. The incoming registers may be null.
  //
  // In some edge cases, carry-ins can be consumed "for free". In that case,
  // the consumed carry bits are removed from CarryIn in-place.
  auto buildMadChain =
      [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
          -> Carry {
        // A single-register LocalAccum is only used for the most significant
        // destination part, where the high half of products is discarded.
        assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
               (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));

        Carry CarryOut;
        unsigned j0 = 0;

        // Use plain 32-bit multiplication for the most significant part of the
        // result by default.
        if (LocalAccum.size() == 1 &&
            (!UsePartialMad64_32 || !CarryIn.empty())) {
          do {
            // Skip multiplication if one of the operands is 0
            unsigned j1 = DstIndex - j0;
            if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
              ++j0;
              continue;
            }
            auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
            if (!LocalAccum[0] || KB.getKnownBits(LocalAccum[0]).isZero()) {
              LocalAccum[0] = Mul.getReg(0);
            } else {
              if (CarryIn.empty()) {
                LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
              } else {
                // Consume one pending carry bit "for free" as part of this
                // add.
                LocalAccum[0] =
                    B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
                        .getReg(0);
                CarryIn.pop_back();
              }
            }
            ++j0;
          } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
        }

        // Build full 64-bit multiplies.
        if (j0 <= DstIndex) {
          bool HaveSmallAccum = false;
          Register Tmp;

          // Seed the 64-bit accumulator from the current 32-bit state.
          // HaveSmallAccum records that the accumulator's significant bits fit
          // in 32 bits, in which case the first MAD below cannot produce a
          // carry-out worth recording.
          if (LocalAccum[0]) {
            if (LocalAccum.size() == 1) {
              Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
              HaveSmallAccum = true;
            } else if (LocalAccum[1]) {
              Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
              HaveSmallAccum = false;
            } else {
              Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
              HaveSmallAccum = true;
            }
          } else {
            assert(LocalAccum.size() == 1 || !LocalAccum[1]);
            Tmp = getZero64();
            HaveSmallAccum = true;
          }

          do {
            unsigned j1 = DstIndex - j0;
            // Skip multiplication if one of the operands is 0.
            if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
              ++j0;
              continue;
            }
            auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
                                    {Src0[j0], Src1[j1], Tmp});
            Tmp = Mad.getReg(0);
            if (!HaveSmallAccum)
              CarryOut.push_back(Mad.getReg(1));
            HaveSmallAccum = false;

            ++j0;
          } while (j0 <= DstIndex);

          // Write the 64-bit accumulator back into the 32-bit parts.
          auto Unmerge = B.buildUnmerge(S32, Tmp);
          LocalAccum[0] = Unmerge.getReg(0);
          if (LocalAccum.size() > 1)
            LocalAccum[1] = Unmerge.getReg(1);
        }

        return CarryOut;
      };

  // Outer multiply loop, iterating over destination parts from least
  // significant to most significant parts.
  //
  // The columns of the following diagram correspond to the destination parts
  // affected by one iteration of the outer loop (ignoring boundary
  // conditions).
  //
  //   Dest index relative to 2 * i:      1 0 -1
  //                                      ------
  //   Carries from previous iteration:     e o
  //   Even-aligned partial product sum:  E E .
  //   Odd-aligned partial product sum:     O O
  //
  // 'o' is OddCarry, 'e' is EvenCarry.
  // EE and OO are computed from partial products via buildMadChain and use
  // accumulation where possible and appropriate.
  //
  Register SeparateOddCarry;
  Carry EvenCarry;
  Carry OddCarry;

  for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
    Carry OddCarryIn = std::move(OddCarry);
    Carry EvenCarryIn = std::move(EvenCarry);
    OddCarry.clear();
    EvenCarry.clear();

    // Partial products at offset 2 * i.
    if (2 * i < Accum.size()) {
      auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
      EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
    }

    // Partial products at offset 2 * i - 1.
    if (i > 0) {
      if (!SeparateOddAlignedProducts) {
        auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
        OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
      } else {
        // Compute the odd-aligned products into scratch registers and add
        // them into Accum afterwards, so that the MAD accumulators stay
        // even-aligned register pairs.
        bool IsHighest = 2 * i >= Accum.size();
        Register SeparateOddOut[2];
        auto LocalAccum = MutableArrayRef(SeparateOddOut)
                              .take_front(IsHighest ? 1 : 2);
        OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);

        MachineInstr *Lo;

        if (i == 1) {
          // First odd-aligned addition: no SeparateOddCarry exists yet, so
          // start the carry chain with uaddo (or a plain add if this is
          // already the highest part and the carry would be unused).
          if (!IsHighest)
            Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
          else
            Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
        } else {
          Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
                            SeparateOddCarry);
        }
        Accum[2 * i - 1] = Lo->getOperand(0).getReg();

        if (!IsHighest) {
          auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
                                Lo->getOperand(1).getReg());
          Accum[2 * i] = Hi.getReg(0);
          SeparateOddCarry = Hi.getReg(1);
        }
      }
    }

    // Add in the carries from the previous iteration
    if (i > 0) {
      if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
        EvenCarryIn.push_back(CarryOut);

      if (2 * i < Accum.size()) {
        if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
          OddCarry.push_back(CarryOut);
      }
    }
  }
}
401981ad6265SDimitry Andric 
402081ad6265SDimitry Andric // Custom narrowing of wide multiplies using wide multiply-add instructions.
402181ad6265SDimitry Andric //
402281ad6265SDimitry Andric // TODO: If the multiply is followed by an addition, we should attempt to
402381ad6265SDimitry Andric // integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
402481ad6265SDimitry Andric bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
402581ad6265SDimitry Andric                                       MachineInstr &MI) const {
402681ad6265SDimitry Andric   assert(ST.hasMad64_32());
402781ad6265SDimitry Andric   assert(MI.getOpcode() == TargetOpcode::G_MUL);
402881ad6265SDimitry Andric 
402981ad6265SDimitry Andric   MachineIRBuilder &B = Helper.MIRBuilder;
403081ad6265SDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
403181ad6265SDimitry Andric 
403281ad6265SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
403381ad6265SDimitry Andric   Register Src0 = MI.getOperand(1).getReg();
403481ad6265SDimitry Andric   Register Src1 = MI.getOperand(2).getReg();
403581ad6265SDimitry Andric 
403681ad6265SDimitry Andric   LLT Ty = MRI.getType(DstReg);
403781ad6265SDimitry Andric   assert(Ty.isScalar());
403881ad6265SDimitry Andric 
403981ad6265SDimitry Andric   unsigned Size = Ty.getSizeInBits();
404081ad6265SDimitry Andric   unsigned NumParts = Size / 32;
404181ad6265SDimitry Andric   assert((Size % 32) == 0);
404281ad6265SDimitry Andric   assert(NumParts >= 2);
404381ad6265SDimitry Andric 
404481ad6265SDimitry Andric   // Whether to use MAD_64_32 for partial products whose high half is
404581ad6265SDimitry Andric   // discarded. This avoids some ADD instructions but risks false dependency
404681ad6265SDimitry Andric   // stalls on some subtargets in some cases.
404781ad6265SDimitry Andric   const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
404881ad6265SDimitry Andric 
404981ad6265SDimitry Andric   // Whether to compute odd-aligned partial products separately. This is
405081ad6265SDimitry Andric   // advisable on subtargets where the accumulator of MAD_64_32 must be placed
405181ad6265SDimitry Andric   // in an even-aligned VGPR.
405281ad6265SDimitry Andric   const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
405381ad6265SDimitry Andric 
405481ad6265SDimitry Andric   LLT S32 = LLT::scalar(32);
405581ad6265SDimitry Andric   SmallVector<Register, 2> Src0Parts, Src1Parts;
405681ad6265SDimitry Andric   for (unsigned i = 0; i < NumParts; ++i) {
405781ad6265SDimitry Andric     Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
405881ad6265SDimitry Andric     Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
405981ad6265SDimitry Andric   }
406081ad6265SDimitry Andric   B.buildUnmerge(Src0Parts, Src0);
406181ad6265SDimitry Andric   B.buildUnmerge(Src1Parts, Src1);
406281ad6265SDimitry Andric 
406381ad6265SDimitry Andric   SmallVector<Register, 2> AccumRegs(NumParts);
406481ad6265SDimitry Andric   buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
406581ad6265SDimitry Andric                 SeparateOddAlignedProducts);
406681ad6265SDimitry Andric 
4067bdd1243dSDimitry Andric   B.buildMergeLikeInstr(DstReg, AccumRegs);
406881ad6265SDimitry Andric   MI.eraseFromParent();
406981ad6265SDimitry Andric   return true;
407081ad6265SDimitry Andric }
407181ad6265SDimitry Andric 
4072349cc55cSDimitry Andric // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
4073349cc55cSDimitry Andric // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
4074349cc55cSDimitry Andric // case with a single min instruction instead of a compare+select.
4075349cc55cSDimitry Andric bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
4076349cc55cSDimitry Andric                                             MachineRegisterInfo &MRI,
4077349cc55cSDimitry Andric                                             MachineIRBuilder &B) const {
4078349cc55cSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
4079349cc55cSDimitry Andric   Register Src = MI.getOperand(1).getReg();
4080349cc55cSDimitry Andric   LLT DstTy = MRI.getType(Dst);
4081349cc55cSDimitry Andric   LLT SrcTy = MRI.getType(Src);
4082349cc55cSDimitry Andric 
4083349cc55cSDimitry Andric   unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
4084349cc55cSDimitry Andric                         ? AMDGPU::G_AMDGPU_FFBH_U32
4085349cc55cSDimitry Andric                         : AMDGPU::G_AMDGPU_FFBL_B32;
4086349cc55cSDimitry Andric   auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
4087349cc55cSDimitry Andric   B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
4088349cc55cSDimitry Andric 
4089349cc55cSDimitry Andric   MI.eraseFromParent();
4090349cc55cSDimitry Andric   return true;
4091349cc55cSDimitry Andric }
4092349cc55cSDimitry Andric 
4093e8d8bef9SDimitry Andric // Check that this is a G_XOR x, -1
4094e8d8bef9SDimitry Andric static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
4095e8d8bef9SDimitry Andric   if (MI.getOpcode() != TargetOpcode::G_XOR)
4096e8d8bef9SDimitry Andric     return false;
4097349cc55cSDimitry Andric   auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
4098e8d8bef9SDimitry Andric   return ConstVal && *ConstVal == -1;
4099e8d8bef9SDimitry Andric }
4100e8d8bef9SDimitry Andric 
41010b57cec5SDimitry Andric // Return the use branch instruction, otherwise null if the usage is invalid.
4102e8d8bef9SDimitry Andric static MachineInstr *
4103e8d8bef9SDimitry Andric verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
4104e8d8bef9SDimitry Andric                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
41050b57cec5SDimitry Andric   Register CondDef = MI.getOperand(0).getReg();
41060b57cec5SDimitry Andric   if (!MRI.hasOneNonDBGUse(CondDef))
41070b57cec5SDimitry Andric     return nullptr;
41080b57cec5SDimitry Andric 
41095ffd83dbSDimitry Andric   MachineBasicBlock *Parent = MI.getParent();
4110e8d8bef9SDimitry Andric   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
4111e8d8bef9SDimitry Andric 
4112e8d8bef9SDimitry Andric   if (isNot(MRI, *UseMI)) {
4113e8d8bef9SDimitry Andric     Register NegatedCond = UseMI->getOperand(0).getReg();
4114e8d8bef9SDimitry Andric     if (!MRI.hasOneNonDBGUse(NegatedCond))
4115e8d8bef9SDimitry Andric       return nullptr;
4116e8d8bef9SDimitry Andric 
4117e8d8bef9SDimitry Andric     // We're deleting the def of this value, so we need to remove it.
4118349cc55cSDimitry Andric     eraseInstr(*UseMI, MRI);
4119e8d8bef9SDimitry Andric 
4120e8d8bef9SDimitry Andric     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
4121e8d8bef9SDimitry Andric     Negated = true;
4122e8d8bef9SDimitry Andric   }
4123e8d8bef9SDimitry Andric 
4124e8d8bef9SDimitry Andric   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
4125480093f4SDimitry Andric     return nullptr;
4126480093f4SDimitry Andric 
41275ffd83dbSDimitry Andric   // Make sure the cond br is followed by a G_BR, or is the last instruction.
4128e8d8bef9SDimitry Andric   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
41295ffd83dbSDimitry Andric   if (Next == Parent->end()) {
41305ffd83dbSDimitry Andric     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
41315ffd83dbSDimitry Andric     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
41325ffd83dbSDimitry Andric       return nullptr;
41335ffd83dbSDimitry Andric     UncondBrTarget = &*NextMBB;
41345ffd83dbSDimitry Andric   } else {
4135480093f4SDimitry Andric     if (Next->getOpcode() != AMDGPU::G_BR)
4136480093f4SDimitry Andric       return nullptr;
4137480093f4SDimitry Andric     Br = &*Next;
41385ffd83dbSDimitry Andric     UncondBrTarget = Br->getOperand(0).getMBB();
4139480093f4SDimitry Andric   }
4140480093f4SDimitry Andric 
4141e8d8bef9SDimitry Andric   return UseMI;
41420b57cec5SDimitry Andric }
41430b57cec5SDimitry Andric 
41440b57cec5SDimitry Andric bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
4145e8d8bef9SDimitry Andric                                          const ArgDescriptor *Arg,
4146e8d8bef9SDimitry Andric                                          const TargetRegisterClass *ArgRC,
4147e8d8bef9SDimitry Andric                                          LLT ArgTy) const {
4148e8d8bef9SDimitry Andric   MCRegister SrcReg = Arg->getRegister();
4149e8d8bef9SDimitry Andric   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
41505ffd83dbSDimitry Andric   assert(DstReg.isVirtual() && "Virtual register expected");
41510b57cec5SDimitry Andric 
415204eeddc0SDimitry Andric   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
415304eeddc0SDimitry Andric                                              *ArgRC, B.getDebugLoc(), ArgTy);
41540b57cec5SDimitry Andric   if (Arg->isMasked()) {
41550b57cec5SDimitry Andric     // TODO: Should we try to emit this once in the entry block?
41560b57cec5SDimitry Andric     const LLT S32 = LLT::scalar(32);
41570b57cec5SDimitry Andric     const unsigned Mask = Arg->getMask();
415806c3fb27SDimitry Andric     const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
41590b57cec5SDimitry Andric 
41608bcb0991SDimitry Andric     Register AndMaskSrc = LiveIn;
41618bcb0991SDimitry Andric 
416204eeddc0SDimitry Andric     // TODO: Avoid clearing the high bits if we know workitem id y/z are always
416304eeddc0SDimitry Andric     // 0.
41648bcb0991SDimitry Andric     if (Shift != 0) {
41650b57cec5SDimitry Andric       auto ShiftAmt = B.buildConstant(S32, Shift);
41668bcb0991SDimitry Andric       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
41678bcb0991SDimitry Andric     }
41688bcb0991SDimitry Andric 
41698bcb0991SDimitry Andric     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
41705ffd83dbSDimitry Andric   } else {
41710b57cec5SDimitry Andric     B.buildCopy(DstReg, LiveIn);
41720b57cec5SDimitry Andric   }
41730b57cec5SDimitry Andric 
41740b57cec5SDimitry Andric   return true;
41750b57cec5SDimitry Andric }
41760b57cec5SDimitry Andric 
4177e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::loadInputValue(
4178e8d8bef9SDimitry Andric     Register DstReg, MachineIRBuilder &B,
4179e8d8bef9SDimitry Andric     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4180e8d8bef9SDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4181e8d8bef9SDimitry Andric   const ArgDescriptor *Arg;
4182e8d8bef9SDimitry Andric   const TargetRegisterClass *ArgRC;
4183e8d8bef9SDimitry Andric   LLT ArgTy;
4184e8d8bef9SDimitry Andric   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4185e8d8bef9SDimitry Andric 
4186349cc55cSDimitry Andric   if (!Arg) {
4187349cc55cSDimitry Andric     if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
4188349cc55cSDimitry Andric       // The intrinsic may appear when we have a 0 sized kernarg segment, in which
4189349cc55cSDimitry Andric       // case the pointer argument may be missing and we use null.
4190349cc55cSDimitry Andric       B.buildConstant(DstReg, 0);
4191349cc55cSDimitry Andric       return true;
4192349cc55cSDimitry Andric     }
4193349cc55cSDimitry Andric 
4194349cc55cSDimitry Andric     // It's undefined behavior if a function marked with the amdgpu-no-*
4195349cc55cSDimitry Andric     // attributes uses the corresponding intrinsic.
4196349cc55cSDimitry Andric     B.buildUndef(DstReg);
4197349cc55cSDimitry Andric     return true;
4198349cc55cSDimitry Andric   }
4199349cc55cSDimitry Andric 
4200e8d8bef9SDimitry Andric   if (!Arg->isRegister() || !Arg->getRegister().isValid())
4201e8d8bef9SDimitry Andric     return false; // TODO: Handle these
4202e8d8bef9SDimitry Andric   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
4203e8d8bef9SDimitry Andric }
4204e8d8bef9SDimitry Andric 
42050b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
42065ffd83dbSDimitry Andric     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
42070b57cec5SDimitry Andric     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4208e8d8bef9SDimitry Andric   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
42095ffd83dbSDimitry Andric     return false;
42105ffd83dbSDimitry Andric 
42110b57cec5SDimitry Andric   MI.eraseFromParent();
42120b57cec5SDimitry Andric   return true;
42130b57cec5SDimitry Andric }
42140b57cec5SDimitry Andric 
421581ad6265SDimitry Andric static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
421681ad6265SDimitry Andric                                 int64_t C) {
421781ad6265SDimitry Andric   B.buildConstant(MI.getOperand(0).getReg(), C);
421881ad6265SDimitry Andric   MI.eraseFromParent();
421981ad6265SDimitry Andric   return true;
422081ad6265SDimitry Andric }
422181ad6265SDimitry Andric 
422281ad6265SDimitry Andric bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
422381ad6265SDimitry Andric     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
422481ad6265SDimitry Andric     unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
422581ad6265SDimitry Andric   unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
422681ad6265SDimitry Andric   if (MaxID == 0)
422781ad6265SDimitry Andric     return replaceWithConstant(B, MI, 0);
422881ad6265SDimitry Andric 
422981ad6265SDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
423081ad6265SDimitry Andric   const ArgDescriptor *Arg;
423181ad6265SDimitry Andric   const TargetRegisterClass *ArgRC;
423281ad6265SDimitry Andric   LLT ArgTy;
423381ad6265SDimitry Andric   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
423481ad6265SDimitry Andric 
423581ad6265SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
423681ad6265SDimitry Andric   if (!Arg) {
423781ad6265SDimitry Andric     // It's undefined behavior if a function marked with the amdgpu-no-*
423881ad6265SDimitry Andric     // attributes uses the corresponding intrinsic.
423981ad6265SDimitry Andric     B.buildUndef(DstReg);
424081ad6265SDimitry Andric     MI.eraseFromParent();
424181ad6265SDimitry Andric     return true;
424281ad6265SDimitry Andric   }
424381ad6265SDimitry Andric 
424481ad6265SDimitry Andric   if (Arg->isMasked()) {
424581ad6265SDimitry Andric     // Don't bother inserting AssertZext for packed IDs since we're emitting the
424681ad6265SDimitry Andric     // masking operations anyway.
424781ad6265SDimitry Andric     //
424881ad6265SDimitry Andric     // TODO: We could assert the top bit is 0 for the source copy.
424981ad6265SDimitry Andric     if (!loadInputValue(DstReg, B, ArgType))
425081ad6265SDimitry Andric       return false;
425181ad6265SDimitry Andric   } else {
425281ad6265SDimitry Andric     Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
425381ad6265SDimitry Andric     if (!loadInputValue(TmpReg, B, ArgType))
425481ad6265SDimitry Andric       return false;
4255bdd1243dSDimitry Andric     B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID));
425681ad6265SDimitry Andric   }
425781ad6265SDimitry Andric 
425881ad6265SDimitry Andric   MI.eraseFromParent();
425981ad6265SDimitry Andric   return true;
426081ad6265SDimitry Andric }
426181ad6265SDimitry Andric 
426281ad6265SDimitry Andric Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
426381ad6265SDimitry Andric                                                      int64_t Offset) const {
426481ad6265SDimitry Andric   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
426581ad6265SDimitry Andric   Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
426681ad6265SDimitry Andric 
426781ad6265SDimitry Andric   // TODO: If we passed in the base kernel offset we could have a better
426881ad6265SDimitry Andric   // alignment than 4, but we don't really need it.
426981ad6265SDimitry Andric   if (!loadInputValue(KernArgReg, B,
427081ad6265SDimitry Andric                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
427181ad6265SDimitry Andric     llvm_unreachable("failed to find kernarg segment ptr");
427281ad6265SDimitry Andric 
427381ad6265SDimitry Andric   auto COffset = B.buildConstant(LLT::scalar(64), Offset);
427481ad6265SDimitry Andric   // TODO: Should get nuw
427581ad6265SDimitry Andric   return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
427681ad6265SDimitry Andric }
427781ad6265SDimitry Andric 
427881ad6265SDimitry Andric /// Legalize a value that's loaded from kernel arguments. This is only used by
427981ad6265SDimitry Andric /// legacy intrinsics.
428081ad6265SDimitry Andric bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
428181ad6265SDimitry Andric                                                       MachineIRBuilder &B,
428281ad6265SDimitry Andric                                                       uint64_t Offset,
428381ad6265SDimitry Andric                                                       Align Alignment) const {
428481ad6265SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
428581ad6265SDimitry Andric 
428681ad6265SDimitry Andric   assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
428781ad6265SDimitry Andric          "unexpected kernarg parameter type");
428881ad6265SDimitry Andric 
428981ad6265SDimitry Andric   Register Ptr = getKernargParameterPtr(B, Offset);
429081ad6265SDimitry Andric   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
429181ad6265SDimitry Andric   B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
429281ad6265SDimitry Andric               MachineMemOperand::MODereferenceable |
429381ad6265SDimitry Andric                   MachineMemOperand::MOInvariant);
429481ad6265SDimitry Andric   MI.eraseFromParent();
429581ad6265SDimitry Andric   return true;
429681ad6265SDimitry Andric }
429781ad6265SDimitry Andric 
42988bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
42998bcb0991SDimitry Andric                                        MachineRegisterInfo &MRI,
43008bcb0991SDimitry Andric                                        MachineIRBuilder &B) const {
4301480093f4SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
4302480093f4SDimitry Andric   LLT DstTy = MRI.getType(Dst);
4303480093f4SDimitry Andric   LLT S16 = LLT::scalar(16);
4304480093f4SDimitry Andric   LLT S32 = LLT::scalar(32);
4305480093f4SDimitry Andric   LLT S64 = LLT::scalar(64);
43068bcb0991SDimitry Andric 
4307480093f4SDimitry Andric   if (DstTy == S16)
4308480093f4SDimitry Andric     return legalizeFDIV16(MI, MRI, B);
4309480093f4SDimitry Andric   if (DstTy == S32)
4310480093f4SDimitry Andric     return legalizeFDIV32(MI, MRI, B);
4311480093f4SDimitry Andric   if (DstTy == S64)
4312480093f4SDimitry Andric     return legalizeFDIV64(MI, MRI, B);
4313480093f4SDimitry Andric 
43148bcb0991SDimitry Andric   return false;
43158bcb0991SDimitry Andric }
43168bcb0991SDimitry Andric 
4317fe6060f1SDimitry Andric void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
4318fe6060f1SDimitry Andric                                                         Register DstDivReg,
4319fe6060f1SDimitry Andric                                                         Register DstRemReg,
43205ffd83dbSDimitry Andric                                                         Register X,
4321fe6060f1SDimitry Andric                                                         Register Y) const {
43225ffd83dbSDimitry Andric   const LLT S1 = LLT::scalar(1);
43235ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
43245ffd83dbSDimitry Andric 
43255ffd83dbSDimitry Andric   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
43265ffd83dbSDimitry Andric   // algorithm used here.
43275ffd83dbSDimitry Andric 
43285ffd83dbSDimitry Andric   // Initial estimate of inv(y).
43295ffd83dbSDimitry Andric   auto FloatY = B.buildUITOFP(S32, Y);
43305ffd83dbSDimitry Andric   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
433106c3fb27SDimitry Andric   auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
43325ffd83dbSDimitry Andric   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
43335ffd83dbSDimitry Andric   auto Z = B.buildFPTOUI(S32, ScaledY);
43345ffd83dbSDimitry Andric 
43355ffd83dbSDimitry Andric   // One round of UNR.
43365ffd83dbSDimitry Andric   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
43375ffd83dbSDimitry Andric   auto NegYZ = B.buildMul(S32, NegY, Z);
43385ffd83dbSDimitry Andric   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
43395ffd83dbSDimitry Andric 
43405ffd83dbSDimitry Andric   // Quotient/remainder estimate.
43415ffd83dbSDimitry Andric   auto Q = B.buildUMulH(S32, X, Z);
43425ffd83dbSDimitry Andric   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
43435ffd83dbSDimitry Andric 
43445ffd83dbSDimitry Andric   // First quotient/remainder refinement.
43455ffd83dbSDimitry Andric   auto One = B.buildConstant(S32, 1);
43465ffd83dbSDimitry Andric   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4347fe6060f1SDimitry Andric   if (DstDivReg)
43485ffd83dbSDimitry Andric     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
43495ffd83dbSDimitry Andric   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
43505ffd83dbSDimitry Andric 
43515ffd83dbSDimitry Andric   // Second quotient/remainder refinement.
43525ffd83dbSDimitry Andric   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4353fe6060f1SDimitry Andric   if (DstDivReg)
4354fe6060f1SDimitry Andric     B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
43555ffd83dbSDimitry Andric 
4356fe6060f1SDimitry Andric   if (DstRemReg)
4357fe6060f1SDimitry Andric     B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
43585ffd83dbSDimitry Andric }
43595ffd83dbSDimitry Andric 
4360349cc55cSDimitry Andric // Build integer reciprocal sequence around V_RCP_IFLAG_F32
43615ffd83dbSDimitry Andric //
43625ffd83dbSDimitry Andric // Return lo, hi of result
43635ffd83dbSDimitry Andric //
43645ffd83dbSDimitry Andric // %cvt.lo = G_UITOFP Val.lo
43655ffd83dbSDimitry Andric // %cvt.hi = G_UITOFP Val.hi
43665ffd83dbSDimitry Andric // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
43675ffd83dbSDimitry Andric // %rcp = G_AMDGPU_RCP_IFLAG %mad
43685ffd83dbSDimitry Andric // %mul1 = G_FMUL %rcp, 0x5f7ffffc
43695ffd83dbSDimitry Andric // %mul2 = G_FMUL %mul1, 2**(-32)
43705ffd83dbSDimitry Andric // %trunc = G_INTRINSIC_TRUNC %mul2
43715ffd83dbSDimitry Andric // %mad2 = G_FMAD %trunc, -(2**32), %mul1
43725ffd83dbSDimitry Andric // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
43735ffd83dbSDimitry Andric static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
43745ffd83dbSDimitry Andric                                                        Register Val) {
43755ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
43765ffd83dbSDimitry Andric   auto Unmerge = B.buildUnmerge(S32, Val);
43775ffd83dbSDimitry Andric 
43785ffd83dbSDimitry Andric   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
43795ffd83dbSDimitry Andric   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
43805ffd83dbSDimitry Andric 
438106c3fb27SDimitry Andric   auto Mad = B.buildFMAD(
438206c3fb27SDimitry Andric       S32, CvtHi, // 2**32
438306c3fb27SDimitry Andric       B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);
43845ffd83dbSDimitry Andric 
43855ffd83dbSDimitry Andric   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
438606c3fb27SDimitry Andric   auto Mul1 = B.buildFMul(
438706c3fb27SDimitry Andric       S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc)));
43885ffd83dbSDimitry Andric 
43895ffd83dbSDimitry Andric   // 2**(-32)
439006c3fb27SDimitry Andric   auto Mul2 = B.buildFMul(
439106c3fb27SDimitry Andric       S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000)));
43925ffd83dbSDimitry Andric   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
43935ffd83dbSDimitry Andric 
43945ffd83dbSDimitry Andric   // -(2**32)
439506c3fb27SDimitry Andric   auto Mad2 = B.buildFMAD(
439606c3fb27SDimitry Andric       S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)),
439706c3fb27SDimitry Andric       Mul1);
43985ffd83dbSDimitry Andric 
43995ffd83dbSDimitry Andric   auto ResultLo = B.buildFPTOUI(S32, Mad2);
44005ffd83dbSDimitry Andric   auto ResultHi = B.buildFPTOUI(S32, Trunc);
44015ffd83dbSDimitry Andric 
44025ffd83dbSDimitry Andric   return {ResultLo.getReg(0), ResultHi.getReg(0)};
44035ffd83dbSDimitry Andric }
44045ffd83dbSDimitry Andric 
4405fe6060f1SDimitry Andric void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
4406fe6060f1SDimitry Andric                                                         Register DstDivReg,
4407fe6060f1SDimitry Andric                                                         Register DstRemReg,
44085ffd83dbSDimitry Andric                                                         Register Numer,
4409fe6060f1SDimitry Andric                                                         Register Denom) const {
44105ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
44115ffd83dbSDimitry Andric   const LLT S64 = LLT::scalar(64);
44125ffd83dbSDimitry Andric   const LLT S1 = LLT::scalar(1);
44135ffd83dbSDimitry Andric   Register RcpLo, RcpHi;
44145ffd83dbSDimitry Andric 
44155ffd83dbSDimitry Andric   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
44165ffd83dbSDimitry Andric 
4417bdd1243dSDimitry Andric   auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});
44185ffd83dbSDimitry Andric 
44195ffd83dbSDimitry Andric   auto Zero64 = B.buildConstant(S64, 0);
44205ffd83dbSDimitry Andric   auto NegDenom = B.buildSub(S64, Zero64, Denom);
44215ffd83dbSDimitry Andric 
44225ffd83dbSDimitry Andric   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
44235ffd83dbSDimitry Andric   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
44245ffd83dbSDimitry Andric 
44255ffd83dbSDimitry Andric   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
44265ffd83dbSDimitry Andric   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
44275ffd83dbSDimitry Andric   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
44285ffd83dbSDimitry Andric 
44295ffd83dbSDimitry Andric   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
44305ffd83dbSDimitry Andric   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
4431bdd1243dSDimitry Andric   auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});
44325ffd83dbSDimitry Andric 
44335ffd83dbSDimitry Andric   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
44345ffd83dbSDimitry Andric   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
44355ffd83dbSDimitry Andric   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
44365ffd83dbSDimitry Andric   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
44375ffd83dbSDimitry Andric   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
44385ffd83dbSDimitry Andric 
44395ffd83dbSDimitry Andric   auto Zero32 = B.buildConstant(S32, 0);
44405ffd83dbSDimitry Andric   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
4441349cc55cSDimitry Andric   auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
4442bdd1243dSDimitry Andric   auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});
44435ffd83dbSDimitry Andric 
44445ffd83dbSDimitry Andric   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
44455ffd83dbSDimitry Andric   Register NumerLo = UnmergeNumer.getReg(0);
44465ffd83dbSDimitry Andric   Register NumerHi = UnmergeNumer.getReg(1);
44475ffd83dbSDimitry Andric 
44485ffd83dbSDimitry Andric   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
44495ffd83dbSDimitry Andric   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
44505ffd83dbSDimitry Andric   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
44515ffd83dbSDimitry Andric   Register Mul3_Lo = UnmergeMul3.getReg(0);
44525ffd83dbSDimitry Andric   Register Mul3_Hi = UnmergeMul3.getReg(1);
44535ffd83dbSDimitry Andric   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
44545ffd83dbSDimitry Andric   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
44555ffd83dbSDimitry Andric   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
4456bdd1243dSDimitry Andric   auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});
44575ffd83dbSDimitry Andric 
44585ffd83dbSDimitry Andric   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
44595ffd83dbSDimitry Andric   Register DenomLo = UnmergeDenom.getReg(0);
44605ffd83dbSDimitry Andric   Register DenomHi = UnmergeDenom.getReg(1);
44615ffd83dbSDimitry Andric 
44625ffd83dbSDimitry Andric   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
44635ffd83dbSDimitry Andric   auto C1 = B.buildSExt(S32, CmpHi);
44645ffd83dbSDimitry Andric 
44655ffd83dbSDimitry Andric   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
44665ffd83dbSDimitry Andric   auto C2 = B.buildSExt(S32, CmpLo);
44675ffd83dbSDimitry Andric 
44685ffd83dbSDimitry Andric   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
44695ffd83dbSDimitry Andric   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
44705ffd83dbSDimitry Andric 
44715ffd83dbSDimitry Andric   // TODO: Here and below portions of the code can be enclosed into if/endif.
44725ffd83dbSDimitry Andric   // Currently control flow is unconditional and we have 4 selects after
44735ffd83dbSDimitry Andric   // potential endif to substitute PHIs.
44745ffd83dbSDimitry Andric 
44755ffd83dbSDimitry Andric   // if C3 != 0 ...
44765ffd83dbSDimitry Andric   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
44775ffd83dbSDimitry Andric   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
44785ffd83dbSDimitry Andric   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
4479bdd1243dSDimitry Andric   auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});
44805ffd83dbSDimitry Andric 
44815ffd83dbSDimitry Andric   auto One64 = B.buildConstant(S64, 1);
44825ffd83dbSDimitry Andric   auto Add3 = B.buildAdd(S64, MulHi3, One64);
44835ffd83dbSDimitry Andric 
44845ffd83dbSDimitry Andric   auto C4 =
44855ffd83dbSDimitry Andric       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
44865ffd83dbSDimitry Andric   auto C5 =
44875ffd83dbSDimitry Andric       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
44885ffd83dbSDimitry Andric   auto C6 = B.buildSelect(
44895ffd83dbSDimitry Andric       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
44905ffd83dbSDimitry Andric 
44915ffd83dbSDimitry Andric   // if (C6 != 0)
44925ffd83dbSDimitry Andric   auto Add4 = B.buildAdd(S64, Add3, One64);
44935ffd83dbSDimitry Andric   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
44945ffd83dbSDimitry Andric 
44955ffd83dbSDimitry Andric   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
44965ffd83dbSDimitry Andric   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
4497bdd1243dSDimitry Andric   auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});
44985ffd83dbSDimitry Andric 
44995ffd83dbSDimitry Andric   // endif C6
45005ffd83dbSDimitry Andric   // endif C3
45015ffd83dbSDimitry Andric 
4502fe6060f1SDimitry Andric   if (DstDivReg) {
45035ffd83dbSDimitry Andric     auto Sel1 = B.buildSelect(
45045ffd83dbSDimitry Andric         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
4505fe6060f1SDimitry Andric     B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
4506fe6060f1SDimitry Andric                   Sel1, MulHi3);
4507fe6060f1SDimitry Andric   }
4508fe6060f1SDimitry Andric 
4509fe6060f1SDimitry Andric   if (DstRemReg) {
45105ffd83dbSDimitry Andric     auto Sel2 = B.buildSelect(
45115ffd83dbSDimitry Andric         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
4512fe6060f1SDimitry Andric     B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
4513fe6060f1SDimitry Andric                   Sel2, Sub1);
45145ffd83dbSDimitry Andric   }
45155ffd83dbSDimitry Andric }
45165ffd83dbSDimitry Andric 
4517fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
45185ffd83dbSDimitry Andric                                                   MachineRegisterInfo &MRI,
45195ffd83dbSDimitry Andric                                                   MachineIRBuilder &B) const {
4520fe6060f1SDimitry Andric   Register DstDivReg, DstRemReg;
4521fe6060f1SDimitry Andric   switch (MI.getOpcode()) {
4522fe6060f1SDimitry Andric   default:
4523fe6060f1SDimitry Andric     llvm_unreachable("Unexpected opcode!");
4524fe6060f1SDimitry Andric   case AMDGPU::G_UDIV: {
4525fe6060f1SDimitry Andric     DstDivReg = MI.getOperand(0).getReg();
4526fe6060f1SDimitry Andric     break;
4527fe6060f1SDimitry Andric   }
4528fe6060f1SDimitry Andric   case AMDGPU::G_UREM: {
4529fe6060f1SDimitry Andric     DstRemReg = MI.getOperand(0).getReg();
4530fe6060f1SDimitry Andric     break;
4531fe6060f1SDimitry Andric   }
4532fe6060f1SDimitry Andric   case AMDGPU::G_UDIVREM: {
4533fe6060f1SDimitry Andric     DstDivReg = MI.getOperand(0).getReg();
4534fe6060f1SDimitry Andric     DstRemReg = MI.getOperand(1).getReg();
4535fe6060f1SDimitry Andric     break;
4536fe6060f1SDimitry Andric   }
4537fe6060f1SDimitry Andric   }
4538fe6060f1SDimitry Andric 
45395ffd83dbSDimitry Andric   const LLT S64 = LLT::scalar(64);
45405ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
4541fe6060f1SDimitry Andric   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4542fe6060f1SDimitry Andric   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
4543fe6060f1SDimitry Andric   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
4544fe6060f1SDimitry Andric   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
45455ffd83dbSDimitry Andric 
45465ffd83dbSDimitry Andric   if (Ty == S32)
4547fe6060f1SDimitry Andric     legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
45485ffd83dbSDimitry Andric   else if (Ty == S64)
4549fe6060f1SDimitry Andric     legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
45505ffd83dbSDimitry Andric   else
45515ffd83dbSDimitry Andric     return false;
45525ffd83dbSDimitry Andric 
45535ffd83dbSDimitry Andric   MI.eraseFromParent();
45545ffd83dbSDimitry Andric   return true;
45555ffd83dbSDimitry Andric }
45565ffd83dbSDimitry Andric 
4557fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
45585ffd83dbSDimitry Andric                                                 MachineRegisterInfo &MRI,
45595ffd83dbSDimitry Andric                                                 MachineIRBuilder &B) const {
45605ffd83dbSDimitry Andric   const LLT S64 = LLT::scalar(64);
45615ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
45625ffd83dbSDimitry Andric 
4563fe6060f1SDimitry Andric   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
45645ffd83dbSDimitry Andric   if (Ty != S32 && Ty != S64)
45655ffd83dbSDimitry Andric     return false;
45665ffd83dbSDimitry Andric 
4567fe6060f1SDimitry Andric   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4568fe6060f1SDimitry Andric   Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
4569fe6060f1SDimitry Andric   Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
45705ffd83dbSDimitry Andric 
45715ffd83dbSDimitry Andric   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
45725ffd83dbSDimitry Andric   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
45735ffd83dbSDimitry Andric   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
45745ffd83dbSDimitry Andric 
45755ffd83dbSDimitry Andric   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
45765ffd83dbSDimitry Andric   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
45775ffd83dbSDimitry Andric 
45785ffd83dbSDimitry Andric   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
45795ffd83dbSDimitry Andric   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
45805ffd83dbSDimitry Andric 
4581fe6060f1SDimitry Andric   Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
4582fe6060f1SDimitry Andric   switch (MI.getOpcode()) {
4583fe6060f1SDimitry Andric   default:
4584fe6060f1SDimitry Andric     llvm_unreachable("Unexpected opcode!");
4585fe6060f1SDimitry Andric   case AMDGPU::G_SDIV: {
4586fe6060f1SDimitry Andric     DstDivReg = MI.getOperand(0).getReg();
4587fe6060f1SDimitry Andric     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4588fe6060f1SDimitry Andric     break;
4589fe6060f1SDimitry Andric   }
4590fe6060f1SDimitry Andric   case AMDGPU::G_SREM: {
4591fe6060f1SDimitry Andric     DstRemReg = MI.getOperand(0).getReg();
4592fe6060f1SDimitry Andric     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4593fe6060f1SDimitry Andric     break;
4594fe6060f1SDimitry Andric   }
4595fe6060f1SDimitry Andric   case AMDGPU::G_SDIVREM: {
4596fe6060f1SDimitry Andric     DstDivReg = MI.getOperand(0).getReg();
4597fe6060f1SDimitry Andric     DstRemReg = MI.getOperand(1).getReg();
4598fe6060f1SDimitry Andric     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4599fe6060f1SDimitry Andric     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4600fe6060f1SDimitry Andric     break;
4601fe6060f1SDimitry Andric   }
4602fe6060f1SDimitry Andric   }
4603fe6060f1SDimitry Andric 
46045ffd83dbSDimitry Andric   if (Ty == S32)
4605fe6060f1SDimitry Andric     legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
46065ffd83dbSDimitry Andric   else
4607fe6060f1SDimitry Andric     legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
46085ffd83dbSDimitry Andric 
4609fe6060f1SDimitry Andric   if (DstDivReg) {
4610fe6060f1SDimitry Andric     auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
4611fe6060f1SDimitry Andric     auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
4612fe6060f1SDimitry Andric     B.buildSub(DstDivReg, SignXor, Sign);
4613fe6060f1SDimitry Andric   }
46145ffd83dbSDimitry Andric 
4615fe6060f1SDimitry Andric   if (DstRemReg) {
4616fe6060f1SDimitry Andric     auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
4617fe6060f1SDimitry Andric     auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
4618fe6060f1SDimitry Andric     B.buildSub(DstRemReg, SignXor, Sign);
4619fe6060f1SDimitry Andric   }
46205ffd83dbSDimitry Andric 
46215ffd83dbSDimitry Andric   MI.eraseFromParent();
46225ffd83dbSDimitry Andric   return true;
46235ffd83dbSDimitry Andric }
46245ffd83dbSDimitry Andric 
// Lower G_FDIV to a reduced-accuracy rcp-based form when fast-math flags (or
// global UnsafeFPMath) permit it:
//    1.0 / x -> rcp(x)
//   -1.0 / x -> rcp(-x)
//    x / y   -> x * rcp(y)
// Returns false if the required flags are absent, in which case the caller
// must emit the full-precision expansion instead.
bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
  uint16_t Flags = MI.getFlags();
  LLT ResTy = MRI.getType(Res);

  const MachineFunction &MF = B.getMF();
  // Either the 'afn' flag on the instruction or -enable-unsafe-fp-math
  // permits the approximate reciprocal.
  bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn) ||
                            MF.getTarget().Options.UnsafeFPMath;

  // Special-case a constant numerator of +/-1.0.
  if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
    // f16 is accepted without the flags (see the accuracy note below); other
    // types need the inaccurate-rcp permission.
    if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
      return false;

    // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
    // the CI documentation has a worst case error of 1 ulp.
    // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
    // use it as long as we aren't trying to use denormals.
    //
    // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.

    // 1 / x -> RCP(x)
    if (CLHS->isExactlyValue(1.0)) {
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
          .addUse(RHS)
          .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }

    // -1 / x -> RCP( FNEG(x) )
    if (CLHS->isExactlyValue(-1.0)) {
      auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
          .addUse(FNeg.getReg(0))
          .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }
  }

  // For f16 require afn or arcp.
  // For f32 require afn.
  if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
                              !MI.getFlag(MachineInstr::FmArcp)))
    return false;

  // x / y -> x * (1.0 / y)
  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
                 .addUse(RHS)
                 .setMIFlags(Flags);
  B.buildFMul(Res, LHS, RCP, Flags);

  MI.eraseFromParent();
  return true;
}
46868bcb0991SDimitry Andric 
// Lower an f64 G_FDIV to an rcp estimate refined by Newton-Raphson FMA
// iterations. Only performed when 'afn' or UnsafeFPMath allows the
// approximate result; returns false otherwise so the precise expansion is
// used instead.
bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
                                                   MachineRegisterInfo &MRI,
                                                   MachineIRBuilder &B) const {
  Register Res = MI.getOperand(0).getReg();
  Register X = MI.getOperand(1).getReg();
  Register Y = MI.getOperand(2).getReg();
  uint16_t Flags = MI.getFlags();
  LLT ResTy = MRI.getType(Res);

  const MachineFunction &MF = B.getMF();
  bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
                            MI.getFlag(MachineInstr::FmAfn);

  if (!AllowInaccurateRcp)
    return false;

  auto NegY = B.buildFNeg(ResTy, Y);
  auto One = B.buildFConstant(ResTy, 1.0);

  // Initial estimate r ~= 1/y.
  auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
               .addUse(Y)
               .setMIFlags(Flags);

  // Two Newton-Raphson iterations: r = r + r * (1 - y * r).
  auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp0, R, R);

  auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp1, R, R);

  // ret = x * r, then correct with the residual: res = ret + r * (x - y * ret).
  auto Ret = B.buildFMul(ResTy, X, R);
  auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);

  B.buildFMA(Res, Tmp2, R, Ret);
  MI.eraseFromParent();
  return true;
}
47238bcb0991SDimitry Andric 
4724480093f4SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
4725480093f4SDimitry Andric                                          MachineRegisterInfo &MRI,
4726480093f4SDimitry Andric                                          MachineIRBuilder &B) const {
4727e8d8bef9SDimitry Andric   if (legalizeFastUnsafeFDIV(MI, MRI, B))
4728e8d8bef9SDimitry Andric     return true;
4729e8d8bef9SDimitry Andric 
4730480093f4SDimitry Andric   Register Res = MI.getOperand(0).getReg();
4731480093f4SDimitry Andric   Register LHS = MI.getOperand(1).getReg();
4732480093f4SDimitry Andric   Register RHS = MI.getOperand(2).getReg();
4733480093f4SDimitry Andric 
4734480093f4SDimitry Andric   uint16_t Flags = MI.getFlags();
4735480093f4SDimitry Andric 
4736480093f4SDimitry Andric   LLT S16 = LLT::scalar(16);
4737480093f4SDimitry Andric   LLT S32 = LLT::scalar(32);
4738480093f4SDimitry Andric 
4739480093f4SDimitry Andric   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
4740480093f4SDimitry Andric   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
4741480093f4SDimitry Andric 
47425f757f3fSDimitry Andric   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
4743480093f4SDimitry Andric                  .addUse(RHSExt.getReg(0))
4744480093f4SDimitry Andric                  .setMIFlags(Flags);
4745480093f4SDimitry Andric 
4746480093f4SDimitry Andric   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
4747480093f4SDimitry Andric   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
4748480093f4SDimitry Andric 
47495f757f3fSDimitry Andric   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
4750480093f4SDimitry Andric       .addUse(RDst.getReg(0))
4751480093f4SDimitry Andric       .addUse(RHS)
4752480093f4SDimitry Andric       .addUse(LHS)
4753480093f4SDimitry Andric       .setMIFlags(Flags);
4754480093f4SDimitry Andric 
4755480093f4SDimitry Andric   MI.eraseFromParent();
4756480093f4SDimitry Andric   return true;
4757480093f4SDimitry Andric }
4758480093f4SDimitry Andric 
// Hwreg bitfield descriptor (register id | bit offset | width-1) addressing
// the 2-bit FP32 denormal-mode field of the MODE register: ID_MODE, offset 4,
// width 2. Passed to s_setreg/s_getreg when toggling or saving the FP32
// denormal mode.
static const unsigned SPDenormModeBitField =
    AMDGPU::Hwreg::ID_MODE | (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
    (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
47625f757f3fSDimitry Andric 
4763480093f4SDimitry Andric // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
4764480093f4SDimitry Andric // to enable denorm mode. When 'Enable' is false, disable denorm mode.
476506c3fb27SDimitry Andric static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B,
4766480093f4SDimitry Andric                                const GCNSubtarget &ST,
476706c3fb27SDimitry Andric                                SIModeRegisterDefaults Mode) {
4768480093f4SDimitry Andric   // Set SP denorm mode to this value.
4769480093f4SDimitry Andric   unsigned SPDenormMode =
47705ffd83dbSDimitry Andric     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
4771480093f4SDimitry Andric 
4772480093f4SDimitry Andric   if (ST.hasDenormModeInst()) {
4773480093f4SDimitry Andric     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
47745ffd83dbSDimitry Andric     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
4775480093f4SDimitry Andric 
47765ffd83dbSDimitry Andric     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
4777480093f4SDimitry Andric     B.buildInstr(AMDGPU::S_DENORM_MODE)
4778480093f4SDimitry Andric       .addImm(NewDenormModeValue);
4779480093f4SDimitry Andric 
4780480093f4SDimitry Andric   } else {
4781480093f4SDimitry Andric     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
4782480093f4SDimitry Andric       .addImm(SPDenormMode)
4783480093f4SDimitry Andric       .addImm(SPDenormModeBitField);
4784480093f4SDimitry Andric   }
4785480093f4SDimitry Andric }
4786480093f4SDimitry Andric 
// Full-precision expansion of f32 G_FDIV using div_scale / rcp / div_fmas /
// div_fixup. If the function's FP32 mode does not already preserve denormals,
// denormal handling is temporarily enabled around the core FMA sequence (and
// the prior mode saved/restored when the mode is dynamic).
bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  // Try the cheap flag-gated rcp form first.
  if (legalizeFastUnsafeFDIV(MI, MRI, B))
    return true;

  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  SIModeRegisterDefaults Mode = MFI->getMode();

  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S32, 1.0f);

  // div_scale pre-scales the operands into range; the immediate selects
  // which operand (0 = denominator, 1 = numerator) the result tracks.
  auto DenominatorScaled =
      B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
          .addUse(LHS)
          .addUse(RHS)
          .addImm(0)
          .setMIFlags(Flags);
  auto NumeratorScaled =
      B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
          .addUse(LHS)
          .addUse(RHS)
          .addImm(1)
          .setMIFlags(Flags);

  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                       .addUse(DenominatorScaled.getReg(0))
                       .setMIFlags(Flags);
  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);

  const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE();
  const bool HasDynamicDenormals =
      (Mode.FP32Denormals.Input == DenormalMode::Dynamic) ||
      (Mode.FP32Denormals.Output == DenormalMode::Dynamic);

  // Enable FP32 denormals around the FMA chain if the current mode does not
  // already preserve them. With a dynamic denorm mode the value at this
  // point is unknown at compile time, so save it first with s_getreg.
  Register SavedSPDenormMode;
  if (!PreservesDenormals) {
    if (HasDynamicDenormals) {
      SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      B.buildInstr(AMDGPU::S_GETREG_B32)
          .addDef(SavedSPDenormMode)
          .addImm(SPDenormModeBitField);
    }
    toggleSPDenormMode(true, B, ST, Mode);
  }

  // Refine the reciprocal estimate and form the scaled quotient.
  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

  // Restore the previous denormal mode (the saved dynamic value, or the
  // function default).
  if (!PreservesDenormals) {
    if (HasDynamicDenormals) {
      assert(SavedSPDenormMode);
      B.buildInstr(AMDGPU::S_SETREG_B32)
          .addReg(SavedSPDenormMode)
          .addImm(SPDenormModeBitField);
    } else
      toggleSPDenormMode(false, B, ST, Mode);
  }

  // div_fmas combines the refined terms, using the VCC result from the
  // numerator div_scale; div_fixup handles the special-case inputs.
  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
                  .addUse(Fma4.getReg(0))
                  .addUse(Fma1.getReg(0))
                  .addUse(Fma3.getReg(0))
                  .addUse(NumeratorScaled.getReg(1))
                  .setMIFlags(Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
      .addUse(Fmas.getReg(0))
      .addUse(RHS)
      .addUse(LHS)
      .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}
4873480093f4SDimitry Andric 
// Full-precision expansion of f64 G_FDIV: div_scale both operands, refine
// rcp of the scaled denominator with FMA iterations, then combine with
// div_fmas / div_fixup.
bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  // Try the cheap flag-gated rcp form first.
  if (legalizeFastUnsafeFDIV64(MI, MRI, B))
    return true;

  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S64 = LLT::scalar(64);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S64, 1.0);

  // div_scale immediate selects which operand the result tracks
  // (0 = denominator, 1 = numerator).
  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
                       .addUse(LHS)
                       .addUse(RHS)
                       .addImm(0)
                       .setMIFlags(Flags);

  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);

  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
                 .addUse(DivScale0.getReg(0))
                 .setMIFlags(Flags);

  // Refine the reciprocal estimate: fma(-d, r, 1) gives the residual, which
  // is folded back into the estimate.
  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);

  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
                       .addUse(LHS)
                       .addUse(RHS)
                       .addImm(1)
                       .setMIFlags(Flags);

  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);

  Register Scale;
  if (!ST.hasUsableDivScaleConditionOutput()) {
    // Workaround a hardware bug on SI where the condition output from div_scale
    // is not usable.

    LLT S32 = LLT::scalar(32);

    // Recompute the condition by comparing the high halves of the operands
    // against the high halves of the div_scale results.
    auto NumUnmerge = B.buildUnmerge(S32, LHS);
    auto DenUnmerge = B.buildUnmerge(S32, RHS);
    auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
    auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);

    auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
                              Scale1Unmerge.getReg(1));
    auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
                              Scale0Unmerge.getReg(1));
    Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
  } else {
    Scale = DivScale1.getReg(1);
  }

  // div_fmas combines the refined terms under the scale condition; div_fixup
  // patches the special-case inputs using the original operands.
  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
                  .addUse(Fma4.getReg(0))
                  .addUse(Fma3.getReg(0))
                  .addUse(Mul.getReg(0))
                  .addUse(Scale)
                  .setMIFlags(Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
      .addUse(Fmas.getReg(0))
      .addUse(RHS)
      .addUse(LHS)
      .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}
4954480093f4SDimitry Andric 
495506c3fb27SDimitry Andric bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI,
495606c3fb27SDimitry Andric                                          MachineRegisterInfo &MRI,
495706c3fb27SDimitry Andric                                          MachineIRBuilder &B) const {
495806c3fb27SDimitry Andric   Register Res0 = MI.getOperand(0).getReg();
495906c3fb27SDimitry Andric   Register Res1 = MI.getOperand(1).getReg();
496006c3fb27SDimitry Andric   Register Val = MI.getOperand(2).getReg();
496106c3fb27SDimitry Andric   uint16_t Flags = MI.getFlags();
496206c3fb27SDimitry Andric 
496306c3fb27SDimitry Andric   LLT Ty = MRI.getType(Res0);
496406c3fb27SDimitry Andric   LLT InstrExpTy = Ty == LLT::scalar(16) ? LLT::scalar(16) : LLT::scalar(32);
496506c3fb27SDimitry Andric 
49665f757f3fSDimitry Andric   auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
496706c3fb27SDimitry Andric                   .addUse(Val)
496806c3fb27SDimitry Andric                   .setMIFlags(Flags);
49695f757f3fSDimitry Andric   auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
497006c3fb27SDimitry Andric                  .addUse(Val)
497106c3fb27SDimitry Andric                  .setMIFlags(Flags);
497206c3fb27SDimitry Andric 
497306c3fb27SDimitry Andric   if (ST.hasFractBug()) {
497406c3fb27SDimitry Andric     auto Fabs = B.buildFAbs(Ty, Val);
497506c3fb27SDimitry Andric     auto Inf = B.buildFConstant(Ty, APFloat::getInf(getFltSemanticForLLT(Ty)));
497606c3fb27SDimitry Andric     auto IsFinite =
497706c3fb27SDimitry Andric         B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
497806c3fb27SDimitry Andric     auto Zero = B.buildConstant(InstrExpTy, 0);
497906c3fb27SDimitry Andric     Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
498006c3fb27SDimitry Andric     Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
498106c3fb27SDimitry Andric   }
498206c3fb27SDimitry Andric 
498306c3fb27SDimitry Andric   B.buildCopy(Res0, Mant);
498406c3fb27SDimitry Andric   B.buildSExtOrTrunc(Res1, Exp);
498506c3fb27SDimitry Andric 
498606c3fb27SDimitry Andric   MI.eraseFromParent();
498706c3fb27SDimitry Andric   return true;
498806c3fb27SDimitry Andric }
498906c3fb27SDimitry Andric 
49908bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
49918bcb0991SDimitry Andric                                                  MachineRegisterInfo &MRI,
49928bcb0991SDimitry Andric                                                  MachineIRBuilder &B) const {
49938bcb0991SDimitry Andric   Register Res = MI.getOperand(0).getReg();
49948bcb0991SDimitry Andric   Register LHS = MI.getOperand(2).getReg();
49958bcb0991SDimitry Andric   Register RHS = MI.getOperand(3).getReg();
49968bcb0991SDimitry Andric   uint16_t Flags = MI.getFlags();
49978bcb0991SDimitry Andric 
49988bcb0991SDimitry Andric   LLT S32 = LLT::scalar(32);
49998bcb0991SDimitry Andric   LLT S1 = LLT::scalar(1);
50008bcb0991SDimitry Andric 
50018bcb0991SDimitry Andric   auto Abs = B.buildFAbs(S32, RHS, Flags);
50028bcb0991SDimitry Andric   const APFloat C0Val(1.0f);
50038bcb0991SDimitry Andric 
500406c3fb27SDimitry Andric   auto C0 = B.buildFConstant(S32, 0x1p+96f);
500506c3fb27SDimitry Andric   auto C1 = B.buildFConstant(S32, 0x1p-32f);
500606c3fb27SDimitry Andric   auto C2 = B.buildFConstant(S32, 1.0f);
50078bcb0991SDimitry Andric 
50088bcb0991SDimitry Andric   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
50098bcb0991SDimitry Andric   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
50108bcb0991SDimitry Andric 
50118bcb0991SDimitry Andric   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
50128bcb0991SDimitry Andric 
50135f757f3fSDimitry Andric   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
50148bcb0991SDimitry Andric                  .addUse(Mul0.getReg(0))
50158bcb0991SDimitry Andric                  .setMIFlags(Flags);
50168bcb0991SDimitry Andric 
50178bcb0991SDimitry Andric   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
50188bcb0991SDimitry Andric 
50198bcb0991SDimitry Andric   B.buildFMul(Res, Sel, Mul1, Flags);
50208bcb0991SDimitry Andric 
50218bcb0991SDimitry Andric   MI.eraseFromParent();
50228bcb0991SDimitry Andric   return true;
50238bcb0991SDimitry Andric }
50248bcb0991SDimitry Andric 
50255f757f3fSDimitry Andric bool AMDGPULegalizerInfo::legalizeFSQRTF16(MachineInstr &MI,
50265f757f3fSDimitry Andric                                            MachineRegisterInfo &MRI,
50275f757f3fSDimitry Andric                                            MachineIRBuilder &B) const {
50285f757f3fSDimitry Andric   // Bypass the correct expansion a standard promotion through G_FSQRT would
50295f757f3fSDimitry Andric   // get. The f32 op is accurate enough for the f16 cas.
50305f757f3fSDimitry Andric   unsigned Flags = MI.getFlags();
50315f757f3fSDimitry Andric   assert(!ST.has16BitInsts());
50325f757f3fSDimitry Andric   const LLT F32 = LLT::scalar(32);
50335f757f3fSDimitry Andric   auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
50345f757f3fSDimitry Andric   auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
50355f757f3fSDimitry Andric     .addUse(Ext.getReg(0))
50365f757f3fSDimitry Andric     .setMIFlags(Flags);
50375f757f3fSDimitry Andric   B.buildFPTrunc(MI.getOperand(0), Log2, Flags);
50385f757f3fSDimitry Andric   MI.eraseFromParent();
50395f757f3fSDimitry Andric   return true;
50405f757f3fSDimitry Andric }
50415f757f3fSDimitry Andric 
bool AMDGPULegalizerInfo::legalizeFSQRTF32(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           MachineIRBuilder &B) const {
  // Expand f32 G_FSQRT. With afn the hardware sqrt is used directly;
  // otherwise small inputs are pre-scaled out of the denormal range, the
  // result is refined or ulp-adjusted, and then scaled back down.
  MachineFunction &MF = B.getMF();
  Register Dst = MI.getOperand(0).getReg();
  Register X = MI.getOperand(1).getReg();
  const unsigned Flags = MI.getFlags();
  const LLT S1 = LLT::scalar(1);
  const LLT F32 = LLT::scalar(32);
  // Same LLT as F32; used where the value is operated on as integer bits.
  const LLT I32 = LLT::scalar(32);

  // Approximate results allowed: the raw hardware instruction suffices.
  if (allowApproxFunc(MF, Flags)) {
    B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({Dst}))
      .addUse(X)
      .setMIFlags(Flags);
    MI.eraseFromParent();
    return true;
  }

  // If x < 2^-96, multiply by 2^32 so intermediate computations avoid
  // denormals. Compensated by the 2^-16 scale-down at the end, since
  // sqrt(x * 2^32) = sqrt(x) * 2^16.
  auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);
  auto NeedScale = B.buildFCmp(CmpInst::FCMP_OGT, S1, ScaleThreshold, X, Flags);
  auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
  auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
  auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);

  Register SqrtS = MRI.createGenericVirtualRegister(F32);
  if (needsDenormHandlingF32(MF, X, Flags)) {
    // Use the hardware sqrt, then consider the two neighboring values
    // obtained by integer-incrementing/decrementing the f32 bit pattern
    // (one ulp up/down), and select whichever the FMA residual x - s*s
    // shows to be the correctly rounded result.
    B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({SqrtS}))
      .addUse(SqrtX.getReg(0))
      .setMIFlags(Flags);

    auto NegOne = B.buildConstant(I32, -1);
    auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);

    // Residual for the next-down candidate: x - nextdown(s) * s.
    auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
    auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);

    auto PosOne = B.buildConstant(I32, 1);
    auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);

    // Residual for the next-up candidate: x - nextup(s) * s.
    auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
    auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);

    auto Zero = B.buildFConstant(F32, 0.0f);
    auto SqrtVPLE0 = B.buildFCmp(CmpInst::FCMP_OLE, S1, SqrtVP, Zero, Flags);

    // If the down-residual is <= 0, the estimate was too high.
    SqrtS =
        B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);

    // If the up-residual is > 0, the estimate was too low.
    auto SqrtVPVSGT0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, SqrtVS, Zero, Flags);
    SqrtS =
        B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);
  } else {
    // No denormal handling required: start from rsq and refine with an
    // FMA-based Goldschmidt-style iteration (same scheme the f64 expansion
    // documents: s = x*y, h = 0.5*y, e = 0.5 - h*s, then d = x - s*s).
    auto SqrtR =
        B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
    B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);

    auto Half = B.buildFConstant(F32, 0.5f);
    auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
    auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
    auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
    SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
    SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
    auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
    auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
    SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);
  }

  // Undo the pre-scaling: sqrt(2^32) = 2^16, so multiply by 2^-16.
  auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);

  auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);

  SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);

  // sqrt(+/-0) = +/-0 and sqrt(+inf) = +inf: pass those inputs through
  // unchanged rather than using the expansion's result.
  auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
  B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);

  MI.eraseFromParent();
  return true;
}
51225f757f3fSDimitry Andric 
bool AMDGPULegalizerInfo::legalizeFSQRTF64(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           MachineIRBuilder &B) const {
  // For double type, the SQRT and RSQ instructions don't have required
  // precision, we apply Goldschmidt's algorithm to improve the result:
  //
  //   y0 = rsq(x)
  //   g0 = x * y0
  //   h0 = 0.5 * y0
  //
  //   r0 = 0.5 - h0 * g0
  //   g1 = g0 * r0 + g0
  //   h1 = h0 * r0 + h0
  //
  //   r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
  //   g2 = g1 * r1 + g1     g2 = d0 * h1 + g1
  //   h2 = h1 * r1 + h1
  //
  //   r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
  //   g3 = g2 * r2 + g2     g3 = d1 * h1 + g2
  //
  //   sqrt(x) = g3

  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT F64 = LLT::scalar(64);

  Register Dst = MI.getOperand(0).getReg();
  assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");

  Register X = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();

  // Inputs below 2^-767 are scaled up by 2^256 to keep the refinement away
  // from the denormal range; sqrt(x * 2^256) = sqrt(x) * 2^128, compensated
  // by the ldexp(-128) scale-down at the end.
  auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);

  auto ZeroInt = B.buildConstant(S32, 0);
  auto Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant);

  // Scale up input if it is too small.
  auto ScaleUpFactor = B.buildConstant(S32, 256);
  auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
  auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags);

  // y0 = rsq(x)
  auto SqrtY =
      B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX.getReg(0));

  // h0 = 0.5 * y0, g0 = x * y0
  auto Half = B.buildFConstant(F64, 0.5);
  auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
  auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);

  // r0 = 0.5 - h0 * g0
  auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
  auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);

  // g1 = g0 * r0 + g0, h1 = h0 * r0 + h0
  auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
  auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);

  // d0 = x - g1 * g1
  auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
  auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);

  // g2 = d0 * h1 + g1
  auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);

  // d1 = x - g2 * g2
  auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
  auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);

  // g3 = d1 * h1 + g2
  auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);

  // Scale down the result.
  auto ScaleDownFactor = B.buildConstant(S32, -128);
  auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
  SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags);

  // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
  // with finite only or nsz because rsq(+/-0) = +/-inf

  // TODO: Check for DAZ and expand to subnormals
  auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);

  // If x is +INF, +0, or -0, use its original value
  B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);

  MI.eraseFromParent();
  return true;
}
520606c3fb27SDimitry Andric 
52075f757f3fSDimitry Andric bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI,
52085f757f3fSDimitry Andric                                         MachineRegisterInfo &MRI,
52095f757f3fSDimitry Andric                                         MachineIRBuilder &B) const {
52105f757f3fSDimitry Andric   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
52115f757f3fSDimitry Andric   if (Ty == LLT::scalar(32))
52125f757f3fSDimitry Andric     return legalizeFSQRTF32(MI, MRI, B);
52135f757f3fSDimitry Andric   if (Ty == LLT::scalar(64))
52145f757f3fSDimitry Andric     return legalizeFSQRTF64(MI, MRI, B);
52155f757f3fSDimitry Andric   if (Ty == LLT::scalar(16))
52165f757f3fSDimitry Andric     return legalizeFSQRTF16(MI, MRI, B);
52175f757f3fSDimitry Andric   return false;
52185f757f3fSDimitry Andric }
52195f757f3fSDimitry Andric 
5220e8d8bef9SDimitry Andric // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
5221e8d8bef9SDimitry Andric // FIXME: Why do we handle this one but not other removed instructions?
5222e8d8bef9SDimitry Andric //
5223e8d8bef9SDimitry Andric // Reciprocal square root.  The clamp prevents infinite results, clamping
5224e8d8bef9SDimitry Andric // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
5225e8d8bef9SDimitry Andric // +-max_float.
5226e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
5227e8d8bef9SDimitry Andric                                                     MachineRegisterInfo &MRI,
5228e8d8bef9SDimitry Andric                                                     MachineIRBuilder &B) const {
5229e8d8bef9SDimitry Andric   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
5230e8d8bef9SDimitry Andric     return true;
5231e8d8bef9SDimitry Andric 
5232e8d8bef9SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
5233e8d8bef9SDimitry Andric   Register Src = MI.getOperand(2).getReg();
5234e8d8bef9SDimitry Andric   auto Flags = MI.getFlags();
5235e8d8bef9SDimitry Andric 
5236e8d8bef9SDimitry Andric   LLT Ty = MRI.getType(Dst);
5237e8d8bef9SDimitry Andric 
5238e8d8bef9SDimitry Andric   const fltSemantics *FltSemantics;
5239e8d8bef9SDimitry Andric   if (Ty == LLT::scalar(32))
5240e8d8bef9SDimitry Andric     FltSemantics = &APFloat::IEEEsingle();
5241e8d8bef9SDimitry Andric   else if (Ty == LLT::scalar(64))
5242e8d8bef9SDimitry Andric     FltSemantics = &APFloat::IEEEdouble();
5243e8d8bef9SDimitry Andric   else
5244e8d8bef9SDimitry Andric     return false;
5245e8d8bef9SDimitry Andric 
52465f757f3fSDimitry Andric   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
5247e8d8bef9SDimitry Andric                  .addUse(Src)
5248e8d8bef9SDimitry Andric                  .setMIFlags(Flags);
5249e8d8bef9SDimitry Andric 
5250e8d8bef9SDimitry Andric   // We don't need to concern ourselves with the snan handling difference, since
5251e8d8bef9SDimitry Andric   // the rsq quieted (or not) so use the one which will directly select.
5252e8d8bef9SDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5253e8d8bef9SDimitry Andric   const bool UseIEEE = MFI->getMode().IEEE;
5254e8d8bef9SDimitry Andric 
5255e8d8bef9SDimitry Andric   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
5256e8d8bef9SDimitry Andric   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
5257e8d8bef9SDimitry Andric                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
5258e8d8bef9SDimitry Andric 
5259e8d8bef9SDimitry Andric   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
5260e8d8bef9SDimitry Andric 
5261e8d8bef9SDimitry Andric   if (UseIEEE)
5262e8d8bef9SDimitry Andric     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
5263e8d8bef9SDimitry Andric   else
5264e8d8bef9SDimitry Andric     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
5265e8d8bef9SDimitry Andric   MI.eraseFromParent();
5266e8d8bef9SDimitry Andric   return true;
5267e8d8bef9SDimitry Andric }
5268e8d8bef9SDimitry Andric 
5269e8d8bef9SDimitry Andric static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
5270e8d8bef9SDimitry Andric   switch (IID) {
5271e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_ds_fadd:
5272e8d8bef9SDimitry Andric     return AMDGPU::G_ATOMICRMW_FADD;
5273e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_ds_fmin:
5274e8d8bef9SDimitry Andric     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
5275e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_ds_fmax:
5276e8d8bef9SDimitry Andric     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
5277e8d8bef9SDimitry Andric   default:
5278e8d8bef9SDimitry Andric     llvm_unreachable("not a DS FP intrinsic");
5279e8d8bef9SDimitry Andric   }
5280e8d8bef9SDimitry Andric }
5281e8d8bef9SDimitry Andric 
// Rewrite a ds_fadd/ds_fmin/ds_fmax intrinsic call in place into the
// corresponding generic atomic instruction, dropping the operands that were
// only needed while building the memory operand.
bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
                                                      MachineInstr &MI,
                                                      Intrinsic::ID IID) const {
  GISelChangeObserver &Observer = Helper.Observer;
  // MI is mutated in place, so bracket the change for the observer.
  Observer.changingInstr(MI);

  MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));

  // The remaining operands were used to set fields in the MemOperand on
  // construction.
  // Remove operands 6, 5, 4 (highest first so indices stay valid).
  for (int I = 6; I > 3; --I)
    MI.removeOperand(I);

  MI.removeOperand(1); // Remove the intrinsic ID.
  Observer.changedInstr(MI);
  return true;
}
5299e8d8bef9SDimitry Andric 
5300e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
5301e8d8bef9SDimitry Andric                                             MachineRegisterInfo &MRI,
5302e8d8bef9SDimitry Andric                                             MachineIRBuilder &B) const {
5303e8d8bef9SDimitry Andric   uint64_t Offset =
5304e8d8bef9SDimitry Andric     ST.getTargetLowering()->getImplicitParameterOffset(
5305e8d8bef9SDimitry Andric       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
5306e8d8bef9SDimitry Andric   LLT DstTy = MRI.getType(DstReg);
5307e8d8bef9SDimitry Andric   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
5308e8d8bef9SDimitry Andric 
5309e8d8bef9SDimitry Andric   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
5310e8d8bef9SDimitry Andric   if (!loadInputValue(KernargPtrReg, B,
5311e8d8bef9SDimitry Andric                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
5312e8d8bef9SDimitry Andric     return false;
5313e8d8bef9SDimitry Andric 
5314e8d8bef9SDimitry Andric   // FIXME: This should be nuw
5315e8d8bef9SDimitry Andric   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
5316e8d8bef9SDimitry Andric   return true;
5317e8d8bef9SDimitry Andric }
5318e8d8bef9SDimitry Andric 
/// To create a buffer resource from a 64-bit pointer, mask off the upper 32
/// bits of the pointer and replace them with the stride argument, then
/// merge_values everything together. In the common case of a raw buffer (the
/// stride component is 0), we can just AND off the upper half.
bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  Register Result = MI.getOperand(0).getReg();
  Register Pointer = MI.getOperand(2).getReg();
  Register Stride = MI.getOperand(3).getReg();
  Register NumRecords = MI.getOperand(4).getReg();
  Register Flags = MI.getOperand(5).getReg();

  LLT S32 = LLT::scalar(32);

  // Build the expansion after MI; MI itself is erased below.
  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
  auto Unmerge = B.buildUnmerge(S32, Pointer);
  Register LowHalf = Unmerge.getReg(0);
  Register HighHalf = Unmerge.getReg(1);

  // Keep only the low 16 bits of the pointer's high word; the top 16 bits of
  // the descriptor's second dword are where the stride goes (see the <<16
  // below).
  auto AndMask = B.buildConstant(S32, 0x0000ffff);
  auto Masked = B.buildAnd(S32, HighHalf, AndMask);

  MachineInstrBuilder NewHighHalf = Masked;
  std::optional<ValueAndVReg> StrideConst =
      getIConstantVRegValWithLookThrough(Stride, MRI);
  // Only OR in the stride when it may be nonzero. A known-constant stride is
  // pre-shifted into an immediate; a dynamic one is extended and shifted.
  if (!StrideConst || !StrideConst->Value.isZero()) {
    MachineInstrBuilder ShiftedStride;
    if (StrideConst) {
      uint32_t StrideVal = StrideConst->Value.getZExtValue();
      uint32_t ShiftedStrideVal = StrideVal << 16;
      ShiftedStride = B.buildConstant(S32, ShiftedStrideVal);
    } else {
      auto ExtStride = B.buildAnyExt(S32, Stride);
      auto ShiftConst = B.buildConstant(S32, 16);
      ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
    }
    NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
  }
  Register NewHighHalfReg = NewHighHalf.getReg(0);
  // Descriptor layout: { base_lo, base_hi | stride, num_records, flags }.
  B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
  MI.eraseFromParent();
  return true;
}
536206c3fb27SDimitry Andric 
53630b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
53640b57cec5SDimitry Andric                                                  MachineRegisterInfo &MRI,
53650b57cec5SDimitry Andric                                                  MachineIRBuilder &B) const {
53660b57cec5SDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
53670b57cec5SDimitry Andric   if (!MFI->isEntryFunction()) {
53680b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
53690b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
53700b57cec5SDimitry Andric   }
53710b57cec5SDimitry Andric 
53720b57cec5SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
5373e8d8bef9SDimitry Andric   if (!getImplicitArgPtr(DstReg, MRI, B))
53740b57cec5SDimitry Andric     return false;
53750b57cec5SDimitry Andric 
53760b57cec5SDimitry Andric   MI.eraseFromParent();
53770b57cec5SDimitry Andric   return true;
53780b57cec5SDimitry Andric }
53790b57cec5SDimitry Andric 
5380fcaf7f86SDimitry Andric bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg,
5381fcaf7f86SDimitry Andric                                          MachineRegisterInfo &MRI,
5382fcaf7f86SDimitry Andric                                          MachineIRBuilder &B) const {
5383fcaf7f86SDimitry Andric   Function &F = B.getMF().getFunction();
5384bdd1243dSDimitry Andric   std::optional<uint32_t> KnownSize =
5385fcaf7f86SDimitry Andric       AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
5386fcaf7f86SDimitry Andric   if (KnownSize.has_value())
5387bdd1243dSDimitry Andric     B.buildConstant(DstReg, *KnownSize);
5388fcaf7f86SDimitry Andric   return false;
5389fcaf7f86SDimitry Andric }
5390fcaf7f86SDimitry Andric 
5391fcaf7f86SDimitry Andric bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI,
5392fcaf7f86SDimitry Andric                                               MachineRegisterInfo &MRI,
5393fcaf7f86SDimitry Andric                                               MachineIRBuilder &B) const {
5394fcaf7f86SDimitry Andric 
5395fcaf7f86SDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5396fcaf7f86SDimitry Andric   if (!MFI->isEntryFunction()) {
5397fcaf7f86SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
5398fcaf7f86SDimitry Andric                                       AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
5399fcaf7f86SDimitry Andric   }
5400fcaf7f86SDimitry Andric 
5401fcaf7f86SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
5402fcaf7f86SDimitry Andric   if (!getLDSKernelId(DstReg, MRI, B))
5403fcaf7f86SDimitry Andric     return false;
5404fcaf7f86SDimitry Andric 
5405fcaf7f86SDimitry Andric   MI.eraseFromParent();
5406fcaf7f86SDimitry Andric   return true;
5407fcaf7f86SDimitry Andric }
5408fcaf7f86SDimitry Andric 
54098bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
54108bcb0991SDimitry Andric                                               MachineRegisterInfo &MRI,
54118bcb0991SDimitry Andric                                               MachineIRBuilder &B,
54128bcb0991SDimitry Andric                                               unsigned AddrSpace) const {
54138bcb0991SDimitry Andric   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
5414e8d8bef9SDimitry Andric   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
5415e8d8bef9SDimitry Andric   Register Hi32 = Unmerge.getReg(1);
5416e8d8bef9SDimitry Andric 
54178bcb0991SDimitry Andric   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
54188bcb0991SDimitry Andric   MI.eraseFromParent();
54198bcb0991SDimitry Andric   return true;
54208bcb0991SDimitry Andric }
54218bcb0991SDimitry Andric 
54225ffd83dbSDimitry Andric // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
54235ffd83dbSDimitry Andric // offset (the offset that is included in bounds checking and swizzling, to be
54245ffd83dbSDimitry Andric // split between the instruction's voffset and immoffset fields) and soffset
54255ffd83dbSDimitry Andric // (the offset that is excluded from bounds checking and swizzling, to go in
54265ffd83dbSDimitry Andric // the instruction's soffset field).  This function takes the first kind of
54275ffd83dbSDimitry Andric // offset and figures out how to split it between voffset and immoffset.
std::pair<Register, unsigned>
AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
                                        Register OrigOffset) const {
  // Largest value that fits in the instruction's immediate offset field.
  const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
  Register BaseReg;
  unsigned ImmOffset;
  const LLT S32 = LLT::scalar(32);
  MachineRegisterInfo &MRI = *B.getMRI();

  // Peel a constant addend off OrigOffset: OrigOffset = BaseReg + ImmOffset.
  std::tie(BaseReg, ImmOffset) =
      AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);

  // If BaseReg is a pointer, convert it to int.
  if (MRI.getType(BaseReg).isPointer())
    BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);

  // If the immediate value is too big for the immoffset field, put only bits
  // that would normally fit in the immoffset field. The remaining value that
  // is copied/added for the voffset field is a large power of 2, and it
  // stands more chance of being CSEd with the copy/add for another similar
  // load/store.
  // However, do not do that rounding down if that is a negative
  // number, as it appears to be illegal to have a negative offset in the
  // vgpr, even if adding the immediate offset makes it positive.
  unsigned Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    Overflow += ImmOffset;
    ImmOffset = 0;
  }

  // Fold any overflow back into the register (voffset) component.
  if (Overflow != 0) {
    if (!BaseReg) {
      BaseReg = B.buildConstant(S32, Overflow).getReg(0);
    } else {
      auto OverflowVal = B.buildConstant(S32, Overflow);
      BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
    }
  }

  // The voffset component always needs a register, even when it is zero.
  if (!BaseReg)
    BaseReg = B.buildConstant(S32, 0).getReg(0);

  // Returns {voffset register, immoffset}.
  return std::pair(BaseReg, ImmOffset);
}
5473fe6060f1SDimitry Andric 
/// Handle register layout difference for f16 images for some subtargets.
///
/// \p Reg holds a <N x s16> store source. Returns a register holding the
/// value repacked into the layout the subtarget's instructions expect, or
/// \p Reg itself when no repacking is needed.
Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
                                             MachineRegisterInfo &MRI,
                                             Register Reg,
                                             bool ImageStore) const {
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  LLT StoreVT = MRI.getType(Reg);
  assert(StoreVT.isVector() && StoreVT.getElementType() == S16);

  // Unpacked-d16 subtargets carry one 16-bit element per 32-bit register:
  // widen each element individually with anyext.
  if (ST.hasUnpackedD16VMem()) {
    auto Unmerge = B.buildUnmerge(S16, Reg);

    SmallVector<Register, 4> WideRegs;
    for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
      WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));

    int NumElts = StoreVT.getNumElements();

    return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
        .getReg(0);
  }

  // Workaround for the image-store d16 bug: present the packed data as a
  // vector of s32 lanes, padding any unused lanes with undef.
  if (ImageStore && ST.hasImageStoreD16Bug()) {
    // v2s16 -> one s32 lane plus an undef lane.
    if (StoreVT.getNumElements() == 2) {
      SmallVector<Register, 4> PackedRegs;
      Reg = B.buildBitcast(S32, Reg).getReg(0);
      PackedRegs.push_back(Reg);
      PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
      return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
          .getReg(0);
    }

    // v3s16 -> pad to v6s16 with undef, then view as v3s32.
    if (StoreVT.getNumElements() == 3) {
      SmallVector<Register, 4> PackedRegs;
      auto Unmerge = B.buildUnmerge(S16, Reg);
      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
        PackedRegs.push_back(Unmerge.getReg(I));
      PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
      Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
      return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
    }

    // v4s16 -> view as v2s32 and pad with two undef s32 lanes.
    if (StoreVT.getNumElements() == 4) {
      SmallVector<Register, 4> PackedRegs;
      Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
      auto Unmerge = B.buildUnmerge(S32, Reg);
      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
        PackedRegs.push_back(Unmerge.getReg(I));
      PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
      return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
          .getReg(0);
    }

    llvm_unreachable("invalid data type");
  }

  // Odd-sized v3s16 is padded out to v4s16 with an undef element.
  if (StoreVT == LLT::fixed_vector(3, S16)) {
    Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
              .getReg(0);
  }
  return Reg;
}
5537e8d8bef9SDimitry Andric 
55385ffd83dbSDimitry Andric Register AMDGPULegalizerInfo::fixStoreSourceType(
55395ffd83dbSDimitry Andric   MachineIRBuilder &B, Register VData, bool IsFormat) const {
55405ffd83dbSDimitry Andric   MachineRegisterInfo *MRI = B.getMRI();
55415ffd83dbSDimitry Andric   LLT Ty = MRI->getType(VData);
55428bcb0991SDimitry Andric 
55438bcb0991SDimitry Andric   const LLT S16 = LLT::scalar(16);
55448bcb0991SDimitry Andric 
554506c3fb27SDimitry Andric   // Fixup buffer resources themselves needing to be v4i128.
554606c3fb27SDimitry Andric   if (hasBufferRsrcWorkaround(Ty))
554706c3fb27SDimitry Andric     return castBufferRsrcToV4I32(VData, B);
554806c3fb27SDimitry Andric 
55498bcb0991SDimitry Andric   // Fixup illegal register types for i8 stores.
55508bcb0991SDimitry Andric   if (Ty == LLT::scalar(8) || Ty == S16) {
55518bcb0991SDimitry Andric     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
55525ffd83dbSDimitry Andric     return AnyExt;
55538bcb0991SDimitry Andric   }
55548bcb0991SDimitry Andric 
55558bcb0991SDimitry Andric   if (Ty.isVector()) {
55568bcb0991SDimitry Andric     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
55578bcb0991SDimitry Andric       if (IsFormat)
55585ffd83dbSDimitry Andric         return handleD16VData(B, *MRI, VData);
55595ffd83dbSDimitry Andric     }
55605ffd83dbSDimitry Andric   }
55615ffd83dbSDimitry Andric 
55625ffd83dbSDimitry Andric   return VData;
55635ffd83dbSDimitry Andric }
55645ffd83dbSDimitry Andric 
55655ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
55665ffd83dbSDimitry Andric                                               MachineRegisterInfo &MRI,
55675ffd83dbSDimitry Andric                                               MachineIRBuilder &B,
55685ffd83dbSDimitry Andric                                               bool IsTyped,
55695ffd83dbSDimitry Andric                                               bool IsFormat) const {
55705ffd83dbSDimitry Andric   Register VData = MI.getOperand(1).getReg();
55715ffd83dbSDimitry Andric   LLT Ty = MRI.getType(VData);
55725ffd83dbSDimitry Andric   LLT EltTy = Ty.getScalarType();
55735ffd83dbSDimitry Andric   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
55745ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
55755ffd83dbSDimitry Andric 
55765ffd83dbSDimitry Andric   VData = fixStoreSourceType(B, VData, IsFormat);
557706c3fb27SDimitry Andric   castBufferRsrcArgToV4I32(MI, B, 2);
55785ffd83dbSDimitry Andric   Register RSrc = MI.getOperand(2).getReg();
55795ffd83dbSDimitry Andric 
55805ffd83dbSDimitry Andric   MachineMemOperand *MMO = *MI.memoperands_begin();
55815ffd83dbSDimitry Andric   const int MemSize = MMO->getSize();
55825ffd83dbSDimitry Andric 
55835ffd83dbSDimitry Andric   unsigned ImmOffset;
55845ffd83dbSDimitry Andric 
55855ffd83dbSDimitry Andric   // The typed intrinsics add an immediate after the registers.
55865ffd83dbSDimitry Andric   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
55875ffd83dbSDimitry Andric 
55885ffd83dbSDimitry Andric   // The struct intrinsic variants add one additional operand over raw.
55895ffd83dbSDimitry Andric   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
55905ffd83dbSDimitry Andric   Register VIndex;
55915ffd83dbSDimitry Andric   int OpOffset = 0;
55925ffd83dbSDimitry Andric   if (HasVIndex) {
55935ffd83dbSDimitry Andric     VIndex = MI.getOperand(3).getReg();
55945ffd83dbSDimitry Andric     OpOffset = 1;
5595fe6060f1SDimitry Andric   } else {
5596fe6060f1SDimitry Andric     VIndex = B.buildConstant(S32, 0).getReg(0);
55975ffd83dbSDimitry Andric   }
55985ffd83dbSDimitry Andric 
55995ffd83dbSDimitry Andric   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
56005ffd83dbSDimitry Andric   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
56015ffd83dbSDimitry Andric 
56025ffd83dbSDimitry Andric   unsigned Format = 0;
56035ffd83dbSDimitry Andric   if (IsTyped) {
56045ffd83dbSDimitry Andric     Format = MI.getOperand(5 + OpOffset).getImm();
56055ffd83dbSDimitry Andric     ++OpOffset;
56065ffd83dbSDimitry Andric   }
56075ffd83dbSDimitry Andric 
56085ffd83dbSDimitry Andric   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
56095ffd83dbSDimitry Andric 
5610fe6060f1SDimitry Andric   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
56115ffd83dbSDimitry Andric 
56125ffd83dbSDimitry Andric   unsigned Opc;
56135ffd83dbSDimitry Andric   if (IsTyped) {
56145ffd83dbSDimitry Andric     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
56155ffd83dbSDimitry Andric                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
56165ffd83dbSDimitry Andric   } else if (IsFormat) {
56175ffd83dbSDimitry Andric     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
56185ffd83dbSDimitry Andric                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
56195ffd83dbSDimitry Andric   } else {
56205ffd83dbSDimitry Andric     switch (MemSize) {
56215ffd83dbSDimitry Andric     case 1:
56225ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
56235ffd83dbSDimitry Andric       break;
56245ffd83dbSDimitry Andric     case 2:
56255ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
56265ffd83dbSDimitry Andric       break;
56275ffd83dbSDimitry Andric     default:
56285ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
56295ffd83dbSDimitry Andric       break;
56305ffd83dbSDimitry Andric     }
56315ffd83dbSDimitry Andric   }
56325ffd83dbSDimitry Andric 
56335ffd83dbSDimitry Andric   auto MIB = B.buildInstr(Opc)
56345ffd83dbSDimitry Andric     .addUse(VData)              // vdata
56355ffd83dbSDimitry Andric     .addUse(RSrc)               // rsrc
56365ffd83dbSDimitry Andric     .addUse(VIndex)             // vindex
56375ffd83dbSDimitry Andric     .addUse(VOffset)            // voffset
56385ffd83dbSDimitry Andric     .addUse(SOffset)            // soffset
56395ffd83dbSDimitry Andric     .addImm(ImmOffset);         // offset(imm)
56405ffd83dbSDimitry Andric 
56415ffd83dbSDimitry Andric   if (IsTyped)
56425ffd83dbSDimitry Andric     MIB.addImm(Format);
56435ffd83dbSDimitry Andric 
56445ffd83dbSDimitry Andric   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
56455ffd83dbSDimitry Andric      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
56465ffd83dbSDimitry Andric      .addMemOperand(MMO);
56475ffd83dbSDimitry Andric 
56485ffd83dbSDimitry Andric   MI.eraseFromParent();
56498bcb0991SDimitry Andric   return true;
56508bcb0991SDimitry Andric }
56518bcb0991SDimitry Andric 
5652bdd1243dSDimitry Andric static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
5653bdd1243dSDimitry Andric                             Register VIndex, Register VOffset, Register SOffset,
5654bdd1243dSDimitry Andric                             unsigned ImmOffset, unsigned Format,
5655bdd1243dSDimitry Andric                             unsigned AuxiliaryData, MachineMemOperand *MMO,
5656bdd1243dSDimitry Andric                             bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
5657bdd1243dSDimitry Andric   auto MIB = B.buildInstr(Opc)
5658bdd1243dSDimitry Andric                  .addDef(LoadDstReg) // vdata
5659bdd1243dSDimitry Andric                  .addUse(RSrc)       // rsrc
5660bdd1243dSDimitry Andric                  .addUse(VIndex)     // vindex
5661bdd1243dSDimitry Andric                  .addUse(VOffset)    // voffset
5662bdd1243dSDimitry Andric                  .addUse(SOffset)    // soffset
5663bdd1243dSDimitry Andric                  .addImm(ImmOffset); // offset(imm)
5664bdd1243dSDimitry Andric 
5665bdd1243dSDimitry Andric   if (IsTyped)
5666bdd1243dSDimitry Andric     MIB.addImm(Format);
5667bdd1243dSDimitry Andric 
5668bdd1243dSDimitry Andric   MIB.addImm(AuxiliaryData)       // cachepolicy, swizzled buffer(imm)
5669bdd1243dSDimitry Andric       .addImm(HasVIndex ? -1 : 0) // idxen(imm)
5670bdd1243dSDimitry Andric       .addMemOperand(MMO);
5671bdd1243dSDimitry Andric }
5672bdd1243dSDimitry Andric 
56735ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
56745ffd83dbSDimitry Andric                                              MachineRegisterInfo &MRI,
56755ffd83dbSDimitry Andric                                              MachineIRBuilder &B,
56765ffd83dbSDimitry Andric                                              bool IsFormat,
56775ffd83dbSDimitry Andric                                              bool IsTyped) const {
56785ffd83dbSDimitry Andric   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
56795ffd83dbSDimitry Andric   MachineMemOperand *MMO = *MI.memoperands_begin();
5680fe6060f1SDimitry Andric   const LLT MemTy = MMO->getMemoryType();
56815ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
56825ffd83dbSDimitry Andric 
56835ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
5684bdd1243dSDimitry Andric 
5685bdd1243dSDimitry Andric   Register StatusDst;
5686bdd1243dSDimitry Andric   int OpOffset = 0;
5687bdd1243dSDimitry Andric   assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
5688bdd1243dSDimitry Andric   bool IsTFE = MI.getNumExplicitDefs() == 2;
5689bdd1243dSDimitry Andric   if (IsTFE) {
5690bdd1243dSDimitry Andric     StatusDst = MI.getOperand(1).getReg();
5691bdd1243dSDimitry Andric     ++OpOffset;
5692bdd1243dSDimitry Andric   }
5693bdd1243dSDimitry Andric 
569406c3fb27SDimitry Andric   castBufferRsrcArgToV4I32(MI, B, 2 + OpOffset);
5695bdd1243dSDimitry Andric   Register RSrc = MI.getOperand(2 + OpOffset).getReg();
56965ffd83dbSDimitry Andric 
56975ffd83dbSDimitry Andric   // The typed intrinsics add an immediate after the registers.
56985ffd83dbSDimitry Andric   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
56995ffd83dbSDimitry Andric 
57005ffd83dbSDimitry Andric   // The struct intrinsic variants add one additional operand over raw.
5701bdd1243dSDimitry Andric   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
57025ffd83dbSDimitry Andric   Register VIndex;
57035ffd83dbSDimitry Andric   if (HasVIndex) {
5704bdd1243dSDimitry Andric     VIndex = MI.getOperand(3 + OpOffset).getReg();
5705bdd1243dSDimitry Andric     ++OpOffset;
5706fe6060f1SDimitry Andric   } else {
5707fe6060f1SDimitry Andric     VIndex = B.buildConstant(S32, 0).getReg(0);
57088bcb0991SDimitry Andric   }
57098bcb0991SDimitry Andric 
57105ffd83dbSDimitry Andric   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
57115ffd83dbSDimitry Andric   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
57125ffd83dbSDimitry Andric 
57135ffd83dbSDimitry Andric   unsigned Format = 0;
57145ffd83dbSDimitry Andric   if (IsTyped) {
57155ffd83dbSDimitry Andric     Format = MI.getOperand(5 + OpOffset).getImm();
57165ffd83dbSDimitry Andric     ++OpOffset;
57178bcb0991SDimitry Andric   }
57188bcb0991SDimitry Andric 
57195ffd83dbSDimitry Andric   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
57205ffd83dbSDimitry Andric   unsigned ImmOffset;
57215ffd83dbSDimitry Andric 
57225ffd83dbSDimitry Andric   LLT Ty = MRI.getType(Dst);
572306c3fb27SDimitry Andric   // Make addrspace 8 pointers loads into 4xs32 loads here, so the rest of the
572406c3fb27SDimitry Andric   // logic doesn't have to handle that case.
572506c3fb27SDimitry Andric   if (hasBufferRsrcWorkaround(Ty)) {
572606c3fb27SDimitry Andric     Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0);
572706c3fb27SDimitry Andric     Dst = MI.getOperand(0).getReg();
572806c3fb27SDimitry Andric   }
57295ffd83dbSDimitry Andric   LLT EltTy = Ty.getScalarType();
57305ffd83dbSDimitry Andric   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
57315ffd83dbSDimitry Andric   const bool Unpacked = ST.hasUnpackedD16VMem();
57325ffd83dbSDimitry Andric 
5733fe6060f1SDimitry Andric   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
57345ffd83dbSDimitry Andric 
57355ffd83dbSDimitry Andric   unsigned Opc;
57365ffd83dbSDimitry Andric 
5737bdd1243dSDimitry Andric   // TODO: Support TFE for typed and narrow loads.
57385ffd83dbSDimitry Andric   if (IsTyped) {
5739bdd1243dSDimitry Andric     if (IsTFE)
5740bdd1243dSDimitry Andric       return false;
57415ffd83dbSDimitry Andric     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
57425ffd83dbSDimitry Andric                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
57435ffd83dbSDimitry Andric   } else if (IsFormat) {
5744bdd1243dSDimitry Andric     if (IsD16) {
5745bdd1243dSDimitry Andric       if (IsTFE)
5746bdd1243dSDimitry Andric         return false;
5747bdd1243dSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
57485ffd83dbSDimitry Andric     } else {
5749bdd1243dSDimitry Andric       Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
5750bdd1243dSDimitry Andric                   : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
5751bdd1243dSDimitry Andric     }
5752bdd1243dSDimitry Andric   } else {
5753bdd1243dSDimitry Andric     if (IsTFE)
5754bdd1243dSDimitry Andric       return false;
5755fe6060f1SDimitry Andric     switch (MemTy.getSizeInBits()) {
5756fe6060f1SDimitry Andric     case 8:
57575ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
57585ffd83dbSDimitry Andric       break;
5759fe6060f1SDimitry Andric     case 16:
57605ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
57615ffd83dbSDimitry Andric       break;
57625ffd83dbSDimitry Andric     default:
57635ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
57645ffd83dbSDimitry Andric       break;
57655ffd83dbSDimitry Andric     }
57665ffd83dbSDimitry Andric   }
57675ffd83dbSDimitry Andric 
5768bdd1243dSDimitry Andric   if (IsTFE) {
5769bdd1243dSDimitry Andric     unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
5770bdd1243dSDimitry Andric     unsigned NumLoadDWords = NumValueDWords + 1;
5771bdd1243dSDimitry Andric     LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32);
5772bdd1243dSDimitry Andric     Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
5773bdd1243dSDimitry Andric     buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
5774bdd1243dSDimitry Andric                     Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
5775bdd1243dSDimitry Andric     if (NumValueDWords == 1) {
5776bdd1243dSDimitry Andric       B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
5777bdd1243dSDimitry Andric     } else {
5778bdd1243dSDimitry Andric       SmallVector<Register, 5> LoadElts;
5779bdd1243dSDimitry Andric       for (unsigned I = 0; I != NumValueDWords; ++I)
5780bdd1243dSDimitry Andric         LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
5781bdd1243dSDimitry Andric       LoadElts.push_back(StatusDst);
5782bdd1243dSDimitry Andric       B.buildUnmerge(LoadElts, LoadDstReg);
5783bdd1243dSDimitry Andric       LoadElts.truncate(NumValueDWords);
5784bdd1243dSDimitry Andric       B.buildMergeLikeInstr(Dst, LoadElts);
5785bdd1243dSDimitry Andric     }
5786bdd1243dSDimitry Andric   } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
5787bdd1243dSDimitry Andric              (IsD16 && !Ty.isVector())) {
5788bdd1243dSDimitry Andric     Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
5789bdd1243dSDimitry Andric     buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
5790bdd1243dSDimitry Andric                     Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
57915ffd83dbSDimitry Andric     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
57925ffd83dbSDimitry Andric     B.buildTrunc(Dst, LoadDstReg);
5793bdd1243dSDimitry Andric   } else if (Unpacked && IsD16 && Ty.isVector()) {
5794bdd1243dSDimitry Andric     LLT UnpackedTy = Ty.changeElementSize(32);
5795bdd1243dSDimitry Andric     Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
5796bdd1243dSDimitry Andric     buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
5797bdd1243dSDimitry Andric                     Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
5798bdd1243dSDimitry Andric     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
57995ffd83dbSDimitry Andric     // FIXME: G_TRUNC should work, but legalization currently fails
58005ffd83dbSDimitry Andric     auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
58015ffd83dbSDimitry Andric     SmallVector<Register, 4> Repack;
58025ffd83dbSDimitry Andric     for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
58035ffd83dbSDimitry Andric       Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
5804bdd1243dSDimitry Andric     B.buildMergeLikeInstr(Dst, Repack);
5805bdd1243dSDimitry Andric   } else {
5806bdd1243dSDimitry Andric     buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
5807bdd1243dSDimitry Andric                     AuxiliaryData, MMO, IsTyped, HasVIndex, B);
58085ffd83dbSDimitry Andric   }
58095ffd83dbSDimitry Andric 
58105ffd83dbSDimitry Andric   MI.eraseFromParent();
58115ffd83dbSDimitry Andric   return true;
58125ffd83dbSDimitry Andric }
58135ffd83dbSDimitry Andric 
58145ffd83dbSDimitry Andric static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
58155ffd83dbSDimitry Andric   switch (IntrID) {
58165ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
581706c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
58185ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
581906c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
58205ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
58215ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_add:
582206c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
58235ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_add:
582406c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
58255ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
58265ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
582706c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
58285ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
582906c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
58305ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
58315ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
583206c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
58335ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
583406c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
58355ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
58365ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
583706c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
58385ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
583906c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
58405ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
58415ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
584206c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
58435ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
584406c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
58455ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
58465ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
584706c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
58485ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
584906c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
58505ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
58515ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_and:
585206c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
58535ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_and:
585406c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
58555ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
58565ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_or:
585706c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
58585ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_or:
585906c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
58605ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
58615ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
586206c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
58635ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
586406c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
58655ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
58665ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
586706c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
58685ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
586906c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
58705ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
58715ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
587206c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
58735ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
587406c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
58755ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
58765ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
587706c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
58785ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
587906c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
58805ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
5881e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
588206c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
5883e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
588406c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
5885e8d8bef9SDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
5886fe6060f1SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
588706c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
5888fe6060f1SDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
588906c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
5890fe6060f1SDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
5891fe6060f1SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
589206c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
5893fe6060f1SDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
589406c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
5895fe6060f1SDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
58965ffd83dbSDimitry Andric   default:
58975ffd83dbSDimitry Andric     llvm_unreachable("unhandled atomic opcode");
58985ffd83dbSDimitry Andric   }
58995ffd83dbSDimitry Andric }
59005ffd83dbSDimitry Andric 
59015ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
59025ffd83dbSDimitry Andric                                                MachineIRBuilder &B,
59035ffd83dbSDimitry Andric                                                Intrinsic::ID IID) const {
590406c3fb27SDimitry Andric   const bool IsCmpSwap =
590506c3fb27SDimitry Andric       IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
590606c3fb27SDimitry Andric       IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
590706c3fb27SDimitry Andric       IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
590806c3fb27SDimitry Andric       IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
59095ffd83dbSDimitry Andric 
59105f757f3fSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
591106c3fb27SDimitry Andric   // Since we don't have 128-bit atomics, we don't need to handle the case of
591206c3fb27SDimitry Andric   // p8 argmunents to the atomic itself
59135f757f3fSDimitry Andric   Register VData = MI.getOperand(2).getReg();
59145f757f3fSDimitry Andric 
5915e8d8bef9SDimitry Andric   Register CmpVal;
59165f757f3fSDimitry Andric   int OpOffset = 0;
59175ffd83dbSDimitry Andric 
59185ffd83dbSDimitry Andric   if (IsCmpSwap) {
59195f757f3fSDimitry Andric     CmpVal = MI.getOperand(3).getReg();
59205ffd83dbSDimitry Andric     ++OpOffset;
59215ffd83dbSDimitry Andric   }
59225ffd83dbSDimitry Andric 
592306c3fb27SDimitry Andric   castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset);
59245ffd83dbSDimitry Andric   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
59255f757f3fSDimitry Andric   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
59265ffd83dbSDimitry Andric 
59275ffd83dbSDimitry Andric   // The struct intrinsic variants add one additional operand over raw.
59285ffd83dbSDimitry Andric   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
59295ffd83dbSDimitry Andric   Register VIndex;
59305ffd83dbSDimitry Andric   if (HasVIndex) {
59315ffd83dbSDimitry Andric     VIndex = MI.getOperand(4 + OpOffset).getReg();
59325ffd83dbSDimitry Andric     ++OpOffset;
5933fe6060f1SDimitry Andric   } else {
5934fe6060f1SDimitry Andric     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
59355ffd83dbSDimitry Andric   }
59365ffd83dbSDimitry Andric 
59375ffd83dbSDimitry Andric   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
59385ffd83dbSDimitry Andric   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
59395ffd83dbSDimitry Andric   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
59405ffd83dbSDimitry Andric 
59415ffd83dbSDimitry Andric   MachineMemOperand *MMO = *MI.memoperands_begin();
59425ffd83dbSDimitry Andric 
59435ffd83dbSDimitry Andric   unsigned ImmOffset;
5944fe6060f1SDimitry Andric   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
59455ffd83dbSDimitry Andric 
59465f757f3fSDimitry Andric   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
59475f757f3fSDimitry Andric       .addDef(Dst)
59485f757f3fSDimitry Andric       .addUse(VData); // vdata
59495ffd83dbSDimitry Andric 
59505ffd83dbSDimitry Andric   if (IsCmpSwap)
59515ffd83dbSDimitry Andric     MIB.addReg(CmpVal);
59525ffd83dbSDimitry Andric 
59535ffd83dbSDimitry Andric   MIB.addUse(RSrc)               // rsrc
59545ffd83dbSDimitry Andric      .addUse(VIndex)             // vindex
59555ffd83dbSDimitry Andric      .addUse(VOffset)            // voffset
59565ffd83dbSDimitry Andric      .addUse(SOffset)            // soffset
59575ffd83dbSDimitry Andric      .addImm(ImmOffset)          // offset(imm)
59585ffd83dbSDimitry Andric      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
59595ffd83dbSDimitry Andric      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
59605ffd83dbSDimitry Andric      .addMemOperand(MMO);
59615ffd83dbSDimitry Andric 
59625ffd83dbSDimitry Andric   MI.eraseFromParent();
59635ffd83dbSDimitry Andric   return true;
59645ffd83dbSDimitry Andric }
59655ffd83dbSDimitry Andric 
5966fe6060f1SDimitry Andric /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
59675ffd83dbSDimitry Andric /// vector with s16 typed elements.
5968fe6060f1SDimitry Andric static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
5969fe6060f1SDimitry Andric                                       SmallVectorImpl<Register> &PackedAddrs,
5970fe6060f1SDimitry Andric                                       unsigned ArgOffset,
5971fe6060f1SDimitry Andric                                       const AMDGPU::ImageDimIntrinsicInfo *Intr,
5972fe6060f1SDimitry Andric                                       bool IsA16, bool IsG16) {
59735ffd83dbSDimitry Andric   const LLT S16 = LLT::scalar(16);
5974fe6060f1SDimitry Andric   const LLT V2S16 = LLT::fixed_vector(2, 16);
5975fe6060f1SDimitry Andric   auto EndIdx = Intr->VAddrEnd;
59765ffd83dbSDimitry Andric 
5977e8d8bef9SDimitry Andric   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
5978e8d8bef9SDimitry Andric     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
59795ffd83dbSDimitry Andric     if (!SrcOp.isReg())
59805ffd83dbSDimitry Andric       continue; // _L to _LZ may have eliminated this.
59815ffd83dbSDimitry Andric 
59825ffd83dbSDimitry Andric     Register AddrReg = SrcOp.getReg();
59835ffd83dbSDimitry Andric 
5984fe6060f1SDimitry Andric     if ((I < Intr->GradientStart) ||
5985fe6060f1SDimitry Andric         (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
5986fe6060f1SDimitry Andric         (I >= Intr->CoordStart && !IsA16)) {
59870eae32dcSDimitry Andric       if ((I < Intr->GradientStart) && IsA16 &&
59880eae32dcSDimitry Andric           (B.getMRI()->getType(AddrReg) == S16)) {
598904eeddc0SDimitry Andric         assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
59900eae32dcSDimitry Andric         // Special handling of bias when A16 is on. Bias is of type half but
59910eae32dcSDimitry Andric         // occupies full 32-bit.
59920eae32dcSDimitry Andric         PackedAddrs.push_back(
59930eae32dcSDimitry Andric             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
59940eae32dcSDimitry Andric                 .getReg(0));
59950eae32dcSDimitry Andric       } else {
599604eeddc0SDimitry Andric         assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
599704eeddc0SDimitry Andric                "Bias needs to be converted to 16 bit in A16 mode");
599804eeddc0SDimitry Andric         // Handle any gradient or coordinate operands that should not be packed
59995ffd83dbSDimitry Andric         AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
60005ffd83dbSDimitry Andric         PackedAddrs.push_back(AddrReg);
60010eae32dcSDimitry Andric       }
60025ffd83dbSDimitry Andric     } else {
60035ffd83dbSDimitry Andric       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
60045ffd83dbSDimitry Andric       // derivatives dx/dh and dx/dv are packed with undef.
60055ffd83dbSDimitry Andric       if (((I + 1) >= EndIdx) ||
6006e8d8bef9SDimitry Andric           ((Intr->NumGradients / 2) % 2 == 1 &&
6007e8d8bef9SDimitry Andric            (I == static_cast<unsigned>(Intr->GradientStart +
6008e8d8bef9SDimitry Andric                                        (Intr->NumGradients / 2) - 1) ||
6009e8d8bef9SDimitry Andric             I == static_cast<unsigned>(Intr->GradientStart +
6010e8d8bef9SDimitry Andric                                        Intr->NumGradients - 1))) ||
60115ffd83dbSDimitry Andric           // Check for _L to _LZ optimization
6012e8d8bef9SDimitry Andric           !MI.getOperand(ArgOffset + I + 1).isReg()) {
60135ffd83dbSDimitry Andric         PackedAddrs.push_back(
60145ffd83dbSDimitry Andric             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
60155ffd83dbSDimitry Andric                 .getReg(0));
60165ffd83dbSDimitry Andric       } else {
60175ffd83dbSDimitry Andric         PackedAddrs.push_back(
6018e8d8bef9SDimitry Andric             B.buildBuildVector(
6019e8d8bef9SDimitry Andric                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
60205ffd83dbSDimitry Andric                 .getReg(0));
60215ffd83dbSDimitry Andric         ++I;
60225ffd83dbSDimitry Andric       }
60235ffd83dbSDimitry Andric     }
60245ffd83dbSDimitry Andric   }
60255ffd83dbSDimitry Andric }
60265ffd83dbSDimitry Andric 
60275ffd83dbSDimitry Andric /// Convert from separate vaddr components to a single vector address register,
60285ffd83dbSDimitry Andric /// and replace the remaining operands with $noreg.
60295ffd83dbSDimitry Andric static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
60305ffd83dbSDimitry Andric                                      int DimIdx, int NumVAddrs) {
60315ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
6032bdd1243dSDimitry Andric   (void)S32;
60335ffd83dbSDimitry Andric   SmallVector<Register, 8> AddrRegs;
60345ffd83dbSDimitry Andric   for (int I = 0; I != NumVAddrs; ++I) {
60355ffd83dbSDimitry Andric     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
60365ffd83dbSDimitry Andric     if (SrcOp.isReg()) {
60375ffd83dbSDimitry Andric       AddrRegs.push_back(SrcOp.getReg());
60385ffd83dbSDimitry Andric       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
60395ffd83dbSDimitry Andric     }
60405ffd83dbSDimitry Andric   }
60415ffd83dbSDimitry Andric 
60425ffd83dbSDimitry Andric   int NumAddrRegs = AddrRegs.size();
60435ffd83dbSDimitry Andric   if (NumAddrRegs != 1) {
6044fe6060f1SDimitry Andric     auto VAddr =
6045fe6060f1SDimitry Andric         B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
60465ffd83dbSDimitry Andric     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
60475ffd83dbSDimitry Andric   }
60485ffd83dbSDimitry Andric 
60495ffd83dbSDimitry Andric   for (int I = 1; I != NumVAddrs; ++I) {
60505ffd83dbSDimitry Andric     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
60515ffd83dbSDimitry Andric     if (SrcOp.isReg())
60525ffd83dbSDimitry Andric       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
60535ffd83dbSDimitry Andric   }
60545ffd83dbSDimitry Andric }
60555ffd83dbSDimitry Andric 
60565ffd83dbSDimitry Andric /// Rewrite image intrinsics to use register layouts expected by the subtarget.
60575ffd83dbSDimitry Andric ///
60585ffd83dbSDimitry Andric /// Depending on the subtarget, load/store with 16-bit element data need to be
60595ffd83dbSDimitry Andric /// rewritten to use the low half of 32-bit registers, or directly use a packed
60605ffd83dbSDimitry Andric /// layout. 16-bit addresses should also sometimes be packed into 32-bit
60615ffd83dbSDimitry Andric /// registers.
60625ffd83dbSDimitry Andric ///
60635ffd83dbSDimitry Andric /// We don't want to directly select image instructions just yet, but also want
60645ffd83dbSDimitry Andric /// to exposes all register repacking to the legalizer/combiners. We also don't
606581ad6265SDimitry Andric /// want a selected instruction entering RegBankSelect. In order to avoid
60665ffd83dbSDimitry Andric /// defining a multitude of intermediate image instructions, directly hack on
6067349cc55cSDimitry Andric /// the intrinsic's arguments. In cases like a16 addresses, this requires
6068349cc55cSDimitry Andric /// padding now unnecessary arguments with $noreg.
60695ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
6070e8d8bef9SDimitry Andric     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
6071e8d8bef9SDimitry Andric     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
60725ffd83dbSDimitry Andric 
6073bdd1243dSDimitry Andric   const MachineFunction &MF = *MI.getMF();
6074e8d8bef9SDimitry Andric   const unsigned NumDefs = MI.getNumExplicitDefs();
6075e8d8bef9SDimitry Andric   const unsigned ArgOffset = NumDefs + 1;
60765ffd83dbSDimitry Andric   bool IsTFE = NumDefs == 2;
60775ffd83dbSDimitry Andric   // We are only processing the operands of d16 image operations on subtargets
60785ffd83dbSDimitry Andric   // that use the unpacked register layout, or need to repack the TFE result.
60795ffd83dbSDimitry Andric 
60805ffd83dbSDimitry Andric   // TODO: Do we need to guard against already legalized intrinsics?
60815ffd83dbSDimitry Andric   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
6082e8d8bef9SDimitry Andric       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
60835ffd83dbSDimitry Andric 
60845ffd83dbSDimitry Andric   MachineRegisterInfo *MRI = B.getMRI();
60855ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
60865ffd83dbSDimitry Andric   const LLT S16 = LLT::scalar(16);
6087fe6060f1SDimitry Andric   const LLT V2S16 = LLT::fixed_vector(2, 16);
60885ffd83dbSDimitry Andric 
60895ffd83dbSDimitry Andric   unsigned DMask = 0;
609004eeddc0SDimitry Andric   Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
609104eeddc0SDimitry Andric   LLT Ty = MRI->getType(VData);
60925ffd83dbSDimitry Andric 
60935ffd83dbSDimitry Andric   // Check for 16 bit addresses and pack if true.
6094e8d8bef9SDimitry Andric   LLT GradTy =
6095e8d8bef9SDimitry Andric       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
6096e8d8bef9SDimitry Andric   LLT AddrTy =
6097e8d8bef9SDimitry Andric       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
609806c3fb27SDimitry Andric   const bool IsG16 =
609906c3fb27SDimitry Andric       ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
61005ffd83dbSDimitry Andric   const bool IsA16 = AddrTy == S16;
610104eeddc0SDimitry Andric   const bool IsD16 = Ty.getScalarType() == S16;
61025ffd83dbSDimitry Andric 
61035ffd83dbSDimitry Andric   int DMaskLanes = 0;
61045ffd83dbSDimitry Andric   if (!BaseOpcode->Atomic) {
6105e8d8bef9SDimitry Andric     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
61065ffd83dbSDimitry Andric     if (BaseOpcode->Gather4) {
61075ffd83dbSDimitry Andric       DMaskLanes = 4;
61085ffd83dbSDimitry Andric     } else if (DMask != 0) {
6109bdd1243dSDimitry Andric       DMaskLanes = llvm::popcount(DMask);
61105ffd83dbSDimitry Andric     } else if (!IsTFE && !BaseOpcode->Store) {
61115ffd83dbSDimitry Andric       // If dmask is 0, this is a no-op load. This can be eliminated.
61125ffd83dbSDimitry Andric       B.buildUndef(MI.getOperand(0));
61135ffd83dbSDimitry Andric       MI.eraseFromParent();
61145ffd83dbSDimitry Andric       return true;
61155ffd83dbSDimitry Andric     }
61165ffd83dbSDimitry Andric   }
61175ffd83dbSDimitry Andric 
61185ffd83dbSDimitry Andric   Observer.changingInstr(MI);
61195ffd83dbSDimitry Andric   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
61205ffd83dbSDimitry Andric 
612104eeddc0SDimitry Andric   const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
612204eeddc0SDimitry Andric                                      : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
612304eeddc0SDimitry Andric   const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
612404eeddc0SDimitry Andric                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
612504eeddc0SDimitry Andric   unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;
61265ffd83dbSDimitry Andric 
61275ffd83dbSDimitry Andric   // Track that we legalized this
61285ffd83dbSDimitry Andric   MI.setDesc(B.getTII().get(NewOpcode));
61295ffd83dbSDimitry Andric 
61305ffd83dbSDimitry Andric   // Expecting to get an error flag since TFC is on - and dmask is 0 Force
61315ffd83dbSDimitry Andric   // dmask to be at least 1 otherwise the instruction will fail
61325ffd83dbSDimitry Andric   if (IsTFE && DMask == 0) {
61335ffd83dbSDimitry Andric     DMask = 0x1;
61345ffd83dbSDimitry Andric     DMaskLanes = 1;
6135e8d8bef9SDimitry Andric     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
61365ffd83dbSDimitry Andric   }
61375ffd83dbSDimitry Andric 
61385ffd83dbSDimitry Andric   if (BaseOpcode->Atomic) {
61395ffd83dbSDimitry Andric     Register VData0 = MI.getOperand(2).getReg();
61405ffd83dbSDimitry Andric     LLT Ty = MRI->getType(VData0);
61415ffd83dbSDimitry Andric 
61425ffd83dbSDimitry Andric     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
61435ffd83dbSDimitry Andric     if (Ty.isVector())
61445ffd83dbSDimitry Andric       return false;
61455ffd83dbSDimitry Andric 
61465ffd83dbSDimitry Andric     if (BaseOpcode->AtomicX2) {
61475ffd83dbSDimitry Andric       Register VData1 = MI.getOperand(3).getReg();
61485ffd83dbSDimitry Andric       // The two values are packed in one register.
6149fe6060f1SDimitry Andric       LLT PackedTy = LLT::fixed_vector(2, Ty);
61505ffd83dbSDimitry Andric       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
61515ffd83dbSDimitry Andric       MI.getOperand(2).setReg(Concat.getReg(0));
61525ffd83dbSDimitry Andric       MI.getOperand(3).setReg(AMDGPU::NoRegister);
61535ffd83dbSDimitry Andric     }
61545ffd83dbSDimitry Andric   }
61555ffd83dbSDimitry Andric 
6156e8d8bef9SDimitry Andric   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
61575ffd83dbSDimitry Andric 
61585ffd83dbSDimitry Andric   // Rewrite the addressing register layout before doing anything else.
6159fe6060f1SDimitry Andric   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
6160fe6060f1SDimitry Andric     // 16 bit gradients are supported, but are tied to the A16 control
6161fe6060f1SDimitry Andric     // so both gradients and addresses must be 16 bit
61625ffd83dbSDimitry Andric     return false;
6163fe6060f1SDimitry Andric   }
61645ffd83dbSDimitry Andric 
6165fe6060f1SDimitry Andric   if (IsA16 && !ST.hasA16()) {
6166fe6060f1SDimitry Andric     // A16 not supported
6167fe6060f1SDimitry Andric     return false;
6168fe6060f1SDimitry Andric   }
6169fe6060f1SDimitry Andric 
61705f757f3fSDimitry Andric   const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->Sampler);
617106c3fb27SDimitry Andric   const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
617206c3fb27SDimitry Andric 
6173fe6060f1SDimitry Andric   if (IsA16 || IsG16) {
61745f757f3fSDimitry Andric     // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the
61755f757f3fSDimitry Andric     // instructions expect VGPR_32
61765ffd83dbSDimitry Andric     SmallVector<Register, 4> PackedRegs;
61775ffd83dbSDimitry Andric 
61785f757f3fSDimitry Andric     packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, IsG16);
61795ffd83dbSDimitry Andric 
61805ffd83dbSDimitry Andric     // See also below in the non-a16 branch
6181bdd1243dSDimitry Andric     const bool UseNSA = ST.hasNSAEncoding() &&
6182bdd1243dSDimitry Andric                         PackedRegs.size() >= ST.getNSAThreshold(MF) &&
618306c3fb27SDimitry Andric                         (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
618406c3fb27SDimitry Andric     const bool UsePartialNSA =
618506c3fb27SDimitry Andric         UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;
61865ffd83dbSDimitry Andric 
618706c3fb27SDimitry Andric     if (UsePartialNSA) {
618806c3fb27SDimitry Andric       // Pack registers that would go over NSAMaxSize into last VAddr register
618906c3fb27SDimitry Andric       LLT PackedAddrTy =
619006c3fb27SDimitry Andric           LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16);
619106c3fb27SDimitry Andric       auto Concat = B.buildConcatVectors(
619206c3fb27SDimitry Andric           PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
619306c3fb27SDimitry Andric       PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
619406c3fb27SDimitry Andric       PackedRegs.resize(NSAMaxSize);
619506c3fb27SDimitry Andric     } else if (!UseNSA && PackedRegs.size() > 1) {
6196fe6060f1SDimitry Andric       LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
61975ffd83dbSDimitry Andric       auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
61985ffd83dbSDimitry Andric       PackedRegs[0] = Concat.getReg(0);
61995ffd83dbSDimitry Andric       PackedRegs.resize(1);
62005ffd83dbSDimitry Andric     }
62015ffd83dbSDimitry Andric 
6202e8d8bef9SDimitry Andric     const unsigned NumPacked = PackedRegs.size();
6203e8d8bef9SDimitry Andric     for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
6204e8d8bef9SDimitry Andric       MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
62055ffd83dbSDimitry Andric       if (!SrcOp.isReg()) {
62065ffd83dbSDimitry Andric         assert(SrcOp.isImm() && SrcOp.getImm() == 0);
62075ffd83dbSDimitry Andric         continue;
62085ffd83dbSDimitry Andric       }
62095ffd83dbSDimitry Andric 
62105ffd83dbSDimitry Andric       assert(SrcOp.getReg() != AMDGPU::NoRegister);
62115ffd83dbSDimitry Andric 
6212e8d8bef9SDimitry Andric       if (I - Intr->VAddrStart < NumPacked)
6213e8d8bef9SDimitry Andric         SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
62145ffd83dbSDimitry Andric       else
62155ffd83dbSDimitry Andric         SrcOp.setReg(AMDGPU::NoRegister);
62165ffd83dbSDimitry Andric     }
62175ffd83dbSDimitry Andric   } else {
62185ffd83dbSDimitry Andric     // If the register allocator cannot place the address registers contiguously
62195ffd83dbSDimitry Andric     // without introducing moves, then using the non-sequential address encoding
62205ffd83dbSDimitry Andric     // is always preferable, since it saves VALU instructions and is usually a
62215ffd83dbSDimitry Andric     // wash in terms of code size or even better.
62225ffd83dbSDimitry Andric     //
62235ffd83dbSDimitry Andric     // However, we currently have no way of hinting to the register allocator
62245ffd83dbSDimitry Andric     // that MIMG addresses should be placed contiguously when it is possible to
62255ffd83dbSDimitry Andric     // do so, so force non-NSA for the common 2-address case as a heuristic.
62265ffd83dbSDimitry Andric     //
62275ffd83dbSDimitry Andric     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
62285ffd83dbSDimitry Andric     // allocation when possible.
622981ad6265SDimitry Andric     //
62305f757f3fSDimitry Andric     // Partial NSA is allowed on GFX11+ where the final register is a contiguous
623106c3fb27SDimitry Andric     // set of the remaining addresses.
6232bdd1243dSDimitry Andric     const bool UseNSA = ST.hasNSAEncoding() &&
6233bdd1243dSDimitry Andric                         CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
623406c3fb27SDimitry Andric                         (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
623506c3fb27SDimitry Andric     const bool UsePartialNSA =
623606c3fb27SDimitry Andric         UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
62375ffd83dbSDimitry Andric 
623806c3fb27SDimitry Andric     if (UsePartialNSA) {
623906c3fb27SDimitry Andric       convertImageAddrToPacked(B, MI,
624006c3fb27SDimitry Andric                                ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
624106c3fb27SDimitry Andric                                Intr->NumVAddrs - NSAMaxSize + 1);
624206c3fb27SDimitry Andric     } else if (!UseNSA && Intr->NumVAddrs > 1) {
6243e8d8bef9SDimitry Andric       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
6244e8d8bef9SDimitry Andric                                Intr->NumVAddrs);
62455ffd83dbSDimitry Andric     }
624606c3fb27SDimitry Andric   }
62475ffd83dbSDimitry Andric 
62485ffd83dbSDimitry Andric   int Flags = 0;
62495ffd83dbSDimitry Andric   if (IsA16)
62505ffd83dbSDimitry Andric     Flags |= 1;
62515ffd83dbSDimitry Andric   if (IsG16)
62525ffd83dbSDimitry Andric     Flags |= 2;
62535ffd83dbSDimitry Andric   MI.addOperand(MachineOperand::CreateImm(Flags));
62545ffd83dbSDimitry Andric 
62555ffd83dbSDimitry Andric   if (BaseOpcode->Store) { // No TFE for stores?
62565ffd83dbSDimitry Andric     // TODO: Handle dmask trim
625704eeddc0SDimitry Andric     if (!Ty.isVector() || !IsD16)
62585ffd83dbSDimitry Andric       return true;
62595ffd83dbSDimitry Andric 
6260e8d8bef9SDimitry Andric     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
62615ffd83dbSDimitry Andric     if (RepackedReg != VData) {
62625ffd83dbSDimitry Andric       MI.getOperand(1).setReg(RepackedReg);
62635ffd83dbSDimitry Andric     }
62645ffd83dbSDimitry Andric 
62655ffd83dbSDimitry Andric     return true;
62665ffd83dbSDimitry Andric   }
62675ffd83dbSDimitry Andric 
62685ffd83dbSDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
62695ffd83dbSDimitry Andric   const LLT EltTy = Ty.getScalarType();
62705ffd83dbSDimitry Andric   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
62715ffd83dbSDimitry Andric 
62725ffd83dbSDimitry Andric   // Confirm that the return type is large enough for the dmask specified
62735ffd83dbSDimitry Andric   if (NumElts < DMaskLanes)
62745ffd83dbSDimitry Andric     return false;
62755ffd83dbSDimitry Andric 
62765ffd83dbSDimitry Andric   if (NumElts > 4 || DMaskLanes > 4)
62775ffd83dbSDimitry Andric     return false;
62785ffd83dbSDimitry Andric 
62795ffd83dbSDimitry Andric   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
6280fe6060f1SDimitry Andric   const LLT AdjustedTy =
6281fe6060f1SDimitry Andric       Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
62825ffd83dbSDimitry Andric 
62835ffd83dbSDimitry Andric   // The raw dword aligned data component of the load. The only legal cases
62845ffd83dbSDimitry Andric   // where this matters should be when using the packed D16 format, for
62855ffd83dbSDimitry Andric   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
62865ffd83dbSDimitry Andric   LLT RoundedTy;
62875ffd83dbSDimitry Andric 
6288bdd1243dSDimitry Andric   // S32 vector to cover all data, plus TFE result element.
62895ffd83dbSDimitry Andric   LLT TFETy;
62905ffd83dbSDimitry Andric 
62915ffd83dbSDimitry Andric   // Register type to use for each loaded component. Will be S32 or V2S16.
62925ffd83dbSDimitry Andric   LLT RegTy;
62935ffd83dbSDimitry Andric 
62945ffd83dbSDimitry Andric   if (IsD16 && ST.hasUnpackedD16VMem()) {
6295fe6060f1SDimitry Andric     RoundedTy =
6296fe6060f1SDimitry Andric         LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
6297fe6060f1SDimitry Andric     TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
62985ffd83dbSDimitry Andric     RegTy = S32;
62995ffd83dbSDimitry Andric   } else {
63005ffd83dbSDimitry Andric     unsigned EltSize = EltTy.getSizeInBits();
63015ffd83dbSDimitry Andric     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
63025ffd83dbSDimitry Andric     unsigned RoundedSize = 32 * RoundedElts;
6303fe6060f1SDimitry Andric     RoundedTy = LLT::scalarOrVector(
6304fe6060f1SDimitry Andric         ElementCount::getFixed(RoundedSize / EltSize), EltSize);
6305fe6060f1SDimitry Andric     TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
63065ffd83dbSDimitry Andric     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
63075ffd83dbSDimitry Andric   }
63085ffd83dbSDimitry Andric 
63095ffd83dbSDimitry Andric   // The return type does not need adjustment.
63105ffd83dbSDimitry Andric   // TODO: Should we change s16 case to s32 or <2 x s16>?
63115ffd83dbSDimitry Andric   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
63125ffd83dbSDimitry Andric     return true;
63135ffd83dbSDimitry Andric 
63145ffd83dbSDimitry Andric   Register Dst1Reg;
63155ffd83dbSDimitry Andric 
63165ffd83dbSDimitry Andric   // Insert after the instruction.
63175ffd83dbSDimitry Andric   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
63185ffd83dbSDimitry Andric 
63195ffd83dbSDimitry Andric   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
63205ffd83dbSDimitry Andric   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
63215ffd83dbSDimitry Andric   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
63225ffd83dbSDimitry Andric   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
63235ffd83dbSDimitry Andric 
63245ffd83dbSDimitry Andric   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
63255ffd83dbSDimitry Andric 
63265ffd83dbSDimitry Andric   MI.getOperand(0).setReg(NewResultReg);
63275ffd83dbSDimitry Andric 
63285ffd83dbSDimitry Andric   // In the IR, TFE is supposed to be used with a 2 element struct return
6329349cc55cSDimitry Andric   // type. The instruction really returns these two values in one contiguous
63305ffd83dbSDimitry Andric   // register, with one additional dword beyond the loaded data. Rewrite the
63315ffd83dbSDimitry Andric   // return type to use a single register result.
63325ffd83dbSDimitry Andric 
63335ffd83dbSDimitry Andric   if (IsTFE) {
63345ffd83dbSDimitry Andric     Dst1Reg = MI.getOperand(1).getReg();
63355ffd83dbSDimitry Andric     if (MRI->getType(Dst1Reg) != S32)
63365ffd83dbSDimitry Andric       return false;
63375ffd83dbSDimitry Andric 
63385ffd83dbSDimitry Andric     // TODO: Make sure the TFE operand bit is set.
633981ad6265SDimitry Andric     MI.removeOperand(1);
63405ffd83dbSDimitry Andric 
63415ffd83dbSDimitry Andric     // Handle the easy case that requires no repack instructions.
63425ffd83dbSDimitry Andric     if (Ty == S32) {
63435ffd83dbSDimitry Andric       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
63445ffd83dbSDimitry Andric       return true;
63455ffd83dbSDimitry Andric     }
63465ffd83dbSDimitry Andric   }
63475ffd83dbSDimitry Andric 
63485ffd83dbSDimitry Andric   // Now figure out how to copy the new result register back into the old
63495ffd83dbSDimitry Andric   // result.
63505ffd83dbSDimitry Andric   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
63515ffd83dbSDimitry Andric 
63525ffd83dbSDimitry Andric   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
63535ffd83dbSDimitry Andric 
63545ffd83dbSDimitry Andric   if (ResultNumRegs == 1) {
63555ffd83dbSDimitry Andric     assert(!IsTFE);
63565ffd83dbSDimitry Andric     ResultRegs[0] = NewResultReg;
63575ffd83dbSDimitry Andric   } else {
63585ffd83dbSDimitry Andric     // We have to repack into a new vector of some kind.
63595ffd83dbSDimitry Andric     for (int I = 0; I != NumDataRegs; ++I)
63605ffd83dbSDimitry Andric       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
63615ffd83dbSDimitry Andric     B.buildUnmerge(ResultRegs, NewResultReg);
63625ffd83dbSDimitry Andric 
63635ffd83dbSDimitry Andric     // Drop the final TFE element to get the data part. The TFE result is
63645ffd83dbSDimitry Andric     // directly written to the right place already.
63655ffd83dbSDimitry Andric     if (IsTFE)
63665ffd83dbSDimitry Andric       ResultRegs.resize(NumDataRegs);
63675ffd83dbSDimitry Andric   }
63685ffd83dbSDimitry Andric 
63695ffd83dbSDimitry Andric   // For an s16 scalar result, we form an s32 result with a truncate regardless
63705ffd83dbSDimitry Andric   // of packed vs. unpacked.
63715ffd83dbSDimitry Andric   if (IsD16 && !Ty.isVector()) {
63725ffd83dbSDimitry Andric     B.buildTrunc(DstReg, ResultRegs[0]);
63735ffd83dbSDimitry Andric     return true;
63745ffd83dbSDimitry Andric   }
63755ffd83dbSDimitry Andric 
63765ffd83dbSDimitry Andric   // Avoid a build/concat_vector of 1 entry.
63775ffd83dbSDimitry Andric   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
63785ffd83dbSDimitry Andric     B.buildBitcast(DstReg, ResultRegs[0]);
63795ffd83dbSDimitry Andric     return true;
63805ffd83dbSDimitry Andric   }
63815ffd83dbSDimitry Andric 
63825ffd83dbSDimitry Andric   assert(Ty.isVector());
63835ffd83dbSDimitry Andric 
63845ffd83dbSDimitry Andric   if (IsD16) {
63855ffd83dbSDimitry Andric     // For packed D16 results with TFE enabled, all the data components are
63865ffd83dbSDimitry Andric     // S32. Cast back to the expected type.
63875ffd83dbSDimitry Andric     //
63885ffd83dbSDimitry Andric     // TODO: We don't really need to use load s32 elements. We would only need one
63895ffd83dbSDimitry Andric     // cast for the TFE result if a multiple of v2s16 was used.
63905ffd83dbSDimitry Andric     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
63915ffd83dbSDimitry Andric       for (Register &Reg : ResultRegs)
63925ffd83dbSDimitry Andric         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
63935ffd83dbSDimitry Andric     } else if (ST.hasUnpackedD16VMem()) {
63945ffd83dbSDimitry Andric       for (Register &Reg : ResultRegs)
63955ffd83dbSDimitry Andric         Reg = B.buildTrunc(S16, Reg).getReg(0);
63965ffd83dbSDimitry Andric     }
63975ffd83dbSDimitry Andric   }
63985ffd83dbSDimitry Andric 
63995ffd83dbSDimitry Andric   auto padWithUndef = [&](LLT Ty, int NumElts) {
64005ffd83dbSDimitry Andric     if (NumElts == 0)
64015ffd83dbSDimitry Andric       return;
64025ffd83dbSDimitry Andric     Register Undef = B.buildUndef(Ty).getReg(0);
64035ffd83dbSDimitry Andric     for (int I = 0; I != NumElts; ++I)
64045ffd83dbSDimitry Andric       ResultRegs.push_back(Undef);
64055ffd83dbSDimitry Andric   };
64065ffd83dbSDimitry Andric 
64075ffd83dbSDimitry Andric   // Pad out any elements eliminated due to the dmask.
64085ffd83dbSDimitry Andric   LLT ResTy = MRI->getType(ResultRegs[0]);
64095ffd83dbSDimitry Andric   if (!ResTy.isVector()) {
64105ffd83dbSDimitry Andric     padWithUndef(ResTy, NumElts - ResultRegs.size());
64115ffd83dbSDimitry Andric     B.buildBuildVector(DstReg, ResultRegs);
64125ffd83dbSDimitry Andric     return true;
64135ffd83dbSDimitry Andric   }
64145ffd83dbSDimitry Andric 
64155ffd83dbSDimitry Andric   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
64165ffd83dbSDimitry Andric   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
64175ffd83dbSDimitry Andric 
64185ffd83dbSDimitry Andric   // Deal with the one annoying legal case.
6419fe6060f1SDimitry Andric   const LLT V3S16 = LLT::fixed_vector(3, 16);
64205ffd83dbSDimitry Andric   if (Ty == V3S16) {
64210eae32dcSDimitry Andric     if (IsTFE) {
64220eae32dcSDimitry Andric       if (ResultRegs.size() == 1) {
64230eae32dcSDimitry Andric         NewResultReg = ResultRegs[0];
64240eae32dcSDimitry Andric       } else if (ResultRegs.size() == 2) {
64250eae32dcSDimitry Andric         LLT V4S16 = LLT::fixed_vector(4, 16);
64260eae32dcSDimitry Andric         NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
64270eae32dcSDimitry Andric       } else {
64280eae32dcSDimitry Andric         return false;
64290eae32dcSDimitry Andric       }
64300eae32dcSDimitry Andric     }
64310eae32dcSDimitry Andric 
64320eae32dcSDimitry Andric     if (MRI->getType(DstReg).getNumElements() <
64330eae32dcSDimitry Andric         MRI->getType(NewResultReg).getNumElements()) {
64340eae32dcSDimitry Andric       B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
64350eae32dcSDimitry Andric     } else {
64360eae32dcSDimitry Andric       B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
64370eae32dcSDimitry Andric     }
64385ffd83dbSDimitry Andric     return true;
64395ffd83dbSDimitry Andric   }
64405ffd83dbSDimitry Andric 
64415ffd83dbSDimitry Andric   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
64425ffd83dbSDimitry Andric   B.buildConcatVectors(DstReg, ResultRegs);
64435ffd83dbSDimitry Andric   return true;
64445ffd83dbSDimitry Andric }
64455ffd83dbSDimitry Andric 
64465ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeSBufferLoad(
6447e8d8bef9SDimitry Andric   LegalizerHelper &Helper, MachineInstr &MI) const {
6448e8d8bef9SDimitry Andric   MachineIRBuilder &B = Helper.MIRBuilder;
6449e8d8bef9SDimitry Andric   GISelChangeObserver &Observer = Helper.Observer;
6450e8d8bef9SDimitry Andric 
64515ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
64525ffd83dbSDimitry Andric   LLT Ty = B.getMRI()->getType(Dst);
64535ffd83dbSDimitry Andric   unsigned Size = Ty.getSizeInBits();
64545ffd83dbSDimitry Andric   MachineFunction &MF = B.getMF();
64555ffd83dbSDimitry Andric 
64565ffd83dbSDimitry Andric   Observer.changingInstr(MI);
64575ffd83dbSDimitry Andric 
645806c3fb27SDimitry Andric   // Handle needing to s.buffer.load() a p8 value.
645906c3fb27SDimitry Andric   if (hasBufferRsrcWorkaround(Ty)) {
646006c3fb27SDimitry Andric     Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0);
646106c3fb27SDimitry Andric     B.setInsertPt(B.getMBB(), MI);
646206c3fb27SDimitry Andric   }
6463fe6060f1SDimitry Andric   if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
6464e8d8bef9SDimitry Andric     Ty = getBitcastRegisterType(Ty);
6465e8d8bef9SDimitry Andric     Helper.bitcastDst(MI, Ty, 0);
6466e8d8bef9SDimitry Andric     B.setInsertPt(B.getMBB(), MI);
6467e8d8bef9SDimitry Andric   }
6468e8d8bef9SDimitry Andric 
64695ffd83dbSDimitry Andric   // FIXME: We don't really need this intermediate instruction. The intrinsic
64705ffd83dbSDimitry Andric   // should be fixed to have a memory operand. Since it's readnone, we're not
64715ffd83dbSDimitry Andric   // allowed to add one.
64725ffd83dbSDimitry Andric   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
647381ad6265SDimitry Andric   MI.removeOperand(1); // Remove intrinsic ID
64745ffd83dbSDimitry Andric 
64755ffd83dbSDimitry Andric   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
64765ffd83dbSDimitry Andric   // TODO: Should this use datalayout alignment?
64775ffd83dbSDimitry Andric   const unsigned MemSize = (Size + 7) / 8;
64785ffd83dbSDimitry Andric   const Align MemAlign(4);
64795ffd83dbSDimitry Andric   MachineMemOperand *MMO = MF.getMachineMemOperand(
64805ffd83dbSDimitry Andric       MachinePointerInfo(),
64815ffd83dbSDimitry Andric       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
64825ffd83dbSDimitry Andric           MachineMemOperand::MOInvariant,
64835ffd83dbSDimitry Andric       MemSize, MemAlign);
64845ffd83dbSDimitry Andric   MI.addMemOperand(MF, MMO);
64855ffd83dbSDimitry Andric 
64865f757f3fSDimitry Andric   // If we don't have 96-bit result scalar loads, widening to 128-bit should
64875ffd83dbSDimitry Andric   // always be legal. We may need to restore this to a 96-bit result if it turns
64885ffd83dbSDimitry Andric   // out this needs to be converted to a vector load during RegBankSelect.
64895f757f3fSDimitry Andric   if (!isPowerOf2_32(Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) {
64905ffd83dbSDimitry Andric     if (Ty.isVector())
64915ffd83dbSDimitry Andric       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
64925ffd83dbSDimitry Andric     else
64935ffd83dbSDimitry Andric       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
64945ffd83dbSDimitry Andric   }
64955ffd83dbSDimitry Andric 
64965ffd83dbSDimitry Andric   Observer.changedInstr(MI);
64975ffd83dbSDimitry Andric   return true;
64985ffd83dbSDimitry Andric }
64995ffd83dbSDimitry Andric 
6500e8d8bef9SDimitry Andric // TODO: Move to selection
65015ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
65020b57cec5SDimitry Andric                                                 MachineRegisterInfo &MRI,
65030b57cec5SDimitry Andric                                                 MachineIRBuilder &B) const {
6504fe6060f1SDimitry Andric   if (!ST.isTrapHandlerEnabled() ||
6505fe6060f1SDimitry Andric       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
6506fe6060f1SDimitry Andric     return legalizeTrapEndpgm(MI, MRI, B);
6507fe6060f1SDimitry Andric 
650806c3fb27SDimitry Andric   return ST.supportsGetDoorbellID() ?
650906c3fb27SDimitry Andric          legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B);
6510fe6060f1SDimitry Andric }
6511fe6060f1SDimitry Andric 
6512fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
6513fe6060f1SDimitry Andric     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
651406c3fb27SDimitry Andric   const DebugLoc &DL = MI.getDebugLoc();
651506c3fb27SDimitry Andric   MachineBasicBlock &BB = B.getMBB();
651606c3fb27SDimitry Andric   MachineFunction *MF = BB.getParent();
651706c3fb27SDimitry Andric 
651806c3fb27SDimitry Andric   if (BB.succ_empty() && std::next(MI.getIterator()) == BB.end()) {
651906c3fb27SDimitry Andric     BuildMI(BB, BB.end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
652006c3fb27SDimitry Andric       .addImm(0);
652106c3fb27SDimitry Andric     MI.eraseFromParent();
652206c3fb27SDimitry Andric     return true;
652306c3fb27SDimitry Andric   }
652406c3fb27SDimitry Andric 
652506c3fb27SDimitry Andric   // We need a block split to make the real endpgm a terminator. We also don't
652606c3fb27SDimitry Andric   // want to break phis in successor blocks, so we can't just delete to the
652706c3fb27SDimitry Andric   // end of the block.
652806c3fb27SDimitry Andric   BB.splitAt(MI, false /*UpdateLiveIns*/);
652906c3fb27SDimitry Andric   MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
653006c3fb27SDimitry Andric   MF->push_back(TrapBB);
653106c3fb27SDimitry Andric   BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
653206c3fb27SDimitry Andric     .addImm(0);
653306c3fb27SDimitry Andric   BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
653406c3fb27SDimitry Andric     .addMBB(TrapBB);
653506c3fb27SDimitry Andric 
653606c3fb27SDimitry Andric   BB.addSuccessor(TrapBB);
6537fe6060f1SDimitry Andric   MI.eraseFromParent();
6538fe6060f1SDimitry Andric   return true;
6539fe6060f1SDimitry Andric }
6540fe6060f1SDimitry Andric 
6541fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
6542fe6060f1SDimitry Andric     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
654381ad6265SDimitry Andric   MachineFunction &MF = B.getMF();
654481ad6265SDimitry Andric   const LLT S64 = LLT::scalar(64);
654581ad6265SDimitry Andric 
654681ad6265SDimitry Andric   Register SGPR01(AMDGPU::SGPR0_SGPR1);
654781ad6265SDimitry Andric   // For code object version 5, queue_ptr is passed through implicit kernarg.
654806c3fb27SDimitry Andric   if (AMDGPU::getCodeObjectVersion(*MF.getFunction().getParent()) >=
654906c3fb27SDimitry Andric       AMDGPU::AMDHSA_COV5) {
655081ad6265SDimitry Andric     AMDGPUTargetLowering::ImplicitParameter Param =
655181ad6265SDimitry Andric         AMDGPUTargetLowering::QUEUE_PTR;
655281ad6265SDimitry Andric     uint64_t Offset =
655381ad6265SDimitry Andric         ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
655481ad6265SDimitry Andric 
655581ad6265SDimitry Andric     Register KernargPtrReg = MRI.createGenericVirtualRegister(
655681ad6265SDimitry Andric         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
655781ad6265SDimitry Andric 
655881ad6265SDimitry Andric     if (!loadInputValue(KernargPtrReg, B,
655981ad6265SDimitry Andric                         AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
656081ad6265SDimitry Andric       return false;
656181ad6265SDimitry Andric 
656281ad6265SDimitry Andric     // TODO: can we be smarter about machine pointer info?
656381ad6265SDimitry Andric     MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
656481ad6265SDimitry Andric     MachineMemOperand *MMO = MF.getMachineMemOperand(
656581ad6265SDimitry Andric         PtrInfo,
656681ad6265SDimitry Andric         MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
656781ad6265SDimitry Andric             MachineMemOperand::MOInvariant,
656881ad6265SDimitry Andric         LLT::scalar(64), commonAlignment(Align(64), Offset));
656981ad6265SDimitry Andric 
657081ad6265SDimitry Andric     // Pointer address
657181ad6265SDimitry Andric     Register LoadAddr = MRI.createGenericVirtualRegister(
657281ad6265SDimitry Andric         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
657381ad6265SDimitry Andric     B.buildPtrAdd(LoadAddr, KernargPtrReg,
657481ad6265SDimitry Andric                   B.buildConstant(LLT::scalar(64), Offset).getReg(0));
657581ad6265SDimitry Andric     // Load address
657681ad6265SDimitry Andric     Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
657781ad6265SDimitry Andric     B.buildCopy(SGPR01, Temp);
657881ad6265SDimitry Andric     B.buildInstr(AMDGPU::S_TRAP)
657981ad6265SDimitry Andric         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
658081ad6265SDimitry Andric         .addReg(SGPR01, RegState::Implicit);
658181ad6265SDimitry Andric     MI.eraseFromParent();
658281ad6265SDimitry Andric     return true;
658381ad6265SDimitry Andric   }
658481ad6265SDimitry Andric 
65855ffd83dbSDimitry Andric   // Pass queue pointer to trap handler as input, and insert trap instruction
65865ffd83dbSDimitry Andric   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
6587e8d8bef9SDimitry Andric   Register LiveIn =
6588e8d8bef9SDimitry Andric     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
6589e8d8bef9SDimitry Andric   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
65905ffd83dbSDimitry Andric     return false;
6591e8d8bef9SDimitry Andric 
65925ffd83dbSDimitry Andric   B.buildCopy(SGPR01, LiveIn);
65935ffd83dbSDimitry Andric   B.buildInstr(AMDGPU::S_TRAP)
6594fe6060f1SDimitry Andric       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
65955ffd83dbSDimitry Andric       .addReg(SGPR01, RegState::Implicit);
6596fe6060f1SDimitry Andric 
6597fe6060f1SDimitry Andric   MI.eraseFromParent();
6598fe6060f1SDimitry Andric   return true;
65995ffd83dbSDimitry Andric }
66005ffd83dbSDimitry Andric 
6601fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeTrapHsa(
6602fe6060f1SDimitry Andric     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6603fe6060f1SDimitry Andric   B.buildInstr(AMDGPU::S_TRAP)
6604fe6060f1SDimitry Andric       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
66055ffd83dbSDimitry Andric   MI.eraseFromParent();
66065ffd83dbSDimitry Andric   return true;
66075ffd83dbSDimitry Andric }
66085ffd83dbSDimitry Andric 
66095ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
66105ffd83dbSDimitry Andric     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6611349cc55cSDimitry Andric   // Is non-HSA path or trap-handler disabled? Then, report a warning
66125ffd83dbSDimitry Andric   // accordingly
6613fe6060f1SDimitry Andric   if (!ST.isTrapHandlerEnabled() ||
6614fe6060f1SDimitry Andric       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
66155ffd83dbSDimitry Andric     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
66165ffd83dbSDimitry Andric                                      "debugtrap handler not supported",
66175ffd83dbSDimitry Andric                                      MI.getDebugLoc(), DS_Warning);
66185ffd83dbSDimitry Andric     LLVMContext &Ctx = B.getMF().getFunction().getContext();
66195ffd83dbSDimitry Andric     Ctx.diagnose(NoTrap);
66205ffd83dbSDimitry Andric   } else {
66215ffd83dbSDimitry Andric     // Insert debug-trap instruction
6622fe6060f1SDimitry Andric     B.buildInstr(AMDGPU::S_TRAP)
6623fe6060f1SDimitry Andric         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
66245ffd83dbSDimitry Andric   }
66255ffd83dbSDimitry Andric 
66265ffd83dbSDimitry Andric   MI.eraseFromParent();
66275ffd83dbSDimitry Andric   return true;
66285ffd83dbSDimitry Andric }
66295ffd83dbSDimitry Andric 
6630e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
6631e8d8bef9SDimitry Andric                                                MachineIRBuilder &B) const {
6632e8d8bef9SDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
6633e8d8bef9SDimitry Andric   const LLT S16 = LLT::scalar(16);
6634e8d8bef9SDimitry Andric   const LLT S32 = LLT::scalar(32);
663581ad6265SDimitry Andric   const LLT V2S16 = LLT::fixed_vector(2, 16);
663681ad6265SDimitry Andric   const LLT V3S32 = LLT::fixed_vector(3, 32);
6637e8d8bef9SDimitry Andric 
6638e8d8bef9SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
6639e8d8bef9SDimitry Andric   Register NodePtr = MI.getOperand(2).getReg();
6640e8d8bef9SDimitry Andric   Register RayExtent = MI.getOperand(3).getReg();
6641e8d8bef9SDimitry Andric   Register RayOrigin = MI.getOperand(4).getReg();
6642e8d8bef9SDimitry Andric   Register RayDir = MI.getOperand(5).getReg();
6643e8d8bef9SDimitry Andric   Register RayInvDir = MI.getOperand(6).getReg();
6644e8d8bef9SDimitry Andric   Register TDescr = MI.getOperand(7).getReg();
6645e8d8bef9SDimitry Andric 
6646fe6060f1SDimitry Andric   if (!ST.hasGFX10_AEncoding()) {
6647fe6060f1SDimitry Andric     DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
6648fe6060f1SDimitry Andric                                         "intrinsic not supported on subtarget",
6649fe6060f1SDimitry Andric                                         MI.getDebugLoc());
6650fe6060f1SDimitry Andric     B.getMF().getFunction().getContext().diagnose(BadIntrin);
6651fe6060f1SDimitry Andric     return false;
6652fe6060f1SDimitry Andric   }
6653fe6060f1SDimitry Andric 
66545f757f3fSDimitry Andric   const bool IsGFX11 = AMDGPU::isGFX11(ST);
665581ad6265SDimitry Andric   const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
66565f757f3fSDimitry Andric   const bool IsGFX12Plus = AMDGPU::isGFX12Plus(ST);
6657349cc55cSDimitry Andric   const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
6658349cc55cSDimitry Andric   const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
6659349cc55cSDimitry Andric   const unsigned NumVDataDwords = 4;
6660349cc55cSDimitry Andric   const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
666181ad6265SDimitry Andric   const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
66625f757f3fSDimitry Andric   const bool UseNSA =
66635f757f3fSDimitry Andric       IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
66645f757f3fSDimitry Andric 
6665349cc55cSDimitry Andric   const unsigned BaseOpcodes[2][2] = {
6666349cc55cSDimitry Andric       {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
6667349cc55cSDimitry Andric       {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
6668349cc55cSDimitry Andric        AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
6669349cc55cSDimitry Andric   int Opcode;
6670349cc55cSDimitry Andric   if (UseNSA) {
667181ad6265SDimitry Andric     Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
66725f757f3fSDimitry Andric                                    IsGFX12Plus ? AMDGPU::MIMGEncGfx12
66735f757f3fSDimitry Andric                                    : IsGFX11   ? AMDGPU::MIMGEncGfx11NSA
667481ad6265SDimitry Andric                                                : AMDGPU::MIMGEncGfx10NSA,
6675349cc55cSDimitry Andric                                    NumVDataDwords, NumVAddrDwords);
6676349cc55cSDimitry Andric   } else {
66775f757f3fSDimitry Andric     assert(!IsGFX12Plus);
66785f757f3fSDimitry Andric     Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
66795f757f3fSDimitry Andric                                    IsGFX11 ? AMDGPU::MIMGEncGfx11Default
66805f757f3fSDimitry Andric                                            : AMDGPU::MIMGEncGfx10Default,
6681bdd1243dSDimitry Andric                                    NumVDataDwords, NumVAddrDwords);
6682349cc55cSDimitry Andric   }
6683349cc55cSDimitry Andric   assert(Opcode != -1);
6684e8d8bef9SDimitry Andric 
6685e8d8bef9SDimitry Andric   SmallVector<Register, 12> Ops;
668681ad6265SDimitry Andric   if (UseNSA && IsGFX11Plus) {
668781ad6265SDimitry Andric     auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
668881ad6265SDimitry Andric       auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
6689bdd1243dSDimitry Andric       auto Merged = B.buildMergeLikeInstr(
669081ad6265SDimitry Andric           V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
669181ad6265SDimitry Andric       Ops.push_back(Merged.getReg(0));
669281ad6265SDimitry Andric     };
669381ad6265SDimitry Andric 
669481ad6265SDimitry Andric     Ops.push_back(NodePtr);
669581ad6265SDimitry Andric     Ops.push_back(RayExtent);
669681ad6265SDimitry Andric     packLanes(RayOrigin);
669781ad6265SDimitry Andric 
669881ad6265SDimitry Andric     if (IsA16) {
669981ad6265SDimitry Andric       auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
670081ad6265SDimitry Andric       auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
6701bdd1243dSDimitry Andric       auto MergedDir = B.buildMergeLikeInstr(
670281ad6265SDimitry Andric           V3S32,
6703bdd1243dSDimitry Andric           {B.buildBitcast(
6704bdd1243dSDimitry Andric                 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
670581ad6265SDimitry Andric                                                    UnmergeRayDir.getReg(0)}))
670681ad6265SDimitry Andric                .getReg(0),
6707bdd1243dSDimitry Andric            B.buildBitcast(
6708bdd1243dSDimitry Andric                 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
670981ad6265SDimitry Andric                                                    UnmergeRayDir.getReg(1)}))
671081ad6265SDimitry Andric                .getReg(0),
6711bdd1243dSDimitry Andric            B.buildBitcast(
6712bdd1243dSDimitry Andric                 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
671381ad6265SDimitry Andric                                                    UnmergeRayDir.getReg(2)}))
671481ad6265SDimitry Andric                .getReg(0)});
671581ad6265SDimitry Andric       Ops.push_back(MergedDir.getReg(0));
671681ad6265SDimitry Andric     } else {
671781ad6265SDimitry Andric       packLanes(RayDir);
671881ad6265SDimitry Andric       packLanes(RayInvDir);
671981ad6265SDimitry Andric     }
672081ad6265SDimitry Andric   } else {
6721e8d8bef9SDimitry Andric     if (Is64) {
6722e8d8bef9SDimitry Andric       auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
6723e8d8bef9SDimitry Andric       Ops.push_back(Unmerge.getReg(0));
6724e8d8bef9SDimitry Andric       Ops.push_back(Unmerge.getReg(1));
6725e8d8bef9SDimitry Andric     } else {
6726e8d8bef9SDimitry Andric       Ops.push_back(NodePtr);
6727e8d8bef9SDimitry Andric     }
6728e8d8bef9SDimitry Andric     Ops.push_back(RayExtent);
6729e8d8bef9SDimitry Andric 
6730e8d8bef9SDimitry Andric     auto packLanes = [&Ops, &S32, &B](Register Src) {
67310eae32dcSDimitry Andric       auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
6732e8d8bef9SDimitry Andric       Ops.push_back(Unmerge.getReg(0));
6733e8d8bef9SDimitry Andric       Ops.push_back(Unmerge.getReg(1));
6734e8d8bef9SDimitry Andric       Ops.push_back(Unmerge.getReg(2));
6735e8d8bef9SDimitry Andric     };
6736e8d8bef9SDimitry Andric 
6737e8d8bef9SDimitry Andric     packLanes(RayOrigin);
6738e8d8bef9SDimitry Andric     if (IsA16) {
67390eae32dcSDimitry Andric       auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
67400eae32dcSDimitry Andric       auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
6741e8d8bef9SDimitry Andric       Register R1 = MRI.createGenericVirtualRegister(S32);
6742e8d8bef9SDimitry Andric       Register R2 = MRI.createGenericVirtualRegister(S32);
6743e8d8bef9SDimitry Andric       Register R3 = MRI.createGenericVirtualRegister(S32);
6744bdd1243dSDimitry Andric       B.buildMergeLikeInstr(R1,
6745bdd1243dSDimitry Andric                             {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
6746bdd1243dSDimitry Andric       B.buildMergeLikeInstr(
6747bdd1243dSDimitry Andric           R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
6748bdd1243dSDimitry Andric       B.buildMergeLikeInstr(
6749bdd1243dSDimitry Andric           R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
6750e8d8bef9SDimitry Andric       Ops.push_back(R1);
6751e8d8bef9SDimitry Andric       Ops.push_back(R2);
6752e8d8bef9SDimitry Andric       Ops.push_back(R3);
6753e8d8bef9SDimitry Andric     } else {
6754e8d8bef9SDimitry Andric       packLanes(RayDir);
6755e8d8bef9SDimitry Andric       packLanes(RayInvDir);
6756e8d8bef9SDimitry Andric     }
675781ad6265SDimitry Andric   }
6758e8d8bef9SDimitry Andric 
6759349cc55cSDimitry Andric   if (!UseNSA) {
6760349cc55cSDimitry Andric     // Build a single vector containing all the operands so far prepared.
6761349cc55cSDimitry Andric     LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
6762bdd1243dSDimitry Andric     Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
6763349cc55cSDimitry Andric     Ops.clear();
6764349cc55cSDimitry Andric     Ops.push_back(MergedOps);
6765349cc55cSDimitry Andric   }
6766349cc55cSDimitry Andric 
6767e8d8bef9SDimitry Andric   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
6768e8d8bef9SDimitry Andric     .addDef(DstReg)
6769e8d8bef9SDimitry Andric     .addImm(Opcode);
6770e8d8bef9SDimitry Andric 
6771e8d8bef9SDimitry Andric   for (Register R : Ops) {
6772e8d8bef9SDimitry Andric     MIB.addUse(R);
6773e8d8bef9SDimitry Andric   }
6774e8d8bef9SDimitry Andric 
6775e8d8bef9SDimitry Andric   MIB.addUse(TDescr)
6776e8d8bef9SDimitry Andric      .addImm(IsA16 ? 1 : 0)
6777e8d8bef9SDimitry Andric      .cloneMemRefs(MI);
6778e8d8bef9SDimitry Andric 
6779e8d8bef9SDimitry Andric   MI.eraseFromParent();
6780e8d8bef9SDimitry Andric   return true;
6781e8d8bef9SDimitry Andric }
6782e8d8bef9SDimitry Andric 
678381ad6265SDimitry Andric bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
678481ad6265SDimitry Andric                                                MachineIRBuilder &B) const {
678581ad6265SDimitry Andric   unsigned Opc;
678681ad6265SDimitry Andric   int RoundMode = MI.getOperand(2).getImm();
678781ad6265SDimitry Andric 
678881ad6265SDimitry Andric   if (RoundMode == (int)RoundingMode::TowardPositive)
678981ad6265SDimitry Andric     Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
679081ad6265SDimitry Andric   else if (RoundMode == (int)RoundingMode::TowardNegative)
679181ad6265SDimitry Andric     Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
679281ad6265SDimitry Andric   else
679381ad6265SDimitry Andric     return false;
679481ad6265SDimitry Andric 
679581ad6265SDimitry Andric   B.buildInstr(Opc)
679681ad6265SDimitry Andric       .addDef(MI.getOperand(0).getReg())
679781ad6265SDimitry Andric       .addUse(MI.getOperand(1).getReg());
679881ad6265SDimitry Andric 
679904eeddc0SDimitry Andric   MI.eraseFromParent();
680081ad6265SDimitry Andric 
680104eeddc0SDimitry Andric   return true;
680204eeddc0SDimitry Andric }
680304eeddc0SDimitry Andric 
68045f757f3fSDimitry Andric bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI,
68055f757f3fSDimitry Andric                                             MachineIRBuilder &B) const {
68065f757f3fSDimitry Andric   const SITargetLowering *TLI = ST.getTargetLowering();
68075f757f3fSDimitry Andric   Register StackPtr = TLI->getStackPointerRegisterToSaveRestore();
68085f757f3fSDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
68095f757f3fSDimitry Andric   B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
68105f757f3fSDimitry Andric   MI.eraseFromParent();
68115f757f3fSDimitry Andric   return true;
68125f757f3fSDimitry Andric }
68135f757f3fSDimitry Andric 
68145ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
68155ffd83dbSDimitry Andric                                             MachineInstr &MI) const {
68165ffd83dbSDimitry Andric   MachineIRBuilder &B = Helper.MIRBuilder;
68175ffd83dbSDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
68185ffd83dbSDimitry Andric 
68190b57cec5SDimitry Andric   // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
68205f757f3fSDimitry Andric   auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
6821480093f4SDimitry Andric   switch (IntrID) {
6822480093f4SDimitry Andric   case Intrinsic::amdgcn_if:
6823480093f4SDimitry Andric   case Intrinsic::amdgcn_else: {
6824480093f4SDimitry Andric     MachineInstr *Br = nullptr;
68255ffd83dbSDimitry Andric     MachineBasicBlock *UncondBrTarget = nullptr;
6826e8d8bef9SDimitry Andric     bool Negated = false;
6827e8d8bef9SDimitry Andric     if (MachineInstr *BrCond =
6828e8d8bef9SDimitry Andric             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
68290b57cec5SDimitry Andric       const SIRegisterInfo *TRI
68300b57cec5SDimitry Andric         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
68310b57cec5SDimitry Andric 
68320b57cec5SDimitry Andric       Register Def = MI.getOperand(1).getReg();
68330b57cec5SDimitry Andric       Register Use = MI.getOperand(3).getReg();
6834480093f4SDimitry Andric 
68355ffd83dbSDimitry Andric       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
6836e8d8bef9SDimitry Andric 
6837e8d8bef9SDimitry Andric       if (Negated)
6838e8d8bef9SDimitry Andric         std::swap(CondBrTarget, UncondBrTarget);
6839e8d8bef9SDimitry Andric 
68405ffd83dbSDimitry Andric       B.setInsertPt(B.getMBB(), BrCond->getIterator());
6841480093f4SDimitry Andric       if (IntrID == Intrinsic::amdgcn_if) {
68420b57cec5SDimitry Andric         B.buildInstr(AMDGPU::SI_IF)
68430b57cec5SDimitry Andric           .addDef(Def)
68440b57cec5SDimitry Andric           .addUse(Use)
68455ffd83dbSDimitry Andric           .addMBB(UncondBrTarget);
6846480093f4SDimitry Andric       } else {
6847480093f4SDimitry Andric         B.buildInstr(AMDGPU::SI_ELSE)
6848480093f4SDimitry Andric             .addDef(Def)
6849480093f4SDimitry Andric             .addUse(Use)
6850e8d8bef9SDimitry Andric             .addMBB(UncondBrTarget);
6851480093f4SDimitry Andric       }
6852480093f4SDimitry Andric 
68535ffd83dbSDimitry Andric       if (Br) {
68545ffd83dbSDimitry Andric         Br->getOperand(0).setMBB(CondBrTarget);
68555ffd83dbSDimitry Andric       } else {
68565ffd83dbSDimitry Andric         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
68575ffd83dbSDimitry Andric         // since we're swapping branch targets it needs to be reinserted.
68585ffd83dbSDimitry Andric         // FIXME: IRTranslator should probably not do this
68595ffd83dbSDimitry Andric         B.buildBr(*CondBrTarget);
68605ffd83dbSDimitry Andric       }
68610b57cec5SDimitry Andric 
68620b57cec5SDimitry Andric       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
68630b57cec5SDimitry Andric       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
68640b57cec5SDimitry Andric       MI.eraseFromParent();
68650b57cec5SDimitry Andric       BrCond->eraseFromParent();
68660b57cec5SDimitry Andric       return true;
68670b57cec5SDimitry Andric     }
68680b57cec5SDimitry Andric 
68690b57cec5SDimitry Andric     return false;
68700b57cec5SDimitry Andric   }
68710b57cec5SDimitry Andric   case Intrinsic::amdgcn_loop: {
6872480093f4SDimitry Andric     MachineInstr *Br = nullptr;
68735ffd83dbSDimitry Andric     MachineBasicBlock *UncondBrTarget = nullptr;
6874e8d8bef9SDimitry Andric     bool Negated = false;
6875e8d8bef9SDimitry Andric     if (MachineInstr *BrCond =
6876e8d8bef9SDimitry Andric             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
68770b57cec5SDimitry Andric       const SIRegisterInfo *TRI
68780b57cec5SDimitry Andric         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
68790b57cec5SDimitry Andric 
68805ffd83dbSDimitry Andric       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
68810b57cec5SDimitry Andric       Register Reg = MI.getOperand(2).getReg();
68825ffd83dbSDimitry Andric 
6883e8d8bef9SDimitry Andric       if (Negated)
6884e8d8bef9SDimitry Andric         std::swap(CondBrTarget, UncondBrTarget);
6885e8d8bef9SDimitry Andric 
68865ffd83dbSDimitry Andric       B.setInsertPt(B.getMBB(), BrCond->getIterator());
68870b57cec5SDimitry Andric       B.buildInstr(AMDGPU::SI_LOOP)
68880b57cec5SDimitry Andric         .addUse(Reg)
68895ffd83dbSDimitry Andric         .addMBB(UncondBrTarget);
68905ffd83dbSDimitry Andric 
68915ffd83dbSDimitry Andric       if (Br)
68925ffd83dbSDimitry Andric         Br->getOperand(0).setMBB(CondBrTarget);
68935ffd83dbSDimitry Andric       else
68945ffd83dbSDimitry Andric         B.buildBr(*CondBrTarget);
68955ffd83dbSDimitry Andric 
68960b57cec5SDimitry Andric       MI.eraseFromParent();
68970b57cec5SDimitry Andric       BrCond->eraseFromParent();
68980b57cec5SDimitry Andric       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
68990b57cec5SDimitry Andric       return true;
69000b57cec5SDimitry Andric     }
69010b57cec5SDimitry Andric 
69020b57cec5SDimitry Andric     return false;
69030b57cec5SDimitry Andric   }
690406c3fb27SDimitry Andric   case Intrinsic::amdgcn_make_buffer_rsrc:
690506c3fb27SDimitry Andric     return legalizePointerAsRsrcIntrin(MI, MRI, B);
69060b57cec5SDimitry Andric   case Intrinsic::amdgcn_kernarg_segment_ptr:
69075ffd83dbSDimitry Andric     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
69085ffd83dbSDimitry Andric       // This only makes sense to call in a kernel, so just lower to null.
69095ffd83dbSDimitry Andric       B.buildConstant(MI.getOperand(0).getReg(), 0);
69105ffd83dbSDimitry Andric       MI.eraseFromParent();
69115ffd83dbSDimitry Andric       return true;
69125ffd83dbSDimitry Andric     }
69135ffd83dbSDimitry Andric 
69140b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(
69150b57cec5SDimitry Andric       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
69160b57cec5SDimitry Andric   case Intrinsic::amdgcn_implicitarg_ptr:
69170b57cec5SDimitry Andric     return legalizeImplicitArgPtr(MI, MRI, B);
69180b57cec5SDimitry Andric   case Intrinsic::amdgcn_workitem_id_x:
691981ad6265SDimitry Andric     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
69200b57cec5SDimitry Andric                                        AMDGPUFunctionArgInfo::WORKITEM_ID_X);
69210b57cec5SDimitry Andric   case Intrinsic::amdgcn_workitem_id_y:
692281ad6265SDimitry Andric     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
69230b57cec5SDimitry Andric                                        AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
69240b57cec5SDimitry Andric   case Intrinsic::amdgcn_workitem_id_z:
692581ad6265SDimitry Andric     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
69260b57cec5SDimitry Andric                                        AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
69270b57cec5SDimitry Andric   case Intrinsic::amdgcn_workgroup_id_x:
69280b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
69290b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
69300b57cec5SDimitry Andric   case Intrinsic::amdgcn_workgroup_id_y:
69310b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
69320b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
69330b57cec5SDimitry Andric   case Intrinsic::amdgcn_workgroup_id_z:
69340b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
69350b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
6936fcaf7f86SDimitry Andric   case Intrinsic::amdgcn_lds_kernel_id:
6937fcaf7f86SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
6938fcaf7f86SDimitry Andric                                       AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
69390b57cec5SDimitry Andric   case Intrinsic::amdgcn_dispatch_ptr:
69400b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
69410b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
69420b57cec5SDimitry Andric   case Intrinsic::amdgcn_queue_ptr:
69430b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
69440b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
69450b57cec5SDimitry Andric   case Intrinsic::amdgcn_implicit_buffer_ptr:
69460b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(
69470b57cec5SDimitry Andric       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
69480b57cec5SDimitry Andric   case Intrinsic::amdgcn_dispatch_id:
69490b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
69500b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
695181ad6265SDimitry Andric   case Intrinsic::r600_read_ngroups_x:
695281ad6265SDimitry Andric     // TODO: Emit error for hsa
695381ad6265SDimitry Andric     return legalizeKernargMemParameter(MI, B,
695481ad6265SDimitry Andric                                        SI::KernelInputOffsets::NGROUPS_X);
695581ad6265SDimitry Andric   case Intrinsic::r600_read_ngroups_y:
695681ad6265SDimitry Andric     return legalizeKernargMemParameter(MI, B,
695781ad6265SDimitry Andric                                        SI::KernelInputOffsets::NGROUPS_Y);
695881ad6265SDimitry Andric   case Intrinsic::r600_read_ngroups_z:
695981ad6265SDimitry Andric     return legalizeKernargMemParameter(MI, B,
696081ad6265SDimitry Andric                                        SI::KernelInputOffsets::NGROUPS_Z);
696181ad6265SDimitry Andric   case Intrinsic::r600_read_local_size_x:
696281ad6265SDimitry Andric     // TODO: Could insert G_ASSERT_ZEXT from s16
696381ad6265SDimitry Andric     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X);
696481ad6265SDimitry Andric   case Intrinsic::r600_read_local_size_y:
696581ad6265SDimitry Andric     // TODO: Could insert G_ASSERT_ZEXT from s16
696681ad6265SDimitry Andric     return legalizeKernargMemParameter(MI, B,  SI::KernelInputOffsets::LOCAL_SIZE_Y);
696781ad6265SDimitry Andric     // TODO: Could insert G_ASSERT_ZEXT from s16
696881ad6265SDimitry Andric   case Intrinsic::r600_read_local_size_z:
696981ad6265SDimitry Andric     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z);
697081ad6265SDimitry Andric   case Intrinsic::r600_read_global_size_x:
697181ad6265SDimitry Andric     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X);
697281ad6265SDimitry Andric   case Intrinsic::r600_read_global_size_y:
697381ad6265SDimitry Andric     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y);
697481ad6265SDimitry Andric   case Intrinsic::r600_read_global_size_z:
697581ad6265SDimitry Andric     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z);
69768bcb0991SDimitry Andric   case Intrinsic::amdgcn_fdiv_fast:
69778bcb0991SDimitry Andric     return legalizeFDIVFastIntrin(MI, MRI, B);
69788bcb0991SDimitry Andric   case Intrinsic::amdgcn_is_shared:
69798bcb0991SDimitry Andric     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
69808bcb0991SDimitry Andric   case Intrinsic::amdgcn_is_private:
69818bcb0991SDimitry Andric     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
69828bcb0991SDimitry Andric   case Intrinsic::amdgcn_wavefrontsize: {
69838bcb0991SDimitry Andric     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
69848bcb0991SDimitry Andric     MI.eraseFromParent();
69858bcb0991SDimitry Andric     return true;
69868bcb0991SDimitry Andric   }
69875ffd83dbSDimitry Andric   case Intrinsic::amdgcn_s_buffer_load:
6988e8d8bef9SDimitry Andric     return legalizeSBufferLoad(Helper, MI);
69898bcb0991SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_store:
699006c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_store:
69915ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_store:
699206c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_store:
69935ffd83dbSDimitry Andric     return legalizeBufferStore(MI, MRI, B, false, false);
69948bcb0991SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_store_format:
699506c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
69965ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_store_format:
699706c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
69985ffd83dbSDimitry Andric     return legalizeBufferStore(MI, MRI, B, false, true);
69995ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_tbuffer_store:
700006c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
70015ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_tbuffer_store:
700206c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
70035ffd83dbSDimitry Andric     return legalizeBufferStore(MI, MRI, B, true, true);
70045ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_load:
700506c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_load:
70065ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_load:
700706c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_load:
70085ffd83dbSDimitry Andric     return legalizeBufferLoad(MI, MRI, B, false, false);
70095ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_load_format:
701006c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
70115ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_load_format:
701206c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
70135ffd83dbSDimitry Andric     return legalizeBufferLoad(MI, MRI, B, true, false);
70145ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_tbuffer_load:
701506c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
70165ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_tbuffer_load:
701706c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
70185ffd83dbSDimitry Andric     return legalizeBufferLoad(MI, MRI, B, true, true);
70195ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
702006c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
70215ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
702206c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
70235ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_add:
702406c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
70255ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_add:
702606c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
70275ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
702806c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
70295ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
703006c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
70315ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
703206c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
70335ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
703406c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
70355ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
703606c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
70375ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
703806c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
70395ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
704006c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
70415ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
704206c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
70435ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
704406c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
70455ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
704606c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
70475ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_and:
704806c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
70495ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_and:
705006c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
70515ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_or:
705206c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
70535ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_or:
705406c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
70555ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
705606c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
70575ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
705806c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
70595ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
706006c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
70615ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
706206c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
70635ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
706406c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
70655ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
706606c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
70675ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
706806c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
70695ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
707006c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
7071fe6060f1SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
707206c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
7073fe6060f1SDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
707406c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
7075fe6060f1SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
707606c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
7077fe6060f1SDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
707806c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
707904eeddc0SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
708006c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
7081bdd1243dSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
708206c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
708304eeddc0SDimitry Andric     return legalizeBufferAtomic(MI, B, IntrID);
70845ffd83dbSDimitry Andric   case Intrinsic::trap:
70855ffd83dbSDimitry Andric     return legalizeTrapIntrinsic(MI, MRI, B);
70865ffd83dbSDimitry Andric   case Intrinsic::debugtrap:
70875ffd83dbSDimitry Andric     return legalizeDebugTrapIntrinsic(MI, MRI, B);
7088e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_rsq_clamp:
7089e8d8bef9SDimitry Andric     return legalizeRsqClampIntrinsic(MI, MRI, B);
7090e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_ds_fadd:
7091e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_ds_fmin:
7092e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_ds_fmax:
7093e8d8bef9SDimitry Andric     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
7094e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_image_bvh_intersect_ray:
7095e8d8bef9SDimitry Andric     return legalizeBVHIntrinsic(MI, B);
709606c3fb27SDimitry Andric   case Intrinsic::amdgcn_fmed3: {
709706c3fb27SDimitry Andric     GISelChangeObserver &Observer = Helper.Observer;
709806c3fb27SDimitry Andric 
709906c3fb27SDimitry Andric     // FIXME: This is to workaround the inability of tablegen match combiners to
710006c3fb27SDimitry Andric     // match intrinsics in patterns.
710106c3fb27SDimitry Andric     Observer.changingInstr(MI);
710206c3fb27SDimitry Andric     MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
710306c3fb27SDimitry Andric     MI.removeOperand(1);
710406c3fb27SDimitry Andric     Observer.changedInstr(MI);
710506c3fb27SDimitry Andric     return true;
710606c3fb27SDimitry Andric   }
71075ffd83dbSDimitry Andric   default: {
71085ffd83dbSDimitry Andric     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
71095ffd83dbSDimitry Andric             AMDGPU::getImageDimIntrinsicInfo(IntrID))
71105ffd83dbSDimitry Andric       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
71110b57cec5SDimitry Andric     return true;
71120b57cec5SDimitry Andric   }
71135ffd83dbSDimitry Andric   }
71140b57cec5SDimitry Andric 
71150b57cec5SDimitry Andric   return true;
71160b57cec5SDimitry Andric }
7117