xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (revision 5f757f3ff9144b609b3c433dfd370cc6bdc191ad)
10b57cec5SDimitry Andric //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
20b57cec5SDimitry Andric //
30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
60b57cec5SDimitry Andric //
70b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
80b57cec5SDimitry Andric /// \file
90b57cec5SDimitry Andric /// This file implements the targeting of the Machinelegalizer class for
100b57cec5SDimitry Andric /// AMDGPU.
110b57cec5SDimitry Andric /// \todo This should be generated by TableGen.
120b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
130b57cec5SDimitry Andric 
145ffd83dbSDimitry Andric #include "AMDGPULegalizerInfo.h"
158bcb0991SDimitry Andric 
160b57cec5SDimitry Andric #include "AMDGPU.h"
175ffd83dbSDimitry Andric #include "AMDGPUGlobalISelUtils.h"
18e8d8bef9SDimitry Andric #include "AMDGPUInstrInfo.h"
190b57cec5SDimitry Andric #include "AMDGPUTargetMachine.h"
20*5f757f3fSDimitry Andric #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
21*5f757f3fSDimitry Andric #include "SIInstrInfo.h"
220b57cec5SDimitry Andric #include "SIMachineFunctionInfo.h"
23*5f757f3fSDimitry Andric #include "SIRegisterInfo.h"
24fe6060f1SDimitry Andric #include "Utils/AMDGPUBaseInfo.h"
255ffd83dbSDimitry Andric #include "llvm/ADT/ScopeExit.h"
26fe6060f1SDimitry Andric #include "llvm/BinaryFormat/ELF.h"
27*5f757f3fSDimitry Andric #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
280b57cec5SDimitry Andric #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
295ffd83dbSDimitry Andric #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
30e8d8bef9SDimitry Andric #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
3106c3fb27SDimitry Andric #include "llvm/CodeGen/GlobalISel/Utils.h"
32*5f757f3fSDimitry Andric #include "llvm/CodeGen/TargetOpcodes.h"
338bcb0991SDimitry Andric #include "llvm/IR/DiagnosticInfo.h"
34e8d8bef9SDimitry Andric #include "llvm/IR/IntrinsicsAMDGPU.h"
3581ad6265SDimitry Andric #include "llvm/IR/IntrinsicsR600.h"
360b57cec5SDimitry Andric 
370b57cec5SDimitry Andric #define DEBUG_TYPE "amdgpu-legalinfo"
380b57cec5SDimitry Andric 
390b57cec5SDimitry Andric using namespace llvm;
400b57cec5SDimitry Andric using namespace LegalizeActions;
410b57cec5SDimitry Andric using namespace LegalizeMutations;
420b57cec5SDimitry Andric using namespace LegalityPredicates;
435ffd83dbSDimitry Andric using namespace MIPatternMatch;
440b57cec5SDimitry Andric 
455ffd83dbSDimitry Andric // Hack until load/store selection patterns support any tuple of legal types.
465ffd83dbSDimitry Andric static cl::opt<bool> EnableNewLegality(
475ffd83dbSDimitry Andric   "amdgpu-global-isel-new-legality",
485ffd83dbSDimitry Andric   cl::desc("Use GlobalISel desired legality, rather than try to use"
495ffd83dbSDimitry Andric            "rules compatible with selection patterns"),
505ffd83dbSDimitry Andric   cl::init(false),
515ffd83dbSDimitry Andric   cl::ReallyHidden);
520b57cec5SDimitry Andric 
535ffd83dbSDimitry Andric static constexpr unsigned MaxRegisterSize = 1024;
545ffd83dbSDimitry Andric 
555ffd83dbSDimitry Andric // Round the number of elements to the next power of two elements
565ffd83dbSDimitry Andric static LLT getPow2VectorType(LLT Ty) {
575ffd83dbSDimitry Andric   unsigned NElts = Ty.getNumElements();
585ffd83dbSDimitry Andric   unsigned Pow2NElts = 1 <<  Log2_32_Ceil(NElts);
59fe6060f1SDimitry Andric   return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
600b57cec5SDimitry Andric }
610b57cec5SDimitry Andric 
625ffd83dbSDimitry Andric // Round the number of bits to the next power of two bits
635ffd83dbSDimitry Andric static LLT getPow2ScalarType(LLT Ty) {
645ffd83dbSDimitry Andric   unsigned Bits = Ty.getSizeInBits();
655ffd83dbSDimitry Andric   unsigned Pow2Bits = 1 <<  Log2_32_Ceil(Bits);
665ffd83dbSDimitry Andric   return LLT::scalar(Pow2Bits);
678bcb0991SDimitry Andric }
688bcb0991SDimitry Andric 
69349cc55cSDimitry Andric /// \returns true if this is an odd sized vector which should widen by adding an
70e8d8bef9SDimitry Andric /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
71e8d8bef9SDimitry Andric /// excludes s1 vectors, which should always be scalarized.
720b57cec5SDimitry Andric static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
730b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
740b57cec5SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
75e8d8bef9SDimitry Andric     if (!Ty.isVector())
76e8d8bef9SDimitry Andric       return false;
77e8d8bef9SDimitry Andric 
78e8d8bef9SDimitry Andric     const LLT EltTy = Ty.getElementType();
79e8d8bef9SDimitry Andric     const unsigned EltSize = EltTy.getSizeInBits();
80e8d8bef9SDimitry Andric     return Ty.getNumElements() % 2 != 0 &&
81e8d8bef9SDimitry Andric            EltSize > 1 && EltSize < 32 &&
828bcb0991SDimitry Andric            Ty.getSizeInBits() % 32 != 0;
838bcb0991SDimitry Andric   };
848bcb0991SDimitry Andric }
858bcb0991SDimitry Andric 
86e8d8bef9SDimitry Andric static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
87e8d8bef9SDimitry Andric   return [=](const LegalityQuery &Query) {
88e8d8bef9SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
89e8d8bef9SDimitry Andric     return Ty.getSizeInBits() % 32 == 0;
90e8d8bef9SDimitry Andric   };
91e8d8bef9SDimitry Andric }
92e8d8bef9SDimitry Andric 
938bcb0991SDimitry Andric static LegalityPredicate isWideVec16(unsigned TypeIdx) {
948bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
958bcb0991SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
968bcb0991SDimitry Andric     const LLT EltTy = Ty.getScalarType();
978bcb0991SDimitry Andric     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
980b57cec5SDimitry Andric   };
990b57cec5SDimitry Andric }
1000b57cec5SDimitry Andric 
1010b57cec5SDimitry Andric static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
1020b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
1030b57cec5SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
1040b57cec5SDimitry Andric     const LLT EltTy = Ty.getElementType();
105bdd1243dSDimitry Andric     return std::pair(TypeIdx,
106fe6060f1SDimitry Andric                      LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
1070b57cec5SDimitry Andric   };
1080b57cec5SDimitry Andric }
1090b57cec5SDimitry Andric 
1100b57cec5SDimitry Andric static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
1110b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
1120b57cec5SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
1130b57cec5SDimitry Andric     const LLT EltTy = Ty.getElementType();
1140b57cec5SDimitry Andric     unsigned Size = Ty.getSizeInBits();
1150b57cec5SDimitry Andric     unsigned Pieces = (Size + 63) / 64;
1160b57cec5SDimitry Andric     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
117bdd1243dSDimitry Andric     return std::pair(TypeIdx, LLT::scalarOrVector(
118bdd1243dSDimitry Andric                                   ElementCount::getFixed(NewNumElts), EltTy));
1190b57cec5SDimitry Andric   };
1200b57cec5SDimitry Andric }
1210b57cec5SDimitry Andric 
1228bcb0991SDimitry Andric // Increase the number of vector elements to reach the next multiple of 32-bit
1238bcb0991SDimitry Andric // type.
1248bcb0991SDimitry Andric static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
1258bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
1268bcb0991SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
1278bcb0991SDimitry Andric 
1288bcb0991SDimitry Andric     const LLT EltTy = Ty.getElementType();
1298bcb0991SDimitry Andric     const int Size = Ty.getSizeInBits();
1308bcb0991SDimitry Andric     const int EltSize = EltTy.getSizeInBits();
1318bcb0991SDimitry Andric     const int NextMul32 = (Size + 31) / 32;
1328bcb0991SDimitry Andric 
1338bcb0991SDimitry Andric     assert(EltSize < 32);
1348bcb0991SDimitry Andric 
1358bcb0991SDimitry Andric     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
136bdd1243dSDimitry Andric     return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
1378bcb0991SDimitry Andric   };
1388bcb0991SDimitry Andric }
1398bcb0991SDimitry Andric 
14006c3fb27SDimitry Andric // Increase the number of vector elements to reach the next legal RegClass.
14106c3fb27SDimitry Andric static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
14206c3fb27SDimitry Andric   return [=](const LegalityQuery &Query) {
14306c3fb27SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
14406c3fb27SDimitry Andric     const unsigned NumElts = Ty.getNumElements();
14506c3fb27SDimitry Andric     const unsigned EltSize = Ty.getElementType().getSizeInBits();
14606c3fb27SDimitry Andric     const unsigned MaxNumElts = MaxRegisterSize / EltSize;
14706c3fb27SDimitry Andric 
14806c3fb27SDimitry Andric     assert(EltSize == 32 || EltSize == 64);
14906c3fb27SDimitry Andric     assert(Ty.getSizeInBits() < MaxRegisterSize);
15006c3fb27SDimitry Andric 
15106c3fb27SDimitry Andric     unsigned NewNumElts;
15206c3fb27SDimitry Andric     // Find the nearest legal RegClass that is larger than the current type.
15306c3fb27SDimitry Andric     for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
15406c3fb27SDimitry Andric       if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize))
15506c3fb27SDimitry Andric         break;
15606c3fb27SDimitry Andric     }
15706c3fb27SDimitry Andric 
15806c3fb27SDimitry Andric     return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltSize));
15906c3fb27SDimitry Andric   };
16006c3fb27SDimitry Andric }
16106c3fb27SDimitry Andric 
16206c3fb27SDimitry Andric static LLT getBufferRsrcScalarType(const LLT Ty) {
16306c3fb27SDimitry Andric   if (!Ty.isVector())
16406c3fb27SDimitry Andric     return LLT::scalar(128);
16506c3fb27SDimitry Andric   const ElementCount NumElems = Ty.getElementCount();
16606c3fb27SDimitry Andric   return LLT::vector(NumElems, LLT::scalar(128));
16706c3fb27SDimitry Andric }
16806c3fb27SDimitry Andric 
16906c3fb27SDimitry Andric static LLT getBufferRsrcRegisterType(const LLT Ty) {
17006c3fb27SDimitry Andric   if (!Ty.isVector())
17106c3fb27SDimitry Andric     return LLT::fixed_vector(4, LLT::scalar(32));
17206c3fb27SDimitry Andric   const unsigned NumElems = Ty.getElementCount().getFixedValue();
17306c3fb27SDimitry Andric   return LLT::fixed_vector(NumElems * 4, LLT::scalar(32));
17406c3fb27SDimitry Andric }
17506c3fb27SDimitry Andric 
176e8d8bef9SDimitry Andric static LLT getBitcastRegisterType(const LLT Ty) {
177e8d8bef9SDimitry Andric   const unsigned Size = Ty.getSizeInBits();
1785ffd83dbSDimitry Andric 
1795ffd83dbSDimitry Andric   if (Size <= 32) {
1805ffd83dbSDimitry Andric     // <2 x s8> -> s16
1815ffd83dbSDimitry Andric     // <4 x s8> -> s32
182e8d8bef9SDimitry Andric     return LLT::scalar(Size);
183e8d8bef9SDimitry Andric   }
1845ffd83dbSDimitry Andric 
185fe6060f1SDimitry Andric   return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
186e8d8bef9SDimitry Andric }
187e8d8bef9SDimitry Andric 
188e8d8bef9SDimitry Andric static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
189e8d8bef9SDimitry Andric   return [=](const LegalityQuery &Query) {
190e8d8bef9SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
191bdd1243dSDimitry Andric     return std::pair(TypeIdx, getBitcastRegisterType(Ty));
192e8d8bef9SDimitry Andric   };
193e8d8bef9SDimitry Andric }
194e8d8bef9SDimitry Andric 
195e8d8bef9SDimitry Andric static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
196e8d8bef9SDimitry Andric   return [=](const LegalityQuery &Query) {
197e8d8bef9SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
198e8d8bef9SDimitry Andric     unsigned Size = Ty.getSizeInBits();
199e8d8bef9SDimitry Andric     assert(Size % 32 == 0);
200bdd1243dSDimitry Andric     return std::pair(
201fe6060f1SDimitry Andric         TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
2025ffd83dbSDimitry Andric   };
2035ffd83dbSDimitry Andric }
2045ffd83dbSDimitry Andric 
2058bcb0991SDimitry Andric static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
2068bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
2078bcb0991SDimitry Andric     const LLT QueryTy = Query.Types[TypeIdx];
2088bcb0991SDimitry Andric     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
2098bcb0991SDimitry Andric   };
2108bcb0991SDimitry Andric }
2118bcb0991SDimitry Andric 
2120b57cec5SDimitry Andric static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
2130b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
2140b57cec5SDimitry Andric     const LLT QueryTy = Query.Types[TypeIdx];
2150b57cec5SDimitry Andric     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
2160b57cec5SDimitry Andric   };
2170b57cec5SDimitry Andric }
2180b57cec5SDimitry Andric 
2190b57cec5SDimitry Andric static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
2200b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
2210b57cec5SDimitry Andric     const LLT QueryTy = Query.Types[TypeIdx];
2220b57cec5SDimitry Andric     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
2230b57cec5SDimitry Andric   };
2240b57cec5SDimitry Andric }
2250b57cec5SDimitry Andric 
2265ffd83dbSDimitry Andric static bool isRegisterSize(unsigned Size) {
2275ffd83dbSDimitry Andric   return Size % 32 == 0 && Size <= MaxRegisterSize;
2285ffd83dbSDimitry Andric }
2295ffd83dbSDimitry Andric 
2305ffd83dbSDimitry Andric static bool isRegisterVectorElementType(LLT EltTy) {
2315ffd83dbSDimitry Andric   const int EltSize = EltTy.getSizeInBits();
2325ffd83dbSDimitry Andric   return EltSize == 16 || EltSize % 32 == 0;
2335ffd83dbSDimitry Andric }
2345ffd83dbSDimitry Andric 
2355ffd83dbSDimitry Andric static bool isRegisterVectorType(LLT Ty) {
2360b57cec5SDimitry Andric   const int EltSize = Ty.getElementType().getSizeInBits();
2370b57cec5SDimitry Andric   return EltSize == 32 || EltSize == 64 ||
2380b57cec5SDimitry Andric          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
2390b57cec5SDimitry Andric          EltSize == 128 || EltSize == 256;
2400b57cec5SDimitry Andric }
2410b57cec5SDimitry Andric 
2425ffd83dbSDimitry Andric static bool isRegisterType(LLT Ty) {
2435ffd83dbSDimitry Andric   if (!isRegisterSize(Ty.getSizeInBits()))
2445ffd83dbSDimitry Andric     return false;
2455ffd83dbSDimitry Andric 
2465ffd83dbSDimitry Andric   if (Ty.isVector())
2475ffd83dbSDimitry Andric     return isRegisterVectorType(Ty);
2485ffd83dbSDimitry Andric 
2495ffd83dbSDimitry Andric   return true;
2505ffd83dbSDimitry Andric }
2515ffd83dbSDimitry Andric 
2525ffd83dbSDimitry Andric // Any combination of 32 or 64-bit elements up the maximum register size, and
2535ffd83dbSDimitry Andric // multiples of v2s16.
2545ffd83dbSDimitry Andric static LegalityPredicate isRegisterType(unsigned TypeIdx) {
2555ffd83dbSDimitry Andric   return [=](const LegalityQuery &Query) {
2565ffd83dbSDimitry Andric     return isRegisterType(Query.Types[TypeIdx]);
2578bcb0991SDimitry Andric   };
2588bcb0991SDimitry Andric }
2598bcb0991SDimitry Andric 
26006c3fb27SDimitry Andric // RegisterType that doesn't have a corresponding RegClass.
26106c3fb27SDimitry Andric static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx) {
26206c3fb27SDimitry Andric   return [=](const LegalityQuery &Query) {
26306c3fb27SDimitry Andric     LLT Ty = Query.Types[TypeIdx];
26406c3fb27SDimitry Andric     return isRegisterType(Ty) &&
26506c3fb27SDimitry Andric            !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits());
26606c3fb27SDimitry Andric   };
26706c3fb27SDimitry Andric }
26806c3fb27SDimitry Andric 
2695ffd83dbSDimitry Andric static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
2708bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
2715ffd83dbSDimitry Andric     const LLT QueryTy = Query.Types[TypeIdx];
2725ffd83dbSDimitry Andric     if (!QueryTy.isVector())
2735ffd83dbSDimitry Andric       return false;
2745ffd83dbSDimitry Andric     const LLT EltTy = QueryTy.getElementType();
2755ffd83dbSDimitry Andric     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
2768bcb0991SDimitry Andric   };
2778bcb0991SDimitry Andric }
2788bcb0991SDimitry Andric 
279fe6060f1SDimitry Andric // If we have a truncating store or an extending load with a data size larger
280fe6060f1SDimitry Andric // than 32-bits, we need to reduce to a 32-bit type.
281fe6060f1SDimitry Andric static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
2828bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
2838bcb0991SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
2848bcb0991SDimitry Andric     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
285fe6060f1SDimitry Andric            Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
2860b57cec5SDimitry Andric   };
2870b57cec5SDimitry Andric }
2880b57cec5SDimitry Andric 
// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
/// \returns the maximum legal memory access width, in bits, for address space
/// \p AS on subtarget \p ST. \p IsLoad selects the (wider) load limit for
/// global/constant-like spaces; \p IsAtomic selects the atomic limit in the
/// default (flat-like) case.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad, bool IsAtomic) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return ST.enableFlatScratch() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
  case AMDGPUAS::BUFFER_RESOURCE:
    // Treat constant and global as identical. SMRD loads are sometimes usable for
    // global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written in a
    // kernel.
    return IsLoad ? 512 : 128;
  default:
    // FIXME: Flat addresses may contextually need to be split to 32-bit parts
    // if they may alias scratch depending on the subtarget.  This needs to be
    // moved to custom handling to use addressMayBeAccessedAsPrivate
    return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
  }
}
3185ffd83dbSDimitry Andric 
/// \returns true if the load/store described by \p Query (type 0 = data,
/// type 1 = pointer, MMODescrs[0] = memory access) has a size, extension
/// shape, and alignment the target can handle directly for its address space.
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
  uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // Do not handle extending vector loads.
  if (Ty.isVector() && MemSize != RegSize)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough, but
  // we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  // Reject accesses wider than the address space allows (stricter limit for
  // stores and for atomics).
  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
                                    Query.MMODescrs[0].Ordering !=
                                        AtomicOrdering::NotAtomic))
    return false;

  // Only a fixed menu of access widths is supported.
  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    // 96-bit (dwordx3) accesses need explicit subtarget support.
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  // Under-aligned accesses are only legal if the target tolerates the
  // misalignment for this size and address space.
  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}
3865ffd83dbSDimitry Andric 
38706c3fb27SDimitry Andric // The newer buffer intrinsic forms take their resource arguments as
38806c3fb27SDimitry Andric // pointers in address space 8, aka s128 values. However, in order to not break
38906c3fb27SDimitry Andric // SelectionDAG, the underlying operations have to continue to take v4i32
39006c3fb27SDimitry Andric // arguments. Therefore, we convert resource pointers - or vectors of them
39106c3fb27SDimitry Andric // to integer values here.
39206c3fb27SDimitry Andric static bool hasBufferRsrcWorkaround(const LLT Ty) {
39306c3fb27SDimitry Andric   if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
39406c3fb27SDimitry Andric     return true;
39506c3fb27SDimitry Andric   if (Ty.isVector()) {
39606c3fb27SDimitry Andric     const LLT ElemTy = Ty.getElementType();
39706c3fb27SDimitry Andric     return hasBufferRsrcWorkaround(ElemTy);
39806c3fb27SDimitry Andric   }
39906c3fb27SDimitry Andric   return false;
40006c3fb27SDimitry Andric }
40106c3fb27SDimitry Andric 
4025ffd83dbSDimitry Andric // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
4035ffd83dbSDimitry Andric // workaround this. Eventually it should ignore the type for loads and only care
4045ffd83dbSDimitry Andric // about the size. Return true in cases where we will workaround this for now by
4055ffd83dbSDimitry Andric // bitcasting.
4065ffd83dbSDimitry Andric static bool loadStoreBitcastWorkaround(const LLT Ty) {
4075ffd83dbSDimitry Andric   if (EnableNewLegality)
4085ffd83dbSDimitry Andric     return false;
4095ffd83dbSDimitry Andric 
4105ffd83dbSDimitry Andric   const unsigned Size = Ty.getSizeInBits();
4115ffd83dbSDimitry Andric   if (Size <= 64)
4125ffd83dbSDimitry Andric     return false;
41306c3fb27SDimitry Andric   // Address space 8 pointers get their own workaround.
41406c3fb27SDimitry Andric   if (hasBufferRsrcWorkaround(Ty))
41506c3fb27SDimitry Andric     return false;
4165ffd83dbSDimitry Andric   if (!Ty.isVector())
4175ffd83dbSDimitry Andric     return true;
418e8d8bef9SDimitry Andric 
419e8d8bef9SDimitry Andric   LLT EltTy = Ty.getElementType();
420e8d8bef9SDimitry Andric   if (EltTy.isPointer())
421e8d8bef9SDimitry Andric     return true;
422e8d8bef9SDimitry Andric 
423e8d8bef9SDimitry Andric   unsigned EltSize = EltTy.getSizeInBits();
4245ffd83dbSDimitry Andric   return EltSize != 32 && EltSize != 64;
4255ffd83dbSDimitry Andric }
4265ffd83dbSDimitry Andric 
427fe6060f1SDimitry Andric static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
4285ffd83dbSDimitry Andric   const LLT Ty = Query.Types[0];
429fe6060f1SDimitry Andric   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
43006c3fb27SDimitry Andric          !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
4315ffd83dbSDimitry Andric }
4325ffd83dbSDimitry Andric 
/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const LLT MemTy) {
  const unsigned MemSizeInBits = MemTy.getSizeInBits();
  const unsigned Size = Ty.getSizeInBits();
  // Extending/truncating access: only bitcast small (<= 32-bit) vectors.
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;

  // Don't try to handle bitcasting vector ext loads for now.
  return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
         (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}
450e8d8bef9SDimitry Andric 
/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this case when the memory access itself
/// changes, not the size of the result register.
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
                            uint64_t AlignInBits, unsigned AddrSpace,
                            unsigned Opcode) {
  unsigned SizeInBits = MemoryTy.getSizeInBits();
  // We don't want to widen cases that are naturally legal.
  if (isPowerOf2_32(SizeInBits))
    return false;

  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
  // end up widening these for a scalar load during RegBankSelect, if we don't
  // have 96-bit scalar loads.
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
    return false;

  // NOTE(review): `Opcode` is passed into maxSizeForAddrSpace's `bool IsLoad`
  // parameter, so any nonzero opcode reads as "is a load" — confirm this is
  // intentional rather than a leftover from a signature change.
  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
    return false;

  // A load is known dereferenceable up to the alignment, so it's legal to widen
  // to it.
  //
  // TODO: Could check dereferenceable for less aligned cases.
  unsigned RoundedSize = NextPowerOf2(SizeInBits);
  if (AlignInBits < RoundedSize)
    return false;

  // Do not widen if it would introduce a slow unaligned load.
  const SITargetLowering *TLI = ST.getTargetLowering();
  unsigned Fast = 0;
  return TLI->allowsMisalignedMemoryAccessesImpl(
             RoundedSize, AddrSpace, Align(AlignInBits / 8),
             MachineMemOperand::MOLoad, &Fast) &&
         Fast;
}
487e8d8bef9SDimitry Andric 
488e8d8bef9SDimitry Andric static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
489e8d8bef9SDimitry Andric                             unsigned Opcode) {
490e8d8bef9SDimitry Andric   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
491e8d8bef9SDimitry Andric     return false;
492e8d8bef9SDimitry Andric 
493fe6060f1SDimitry Andric   return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
494e8d8bef9SDimitry Andric                          Query.MMODescrs[0].AlignInBits,
495e8d8bef9SDimitry Andric                          Query.Types[1].getAddressSpace(), Opcode);
496e8d8bef9SDimitry Andric }
497e8d8bef9SDimitry Andric 
/// Mutates IR (typically a load instruction) to use a <4 x s32> as the initial
/// type of the operand `idx` and then to transform it to a `p8` via bitcasts
/// and inttoptr. In addition, handle vectors of p8. Returns the new type.
static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
                                   MachineRegisterInfo &MRI, unsigned Idx) {
  MachineOperand &MO = MI.getOperand(Idx);

  const LLT PointerTy = MRI.getType(MO.getReg());

  // Paranoidly prevent us from doing this multiple times.
  if (!hasBufferRsrcWorkaround(PointerTy))
    return PointerTy;

  const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
  const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
  if (!PointerTy.isVector()) {
    // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
    const unsigned NumParts = PointerTy.getSizeInBits() / 32;
    const LLT S32 = LLT::scalar(32);

    Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
    std::array<Register, 4> VectorElems;
    // Advance the insert point one instruction (presumably past MI, whose
    // operand now becomes the new vector register) — builder position is set
    // up by the caller; confirm when touching this.
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    // Extract the dwords and merge them back into the original p8 value.
    for (unsigned I = 0; I < NumParts; ++I)
      VectorElems[I] =
          B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
    B.buildMergeValues(MO, VectorElems);
    MO.setReg(VectorReg);
    return VectorTy;
  }
  // Vector-of-p8 case: bitcast the widened register to a vector of s128, then
  // inttoptr it back into the original pointer-vector value.
  Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
  auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
  B.buildIntToPtr(MO, Scalar);
  MO.setReg(BitcastReg);

  return VectorTy;
}
53606c3fb27SDimitry Andric 
53706c3fb27SDimitry Andric /// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
53806c3fb27SDimitry Andric /// the form in which the value must be in order to be passed to the low-level
53906c3fb27SDimitry Andric /// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
54006c3fb27SDimitry Andric /// needed in order to account for the fact that we can't define a register
54106c3fb27SDimitry Andric /// class for s128 without breaking SelectionDAG.
54206c3fb27SDimitry Andric static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) {
54306c3fb27SDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
54406c3fb27SDimitry Andric   const LLT PointerTy = MRI.getType(Pointer);
54506c3fb27SDimitry Andric   const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
54606c3fb27SDimitry Andric   const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
54706c3fb27SDimitry Andric 
54806c3fb27SDimitry Andric   if (!PointerTy.isVector()) {
54906c3fb27SDimitry Andric     // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
55006c3fb27SDimitry Andric     SmallVector<Register, 4> PointerParts;
55106c3fb27SDimitry Andric     const unsigned NumParts = PointerTy.getSizeInBits() / 32;
55206c3fb27SDimitry Andric     auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
55306c3fb27SDimitry Andric     for (unsigned I = 0; I < NumParts; ++I)
55406c3fb27SDimitry Andric       PointerParts.push_back(Unmerged.getReg(I));
55506c3fb27SDimitry Andric     return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
55606c3fb27SDimitry Andric   }
55706c3fb27SDimitry Andric   Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
55806c3fb27SDimitry Andric   return B.buildBitcast(VectorTy, Scalar).getReg(0);
55906c3fb27SDimitry Andric }
56006c3fb27SDimitry Andric 
56106c3fb27SDimitry Andric static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B,
56206c3fb27SDimitry Andric                                      unsigned Idx) {
56306c3fb27SDimitry Andric   MachineOperand &MO = MI.getOperand(Idx);
56406c3fb27SDimitry Andric 
56506c3fb27SDimitry Andric   const LLT PointerTy = B.getMRI()->getType(MO.getReg());
56606c3fb27SDimitry Andric   // Paranoidly prevent us from doing this multiple times.
56706c3fb27SDimitry Andric   if (!hasBufferRsrcWorkaround(PointerTy))
56806c3fb27SDimitry Andric     return;
56906c3fb27SDimitry Andric   MO.setReg(castBufferRsrcToV4I32(MO.getReg(), B));
57006c3fb27SDimitry Andric }
57106c3fb27SDimitry Andric 
5720b57cec5SDimitry Andric AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
5730b57cec5SDimitry Andric                                          const GCNTargetMachine &TM)
5740b57cec5SDimitry Andric   :  ST(ST_) {
5750b57cec5SDimitry Andric   using namespace TargetOpcode;
5760b57cec5SDimitry Andric 
5770b57cec5SDimitry Andric   auto GetAddrSpacePtr = [&TM](unsigned AS) {
5780b57cec5SDimitry Andric     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
5790b57cec5SDimitry Andric   };
5800b57cec5SDimitry Andric 
5810b57cec5SDimitry Andric   const LLT S1 = LLT::scalar(1);
582e8d8bef9SDimitry Andric   const LLT S8 = LLT::scalar(8);
5830b57cec5SDimitry Andric   const LLT S16 = LLT::scalar(16);
5840b57cec5SDimitry Andric   const LLT S32 = LLT::scalar(32);
5850b57cec5SDimitry Andric   const LLT S64 = LLT::scalar(64);
5860b57cec5SDimitry Andric   const LLT S128 = LLT::scalar(128);
5870b57cec5SDimitry Andric   const LLT S256 = LLT::scalar(256);
5885ffd83dbSDimitry Andric   const LLT S512 = LLT::scalar(512);
5895ffd83dbSDimitry Andric   const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
5900b57cec5SDimitry Andric 
591fe6060f1SDimitry Andric   const LLT V2S8 = LLT::fixed_vector(2, 8);
592fe6060f1SDimitry Andric   const LLT V2S16 = LLT::fixed_vector(2, 16);
593fe6060f1SDimitry Andric   const LLT V4S16 = LLT::fixed_vector(4, 16);
5940b57cec5SDimitry Andric 
595fe6060f1SDimitry Andric   const LLT V2S32 = LLT::fixed_vector(2, 32);
596fe6060f1SDimitry Andric   const LLT V3S32 = LLT::fixed_vector(3, 32);
597fe6060f1SDimitry Andric   const LLT V4S32 = LLT::fixed_vector(4, 32);
598fe6060f1SDimitry Andric   const LLT V5S32 = LLT::fixed_vector(5, 32);
599fe6060f1SDimitry Andric   const LLT V6S32 = LLT::fixed_vector(6, 32);
600fe6060f1SDimitry Andric   const LLT V7S32 = LLT::fixed_vector(7, 32);
601fe6060f1SDimitry Andric   const LLT V8S32 = LLT::fixed_vector(8, 32);
602fe6060f1SDimitry Andric   const LLT V9S32 = LLT::fixed_vector(9, 32);
603fe6060f1SDimitry Andric   const LLT V10S32 = LLT::fixed_vector(10, 32);
604fe6060f1SDimitry Andric   const LLT V11S32 = LLT::fixed_vector(11, 32);
605fe6060f1SDimitry Andric   const LLT V12S32 = LLT::fixed_vector(12, 32);
606fe6060f1SDimitry Andric   const LLT V13S32 = LLT::fixed_vector(13, 32);
607fe6060f1SDimitry Andric   const LLT V14S32 = LLT::fixed_vector(14, 32);
608fe6060f1SDimitry Andric   const LLT V15S32 = LLT::fixed_vector(15, 32);
609fe6060f1SDimitry Andric   const LLT V16S32 = LLT::fixed_vector(16, 32);
610fe6060f1SDimitry Andric   const LLT V32S32 = LLT::fixed_vector(32, 32);
6110b57cec5SDimitry Andric 
612fe6060f1SDimitry Andric   const LLT V2S64 = LLT::fixed_vector(2, 64);
613fe6060f1SDimitry Andric   const LLT V3S64 = LLT::fixed_vector(3, 64);
614fe6060f1SDimitry Andric   const LLT V4S64 = LLT::fixed_vector(4, 64);
615fe6060f1SDimitry Andric   const LLT V5S64 = LLT::fixed_vector(5, 64);
616fe6060f1SDimitry Andric   const LLT V6S64 = LLT::fixed_vector(6, 64);
617fe6060f1SDimitry Andric   const LLT V7S64 = LLT::fixed_vector(7, 64);
618fe6060f1SDimitry Andric   const LLT V8S64 = LLT::fixed_vector(8, 64);
619fe6060f1SDimitry Andric   const LLT V16S64 = LLT::fixed_vector(16, 64);
6200b57cec5SDimitry Andric 
6210b57cec5SDimitry Andric   std::initializer_list<LLT> AllS32Vectors =
6220b57cec5SDimitry Andric     {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
6238bcb0991SDimitry Andric      V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
6240b57cec5SDimitry Andric   std::initializer_list<LLT> AllS64Vectors =
6258bcb0991SDimitry Andric     {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
6260b57cec5SDimitry Andric 
6270b57cec5SDimitry Andric   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
6280b57cec5SDimitry Andric   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
6298bcb0991SDimitry Andric   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
6300b57cec5SDimitry Andric   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
6318bcb0991SDimitry Andric   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
6320b57cec5SDimitry Andric   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
6330b57cec5SDimitry Andric   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
63406c3fb27SDimitry Andric   const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
63506c3fb27SDimitry Andric   const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
636*5f757f3fSDimitry Andric   const LLT BufferStridedPtr =
637*5f757f3fSDimitry Andric       GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);
6380b57cec5SDimitry Andric 
6390b57cec5SDimitry Andric   const LLT CodePtr = FlatPtr;
6400b57cec5SDimitry Andric 
6410b57cec5SDimitry Andric   const std::initializer_list<LLT> AddrSpaces64 = {
6420b57cec5SDimitry Andric     GlobalPtr, ConstantPtr, FlatPtr
6430b57cec5SDimitry Andric   };
6440b57cec5SDimitry Andric 
6450b57cec5SDimitry Andric   const std::initializer_list<LLT> AddrSpaces32 = {
6468bcb0991SDimitry Andric     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
6470b57cec5SDimitry Andric   };
6480b57cec5SDimitry Andric 
64906c3fb27SDimitry Andric   const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
65006c3fb27SDimitry Andric 
6510b57cec5SDimitry Andric   const std::initializer_list<LLT> FPTypesBase = {
6520b57cec5SDimitry Andric     S32, S64
6530b57cec5SDimitry Andric   };
6540b57cec5SDimitry Andric 
6550b57cec5SDimitry Andric   const std::initializer_list<LLT> FPTypes16 = {
6560b57cec5SDimitry Andric     S32, S64, S16
6570b57cec5SDimitry Andric   };
6580b57cec5SDimitry Andric 
6590b57cec5SDimitry Andric   const std::initializer_list<LLT> FPTypesPK16 = {
6600b57cec5SDimitry Andric     S32, S64, S16, V2S16
6610b57cec5SDimitry Andric   };
6620b57cec5SDimitry Andric 
6635ffd83dbSDimitry Andric   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
6645ffd83dbSDimitry Andric 
665fe6060f1SDimitry Andric   // s1 for VCC branches, s32 for SCC branches.
666fe6060f1SDimitry Andric   getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
6670b57cec5SDimitry Andric 
6680b57cec5SDimitry Andric   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
6690b57cec5SDimitry Andric   // elements for v3s16
6700b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_PHI)
671e8d8bef9SDimitry Andric       .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
6720b57cec5SDimitry Andric       .legalFor(AllS32Vectors)
6730b57cec5SDimitry Andric       .legalFor(AllS64Vectors)
6740b57cec5SDimitry Andric       .legalFor(AddrSpaces64)
6750b57cec5SDimitry Andric       .legalFor(AddrSpaces32)
67606c3fb27SDimitry Andric       .legalFor(AddrSpaces128)
677e8d8bef9SDimitry Andric       .legalIf(isPointer(0))
678e8d8bef9SDimitry Andric       .clampScalar(0, S16, S256)
6790b57cec5SDimitry Andric       .widenScalarToNextPow2(0, 32)
6800b57cec5SDimitry Andric       .clampMaxNumElements(0, S32, 16)
6810b57cec5SDimitry Andric       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
682e8d8bef9SDimitry Andric       .scalarize(0);
6830b57cec5SDimitry Andric 
684e8d8bef9SDimitry Andric   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
685e8d8bef9SDimitry Andric     // Full set of gfx9 features.
686*5f757f3fSDimitry Andric     if (ST.hasScalarAddSub64()) {
687*5f757f3fSDimitry Andric       getActionDefinitionsBuilder({G_ADD, G_SUB})
688*5f757f3fSDimitry Andric           .legalFor({S64, S32, S16, V2S16})
689*5f757f3fSDimitry Andric           .clampMaxNumElementsStrict(0, S16, 2)
690*5f757f3fSDimitry Andric           .scalarize(0)
691*5f757f3fSDimitry Andric           .minScalar(0, S16)
692*5f757f3fSDimitry Andric           .widenScalarToNextMultipleOf(0, 32)
693*5f757f3fSDimitry Andric           .maxScalar(0, S32);
694*5f757f3fSDimitry Andric     } else {
69581ad6265SDimitry Andric       getActionDefinitionsBuilder({G_ADD, G_SUB})
6965ffd83dbSDimitry Andric           .legalFor({S32, S16, V2S16})
6970eae32dcSDimitry Andric           .clampMaxNumElementsStrict(0, S16, 2)
69881ad6265SDimitry Andric           .scalarize(0)
69981ad6265SDimitry Andric           .minScalar(0, S16)
700349cc55cSDimitry Andric           .widenScalarToNextMultipleOf(0, 32)
70181ad6265SDimitry Andric           .maxScalar(0, S32);
702*5f757f3fSDimitry Andric     }
70381ad6265SDimitry Andric 
70481ad6265SDimitry Andric     getActionDefinitionsBuilder(G_MUL)
70581ad6265SDimitry Andric       .legalFor({S32, S16, V2S16})
70681ad6265SDimitry Andric       .clampMaxNumElementsStrict(0, S16, 2)
70781ad6265SDimitry Andric       .scalarize(0)
70881ad6265SDimitry Andric       .minScalar(0, S16)
70981ad6265SDimitry Andric       .widenScalarToNextMultipleOf(0, 32)
71081ad6265SDimitry Andric       .custom();
71181ad6265SDimitry Andric     assert(ST.hasMad64_32());
712e8d8bef9SDimitry Andric 
713e8d8bef9SDimitry Andric     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
714e8d8bef9SDimitry Andric       .legalFor({S32, S16, V2S16}) // Clamp modifier
715e8d8bef9SDimitry Andric       .minScalarOrElt(0, S16)
7160eae32dcSDimitry Andric       .clampMaxNumElementsStrict(0, S16, 2)
717e8d8bef9SDimitry Andric       .scalarize(0)
718e8d8bef9SDimitry Andric       .widenScalarToNextPow2(0, 32)
719e8d8bef9SDimitry Andric       .lower();
7205ffd83dbSDimitry Andric   } else if (ST.has16BitInsts()) {
72181ad6265SDimitry Andric     getActionDefinitionsBuilder({G_ADD, G_SUB})
7220b57cec5SDimitry Andric       .legalFor({S32, S16})
723349cc55cSDimitry Andric       .minScalar(0, S16)
724349cc55cSDimitry Andric       .widenScalarToNextMultipleOf(0, 32)
725349cc55cSDimitry Andric       .maxScalar(0, S32)
726349cc55cSDimitry Andric       .scalarize(0);
727e8d8bef9SDimitry Andric 
72881ad6265SDimitry Andric     getActionDefinitionsBuilder(G_MUL)
72981ad6265SDimitry Andric       .legalFor({S32, S16})
73081ad6265SDimitry Andric       .scalarize(0)
73181ad6265SDimitry Andric       .minScalar(0, S16)
73281ad6265SDimitry Andric       .widenScalarToNextMultipleOf(0, 32)
73381ad6265SDimitry Andric       .custom();
73481ad6265SDimitry Andric     assert(ST.hasMad64_32());
73581ad6265SDimitry Andric 
736e8d8bef9SDimitry Andric     // Technically the saturating operations require clamp bit support, but this
737e8d8bef9SDimitry Andric     // was introduced at the same time as 16-bit operations.
738e8d8bef9SDimitry Andric     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
739e8d8bef9SDimitry Andric       .legalFor({S32, S16}) // Clamp modifier
740e8d8bef9SDimitry Andric       .minScalar(0, S16)
741e8d8bef9SDimitry Andric       .scalarize(0)
742e8d8bef9SDimitry Andric       .widenScalarToNextPow2(0, 16)
743e8d8bef9SDimitry Andric       .lower();
744e8d8bef9SDimitry Andric 
745e8d8bef9SDimitry Andric     // We're just lowering this, but it helps get a better result to try to
746e8d8bef9SDimitry Andric     // coerce to the desired type first.
747e8d8bef9SDimitry Andric     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
748e8d8bef9SDimitry Andric       .minScalar(0, S16)
749e8d8bef9SDimitry Andric       .scalarize(0)
750e8d8bef9SDimitry Andric       .lower();
7510b57cec5SDimitry Andric   } else {
75281ad6265SDimitry Andric     getActionDefinitionsBuilder({G_ADD, G_SUB})
7530b57cec5SDimitry Andric       .legalFor({S32})
754349cc55cSDimitry Andric       .widenScalarToNextMultipleOf(0, 32)
7550b57cec5SDimitry Andric       .clampScalar(0, S32, S32)
7560b57cec5SDimitry Andric       .scalarize(0);
757e8d8bef9SDimitry Andric 
75881ad6265SDimitry Andric     auto &Mul = getActionDefinitionsBuilder(G_MUL)
75981ad6265SDimitry Andric       .legalFor({S32})
76081ad6265SDimitry Andric       .scalarize(0)
76181ad6265SDimitry Andric       .minScalar(0, S32)
76281ad6265SDimitry Andric       .widenScalarToNextMultipleOf(0, 32);
76381ad6265SDimitry Andric 
76481ad6265SDimitry Andric     if (ST.hasMad64_32())
76581ad6265SDimitry Andric       Mul.custom();
76681ad6265SDimitry Andric     else
76781ad6265SDimitry Andric       Mul.maxScalar(0, S32);
76881ad6265SDimitry Andric 
769e8d8bef9SDimitry Andric     if (ST.hasIntClamp()) {
770e8d8bef9SDimitry Andric       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
771e8d8bef9SDimitry Andric         .legalFor({S32}) // Clamp modifier.
772e8d8bef9SDimitry Andric         .scalarize(0)
773e8d8bef9SDimitry Andric         .minScalarOrElt(0, S32)
774e8d8bef9SDimitry Andric         .lower();
775e8d8bef9SDimitry Andric     } else {
776e8d8bef9SDimitry Andric       // Clamp bit support was added in VI, along with 16-bit operations.
777e8d8bef9SDimitry Andric       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
778e8d8bef9SDimitry Andric         .minScalar(0, S32)
779e8d8bef9SDimitry Andric         .scalarize(0)
780e8d8bef9SDimitry Andric         .lower();
7810b57cec5SDimitry Andric     }
7820b57cec5SDimitry Andric 
783e8d8bef9SDimitry Andric     // FIXME: DAG expansion gets better results. The widening uses the smaller
784e8d8bef9SDimitry Andric     // range values and goes for the min/max lowering directly.
785e8d8bef9SDimitry Andric     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
786e8d8bef9SDimitry Andric       .minScalar(0, S32)
787e8d8bef9SDimitry Andric       .scalarize(0)
788e8d8bef9SDimitry Andric       .lower();
789e8d8bef9SDimitry Andric   }
790e8d8bef9SDimitry Andric 
791fe6060f1SDimitry Andric   getActionDefinitionsBuilder(
792fe6060f1SDimitry Andric       {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
7935ffd83dbSDimitry Andric       .customFor({S32, S64})
794480093f4SDimitry Andric       .clampScalar(0, S32, S64)
795480093f4SDimitry Andric       .widenScalarToNextPow2(0, 32)
796480093f4SDimitry Andric       .scalarize(0);
797480093f4SDimitry Andric 
798e8d8bef9SDimitry Andric   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
7990b57cec5SDimitry Andric                    .legalFor({S32})
800349cc55cSDimitry Andric                    .maxScalar(0, S32);
801e8d8bef9SDimitry Andric 
802e8d8bef9SDimitry Andric   if (ST.hasVOP3PInsts()) {
803e8d8bef9SDimitry Andric     Mulh
804e8d8bef9SDimitry Andric       .clampMaxNumElements(0, S8, 2)
805e8d8bef9SDimitry Andric       .lowerFor({V2S8});
806e8d8bef9SDimitry Andric   }
807e8d8bef9SDimitry Andric 
808e8d8bef9SDimitry Andric   Mulh
809e8d8bef9SDimitry Andric     .scalarize(0)
810e8d8bef9SDimitry Andric     .lower();
8110b57cec5SDimitry Andric 
8120b57cec5SDimitry Andric   // Report legal for any types we can handle anywhere. For the cases only legal
8130b57cec5SDimitry Andric   // on the SALU, RegBankSelect will be able to re-legalize.
8140b57cec5SDimitry Andric   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
8150b57cec5SDimitry Andric     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
8160b57cec5SDimitry Andric     .clampScalar(0, S32, S64)
8170b57cec5SDimitry Andric     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
8188bcb0991SDimitry Andric     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
8190b57cec5SDimitry Andric     .widenScalarToNextPow2(0)
8200b57cec5SDimitry Andric     .scalarize(0);
8210b57cec5SDimitry Andric 
822bdd1243dSDimitry Andric   getActionDefinitionsBuilder(
823bdd1243dSDimitry Andric       {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
824480093f4SDimitry Andric       .legalFor({{S32, S1}, {S32, S32}})
825bdd1243dSDimitry Andric       .clampScalar(0, S32, S32)
826bdd1243dSDimitry Andric       .scalarize(0);
8270b57cec5SDimitry Andric 
8280b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_BITCAST)
8290b57cec5SDimitry Andric     // Don't worry about the size constraint.
8308bcb0991SDimitry Andric     .legalIf(all(isRegisterType(0), isRegisterType(1)))
8315ffd83dbSDimitry Andric     .lower();
8320b57cec5SDimitry Andric 
8330b57cec5SDimitry Andric 
8340b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_CONSTANT)
8358bcb0991SDimitry Andric     .legalFor({S1, S32, S64, S16, GlobalPtr,
8360b57cec5SDimitry Andric                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
837e8d8bef9SDimitry Andric     .legalIf(isPointer(0))
8380b57cec5SDimitry Andric     .clampScalar(0, S32, S64)
839e8d8bef9SDimitry Andric     .widenScalarToNextPow2(0);
8400b57cec5SDimitry Andric 
8415ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_FCONSTANT)
8425ffd83dbSDimitry Andric     .legalFor({S32, S64, S16})
8435ffd83dbSDimitry Andric     .clampScalar(0, S16, S64);
8448bcb0991SDimitry Andric 
8455ffd83dbSDimitry Andric   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
8465ffd83dbSDimitry Andric       .legalIf(isRegisterType(0))
8475ffd83dbSDimitry Andric       // s1 and s16 are special cases because they have legal operations on
8485ffd83dbSDimitry Andric       // them, but don't really occupy registers in the normal way.
8495ffd83dbSDimitry Andric       .legalFor({S1, S16})
8505ffd83dbSDimitry Andric       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
8515ffd83dbSDimitry Andric       .clampScalarOrElt(0, S32, MaxScalar)
8525ffd83dbSDimitry Andric       .widenScalarToNextPow2(0, 32)
8535ffd83dbSDimitry Andric       .clampMaxNumElements(0, S32, 16);
8545ffd83dbSDimitry Andric 
855fe6060f1SDimitry Andric   getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
8565ffd83dbSDimitry Andric 
8575ffd83dbSDimitry Andric   // If the amount is divergent, we have to do a wave reduction to get the
8585ffd83dbSDimitry Andric   // maximum value, so this is expanded during RegBankSelect.
8595ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
8605ffd83dbSDimitry Andric     .legalFor({{PrivatePtr, S32}});
8615ffd83dbSDimitry Andric 
862*5f757f3fSDimitry Andric   getActionDefinitionsBuilder(G_STACKSAVE)
863*5f757f3fSDimitry Andric     .customFor({PrivatePtr});
864*5f757f3fSDimitry Andric   getActionDefinitionsBuilder(G_STACKRESTORE)
865*5f757f3fSDimitry Andric     .legalFor({PrivatePtr});
866*5f757f3fSDimitry Andric 
8675ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
868e8d8bef9SDimitry Andric     .customIf(typeIsNot(0, PrivatePtr));
869e8d8bef9SDimitry Andric 
870fe6060f1SDimitry Andric   getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
8710b57cec5SDimitry Andric 
8720b57cec5SDimitry Andric   auto &FPOpActions = getActionDefinitionsBuilder(
873bdd1243dSDimitry Andric     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
874bdd1243dSDimitry Andric       G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
8750b57cec5SDimitry Andric     .legalFor({S32, S64});
8768bcb0991SDimitry Andric   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
8778bcb0991SDimitry Andric     .customFor({S32, S64});
8788bcb0991SDimitry Andric   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
8798bcb0991SDimitry Andric     .customFor({S32, S64});
8800b57cec5SDimitry Andric 
8810b57cec5SDimitry Andric   if (ST.has16BitInsts()) {
8820b57cec5SDimitry Andric     if (ST.hasVOP3PInsts())
8830b57cec5SDimitry Andric       FPOpActions.legalFor({S16, V2S16});
8840b57cec5SDimitry Andric     else
8850b57cec5SDimitry Andric       FPOpActions.legalFor({S16});
8868bcb0991SDimitry Andric 
8878bcb0991SDimitry Andric     TrigActions.customFor({S16});
8888bcb0991SDimitry Andric     FDIVActions.customFor({S16});
8890b57cec5SDimitry Andric   }
8900b57cec5SDimitry Andric 
891*5f757f3fSDimitry Andric   if (ST.hasPackedFP32Ops()) {
892*5f757f3fSDimitry Andric     FPOpActions.legalFor({V2S32});
893*5f757f3fSDimitry Andric     FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
894*5f757f3fSDimitry Andric   }
895*5f757f3fSDimitry Andric 
8960b57cec5SDimitry Andric   auto &MinNumMaxNum = getActionDefinitionsBuilder({
8970b57cec5SDimitry Andric       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
8980b57cec5SDimitry Andric 
8990b57cec5SDimitry Andric   if (ST.hasVOP3PInsts()) {
9000b57cec5SDimitry Andric     MinNumMaxNum.customFor(FPTypesPK16)
901480093f4SDimitry Andric       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
9020b57cec5SDimitry Andric       .clampMaxNumElements(0, S16, 2)
9030b57cec5SDimitry Andric       .clampScalar(0, S16, S64)
9040b57cec5SDimitry Andric       .scalarize(0);
9050b57cec5SDimitry Andric   } else if (ST.has16BitInsts()) {
9060b57cec5SDimitry Andric     MinNumMaxNum.customFor(FPTypes16)
9070b57cec5SDimitry Andric       .clampScalar(0, S16, S64)
9080b57cec5SDimitry Andric       .scalarize(0);
9090b57cec5SDimitry Andric   } else {
9100b57cec5SDimitry Andric     MinNumMaxNum.customFor(FPTypesBase)
9110b57cec5SDimitry Andric       .clampScalar(0, S32, S64)
9120b57cec5SDimitry Andric       .scalarize(0);
9130b57cec5SDimitry Andric   }
9140b57cec5SDimitry Andric 
9150b57cec5SDimitry Andric   if (ST.hasVOP3PInsts())
9160eae32dcSDimitry Andric     FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
9178bcb0991SDimitry Andric 
9180b57cec5SDimitry Andric   FPOpActions
9190b57cec5SDimitry Andric     .scalarize(0)
9200b57cec5SDimitry Andric     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
9210b57cec5SDimitry Andric 
9228bcb0991SDimitry Andric   TrigActions
9238bcb0991SDimitry Andric     .scalarize(0)
9248bcb0991SDimitry Andric     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
9258bcb0991SDimitry Andric 
9268bcb0991SDimitry Andric   FDIVActions
9278bcb0991SDimitry Andric     .scalarize(0)
9288bcb0991SDimitry Andric     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
9298bcb0991SDimitry Andric 
9308bcb0991SDimitry Andric   getActionDefinitionsBuilder({G_FNEG, G_FABS})
9318bcb0991SDimitry Andric     .legalFor(FPTypesPK16)
9320eae32dcSDimitry Andric     .clampMaxNumElementsStrict(0, S16, 2)
9338bcb0991SDimitry Andric     .scalarize(0)
9348bcb0991SDimitry Andric     .clampScalar(0, S16, S64);
9358bcb0991SDimitry Andric 
9360b57cec5SDimitry Andric   if (ST.has16BitInsts()) {
93706c3fb27SDimitry Andric     getActionDefinitionsBuilder(G_FSQRT)
938*5f757f3fSDimitry Andric       .legalFor({S16})
939*5f757f3fSDimitry Andric       .customFor({S32, S64})
94006c3fb27SDimitry Andric       .scalarize(0)
941*5f757f3fSDimitry Andric       .unsupported();
94206c3fb27SDimitry Andric     getActionDefinitionsBuilder(G_FFLOOR)
9430b57cec5SDimitry Andric       .legalFor({S32, S64, S16})
9440b57cec5SDimitry Andric       .scalarize(0)
9450b57cec5SDimitry Andric       .clampScalar(0, S16, S64);
94606c3fb27SDimitry Andric 
94706c3fb27SDimitry Andric     getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
94806c3fb27SDimitry Andric       .legalFor({{S32, S32}, {S64, S32}, {S16, S16}})
94906c3fb27SDimitry Andric       .scalarize(0)
95006c3fb27SDimitry Andric       .maxScalarIf(typeIs(0, S16), 1, S16)
95106c3fb27SDimitry Andric       .clampScalar(1, S32, S32)
95206c3fb27SDimitry Andric       .lower();
95306c3fb27SDimitry Andric 
95406c3fb27SDimitry Andric     getActionDefinitionsBuilder(G_FFREXP)
95506c3fb27SDimitry Andric       .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
95606c3fb27SDimitry Andric       .scalarize(0)
95706c3fb27SDimitry Andric       .lower();
9580b57cec5SDimitry Andric   } else {
9595ffd83dbSDimitry Andric     getActionDefinitionsBuilder(G_FSQRT)
960*5f757f3fSDimitry Andric       .customFor({S32, S64, S16})
9615ffd83dbSDimitry Andric       .scalarize(0)
962*5f757f3fSDimitry Andric       .unsupported();
963*5f757f3fSDimitry Andric 
9645ffd83dbSDimitry Andric 
9655ffd83dbSDimitry Andric     if (ST.hasFractBug()) {
9665ffd83dbSDimitry Andric       getActionDefinitionsBuilder(G_FFLOOR)
9675ffd83dbSDimitry Andric         .customFor({S64})
9685ffd83dbSDimitry Andric         .legalFor({S32, S64})
9695ffd83dbSDimitry Andric         .scalarize(0)
9705ffd83dbSDimitry Andric         .clampScalar(0, S32, S64);
9715ffd83dbSDimitry Andric     } else {
9725ffd83dbSDimitry Andric       getActionDefinitionsBuilder(G_FFLOOR)
9730b57cec5SDimitry Andric         .legalFor({S32, S64})
9740b57cec5SDimitry Andric         .scalarize(0)
9750b57cec5SDimitry Andric         .clampScalar(0, S32, S64);
9760b57cec5SDimitry Andric     }
97706c3fb27SDimitry Andric 
97806c3fb27SDimitry Andric     getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
97906c3fb27SDimitry Andric       .legalFor({{S32, S32}, {S64, S32}})
98006c3fb27SDimitry Andric       .scalarize(0)
98106c3fb27SDimitry Andric       .clampScalar(0, S32, S64)
98206c3fb27SDimitry Andric       .clampScalar(1, S32, S32)
98306c3fb27SDimitry Andric       .lower();
98406c3fb27SDimitry Andric 
98506c3fb27SDimitry Andric     getActionDefinitionsBuilder(G_FFREXP)
98606c3fb27SDimitry Andric       .customFor({{S32, S32}, {S64, S32}})
98706c3fb27SDimitry Andric       .scalarize(0)
98806c3fb27SDimitry Andric       .minScalar(0, S32)
98906c3fb27SDimitry Andric       .clampScalar(1, S32, S32)
99006c3fb27SDimitry Andric       .lower();
9915ffd83dbSDimitry Andric   }
9920b57cec5SDimitry Andric 
9930b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_FPTRUNC)
9940b57cec5SDimitry Andric     .legalFor({{S32, S64}, {S16, S32}})
9955ffd83dbSDimitry Andric     .scalarize(0)
9965ffd83dbSDimitry Andric     .lower();
9970b57cec5SDimitry Andric 
9980b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_FPEXT)
9990b57cec5SDimitry Andric     .legalFor({{S64, S32}, {S32, S16}})
1000e8d8bef9SDimitry Andric     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
10010b57cec5SDimitry Andric     .scalarize(0);
10020b57cec5SDimitry Andric 
1003bdd1243dSDimitry Andric   auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
100481ad6265SDimitry Andric   if (ST.has16BitInsts()) {
100581ad6265SDimitry Andric     FSubActions
100681ad6265SDimitry Andric       // Use actual fsub instruction
100781ad6265SDimitry Andric       .legalFor({S32, S16})
100881ad6265SDimitry Andric       // Must use fadd + fneg
100981ad6265SDimitry Andric       .lowerFor({S64, V2S16});
101081ad6265SDimitry Andric   } else {
101181ad6265SDimitry Andric     FSubActions
10120b57cec5SDimitry Andric       // Use actual fsub instruction
10130b57cec5SDimitry Andric       .legalFor({S32})
10140b57cec5SDimitry Andric       // Must use fadd + fneg
101581ad6265SDimitry Andric       .lowerFor({S64, S16, V2S16});
101681ad6265SDimitry Andric   }
101781ad6265SDimitry Andric 
101881ad6265SDimitry Andric   FSubActions
10190b57cec5SDimitry Andric     .scalarize(0)
10200b57cec5SDimitry Andric     .clampScalar(0, S32, S64);
10210b57cec5SDimitry Andric 
10228bcb0991SDimitry Andric   // Whether this is legal depends on the floating point mode for the function.
10238bcb0991SDimitry Andric   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
10245ffd83dbSDimitry Andric   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
10258bcb0991SDimitry Andric     FMad.customFor({S32, S16});
10265ffd83dbSDimitry Andric   else if (ST.hasMadMacF32Insts())
10278bcb0991SDimitry Andric     FMad.customFor({S32});
10285ffd83dbSDimitry Andric   else if (ST.hasMadF16())
10295ffd83dbSDimitry Andric     FMad.customFor({S16});
10308bcb0991SDimitry Andric   FMad.scalarize(0)
10318bcb0991SDimitry Andric       .lower();
10328bcb0991SDimitry Andric 
1033e8d8bef9SDimitry Andric   auto &FRem = getActionDefinitionsBuilder(G_FREM);
1034e8d8bef9SDimitry Andric   if (ST.has16BitInsts()) {
1035e8d8bef9SDimitry Andric     FRem.customFor({S16, S32, S64});
1036e8d8bef9SDimitry Andric   } else {
1037e8d8bef9SDimitry Andric     FRem.minScalar(0, S32)
1038e8d8bef9SDimitry Andric         .customFor({S32, S64});
1039e8d8bef9SDimitry Andric   }
1040e8d8bef9SDimitry Andric   FRem.scalarize(0);
1041e8d8bef9SDimitry Andric 
10425ffd83dbSDimitry Andric   // TODO: Do we need to clamp maximum bitwidth?
10435ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_TRUNC)
10445ffd83dbSDimitry Andric     .legalIf(isScalar(0))
10455ffd83dbSDimitry Andric     .legalFor({{V2S16, V2S32}})
10465ffd83dbSDimitry Andric     .clampMaxNumElements(0, S16, 2)
10475ffd83dbSDimitry Andric     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
10485ffd83dbSDimitry Andric     // situations (like an invalid implicit use), we don't want to infinite loop
10495ffd83dbSDimitry Andric     // in the legalizer.
10505ffd83dbSDimitry Andric     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
10515ffd83dbSDimitry Andric     .alwaysLegal();
10525ffd83dbSDimitry Andric 
10530b57cec5SDimitry Andric   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
10540b57cec5SDimitry Andric     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
10555ffd83dbSDimitry Andric                {S32, S1}, {S64, S1}, {S16, S1}})
1056480093f4SDimitry Andric     .scalarize(0)
10575ffd83dbSDimitry Andric     .clampScalar(0, S32, S64)
10585ffd83dbSDimitry Andric     .widenScalarToNextPow2(1, 32);
10590b57cec5SDimitry Andric 
10608bcb0991SDimitry Andric   // TODO: Split s1->s64 during regbankselect for VALU.
10618bcb0991SDimitry Andric   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
1062480093f4SDimitry Andric                     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
1063480093f4SDimitry Andric                     .lowerIf(typeIs(1, S1))
1064349cc55cSDimitry Andric                     .customFor({{S32, S64}, {S64, S64}});
10658bcb0991SDimitry Andric   if (ST.has16BitInsts())
10668bcb0991SDimitry Andric     IToFP.legalFor({{S16, S16}});
10678bcb0991SDimitry Andric   IToFP.clampScalar(1, S32, S64)
1068e8d8bef9SDimitry Andric        .minScalar(0, S32)
10695ffd83dbSDimitry Andric        .scalarize(0)
10705ffd83dbSDimitry Andric        .widenScalarToNextPow2(1);
10710b57cec5SDimitry Andric 
10728bcb0991SDimitry Andric   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
10735ffd83dbSDimitry Andric     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
1074fe6060f1SDimitry Andric     .customFor({{S64, S32}, {S64, S64}})
1075e8d8bef9SDimitry Andric     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
10768bcb0991SDimitry Andric   if (ST.has16BitInsts())
10778bcb0991SDimitry Andric     FPToI.legalFor({{S16, S16}});
10788bcb0991SDimitry Andric   else
10798bcb0991SDimitry Andric     FPToI.minScalar(1, S32);
10808bcb0991SDimitry Andric 
10818bcb0991SDimitry Andric   FPToI.minScalar(0, S32)
1082fe6060f1SDimitry Andric        .widenScalarToNextPow2(0, 32)
10835ffd83dbSDimitry Andric        .scalarize(0)
10845ffd83dbSDimitry Andric        .lower();
10850b57cec5SDimitry Andric 
108681ad6265SDimitry Andric   getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
108781ad6265SDimitry Andric       .customFor({S16, S32})
108881ad6265SDimitry Andric       .scalarize(0)
108981ad6265SDimitry Andric       .lower();
109081ad6265SDimitry Andric 
1091*5f757f3fSDimitry Andric   // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
1092*5f757f3fSDimitry Andric   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1093480093f4SDimitry Andric       .scalarize(0)
1094480093f4SDimitry Andric       .lower();
10950b57cec5SDimitry Andric 
1096480093f4SDimitry Andric   if (ST.has16BitInsts()) {
1097*5f757f3fSDimitry Andric     getActionDefinitionsBuilder(
1098*5f757f3fSDimitry Andric         {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1099480093f4SDimitry Andric         .legalFor({S16, S32, S64})
1100480093f4SDimitry Andric         .clampScalar(0, S16, S64)
1101480093f4SDimitry Andric         .scalarize(0);
1102480093f4SDimitry Andric   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
1103*5f757f3fSDimitry Andric     getActionDefinitionsBuilder(
1104*5f757f3fSDimitry Andric         {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
11050b57cec5SDimitry Andric         .legalFor({S32, S64})
11060b57cec5SDimitry Andric         .clampScalar(0, S32, S64)
11070b57cec5SDimitry Andric         .scalarize(0);
11080b57cec5SDimitry Andric   } else {
1109*5f757f3fSDimitry Andric     getActionDefinitionsBuilder(
1110*5f757f3fSDimitry Andric         {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
11110b57cec5SDimitry Andric         .legalFor({S32})
11120b57cec5SDimitry Andric         .customFor({S64})
11130b57cec5SDimitry Andric         .clampScalar(0, S32, S64)
11140b57cec5SDimitry Andric         .scalarize(0);
11150b57cec5SDimitry Andric   }
11160b57cec5SDimitry Andric 
1117480093f4SDimitry Andric   getActionDefinitionsBuilder(G_PTR_ADD)
1118*5f757f3fSDimitry Andric       .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr})
1119e8d8bef9SDimitry Andric       .legalIf(all(isPointer(0), sameSize(0, 1)))
1120e8d8bef9SDimitry Andric       .scalarize(0)
1121e8d8bef9SDimitry Andric       .scalarSameSizeAs(1, 0);
11220b57cec5SDimitry Andric 
11235ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_PTRMASK)
1124e8d8bef9SDimitry Andric     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
1125e8d8bef9SDimitry Andric     .scalarSameSizeAs(1, 0)
11265ffd83dbSDimitry Andric     .scalarize(0);
11270b57cec5SDimitry Andric 
11280b57cec5SDimitry Andric   auto &CmpBuilder =
11290b57cec5SDimitry Andric     getActionDefinitionsBuilder(G_ICMP)
1130480093f4SDimitry Andric     // The compare output type differs based on the register bank of the output,
1131480093f4SDimitry Andric     // so make both s1 and s32 legal.
1132480093f4SDimitry Andric     //
1133480093f4SDimitry Andric     // Scalar compares producing output in scc will be promoted to s32, as that
1134480093f4SDimitry Andric     // is the allocatable register type that will be needed for the copy from
1135480093f4SDimitry Andric     // scc. This will be promoted during RegBankSelect, and we assume something
1136480093f4SDimitry Andric     // before that won't try to use s32 result types.
1137480093f4SDimitry Andric     //
1138480093f4SDimitry Andric     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
1139480093f4SDimitry Andric     // bank.
11400b57cec5SDimitry Andric     .legalForCartesianProduct(
11410b57cec5SDimitry Andric       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1142480093f4SDimitry Andric     .legalForCartesianProduct(
1143480093f4SDimitry Andric       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
11440b57cec5SDimitry Andric   if (ST.has16BitInsts()) {
11450b57cec5SDimitry Andric     CmpBuilder.legalFor({{S1, S16}});
11460b57cec5SDimitry Andric   }
11470b57cec5SDimitry Andric 
11480b57cec5SDimitry Andric   CmpBuilder
11490b57cec5SDimitry Andric     .widenScalarToNextPow2(1)
11500b57cec5SDimitry Andric     .clampScalar(1, S32, S64)
11510b57cec5SDimitry Andric     .scalarize(0)
1152480093f4SDimitry Andric     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
11530b57cec5SDimitry Andric 
1154*5f757f3fSDimitry Andric   auto &FCmpBuilder =
1155*5f757f3fSDimitry Andric       getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct(
1156*5f757f3fSDimitry Andric           {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1157*5f757f3fSDimitry Andric 
1158*5f757f3fSDimitry Andric   if (ST.hasSALUFloatInsts())
1159*5f757f3fSDimitry Andric     FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});
1160*5f757f3fSDimitry Andric 
1161*5f757f3fSDimitry Andric   FCmpBuilder
11620b57cec5SDimitry Andric     .widenScalarToNextPow2(1)
11630b57cec5SDimitry Andric     .clampScalar(1, S32, S64)
11640b57cec5SDimitry Andric     .scalarize(0);
11650b57cec5SDimitry Andric 
11665ffd83dbSDimitry Andric   // FIXME: fpow has a selection pattern that should move to custom lowering.
116706c3fb27SDimitry Andric   auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
11685ffd83dbSDimitry Andric   if (ST.has16BitInsts())
11695ffd83dbSDimitry Andric     ExpOps.customFor({{S32}, {S16}});
11705ffd83dbSDimitry Andric   else
11715ffd83dbSDimitry Andric     ExpOps.customFor({S32});
11725ffd83dbSDimitry Andric   ExpOps.clampScalar(0, MinScalarFPTy, S32)
11730b57cec5SDimitry Andric         .scalarize(0);
11740b57cec5SDimitry Andric 
1175e8d8bef9SDimitry Andric   getActionDefinitionsBuilder(G_FPOWI)
1176e8d8bef9SDimitry Andric     .clampScalar(0, MinScalarFPTy, S32)
1177e8d8bef9SDimitry Andric     .lower();
1178e8d8bef9SDimitry Andric 
117906c3fb27SDimitry Andric   auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2});
118006c3fb27SDimitry Andric   Log2Ops.customFor({S32});
118106c3fb27SDimitry Andric   if (ST.has16BitInsts())
118206c3fb27SDimitry Andric     Log2Ops.legalFor({S16});
118306c3fb27SDimitry Andric   else
118406c3fb27SDimitry Andric     Log2Ops.customFor({S16});
118506c3fb27SDimitry Andric   Log2Ops.scalarize(0)
118606c3fb27SDimitry Andric     .lower();
118706c3fb27SDimitry Andric 
1188*5f757f3fSDimitry Andric   auto &LogOps =
1189*5f757f3fSDimitry Andric       getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
119006c3fb27SDimitry Andric   LogOps.customFor({S32, S16});
119106c3fb27SDimitry Andric   LogOps.clampScalar(0, MinScalarFPTy, S32)
119206c3fb27SDimitry Andric         .scalarize(0);
119306c3fb27SDimitry Andric 
11940b57cec5SDimitry Andric   // The 64-bit versions produce 32-bit results, but only on the SALU.
11955ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_CTPOP)
11960b57cec5SDimitry Andric     .legalFor({{S32, S32}, {S32, S64}})
11970b57cec5SDimitry Andric     .clampScalar(0, S32, S32)
119804eeddc0SDimitry Andric     .widenScalarToNextPow2(1, 32)
11990b57cec5SDimitry Andric     .clampScalar(1, S32, S64)
12000b57cec5SDimitry Andric     .scalarize(0)
120104eeddc0SDimitry Andric     .widenScalarToNextPow2(0, 32);
120204eeddc0SDimitry Andric 
1203bdd1243dSDimitry Andric   // If no 16 bit instr is available, lower into different instructions.
1204bdd1243dSDimitry Andric   if (ST.has16BitInsts())
1205bdd1243dSDimitry Andric     getActionDefinitionsBuilder(G_IS_FPCLASS)
1206bdd1243dSDimitry Andric         .legalForCartesianProduct({S1}, FPTypes16)
1207bdd1243dSDimitry Andric         .widenScalarToNextPow2(1)
1208bdd1243dSDimitry Andric         .scalarize(0)
1209bdd1243dSDimitry Andric         .lower();
1210bdd1243dSDimitry Andric   else
1211bdd1243dSDimitry Andric     getActionDefinitionsBuilder(G_IS_FPCLASS)
1212bdd1243dSDimitry Andric         .legalForCartesianProduct({S1}, FPTypesBase)
1213bdd1243dSDimitry Andric         .lowerFor({S1, S16})
1214bdd1243dSDimitry Andric         .widenScalarToNextPow2(1)
1215bdd1243dSDimitry Andric         .scalarize(0)
1216bdd1243dSDimitry Andric         .lower();
12170b57cec5SDimitry Andric 
12185ffd83dbSDimitry Andric   // The hardware instructions return a different result on 0 than the generic
12195ffd83dbSDimitry Andric   // instructions expect. The hardware produces -1, but these produce the
12205ffd83dbSDimitry Andric   // bitwidth.
12215ffd83dbSDimitry Andric   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
12225ffd83dbSDimitry Andric     .scalarize(0)
12235ffd83dbSDimitry Andric     .clampScalar(0, S32, S32)
12245ffd83dbSDimitry Andric     .clampScalar(1, S32, S64)
12255ffd83dbSDimitry Andric     .widenScalarToNextPow2(0, 32)
12265ffd83dbSDimitry Andric     .widenScalarToNextPow2(1, 32)
1227349cc55cSDimitry Andric     .custom();
12285ffd83dbSDimitry Andric 
12295ffd83dbSDimitry Andric   // The 64-bit versions produce 32-bit results, but only on the SALU.
12305ffd83dbSDimitry Andric   getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
12315ffd83dbSDimitry Andric     .legalFor({{S32, S32}, {S32, S64}})
12325ffd83dbSDimitry Andric     .clampScalar(0, S32, S32)
12335ffd83dbSDimitry Andric     .clampScalar(1, S32, S64)
12345ffd83dbSDimitry Andric     .scalarize(0)
12355ffd83dbSDimitry Andric     .widenScalarToNextPow2(0, 32)
12365ffd83dbSDimitry Andric     .widenScalarToNextPow2(1, 32);
12375ffd83dbSDimitry Andric 
1238fe6060f1SDimitry Andric   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1239fe6060f1SDimitry Andric   // RegBankSelect.
12405ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_BITREVERSE)
1241fe6060f1SDimitry Andric     .legalFor({S32, S64})
1242fe6060f1SDimitry Andric     .clampScalar(0, S32, S64)
1243fe6060f1SDimitry Andric     .scalarize(0)
1244fe6060f1SDimitry Andric     .widenScalarToNextPow2(0);
12450b57cec5SDimitry Andric 
12460b57cec5SDimitry Andric   if (ST.has16BitInsts()) {
12475ffd83dbSDimitry Andric     getActionDefinitionsBuilder(G_BSWAP)
12485ffd83dbSDimitry Andric       .legalFor({S16, S32, V2S16})
12490eae32dcSDimitry Andric       .clampMaxNumElementsStrict(0, S16, 2)
12505ffd83dbSDimitry Andric       // FIXME: Fixing non-power-of-2 before clamp is workaround for
12515ffd83dbSDimitry Andric       // narrowScalar limitation.
12525ffd83dbSDimitry Andric       .widenScalarToNextPow2(0)
12535ffd83dbSDimitry Andric       .clampScalar(0, S16, S32)
12545ffd83dbSDimitry Andric       .scalarize(0);
12555ffd83dbSDimitry Andric 
12560b57cec5SDimitry Andric     if (ST.hasVOP3PInsts()) {
1257fe6060f1SDimitry Andric       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
12580b57cec5SDimitry Andric         .legalFor({S32, S16, V2S16})
12590b57cec5SDimitry Andric         .clampMaxNumElements(0, S16, 2)
12605ffd83dbSDimitry Andric         .minScalar(0, S16)
12610b57cec5SDimitry Andric         .widenScalarToNextPow2(0)
12625ffd83dbSDimitry Andric         .scalarize(0)
12635ffd83dbSDimitry Andric         .lower();
12640b57cec5SDimitry Andric     } else {
1265fe6060f1SDimitry Andric       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
12660b57cec5SDimitry Andric         .legalFor({S32, S16})
12670b57cec5SDimitry Andric         .widenScalarToNextPow2(0)
12685ffd83dbSDimitry Andric         .minScalar(0, S16)
12695ffd83dbSDimitry Andric         .scalarize(0)
12705ffd83dbSDimitry Andric         .lower();
12710b57cec5SDimitry Andric     }
12720b57cec5SDimitry Andric   } else {
12735ffd83dbSDimitry Andric     // TODO: Should have same legality without v_perm_b32
12745ffd83dbSDimitry Andric     getActionDefinitionsBuilder(G_BSWAP)
12755ffd83dbSDimitry Andric       .legalFor({S32})
12765ffd83dbSDimitry Andric       .lowerIf(scalarNarrowerThan(0, 32))
12775ffd83dbSDimitry Andric       // FIXME: Fixing non-power-of-2 before clamp is workaround for
12785ffd83dbSDimitry Andric       // narrowScalar limitation.
12795ffd83dbSDimitry Andric       .widenScalarToNextPow2(0)
12805ffd83dbSDimitry Andric       .maxScalar(0, S32)
12815ffd83dbSDimitry Andric       .scalarize(0)
12825ffd83dbSDimitry Andric       .lower();
12835ffd83dbSDimitry Andric 
1284fe6060f1SDimitry Andric     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
12850b57cec5SDimitry Andric       .legalFor({S32})
12865ffd83dbSDimitry Andric       .minScalar(0, S32)
12870b57cec5SDimitry Andric       .widenScalarToNextPow2(0)
12885ffd83dbSDimitry Andric       .scalarize(0)
12895ffd83dbSDimitry Andric       .lower();
12900b57cec5SDimitry Andric   }
12910b57cec5SDimitry Andric 
12920b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_INTTOPTR)
12930b57cec5SDimitry Andric       // List the common cases
12940b57cec5SDimitry Andric       .legalForCartesianProduct(AddrSpaces64, {S64})
12950b57cec5SDimitry Andric       .legalForCartesianProduct(AddrSpaces32, {S32})
12960b57cec5SDimitry Andric       .scalarize(0)
12970b57cec5SDimitry Andric       // Accept any address space as long as the size matches
12980b57cec5SDimitry Andric       .legalIf(sameSize(0, 1))
12990b57cec5SDimitry Andric       .widenScalarIf(smallerThan(1, 0),
13000b57cec5SDimitry Andric                      [](const LegalityQuery &Query) {
1301bdd1243dSDimitry Andric                        return std::pair(
1302bdd1243dSDimitry Andric                            1, LLT::scalar(Query.Types[0].getSizeInBits()));
13030b57cec5SDimitry Andric                      })
1304bdd1243dSDimitry Andric       .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) {
1305bdd1243dSDimitry Andric         return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
13060b57cec5SDimitry Andric       });
13070b57cec5SDimitry Andric 
13080b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_PTRTOINT)
13090b57cec5SDimitry Andric       // List the common cases
13100b57cec5SDimitry Andric       .legalForCartesianProduct(AddrSpaces64, {S64})
13110b57cec5SDimitry Andric       .legalForCartesianProduct(AddrSpaces32, {S32})
13120b57cec5SDimitry Andric       .scalarize(0)
13130b57cec5SDimitry Andric       // Accept any address space as long as the size matches
13140b57cec5SDimitry Andric       .legalIf(sameSize(0, 1))
13150b57cec5SDimitry Andric       .widenScalarIf(smallerThan(0, 1),
13160b57cec5SDimitry Andric                      [](const LegalityQuery &Query) {
1317bdd1243dSDimitry Andric                        return std::pair(
1318bdd1243dSDimitry Andric                            0, LLT::scalar(Query.Types[1].getSizeInBits()));
13190b57cec5SDimitry Andric                      })
1320bdd1243dSDimitry Andric       .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
1321bdd1243dSDimitry Andric         return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
13220b57cec5SDimitry Andric       });
13230b57cec5SDimitry Andric 
13240b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
13250b57cec5SDimitry Andric     .scalarize(0)
13260b57cec5SDimitry Andric     .custom();
13270b57cec5SDimitry Andric 
13285ffd83dbSDimitry Andric   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
13295ffd83dbSDimitry Andric                                     bool IsLoad) -> bool {
13308bcb0991SDimitry Andric     const LLT DstTy = Query.Types[0];
13318bcb0991SDimitry Andric 
13328bcb0991SDimitry Andric     // Split vector extloads.
1333fe6060f1SDimitry Andric     unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1334480093f4SDimitry Andric 
13358bcb0991SDimitry Andric     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
13368bcb0991SDimitry Andric       return true;
13378bcb0991SDimitry Andric 
13388bcb0991SDimitry Andric     const LLT PtrTy = Query.Types[1];
13398bcb0991SDimitry Andric     unsigned AS = PtrTy.getAddressSpace();
134006c3fb27SDimitry Andric     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
134106c3fb27SDimitry Andric                                       Query.MMODescrs[0].Ordering !=
134206c3fb27SDimitry Andric                                           AtomicOrdering::NotAtomic))
13438bcb0991SDimitry Andric       return true;
13448bcb0991SDimitry Andric 
13458bcb0991SDimitry Andric     // Catch weird sized loads that don't evenly divide into the access sizes
13468bcb0991SDimitry Andric     // TODO: May be able to widen depending on alignment etc.
13475ffd83dbSDimitry Andric     unsigned NumRegs = (MemSize + 31) / 32;
13485ffd83dbSDimitry Andric     if (NumRegs == 3) {
13495ffd83dbSDimitry Andric       if (!ST.hasDwordx3LoadStores())
13508bcb0991SDimitry Andric         return true;
13515ffd83dbSDimitry Andric     } else {
13525ffd83dbSDimitry Andric       // If the alignment allows, these should have been widened.
13535ffd83dbSDimitry Andric       if (!isPowerOf2_32(NumRegs))
13545ffd83dbSDimitry Andric         return true;
13555ffd83dbSDimitry Andric     }
13568bcb0991SDimitry Andric 
13578bcb0991SDimitry Andric     return false;
13588bcb0991SDimitry Andric   };
13598bcb0991SDimitry Andric 
1360e8d8bef9SDimitry Andric   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1361e8d8bef9SDimitry Andric   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1362e8d8bef9SDimitry Andric   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
13638bcb0991SDimitry Andric 
13648bcb0991SDimitry Andric   // TODO: Refine based on subtargets which support unaligned access or 128-bit
13658bcb0991SDimitry Andric   // LDS
13668bcb0991SDimitry Andric   // TODO: Unsupported flat for SI.
13678bcb0991SDimitry Andric 
13688bcb0991SDimitry Andric   for (unsigned Op : {G_LOAD, G_STORE}) {
13698bcb0991SDimitry Andric     const bool IsStore = Op == G_STORE;
13708bcb0991SDimitry Andric 
13718bcb0991SDimitry Andric     auto &Actions = getActionDefinitionsBuilder(Op);
13725ffd83dbSDimitry Andric     // Explicitly list some common cases.
13735ffd83dbSDimitry Andric     // TODO: Does this help compile time at all?
1374fe6060f1SDimitry Andric     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1375fe6060f1SDimitry Andric                                       {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1376fe6060f1SDimitry Andric                                       {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1377fe6060f1SDimitry Andric                                       {S64, GlobalPtr, S64, GlobalAlign32},
1378fe6060f1SDimitry Andric                                       {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1379fe6060f1SDimitry Andric                                       {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1380fe6060f1SDimitry Andric                                       {S32, GlobalPtr, S8, GlobalAlign8},
1381fe6060f1SDimitry Andric                                       {S32, GlobalPtr, S16, GlobalAlign16},
13828bcb0991SDimitry Andric 
1383fe6060f1SDimitry Andric                                       {S32, LocalPtr, S32, 32},
1384fe6060f1SDimitry Andric                                       {S64, LocalPtr, S64, 32},
1385fe6060f1SDimitry Andric                                       {V2S32, LocalPtr, V2S32, 32},
1386fe6060f1SDimitry Andric                                       {S32, LocalPtr, S8, 8},
1387fe6060f1SDimitry Andric                                       {S32, LocalPtr, S16, 16},
1388fe6060f1SDimitry Andric                                       {V2S16, LocalPtr, S32, 32},
13898bcb0991SDimitry Andric 
1390fe6060f1SDimitry Andric                                       {S32, PrivatePtr, S32, 32},
1391fe6060f1SDimitry Andric                                       {S32, PrivatePtr, S8, 8},
1392fe6060f1SDimitry Andric                                       {S32, PrivatePtr, S16, 16},
1393fe6060f1SDimitry Andric                                       {V2S16, PrivatePtr, S32, 32},
13948bcb0991SDimitry Andric 
1395fe6060f1SDimitry Andric                                       {S32, ConstantPtr, S32, GlobalAlign32},
1396fe6060f1SDimitry Andric                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1397fe6060f1SDimitry Andric                                       {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1398fe6060f1SDimitry Andric                                       {S64, ConstantPtr, S64, GlobalAlign32},
1399fe6060f1SDimitry Andric                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
14005ffd83dbSDimitry Andric     Actions.legalIf(
14015ffd83dbSDimitry Andric       [=](const LegalityQuery &Query) -> bool {
1402fe6060f1SDimitry Andric         return isLoadStoreLegal(ST, Query);
14035ffd83dbSDimitry Andric       });
14045ffd83dbSDimitry Andric 
140506c3fb27SDimitry Andric     // The custom pointers (fat pointers, buffer resources) don't work with load
140606c3fb27SDimitry Andric     // and store at this level. Fat pointers should have been lowered to
140706c3fb27SDimitry Andric     // intrinsics before the translation to MIR.
1408*5f757f3fSDimitry Andric     Actions.unsupportedIf(
1409*5f757f3fSDimitry Andric         typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
141006c3fb27SDimitry Andric 
141106c3fb27SDimitry Andric     // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
141206c3fb27SDimitry Andric     // ptrtoint. This is needed to account for the fact that we can't have i128
141306c3fb27SDimitry Andric     // as a register class for SelectionDAG reasons.
141406c3fb27SDimitry Andric     Actions.customIf([=](const LegalityQuery &Query) -> bool {
141506c3fb27SDimitry Andric       return hasBufferRsrcWorkaround(Query.Types[0]);
141606c3fb27SDimitry Andric     });
141706c3fb27SDimitry Andric 
14185ffd83dbSDimitry Andric     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
14195ffd83dbSDimitry Andric     // 64-bits.
14205ffd83dbSDimitry Andric     //
14215ffd83dbSDimitry Andric     // TODO: Should generalize bitcast action into coerce, which will also cover
14225ffd83dbSDimitry Andric     // inserting addrspacecasts.
14235ffd83dbSDimitry Andric     Actions.customIf(typeIs(1, Constant32Ptr));
14245ffd83dbSDimitry Andric 
14255ffd83dbSDimitry Andric     // Turn any illegal element vectors into something easier to deal
14265ffd83dbSDimitry Andric     // with. These will ultimately produce 32-bit scalar shifts to extract the
14275ffd83dbSDimitry Andric     // parts anyway.
14285ffd83dbSDimitry Andric     //
14295ffd83dbSDimitry Andric     // For odd 16-bit element vectors, prefer to split those into pieces with
14305ffd83dbSDimitry Andric     // 16-bit vector parts.
14315ffd83dbSDimitry Andric     Actions.bitcastIf(
14325ffd83dbSDimitry Andric       [=](const LegalityQuery &Query) -> bool {
1433e8d8bef9SDimitry Andric         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1434fe6060f1SDimitry Andric                                           Query.MMODescrs[0].MemoryTy);
14355ffd83dbSDimitry Andric       }, bitcastToRegisterType(0));
14365ffd83dbSDimitry Andric 
1437e8d8bef9SDimitry Andric     if (!IsStore) {
1438e8d8bef9SDimitry Andric       // Widen suitably aligned loads by loading extra bytes. The standard
1439e8d8bef9SDimitry Andric       // legalization actions can't properly express widening memory operands.
1440e8d8bef9SDimitry Andric       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1441e8d8bef9SDimitry Andric         return shouldWidenLoad(ST, Query, G_LOAD);
1442e8d8bef9SDimitry Andric       });
1443e8d8bef9SDimitry Andric     }
1444e8d8bef9SDimitry Andric 
1445e8d8bef9SDimitry Andric     // FIXME: load/store narrowing should be moved to lower action
14468bcb0991SDimitry Andric     Actions
14478bcb0991SDimitry Andric         .narrowScalarIf(
14488bcb0991SDimitry Andric             [=](const LegalityQuery &Query) -> bool {
14495ffd83dbSDimitry Andric               return !Query.Types[0].isVector() &&
14505ffd83dbSDimitry Andric                      needToSplitMemOp(Query, Op == G_LOAD);
14518bcb0991SDimitry Andric             },
14528bcb0991SDimitry Andric             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
14538bcb0991SDimitry Andric               const LLT DstTy = Query.Types[0];
14548bcb0991SDimitry Andric               const LLT PtrTy = Query.Types[1];
14558bcb0991SDimitry Andric 
14568bcb0991SDimitry Andric               const unsigned DstSize = DstTy.getSizeInBits();
1457fe6060f1SDimitry Andric               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
14588bcb0991SDimitry Andric 
14598bcb0991SDimitry Andric               // Split extloads.
14608bcb0991SDimitry Andric               if (DstSize > MemSize)
1461bdd1243dSDimitry Andric                 return std::pair(0, LLT::scalar(MemSize));
14628bcb0991SDimitry Andric 
146306c3fb27SDimitry Andric               unsigned MaxSize = maxSizeForAddrSpace(
146406c3fb27SDimitry Andric                   ST, PtrTy.getAddressSpace(), Op == G_LOAD,
146506c3fb27SDimitry Andric                   Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
14668bcb0991SDimitry Andric               if (MemSize > MaxSize)
1467bdd1243dSDimitry Andric                 return std::pair(0, LLT::scalar(MaxSize));
14688bcb0991SDimitry Andric 
146904eeddc0SDimitry Andric               uint64_t Align = Query.MMODescrs[0].AlignInBits;
1470bdd1243dSDimitry Andric               return std::pair(0, LLT::scalar(Align));
14718bcb0991SDimitry Andric             })
14728bcb0991SDimitry Andric         .fewerElementsIf(
14738bcb0991SDimitry Andric             [=](const LegalityQuery &Query) -> bool {
14745ffd83dbSDimitry Andric               return Query.Types[0].isVector() &&
14755ffd83dbSDimitry Andric                      needToSplitMemOp(Query, Op == G_LOAD);
14768bcb0991SDimitry Andric             },
14778bcb0991SDimitry Andric             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
14788bcb0991SDimitry Andric               const LLT DstTy = Query.Types[0];
14798bcb0991SDimitry Andric               const LLT PtrTy = Query.Types[1];
14808bcb0991SDimitry Andric 
14818bcb0991SDimitry Andric               LLT EltTy = DstTy.getElementType();
148206c3fb27SDimitry Andric               unsigned MaxSize = maxSizeForAddrSpace(
148306c3fb27SDimitry Andric                   ST, PtrTy.getAddressSpace(), Op == G_LOAD,
148406c3fb27SDimitry Andric                   Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
14855ffd83dbSDimitry Andric 
14865ffd83dbSDimitry Andric               // FIXME: Handle widened to power of 2 results better. This ends
14875ffd83dbSDimitry Andric               // up scalarizing.
14885ffd83dbSDimitry Andric               // FIXME: 3 element stores scalarized on SI
14898bcb0991SDimitry Andric 
14908bcb0991SDimitry Andric               // Split if it's too large for the address space.
1491fe6060f1SDimitry Andric               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1492fe6060f1SDimitry Andric               if (MemSize > MaxSize) {
14938bcb0991SDimitry Andric                 unsigned NumElts = DstTy.getNumElements();
14945ffd83dbSDimitry Andric                 unsigned EltSize = EltTy.getSizeInBits();
14955ffd83dbSDimitry Andric 
14965ffd83dbSDimitry Andric                 if (MaxSize % EltSize == 0) {
1497bdd1243dSDimitry Andric                   return std::pair(
1498fe6060f1SDimitry Andric                       0, LLT::scalarOrVector(
1499fe6060f1SDimitry Andric                              ElementCount::getFixed(MaxSize / EltSize), EltTy));
15005ffd83dbSDimitry Andric                 }
15015ffd83dbSDimitry Andric 
1502fe6060f1SDimitry Andric                 unsigned NumPieces = MemSize / MaxSize;
15038bcb0991SDimitry Andric 
15048bcb0991SDimitry Andric                 // FIXME: Refine when odd breakdowns handled
15058bcb0991SDimitry Andric                 // The scalars will need to be re-legalized.
15068bcb0991SDimitry Andric                 if (NumPieces == 1 || NumPieces >= NumElts ||
15078bcb0991SDimitry Andric                     NumElts % NumPieces != 0)
1508bdd1243dSDimitry Andric                   return std::pair(0, EltTy);
15098bcb0991SDimitry Andric 
1510bdd1243dSDimitry Andric                 return std::pair(0,
1511bdd1243dSDimitry Andric                                  LLT::fixed_vector(NumElts / NumPieces, EltTy));
15128bcb0991SDimitry Andric               }
15138bcb0991SDimitry Andric 
15145ffd83dbSDimitry Andric               // FIXME: We could probably handle weird extending loads better.
15155ffd83dbSDimitry Andric               if (DstTy.getSizeInBits() > MemSize)
1516bdd1243dSDimitry Andric                 return std::pair(0, EltTy);
15175ffd83dbSDimitry Andric 
15185ffd83dbSDimitry Andric               unsigned EltSize = EltTy.getSizeInBits();
15195ffd83dbSDimitry Andric               unsigned DstSize = DstTy.getSizeInBits();
15205ffd83dbSDimitry Andric               if (!isPowerOf2_32(DstSize)) {
15215ffd83dbSDimitry Andric                 // We're probably decomposing an odd sized store. Try to split
15225ffd83dbSDimitry Andric                 // to the widest type. TODO: Account for alignment. As-is it
15235ffd83dbSDimitry Andric                 // should be OK, since the new parts will be further legalized.
152406c3fb27SDimitry Andric                 unsigned FloorSize = llvm::bit_floor(DstSize);
1525bdd1243dSDimitry Andric                 return std::pair(
1526fe6060f1SDimitry Andric                     0, LLT::scalarOrVector(
1527fe6060f1SDimitry Andric                            ElementCount::getFixed(FloorSize / EltSize), EltTy));
15285ffd83dbSDimitry Andric               }
15295ffd83dbSDimitry Andric 
15308bcb0991SDimitry Andric               // May need relegalization for the scalars.
1531bdd1243dSDimitry Andric               return std::pair(0, EltTy);
15328bcb0991SDimitry Andric             })
1533fe6060f1SDimitry Andric     .minScalar(0, S32)
1534fe6060f1SDimitry Andric     .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
15358bcb0991SDimitry Andric     .widenScalarToNextPow2(0)
1536e8d8bef9SDimitry Andric     .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1537e8d8bef9SDimitry Andric     .lower();
15388bcb0991SDimitry Andric   }
15390b57cec5SDimitry Andric 
1540fe6060f1SDimitry Andric   // FIXME: Unaligned accesses not lowered.
15410b57cec5SDimitry Andric   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1542fe6060f1SDimitry Andric                        .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1543fe6060f1SDimitry Andric                                                   {S32, GlobalPtr, S16, 2 * 8},
1544fe6060f1SDimitry Andric                                                   {S32, LocalPtr, S8, 8},
1545fe6060f1SDimitry Andric                                                   {S32, LocalPtr, S16, 16},
1546fe6060f1SDimitry Andric                                                   {S32, PrivatePtr, S8, 8},
1547fe6060f1SDimitry Andric                                                   {S32, PrivatePtr, S16, 16},
1548fe6060f1SDimitry Andric                                                   {S32, ConstantPtr, S8, 8},
1549fe6060f1SDimitry Andric                                                   {S32, ConstantPtr, S16, 2 * 8}})
1550fe6060f1SDimitry Andric                        .legalIf(
1551fe6060f1SDimitry Andric                          [=](const LegalityQuery &Query) -> bool {
1552fe6060f1SDimitry Andric                            return isLoadStoreLegal(ST, Query);
1553fe6060f1SDimitry Andric                          });
1554fe6060f1SDimitry Andric 
15550b57cec5SDimitry Andric   if (ST.hasFlatAddressSpace()) {
15568bcb0991SDimitry Andric     ExtLoads.legalForTypesWithMemDesc(
1557fe6060f1SDimitry Andric         {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
15580b57cec5SDimitry Andric   }
15590b57cec5SDimitry Andric 
1560fe6060f1SDimitry Andric   // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1561fe6060f1SDimitry Andric   // 64-bits.
1562fe6060f1SDimitry Andric   //
1563fe6060f1SDimitry Andric   // TODO: Should generalize bitcast action into coerce, which will also cover
1564fe6060f1SDimitry Andric   // inserting addrspacecasts.
1565fe6060f1SDimitry Andric   ExtLoads.customIf(typeIs(1, Constant32Ptr));
1566fe6060f1SDimitry Andric 
15670b57cec5SDimitry Andric   ExtLoads.clampScalar(0, S32, S32)
15680b57cec5SDimitry Andric           .widenScalarToNextPow2(0)
15690b57cec5SDimitry Andric           .lower();
15700b57cec5SDimitry Andric 
15710b57cec5SDimitry Andric   auto &Atomics = getActionDefinitionsBuilder(
15720b57cec5SDimitry Andric     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
15730b57cec5SDimitry Andric      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
15740b57cec5SDimitry Andric      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
157506c3fb27SDimitry Andric      G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
15760b57cec5SDimitry Andric     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1577e8d8bef9SDimitry Andric                {S64, GlobalPtr}, {S64, LocalPtr},
1578e8d8bef9SDimitry Andric                {S32, RegionPtr}, {S64, RegionPtr}});
15790b57cec5SDimitry Andric   if (ST.hasFlatAddressSpace()) {
15800b57cec5SDimitry Andric     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
15810b57cec5SDimitry Andric   }
15820b57cec5SDimitry Andric 
1583fe6060f1SDimitry Andric   auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1584349cc55cSDimitry Andric   if (ST.hasLDSFPAtomicAdd()) {
1585fe6060f1SDimitry Andric     Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1586fe6060f1SDimitry Andric     if (ST.hasGFX90AInsts())
1587fe6060f1SDimitry Andric       Atomic.legalFor({{S64, LocalPtr}});
158806c3fb27SDimitry Andric     if (ST.hasAtomicDsPkAdd16Insts())
158981ad6265SDimitry Andric       Atomic.legalFor({{V2S16, LocalPtr}});
15905ffd83dbSDimitry Andric   }
1591fe6060f1SDimitry Andric   if (ST.hasAtomicFaddInsts())
1592fe6060f1SDimitry Andric     Atomic.legalFor({{S32, GlobalPtr}});
1593bdd1243dSDimitry Andric   if (ST.hasFlatAtomicFaddF32Inst())
1594bdd1243dSDimitry Andric     Atomic.legalFor({{S32, FlatPtr}});
15958bcb0991SDimitry Andric 
159604eeddc0SDimitry Andric   if (ST.hasGFX90AInsts()) {
159704eeddc0SDimitry Andric     // These are legal with some caveats, and should have undergone expansion in
159804eeddc0SDimitry Andric     // the IR in most situations
159904eeddc0SDimitry Andric     // TODO: Move atomic expansion into legalizer
160004eeddc0SDimitry Andric     Atomic.legalFor({
160104eeddc0SDimitry Andric         {S32, GlobalPtr},
160204eeddc0SDimitry Andric         {S64, GlobalPtr},
160304eeddc0SDimitry Andric         {S64, FlatPtr}
160404eeddc0SDimitry Andric       });
160504eeddc0SDimitry Andric   }
160604eeddc0SDimitry Andric 
1607480093f4SDimitry Andric   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1608480093f4SDimitry Andric   // demarshalling
1609480093f4SDimitry Andric   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1610480093f4SDimitry Andric     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1611480093f4SDimitry Andric                 {S32, FlatPtr}, {S64, FlatPtr}})
1612480093f4SDimitry Andric     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1613480093f4SDimitry Andric                {S32, RegionPtr}, {S64, RegionPtr}});
16140b57cec5SDimitry Andric   // TODO: Pointer types, any 32-bit or 64-bit vector
1615480093f4SDimitry Andric 
1616480093f4SDimitry Andric   // Condition should be s32 for scalar, s1 for vector.
16170b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_SELECT)
1618fe6060f1SDimitry Andric       .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1619fe6060f1SDimitry Andric                                  LocalPtr, FlatPtr, PrivatePtr,
1620fe6060f1SDimitry Andric                                  LLT::fixed_vector(2, LocalPtr),
1621fe6060f1SDimitry Andric                                  LLT::fixed_vector(2, PrivatePtr)},
1622fe6060f1SDimitry Andric                                 {S1, S32})
16230b57cec5SDimitry Andric       .clampScalar(0, S16, S64)
16245ffd83dbSDimitry Andric       .scalarize(1)
16250b57cec5SDimitry Andric       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
16260b57cec5SDimitry Andric       .fewerElementsIf(numElementsNotEven(0), scalarize(0))
16270b57cec5SDimitry Andric       .clampMaxNumElements(0, S32, 2)
16280b57cec5SDimitry Andric       .clampMaxNumElements(0, LocalPtr, 2)
16290b57cec5SDimitry Andric       .clampMaxNumElements(0, PrivatePtr, 2)
16300b57cec5SDimitry Andric       .scalarize(0)
16310b57cec5SDimitry Andric       .widenScalarToNextPow2(0)
1632480093f4SDimitry Andric       .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
16330b57cec5SDimitry Andric 
16340b57cec5SDimitry Andric   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
16350b57cec5SDimitry Andric   // be more flexible with the shift amount type.
16360b57cec5SDimitry Andric   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
16370b57cec5SDimitry Andric     .legalFor({{S32, S32}, {S64, S32}});
16380b57cec5SDimitry Andric   if (ST.has16BitInsts()) {
16390b57cec5SDimitry Andric     if (ST.hasVOP3PInsts()) {
16405ffd83dbSDimitry Andric       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
16410b57cec5SDimitry Andric             .clampMaxNumElements(0, S16, 2);
16420b57cec5SDimitry Andric     } else
16435ffd83dbSDimitry Andric       Shifts.legalFor({{S16, S16}});
16440b57cec5SDimitry Andric 
16455ffd83dbSDimitry Andric     // TODO: Support 16-bit shift amounts for all types
16465ffd83dbSDimitry Andric     Shifts.widenScalarIf(
16475ffd83dbSDimitry Andric       [=](const LegalityQuery &Query) {
16485ffd83dbSDimitry Andric         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
16495ffd83dbSDimitry Andric         // 32-bit amount.
16505ffd83dbSDimitry Andric         const LLT ValTy = Query.Types[0];
16515ffd83dbSDimitry Andric         const LLT AmountTy = Query.Types[1];
16525ffd83dbSDimitry Andric         return ValTy.getSizeInBits() <= 16 &&
16535ffd83dbSDimitry Andric                AmountTy.getSizeInBits() < 16;
16545ffd83dbSDimitry Andric       }, changeTo(1, S16));
16555ffd83dbSDimitry Andric     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1656480093f4SDimitry Andric     Shifts.clampScalar(1, S32, S32);
16570b57cec5SDimitry Andric     Shifts.widenScalarToNextPow2(0, 16);
165804eeddc0SDimitry Andric     Shifts.clampScalar(0, S16, S64);
1659e8d8bef9SDimitry Andric 
1660e8d8bef9SDimitry Andric     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1661e8d8bef9SDimitry Andric       .minScalar(0, S16)
1662e8d8bef9SDimitry Andric       .scalarize(0)
1663e8d8bef9SDimitry Andric       .lower();
16640b57cec5SDimitry Andric   } else {
16650b57cec5SDimitry Andric     // Make sure we legalize the shift amount type first, as the general
16660b57cec5SDimitry Andric     // expansion for the shifted type will produce much worse code if it hasn't
16670b57cec5SDimitry Andric     // been truncated already.
16680b57cec5SDimitry Andric     Shifts.clampScalar(1, S32, S32);
16690b57cec5SDimitry Andric     Shifts.widenScalarToNextPow2(0, 32);
167004eeddc0SDimitry Andric     Shifts.clampScalar(0, S32, S64);
1671e8d8bef9SDimitry Andric 
1672e8d8bef9SDimitry Andric     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1673e8d8bef9SDimitry Andric       .minScalar(0, S32)
1674e8d8bef9SDimitry Andric       .scalarize(0)
1675e8d8bef9SDimitry Andric       .lower();
16760b57cec5SDimitry Andric   }
16770b57cec5SDimitry Andric   Shifts.scalarize(0);
16780b57cec5SDimitry Andric 
16790b57cec5SDimitry Andric   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
16800b57cec5SDimitry Andric     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
16810b57cec5SDimitry Andric     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
16820b57cec5SDimitry Andric     unsigned IdxTypeIdx = 2;
16830b57cec5SDimitry Andric 
16840b57cec5SDimitry Andric     getActionDefinitionsBuilder(Op)
16850b57cec5SDimitry Andric       .customIf([=](const LegalityQuery &Query) {
16860b57cec5SDimitry Andric           const LLT EltTy = Query.Types[EltTypeIdx];
16870b57cec5SDimitry Andric           const LLT VecTy = Query.Types[VecTypeIdx];
16880b57cec5SDimitry Andric           const LLT IdxTy = Query.Types[IdxTypeIdx];
1689e8d8bef9SDimitry Andric           const unsigned EltSize = EltTy.getSizeInBits();
169006c3fb27SDimitry Andric           const bool isLegalVecType =
169106c3fb27SDimitry Andric               !!SIRegisterInfo::getSGPRClassForBitWidth(VecTy.getSizeInBits());
169206c3fb27SDimitry Andric           // Address space 8 pointers are 128-bit wide values, but the logic
169306c3fb27SDimitry Andric           // below will try to bitcast them to 2N x s64, which will fail.
169406c3fb27SDimitry Andric           // Therefore, as an intermediate step, wrap extracts/insertions from a
169506c3fb27SDimitry Andric           // ptrtoint-ing the vector and scalar arguments (or inttoptring the
169606c3fb27SDimitry Andric           // extraction result) in order to produce a vector operation that can
169706c3fb27SDimitry Andric           // be handled by the logic below.
169806c3fb27SDimitry Andric           if (EltTy.isPointer() && EltSize > 64)
169906c3fb27SDimitry Andric             return true;
1700e8d8bef9SDimitry Andric           return (EltSize == 32 || EltSize == 64) &&
17010b57cec5SDimitry Andric                   VecTy.getSizeInBits() % 32 == 0 &&
17025ffd83dbSDimitry Andric                   VecTy.getSizeInBits() <= MaxRegisterSize &&
170306c3fb27SDimitry Andric                   IdxTy.getSizeInBits() == 32 &&
170406c3fb27SDimitry Andric                   isLegalVecType;
17050b57cec5SDimitry Andric         })
1706e8d8bef9SDimitry Andric       .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1707e8d8bef9SDimitry Andric                  bitcastToVectorElement32(VecTypeIdx))
1708e8d8bef9SDimitry Andric       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1709e8d8bef9SDimitry Andric       .bitcastIf(
1710e8d8bef9SDimitry Andric         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1711e8d8bef9SDimitry Andric         [=](const LegalityQuery &Query) {
1712e8d8bef9SDimitry Andric           // For > 64-bit element types, try to turn this into a 64-bit
1713e8d8bef9SDimitry Andric           // element vector since we may be able to do better indexing
1714e8d8bef9SDimitry Andric           // if this is scalar. If not, fall back to 32.
1715e8d8bef9SDimitry Andric           const LLT EltTy = Query.Types[EltTypeIdx];
1716e8d8bef9SDimitry Andric           const LLT VecTy = Query.Types[VecTypeIdx];
1717e8d8bef9SDimitry Andric           const unsigned DstEltSize = EltTy.getSizeInBits();
1718e8d8bef9SDimitry Andric           const unsigned VecSize = VecTy.getSizeInBits();
1719e8d8bef9SDimitry Andric 
1720e8d8bef9SDimitry Andric           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1721bdd1243dSDimitry Andric           return std::pair(
1722fe6060f1SDimitry Andric               VecTypeIdx,
1723fe6060f1SDimitry Andric               LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1724e8d8bef9SDimitry Andric         })
17250b57cec5SDimitry Andric       .clampScalar(EltTypeIdx, S32, S64)
17260b57cec5SDimitry Andric       .clampScalar(VecTypeIdx, S32, S64)
1727e8d8bef9SDimitry Andric       .clampScalar(IdxTypeIdx, S32, S32)
1728e8d8bef9SDimitry Andric       .clampMaxNumElements(VecTypeIdx, S32, 32)
1729e8d8bef9SDimitry Andric       // TODO: Clamp elements for 64-bit vectors?
173006c3fb27SDimitry Andric       .moreElementsIf(
173106c3fb27SDimitry Andric         isIllegalRegisterType(VecTypeIdx),
173206c3fb27SDimitry Andric         moreElementsToNextExistingRegClass(VecTypeIdx))
1733e8d8bef9SDimitry Andric       // It should only be necessary with variable indexes.
1734e8d8bef9SDimitry Andric       // As a last resort, lower to the stack
1735e8d8bef9SDimitry Andric       .lower();
17360b57cec5SDimitry Andric   }
17370b57cec5SDimitry Andric 
17380b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
17390b57cec5SDimitry Andric     .unsupportedIf([=](const LegalityQuery &Query) {
17400b57cec5SDimitry Andric         const LLT &EltTy = Query.Types[1].getElementType();
17410b57cec5SDimitry Andric         return Query.Types[0] != EltTy;
17420b57cec5SDimitry Andric       });
17430b57cec5SDimitry Andric 
17440b57cec5SDimitry Andric   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
17450b57cec5SDimitry Andric     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
17460b57cec5SDimitry Andric     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
17470b57cec5SDimitry Andric 
17480b57cec5SDimitry Andric     // FIXME: Doesn't handle extract of illegal sizes.
17490b57cec5SDimitry Andric     getActionDefinitionsBuilder(Op)
17508bcb0991SDimitry Andric       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
17510eae32dcSDimitry Andric       .lowerIf([=](const LegalityQuery &Query) {
17520eae32dcSDimitry Andric           // Sub-vector(or single element) insert and extract.
17530eae32dcSDimitry Andric           // TODO: verify immediate offset here since lower only works with
17540eae32dcSDimitry Andric           // whole elements.
17550eae32dcSDimitry Andric           const LLT BigTy = Query.Types[BigTyIdx];
17560eae32dcSDimitry Andric           return BigTy.isVector();
17570eae32dcSDimitry Andric         })
17588bcb0991SDimitry Andric       // FIXME: Multiples of 16 should not be legal.
17590b57cec5SDimitry Andric       .legalIf([=](const LegalityQuery &Query) {
17600b57cec5SDimitry Andric           const LLT BigTy = Query.Types[BigTyIdx];
17610b57cec5SDimitry Andric           const LLT LitTy = Query.Types[LitTyIdx];
17620b57cec5SDimitry Andric           return (BigTy.getSizeInBits() % 32 == 0) &&
17630b57cec5SDimitry Andric                  (LitTy.getSizeInBits() % 16 == 0);
17640b57cec5SDimitry Andric         })
17650b57cec5SDimitry Andric       .widenScalarIf(
17660b57cec5SDimitry Andric         [=](const LegalityQuery &Query) {
17670b57cec5SDimitry Andric           const LLT BigTy = Query.Types[BigTyIdx];
17680b57cec5SDimitry Andric           return (BigTy.getScalarSizeInBits() < 16);
17690b57cec5SDimitry Andric         },
17700b57cec5SDimitry Andric         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
17710b57cec5SDimitry Andric       .widenScalarIf(
17720b57cec5SDimitry Andric         [=](const LegalityQuery &Query) {
17730b57cec5SDimitry Andric           const LLT LitTy = Query.Types[LitTyIdx];
17740b57cec5SDimitry Andric           return (LitTy.getScalarSizeInBits() < 16);
17750b57cec5SDimitry Andric         },
17760b57cec5SDimitry Andric         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
17770b57cec5SDimitry Andric       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
17780b57cec5SDimitry Andric       .widenScalarToNextPow2(BigTyIdx, 32);
17790b57cec5SDimitry Andric 
17800b57cec5SDimitry Andric   }
17810b57cec5SDimitry Andric 
17828bcb0991SDimitry Andric   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
17830b57cec5SDimitry Andric     .legalForCartesianProduct(AllS32Vectors, {S32})
17840b57cec5SDimitry Andric     .legalForCartesianProduct(AllS64Vectors, {S64})
17858bcb0991SDimitry Andric     .clampNumElements(0, V16S32, V32S32)
17868bcb0991SDimitry Andric     .clampNumElements(0, V2S64, V16S64)
178706c3fb27SDimitry Andric     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16))
178806c3fb27SDimitry Andric     .moreElementsIf(
178906c3fb27SDimitry Andric       isIllegalRegisterType(0),
179006c3fb27SDimitry Andric       moreElementsToNextExistingRegClass(0));
17918bcb0991SDimitry Andric 
17928bcb0991SDimitry Andric   if (ST.hasScalarPackInsts()) {
17935ffd83dbSDimitry Andric     BuildVector
17945ffd83dbSDimitry Andric       // FIXME: Should probably widen s1 vectors straight to s32
17955ffd83dbSDimitry Andric       .minScalarOrElt(0, S16)
1796bdd1243dSDimitry Andric       .minScalar(1, S16);
17975ffd83dbSDimitry Andric 
17988bcb0991SDimitry Andric     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
17998bcb0991SDimitry Andric       .legalFor({V2S16, S32})
18008bcb0991SDimitry Andric       .lower();
18018bcb0991SDimitry Andric   } else {
18025ffd83dbSDimitry Andric     BuildVector.customFor({V2S16, S16});
18035ffd83dbSDimitry Andric     BuildVector.minScalarOrElt(0, S32);
18045ffd83dbSDimitry Andric 
18058bcb0991SDimitry Andric     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
18065ffd83dbSDimitry Andric       .customFor({V2S16, S32})
18078bcb0991SDimitry Andric       .lower();
18088bcb0991SDimitry Andric   }
18098bcb0991SDimitry Andric 
18105ffd83dbSDimitry Andric   BuildVector.legalIf(isRegisterType(0));
18115ffd83dbSDimitry Andric 
18125ffd83dbSDimitry Andric   // FIXME: Clamp maximum size
18130b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1814e8d8bef9SDimitry Andric     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1815e8d8bef9SDimitry Andric     .clampMaxNumElements(0, S32, 32)
1816e8d8bef9SDimitry Andric     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1817e8d8bef9SDimitry Andric     .clampMaxNumElements(0, S16, 64);
18180b57cec5SDimitry Andric 
18198bcb0991SDimitry Andric   getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
18208bcb0991SDimitry Andric 
18210b57cec5SDimitry Andric   // Merge/Unmerge
18220b57cec5SDimitry Andric   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
18230b57cec5SDimitry Andric     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
18240b57cec5SDimitry Andric     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
18250b57cec5SDimitry Andric 
18260b57cec5SDimitry Andric     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
18275ffd83dbSDimitry Andric       const LLT Ty = Query.Types[TypeIdx];
18280b57cec5SDimitry Andric       if (Ty.isVector()) {
18290b57cec5SDimitry Andric         const LLT &EltTy = Ty.getElementType();
18305ffd83dbSDimitry Andric         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
18310b57cec5SDimitry Andric           return true;
183206c3fb27SDimitry Andric         if (!llvm::has_single_bit<uint32_t>(EltTy.getSizeInBits()))
18330b57cec5SDimitry Andric           return true;
18340b57cec5SDimitry Andric       }
18350b57cec5SDimitry Andric       return false;
18360b57cec5SDimitry Andric     };
18370b57cec5SDimitry Andric 
18388bcb0991SDimitry Andric     auto &Builder = getActionDefinitionsBuilder(Op)
1839e8d8bef9SDimitry Andric       .legalIf(all(isRegisterType(0), isRegisterType(1)))
18405ffd83dbSDimitry Andric       .lowerFor({{S16, V2S16}})
18415ffd83dbSDimitry Andric       .lowerIf([=](const LegalityQuery &Query) {
18425ffd83dbSDimitry Andric           const LLT BigTy = Query.Types[BigTyIdx];
18435ffd83dbSDimitry Andric           return BigTy.getSizeInBits() == 32;
18445ffd83dbSDimitry Andric         })
18455ffd83dbSDimitry Andric       // Try to widen to s16 first for small types.
18465ffd83dbSDimitry Andric       // TODO: Only do this on targets with legal s16 shifts
18475ffd83dbSDimitry Andric       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
18480b57cec5SDimitry Andric       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
18498bcb0991SDimitry Andric       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
18508bcb0991SDimitry Andric       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
18518bcb0991SDimitry Andric                            elementTypeIs(1, S16)),
18528bcb0991SDimitry Andric                        changeTo(1, V2S16))
18535ffd83dbSDimitry Andric       // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
18545ffd83dbSDimitry Andric       // worth considering the multiples of 64 since 2*192 and 2*384 are not
18555ffd83dbSDimitry Andric       // valid.
18565ffd83dbSDimitry Andric       .clampScalar(LitTyIdx, S32, S512)
18575ffd83dbSDimitry Andric       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
18580b57cec5SDimitry Andric       // Break up vectors with weird elements into scalars
18590b57cec5SDimitry Andric       .fewerElementsIf(
18605ffd83dbSDimitry Andric         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
18610b57cec5SDimitry Andric         scalarize(0))
18620b57cec5SDimitry Andric       .fewerElementsIf(
18635ffd83dbSDimitry Andric         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
18640b57cec5SDimitry Andric         scalarize(1))
18655ffd83dbSDimitry Andric       .clampScalar(BigTyIdx, S32, MaxScalar);
18668bcb0991SDimitry Andric 
18678bcb0991SDimitry Andric     if (Op == G_MERGE_VALUES) {
18688bcb0991SDimitry Andric       Builder.widenScalarIf(
18698bcb0991SDimitry Andric         // TODO: Use 16-bit shifts if legal for 8-bit values?
18700b57cec5SDimitry Andric         [=](const LegalityQuery &Query) {
18718bcb0991SDimitry Andric           const LLT Ty = Query.Types[LitTyIdx];
18728bcb0991SDimitry Andric           return Ty.getSizeInBits() < 32;
18738bcb0991SDimitry Andric         },
18748bcb0991SDimitry Andric         changeTo(LitTyIdx, S32));
18758bcb0991SDimitry Andric     }
18768bcb0991SDimitry Andric 
18778bcb0991SDimitry Andric     Builder.widenScalarIf(
18788bcb0991SDimitry Andric       [=](const LegalityQuery &Query) {
18798bcb0991SDimitry Andric         const LLT Ty = Query.Types[BigTyIdx];
188006c3fb27SDimitry Andric         return Ty.getSizeInBits() % 16 != 0;
18810b57cec5SDimitry Andric       },
18820b57cec5SDimitry Andric       [=](const LegalityQuery &Query) {
18830b57cec5SDimitry Andric         // Pick the next power of 2, or a multiple of 64 over 128.
18840b57cec5SDimitry Andric         // Whichever is smaller.
18850b57cec5SDimitry Andric         const LLT &Ty = Query.Types[BigTyIdx];
18860b57cec5SDimitry Andric         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
18870b57cec5SDimitry Andric         if (NewSizeInBits >= 256) {
18880b57cec5SDimitry Andric           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
18890b57cec5SDimitry Andric           if (RoundedTo < NewSizeInBits)
18900b57cec5SDimitry Andric             NewSizeInBits = RoundedTo;
18910b57cec5SDimitry Andric         }
1892bdd1243dSDimitry Andric         return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
18930b57cec5SDimitry Andric       })
18940b57cec5SDimitry Andric       // Any vectors left are the wrong size. Scalarize them.
18950b57cec5SDimitry Andric       .scalarize(0)
18960b57cec5SDimitry Andric       .scalarize(1);
18970b57cec5SDimitry Andric   }
18980b57cec5SDimitry Andric 
18995ffd83dbSDimitry Andric   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
19005ffd83dbSDimitry Andric   // RegBankSelect.
19015ffd83dbSDimitry Andric   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
19025ffd83dbSDimitry Andric     .legalFor({{S32}, {S64}});
19038bcb0991SDimitry Andric 
19045ffd83dbSDimitry Andric   if (ST.hasVOP3PInsts()) {
19055ffd83dbSDimitry Andric     SextInReg.lowerFor({{V2S16}})
19065ffd83dbSDimitry Andric       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
19075ffd83dbSDimitry Andric       // get more vector shift opportunities, since we'll get those when
19085ffd83dbSDimitry Andric       // expanded.
19090eae32dcSDimitry Andric       .clampMaxNumElementsStrict(0, S16, 2);
19105ffd83dbSDimitry Andric   } else if (ST.has16BitInsts()) {
19115ffd83dbSDimitry Andric     SextInReg.lowerFor({{S32}, {S64}, {S16}});
19125ffd83dbSDimitry Andric   } else {
19135ffd83dbSDimitry Andric     // Prefer to promote to s32 before lowering if we don't have 16-bit
19145ffd83dbSDimitry Andric     // shifts. This avoid a lot of intermediate truncate and extend operations.
19155ffd83dbSDimitry Andric     SextInReg.lowerFor({{S32}, {S64}});
19165ffd83dbSDimitry Andric   }
19175ffd83dbSDimitry Andric 
19185ffd83dbSDimitry Andric   SextInReg
19195ffd83dbSDimitry Andric     .scalarize(0)
19205ffd83dbSDimitry Andric     .clampScalar(0, S32, S64)
19215ffd83dbSDimitry Andric     .lower();
19225ffd83dbSDimitry Andric 
1923349cc55cSDimitry Andric   getActionDefinitionsBuilder({G_ROTR, G_ROTL})
1924349cc55cSDimitry Andric     .scalarize(0)
1925349cc55cSDimitry Andric     .lower();
1926349cc55cSDimitry Andric 
1927fe6060f1SDimitry Andric   // TODO: Only Try to form v2s16 with legal packed instructions.
19285ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_FSHR)
19295ffd83dbSDimitry Andric     .legalFor({{S32, S32}})
1930fe6060f1SDimitry Andric     .lowerFor({{V2S16, V2S16}})
19310eae32dcSDimitry Andric     .clampMaxNumElementsStrict(0, S16, 2)
19325ffd83dbSDimitry Andric     .scalarize(0)
19335ffd83dbSDimitry Andric     .lower();
1934480093f4SDimitry Andric 
1935fe6060f1SDimitry Andric   if (ST.hasVOP3PInsts()) {
1936fe6060f1SDimitry Andric     getActionDefinitionsBuilder(G_FSHL)
1937fe6060f1SDimitry Andric       .lowerFor({{V2S16, V2S16}})
19380eae32dcSDimitry Andric       .clampMaxNumElementsStrict(0, S16, 2)
1939fe6060f1SDimitry Andric       .scalarize(0)
1940fe6060f1SDimitry Andric       .lower();
1941fe6060f1SDimitry Andric   } else {
1942fe6060f1SDimitry Andric     getActionDefinitionsBuilder(G_FSHL)
1943fe6060f1SDimitry Andric       .scalarize(0)
1944fe6060f1SDimitry Andric       .lower();
1945fe6060f1SDimitry Andric   }
1946fe6060f1SDimitry Andric 
1947480093f4SDimitry Andric   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1948480093f4SDimitry Andric     .legalFor({S64});
1949480093f4SDimitry Andric 
1950e8d8bef9SDimitry Andric   getActionDefinitionsBuilder(G_FENCE)
1951e8d8bef9SDimitry Andric     .alwaysLegal();
1952e8d8bef9SDimitry Andric 
1953fe6060f1SDimitry Andric   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
1954fe6060f1SDimitry Andric       .scalarize(0)
1955fe6060f1SDimitry Andric       .minScalar(0, S32)
1956fe6060f1SDimitry Andric       .lower();
1957fe6060f1SDimitry Andric 
1958fe6060f1SDimitry Andric   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
1959fe6060f1SDimitry Andric       .legalFor({{S32, S32}, {S64, S32}})
1960fe6060f1SDimitry Andric       .clampScalar(1, S32, S32)
1961fe6060f1SDimitry Andric       .clampScalar(0, S32, S64)
1962fe6060f1SDimitry Andric       .widenScalarToNextPow2(0)
1963fe6060f1SDimitry Andric       .scalarize(0);
1964fe6060f1SDimitry Andric 
1965*5f757f3fSDimitry Andric   getActionDefinitionsBuilder(
1966*5f757f3fSDimitry Andric       {// TODO: Verify V_BFI_B32 is generated from expanded bit ops
19675ffd83dbSDimitry Andric        G_FCOPYSIGN,
19685ffd83dbSDimitry Andric 
1969*5f757f3fSDimitry Andric        G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
1970*5f757f3fSDimitry Andric        G_READ_REGISTER, G_WRITE_REGISTER,
19715ffd83dbSDimitry Andric 
1972*5f757f3fSDimitry Andric        G_SADDO, G_SSUBO})
1973*5f757f3fSDimitry Andric       .lower();
19745ffd83dbSDimitry Andric 
1975*5f757f3fSDimitry Andric   if (ST.hasIEEEMinMax()) {
1976*5f757f3fSDimitry Andric     getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
1977*5f757f3fSDimitry Andric         .legalFor(FPTypesPK16)
1978*5f757f3fSDimitry Andric         .clampMaxNumElements(0, S16, 2)
1979*5f757f3fSDimitry Andric         .scalarize(0);
1980*5f757f3fSDimitry Andric   } else {
19815ffd83dbSDimitry Andric     // TODO: Implement
1982*5f757f3fSDimitry Andric     getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
1983*5f757f3fSDimitry Andric   }
19845ffd83dbSDimitry Andric 
1985349cc55cSDimitry Andric   getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
1986349cc55cSDimitry Andric       .lower();
1987349cc55cSDimitry Andric 
1988480093f4SDimitry Andric   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
19895ffd83dbSDimitry Andric         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1990480093f4SDimitry Andric         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1991480093f4SDimitry Andric     .unsupported();
1992480093f4SDimitry Andric 
1993*5f757f3fSDimitry Andric   getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal();
1994*5f757f3fSDimitry Andric 
1995fe6060f1SDimitry Andric   getLegacyLegalizerInfo().computeTables();
19960b57cec5SDimitry Andric   verify(*ST.getInstrInfo());
19970b57cec5SDimitry Andric }
19980b57cec5SDimitry Andric 
19995ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
20005ffd83dbSDimitry Andric                                          MachineInstr &MI) const {
20015ffd83dbSDimitry Andric   MachineIRBuilder &B = Helper.MIRBuilder;
20025ffd83dbSDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
20035ffd83dbSDimitry Andric 
20040b57cec5SDimitry Andric   switch (MI.getOpcode()) {
20050b57cec5SDimitry Andric   case TargetOpcode::G_ADDRSPACE_CAST:
20068bcb0991SDimitry Andric     return legalizeAddrSpaceCast(MI, MRI, B);
2007*5f757f3fSDimitry Andric   case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2008*5f757f3fSDimitry Andric     return legalizeFroundeven(MI, MRI, B);
20090b57cec5SDimitry Andric   case TargetOpcode::G_FCEIL:
20108bcb0991SDimitry Andric     return legalizeFceil(MI, MRI, B);
2011e8d8bef9SDimitry Andric   case TargetOpcode::G_FREM:
2012e8d8bef9SDimitry Andric     return legalizeFrem(MI, MRI, B);
20130b57cec5SDimitry Andric   case TargetOpcode::G_INTRINSIC_TRUNC:
20148bcb0991SDimitry Andric     return legalizeIntrinsicTrunc(MI, MRI, B);
20150b57cec5SDimitry Andric   case TargetOpcode::G_SITOFP:
20168bcb0991SDimitry Andric     return legalizeITOFP(MI, MRI, B, true);
20170b57cec5SDimitry Andric   case TargetOpcode::G_UITOFP:
20188bcb0991SDimitry Andric     return legalizeITOFP(MI, MRI, B, false);
20195ffd83dbSDimitry Andric   case TargetOpcode::G_FPTOSI:
20205ffd83dbSDimitry Andric     return legalizeFPTOI(MI, MRI, B, true);
20215ffd83dbSDimitry Andric   case TargetOpcode::G_FPTOUI:
20225ffd83dbSDimitry Andric     return legalizeFPTOI(MI, MRI, B, false);
20230b57cec5SDimitry Andric   case TargetOpcode::G_FMINNUM:
20240b57cec5SDimitry Andric   case TargetOpcode::G_FMAXNUM:
20250b57cec5SDimitry Andric   case TargetOpcode::G_FMINNUM_IEEE:
20260b57cec5SDimitry Andric   case TargetOpcode::G_FMAXNUM_IEEE:
20275ffd83dbSDimitry Andric     return legalizeMinNumMaxNum(Helper, MI);
20280b57cec5SDimitry Andric   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
20298bcb0991SDimitry Andric     return legalizeExtractVectorElt(MI, MRI, B);
20300b57cec5SDimitry Andric   case TargetOpcode::G_INSERT_VECTOR_ELT:
20318bcb0991SDimitry Andric     return legalizeInsertVectorElt(MI, MRI, B);
20328bcb0991SDimitry Andric   case TargetOpcode::G_FSIN:
20338bcb0991SDimitry Andric   case TargetOpcode::G_FCOS:
20348bcb0991SDimitry Andric     return legalizeSinCos(MI, MRI, B);
20358bcb0991SDimitry Andric   case TargetOpcode::G_GLOBAL_VALUE:
20368bcb0991SDimitry Andric     return legalizeGlobalValue(MI, MRI, B);
20378bcb0991SDimitry Andric   case TargetOpcode::G_LOAD:
2038fe6060f1SDimitry Andric   case TargetOpcode::G_SEXTLOAD:
2039fe6060f1SDimitry Andric   case TargetOpcode::G_ZEXTLOAD:
2040e8d8bef9SDimitry Andric     return legalizeLoad(Helper, MI);
204106c3fb27SDimitry Andric   case TargetOpcode::G_STORE:
204206c3fb27SDimitry Andric     return legalizeStore(Helper, MI);
20438bcb0991SDimitry Andric   case TargetOpcode::G_FMAD:
20448bcb0991SDimitry Andric     return legalizeFMad(MI, MRI, B);
20458bcb0991SDimitry Andric   case TargetOpcode::G_FDIV:
20468bcb0991SDimitry Andric     return legalizeFDIV(MI, MRI, B);
204706c3fb27SDimitry Andric   case TargetOpcode::G_FFREXP:
204806c3fb27SDimitry Andric     return legalizeFFREXP(MI, MRI, B);
204906c3fb27SDimitry Andric   case TargetOpcode::G_FSQRT:
205006c3fb27SDimitry Andric     return legalizeFSQRT(MI, MRI, B);
20515ffd83dbSDimitry Andric   case TargetOpcode::G_UDIV:
20525ffd83dbSDimitry Andric   case TargetOpcode::G_UREM:
2053fe6060f1SDimitry Andric   case TargetOpcode::G_UDIVREM:
2054fe6060f1SDimitry Andric     return legalizeUnsignedDIV_REM(MI, MRI, B);
20555ffd83dbSDimitry Andric   case TargetOpcode::G_SDIV:
20565ffd83dbSDimitry Andric   case TargetOpcode::G_SREM:
2057fe6060f1SDimitry Andric   case TargetOpcode::G_SDIVREM:
2058fe6060f1SDimitry Andric     return legalizeSignedDIV_REM(MI, MRI, B);
2059480093f4SDimitry Andric   case TargetOpcode::G_ATOMIC_CMPXCHG:
2060480093f4SDimitry Andric     return legalizeAtomicCmpXChg(MI, MRI, B);
206106c3fb27SDimitry Andric   case TargetOpcode::G_FLOG2:
206206c3fb27SDimitry Andric     return legalizeFlog2(MI, B);
20635ffd83dbSDimitry Andric   case TargetOpcode::G_FLOG:
20645ffd83dbSDimitry Andric   case TargetOpcode::G_FLOG10:
206506c3fb27SDimitry Andric     return legalizeFlogCommon(MI, B);
206606c3fb27SDimitry Andric   case TargetOpcode::G_FEXP2:
206706c3fb27SDimitry Andric     return legalizeFExp2(MI, B);
20685ffd83dbSDimitry Andric   case TargetOpcode::G_FEXP:
2069*5f757f3fSDimitry Andric   case TargetOpcode::G_FEXP10:
20705ffd83dbSDimitry Andric     return legalizeFExp(MI, B);
20715ffd83dbSDimitry Andric   case TargetOpcode::G_FPOW:
20725ffd83dbSDimitry Andric     return legalizeFPow(MI, B);
20735ffd83dbSDimitry Andric   case TargetOpcode::G_FFLOOR:
20745ffd83dbSDimitry Andric     return legalizeFFloor(MI, MRI, B);
20755ffd83dbSDimitry Andric   case TargetOpcode::G_BUILD_VECTOR:
2076bdd1243dSDimitry Andric   case TargetOpcode::G_BUILD_VECTOR_TRUNC:
20775ffd83dbSDimitry Andric     return legalizeBuildVector(MI, MRI, B);
207881ad6265SDimitry Andric   case TargetOpcode::G_MUL:
207981ad6265SDimitry Andric     return legalizeMul(Helper, MI);
2080349cc55cSDimitry Andric   case TargetOpcode::G_CTLZ:
2081349cc55cSDimitry Andric   case TargetOpcode::G_CTTZ:
2082349cc55cSDimitry Andric     return legalizeCTLZ_CTTZ(MI, MRI, B);
208381ad6265SDimitry Andric   case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
208481ad6265SDimitry Andric     return legalizeFPTruncRound(MI, B);
2085*5f757f3fSDimitry Andric   case TargetOpcode::G_STACKSAVE:
2086*5f757f3fSDimitry Andric     return legalizeStackSave(MI, B);
20870b57cec5SDimitry Andric   default:
20880b57cec5SDimitry Andric     return false;
20890b57cec5SDimitry Andric   }
20900b57cec5SDimitry Andric 
20910b57cec5SDimitry Andric   llvm_unreachable("expected switch to return");
20920b57cec5SDimitry Andric }
20930b57cec5SDimitry Andric 
20940b57cec5SDimitry Andric Register AMDGPULegalizerInfo::getSegmentAperture(
20950b57cec5SDimitry Andric   unsigned AS,
20960b57cec5SDimitry Andric   MachineRegisterInfo &MRI,
20978bcb0991SDimitry Andric   MachineIRBuilder &B) const {
20988bcb0991SDimitry Andric   MachineFunction &MF = B.getMF();
20990b57cec5SDimitry Andric   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
21000b57cec5SDimitry Andric   const LLT S32 = LLT::scalar(32);
2101bdd1243dSDimitry Andric   const LLT S64 = LLT::scalar(64);
21020b57cec5SDimitry Andric 
21038bcb0991SDimitry Andric   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
21048bcb0991SDimitry Andric 
21050b57cec5SDimitry Andric   if (ST.hasApertureRegs()) {
2106bdd1243dSDimitry Andric     // Note: this register is somewhat broken. When used as a 32-bit operand,
2107bdd1243dSDimitry Andric     // it only returns zeroes. The real value is in the upper 32 bits.
2108bdd1243dSDimitry Andric     // Thus, we must emit extract the high 32 bits.
2109bdd1243dSDimitry Andric     const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
2110bdd1243dSDimitry Andric                                        ? AMDGPU::SRC_SHARED_BASE
2111bdd1243dSDimitry Andric                                        : AMDGPU::SRC_PRIVATE_BASE;
2112bdd1243dSDimitry Andric     // FIXME: It would be more natural to emit a COPY here, but then copy
2113bdd1243dSDimitry Andric     // coalescing would kick in and it would think it's okay to use the "HI"
2114bdd1243dSDimitry Andric     // subregister (instead of extracting the HI 32 bits) which is an artificial
2115bdd1243dSDimitry Andric     // (unusable) register.
2116bdd1243dSDimitry Andric     //  Register TableGen definitions would need an overhaul to get rid of the
2117bdd1243dSDimitry Andric     //  artificial "HI" aperture registers and prevent this kind of issue from
2118bdd1243dSDimitry Andric     //  happening.
2119bdd1243dSDimitry Andric     Register Dst = MRI.createGenericVirtualRegister(S64);
2120bdd1243dSDimitry Andric     MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
2121bdd1243dSDimitry Andric     B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
2122bdd1243dSDimitry Andric     return B.buildUnmerge(S32, Dst).getReg(1);
21230b57cec5SDimitry Andric   }
21240b57cec5SDimitry Andric 
212581ad6265SDimitry Andric   // TODO: can we be smarter about machine pointer info?
212681ad6265SDimitry Andric   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
212781ad6265SDimitry Andric   Register LoadAddr = MRI.createGenericVirtualRegister(
212881ad6265SDimitry Andric     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
212981ad6265SDimitry Andric   // For code object version 5, private_base and shared_base are passed through
213081ad6265SDimitry Andric   // implicit kernargs.
213106c3fb27SDimitry Andric   if (AMDGPU::getCodeObjectVersion(*MF.getFunction().getParent()) >=
213206c3fb27SDimitry Andric       AMDGPU::AMDHSA_COV5) {
213381ad6265SDimitry Andric     AMDGPUTargetLowering::ImplicitParameter Param =
213481ad6265SDimitry Andric         AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
213581ad6265SDimitry Andric                                       : AMDGPUTargetLowering::PRIVATE_BASE;
213681ad6265SDimitry Andric     uint64_t Offset =
213781ad6265SDimitry Andric         ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
213881ad6265SDimitry Andric 
213981ad6265SDimitry Andric     Register KernargPtrReg = MRI.createGenericVirtualRegister(
214081ad6265SDimitry Andric         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
214181ad6265SDimitry Andric 
214281ad6265SDimitry Andric     if (!loadInputValue(KernargPtrReg, B,
214381ad6265SDimitry Andric                         AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
214481ad6265SDimitry Andric       return Register();
214581ad6265SDimitry Andric 
214681ad6265SDimitry Andric     MachineMemOperand *MMO = MF.getMachineMemOperand(
214781ad6265SDimitry Andric         PtrInfo,
214881ad6265SDimitry Andric         MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
214981ad6265SDimitry Andric             MachineMemOperand::MOInvariant,
215081ad6265SDimitry Andric         LLT::scalar(32), commonAlignment(Align(64), Offset));
215181ad6265SDimitry Andric 
215281ad6265SDimitry Andric     // Pointer address
215381ad6265SDimitry Andric     B.buildPtrAdd(LoadAddr, KernargPtrReg,
215481ad6265SDimitry Andric                   B.buildConstant(LLT::scalar(64), Offset).getReg(0));
215581ad6265SDimitry Andric     // Load address
215681ad6265SDimitry Andric     return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
215781ad6265SDimitry Andric   }
215881ad6265SDimitry Andric 
21590b57cec5SDimitry Andric   Register QueuePtr = MRI.createGenericVirtualRegister(
21600b57cec5SDimitry Andric     LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
21610b57cec5SDimitry Andric 
2162e8d8bef9SDimitry Andric   if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
21638bcb0991SDimitry Andric     return Register();
21640b57cec5SDimitry Andric 
21650b57cec5SDimitry Andric   // Offset into amd_queue_t for group_segment_aperture_base_hi /
21660b57cec5SDimitry Andric   // private_segment_aperture_base_hi.
21670b57cec5SDimitry Andric   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
21680b57cec5SDimitry Andric 
21690b57cec5SDimitry Andric   MachineMemOperand *MMO = MF.getMachineMemOperand(
21700b57cec5SDimitry Andric       PtrInfo,
21715ffd83dbSDimitry Andric       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
21720b57cec5SDimitry Andric           MachineMemOperand::MOInvariant,
2173fe6060f1SDimitry Andric       LLT::scalar(32), commonAlignment(Align(64), StructOffset));
21740b57cec5SDimitry Andric 
217581ad6265SDimitry Andric   B.buildPtrAdd(LoadAddr, QueuePtr,
217681ad6265SDimitry Andric                 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
21775ffd83dbSDimitry Andric   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
21780b57cec5SDimitry Andric }
21790b57cec5SDimitry Andric 
218004eeddc0SDimitry Andric /// Return true if the value is a known valid address, such that a null check is
218104eeddc0SDimitry Andric /// not necessary.
218204eeddc0SDimitry Andric static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
218304eeddc0SDimitry Andric                            const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
218404eeddc0SDimitry Andric   MachineInstr *Def = MRI.getVRegDef(Val);
218504eeddc0SDimitry Andric   switch (Def->getOpcode()) {
218604eeddc0SDimitry Andric   case AMDGPU::G_FRAME_INDEX:
218704eeddc0SDimitry Andric   case AMDGPU::G_GLOBAL_VALUE:
218804eeddc0SDimitry Andric   case AMDGPU::G_BLOCK_ADDR:
218904eeddc0SDimitry Andric     return true;
219004eeddc0SDimitry Andric   case AMDGPU::G_CONSTANT: {
219104eeddc0SDimitry Andric     const ConstantInt *CI = Def->getOperand(1).getCImm();
219204eeddc0SDimitry Andric     return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
219304eeddc0SDimitry Andric   }
219404eeddc0SDimitry Andric   default:
219504eeddc0SDimitry Andric     return false;
219604eeddc0SDimitry Andric   }
219704eeddc0SDimitry Andric 
219804eeddc0SDimitry Andric   return false;
219904eeddc0SDimitry Andric }
220004eeddc0SDimitry Andric 
22010b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
22020b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
22038bcb0991SDimitry Andric   MachineIRBuilder &B) const {
22048bcb0991SDimitry Andric   MachineFunction &MF = B.getMF();
22050b57cec5SDimitry Andric 
22068bcb0991SDimitry Andric   const LLT S32 = LLT::scalar(32);
22070b57cec5SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
22080b57cec5SDimitry Andric   Register Src = MI.getOperand(1).getReg();
22090b57cec5SDimitry Andric 
22100b57cec5SDimitry Andric   LLT DstTy = MRI.getType(Dst);
22110b57cec5SDimitry Andric   LLT SrcTy = MRI.getType(Src);
22120b57cec5SDimitry Andric   unsigned DestAS = DstTy.getAddressSpace();
22130b57cec5SDimitry Andric   unsigned SrcAS = SrcTy.getAddressSpace();
22140b57cec5SDimitry Andric 
22150b57cec5SDimitry Andric   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
22160b57cec5SDimitry Andric   // vector element.
22170b57cec5SDimitry Andric   assert(!DstTy.isVector());
22180b57cec5SDimitry Andric 
22190b57cec5SDimitry Andric   const AMDGPUTargetMachine &TM
22200b57cec5SDimitry Andric     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
22210b57cec5SDimitry Andric 
2222e8d8bef9SDimitry Andric   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
22238bcb0991SDimitry Andric     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
22248bcb0991SDimitry Andric     return true;
22258bcb0991SDimitry Andric   }
22268bcb0991SDimitry Andric 
222781ad6265SDimitry Andric   if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
222881ad6265SDimitry Andric       (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
222981ad6265SDimitry Andric        DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
223004eeddc0SDimitry Andric     if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
223104eeddc0SDimitry Andric       // Extract low 32-bits of the pointer.
223204eeddc0SDimitry Andric       B.buildExtract(Dst, Src, 0);
223304eeddc0SDimitry Andric       MI.eraseFromParent();
223404eeddc0SDimitry Andric       return true;
223504eeddc0SDimitry Andric     }
223604eeddc0SDimitry Andric 
22370b57cec5SDimitry Andric     unsigned NullVal = TM.getNullPointerValue(DestAS);
22380b57cec5SDimitry Andric 
22398bcb0991SDimitry Andric     auto SegmentNull = B.buildConstant(DstTy, NullVal);
22408bcb0991SDimitry Andric     auto FlatNull = B.buildConstant(SrcTy, 0);
22410b57cec5SDimitry Andric 
22420b57cec5SDimitry Andric     // Extract low 32-bits of the pointer.
22435ffd83dbSDimitry Andric     auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
22440b57cec5SDimitry Andric 
22455ffd83dbSDimitry Andric     auto CmpRes =
22465ffd83dbSDimitry Andric         B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
22478bcb0991SDimitry Andric     B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
22480b57cec5SDimitry Andric 
22490b57cec5SDimitry Andric     MI.eraseFromParent();
22500b57cec5SDimitry Andric     return true;
22510b57cec5SDimitry Andric   }
22520b57cec5SDimitry Andric 
225381ad6265SDimitry Andric   if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
225481ad6265SDimitry Andric       (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
225581ad6265SDimitry Andric        SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
22568bcb0991SDimitry Andric     Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
22578bcb0991SDimitry Andric     if (!ApertureReg.isValid())
22588bcb0991SDimitry Andric       return false;
22590b57cec5SDimitry Andric 
22600b57cec5SDimitry Andric     // Coerce the type of the low half of the result so we can use merge_values.
22615ffd83dbSDimitry Andric     Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
22620b57cec5SDimitry Andric 
22630b57cec5SDimitry Andric     // TODO: Should we allow mismatched types but matching sizes in merges to
22640b57cec5SDimitry Andric     // avoid the ptrtoint?
2265bdd1243dSDimitry Andric     auto BuildPtr = B.buildMergeLikeInstr(DstTy, {SrcAsInt, ApertureReg});
226604eeddc0SDimitry Andric 
226704eeddc0SDimitry Andric     if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
226804eeddc0SDimitry Andric       B.buildCopy(Dst, BuildPtr);
226904eeddc0SDimitry Andric       MI.eraseFromParent();
227004eeddc0SDimitry Andric       return true;
227104eeddc0SDimitry Andric     }
227204eeddc0SDimitry Andric 
227304eeddc0SDimitry Andric     auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
227404eeddc0SDimitry Andric     auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
227504eeddc0SDimitry Andric 
227681ad6265SDimitry Andric     auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
227781ad6265SDimitry Andric                               SegmentNull.getReg(0));
227804eeddc0SDimitry Andric 
22795ffd83dbSDimitry Andric     B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
22800b57cec5SDimitry Andric 
22810b57cec5SDimitry Andric     MI.eraseFromParent();
22820b57cec5SDimitry Andric     return true;
22830b57cec5SDimitry Andric   }
22840b57cec5SDimitry Andric 
228581ad6265SDimitry Andric   if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
228681ad6265SDimitry Andric       SrcTy.getSizeInBits() == 64) {
228781ad6265SDimitry Andric     // Truncate.
228881ad6265SDimitry Andric     B.buildExtract(Dst, Src, 0);
228981ad6265SDimitry Andric     MI.eraseFromParent();
229081ad6265SDimitry Andric     return true;
229181ad6265SDimitry Andric   }
229281ad6265SDimitry Andric 
229381ad6265SDimitry Andric   if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
229481ad6265SDimitry Andric       DstTy.getSizeInBits() == 64) {
229581ad6265SDimitry Andric     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
229681ad6265SDimitry Andric     uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2297bdd1243dSDimitry Andric     auto PtrLo = B.buildPtrToInt(S32, Src);
2298bdd1243dSDimitry Andric     auto HighAddr = B.buildConstant(S32, AddrHiVal);
2299bdd1243dSDimitry Andric     B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
230081ad6265SDimitry Andric     MI.eraseFromParent();
230181ad6265SDimitry Andric     return true;
230281ad6265SDimitry Andric   }
230381ad6265SDimitry Andric 
230481ad6265SDimitry Andric   DiagnosticInfoUnsupported InvalidAddrSpaceCast(
230581ad6265SDimitry Andric       MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
230681ad6265SDimitry Andric 
230781ad6265SDimitry Andric   LLVMContext &Ctx = MF.getFunction().getContext();
230881ad6265SDimitry Andric   Ctx.diagnose(InvalidAddrSpaceCast);
230981ad6265SDimitry Andric   B.buildUndef(Dst);
231081ad6265SDimitry Andric   MI.eraseFromParent();
231181ad6265SDimitry Andric   return true;
231281ad6265SDimitry Andric }
231381ad6265SDimitry Andric 
2314*5f757f3fSDimitry Andric bool AMDGPULegalizerInfo::legalizeFroundeven(MachineInstr &MI,
2315*5f757f3fSDimitry Andric                                              MachineRegisterInfo &MRI,
23168bcb0991SDimitry Andric                                              MachineIRBuilder &B) const {
23170b57cec5SDimitry Andric   Register Src = MI.getOperand(1).getReg();
23180b57cec5SDimitry Andric   LLT Ty = MRI.getType(Src);
23190b57cec5SDimitry Andric   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
23200b57cec5SDimitry Andric 
23210b57cec5SDimitry Andric   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
23220b57cec5SDimitry Andric   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
23230b57cec5SDimitry Andric 
23248bcb0991SDimitry Andric   auto C1 = B.buildFConstant(Ty, C1Val);
23258bcb0991SDimitry Andric   auto CopySign = B.buildFCopysign(Ty, C1, Src);
23260b57cec5SDimitry Andric 
23270b57cec5SDimitry Andric   // TODO: Should this propagate fast-math-flags?
23288bcb0991SDimitry Andric   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
23298bcb0991SDimitry Andric   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
23300b57cec5SDimitry Andric 
23318bcb0991SDimitry Andric   auto C2 = B.buildFConstant(Ty, C2Val);
23328bcb0991SDimitry Andric   auto Fabs = B.buildFAbs(Ty, Src);
23330b57cec5SDimitry Andric 
23348bcb0991SDimitry Andric   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
23358bcb0991SDimitry Andric   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2336e8d8bef9SDimitry Andric   MI.eraseFromParent();
23370b57cec5SDimitry Andric   return true;
23380b57cec5SDimitry Andric }
23390b57cec5SDimitry Andric 
23400b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeFceil(
23410b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
23420b57cec5SDimitry Andric   MachineIRBuilder &B) const {
23430b57cec5SDimitry Andric 
23440b57cec5SDimitry Andric   const LLT S1 = LLT::scalar(1);
23450b57cec5SDimitry Andric   const LLT S64 = LLT::scalar(64);
23460b57cec5SDimitry Andric 
23470b57cec5SDimitry Andric   Register Src = MI.getOperand(1).getReg();
23480b57cec5SDimitry Andric   assert(MRI.getType(Src) == S64);
23490b57cec5SDimitry Andric 
23500b57cec5SDimitry Andric   // result = trunc(src)
23510b57cec5SDimitry Andric   // if (src > 0.0 && src != result)
23520b57cec5SDimitry Andric   //   result += 1.0
23530b57cec5SDimitry Andric 
23545ffd83dbSDimitry Andric   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
23550b57cec5SDimitry Andric 
23560b57cec5SDimitry Andric   const auto Zero = B.buildFConstant(S64, 0.0);
23570b57cec5SDimitry Andric   const auto One = B.buildFConstant(S64, 1.0);
23580b57cec5SDimitry Andric   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
23590b57cec5SDimitry Andric   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
23600b57cec5SDimitry Andric   auto And = B.buildAnd(S1, Lt0, NeTrunc);
23610b57cec5SDimitry Andric   auto Add = B.buildSelect(S64, And, One, Zero);
23620b57cec5SDimitry Andric 
23630b57cec5SDimitry Andric   // TODO: Should this propagate fast-math-flags?
23640b57cec5SDimitry Andric   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
236504eeddc0SDimitry Andric   MI.eraseFromParent();
23660b57cec5SDimitry Andric   return true;
23670b57cec5SDimitry Andric }
23680b57cec5SDimitry Andric 
2369e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeFrem(
2370e8d8bef9SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
2371e8d8bef9SDimitry Andric   MachineIRBuilder &B) const {
2372e8d8bef9SDimitry Andric     Register DstReg = MI.getOperand(0).getReg();
2373e8d8bef9SDimitry Andric     Register Src0Reg = MI.getOperand(1).getReg();
2374e8d8bef9SDimitry Andric     Register Src1Reg = MI.getOperand(2).getReg();
2375e8d8bef9SDimitry Andric     auto Flags = MI.getFlags();
2376e8d8bef9SDimitry Andric     LLT Ty = MRI.getType(DstReg);
2377e8d8bef9SDimitry Andric 
2378e8d8bef9SDimitry Andric     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2379e8d8bef9SDimitry Andric     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2380e8d8bef9SDimitry Andric     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2381e8d8bef9SDimitry Andric     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2382e8d8bef9SDimitry Andric     MI.eraseFromParent();
2383e8d8bef9SDimitry Andric     return true;
2384e8d8bef9SDimitry Andric }
2385e8d8bef9SDimitry Andric 
2386e8d8bef9SDimitry Andric static MachineInstrBuilder extractF64Exponent(Register Hi,
23870b57cec5SDimitry Andric                                               MachineIRBuilder &B) {
23880b57cec5SDimitry Andric   const unsigned FractBits = 52;
23890b57cec5SDimitry Andric   const unsigned ExpBits = 11;
23900b57cec5SDimitry Andric   LLT S32 = LLT::scalar(32);
23910b57cec5SDimitry Andric 
23920b57cec5SDimitry Andric   auto Const0 = B.buildConstant(S32, FractBits - 32);
23930b57cec5SDimitry Andric   auto Const1 = B.buildConstant(S32, ExpBits);
23940b57cec5SDimitry Andric 
2395*5f757f3fSDimitry Andric   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
2396e8d8bef9SDimitry Andric                      .addUse(Hi)
23970b57cec5SDimitry Andric                      .addUse(Const0.getReg(0))
23980b57cec5SDimitry Andric                      .addUse(Const1.getReg(0));
23990b57cec5SDimitry Andric 
24000b57cec5SDimitry Andric   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
24010b57cec5SDimitry Andric }
24020b57cec5SDimitry Andric 
24030b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
24040b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
24050b57cec5SDimitry Andric   MachineIRBuilder &B) const {
24060b57cec5SDimitry Andric   const LLT S1 = LLT::scalar(1);
24070b57cec5SDimitry Andric   const LLT S32 = LLT::scalar(32);
24080b57cec5SDimitry Andric   const LLT S64 = LLT::scalar(64);
24090b57cec5SDimitry Andric 
24100b57cec5SDimitry Andric   Register Src = MI.getOperand(1).getReg();
24110b57cec5SDimitry Andric   assert(MRI.getType(Src) == S64);
24120b57cec5SDimitry Andric 
24130b57cec5SDimitry Andric   // TODO: Should this use extract since the low half is unused?
24140b57cec5SDimitry Andric   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
24150b57cec5SDimitry Andric   Register Hi = Unmerge.getReg(1);
24160b57cec5SDimitry Andric 
24170b57cec5SDimitry Andric   // Extract the upper half, since this is where we will find the sign and
24180b57cec5SDimitry Andric   // exponent.
24190b57cec5SDimitry Andric   auto Exp = extractF64Exponent(Hi, B);
24200b57cec5SDimitry Andric 
24210b57cec5SDimitry Andric   const unsigned FractBits = 52;
24220b57cec5SDimitry Andric 
24230b57cec5SDimitry Andric   // Extract the sign bit.
24240b57cec5SDimitry Andric   const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
24250b57cec5SDimitry Andric   auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
24260b57cec5SDimitry Andric 
24270b57cec5SDimitry Andric   const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
24280b57cec5SDimitry Andric 
24290b57cec5SDimitry Andric   const auto Zero32 = B.buildConstant(S32, 0);
24300b57cec5SDimitry Andric 
24310b57cec5SDimitry Andric   // Extend back to 64-bits.
2432bdd1243dSDimitry Andric   auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});
24330b57cec5SDimitry Andric 
24340b57cec5SDimitry Andric   auto Shr = B.buildAShr(S64, FractMask, Exp);
24350b57cec5SDimitry Andric   auto Not = B.buildNot(S64, Shr);
24360b57cec5SDimitry Andric   auto Tmp0 = B.buildAnd(S64, Src, Not);
24370b57cec5SDimitry Andric   auto FiftyOne = B.buildConstant(S32, FractBits - 1);
24380b57cec5SDimitry Andric 
24390b57cec5SDimitry Andric   auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
24400b57cec5SDimitry Andric   auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
24410b57cec5SDimitry Andric 
24420b57cec5SDimitry Andric   auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
24430b57cec5SDimitry Andric   B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2444e8d8bef9SDimitry Andric   MI.eraseFromParent();
24450b57cec5SDimitry Andric   return true;
24460b57cec5SDimitry Andric }
24470b57cec5SDimitry Andric 
24480b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeITOFP(
24490b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
24500b57cec5SDimitry Andric   MachineIRBuilder &B, bool Signed) const {
24510b57cec5SDimitry Andric 
24520b57cec5SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
24530b57cec5SDimitry Andric   Register Src = MI.getOperand(1).getReg();
24540b57cec5SDimitry Andric 
24550b57cec5SDimitry Andric   const LLT S64 = LLT::scalar(64);
24560b57cec5SDimitry Andric   const LLT S32 = LLT::scalar(32);
24570b57cec5SDimitry Andric 
2458349cc55cSDimitry Andric   assert(MRI.getType(Src) == S64);
24590b57cec5SDimitry Andric 
24600b57cec5SDimitry Andric   auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2461349cc55cSDimitry Andric   auto ThirtyTwo = B.buildConstant(S32, 32);
24620b57cec5SDimitry Andric 
2463349cc55cSDimitry Andric   if (MRI.getType(Dst) == S64) {
2464349cc55cSDimitry Andric     auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2465349cc55cSDimitry Andric                         : B.buildUITOFP(S64, Unmerge.getReg(1));
24660b57cec5SDimitry Andric 
24670b57cec5SDimitry Andric     auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
246806c3fb27SDimitry Andric     auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);
24690b57cec5SDimitry Andric 
24700b57cec5SDimitry Andric     // TODO: Should this propagate fast-math-flags?
24710b57cec5SDimitry Andric     B.buildFAdd(Dst, LdExp, CvtLo);
24720b57cec5SDimitry Andric     MI.eraseFromParent();
24730b57cec5SDimitry Andric     return true;
24740b57cec5SDimitry Andric   }
24750b57cec5SDimitry Andric 
2476349cc55cSDimitry Andric   assert(MRI.getType(Dst) == S32);
2477349cc55cSDimitry Andric 
2478349cc55cSDimitry Andric   auto One = B.buildConstant(S32, 1);
2479349cc55cSDimitry Andric 
2480349cc55cSDimitry Andric   MachineInstrBuilder ShAmt;
2481349cc55cSDimitry Andric   if (Signed) {
2482349cc55cSDimitry Andric     auto ThirtyOne = B.buildConstant(S32, 31);
2483349cc55cSDimitry Andric     auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2484349cc55cSDimitry Andric     auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2485349cc55cSDimitry Andric     auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2486*5f757f3fSDimitry Andric     auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
2487349cc55cSDimitry Andric                   .addUse(Unmerge.getReg(1));
2488349cc55cSDimitry Andric     auto LS2 = B.buildSub(S32, LS, One);
2489349cc55cSDimitry Andric     ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2490349cc55cSDimitry Andric   } else
2491349cc55cSDimitry Andric     ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2492349cc55cSDimitry Andric   auto Norm = B.buildShl(S64, Src, ShAmt);
2493349cc55cSDimitry Andric   auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2494349cc55cSDimitry Andric   auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2495349cc55cSDimitry Andric   auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2496349cc55cSDimitry Andric   auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2497349cc55cSDimitry Andric   auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
249806c3fb27SDimitry Andric   B.buildFLdexp(Dst, FVal, Scale);
2499349cc55cSDimitry Andric   MI.eraseFromParent();
2500349cc55cSDimitry Andric   return true;
2501349cc55cSDimitry Andric }
2502349cc55cSDimitry Andric 
25035ffd83dbSDimitry Andric // TODO: Copied from DAG implementation. Verify logic and document how this
25045ffd83dbSDimitry Andric // actually works.
2505fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2506fe6060f1SDimitry Andric                                         MachineRegisterInfo &MRI,
2507fe6060f1SDimitry Andric                                         MachineIRBuilder &B,
2508fe6060f1SDimitry Andric                                         bool Signed) const {
25095ffd83dbSDimitry Andric 
25105ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
25115ffd83dbSDimitry Andric   Register Src = MI.getOperand(1).getReg();
25125ffd83dbSDimitry Andric 
25135ffd83dbSDimitry Andric   const LLT S64 = LLT::scalar(64);
25145ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
25155ffd83dbSDimitry Andric 
2516fe6060f1SDimitry Andric   const LLT SrcLT = MRI.getType(Src);
2517fe6060f1SDimitry Andric   assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
25185ffd83dbSDimitry Andric 
25195ffd83dbSDimitry Andric   unsigned Flags = MI.getFlags();
25205ffd83dbSDimitry Andric 
2521fe6060f1SDimitry Andric   // The basic idea of converting a floating point number into a pair of 32-bit
2522fe6060f1SDimitry Andric   // integers is illustrated as follows:
2523fe6060f1SDimitry Andric   //
2524fe6060f1SDimitry Andric   //     tf := trunc(val);
2525fe6060f1SDimitry Andric   //    hif := floor(tf * 2^-32);
2526fe6060f1SDimitry Andric   //    lof := tf - hif * 2^32; // lof is always positive due to floor.
2527fe6060f1SDimitry Andric   //     hi := fptoi(hif);
2528fe6060f1SDimitry Andric   //     lo := fptoi(lof);
2529fe6060f1SDimitry Andric   //
2530fe6060f1SDimitry Andric   auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2531fe6060f1SDimitry Andric   MachineInstrBuilder Sign;
2532fe6060f1SDimitry Andric   if (Signed && SrcLT == S32) {
2533fe6060f1SDimitry Andric     // However, a 32-bit floating point number has only 23 bits mantissa and
2534fe6060f1SDimitry Andric     // it's not enough to hold all the significant bits of `lof` if val is
2535fe6060f1SDimitry Andric     // negative. To avoid the loss of precision, We need to take the absolute
2536fe6060f1SDimitry Andric     // value after truncating and flip the result back based on the original
2537fe6060f1SDimitry Andric     // signedness.
2538fe6060f1SDimitry Andric     Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2539fe6060f1SDimitry Andric     Trunc = B.buildFAbs(S32, Trunc, Flags);
2540fe6060f1SDimitry Andric   }
2541fe6060f1SDimitry Andric   MachineInstrBuilder K0, K1;
2542fe6060f1SDimitry Andric   if (SrcLT == S64) {
254306c3fb27SDimitry Andric     K0 = B.buildFConstant(
254406c3fb27SDimitry Andric         S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
254506c3fb27SDimitry Andric     K1 = B.buildFConstant(
254606c3fb27SDimitry Andric         S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2547fe6060f1SDimitry Andric   } else {
254806c3fb27SDimitry Andric     K0 = B.buildFConstant(
254906c3fb27SDimitry Andric         S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
255006c3fb27SDimitry Andric     K1 = B.buildFConstant(
255106c3fb27SDimitry Andric         S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2552fe6060f1SDimitry Andric   }
25535ffd83dbSDimitry Andric 
2554fe6060f1SDimitry Andric   auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2555fe6060f1SDimitry Andric   auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2556fe6060f1SDimitry Andric   auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
25575ffd83dbSDimitry Andric 
2558fe6060f1SDimitry Andric   auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2559fe6060f1SDimitry Andric                                      : B.buildFPTOUI(S32, FloorMul);
25605ffd83dbSDimitry Andric   auto Lo = B.buildFPTOUI(S32, Fma);
25615ffd83dbSDimitry Andric 
2562fe6060f1SDimitry Andric   if (Signed && SrcLT == S32) {
2563fe6060f1SDimitry Andric     // Flip the result based on the signedness, which is either all 0s or 1s.
2564bdd1243dSDimitry Andric     Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
2565fe6060f1SDimitry Andric     // r := xor({lo, hi}, sign) - sign;
2566bdd1243dSDimitry Andric     B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
2567bdd1243dSDimitry Andric                Sign);
2568fe6060f1SDimitry Andric   } else
2569bdd1243dSDimitry Andric     B.buildMergeLikeInstr(Dst, {Lo, Hi});
25705ffd83dbSDimitry Andric   MI.eraseFromParent();
25715ffd83dbSDimitry Andric 
25725ffd83dbSDimitry Andric   return true;
25735ffd83dbSDimitry Andric }
25745ffd83dbSDimitry Andric 
25755ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
25765ffd83dbSDimitry Andric                                                MachineInstr &MI) const {
25775ffd83dbSDimitry Andric   MachineFunction &MF = Helper.MIRBuilder.getMF();
25780b57cec5SDimitry Andric   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
25790b57cec5SDimitry Andric 
25800b57cec5SDimitry Andric   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
25810b57cec5SDimitry Andric                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
25820b57cec5SDimitry Andric 
25830b57cec5SDimitry Andric   // With ieee_mode disabled, the instructions have the correct behavior
25840b57cec5SDimitry Andric   // already for G_FMINNUM/G_FMAXNUM
25850b57cec5SDimitry Andric   if (!MFI->getMode().IEEE)
25860b57cec5SDimitry Andric     return !IsIEEEOp;
25870b57cec5SDimitry Andric 
25880b57cec5SDimitry Andric   if (IsIEEEOp)
25890b57cec5SDimitry Andric     return true;
25900b57cec5SDimitry Andric 
25910b57cec5SDimitry Andric   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
25920b57cec5SDimitry Andric }
25930b57cec5SDimitry Andric 
25940b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
25950b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
25960b57cec5SDimitry Andric   MachineIRBuilder &B) const {
25970b57cec5SDimitry Andric   // TODO: Should move some of this into LegalizerHelper.
25980b57cec5SDimitry Andric 
25990b57cec5SDimitry Andric   // TODO: Promote dynamic indexing of s16 to s32
26005ffd83dbSDimitry Andric 
260106c3fb27SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
260206c3fb27SDimitry Andric   Register Vec = MI.getOperand(1).getReg();
260306c3fb27SDimitry Andric 
260406c3fb27SDimitry Andric   LLT VecTy = MRI.getType(Vec);
260506c3fb27SDimitry Andric   LLT EltTy = VecTy.getElementType();
260606c3fb27SDimitry Andric   assert(EltTy == MRI.getType(Dst));
260706c3fb27SDimitry Andric 
260806c3fb27SDimitry Andric   // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
260906c3fb27SDimitry Andric   // but we can't go directly to that logic becasue you can't bitcast a vector
261006c3fb27SDimitry Andric   // of pointers to a vector of integers. Therefore, introduce an intermediate
261106c3fb27SDimitry Andric   // vector of integers using ptrtoint (and inttoptr on the output) in order to
261206c3fb27SDimitry Andric   // drive the legalization forward.
261306c3fb27SDimitry Andric   if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
261406c3fb27SDimitry Andric     LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
261506c3fb27SDimitry Andric     LLT IntVecTy = VecTy.changeElementType(IntTy);
261606c3fb27SDimitry Andric 
261706c3fb27SDimitry Andric     auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
261806c3fb27SDimitry Andric     auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
261906c3fb27SDimitry Andric     B.buildIntToPtr(Dst, IntElt);
262006c3fb27SDimitry Andric 
262106c3fb27SDimitry Andric     MI.eraseFromParent();
262206c3fb27SDimitry Andric     return true;
262306c3fb27SDimitry Andric   }
262406c3fb27SDimitry Andric 
26255ffd83dbSDimitry Andric   // FIXME: Artifact combiner probably should have replaced the truncated
26265ffd83dbSDimitry Andric   // constant before this, so we shouldn't need
2627349cc55cSDimitry Andric   // getIConstantVRegValWithLookThrough.
2628bdd1243dSDimitry Andric   std::optional<ValueAndVReg> MaybeIdxVal =
2629349cc55cSDimitry Andric       getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2630e8d8bef9SDimitry Andric   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
26310b57cec5SDimitry Andric     return true;
2632bdd1243dSDimitry Andric   const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
26330b57cec5SDimitry Andric 
263404eeddc0SDimitry Andric   if (IdxVal < VecTy.getNumElements()) {
263504eeddc0SDimitry Andric     auto Unmerge = B.buildUnmerge(EltTy, Vec);
263604eeddc0SDimitry Andric     B.buildCopy(Dst, Unmerge.getReg(IdxVal));
263704eeddc0SDimitry Andric   } else {
26380b57cec5SDimitry Andric     B.buildUndef(Dst);
263904eeddc0SDimitry Andric   }
26400b57cec5SDimitry Andric 
26410b57cec5SDimitry Andric   MI.eraseFromParent();
26420b57cec5SDimitry Andric   return true;
26430b57cec5SDimitry Andric }
26440b57cec5SDimitry Andric 
26450b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
26460b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
26470b57cec5SDimitry Andric   MachineIRBuilder &B) const {
26480b57cec5SDimitry Andric   // TODO: Should move some of this into LegalizerHelper.
26490b57cec5SDimitry Andric 
26500b57cec5SDimitry Andric   // TODO: Promote dynamic indexing of s16 to s32
26515ffd83dbSDimitry Andric 
265206c3fb27SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
265306c3fb27SDimitry Andric   Register Vec = MI.getOperand(1).getReg();
265406c3fb27SDimitry Andric   Register Ins = MI.getOperand(2).getReg();
265506c3fb27SDimitry Andric 
265606c3fb27SDimitry Andric   LLT VecTy = MRI.getType(Vec);
265706c3fb27SDimitry Andric   LLT EltTy = VecTy.getElementType();
265806c3fb27SDimitry Andric   assert(EltTy == MRI.getType(Ins));
265906c3fb27SDimitry Andric 
266006c3fb27SDimitry Andric   // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
266106c3fb27SDimitry Andric   // but we can't go directly to that logic becasue you can't bitcast a vector
266206c3fb27SDimitry Andric   // of pointers to a vector of integers. Therefore, make the pointer vector
266306c3fb27SDimitry Andric   // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
266406c3fb27SDimitry Andric   // new value, and then inttoptr the result vector back. This will then allow
266506c3fb27SDimitry Andric   // the rest of legalization to take over.
266606c3fb27SDimitry Andric   if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
266706c3fb27SDimitry Andric     LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
266806c3fb27SDimitry Andric     LLT IntVecTy = VecTy.changeElementType(IntTy);
266906c3fb27SDimitry Andric 
267006c3fb27SDimitry Andric     auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
267106c3fb27SDimitry Andric     auto IntIns = B.buildPtrToInt(IntTy, Ins);
267206c3fb27SDimitry Andric     auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
267306c3fb27SDimitry Andric                                                  MI.getOperand(3));
267406c3fb27SDimitry Andric     B.buildIntToPtr(Dst, IntVecDest);
267506c3fb27SDimitry Andric     MI.eraseFromParent();
267606c3fb27SDimitry Andric     return true;
267706c3fb27SDimitry Andric   }
267806c3fb27SDimitry Andric 
26795ffd83dbSDimitry Andric   // FIXME: Artifact combiner probably should have replaced the truncated
26805ffd83dbSDimitry Andric   // constant before this, so we shouldn't need
2681349cc55cSDimitry Andric   // getIConstantVRegValWithLookThrough.
2682bdd1243dSDimitry Andric   std::optional<ValueAndVReg> MaybeIdxVal =
2683349cc55cSDimitry Andric       getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
2684e8d8bef9SDimitry Andric   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
26850b57cec5SDimitry Andric     return true;
26860b57cec5SDimitry Andric 
2687bdd1243dSDimitry Andric   const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
26880b57cec5SDimitry Andric 
268904eeddc0SDimitry Andric   unsigned NumElts = VecTy.getNumElements();
269004eeddc0SDimitry Andric   if (IdxVal < NumElts) {
269104eeddc0SDimitry Andric     SmallVector<Register, 8> SrcRegs;
269204eeddc0SDimitry Andric     for (unsigned i = 0; i < NumElts; ++i)
269304eeddc0SDimitry Andric       SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
269404eeddc0SDimitry Andric     B.buildUnmerge(SrcRegs, Vec);
269504eeddc0SDimitry Andric 
269604eeddc0SDimitry Andric     SrcRegs[IdxVal] = MI.getOperand(2).getReg();
2697bdd1243dSDimitry Andric     B.buildMergeLikeInstr(Dst, SrcRegs);
269804eeddc0SDimitry Andric   } else {
26990b57cec5SDimitry Andric     B.buildUndef(Dst);
270004eeddc0SDimitry Andric   }
27010b57cec5SDimitry Andric 
27020b57cec5SDimitry Andric   MI.eraseFromParent();
27030b57cec5SDimitry Andric   return true;
27040b57cec5SDimitry Andric }
27050b57cec5SDimitry Andric 
27068bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeSinCos(
27078bcb0991SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
27088bcb0991SDimitry Andric   MachineIRBuilder &B) const {
27098bcb0991SDimitry Andric 
27108bcb0991SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
27118bcb0991SDimitry Andric   Register SrcReg = MI.getOperand(1).getReg();
27128bcb0991SDimitry Andric   LLT Ty = MRI.getType(DstReg);
27138bcb0991SDimitry Andric   unsigned Flags = MI.getFlags();
27148bcb0991SDimitry Andric 
27158bcb0991SDimitry Andric   Register TrigVal;
27165ffd83dbSDimitry Andric   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
27178bcb0991SDimitry Andric   if (ST.hasTrigReducedRange()) {
27188bcb0991SDimitry Andric     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
2719*5f757f3fSDimitry Andric     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
27208bcb0991SDimitry Andric                   .addUse(MulVal.getReg(0))
2721*5f757f3fSDimitry Andric                   .setMIFlags(Flags)
2722*5f757f3fSDimitry Andric                   .getReg(0);
27238bcb0991SDimitry Andric   } else
27248bcb0991SDimitry Andric     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
27258bcb0991SDimitry Andric 
27268bcb0991SDimitry Andric   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
27278bcb0991SDimitry Andric     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
2728*5f757f3fSDimitry Andric   B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg))
27298bcb0991SDimitry Andric       .addUse(TrigVal)
27308bcb0991SDimitry Andric       .setMIFlags(Flags);
27318bcb0991SDimitry Andric   MI.eraseFromParent();
27328bcb0991SDimitry Andric   return true;
27338bcb0991SDimitry Andric }
27348bcb0991SDimitry Andric 
27355ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
27365ffd83dbSDimitry Andric                                                   MachineIRBuilder &B,
27375ffd83dbSDimitry Andric                                                   const GlobalValue *GV,
27385ffd83dbSDimitry Andric                                                   int64_t Offset,
27395ffd83dbSDimitry Andric                                                   unsigned GAFlags) const {
27405ffd83dbSDimitry Andric   assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
27418bcb0991SDimitry Andric   // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
27428bcb0991SDimitry Andric   // to the following code sequence:
27438bcb0991SDimitry Andric   //
27448bcb0991SDimitry Andric   // For constant address space:
27458bcb0991SDimitry Andric   //   s_getpc_b64 s[0:1]
27468bcb0991SDimitry Andric   //   s_add_u32 s0, s0, $symbol
27478bcb0991SDimitry Andric   //   s_addc_u32 s1, s1, 0
27488bcb0991SDimitry Andric   //
27498bcb0991SDimitry Andric   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
27508bcb0991SDimitry Andric   //   a fixup or relocation is emitted to replace $symbol with a literal
27518bcb0991SDimitry Andric   //   constant, which is a pc-relative offset from the encoding of the $symbol
27528bcb0991SDimitry Andric   //   operand to the global variable.
27538bcb0991SDimitry Andric   //
27548bcb0991SDimitry Andric   // For global address space:
27558bcb0991SDimitry Andric   //   s_getpc_b64 s[0:1]
27568bcb0991SDimitry Andric   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
27578bcb0991SDimitry Andric   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
27588bcb0991SDimitry Andric   //
27598bcb0991SDimitry Andric   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
27608bcb0991SDimitry Andric   //   fixups or relocations are emitted to replace $symbol@*@lo and
27618bcb0991SDimitry Andric   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
27628bcb0991SDimitry Andric   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
27638bcb0991SDimitry Andric   //   operand to the global variable.
27648bcb0991SDimitry Andric 
27658bcb0991SDimitry Andric   LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
27668bcb0991SDimitry Andric 
27678bcb0991SDimitry Andric   Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
27688bcb0991SDimitry Andric     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
27698bcb0991SDimitry Andric 
27708bcb0991SDimitry Andric   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
27718bcb0991SDimitry Andric     .addDef(PCReg);
27728bcb0991SDimitry Andric 
2773*5f757f3fSDimitry Andric   MIB.addGlobalAddress(GV, Offset, GAFlags);
27748bcb0991SDimitry Andric   if (GAFlags == SIInstrInfo::MO_NONE)
27758bcb0991SDimitry Andric     MIB.addImm(0);
27768bcb0991SDimitry Andric   else
2777*5f757f3fSDimitry Andric     MIB.addGlobalAddress(GV, Offset, GAFlags + 1);
27788bcb0991SDimitry Andric 
277906c3fb27SDimitry Andric   if (!B.getMRI()->getRegClassOrNull(PCReg))
27808bcb0991SDimitry Andric     B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
27818bcb0991SDimitry Andric 
27828bcb0991SDimitry Andric   if (PtrTy.getSizeInBits() == 32)
27838bcb0991SDimitry Andric     B.buildExtract(DstReg, PCReg, 0);
27848bcb0991SDimitry Andric   return true;
27858bcb0991SDimitry Andric }
27868bcb0991SDimitry Andric 
2787*5f757f3fSDimitry Andric // Emit a ABS32_LO / ABS32_HI relocation stub.
2788*5f757f3fSDimitry Andric void AMDGPULegalizerInfo::buildAbsGlobalAddress(
2789*5f757f3fSDimitry Andric     Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
2790*5f757f3fSDimitry Andric     MachineRegisterInfo &MRI) const {
2791*5f757f3fSDimitry Andric   bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;
2792*5f757f3fSDimitry Andric 
2793*5f757f3fSDimitry Andric   LLT S32 = LLT::scalar(32);
2794*5f757f3fSDimitry Andric 
2795*5f757f3fSDimitry Andric   // Use the destination directly, if and only if we store the lower address
2796*5f757f3fSDimitry Andric   // part only and we don't have a register class being set.
2797*5f757f3fSDimitry Andric   Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
2798*5f757f3fSDimitry Andric                         ? DstReg
2799*5f757f3fSDimitry Andric                         : MRI.createGenericVirtualRegister(S32);
2800*5f757f3fSDimitry Andric 
2801*5f757f3fSDimitry Andric   if (!MRI.getRegClassOrNull(AddrLo))
2802*5f757f3fSDimitry Andric     MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
2803*5f757f3fSDimitry Andric 
2804*5f757f3fSDimitry Andric   // Write the lower half.
2805*5f757f3fSDimitry Andric   B.buildInstr(AMDGPU::S_MOV_B32)
2806*5f757f3fSDimitry Andric       .addDef(AddrLo)
2807*5f757f3fSDimitry Andric       .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
2808*5f757f3fSDimitry Andric 
2809*5f757f3fSDimitry Andric   // If required, write the upper half as well.
2810*5f757f3fSDimitry Andric   if (RequiresHighHalf) {
2811*5f757f3fSDimitry Andric     assert(PtrTy.getSizeInBits() == 64 &&
2812*5f757f3fSDimitry Andric            "Must provide a 64-bit pointer type!");
2813*5f757f3fSDimitry Andric 
2814*5f757f3fSDimitry Andric     Register AddrHi = MRI.createGenericVirtualRegister(S32);
2815*5f757f3fSDimitry Andric     MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
2816*5f757f3fSDimitry Andric 
2817*5f757f3fSDimitry Andric     B.buildInstr(AMDGPU::S_MOV_B32)
2818*5f757f3fSDimitry Andric         .addDef(AddrHi)
2819*5f757f3fSDimitry Andric         .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_HI);
2820*5f757f3fSDimitry Andric 
2821*5f757f3fSDimitry Andric     // Use the destination directly, if and only if we don't have a register
2822*5f757f3fSDimitry Andric     // class being set.
2823*5f757f3fSDimitry Andric     Register AddrDst = !MRI.getRegClassOrNull(DstReg)
2824*5f757f3fSDimitry Andric                            ? DstReg
2825*5f757f3fSDimitry Andric                            : MRI.createGenericVirtualRegister(LLT::scalar(64));
2826*5f757f3fSDimitry Andric 
2827*5f757f3fSDimitry Andric     if (!MRI.getRegClassOrNull(AddrDst))
2828*5f757f3fSDimitry Andric       MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
2829*5f757f3fSDimitry Andric 
2830*5f757f3fSDimitry Andric     B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
2831*5f757f3fSDimitry Andric 
2832*5f757f3fSDimitry Andric     // If we created a new register for the destination, cast the result into
2833*5f757f3fSDimitry Andric     // the final output.
2834*5f757f3fSDimitry Andric     if (AddrDst != DstReg)
2835*5f757f3fSDimitry Andric       B.buildCast(DstReg, AddrDst);
2836*5f757f3fSDimitry Andric   } else if (AddrLo != DstReg) {
2837*5f757f3fSDimitry Andric     // If we created a new register for the destination, cast the result into
2838*5f757f3fSDimitry Andric     // the final output.
2839*5f757f3fSDimitry Andric     B.buildCast(DstReg, AddrLo);
2840*5f757f3fSDimitry Andric   }
2841*5f757f3fSDimitry Andric }
2842*5f757f3fSDimitry Andric 
28438bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeGlobalValue(
28448bcb0991SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
28458bcb0991SDimitry Andric   MachineIRBuilder &B) const {
28468bcb0991SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
28478bcb0991SDimitry Andric   LLT Ty = MRI.getType(DstReg);
28488bcb0991SDimitry Andric   unsigned AS = Ty.getAddressSpace();
28498bcb0991SDimitry Andric 
28508bcb0991SDimitry Andric   const GlobalValue *GV = MI.getOperand(1).getGlobal();
28518bcb0991SDimitry Andric   MachineFunction &MF = B.getMF();
28528bcb0991SDimitry Andric   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
28538bcb0991SDimitry Andric 
28548bcb0991SDimitry Andric   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2855fe6060f1SDimitry Andric     if (!MFI->isModuleEntryFunction() &&
2856fe6060f1SDimitry Andric         !GV->getName().equals("llvm.amdgcn.module.lds")) {
28578bcb0991SDimitry Andric       const Function &Fn = MF.getFunction();
28588bcb0991SDimitry Andric       DiagnosticInfoUnsupported BadLDSDecl(
28595ffd83dbSDimitry Andric         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
28605ffd83dbSDimitry Andric         DS_Warning);
28618bcb0991SDimitry Andric       Fn.getContext().diagnose(BadLDSDecl);
28625ffd83dbSDimitry Andric 
28635ffd83dbSDimitry Andric       // We currently don't have a way to correctly allocate LDS objects that
28645ffd83dbSDimitry Andric       // aren't directly associated with a kernel. We do force inlining of
28655ffd83dbSDimitry Andric       // functions that use local objects. However, if these dead functions are
28665ffd83dbSDimitry Andric       // not eliminated, we don't want a compile time error. Just emit a warning
28675ffd83dbSDimitry Andric       // and a trap, since there should be no callable path here.
2868*5f757f3fSDimitry Andric       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>());
28695ffd83dbSDimitry Andric       B.buildUndef(DstReg);
28705ffd83dbSDimitry Andric       MI.eraseFromParent();
28715ffd83dbSDimitry Andric       return true;
28728bcb0991SDimitry Andric     }
28738bcb0991SDimitry Andric 
28748bcb0991SDimitry Andric     // TODO: We could emit code to handle the initialization somewhere.
2875349cc55cSDimitry Andric     // We ignore the initializer for now and legalize it to allow selection.
2876349cc55cSDimitry Andric     // The initializer will anyway get errored out during assembly emission.
28775ffd83dbSDimitry Andric     const SITargetLowering *TLI = ST.getTargetLowering();
28785ffd83dbSDimitry Andric     if (!TLI->shouldUseLDSConstAddress(GV)) {
28795ffd83dbSDimitry Andric       MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
28805ffd83dbSDimitry Andric       return true; // Leave in place;
28815ffd83dbSDimitry Andric     }
28825ffd83dbSDimitry Andric 
2883e8d8bef9SDimitry Andric     if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2884e8d8bef9SDimitry Andric       Type *Ty = GV->getValueType();
2885e8d8bef9SDimitry Andric       // HIP uses an unsized array `extern __shared__ T s[]` or similar
2886e8d8bef9SDimitry Andric       // zero-sized type in other languages to declare the dynamic shared
2887e8d8bef9SDimitry Andric       // memory which size is not known at the compile time. They will be
2888e8d8bef9SDimitry Andric       // allocated by the runtime and placed directly after the static
2889e8d8bef9SDimitry Andric       // allocated ones. They all share the same offset.
2890e8d8bef9SDimitry Andric       if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2891e8d8bef9SDimitry Andric         // Adjust alignment for that dynamic shared memory array.
289206c3fb27SDimitry Andric         MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV));
2893e8d8bef9SDimitry Andric         LLT S32 = LLT::scalar(32);
2894*5f757f3fSDimitry Andric         auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
2895e8d8bef9SDimitry Andric         B.buildIntToPtr(DstReg, Sz);
2896e8d8bef9SDimitry Andric         MI.eraseFromParent();
2897e8d8bef9SDimitry Andric         return true;
2898e8d8bef9SDimitry Andric       }
2899e8d8bef9SDimitry Andric     }
2900e8d8bef9SDimitry Andric 
2901349cc55cSDimitry Andric     B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
2902349cc55cSDimitry Andric                                                    *cast<GlobalVariable>(GV)));
29038bcb0991SDimitry Andric     MI.eraseFromParent();
29048bcb0991SDimitry Andric     return true;
29058bcb0991SDimitry Andric   }
29068bcb0991SDimitry Andric 
2907*5f757f3fSDimitry Andric   if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
2908*5f757f3fSDimitry Andric     buildAbsGlobalAddress(DstReg, Ty, B, GV, MRI);
2909*5f757f3fSDimitry Andric     MI.eraseFromParent();
2910*5f757f3fSDimitry Andric     return true;
2911*5f757f3fSDimitry Andric   }
2912*5f757f3fSDimitry Andric 
29138bcb0991SDimitry Andric   const SITargetLowering *TLI = ST.getTargetLowering();
29148bcb0991SDimitry Andric 
29158bcb0991SDimitry Andric   if (TLI->shouldEmitFixup(GV)) {
29168bcb0991SDimitry Andric     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
29178bcb0991SDimitry Andric     MI.eraseFromParent();
29188bcb0991SDimitry Andric     return true;
29198bcb0991SDimitry Andric   }
29208bcb0991SDimitry Andric 
29218bcb0991SDimitry Andric   if (TLI->shouldEmitPCReloc(GV)) {
29228bcb0991SDimitry Andric     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
29238bcb0991SDimitry Andric     MI.eraseFromParent();
29248bcb0991SDimitry Andric     return true;
29258bcb0991SDimitry Andric   }
29268bcb0991SDimitry Andric 
29278bcb0991SDimitry Andric   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
29288bcb0991SDimitry Andric   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
29298bcb0991SDimitry Andric 
2930fe6060f1SDimitry Andric   LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
29318bcb0991SDimitry Andric   MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
29328bcb0991SDimitry Andric       MachinePointerInfo::getGOT(MF),
29338bcb0991SDimitry Andric       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
29348bcb0991SDimitry Andric           MachineMemOperand::MOInvariant,
2935fe6060f1SDimitry Andric       LoadTy, Align(8));
29368bcb0991SDimitry Andric 
29378bcb0991SDimitry Andric   buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
29388bcb0991SDimitry Andric 
29398bcb0991SDimitry Andric   if (Ty.getSizeInBits() == 32) {
2940349cc55cSDimitry Andric     // Truncate if this is a 32-bit constant address.
29418bcb0991SDimitry Andric     auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
29428bcb0991SDimitry Andric     B.buildExtract(DstReg, Load, 0);
29438bcb0991SDimitry Andric   } else
29448bcb0991SDimitry Andric     B.buildLoad(DstReg, GOTAddr, *GOTMMO);
29458bcb0991SDimitry Andric 
29468bcb0991SDimitry Andric   MI.eraseFromParent();
29478bcb0991SDimitry Andric   return true;
29488bcb0991SDimitry Andric }
29498bcb0991SDimitry Andric 
2950e8d8bef9SDimitry Andric static LLT widenToNextPowerOf2(LLT Ty) {
2951e8d8bef9SDimitry Andric   if (Ty.isVector())
2952fe6060f1SDimitry Andric     return Ty.changeElementCount(
2953fe6060f1SDimitry Andric         ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
2954e8d8bef9SDimitry Andric   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
2955e8d8bef9SDimitry Andric }
2956e8d8bef9SDimitry Andric 
2957e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
2958e8d8bef9SDimitry Andric                                        MachineInstr &MI) const {
2959e8d8bef9SDimitry Andric   MachineIRBuilder &B = Helper.MIRBuilder;
2960e8d8bef9SDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
2961e8d8bef9SDimitry Andric   GISelChangeObserver &Observer = Helper.Observer;
2962e8d8bef9SDimitry Andric 
2963e8d8bef9SDimitry Andric   Register PtrReg = MI.getOperand(1).getReg();
2964e8d8bef9SDimitry Andric   LLT PtrTy = MRI.getType(PtrReg);
2965e8d8bef9SDimitry Andric   unsigned AddrSpace = PtrTy.getAddressSpace();
2966e8d8bef9SDimitry Andric 
2967e8d8bef9SDimitry Andric   if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
29688bcb0991SDimitry Andric     LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
2969e8d8bef9SDimitry Andric     auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
29708bcb0991SDimitry Andric     Observer.changingInstr(MI);
29718bcb0991SDimitry Andric     MI.getOperand(1).setReg(Cast.getReg(0));
29728bcb0991SDimitry Andric     Observer.changedInstr(MI);
29738bcb0991SDimitry Andric     return true;
29748bcb0991SDimitry Andric   }
29758bcb0991SDimitry Andric 
2976fe6060f1SDimitry Andric   if (MI.getOpcode() != AMDGPU::G_LOAD)
2977fe6060f1SDimitry Andric     return false;
2978fe6060f1SDimitry Andric 
2979e8d8bef9SDimitry Andric   Register ValReg = MI.getOperand(0).getReg();
2980e8d8bef9SDimitry Andric   LLT ValTy = MRI.getType(ValReg);
2981e8d8bef9SDimitry Andric 
298206c3fb27SDimitry Andric   if (hasBufferRsrcWorkaround(ValTy)) {
298306c3fb27SDimitry Andric     Observer.changingInstr(MI);
298406c3fb27SDimitry Andric     castBufferRsrcFromV4I32(MI, B, MRI, 0);
298506c3fb27SDimitry Andric     Observer.changedInstr(MI);
298606c3fb27SDimitry Andric     return true;
298706c3fb27SDimitry Andric   }
298806c3fb27SDimitry Andric 
2989e8d8bef9SDimitry Andric   MachineMemOperand *MMO = *MI.memoperands_begin();
2990e8d8bef9SDimitry Andric   const unsigned ValSize = ValTy.getSizeInBits();
2991fe6060f1SDimitry Andric   const LLT MemTy = MMO->getMemoryType();
2992e8d8bef9SDimitry Andric   const Align MemAlign = MMO->getAlign();
2993fe6060f1SDimitry Andric   const unsigned MemSize = MemTy.getSizeInBits();
299404eeddc0SDimitry Andric   const uint64_t AlignInBits = 8 * MemAlign.value();
2995e8d8bef9SDimitry Andric 
2996e8d8bef9SDimitry Andric   // Widen non-power-of-2 loads to the alignment if needed
2997fe6060f1SDimitry Andric   if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
2998e8d8bef9SDimitry Andric     const unsigned WideMemSize = PowerOf2Ceil(MemSize);
2999e8d8bef9SDimitry Andric 
3000e8d8bef9SDimitry Andric     // This was already the correct extending load result type, so just adjust
3001e8d8bef9SDimitry Andric     // the memory type.
3002e8d8bef9SDimitry Andric     if (WideMemSize == ValSize) {
3003e8d8bef9SDimitry Andric       MachineFunction &MF = B.getMF();
3004e8d8bef9SDimitry Andric 
3005e8d8bef9SDimitry Andric       MachineMemOperand *WideMMO =
3006e8d8bef9SDimitry Andric           MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
3007e8d8bef9SDimitry Andric       Observer.changingInstr(MI);
3008e8d8bef9SDimitry Andric       MI.setMemRefs(MF, {WideMMO});
3009e8d8bef9SDimitry Andric       Observer.changedInstr(MI);
3010e8d8bef9SDimitry Andric       return true;
3011e8d8bef9SDimitry Andric     }
3012e8d8bef9SDimitry Andric 
3013e8d8bef9SDimitry Andric     // Don't bother handling edge case that should probably never be produced.
3014e8d8bef9SDimitry Andric     if (ValSize > WideMemSize)
3015e8d8bef9SDimitry Andric       return false;
3016e8d8bef9SDimitry Andric 
3017e8d8bef9SDimitry Andric     LLT WideTy = widenToNextPowerOf2(ValTy);
3018e8d8bef9SDimitry Andric 
3019e8d8bef9SDimitry Andric     Register WideLoad;
3020e8d8bef9SDimitry Andric     if (!WideTy.isVector()) {
3021e8d8bef9SDimitry Andric       WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3022e8d8bef9SDimitry Andric       B.buildTrunc(ValReg, WideLoad).getReg(0);
3023e8d8bef9SDimitry Andric     } else {
3024e8d8bef9SDimitry Andric       // Extract the subvector.
3025e8d8bef9SDimitry Andric 
3026e8d8bef9SDimitry Andric       if (isRegisterType(ValTy)) {
3027e8d8bef9SDimitry Andric         // If this a case where G_EXTRACT is legal, use it.
3028e8d8bef9SDimitry Andric         // (e.g. <3 x s32> -> <4 x s32>)
3029e8d8bef9SDimitry Andric         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3030e8d8bef9SDimitry Andric         B.buildExtract(ValReg, WideLoad, 0);
3031e8d8bef9SDimitry Andric       } else {
3032e8d8bef9SDimitry Andric         // For cases where the widened type isn't a nice register value, unmerge
3033e8d8bef9SDimitry Andric         // from a widened register (e.g. <3 x s16> -> <4 x s16>)
30340eae32dcSDimitry Andric         WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
30350eae32dcSDimitry Andric         B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
3036e8d8bef9SDimitry Andric       }
3037e8d8bef9SDimitry Andric     }
3038e8d8bef9SDimitry Andric 
3039e8d8bef9SDimitry Andric     MI.eraseFromParent();
3040e8d8bef9SDimitry Andric     return true;
3041e8d8bef9SDimitry Andric   }
3042e8d8bef9SDimitry Andric 
3043e8d8bef9SDimitry Andric   return false;
3044e8d8bef9SDimitry Andric }
3045e8d8bef9SDimitry Andric 
304606c3fb27SDimitry Andric bool AMDGPULegalizerInfo::legalizeStore(LegalizerHelper &Helper,
304706c3fb27SDimitry Andric                                         MachineInstr &MI) const {
304806c3fb27SDimitry Andric   MachineIRBuilder &B = Helper.MIRBuilder;
304906c3fb27SDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
305006c3fb27SDimitry Andric   GISelChangeObserver &Observer = Helper.Observer;
305106c3fb27SDimitry Andric 
305206c3fb27SDimitry Andric   Register DataReg = MI.getOperand(0).getReg();
305306c3fb27SDimitry Andric   LLT DataTy = MRI.getType(DataReg);
305406c3fb27SDimitry Andric 
305506c3fb27SDimitry Andric   if (hasBufferRsrcWorkaround(DataTy)) {
305606c3fb27SDimitry Andric     Observer.changingInstr(MI);
305706c3fb27SDimitry Andric     castBufferRsrcArgToV4I32(MI, B, 0);
305806c3fb27SDimitry Andric     Observer.changedInstr(MI);
305906c3fb27SDimitry Andric     return true;
306006c3fb27SDimitry Andric   }
306106c3fb27SDimitry Andric   return false;
306206c3fb27SDimitry Andric }
306306c3fb27SDimitry Andric 
30648bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFMad(
30658bcb0991SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
30668bcb0991SDimitry Andric   MachineIRBuilder &B) const {
30678bcb0991SDimitry Andric   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
30688bcb0991SDimitry Andric   assert(Ty.isScalar());
30698bcb0991SDimitry Andric 
3070480093f4SDimitry Andric   MachineFunction &MF = B.getMF();
3071480093f4SDimitry Andric   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
3072480093f4SDimitry Andric 
30738bcb0991SDimitry Andric   // TODO: Always legal with future ftz flag.
30745ffd83dbSDimitry Andric   // FIXME: Do we need just output?
3075*5f757f3fSDimitry Andric   if (Ty == LLT::float32() &&
307606c3fb27SDimitry Andric       MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign())
30778bcb0991SDimitry Andric     return true;
3078*5f757f3fSDimitry Andric   if (Ty == LLT::float16() &&
307906c3fb27SDimitry Andric       MFI->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign())
30808bcb0991SDimitry Andric     return true;
30818bcb0991SDimitry Andric 
30828bcb0991SDimitry Andric   MachineIRBuilder HelperBuilder(MI);
30838bcb0991SDimitry Andric   GISelObserverWrapper DummyObserver;
30848bcb0991SDimitry Andric   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
30858bcb0991SDimitry Andric   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
30868bcb0991SDimitry Andric }
30878bcb0991SDimitry Andric 
3088480093f4SDimitry Andric bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
3089480093f4SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3090480093f4SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
3091480093f4SDimitry Andric   Register PtrReg = MI.getOperand(1).getReg();
3092480093f4SDimitry Andric   Register CmpVal = MI.getOperand(2).getReg();
3093480093f4SDimitry Andric   Register NewVal = MI.getOperand(3).getReg();
3094480093f4SDimitry Andric 
3095e8d8bef9SDimitry Andric   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
3096480093f4SDimitry Andric          "this should not have been custom lowered");
3097480093f4SDimitry Andric 
3098480093f4SDimitry Andric   LLT ValTy = MRI.getType(CmpVal);
3099fe6060f1SDimitry Andric   LLT VecTy = LLT::fixed_vector(2, ValTy);
3100480093f4SDimitry Andric 
3101480093f4SDimitry Andric   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
3102480093f4SDimitry Andric 
3103480093f4SDimitry Andric   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3104480093f4SDimitry Andric     .addDef(DstReg)
3105480093f4SDimitry Andric     .addUse(PtrReg)
3106480093f4SDimitry Andric     .addUse(PackedVal)
3107480093f4SDimitry Andric     .setMemRefs(MI.memoperands());
3108480093f4SDimitry Andric 
3109480093f4SDimitry Andric   MI.eraseFromParent();
3110480093f4SDimitry Andric   return true;
3111480093f4SDimitry Andric }
3112480093f4SDimitry Andric 
311306c3fb27SDimitry Andric /// Return true if it's known that \p Src can never be an f32 denormal value.
311406c3fb27SDimitry Andric static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI,
311506c3fb27SDimitry Andric                                        Register Src) {
3116*5f757f3fSDimitry Andric   const MachineInstr *DefMI = MRI.getVRegDef(Src);
3117*5f757f3fSDimitry Andric   switch (DefMI->getOpcode()) {
3118*5f757f3fSDimitry Andric   case TargetOpcode::G_INTRINSIC: {
3119*5f757f3fSDimitry Andric     switch (cast<GIntrinsic>(DefMI)->getIntrinsicID()) {
3120*5f757f3fSDimitry Andric     case Intrinsic::amdgcn_frexp_mant:
3121*5f757f3fSDimitry Andric       return true;
3122*5f757f3fSDimitry Andric     default:
3123*5f757f3fSDimitry Andric       break;
3124*5f757f3fSDimitry Andric     }
3125*5f757f3fSDimitry Andric 
3126*5f757f3fSDimitry Andric     break;
3127*5f757f3fSDimitry Andric   }
3128*5f757f3fSDimitry Andric   case TargetOpcode::G_FFREXP: {
3129*5f757f3fSDimitry Andric     if (DefMI->getOperand(0).getReg() == Src)
3130*5f757f3fSDimitry Andric       return true;
3131*5f757f3fSDimitry Andric     break;
3132*5f757f3fSDimitry Andric   }
3133*5f757f3fSDimitry Andric   case TargetOpcode::G_FPEXT: {
3134*5f757f3fSDimitry Andric     return MRI.getType(DefMI->getOperand(1).getReg()) == LLT::scalar(16);
3135*5f757f3fSDimitry Andric   }
3136*5f757f3fSDimitry Andric   default:
3137*5f757f3fSDimitry Andric     return false;
3138*5f757f3fSDimitry Andric   }
3139*5f757f3fSDimitry Andric 
314006c3fb27SDimitry Andric   return false;
314106c3fb27SDimitry Andric }
314206c3fb27SDimitry Andric 
314306c3fb27SDimitry Andric static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
314406c3fb27SDimitry Andric   if (Flags & MachineInstr::FmAfn)
314506c3fb27SDimitry Andric     return true;
314606c3fb27SDimitry Andric   const auto &Options = MF.getTarget().Options;
314706c3fb27SDimitry Andric   return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
314806c3fb27SDimitry Andric }
314906c3fb27SDimitry Andric 
315006c3fb27SDimitry Andric static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src,
315106c3fb27SDimitry Andric                                    unsigned Flags) {
315206c3fb27SDimitry Andric   return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src) &&
315306c3fb27SDimitry Andric          MF.getDenormalMode(APFloat::IEEEsingle()).Input !=
315406c3fb27SDimitry Andric              DenormalMode::PreserveSign;
315506c3fb27SDimitry Andric }
315606c3fb27SDimitry Andric 
315706c3fb27SDimitry Andric std::pair<Register, Register>
315806c3fb27SDimitry Andric AMDGPULegalizerInfo::getScaledLogInput(MachineIRBuilder &B, Register Src,
315906c3fb27SDimitry Andric                                        unsigned Flags) const {
31608a4dda33SDimitry Andric   if (!needsDenormHandlingF32(B.getMF(), Src, Flags))
316106c3fb27SDimitry Andric     return {};
316206c3fb27SDimitry Andric 
316306c3fb27SDimitry Andric   const LLT F32 = LLT::scalar(32);
316406c3fb27SDimitry Andric   auto SmallestNormal = B.buildFConstant(
316506c3fb27SDimitry Andric       F32, APFloat::getSmallestNormalized(APFloat::IEEEsingle()));
316606c3fb27SDimitry Andric   auto IsLtSmallestNormal =
316706c3fb27SDimitry Andric       B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal);
316806c3fb27SDimitry Andric 
316906c3fb27SDimitry Andric   auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
317006c3fb27SDimitry Andric   auto One = B.buildFConstant(F32, 1.0);
317106c3fb27SDimitry Andric   auto ScaleFactor =
317206c3fb27SDimitry Andric       B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
317306c3fb27SDimitry Andric   auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);
317406c3fb27SDimitry Andric 
317506c3fb27SDimitry Andric   return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
317606c3fb27SDimitry Andric }
317706c3fb27SDimitry Andric 
317806c3fb27SDimitry Andric bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI,
317906c3fb27SDimitry Andric                                         MachineIRBuilder &B) const {
318006c3fb27SDimitry Andric   // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
318106c3fb27SDimitry Andric   // If we have to handle denormals, scale up the input and adjust the result.
318206c3fb27SDimitry Andric 
318306c3fb27SDimitry Andric   // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
318406c3fb27SDimitry Andric   // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
318506c3fb27SDimitry Andric 
31865ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
31875ffd83dbSDimitry Andric   Register Src = MI.getOperand(1).getReg();
31885ffd83dbSDimitry Andric   LLT Ty = B.getMRI()->getType(Dst);
31895ffd83dbSDimitry Andric   unsigned Flags = MI.getFlags();
31905ffd83dbSDimitry Andric 
319106c3fb27SDimitry Andric   if (Ty == LLT::scalar(16)) {
319206c3fb27SDimitry Andric     const LLT F32 = LLT::scalar(32);
319306c3fb27SDimitry Andric     // Nothing in half is a denormal when promoted to f32.
319406c3fb27SDimitry Andric     auto Ext = B.buildFPExt(F32, Src, Flags);
3195*5f757f3fSDimitry Andric     auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
319606c3fb27SDimitry Andric                     .addUse(Ext.getReg(0))
319706c3fb27SDimitry Andric                     .setMIFlags(Flags);
319806c3fb27SDimitry Andric     B.buildFPTrunc(Dst, Log2, Flags);
31995ffd83dbSDimitry Andric     MI.eraseFromParent();
32005ffd83dbSDimitry Andric     return true;
32015ffd83dbSDimitry Andric   }
32025ffd83dbSDimitry Andric 
320306c3fb27SDimitry Andric   assert(Ty == LLT::scalar(32));
320406c3fb27SDimitry Andric 
320506c3fb27SDimitry Andric   auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
320606c3fb27SDimitry Andric   if (!ScaledInput) {
3207*5f757f3fSDimitry Andric     B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
320806c3fb27SDimitry Andric         .addUse(Src)
320906c3fb27SDimitry Andric         .setMIFlags(Flags);
321006c3fb27SDimitry Andric     MI.eraseFromParent();
321106c3fb27SDimitry Andric     return true;
321206c3fb27SDimitry Andric   }
321306c3fb27SDimitry Andric 
3214*5f757f3fSDimitry Andric   auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
321506c3fb27SDimitry Andric                   .addUse(ScaledInput)
321606c3fb27SDimitry Andric                   .setMIFlags(Flags);
321706c3fb27SDimitry Andric 
321806c3fb27SDimitry Andric   auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
321906c3fb27SDimitry Andric   auto Zero = B.buildFConstant(Ty, 0.0);
322006c3fb27SDimitry Andric   auto ResultOffset =
322106c3fb27SDimitry Andric       B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
322206c3fb27SDimitry Andric   B.buildFSub(Dst, Log2, ResultOffset, Flags);
322306c3fb27SDimitry Andric 
322406c3fb27SDimitry Andric   MI.eraseFromParent();
322506c3fb27SDimitry Andric   return true;
322606c3fb27SDimitry Andric }
322706c3fb27SDimitry Andric 
322806c3fb27SDimitry Andric static Register getMad(MachineIRBuilder &B, LLT Ty, Register X, Register Y,
322906c3fb27SDimitry Andric                        Register Z, unsigned Flags) {
323006c3fb27SDimitry Andric   auto FMul = B.buildFMul(Ty, X, Y, Flags);
323106c3fb27SDimitry Andric   return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);
323206c3fb27SDimitry Andric }
323306c3fb27SDimitry Andric 
323406c3fb27SDimitry Andric bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
323506c3fb27SDimitry Andric                                              MachineIRBuilder &B) const {
323606c3fb27SDimitry Andric   const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
323706c3fb27SDimitry Andric   assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);
323806c3fb27SDimitry Andric 
323906c3fb27SDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
324006c3fb27SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
324106c3fb27SDimitry Andric   Register X = MI.getOperand(1).getReg();
324206c3fb27SDimitry Andric   unsigned Flags = MI.getFlags();
324306c3fb27SDimitry Andric   const LLT Ty = MRI.getType(X);
324406c3fb27SDimitry Andric   MachineFunction &MF = B.getMF();
324506c3fb27SDimitry Andric 
324606c3fb27SDimitry Andric   const LLT F32 = LLT::scalar(32);
324706c3fb27SDimitry Andric   const LLT F16 = LLT::scalar(16);
324806c3fb27SDimitry Andric 
324906c3fb27SDimitry Andric   const AMDGPUTargetMachine &TM =
325006c3fb27SDimitry Andric       static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
325106c3fb27SDimitry Andric 
325206c3fb27SDimitry Andric   if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn) ||
325306c3fb27SDimitry Andric       TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) {
325406c3fb27SDimitry Andric     if (Ty == F16 && !ST.has16BitInsts()) {
325506c3fb27SDimitry Andric       Register LogVal = MRI.createGenericVirtualRegister(F32);
325606c3fb27SDimitry Andric       auto PromoteSrc = B.buildFPExt(F32, X);
32578a4dda33SDimitry Andric       legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), IsLog10, Flags);
325806c3fb27SDimitry Andric       B.buildFPTrunc(Dst, LogVal);
325906c3fb27SDimitry Andric     } else {
32608a4dda33SDimitry Andric       legalizeFlogUnsafe(B, Dst, X, IsLog10, Flags);
326106c3fb27SDimitry Andric     }
326206c3fb27SDimitry Andric 
326306c3fb27SDimitry Andric     MI.eraseFromParent();
326406c3fb27SDimitry Andric     return true;
326506c3fb27SDimitry Andric   }
326606c3fb27SDimitry Andric 
326706c3fb27SDimitry Andric   auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags);
326806c3fb27SDimitry Andric   if (ScaledInput)
326906c3fb27SDimitry Andric     X = ScaledInput;
327006c3fb27SDimitry Andric 
3271*5f757f3fSDimitry Andric   auto Y =
3272*5f757f3fSDimitry Andric       B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);
327306c3fb27SDimitry Andric 
327406c3fb27SDimitry Andric   Register R;
327506c3fb27SDimitry Andric   if (ST.hasFastFMAF32()) {
327606c3fb27SDimitry Andric     // c+cc are ln(2)/ln(10) to more than 49 bits
327706c3fb27SDimitry Andric     const float c_log10 = 0x1.344134p-2f;
327806c3fb27SDimitry Andric     const float cc_log10 = 0x1.09f79ep-26f;
327906c3fb27SDimitry Andric 
328006c3fb27SDimitry Andric     // c + cc is ln(2) to more than 49 bits
328106c3fb27SDimitry Andric     const float c_log = 0x1.62e42ep-1f;
328206c3fb27SDimitry Andric     const float cc_log = 0x1.efa39ep-25f;
328306c3fb27SDimitry Andric 
328406c3fb27SDimitry Andric     auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
328506c3fb27SDimitry Andric     auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
328606c3fb27SDimitry Andric 
328706c3fb27SDimitry Andric     R = B.buildFMul(Ty, Y, C, Flags).getReg(0);
328806c3fb27SDimitry Andric     auto NegR = B.buildFNeg(Ty, R, Flags);
328906c3fb27SDimitry Andric     auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags);
329006c3fb27SDimitry Andric     auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags);
329106c3fb27SDimitry Andric     R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0);
329206c3fb27SDimitry Andric   } else {
329306c3fb27SDimitry Andric     // ch+ct is ln(2)/ln(10) to more than 36 bits
329406c3fb27SDimitry Andric     const float ch_log10 = 0x1.344000p-2f;
329506c3fb27SDimitry Andric     const float ct_log10 = 0x1.3509f6p-18f;
329606c3fb27SDimitry Andric 
329706c3fb27SDimitry Andric     // ch + ct is ln(2) to more than 36 bits
329806c3fb27SDimitry Andric     const float ch_log = 0x1.62e000p-1f;
329906c3fb27SDimitry Andric     const float ct_log = 0x1.0bfbe8p-15f;
330006c3fb27SDimitry Andric 
330106c3fb27SDimitry Andric     auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
330206c3fb27SDimitry Andric     auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
330306c3fb27SDimitry Andric 
330406c3fb27SDimitry Andric     auto MaskConst = B.buildConstant(Ty, 0xfffff000);
330506c3fb27SDimitry Andric     auto YH = B.buildAnd(Ty, Y, MaskConst);
330606c3fb27SDimitry Andric     auto YT = B.buildFSub(Ty, Y, YH, Flags);
330706c3fb27SDimitry Andric     auto YTCT = B.buildFMul(Ty, YT, CT, Flags);
330806c3fb27SDimitry Andric 
330906c3fb27SDimitry Andric     Register Mad0 =
331006c3fb27SDimitry Andric         getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags);
331106c3fb27SDimitry Andric     Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, Flags);
331206c3fb27SDimitry Andric     R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags);
331306c3fb27SDimitry Andric   }
331406c3fb27SDimitry Andric 
331506c3fb27SDimitry Andric   const bool IsFiniteOnly =
331606c3fb27SDimitry Andric       (MI.getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) &&
331706c3fb27SDimitry Andric       (MI.getFlag(MachineInstr::FmNoInfs) || TM.Options.NoInfsFPMath);
331806c3fb27SDimitry Andric 
331906c3fb27SDimitry Andric   if (!IsFiniteOnly) {
332006c3fb27SDimitry Andric     // Expand isfinite(x) => fabs(x) < inf
332106c3fb27SDimitry Andric     auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
332206c3fb27SDimitry Andric     auto Fabs = B.buildFAbs(Ty, Y);
332306c3fb27SDimitry Andric     auto IsFinite =
332406c3fb27SDimitry Andric         B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
332506c3fb27SDimitry Andric     R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);
332606c3fb27SDimitry Andric   }
332706c3fb27SDimitry Andric 
332806c3fb27SDimitry Andric   if (ScaledInput) {
332906c3fb27SDimitry Andric     auto Zero = B.buildFConstant(Ty, 0.0);
333006c3fb27SDimitry Andric     auto ShiftK =
333106c3fb27SDimitry Andric         B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
333206c3fb27SDimitry Andric     auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
333306c3fb27SDimitry Andric     B.buildFSub(Dst, R, Shift, Flags);
333406c3fb27SDimitry Andric   } else {
333506c3fb27SDimitry Andric     B.buildCopy(Dst, R);
333606c3fb27SDimitry Andric   }
333706c3fb27SDimitry Andric 
333806c3fb27SDimitry Andric   MI.eraseFromParent();
333906c3fb27SDimitry Andric   return true;
334006c3fb27SDimitry Andric }
334106c3fb27SDimitry Andric 
334206c3fb27SDimitry Andric bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst,
33438a4dda33SDimitry Andric                                              Register Src, bool IsLog10,
334406c3fb27SDimitry Andric                                              unsigned Flags) const {
33458a4dda33SDimitry Andric   const double Log2BaseInverted =
33468a4dda33SDimitry Andric       IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
33478a4dda33SDimitry Andric 
334806c3fb27SDimitry Andric   LLT Ty = B.getMRI()->getType(Dst);
33498a4dda33SDimitry Andric 
33508a4dda33SDimitry Andric   if (Ty == LLT::scalar(32)) {
33518a4dda33SDimitry Andric     auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
33528a4dda33SDimitry Andric     if (ScaledInput) {
3353*5f757f3fSDimitry Andric       auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
33548a4dda33SDimitry Andric                         .addUse(Src)
33558a4dda33SDimitry Andric                         .setMIFlags(Flags);
33568a4dda33SDimitry Andric       auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
33578a4dda33SDimitry Andric       auto Zero = B.buildFConstant(Ty, 0.0);
33588a4dda33SDimitry Andric       auto ResultOffset =
33598a4dda33SDimitry Andric           B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
33608a4dda33SDimitry Andric       auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);
33618a4dda33SDimitry Andric 
33628a4dda33SDimitry Andric       if (ST.hasFastFMAF32())
33638a4dda33SDimitry Andric         B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
33648a4dda33SDimitry Andric       else {
33658a4dda33SDimitry Andric         auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
33668a4dda33SDimitry Andric         B.buildFAdd(Dst, Mul, ResultOffset, Flags);
33678a4dda33SDimitry Andric       }
33688a4dda33SDimitry Andric 
33698a4dda33SDimitry Andric       return true;
33708a4dda33SDimitry Andric     }
33718a4dda33SDimitry Andric   }
33728a4dda33SDimitry Andric 
337306c3fb27SDimitry Andric   auto Log2Operand = Ty == LLT::scalar(16)
337406c3fb27SDimitry Andric                          ? B.buildFLog2(Ty, Src, Flags)
3375*5f757f3fSDimitry Andric                          : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
337606c3fb27SDimitry Andric                                .addUse(Src)
337706c3fb27SDimitry Andric                                .setMIFlags(Flags);
337806c3fb27SDimitry Andric   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
337906c3fb27SDimitry Andric   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
338006c3fb27SDimitry Andric   return true;
338106c3fb27SDimitry Andric }
338206c3fb27SDimitry Andric 
338306c3fb27SDimitry Andric bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI,
338406c3fb27SDimitry Andric                                         MachineIRBuilder &B) const {
338506c3fb27SDimitry Andric   // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
338606c3fb27SDimitry Andric   // If we have to handle denormals, scale up the input and adjust the result.
338706c3fb27SDimitry Andric 
338806c3fb27SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
338906c3fb27SDimitry Andric   Register Src = MI.getOperand(1).getReg();
339006c3fb27SDimitry Andric   unsigned Flags = MI.getFlags();
339106c3fb27SDimitry Andric   LLT Ty = B.getMRI()->getType(Dst);
339206c3fb27SDimitry Andric   const LLT F16 = LLT::scalar(16);
339306c3fb27SDimitry Andric   const LLT F32 = LLT::scalar(32);
339406c3fb27SDimitry Andric 
339506c3fb27SDimitry Andric   if (Ty == F16) {
339606c3fb27SDimitry Andric     // Nothing in half is a denormal when promoted to f32.
339706c3fb27SDimitry Andric     auto Ext = B.buildFPExt(F32, Src, Flags);
3398*5f757f3fSDimitry Andric     auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
339906c3fb27SDimitry Andric                     .addUse(Ext.getReg(0))
340006c3fb27SDimitry Andric                     .setMIFlags(Flags);
340106c3fb27SDimitry Andric     B.buildFPTrunc(Dst, Log2, Flags);
340206c3fb27SDimitry Andric     MI.eraseFromParent();
340306c3fb27SDimitry Andric     return true;
340406c3fb27SDimitry Andric   }
340506c3fb27SDimitry Andric 
340606c3fb27SDimitry Andric   assert(Ty == F32);
340706c3fb27SDimitry Andric 
34088a4dda33SDimitry Andric   if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) {
3409*5f757f3fSDimitry Andric     B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
341006c3fb27SDimitry Andric         .addUse(Src)
341106c3fb27SDimitry Andric         .setMIFlags(Flags);
341206c3fb27SDimitry Andric     MI.eraseFromParent();
341306c3fb27SDimitry Andric     return true;
341406c3fb27SDimitry Andric   }
341506c3fb27SDimitry Andric 
341606c3fb27SDimitry Andric   // bool needs_scaling = x < -0x1.f80000p+6f;
341706c3fb27SDimitry Andric   // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
341806c3fb27SDimitry Andric 
341906c3fb27SDimitry Andric   // -nextafter(128.0, -1)
342006c3fb27SDimitry Andric   auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
342106c3fb27SDimitry Andric   auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src,
342206c3fb27SDimitry Andric                                   RangeCheckConst, Flags);
342306c3fb27SDimitry Andric 
342406c3fb27SDimitry Andric   auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
342506c3fb27SDimitry Andric   auto Zero = B.buildFConstant(Ty, 0.0);
342606c3fb27SDimitry Andric   auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
342706c3fb27SDimitry Andric   auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);
342806c3fb27SDimitry Andric 
3429*5f757f3fSDimitry Andric   auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
343006c3fb27SDimitry Andric                   .addUse(AddInput.getReg(0))
343106c3fb27SDimitry Andric                   .setMIFlags(Flags);
343206c3fb27SDimitry Andric 
343306c3fb27SDimitry Andric   auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
343406c3fb27SDimitry Andric   auto One = B.buildFConstant(Ty, 1.0);
343506c3fb27SDimitry Andric   auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
343606c3fb27SDimitry Andric   B.buildFMul(Dst, Exp2, ResultScale, Flags);
343706c3fb27SDimitry Andric   MI.eraseFromParent();
343806c3fb27SDimitry Andric   return true;
343906c3fb27SDimitry Andric }
344006c3fb27SDimitry Andric 
344106c3fb27SDimitry Andric bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
3442*5f757f3fSDimitry Andric                                              Register X, unsigned Flags) const {
344306c3fb27SDimitry Andric   LLT Ty = B.getMRI()->getType(Dst);
3444*5f757f3fSDimitry Andric   LLT F32 = LLT::scalar(32);
344506c3fb27SDimitry Andric 
3446*5f757f3fSDimitry Andric   if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
3447*5f757f3fSDimitry Andric     auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3448*5f757f3fSDimitry Andric     auto Mul = B.buildFMul(Ty, X, Log2E, Flags);
3449*5f757f3fSDimitry Andric 
3450*5f757f3fSDimitry Andric     if (Ty == F32) {
3451*5f757f3fSDimitry Andric       B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
345206c3fb27SDimitry Andric         .addUse(Mul.getReg(0))
345306c3fb27SDimitry Andric         .setMIFlags(Flags);
345406c3fb27SDimitry Andric     } else {
345506c3fb27SDimitry Andric       B.buildFExp2(Dst, Mul.getReg(0), Flags);
345606c3fb27SDimitry Andric     }
345706c3fb27SDimitry Andric 
345806c3fb27SDimitry Andric     return true;
345906c3fb27SDimitry Andric   }
346006c3fb27SDimitry Andric 
3461*5f757f3fSDimitry Andric   auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
3462*5f757f3fSDimitry Andric   auto NeedsScaling =
3463*5f757f3fSDimitry Andric       B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags);
3464*5f757f3fSDimitry Andric   auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
3465*5f757f3fSDimitry Andric   auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
3466*5f757f3fSDimitry Andric   auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);
3467*5f757f3fSDimitry Andric 
3468*5f757f3fSDimitry Andric   auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3469*5f757f3fSDimitry Andric   auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);
3470*5f757f3fSDimitry Andric 
3471*5f757f3fSDimitry Andric   auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3472*5f757f3fSDimitry Andric     .addUse(ExpInput.getReg(0))
3473*5f757f3fSDimitry Andric     .setMIFlags(Flags);
3474*5f757f3fSDimitry Andric 
3475*5f757f3fSDimitry Andric   auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
3476*5f757f3fSDimitry Andric   auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
3477*5f757f3fSDimitry Andric   B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
3478*5f757f3fSDimitry Andric   return true;
3479*5f757f3fSDimitry Andric }
3480*5f757f3fSDimitry Andric 
34815ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
34825ffd83dbSDimitry Andric                                        MachineIRBuilder &B) const {
34835ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
348406c3fb27SDimitry Andric   Register X = MI.getOperand(1).getReg();
348506c3fb27SDimitry Andric   const unsigned Flags = MI.getFlags();
348606c3fb27SDimitry Andric   MachineFunction &MF = B.getMF();
348706c3fb27SDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
348806c3fb27SDimitry Andric   LLT Ty = MRI.getType(Dst);
348906c3fb27SDimitry Andric   const LLT F16 = LLT::scalar(16);
349006c3fb27SDimitry Andric   const LLT F32 = LLT::scalar(32);
3491*5f757f3fSDimitry Andric   const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
34925ffd83dbSDimitry Andric 
349306c3fb27SDimitry Andric   if (Ty == F16) {
349406c3fb27SDimitry Andric     // v_exp_f16 (fmul x, log2e)
349506c3fb27SDimitry Andric     if (allowApproxFunc(MF, Flags)) {
349606c3fb27SDimitry Andric       // TODO: Does this really require fast?
349706c3fb27SDimitry Andric       legalizeFExpUnsafe(B, Dst, X, Flags);
349806c3fb27SDimitry Andric       MI.eraseFromParent();
349906c3fb27SDimitry Andric       return true;
350006c3fb27SDimitry Andric     }
350106c3fb27SDimitry Andric 
350206c3fb27SDimitry Andric     // exp(f16 x) ->
350306c3fb27SDimitry Andric     //   fptrunc (v_exp_f32 (fmul (fpext x), log2e))
350406c3fb27SDimitry Andric 
350506c3fb27SDimitry Andric     // Nothing in half is a denormal when promoted to f32.
350606c3fb27SDimitry Andric     auto Ext = B.buildFPExt(F32, X, Flags);
350706c3fb27SDimitry Andric     Register Lowered = MRI.createGenericVirtualRegister(F32);
350806c3fb27SDimitry Andric     legalizeFExpUnsafe(B, Lowered, Ext.getReg(0), Flags);
350906c3fb27SDimitry Andric     B.buildFPTrunc(Dst, Lowered, Flags);
351006c3fb27SDimitry Andric     MI.eraseFromParent();
351106c3fb27SDimitry Andric     return true;
351206c3fb27SDimitry Andric   }
351306c3fb27SDimitry Andric 
351406c3fb27SDimitry Andric   assert(Ty == F32);
351506c3fb27SDimitry Andric 
351606c3fb27SDimitry Andric   // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
351706c3fb27SDimitry Andric   // library behavior. Also, is known-not-daz source sufficient?
3518*5f757f3fSDimitry Andric   if (allowApproxFunc(MF, Flags)) {
351906c3fb27SDimitry Andric     legalizeFExpUnsafe(B, Dst, X, Flags);
352006c3fb27SDimitry Andric     MI.eraseFromParent();
352106c3fb27SDimitry Andric     return true;
352206c3fb27SDimitry Andric   }
352306c3fb27SDimitry Andric 
352406c3fb27SDimitry Andric   //    Algorithm:
352506c3fb27SDimitry Andric   //
352606c3fb27SDimitry Andric   //    e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
352706c3fb27SDimitry Andric   //
352806c3fb27SDimitry Andric   //    x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
352906c3fb27SDimitry Andric   //    n = 64*m + j,   0 <= j < 64
353006c3fb27SDimitry Andric   //
353106c3fb27SDimitry Andric   //    e^x = 2^((64*m + j + f)/64)
353206c3fb27SDimitry Andric   //        = (2^m) * (2^(j/64)) * 2^(f/64)
353306c3fb27SDimitry Andric   //        = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
353406c3fb27SDimitry Andric   //
353506c3fb27SDimitry Andric   //    f = x*(64/ln(2)) - n
353606c3fb27SDimitry Andric   //    r = f*(ln(2)/64) = x - n*(ln(2)/64)
353706c3fb27SDimitry Andric   //
353806c3fb27SDimitry Andric   //    e^x = (2^m) * (2^(j/64)) * e^r
353906c3fb27SDimitry Andric   //
354006c3fb27SDimitry Andric   //    (2^(j/64)) is precomputed
354106c3fb27SDimitry Andric   //
354206c3fb27SDimitry Andric   //    e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
354306c3fb27SDimitry Andric   //    e^r = 1 + q
354406c3fb27SDimitry Andric   //
354506c3fb27SDimitry Andric   //    q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
354606c3fb27SDimitry Andric   //
354706c3fb27SDimitry Andric   //    e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
354806c3fb27SDimitry Andric   const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
354906c3fb27SDimitry Andric   Register PH, PL;
355006c3fb27SDimitry Andric 
355106c3fb27SDimitry Andric   if (ST.hasFastFMAF32()) {
355206c3fb27SDimitry Andric     const float c_exp = numbers::log2ef;
355306c3fb27SDimitry Andric     const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
355406c3fb27SDimitry Andric     const float c_exp10 = 0x1.a934f0p+1f;
355506c3fb27SDimitry Andric     const float cc_exp10 = 0x1.2f346ep-24f;
355606c3fb27SDimitry Andric 
355706c3fb27SDimitry Andric     auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
355806c3fb27SDimitry Andric     PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
355906c3fb27SDimitry Andric     auto NegPH = B.buildFNeg(Ty, PH, Flags);
356006c3fb27SDimitry Andric     auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);
356106c3fb27SDimitry Andric 
356206c3fb27SDimitry Andric     auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
356306c3fb27SDimitry Andric     PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);
356406c3fb27SDimitry Andric   } else {
356506c3fb27SDimitry Andric     const float ch_exp = 0x1.714000p+0f;
356606c3fb27SDimitry Andric     const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
356706c3fb27SDimitry Andric 
356806c3fb27SDimitry Andric     const float ch_exp10 = 0x1.a92000p+1f;
356906c3fb27SDimitry Andric     const float cl_exp10 = 0x1.4f0978p-11f;
357006c3fb27SDimitry Andric 
357106c3fb27SDimitry Andric     auto MaskConst = B.buildConstant(Ty, 0xfffff000);
357206c3fb27SDimitry Andric     auto XH = B.buildAnd(Ty, X, MaskConst);
357306c3fb27SDimitry Andric     auto XL = B.buildFSub(Ty, X, XH, Flags);
357406c3fb27SDimitry Andric 
357506c3fb27SDimitry Andric     auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
357606c3fb27SDimitry Andric     PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);
357706c3fb27SDimitry Andric 
357806c3fb27SDimitry Andric     auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
357906c3fb27SDimitry Andric     auto XLCL = B.buildFMul(Ty, XL, CL, Flags);
358006c3fb27SDimitry Andric 
358106c3fb27SDimitry Andric     Register Mad0 =
358206c3fb27SDimitry Andric         getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
358306c3fb27SDimitry Andric     PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
358406c3fb27SDimitry Andric   }
358506c3fb27SDimitry Andric 
3586*5f757f3fSDimitry Andric   auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);
358706c3fb27SDimitry Andric 
358806c3fb27SDimitry Andric   // It is unsafe to contract this fsub into the PH multiply.
358906c3fb27SDimitry Andric   auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
359006c3fb27SDimitry Andric   auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);
359106c3fb27SDimitry Andric   auto IntE = B.buildFPTOSI(LLT::scalar(32), E);
359206c3fb27SDimitry Andric 
3593*5f757f3fSDimitry Andric   auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
359406c3fb27SDimitry Andric                   .addUse(A.getReg(0))
359506c3fb27SDimitry Andric                   .setMIFlags(Flags);
359606c3fb27SDimitry Andric   auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);
359706c3fb27SDimitry Andric 
359806c3fb27SDimitry Andric   auto UnderflowCheckConst =
359906c3fb27SDimitry Andric       B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
360006c3fb27SDimitry Andric   auto Zero = B.buildFConstant(Ty, 0.0);
360106c3fb27SDimitry Andric   auto Underflow =
360206c3fb27SDimitry Andric       B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst);
360306c3fb27SDimitry Andric 
360406c3fb27SDimitry Andric   R = B.buildSelect(Ty, Underflow, Zero, R);
360506c3fb27SDimitry Andric 
360606c3fb27SDimitry Andric   const auto &Options = MF.getTarget().Options;
360706c3fb27SDimitry Andric 
360806c3fb27SDimitry Andric   if (!(Flags & MachineInstr::FmNoInfs) && !Options.NoInfsFPMath) {
360906c3fb27SDimitry Andric     auto OverflowCheckConst =
361006c3fb27SDimitry Andric         B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
361106c3fb27SDimitry Andric 
361206c3fb27SDimitry Andric     auto Overflow =
361306c3fb27SDimitry Andric         B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst);
361406c3fb27SDimitry Andric     auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
361506c3fb27SDimitry Andric     R = B.buildSelect(Ty, Overflow, Inf, R, Flags);
361606c3fb27SDimitry Andric   }
361706c3fb27SDimitry Andric 
361806c3fb27SDimitry Andric   B.buildCopy(Dst, R);
36195ffd83dbSDimitry Andric   MI.eraseFromParent();
36205ffd83dbSDimitry Andric   return true;
36215ffd83dbSDimitry Andric }
36225ffd83dbSDimitry Andric 
36235ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
36245ffd83dbSDimitry Andric                                        MachineIRBuilder &B) const {
36255ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
36265ffd83dbSDimitry Andric   Register Src0 = MI.getOperand(1).getReg();
36275ffd83dbSDimitry Andric   Register Src1 = MI.getOperand(2).getReg();
36285ffd83dbSDimitry Andric   unsigned Flags = MI.getFlags();
36295ffd83dbSDimitry Andric   LLT Ty = B.getMRI()->getType(Dst);
3630*5f757f3fSDimitry Andric   const LLT F16 = LLT::float16();
3631*5f757f3fSDimitry Andric   const LLT F32 = LLT::float32();
36325ffd83dbSDimitry Andric 
3633*5f757f3fSDimitry Andric   if (Ty == F32) {
3634*5f757f3fSDimitry Andric     auto Log = B.buildFLog2(F32, Src0, Flags);
3635*5f757f3fSDimitry Andric     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
36365ffd83dbSDimitry Andric                    .addUse(Log.getReg(0))
36375ffd83dbSDimitry Andric                    .addUse(Src1)
36385ffd83dbSDimitry Andric                    .setMIFlags(Flags);
36395ffd83dbSDimitry Andric     B.buildFExp2(Dst, Mul, Flags);
3640*5f757f3fSDimitry Andric   } else if (Ty == F16) {
36415ffd83dbSDimitry Andric     // There's no f16 fmul_legacy, so we need to convert for it.
3642*5f757f3fSDimitry Andric     auto Log = B.buildFLog2(F16, Src0, Flags);
3643*5f757f3fSDimitry Andric     auto Ext0 = B.buildFPExt(F32, Log, Flags);
3644*5f757f3fSDimitry Andric     auto Ext1 = B.buildFPExt(F32, Src1, Flags);
3645*5f757f3fSDimitry Andric     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
36465ffd83dbSDimitry Andric                    .addUse(Ext0.getReg(0))
36475ffd83dbSDimitry Andric                    .addUse(Ext1.getReg(0))
36485ffd83dbSDimitry Andric                    .setMIFlags(Flags);
3649*5f757f3fSDimitry Andric     B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);
36505ffd83dbSDimitry Andric   } else
36515ffd83dbSDimitry Andric     return false;
36525ffd83dbSDimitry Andric 
36535ffd83dbSDimitry Andric   MI.eraseFromParent();
36545ffd83dbSDimitry Andric   return true;
36555ffd83dbSDimitry Andric }
36565ffd83dbSDimitry Andric 
36575ffd83dbSDimitry Andric // Find a source register, ignoring any possible source modifiers.
36585ffd83dbSDimitry Andric static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
36595ffd83dbSDimitry Andric   Register ModSrc = OrigSrc;
36605ffd83dbSDimitry Andric   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
36615ffd83dbSDimitry Andric     ModSrc = SrcFNeg->getOperand(1).getReg();
36625ffd83dbSDimitry Andric     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
36635ffd83dbSDimitry Andric       ModSrc = SrcFAbs->getOperand(1).getReg();
36645ffd83dbSDimitry Andric   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
36655ffd83dbSDimitry Andric     ModSrc = SrcFAbs->getOperand(1).getReg();
36665ffd83dbSDimitry Andric   return ModSrc;
36675ffd83dbSDimitry Andric }
36685ffd83dbSDimitry Andric 
36695ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
36705ffd83dbSDimitry Andric                                          MachineRegisterInfo &MRI,
36715ffd83dbSDimitry Andric                                          MachineIRBuilder &B) const {
36725ffd83dbSDimitry Andric 
36735ffd83dbSDimitry Andric   const LLT S1 = LLT::scalar(1);
3674*5f757f3fSDimitry Andric   const LLT F64 = LLT::float64();
36755ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
36765ffd83dbSDimitry Andric   Register OrigSrc = MI.getOperand(1).getReg();
36775ffd83dbSDimitry Andric   unsigned Flags = MI.getFlags();
3678*5f757f3fSDimitry Andric   assert(ST.hasFractBug() && MRI.getType(Dst) == F64 &&
36795ffd83dbSDimitry Andric          "this should not have been custom lowered");
36805ffd83dbSDimitry Andric 
36815ffd83dbSDimitry Andric   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
36825ffd83dbSDimitry Andric   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
36835ffd83dbSDimitry Andric   // efficient way to implement it is using V_FRACT_F64. The workaround for the
36845ffd83dbSDimitry Andric   // V_FRACT bug is:
36855ffd83dbSDimitry Andric   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
36865ffd83dbSDimitry Andric   //
36875ffd83dbSDimitry Andric   // Convert floor(x) to (x - fract(x))
36885ffd83dbSDimitry Andric 
3689*5f757f3fSDimitry Andric   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})
36905ffd83dbSDimitry Andric                    .addUse(OrigSrc)
36915ffd83dbSDimitry Andric                    .setMIFlags(Flags);
36925ffd83dbSDimitry Andric 
36935ffd83dbSDimitry Andric   // Give source modifier matching some assistance before obscuring a foldable
36945ffd83dbSDimitry Andric   // pattern.
36955ffd83dbSDimitry Andric 
36965ffd83dbSDimitry Andric   // TODO: We can avoid the neg on the fract? The input sign to fract
36975ffd83dbSDimitry Andric   // shouldn't matter?
36985ffd83dbSDimitry Andric   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
36995ffd83dbSDimitry Andric 
370006c3fb27SDimitry Andric   auto Const =
3701*5f757f3fSDimitry Andric       B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff));
37025ffd83dbSDimitry Andric 
3703*5f757f3fSDimitry Andric   Register Min = MRI.createGenericVirtualRegister(F64);
37045ffd83dbSDimitry Andric 
37055ffd83dbSDimitry Andric   // We don't need to concern ourselves with the snan handling difference, so
37065ffd83dbSDimitry Andric   // use the one which will directly select.
37075ffd83dbSDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
37085ffd83dbSDimitry Andric   if (MFI->getMode().IEEE)
37095ffd83dbSDimitry Andric     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
37105ffd83dbSDimitry Andric   else
37115ffd83dbSDimitry Andric     B.buildFMinNum(Min, Fract, Const, Flags);
37125ffd83dbSDimitry Andric 
37135ffd83dbSDimitry Andric   Register CorrectedFract = Min;
37145ffd83dbSDimitry Andric   if (!MI.getFlag(MachineInstr::FmNoNans)) {
37155ffd83dbSDimitry Andric     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
3716*5f757f3fSDimitry Andric     CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);
37175ffd83dbSDimitry Andric   }
37185ffd83dbSDimitry Andric 
3719*5f757f3fSDimitry Andric   auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
37205ffd83dbSDimitry Andric   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
37215ffd83dbSDimitry Andric 
37225ffd83dbSDimitry Andric   MI.eraseFromParent();
37235ffd83dbSDimitry Andric   return true;
37245ffd83dbSDimitry Andric }
37255ffd83dbSDimitry Andric 
37265ffd83dbSDimitry Andric // Turn an illegal packed v2s16 build vector into bit operations.
37275ffd83dbSDimitry Andric // TODO: This should probably be a bitcast action in LegalizerHelper.
37285ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeBuildVector(
37295ffd83dbSDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
37305ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
37315ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
3732bdd1243dSDimitry Andric   const LLT S16 = LLT::scalar(16);
3733fe6060f1SDimitry Andric   assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
37345ffd83dbSDimitry Andric 
37355ffd83dbSDimitry Andric   Register Src0 = MI.getOperand(1).getReg();
37365ffd83dbSDimitry Andric   Register Src1 = MI.getOperand(2).getReg();
37375ffd83dbSDimitry Andric 
3738bdd1243dSDimitry Andric   if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
3739bdd1243dSDimitry Andric     assert(MRI.getType(Src0) == S32);
3740bdd1243dSDimitry Andric     Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
3741bdd1243dSDimitry Andric     Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
3742bdd1243dSDimitry Andric   }
3743bdd1243dSDimitry Andric 
3744bdd1243dSDimitry Andric   auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
37455ffd83dbSDimitry Andric   B.buildBitcast(Dst, Merge);
37465ffd83dbSDimitry Andric 
37475ffd83dbSDimitry Andric   MI.eraseFromParent();
37485ffd83dbSDimitry Andric   return true;
37495ffd83dbSDimitry Andric }
37505ffd83dbSDimitry Andric 
375181ad6265SDimitry Andric // Build a big integer multiply or multiply-add using MAD_64_32 instructions.
375281ad6265SDimitry Andric //
375381ad6265SDimitry Andric // Source and accumulation registers must all be 32-bits.
375481ad6265SDimitry Andric //
375581ad6265SDimitry Andric // TODO: When the multiply is uniform, we should produce a code sequence
375681ad6265SDimitry Andric // that is better suited to instruction selection on the SALU. Instead of
375781ad6265SDimitry Andric // the outer loop going over parts of the result, the outer loop should go
375881ad6265SDimitry Andric // over parts of one of the factors. This should result in instruction
375981ad6265SDimitry Andric // selection that makes full use of S_ADDC_U32 instructions.
376006c3fb27SDimitry Andric void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
376106c3fb27SDimitry Andric                                         MutableArrayRef<Register> Accum,
376206c3fb27SDimitry Andric                                         ArrayRef<Register> Src0,
376306c3fb27SDimitry Andric                                         ArrayRef<Register> Src1,
376406c3fb27SDimitry Andric                                         bool UsePartialMad64_32,
376506c3fb27SDimitry Andric                                         bool SeparateOddAlignedProducts) const {
376681ad6265SDimitry Andric   // Use (possibly empty) vectors of S1 registers to represent the set of
376781ad6265SDimitry Andric   // carries from one pair of positions to the next.
376881ad6265SDimitry Andric   using Carry = SmallVector<Register, 2>;
376981ad6265SDimitry Andric 
377081ad6265SDimitry Andric   MachineIRBuilder &B = Helper.MIRBuilder;
377106c3fb27SDimitry Andric   GISelKnownBits &KB = *Helper.getKnownBits();
377281ad6265SDimitry Andric 
377381ad6265SDimitry Andric   const LLT S1 = LLT::scalar(1);
377481ad6265SDimitry Andric   const LLT S32 = LLT::scalar(32);
377581ad6265SDimitry Andric   const LLT S64 = LLT::scalar(64);
377681ad6265SDimitry Andric 
377781ad6265SDimitry Andric   Register Zero32;
377881ad6265SDimitry Andric   Register Zero64;
377981ad6265SDimitry Andric 
378081ad6265SDimitry Andric   auto getZero32 = [&]() -> Register {
378181ad6265SDimitry Andric     if (!Zero32)
378281ad6265SDimitry Andric       Zero32 = B.buildConstant(S32, 0).getReg(0);
378381ad6265SDimitry Andric     return Zero32;
378481ad6265SDimitry Andric   };
378581ad6265SDimitry Andric   auto getZero64 = [&]() -> Register {
378681ad6265SDimitry Andric     if (!Zero64)
378781ad6265SDimitry Andric       Zero64 = B.buildConstant(S64, 0).getReg(0);
378881ad6265SDimitry Andric     return Zero64;
378981ad6265SDimitry Andric   };
379081ad6265SDimitry Andric 
379106c3fb27SDimitry Andric   SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
379206c3fb27SDimitry Andric   for (unsigned i = 0; i < Src0.size(); ++i) {
379306c3fb27SDimitry Andric     Src0KnownZeros.push_back(KB.getKnownBits(Src0[i]).isZero());
379406c3fb27SDimitry Andric     Src1KnownZeros.push_back(KB.getKnownBits(Src1[i]).isZero());
379506c3fb27SDimitry Andric   }
379606c3fb27SDimitry Andric 
379781ad6265SDimitry Andric   // Merge the given carries into the 32-bit LocalAccum, which is modified
379881ad6265SDimitry Andric   // in-place.
379981ad6265SDimitry Andric   //
380081ad6265SDimitry Andric   // Returns the carry-out, which is a single S1 register or null.
380181ad6265SDimitry Andric   auto mergeCarry =
380281ad6265SDimitry Andric       [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
380381ad6265SDimitry Andric         if (CarryIn.empty())
380481ad6265SDimitry Andric           return Register();
380581ad6265SDimitry Andric 
380681ad6265SDimitry Andric         bool HaveCarryOut = true;
380781ad6265SDimitry Andric         Register CarryAccum;
380881ad6265SDimitry Andric         if (CarryIn.size() == 1) {
380981ad6265SDimitry Andric           if (!LocalAccum) {
381081ad6265SDimitry Andric             LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
381181ad6265SDimitry Andric             return Register();
381281ad6265SDimitry Andric           }
381381ad6265SDimitry Andric 
381481ad6265SDimitry Andric           CarryAccum = getZero32();
381581ad6265SDimitry Andric         } else {
381681ad6265SDimitry Andric           CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
381781ad6265SDimitry Andric           for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
381881ad6265SDimitry Andric             CarryAccum =
381981ad6265SDimitry Andric                 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
382081ad6265SDimitry Andric                     .getReg(0);
382181ad6265SDimitry Andric           }
382281ad6265SDimitry Andric 
382381ad6265SDimitry Andric           if (!LocalAccum) {
382481ad6265SDimitry Andric             LocalAccum = getZero32();
382581ad6265SDimitry Andric             HaveCarryOut = false;
382681ad6265SDimitry Andric           }
382781ad6265SDimitry Andric         }
382881ad6265SDimitry Andric 
382981ad6265SDimitry Andric         auto Add =
383081ad6265SDimitry Andric             B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
383181ad6265SDimitry Andric         LocalAccum = Add.getReg(0);
383281ad6265SDimitry Andric         return HaveCarryOut ? Add.getReg(1) : Register();
383381ad6265SDimitry Andric       };
383481ad6265SDimitry Andric 
383581ad6265SDimitry Andric   // Build a multiply-add chain to compute
383681ad6265SDimitry Andric   //
383781ad6265SDimitry Andric   //   LocalAccum + (partial products at DstIndex)
383881ad6265SDimitry Andric   //       + (opportunistic subset of CarryIn)
383981ad6265SDimitry Andric   //
384081ad6265SDimitry Andric   // LocalAccum is an array of one or two 32-bit registers that are updated
384181ad6265SDimitry Andric   // in-place. The incoming registers may be null.
384281ad6265SDimitry Andric   //
384381ad6265SDimitry Andric   // In some edge cases, carry-ins can be consumed "for free". In that case,
384481ad6265SDimitry Andric   // the consumed carry bits are removed from CarryIn in-place.
384581ad6265SDimitry Andric   auto buildMadChain =
384681ad6265SDimitry Andric       [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
384781ad6265SDimitry Andric           -> Carry {
384881ad6265SDimitry Andric         assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
384981ad6265SDimitry Andric                (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
385081ad6265SDimitry Andric 
385181ad6265SDimitry Andric         Carry CarryOut;
385281ad6265SDimitry Andric         unsigned j0 = 0;
385381ad6265SDimitry Andric 
385481ad6265SDimitry Andric         // Use plain 32-bit multiplication for the most significant part of the
385581ad6265SDimitry Andric         // result by default.
385681ad6265SDimitry Andric         if (LocalAccum.size() == 1 &&
385781ad6265SDimitry Andric             (!UsePartialMad64_32 || !CarryIn.empty())) {
385881ad6265SDimitry Andric           do {
385906c3fb27SDimitry Andric             // Skip multiplication if one of the operands is 0
386081ad6265SDimitry Andric             unsigned j1 = DstIndex - j0;
386106c3fb27SDimitry Andric             if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
386206c3fb27SDimitry Andric               ++j0;
386306c3fb27SDimitry Andric               continue;
386406c3fb27SDimitry Andric             }
386581ad6265SDimitry Andric             auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
386606c3fb27SDimitry Andric             if (!LocalAccum[0] || KB.getKnownBits(LocalAccum[0]).isZero()) {
386781ad6265SDimitry Andric               LocalAccum[0] = Mul.getReg(0);
386881ad6265SDimitry Andric             } else {
386981ad6265SDimitry Andric               if (CarryIn.empty()) {
387081ad6265SDimitry Andric                 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
387181ad6265SDimitry Andric               } else {
387281ad6265SDimitry Andric                 LocalAccum[0] =
387381ad6265SDimitry Andric                     B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
387481ad6265SDimitry Andric                         .getReg(0);
387581ad6265SDimitry Andric                 CarryIn.pop_back();
387681ad6265SDimitry Andric               }
387781ad6265SDimitry Andric             }
387881ad6265SDimitry Andric             ++j0;
387981ad6265SDimitry Andric           } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
388081ad6265SDimitry Andric         }
388181ad6265SDimitry Andric 
388281ad6265SDimitry Andric         // Build full 64-bit multiplies.
388381ad6265SDimitry Andric         if (j0 <= DstIndex) {
388481ad6265SDimitry Andric           bool HaveSmallAccum = false;
388581ad6265SDimitry Andric           Register Tmp;
388681ad6265SDimitry Andric 
388781ad6265SDimitry Andric           if (LocalAccum[0]) {
388881ad6265SDimitry Andric             if (LocalAccum.size() == 1) {
388981ad6265SDimitry Andric               Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
389081ad6265SDimitry Andric               HaveSmallAccum = true;
389181ad6265SDimitry Andric             } else if (LocalAccum[1]) {
3892bdd1243dSDimitry Andric               Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
389381ad6265SDimitry Andric               HaveSmallAccum = false;
389481ad6265SDimitry Andric             } else {
389581ad6265SDimitry Andric               Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
389681ad6265SDimitry Andric               HaveSmallAccum = true;
389781ad6265SDimitry Andric             }
389881ad6265SDimitry Andric           } else {
389981ad6265SDimitry Andric             assert(LocalAccum.size() == 1 || !LocalAccum[1]);
390081ad6265SDimitry Andric             Tmp = getZero64();
390181ad6265SDimitry Andric             HaveSmallAccum = true;
390281ad6265SDimitry Andric           }
390381ad6265SDimitry Andric 
390481ad6265SDimitry Andric           do {
390581ad6265SDimitry Andric             unsigned j1 = DstIndex - j0;
390606c3fb27SDimitry Andric             if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
390706c3fb27SDimitry Andric               ++j0;
390806c3fb27SDimitry Andric               continue;
390906c3fb27SDimitry Andric             }
391081ad6265SDimitry Andric             auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
391181ad6265SDimitry Andric                                     {Src0[j0], Src1[j1], Tmp});
391281ad6265SDimitry Andric             Tmp = Mad.getReg(0);
391381ad6265SDimitry Andric             if (!HaveSmallAccum)
391481ad6265SDimitry Andric               CarryOut.push_back(Mad.getReg(1));
391581ad6265SDimitry Andric             HaveSmallAccum = false;
391606c3fb27SDimitry Andric 
391781ad6265SDimitry Andric             ++j0;
391881ad6265SDimitry Andric           } while (j0 <= DstIndex);
391981ad6265SDimitry Andric 
392081ad6265SDimitry Andric           auto Unmerge = B.buildUnmerge(S32, Tmp);
392181ad6265SDimitry Andric           LocalAccum[0] = Unmerge.getReg(0);
392281ad6265SDimitry Andric           if (LocalAccum.size() > 1)
392381ad6265SDimitry Andric             LocalAccum[1] = Unmerge.getReg(1);
392481ad6265SDimitry Andric         }
392581ad6265SDimitry Andric 
392681ad6265SDimitry Andric         return CarryOut;
392781ad6265SDimitry Andric       };
392881ad6265SDimitry Andric 
392981ad6265SDimitry Andric   // Outer multiply loop, iterating over destination parts from least
393081ad6265SDimitry Andric   // significant to most significant parts.
393181ad6265SDimitry Andric   //
393281ad6265SDimitry Andric   // The columns of the following diagram correspond to the destination parts
393381ad6265SDimitry Andric   // affected by one iteration of the outer loop (ignoring boundary
393481ad6265SDimitry Andric   // conditions).
393581ad6265SDimitry Andric   //
393681ad6265SDimitry Andric   //   Dest index relative to 2 * i:      1 0 -1
393781ad6265SDimitry Andric   //                                      ------
393881ad6265SDimitry Andric   //   Carries from previous iteration:     e o
393981ad6265SDimitry Andric   //   Even-aligned partial product sum:  E E .
394081ad6265SDimitry Andric   //   Odd-aligned partial product sum:     O O
394181ad6265SDimitry Andric   //
394281ad6265SDimitry Andric   // 'o' is OddCarry, 'e' is EvenCarry.
394381ad6265SDimitry Andric   // EE and OO are computed from partial products via buildMadChain and use
394481ad6265SDimitry Andric   // accumulation where possible and appropriate.
394581ad6265SDimitry Andric   //
394681ad6265SDimitry Andric   Register SeparateOddCarry;
394781ad6265SDimitry Andric   Carry EvenCarry;
394881ad6265SDimitry Andric   Carry OddCarry;
394981ad6265SDimitry Andric 
395081ad6265SDimitry Andric   for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
395181ad6265SDimitry Andric     Carry OddCarryIn = std::move(OddCarry);
395281ad6265SDimitry Andric     Carry EvenCarryIn = std::move(EvenCarry);
395381ad6265SDimitry Andric     OddCarry.clear();
395481ad6265SDimitry Andric     EvenCarry.clear();
395581ad6265SDimitry Andric 
395681ad6265SDimitry Andric     // Partial products at offset 2 * i.
395781ad6265SDimitry Andric     if (2 * i < Accum.size()) {
395881ad6265SDimitry Andric       auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
395981ad6265SDimitry Andric       EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
396081ad6265SDimitry Andric     }
396181ad6265SDimitry Andric 
396281ad6265SDimitry Andric     // Partial products at offset 2 * i - 1.
396381ad6265SDimitry Andric     if (i > 0) {
396481ad6265SDimitry Andric       if (!SeparateOddAlignedProducts) {
396581ad6265SDimitry Andric         auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
396681ad6265SDimitry Andric         OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
396781ad6265SDimitry Andric       } else {
396881ad6265SDimitry Andric         bool IsHighest = 2 * i >= Accum.size();
396981ad6265SDimitry Andric         Register SeparateOddOut[2];
3970bdd1243dSDimitry Andric         auto LocalAccum = MutableArrayRef(SeparateOddOut)
397181ad6265SDimitry Andric                               .take_front(IsHighest ? 1 : 2);
397281ad6265SDimitry Andric         OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
397381ad6265SDimitry Andric 
397481ad6265SDimitry Andric         MachineInstr *Lo;
397581ad6265SDimitry Andric 
397681ad6265SDimitry Andric         if (i == 1) {
397781ad6265SDimitry Andric           if (!IsHighest)
397881ad6265SDimitry Andric             Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
397981ad6265SDimitry Andric           else
398081ad6265SDimitry Andric             Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
398181ad6265SDimitry Andric         } else {
398281ad6265SDimitry Andric           Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
398381ad6265SDimitry Andric                             SeparateOddCarry);
398481ad6265SDimitry Andric         }
398581ad6265SDimitry Andric         Accum[2 * i - 1] = Lo->getOperand(0).getReg();
398681ad6265SDimitry Andric 
398781ad6265SDimitry Andric         if (!IsHighest) {
398881ad6265SDimitry Andric           auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
398981ad6265SDimitry Andric                                 Lo->getOperand(1).getReg());
399081ad6265SDimitry Andric           Accum[2 * i] = Hi.getReg(0);
399181ad6265SDimitry Andric           SeparateOddCarry = Hi.getReg(1);
399281ad6265SDimitry Andric         }
399381ad6265SDimitry Andric       }
399481ad6265SDimitry Andric     }
399581ad6265SDimitry Andric 
399681ad6265SDimitry Andric     // Add in the carries from the previous iteration
399781ad6265SDimitry Andric     if (i > 0) {
399881ad6265SDimitry Andric       if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
399981ad6265SDimitry Andric         EvenCarryIn.push_back(CarryOut);
400081ad6265SDimitry Andric 
400181ad6265SDimitry Andric       if (2 * i < Accum.size()) {
400281ad6265SDimitry Andric         if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
400381ad6265SDimitry Andric           OddCarry.push_back(CarryOut);
400481ad6265SDimitry Andric       }
400581ad6265SDimitry Andric     }
400681ad6265SDimitry Andric   }
400781ad6265SDimitry Andric }
400881ad6265SDimitry Andric 
400981ad6265SDimitry Andric // Custom narrowing of wide multiplies using wide multiply-add instructions.
401081ad6265SDimitry Andric //
401181ad6265SDimitry Andric // TODO: If the multiply is followed by an addition, we should attempt to
401281ad6265SDimitry Andric // integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
401381ad6265SDimitry Andric bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
401481ad6265SDimitry Andric                                       MachineInstr &MI) const {
401581ad6265SDimitry Andric   assert(ST.hasMad64_32());
401681ad6265SDimitry Andric   assert(MI.getOpcode() == TargetOpcode::G_MUL);
401781ad6265SDimitry Andric 
401881ad6265SDimitry Andric   MachineIRBuilder &B = Helper.MIRBuilder;
401981ad6265SDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
402081ad6265SDimitry Andric 
402181ad6265SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
402281ad6265SDimitry Andric   Register Src0 = MI.getOperand(1).getReg();
402381ad6265SDimitry Andric   Register Src1 = MI.getOperand(2).getReg();
402481ad6265SDimitry Andric 
402581ad6265SDimitry Andric   LLT Ty = MRI.getType(DstReg);
402681ad6265SDimitry Andric   assert(Ty.isScalar());
402781ad6265SDimitry Andric 
402881ad6265SDimitry Andric   unsigned Size = Ty.getSizeInBits();
402981ad6265SDimitry Andric   unsigned NumParts = Size / 32;
403081ad6265SDimitry Andric   assert((Size % 32) == 0);
403181ad6265SDimitry Andric   assert(NumParts >= 2);
403281ad6265SDimitry Andric 
403381ad6265SDimitry Andric   // Whether to use MAD_64_32 for partial products whose high half is
403481ad6265SDimitry Andric   // discarded. This avoids some ADD instructions but risks false dependency
403581ad6265SDimitry Andric   // stalls on some subtargets in some cases.
403681ad6265SDimitry Andric   const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
403781ad6265SDimitry Andric 
403881ad6265SDimitry Andric   // Whether to compute odd-aligned partial products separately. This is
403981ad6265SDimitry Andric   // advisable on subtargets where the accumulator of MAD_64_32 must be placed
404081ad6265SDimitry Andric   // in an even-aligned VGPR.
404181ad6265SDimitry Andric   const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
404281ad6265SDimitry Andric 
404381ad6265SDimitry Andric   LLT S32 = LLT::scalar(32);
404481ad6265SDimitry Andric   SmallVector<Register, 2> Src0Parts, Src1Parts;
404581ad6265SDimitry Andric   for (unsigned i = 0; i < NumParts; ++i) {
404681ad6265SDimitry Andric     Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
404781ad6265SDimitry Andric     Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
404881ad6265SDimitry Andric   }
404981ad6265SDimitry Andric   B.buildUnmerge(Src0Parts, Src0);
405081ad6265SDimitry Andric   B.buildUnmerge(Src1Parts, Src1);
405181ad6265SDimitry Andric 
405281ad6265SDimitry Andric   SmallVector<Register, 2> AccumRegs(NumParts);
405381ad6265SDimitry Andric   buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
405481ad6265SDimitry Andric                 SeparateOddAlignedProducts);
405581ad6265SDimitry Andric 
4056bdd1243dSDimitry Andric   B.buildMergeLikeInstr(DstReg, AccumRegs);
405781ad6265SDimitry Andric   MI.eraseFromParent();
405881ad6265SDimitry Andric   return true;
405981ad6265SDimitry Andric }
406081ad6265SDimitry Andric 
4061349cc55cSDimitry Andric // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
4062349cc55cSDimitry Andric // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
4063349cc55cSDimitry Andric // case with a single min instruction instead of a compare+select.
4064349cc55cSDimitry Andric bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
4065349cc55cSDimitry Andric                                             MachineRegisterInfo &MRI,
4066349cc55cSDimitry Andric                                             MachineIRBuilder &B) const {
4067349cc55cSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
4068349cc55cSDimitry Andric   Register Src = MI.getOperand(1).getReg();
4069349cc55cSDimitry Andric   LLT DstTy = MRI.getType(Dst);
4070349cc55cSDimitry Andric   LLT SrcTy = MRI.getType(Src);
4071349cc55cSDimitry Andric 
4072349cc55cSDimitry Andric   unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
4073349cc55cSDimitry Andric                         ? AMDGPU::G_AMDGPU_FFBH_U32
4074349cc55cSDimitry Andric                         : AMDGPU::G_AMDGPU_FFBL_B32;
4075349cc55cSDimitry Andric   auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
4076349cc55cSDimitry Andric   B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
4077349cc55cSDimitry Andric 
4078349cc55cSDimitry Andric   MI.eraseFromParent();
4079349cc55cSDimitry Andric   return true;
4080349cc55cSDimitry Andric }
4081349cc55cSDimitry Andric 
4082e8d8bef9SDimitry Andric // Check that this is a G_XOR x, -1
4083e8d8bef9SDimitry Andric static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
4084e8d8bef9SDimitry Andric   if (MI.getOpcode() != TargetOpcode::G_XOR)
4085e8d8bef9SDimitry Andric     return false;
4086349cc55cSDimitry Andric   auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
4087e8d8bef9SDimitry Andric   return ConstVal && *ConstVal == -1;
4088e8d8bef9SDimitry Andric }
4089e8d8bef9SDimitry Andric 
40900b57cec5SDimitry Andric // Return the use branch instruction, otherwise null if the usage is invalid.
4091e8d8bef9SDimitry Andric static MachineInstr *
4092e8d8bef9SDimitry Andric verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
4093e8d8bef9SDimitry Andric                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
40940b57cec5SDimitry Andric   Register CondDef = MI.getOperand(0).getReg();
40950b57cec5SDimitry Andric   if (!MRI.hasOneNonDBGUse(CondDef))
40960b57cec5SDimitry Andric     return nullptr;
40970b57cec5SDimitry Andric 
40985ffd83dbSDimitry Andric   MachineBasicBlock *Parent = MI.getParent();
4099e8d8bef9SDimitry Andric   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
4100e8d8bef9SDimitry Andric 
4101e8d8bef9SDimitry Andric   if (isNot(MRI, *UseMI)) {
4102e8d8bef9SDimitry Andric     Register NegatedCond = UseMI->getOperand(0).getReg();
4103e8d8bef9SDimitry Andric     if (!MRI.hasOneNonDBGUse(NegatedCond))
4104e8d8bef9SDimitry Andric       return nullptr;
4105e8d8bef9SDimitry Andric 
4106e8d8bef9SDimitry Andric     // We're deleting the def of this value, so we need to remove it.
4107349cc55cSDimitry Andric     eraseInstr(*UseMI, MRI);
4108e8d8bef9SDimitry Andric 
4109e8d8bef9SDimitry Andric     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
4110e8d8bef9SDimitry Andric     Negated = true;
4111e8d8bef9SDimitry Andric   }
4112e8d8bef9SDimitry Andric 
4113e8d8bef9SDimitry Andric   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
4114480093f4SDimitry Andric     return nullptr;
4115480093f4SDimitry Andric 
41165ffd83dbSDimitry Andric   // Make sure the cond br is followed by a G_BR, or is the last instruction.
4117e8d8bef9SDimitry Andric   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
41185ffd83dbSDimitry Andric   if (Next == Parent->end()) {
41195ffd83dbSDimitry Andric     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
41205ffd83dbSDimitry Andric     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
41215ffd83dbSDimitry Andric       return nullptr;
41225ffd83dbSDimitry Andric     UncondBrTarget = &*NextMBB;
41235ffd83dbSDimitry Andric   } else {
4124480093f4SDimitry Andric     if (Next->getOpcode() != AMDGPU::G_BR)
4125480093f4SDimitry Andric       return nullptr;
4126480093f4SDimitry Andric     Br = &*Next;
41275ffd83dbSDimitry Andric     UncondBrTarget = Br->getOperand(0).getMBB();
4128480093f4SDimitry Andric   }
4129480093f4SDimitry Andric 
4130e8d8bef9SDimitry Andric   return UseMI;
41310b57cec5SDimitry Andric }
41320b57cec5SDimitry Andric 
41330b57cec5SDimitry Andric bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
4134e8d8bef9SDimitry Andric                                          const ArgDescriptor *Arg,
4135e8d8bef9SDimitry Andric                                          const TargetRegisterClass *ArgRC,
4136e8d8bef9SDimitry Andric                                          LLT ArgTy) const {
4137e8d8bef9SDimitry Andric   MCRegister SrcReg = Arg->getRegister();
4138e8d8bef9SDimitry Andric   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
41395ffd83dbSDimitry Andric   assert(DstReg.isVirtual() && "Virtual register expected");
41400b57cec5SDimitry Andric 
414104eeddc0SDimitry Andric   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
414204eeddc0SDimitry Andric                                              *ArgRC, B.getDebugLoc(), ArgTy);
41430b57cec5SDimitry Andric   if (Arg->isMasked()) {
41440b57cec5SDimitry Andric     // TODO: Should we try to emit this once in the entry block?
41450b57cec5SDimitry Andric     const LLT S32 = LLT::scalar(32);
41460b57cec5SDimitry Andric     const unsigned Mask = Arg->getMask();
414706c3fb27SDimitry Andric     const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
41480b57cec5SDimitry Andric 
41498bcb0991SDimitry Andric     Register AndMaskSrc = LiveIn;
41508bcb0991SDimitry Andric 
415104eeddc0SDimitry Andric     // TODO: Avoid clearing the high bits if we know workitem id y/z are always
415204eeddc0SDimitry Andric     // 0.
41538bcb0991SDimitry Andric     if (Shift != 0) {
41540b57cec5SDimitry Andric       auto ShiftAmt = B.buildConstant(S32, Shift);
41558bcb0991SDimitry Andric       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
41568bcb0991SDimitry Andric     }
41578bcb0991SDimitry Andric 
41588bcb0991SDimitry Andric     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
41595ffd83dbSDimitry Andric   } else {
41600b57cec5SDimitry Andric     B.buildCopy(DstReg, LiveIn);
41610b57cec5SDimitry Andric   }
41620b57cec5SDimitry Andric 
41630b57cec5SDimitry Andric   return true;
41640b57cec5SDimitry Andric }
41650b57cec5SDimitry Andric 
4166e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::loadInputValue(
4167e8d8bef9SDimitry Andric     Register DstReg, MachineIRBuilder &B,
4168e8d8bef9SDimitry Andric     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4169e8d8bef9SDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4170e8d8bef9SDimitry Andric   const ArgDescriptor *Arg;
4171e8d8bef9SDimitry Andric   const TargetRegisterClass *ArgRC;
4172e8d8bef9SDimitry Andric   LLT ArgTy;
4173e8d8bef9SDimitry Andric   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4174e8d8bef9SDimitry Andric 
4175349cc55cSDimitry Andric   if (!Arg) {
4176349cc55cSDimitry Andric     if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
4177349cc55cSDimitry Andric       // The intrinsic may appear when we have a 0 sized kernarg segment, in which
4178349cc55cSDimitry Andric       // case the pointer argument may be missing and we use null.
4179349cc55cSDimitry Andric       B.buildConstant(DstReg, 0);
4180349cc55cSDimitry Andric       return true;
4181349cc55cSDimitry Andric     }
4182349cc55cSDimitry Andric 
4183349cc55cSDimitry Andric     // It's undefined behavior if a function marked with the amdgpu-no-*
4184349cc55cSDimitry Andric     // attributes uses the corresponding intrinsic.
4185349cc55cSDimitry Andric     B.buildUndef(DstReg);
4186349cc55cSDimitry Andric     return true;
4187349cc55cSDimitry Andric   }
4188349cc55cSDimitry Andric 
4189e8d8bef9SDimitry Andric   if (!Arg->isRegister() || !Arg->getRegister().isValid())
4190e8d8bef9SDimitry Andric     return false; // TODO: Handle these
4191e8d8bef9SDimitry Andric   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
4192e8d8bef9SDimitry Andric }
4193e8d8bef9SDimitry Andric 
41940b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
41955ffd83dbSDimitry Andric     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
41960b57cec5SDimitry Andric     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4197e8d8bef9SDimitry Andric   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
41985ffd83dbSDimitry Andric     return false;
41995ffd83dbSDimitry Andric 
42000b57cec5SDimitry Andric   MI.eraseFromParent();
42010b57cec5SDimitry Andric   return true;
42020b57cec5SDimitry Andric }
42030b57cec5SDimitry Andric 
420481ad6265SDimitry Andric static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
420581ad6265SDimitry Andric                                 int64_t C) {
420681ad6265SDimitry Andric   B.buildConstant(MI.getOperand(0).getReg(), C);
420781ad6265SDimitry Andric   MI.eraseFromParent();
420881ad6265SDimitry Andric   return true;
420981ad6265SDimitry Andric }
421081ad6265SDimitry Andric 
421181ad6265SDimitry Andric bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
421281ad6265SDimitry Andric     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
421381ad6265SDimitry Andric     unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
421481ad6265SDimitry Andric   unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
421581ad6265SDimitry Andric   if (MaxID == 0)
421681ad6265SDimitry Andric     return replaceWithConstant(B, MI, 0);
421781ad6265SDimitry Andric 
421881ad6265SDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
421981ad6265SDimitry Andric   const ArgDescriptor *Arg;
422081ad6265SDimitry Andric   const TargetRegisterClass *ArgRC;
422181ad6265SDimitry Andric   LLT ArgTy;
422281ad6265SDimitry Andric   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
422381ad6265SDimitry Andric 
422481ad6265SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
422581ad6265SDimitry Andric   if (!Arg) {
422681ad6265SDimitry Andric     // It's undefined behavior if a function marked with the amdgpu-no-*
422781ad6265SDimitry Andric     // attributes uses the corresponding intrinsic.
422881ad6265SDimitry Andric     B.buildUndef(DstReg);
422981ad6265SDimitry Andric     MI.eraseFromParent();
423081ad6265SDimitry Andric     return true;
423181ad6265SDimitry Andric   }
423281ad6265SDimitry Andric 
423381ad6265SDimitry Andric   if (Arg->isMasked()) {
423481ad6265SDimitry Andric     // Don't bother inserting AssertZext for packed IDs since we're emitting the
423581ad6265SDimitry Andric     // masking operations anyway.
423681ad6265SDimitry Andric     //
423781ad6265SDimitry Andric     // TODO: We could assert the top bit is 0 for the source copy.
423881ad6265SDimitry Andric     if (!loadInputValue(DstReg, B, ArgType))
423981ad6265SDimitry Andric       return false;
424081ad6265SDimitry Andric   } else {
424181ad6265SDimitry Andric     Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
424281ad6265SDimitry Andric     if (!loadInputValue(TmpReg, B, ArgType))
424381ad6265SDimitry Andric       return false;
4244bdd1243dSDimitry Andric     B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID));
424581ad6265SDimitry Andric   }
424681ad6265SDimitry Andric 
424781ad6265SDimitry Andric   MI.eraseFromParent();
424881ad6265SDimitry Andric   return true;
424981ad6265SDimitry Andric }
425081ad6265SDimitry Andric 
425181ad6265SDimitry Andric Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
425281ad6265SDimitry Andric                                                      int64_t Offset) const {
425381ad6265SDimitry Andric   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
425481ad6265SDimitry Andric   Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
425581ad6265SDimitry Andric 
425681ad6265SDimitry Andric   // TODO: If we passed in the base kernel offset we could have a better
425781ad6265SDimitry Andric   // alignment than 4, but we don't really need it.
425881ad6265SDimitry Andric   if (!loadInputValue(KernArgReg, B,
425981ad6265SDimitry Andric                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
426081ad6265SDimitry Andric     llvm_unreachable("failed to find kernarg segment ptr");
426181ad6265SDimitry Andric 
426281ad6265SDimitry Andric   auto COffset = B.buildConstant(LLT::scalar(64), Offset);
426381ad6265SDimitry Andric   // TODO: Should get nuw
426481ad6265SDimitry Andric   return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
426581ad6265SDimitry Andric }
426681ad6265SDimitry Andric 
426781ad6265SDimitry Andric /// Legalize a value that's loaded from kernel arguments. This is only used by
426881ad6265SDimitry Andric /// legacy intrinsics.
426981ad6265SDimitry Andric bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
427081ad6265SDimitry Andric                                                       MachineIRBuilder &B,
427181ad6265SDimitry Andric                                                       uint64_t Offset,
427281ad6265SDimitry Andric                                                       Align Alignment) const {
427381ad6265SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
427481ad6265SDimitry Andric 
427581ad6265SDimitry Andric   assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
427681ad6265SDimitry Andric          "unexpected kernarg parameter type");
427781ad6265SDimitry Andric 
427881ad6265SDimitry Andric   Register Ptr = getKernargParameterPtr(B, Offset);
427981ad6265SDimitry Andric   MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
428081ad6265SDimitry Andric   B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
428181ad6265SDimitry Andric               MachineMemOperand::MODereferenceable |
428281ad6265SDimitry Andric                   MachineMemOperand::MOInvariant);
428381ad6265SDimitry Andric   MI.eraseFromParent();
428481ad6265SDimitry Andric   return true;
428581ad6265SDimitry Andric }
428681ad6265SDimitry Andric 
42878bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
42888bcb0991SDimitry Andric                                        MachineRegisterInfo &MRI,
42898bcb0991SDimitry Andric                                        MachineIRBuilder &B) const {
4290480093f4SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
4291480093f4SDimitry Andric   LLT DstTy = MRI.getType(Dst);
4292480093f4SDimitry Andric   LLT S16 = LLT::scalar(16);
4293480093f4SDimitry Andric   LLT S32 = LLT::scalar(32);
4294480093f4SDimitry Andric   LLT S64 = LLT::scalar(64);
42958bcb0991SDimitry Andric 
4296480093f4SDimitry Andric   if (DstTy == S16)
4297480093f4SDimitry Andric     return legalizeFDIV16(MI, MRI, B);
4298480093f4SDimitry Andric   if (DstTy == S32)
4299480093f4SDimitry Andric     return legalizeFDIV32(MI, MRI, B);
4300480093f4SDimitry Andric   if (DstTy == S64)
4301480093f4SDimitry Andric     return legalizeFDIV64(MI, MRI, B);
4302480093f4SDimitry Andric 
43038bcb0991SDimitry Andric   return false;
43048bcb0991SDimitry Andric }
43058bcb0991SDimitry Andric 
4306fe6060f1SDimitry Andric void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
4307fe6060f1SDimitry Andric                                                         Register DstDivReg,
4308fe6060f1SDimitry Andric                                                         Register DstRemReg,
43095ffd83dbSDimitry Andric                                                         Register X,
4310fe6060f1SDimitry Andric                                                         Register Y) const {
43115ffd83dbSDimitry Andric   const LLT S1 = LLT::scalar(1);
43125ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
43135ffd83dbSDimitry Andric 
43145ffd83dbSDimitry Andric   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
43155ffd83dbSDimitry Andric   // algorithm used here.
43165ffd83dbSDimitry Andric 
43175ffd83dbSDimitry Andric   // Initial estimate of inv(y).
43185ffd83dbSDimitry Andric   auto FloatY = B.buildUITOFP(S32, Y);
43195ffd83dbSDimitry Andric   auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
432006c3fb27SDimitry Andric   auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
43215ffd83dbSDimitry Andric   auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
43225ffd83dbSDimitry Andric   auto Z = B.buildFPTOUI(S32, ScaledY);
43235ffd83dbSDimitry Andric 
43245ffd83dbSDimitry Andric   // One round of UNR.
43255ffd83dbSDimitry Andric   auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
43265ffd83dbSDimitry Andric   auto NegYZ = B.buildMul(S32, NegY, Z);
43275ffd83dbSDimitry Andric   Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
43285ffd83dbSDimitry Andric 
43295ffd83dbSDimitry Andric   // Quotient/remainder estimate.
43305ffd83dbSDimitry Andric   auto Q = B.buildUMulH(S32, X, Z);
43315ffd83dbSDimitry Andric   auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
43325ffd83dbSDimitry Andric 
43335ffd83dbSDimitry Andric   // First quotient/remainder refinement.
43345ffd83dbSDimitry Andric   auto One = B.buildConstant(S32, 1);
43355ffd83dbSDimitry Andric   auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4336fe6060f1SDimitry Andric   if (DstDivReg)
43375ffd83dbSDimitry Andric     Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
43385ffd83dbSDimitry Andric   R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
43395ffd83dbSDimitry Andric 
43405ffd83dbSDimitry Andric   // Second quotient/remainder refinement.
43415ffd83dbSDimitry Andric   Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
4342fe6060f1SDimitry Andric   if (DstDivReg)
4343fe6060f1SDimitry Andric     B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
43445ffd83dbSDimitry Andric 
4345fe6060f1SDimitry Andric   if (DstRemReg)
4346fe6060f1SDimitry Andric     B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
43475ffd83dbSDimitry Andric }
43485ffd83dbSDimitry Andric 
4349349cc55cSDimitry Andric // Build integer reciprocal sequence around V_RCP_IFLAG_F32
43505ffd83dbSDimitry Andric //
43515ffd83dbSDimitry Andric // Return lo, hi of result
43525ffd83dbSDimitry Andric //
43535ffd83dbSDimitry Andric // %cvt.lo = G_UITOFP Val.lo
43545ffd83dbSDimitry Andric // %cvt.hi = G_UITOFP Val.hi
43555ffd83dbSDimitry Andric // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
43565ffd83dbSDimitry Andric // %rcp = G_AMDGPU_RCP_IFLAG %mad
43575ffd83dbSDimitry Andric // %mul1 = G_FMUL %rcp, 0x5f7ffffc
43585ffd83dbSDimitry Andric // %mul2 = G_FMUL %mul1, 2**(-32)
43595ffd83dbSDimitry Andric // %trunc = G_INTRINSIC_TRUNC %mul2
43605ffd83dbSDimitry Andric // %mad2 = G_FMAD %trunc, -(2**32), %mul1
43615ffd83dbSDimitry Andric // return {G_FPTOUI %mad2, G_FPTOUI %trunc}
43625ffd83dbSDimitry Andric static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
43635ffd83dbSDimitry Andric                                                        Register Val) {
43645ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
43655ffd83dbSDimitry Andric   auto Unmerge = B.buildUnmerge(S32, Val);
43665ffd83dbSDimitry Andric 
43675ffd83dbSDimitry Andric   auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
43685ffd83dbSDimitry Andric   auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
43695ffd83dbSDimitry Andric 
437006c3fb27SDimitry Andric   auto Mad = B.buildFMAD(
437106c3fb27SDimitry Andric       S32, CvtHi, // 2**32
437206c3fb27SDimitry Andric       B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);
43735ffd83dbSDimitry Andric 
43745ffd83dbSDimitry Andric   auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
437506c3fb27SDimitry Andric   auto Mul1 = B.buildFMul(
437606c3fb27SDimitry Andric       S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc)));
43775ffd83dbSDimitry Andric 
43785ffd83dbSDimitry Andric   // 2**(-32)
437906c3fb27SDimitry Andric   auto Mul2 = B.buildFMul(
438006c3fb27SDimitry Andric       S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000)));
43815ffd83dbSDimitry Andric   auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
43825ffd83dbSDimitry Andric 
43835ffd83dbSDimitry Andric   // -(2**32)
438406c3fb27SDimitry Andric   auto Mad2 = B.buildFMAD(
438506c3fb27SDimitry Andric       S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)),
438606c3fb27SDimitry Andric       Mul1);
43875ffd83dbSDimitry Andric 
43885ffd83dbSDimitry Andric   auto ResultLo = B.buildFPTOUI(S32, Mad2);
43895ffd83dbSDimitry Andric   auto ResultHi = B.buildFPTOUI(S32, Trunc);
43905ffd83dbSDimitry Andric 
43915ffd83dbSDimitry Andric   return {ResultLo.getReg(0), ResultHi.getReg(0)};
43925ffd83dbSDimitry Andric }
43935ffd83dbSDimitry Andric 
4394fe6060f1SDimitry Andric void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
4395fe6060f1SDimitry Andric                                                         Register DstDivReg,
4396fe6060f1SDimitry Andric                                                         Register DstRemReg,
43975ffd83dbSDimitry Andric                                                         Register Numer,
4398fe6060f1SDimitry Andric                                                         Register Denom) const {
43995ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
44005ffd83dbSDimitry Andric   const LLT S64 = LLT::scalar(64);
44015ffd83dbSDimitry Andric   const LLT S1 = LLT::scalar(1);
44025ffd83dbSDimitry Andric   Register RcpLo, RcpHi;
44035ffd83dbSDimitry Andric 
44045ffd83dbSDimitry Andric   std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
44055ffd83dbSDimitry Andric 
4406bdd1243dSDimitry Andric   auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});
44075ffd83dbSDimitry Andric 
44085ffd83dbSDimitry Andric   auto Zero64 = B.buildConstant(S64, 0);
44095ffd83dbSDimitry Andric   auto NegDenom = B.buildSub(S64, Zero64, Denom);
44105ffd83dbSDimitry Andric 
44115ffd83dbSDimitry Andric   auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
44125ffd83dbSDimitry Andric   auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
44135ffd83dbSDimitry Andric 
44145ffd83dbSDimitry Andric   auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
44155ffd83dbSDimitry Andric   Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
44165ffd83dbSDimitry Andric   Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
44175ffd83dbSDimitry Andric 
44185ffd83dbSDimitry Andric   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
44195ffd83dbSDimitry Andric   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
4420bdd1243dSDimitry Andric   auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});
44215ffd83dbSDimitry Andric 
44225ffd83dbSDimitry Andric   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
44235ffd83dbSDimitry Andric   auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
44245ffd83dbSDimitry Andric   auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
44255ffd83dbSDimitry Andric   Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
44265ffd83dbSDimitry Andric   Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
44275ffd83dbSDimitry Andric 
44285ffd83dbSDimitry Andric   auto Zero32 = B.buildConstant(S32, 0);
44295ffd83dbSDimitry Andric   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
4430349cc55cSDimitry Andric   auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
4431bdd1243dSDimitry Andric   auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});
44325ffd83dbSDimitry Andric 
44335ffd83dbSDimitry Andric   auto UnmergeNumer = B.buildUnmerge(S32, Numer);
44345ffd83dbSDimitry Andric   Register NumerLo = UnmergeNumer.getReg(0);
44355ffd83dbSDimitry Andric   Register NumerHi = UnmergeNumer.getReg(1);
44365ffd83dbSDimitry Andric 
44375ffd83dbSDimitry Andric   auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
44385ffd83dbSDimitry Andric   auto Mul3 = B.buildMul(S64, Denom, MulHi3);
44395ffd83dbSDimitry Andric   auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
44405ffd83dbSDimitry Andric   Register Mul3_Lo = UnmergeMul3.getReg(0);
44415ffd83dbSDimitry Andric   Register Mul3_Hi = UnmergeMul3.getReg(1);
44425ffd83dbSDimitry Andric   auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
44435ffd83dbSDimitry Andric   auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
44445ffd83dbSDimitry Andric   auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
4445bdd1243dSDimitry Andric   auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});
44465ffd83dbSDimitry Andric 
44475ffd83dbSDimitry Andric   auto UnmergeDenom = B.buildUnmerge(S32, Denom);
44485ffd83dbSDimitry Andric   Register DenomLo = UnmergeDenom.getReg(0);
44495ffd83dbSDimitry Andric   Register DenomHi = UnmergeDenom.getReg(1);
44505ffd83dbSDimitry Andric 
44515ffd83dbSDimitry Andric   auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
44525ffd83dbSDimitry Andric   auto C1 = B.buildSExt(S32, CmpHi);
44535ffd83dbSDimitry Andric 
44545ffd83dbSDimitry Andric   auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
44555ffd83dbSDimitry Andric   auto C2 = B.buildSExt(S32, CmpLo);
44565ffd83dbSDimitry Andric 
44575ffd83dbSDimitry Andric   auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
44585ffd83dbSDimitry Andric   auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
44595ffd83dbSDimitry Andric 
44605ffd83dbSDimitry Andric   // TODO: Here and below portions of the code can be enclosed into if/endif.
44615ffd83dbSDimitry Andric   // Currently control flow is unconditional and we have 4 selects after
44625ffd83dbSDimitry Andric   // potential endif to substitute PHIs.
44635ffd83dbSDimitry Andric 
44645ffd83dbSDimitry Andric   // if C3 != 0 ...
44655ffd83dbSDimitry Andric   auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
44665ffd83dbSDimitry Andric   auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
44675ffd83dbSDimitry Andric   auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
4468bdd1243dSDimitry Andric   auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});
44695ffd83dbSDimitry Andric 
44705ffd83dbSDimitry Andric   auto One64 = B.buildConstant(S64, 1);
44715ffd83dbSDimitry Andric   auto Add3 = B.buildAdd(S64, MulHi3, One64);
44725ffd83dbSDimitry Andric 
44735ffd83dbSDimitry Andric   auto C4 =
44745ffd83dbSDimitry Andric       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
44755ffd83dbSDimitry Andric   auto C5 =
44765ffd83dbSDimitry Andric       B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
44775ffd83dbSDimitry Andric   auto C6 = B.buildSelect(
44785ffd83dbSDimitry Andric       S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
44795ffd83dbSDimitry Andric 
44805ffd83dbSDimitry Andric   // if (C6 != 0)
44815ffd83dbSDimitry Andric   auto Add4 = B.buildAdd(S64, Add3, One64);
44825ffd83dbSDimitry Andric   auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
44835ffd83dbSDimitry Andric 
44845ffd83dbSDimitry Andric   auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
44855ffd83dbSDimitry Andric   auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
4486bdd1243dSDimitry Andric   auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});
44875ffd83dbSDimitry Andric 
44885ffd83dbSDimitry Andric   // endif C6
44895ffd83dbSDimitry Andric   // endif C3
44905ffd83dbSDimitry Andric 
4491fe6060f1SDimitry Andric   if (DstDivReg) {
44925ffd83dbSDimitry Andric     auto Sel1 = B.buildSelect(
44935ffd83dbSDimitry Andric         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
4494fe6060f1SDimitry Andric     B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
4495fe6060f1SDimitry Andric                   Sel1, MulHi3);
4496fe6060f1SDimitry Andric   }
4497fe6060f1SDimitry Andric 
4498fe6060f1SDimitry Andric   if (DstRemReg) {
44995ffd83dbSDimitry Andric     auto Sel2 = B.buildSelect(
45005ffd83dbSDimitry Andric         S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
4501fe6060f1SDimitry Andric     B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
4502fe6060f1SDimitry Andric                   Sel2, Sub1);
45035ffd83dbSDimitry Andric   }
45045ffd83dbSDimitry Andric }
45055ffd83dbSDimitry Andric 
4506fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
45075ffd83dbSDimitry Andric                                                   MachineRegisterInfo &MRI,
45085ffd83dbSDimitry Andric                                                   MachineIRBuilder &B) const {
4509fe6060f1SDimitry Andric   Register DstDivReg, DstRemReg;
4510fe6060f1SDimitry Andric   switch (MI.getOpcode()) {
4511fe6060f1SDimitry Andric   default:
4512fe6060f1SDimitry Andric     llvm_unreachable("Unexpected opcode!");
4513fe6060f1SDimitry Andric   case AMDGPU::G_UDIV: {
4514fe6060f1SDimitry Andric     DstDivReg = MI.getOperand(0).getReg();
4515fe6060f1SDimitry Andric     break;
4516fe6060f1SDimitry Andric   }
4517fe6060f1SDimitry Andric   case AMDGPU::G_UREM: {
4518fe6060f1SDimitry Andric     DstRemReg = MI.getOperand(0).getReg();
4519fe6060f1SDimitry Andric     break;
4520fe6060f1SDimitry Andric   }
4521fe6060f1SDimitry Andric   case AMDGPU::G_UDIVREM: {
4522fe6060f1SDimitry Andric     DstDivReg = MI.getOperand(0).getReg();
4523fe6060f1SDimitry Andric     DstRemReg = MI.getOperand(1).getReg();
4524fe6060f1SDimitry Andric     break;
4525fe6060f1SDimitry Andric   }
4526fe6060f1SDimitry Andric   }
4527fe6060f1SDimitry Andric 
45285ffd83dbSDimitry Andric   const LLT S64 = LLT::scalar(64);
45295ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
4530fe6060f1SDimitry Andric   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4531fe6060f1SDimitry Andric   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
4532fe6060f1SDimitry Andric   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
4533fe6060f1SDimitry Andric   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
45345ffd83dbSDimitry Andric 
45355ffd83dbSDimitry Andric   if (Ty == S32)
4536fe6060f1SDimitry Andric     legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
45375ffd83dbSDimitry Andric   else if (Ty == S64)
4538fe6060f1SDimitry Andric     legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
45395ffd83dbSDimitry Andric   else
45405ffd83dbSDimitry Andric     return false;
45415ffd83dbSDimitry Andric 
45425ffd83dbSDimitry Andric   MI.eraseFromParent();
45435ffd83dbSDimitry Andric   return true;
45445ffd83dbSDimitry Andric }
45455ffd83dbSDimitry Andric 
4546fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
45475ffd83dbSDimitry Andric                                                 MachineRegisterInfo &MRI,
45485ffd83dbSDimitry Andric                                                 MachineIRBuilder &B) const {
45495ffd83dbSDimitry Andric   const LLT S64 = LLT::scalar(64);
45505ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
45515ffd83dbSDimitry Andric 
4552fe6060f1SDimitry Andric   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
45535ffd83dbSDimitry Andric   if (Ty != S32 && Ty != S64)
45545ffd83dbSDimitry Andric     return false;
45555ffd83dbSDimitry Andric 
4556fe6060f1SDimitry Andric   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4557fe6060f1SDimitry Andric   Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
4558fe6060f1SDimitry Andric   Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
45595ffd83dbSDimitry Andric 
45605ffd83dbSDimitry Andric   auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
45615ffd83dbSDimitry Andric   auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
45625ffd83dbSDimitry Andric   auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
45635ffd83dbSDimitry Andric 
45645ffd83dbSDimitry Andric   LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
45655ffd83dbSDimitry Andric   RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
45665ffd83dbSDimitry Andric 
45675ffd83dbSDimitry Andric   LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
45685ffd83dbSDimitry Andric   RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
45695ffd83dbSDimitry Andric 
4570fe6060f1SDimitry Andric   Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
4571fe6060f1SDimitry Andric   switch (MI.getOpcode()) {
4572fe6060f1SDimitry Andric   default:
4573fe6060f1SDimitry Andric     llvm_unreachable("Unexpected opcode!");
4574fe6060f1SDimitry Andric   case AMDGPU::G_SDIV: {
4575fe6060f1SDimitry Andric     DstDivReg = MI.getOperand(0).getReg();
4576fe6060f1SDimitry Andric     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4577fe6060f1SDimitry Andric     break;
4578fe6060f1SDimitry Andric   }
4579fe6060f1SDimitry Andric   case AMDGPU::G_SREM: {
4580fe6060f1SDimitry Andric     DstRemReg = MI.getOperand(0).getReg();
4581fe6060f1SDimitry Andric     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4582fe6060f1SDimitry Andric     break;
4583fe6060f1SDimitry Andric   }
4584fe6060f1SDimitry Andric   case AMDGPU::G_SDIVREM: {
4585fe6060f1SDimitry Andric     DstDivReg = MI.getOperand(0).getReg();
4586fe6060f1SDimitry Andric     DstRemReg = MI.getOperand(1).getReg();
4587fe6060f1SDimitry Andric     TmpDivReg = MRI.createGenericVirtualRegister(Ty);
4588fe6060f1SDimitry Andric     TmpRemReg = MRI.createGenericVirtualRegister(Ty);
4589fe6060f1SDimitry Andric     break;
4590fe6060f1SDimitry Andric   }
4591fe6060f1SDimitry Andric   }
4592fe6060f1SDimitry Andric 
45935ffd83dbSDimitry Andric   if (Ty == S32)
4594fe6060f1SDimitry Andric     legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
45955ffd83dbSDimitry Andric   else
4596fe6060f1SDimitry Andric     legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
45975ffd83dbSDimitry Andric 
4598fe6060f1SDimitry Andric   if (DstDivReg) {
4599fe6060f1SDimitry Andric     auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
4600fe6060f1SDimitry Andric     auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
4601fe6060f1SDimitry Andric     B.buildSub(DstDivReg, SignXor, Sign);
4602fe6060f1SDimitry Andric   }
46035ffd83dbSDimitry Andric 
4604fe6060f1SDimitry Andric   if (DstRemReg) {
4605fe6060f1SDimitry Andric     auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
4606fe6060f1SDimitry Andric     auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
4607fe6060f1SDimitry Andric     B.buildSub(DstRemReg, SignXor, Sign);
4608fe6060f1SDimitry Andric   }
46095ffd83dbSDimitry Andric 
46105ffd83dbSDimitry Andric   MI.eraseFromParent();
46115ffd83dbSDimitry Andric   return true;
46125ffd83dbSDimitry Andric }
46135ffd83dbSDimitry Andric 
46148bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
46158bcb0991SDimitry Andric                                                  MachineRegisterInfo &MRI,
46168bcb0991SDimitry Andric                                                  MachineIRBuilder &B) const {
46178bcb0991SDimitry Andric   Register Res = MI.getOperand(0).getReg();
46188bcb0991SDimitry Andric   Register LHS = MI.getOperand(1).getReg();
46198bcb0991SDimitry Andric   Register RHS = MI.getOperand(2).getReg();
46208bcb0991SDimitry Andric   uint16_t Flags = MI.getFlags();
46218bcb0991SDimitry Andric   LLT ResTy = MRI.getType(Res);
46228bcb0991SDimitry Andric 
46238bcb0991SDimitry Andric   const MachineFunction &MF = B.getMF();
462406c3fb27SDimitry Andric   bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn) ||
462506c3fb27SDimitry Andric                             MF.getTarget().Options.UnsafeFPMath;
46268bcb0991SDimitry Andric 
46278bcb0991SDimitry Andric   if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
462806c3fb27SDimitry Andric     if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
462906c3fb27SDimitry Andric       return false;
463006c3fb27SDimitry Andric 
463106c3fb27SDimitry Andric     // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
463206c3fb27SDimitry Andric     // the CI documentation has a worst case error of 1 ulp.
463306c3fb27SDimitry Andric     // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
463406c3fb27SDimitry Andric     // use it as long as we aren't trying to use denormals.
463506c3fb27SDimitry Andric     //
463606c3fb27SDimitry Andric     // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
463706c3fb27SDimitry Andric 
46388bcb0991SDimitry Andric     // 1 / x -> RCP(x)
46398bcb0991SDimitry Andric     if (CLHS->isExactlyValue(1.0)) {
4640*5f757f3fSDimitry Andric       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
46418bcb0991SDimitry Andric           .addUse(RHS)
46428bcb0991SDimitry Andric           .setMIFlags(Flags);
46438bcb0991SDimitry Andric 
46448bcb0991SDimitry Andric       MI.eraseFromParent();
46458bcb0991SDimitry Andric       return true;
46468bcb0991SDimitry Andric     }
46478bcb0991SDimitry Andric 
46488bcb0991SDimitry Andric     // -1 / x -> RCP( FNEG(x) )
46498bcb0991SDimitry Andric     if (CLHS->isExactlyValue(-1.0)) {
46508bcb0991SDimitry Andric       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
4651*5f757f3fSDimitry Andric       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
46528bcb0991SDimitry Andric           .addUse(FNeg.getReg(0))
46538bcb0991SDimitry Andric           .setMIFlags(Flags);
46548bcb0991SDimitry Andric 
46558bcb0991SDimitry Andric       MI.eraseFromParent();
46568bcb0991SDimitry Andric       return true;
46578bcb0991SDimitry Andric     }
46588bcb0991SDimitry Andric   }
46598bcb0991SDimitry Andric 
4660*5f757f3fSDimitry Andric   // For f16 require afn or arcp.
4661*5f757f3fSDimitry Andric   // For f32 require afn.
466206c3fb27SDimitry Andric   if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
466306c3fb27SDimitry Andric                               !MI.getFlag(MachineInstr::FmArcp)))
466406c3fb27SDimitry Andric     return false;
466506c3fb27SDimitry Andric 
46668bcb0991SDimitry Andric   // x / y -> x * (1.0 / y)
4667*5f757f3fSDimitry Andric   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
46688bcb0991SDimitry Andric                  .addUse(RHS)
46698bcb0991SDimitry Andric                  .setMIFlags(Flags);
46708bcb0991SDimitry Andric   B.buildFMul(Res, LHS, RCP, Flags);
46718bcb0991SDimitry Andric 
46728bcb0991SDimitry Andric   MI.eraseFromParent();
46738bcb0991SDimitry Andric   return true;
46748bcb0991SDimitry Andric }
46758bcb0991SDimitry Andric 
4676e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
4677e8d8bef9SDimitry Andric                                                    MachineRegisterInfo &MRI,
4678e8d8bef9SDimitry Andric                                                    MachineIRBuilder &B) const {
4679e8d8bef9SDimitry Andric   Register Res = MI.getOperand(0).getReg();
4680e8d8bef9SDimitry Andric   Register X = MI.getOperand(1).getReg();
4681e8d8bef9SDimitry Andric   Register Y = MI.getOperand(2).getReg();
4682e8d8bef9SDimitry Andric   uint16_t Flags = MI.getFlags();
4683e8d8bef9SDimitry Andric   LLT ResTy = MRI.getType(Res);
4684e8d8bef9SDimitry Andric 
4685e8d8bef9SDimitry Andric   const MachineFunction &MF = B.getMF();
4686e8d8bef9SDimitry Andric   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
4687e8d8bef9SDimitry Andric                             MI.getFlag(MachineInstr::FmAfn);
4688e8d8bef9SDimitry Andric 
4689e8d8bef9SDimitry Andric   if (!AllowInaccurateRcp)
46908bcb0991SDimitry Andric     return false;
4691e8d8bef9SDimitry Andric 
4692e8d8bef9SDimitry Andric   auto NegY = B.buildFNeg(ResTy, Y);
4693e8d8bef9SDimitry Andric   auto One = B.buildFConstant(ResTy, 1.0);
4694e8d8bef9SDimitry Andric 
4695*5f757f3fSDimitry Andric   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
4696e8d8bef9SDimitry Andric                .addUse(Y)
4697e8d8bef9SDimitry Andric                .setMIFlags(Flags);
4698e8d8bef9SDimitry Andric 
4699e8d8bef9SDimitry Andric   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
4700e8d8bef9SDimitry Andric   R = B.buildFMA(ResTy, Tmp0, R, R);
4701e8d8bef9SDimitry Andric 
4702e8d8bef9SDimitry Andric   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
4703e8d8bef9SDimitry Andric   R = B.buildFMA(ResTy, Tmp1, R, R);
4704e8d8bef9SDimitry Andric 
4705e8d8bef9SDimitry Andric   auto Ret = B.buildFMul(ResTy, X, R);
4706e8d8bef9SDimitry Andric   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
4707e8d8bef9SDimitry Andric 
4708e8d8bef9SDimitry Andric   B.buildFMA(Res, Tmp2, R, Ret);
4709e8d8bef9SDimitry Andric   MI.eraseFromParent();
4710e8d8bef9SDimitry Andric   return true;
47118bcb0991SDimitry Andric }
47128bcb0991SDimitry Andric 
4713480093f4SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
4714480093f4SDimitry Andric                                          MachineRegisterInfo &MRI,
4715480093f4SDimitry Andric                                          MachineIRBuilder &B) const {
4716e8d8bef9SDimitry Andric   if (legalizeFastUnsafeFDIV(MI, MRI, B))
4717e8d8bef9SDimitry Andric     return true;
4718e8d8bef9SDimitry Andric 
4719480093f4SDimitry Andric   Register Res = MI.getOperand(0).getReg();
4720480093f4SDimitry Andric   Register LHS = MI.getOperand(1).getReg();
4721480093f4SDimitry Andric   Register RHS = MI.getOperand(2).getReg();
4722480093f4SDimitry Andric 
4723480093f4SDimitry Andric   uint16_t Flags = MI.getFlags();
4724480093f4SDimitry Andric 
4725480093f4SDimitry Andric   LLT S16 = LLT::scalar(16);
4726480093f4SDimitry Andric   LLT S32 = LLT::scalar(32);
4727480093f4SDimitry Andric 
4728480093f4SDimitry Andric   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
4729480093f4SDimitry Andric   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
4730480093f4SDimitry Andric 
4731*5f757f3fSDimitry Andric   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
4732480093f4SDimitry Andric                  .addUse(RHSExt.getReg(0))
4733480093f4SDimitry Andric                  .setMIFlags(Flags);
4734480093f4SDimitry Andric 
4735480093f4SDimitry Andric   auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
4736480093f4SDimitry Andric   auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
4737480093f4SDimitry Andric 
4738*5f757f3fSDimitry Andric   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
4739480093f4SDimitry Andric       .addUse(RDst.getReg(0))
4740480093f4SDimitry Andric       .addUse(RHS)
4741480093f4SDimitry Andric       .addUse(LHS)
4742480093f4SDimitry Andric       .setMIFlags(Flags);
4743480093f4SDimitry Andric 
4744480093f4SDimitry Andric   MI.eraseFromParent();
4745480093f4SDimitry Andric   return true;
4746480093f4SDimitry Andric }
4747480093f4SDimitry Andric 
4748*5f757f3fSDimitry Andric static const unsigned SPDenormModeBitField =
4749*5f757f3fSDimitry Andric     AMDGPU::Hwreg::ID_MODE | (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
4750*5f757f3fSDimitry Andric     (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
4751*5f757f3fSDimitry Andric 
4752480093f4SDimitry Andric // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
4753480093f4SDimitry Andric // to enable denorm mode. When 'Enable' is false, disable denorm mode.
475406c3fb27SDimitry Andric static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B,
4755480093f4SDimitry Andric                                const GCNSubtarget &ST,
475606c3fb27SDimitry Andric                                SIModeRegisterDefaults Mode) {
4757480093f4SDimitry Andric   // Set SP denorm mode to this value.
4758480093f4SDimitry Andric   unsigned SPDenormMode =
47595ffd83dbSDimitry Andric     Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
4760480093f4SDimitry Andric 
4761480093f4SDimitry Andric   if (ST.hasDenormModeInst()) {
4762480093f4SDimitry Andric     // Preserve default FP64FP16 denorm mode while updating FP32 mode.
47635ffd83dbSDimitry Andric     uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
4764480093f4SDimitry Andric 
47655ffd83dbSDimitry Andric     uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
4766480093f4SDimitry Andric     B.buildInstr(AMDGPU::S_DENORM_MODE)
4767480093f4SDimitry Andric       .addImm(NewDenormModeValue);
4768480093f4SDimitry Andric 
4769480093f4SDimitry Andric   } else {
4770480093f4SDimitry Andric     B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
4771480093f4SDimitry Andric       .addImm(SPDenormMode)
4772480093f4SDimitry Andric       .addImm(SPDenormModeBitField);
4773480093f4SDimitry Andric   }
4774480093f4SDimitry Andric }
4775480093f4SDimitry Andric 
4776480093f4SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
4777480093f4SDimitry Andric                                          MachineRegisterInfo &MRI,
4778480093f4SDimitry Andric                                          MachineIRBuilder &B) const {
4779e8d8bef9SDimitry Andric   if (legalizeFastUnsafeFDIV(MI, MRI, B))
4780e8d8bef9SDimitry Andric     return true;
4781e8d8bef9SDimitry Andric 
4782480093f4SDimitry Andric   Register Res = MI.getOperand(0).getReg();
4783480093f4SDimitry Andric   Register LHS = MI.getOperand(1).getReg();
4784480093f4SDimitry Andric   Register RHS = MI.getOperand(2).getReg();
4785480093f4SDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
478606c3fb27SDimitry Andric   SIModeRegisterDefaults Mode = MFI->getMode();
4787480093f4SDimitry Andric 
4788480093f4SDimitry Andric   uint16_t Flags = MI.getFlags();
4789480093f4SDimitry Andric 
4790480093f4SDimitry Andric   LLT S32 = LLT::scalar(32);
4791480093f4SDimitry Andric   LLT S1 = LLT::scalar(1);
4792480093f4SDimitry Andric 
4793480093f4SDimitry Andric   auto One = B.buildFConstant(S32, 1.0f);
4794480093f4SDimitry Andric 
4795480093f4SDimitry Andric   auto DenominatorScaled =
4796*5f757f3fSDimitry Andric       B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
4797480093f4SDimitry Andric           .addUse(LHS)
47985ffd83dbSDimitry Andric           .addUse(RHS)
47995ffd83dbSDimitry Andric           .addImm(0)
4800480093f4SDimitry Andric           .setMIFlags(Flags);
4801480093f4SDimitry Andric   auto NumeratorScaled =
4802*5f757f3fSDimitry Andric       B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
4803480093f4SDimitry Andric           .addUse(LHS)
4804480093f4SDimitry Andric           .addUse(RHS)
48055ffd83dbSDimitry Andric           .addImm(1)
4806480093f4SDimitry Andric           .setMIFlags(Flags);
4807480093f4SDimitry Andric 
4808*5f757f3fSDimitry Andric   auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
4809480093f4SDimitry Andric                        .addUse(DenominatorScaled.getReg(0))
4810480093f4SDimitry Andric                        .setMIFlags(Flags);
4811480093f4SDimitry Andric   auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
4812480093f4SDimitry Andric 
4813*5f757f3fSDimitry Andric   const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE();
4814*5f757f3fSDimitry Andric   const bool HasDynamicDenormals =
4815*5f757f3fSDimitry Andric       (Mode.FP32Denormals.Input == DenormalMode::Dynamic) ||
4816*5f757f3fSDimitry Andric       (Mode.FP32Denormals.Output == DenormalMode::Dynamic);
4817*5f757f3fSDimitry Andric 
4818*5f757f3fSDimitry Andric   Register SavedSPDenormMode;
4819*5f757f3fSDimitry Andric   if (!PreservesDenormals) {
4820*5f757f3fSDimitry Andric     if (HasDynamicDenormals) {
4821*5f757f3fSDimitry Andric       SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4822*5f757f3fSDimitry Andric       B.buildInstr(AMDGPU::S_GETREG_B32)
4823*5f757f3fSDimitry Andric           .addDef(SavedSPDenormMode)
4824*5f757f3fSDimitry Andric           .addImm(SPDenormModeBitField);
4825*5f757f3fSDimitry Andric     }
4826480093f4SDimitry Andric     toggleSPDenormMode(true, B, ST, Mode);
4827*5f757f3fSDimitry Andric   }
4828480093f4SDimitry Andric 
4829480093f4SDimitry Andric   auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
4830480093f4SDimitry Andric   auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
4831480093f4SDimitry Andric   auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
4832480093f4SDimitry Andric   auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
4833480093f4SDimitry Andric   auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
4834480093f4SDimitry Andric   auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
4835480093f4SDimitry Andric 
4836*5f757f3fSDimitry Andric   if (!PreservesDenormals) {
4837*5f757f3fSDimitry Andric     if (HasDynamicDenormals) {
4838*5f757f3fSDimitry Andric       assert(SavedSPDenormMode);
4839*5f757f3fSDimitry Andric       B.buildInstr(AMDGPU::S_SETREG_B32)
4840*5f757f3fSDimitry Andric           .addReg(SavedSPDenormMode)
4841*5f757f3fSDimitry Andric           .addImm(SPDenormModeBitField);
4842*5f757f3fSDimitry Andric     } else
4843480093f4SDimitry Andric       toggleSPDenormMode(false, B, ST, Mode);
4844*5f757f3fSDimitry Andric   }
4845480093f4SDimitry Andric 
4846*5f757f3fSDimitry Andric   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
4847480093f4SDimitry Andric                   .addUse(Fma4.getReg(0))
4848480093f4SDimitry Andric                   .addUse(Fma1.getReg(0))
4849480093f4SDimitry Andric                   .addUse(Fma3.getReg(0))
4850480093f4SDimitry Andric                   .addUse(NumeratorScaled.getReg(1))
4851480093f4SDimitry Andric                   .setMIFlags(Flags);
4852480093f4SDimitry Andric 
4853*5f757f3fSDimitry Andric   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
4854480093f4SDimitry Andric       .addUse(Fmas.getReg(0))
4855480093f4SDimitry Andric       .addUse(RHS)
4856480093f4SDimitry Andric       .addUse(LHS)
4857480093f4SDimitry Andric       .setMIFlags(Flags);
4858480093f4SDimitry Andric 
4859480093f4SDimitry Andric   MI.eraseFromParent();
4860480093f4SDimitry Andric   return true;
4861480093f4SDimitry Andric }
4862480093f4SDimitry Andric 
4863480093f4SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
4864480093f4SDimitry Andric                                          MachineRegisterInfo &MRI,
4865480093f4SDimitry Andric                                          MachineIRBuilder &B) const {
4866e8d8bef9SDimitry Andric   if (legalizeFastUnsafeFDIV64(MI, MRI, B))
4867e8d8bef9SDimitry Andric     return true;
4868e8d8bef9SDimitry Andric 
4869480093f4SDimitry Andric   Register Res = MI.getOperand(0).getReg();
4870480093f4SDimitry Andric   Register LHS = MI.getOperand(1).getReg();
4871480093f4SDimitry Andric   Register RHS = MI.getOperand(2).getReg();
4872480093f4SDimitry Andric 
4873480093f4SDimitry Andric   uint16_t Flags = MI.getFlags();
4874480093f4SDimitry Andric 
4875480093f4SDimitry Andric   LLT S64 = LLT::scalar(64);
4876480093f4SDimitry Andric   LLT S1 = LLT::scalar(1);
4877480093f4SDimitry Andric 
4878480093f4SDimitry Andric   auto One = B.buildFConstant(S64, 1.0);
4879480093f4SDimitry Andric 
4880*5f757f3fSDimitry Andric   auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
4881480093f4SDimitry Andric                        .addUse(LHS)
4882480093f4SDimitry Andric                        .addUse(RHS)
48835ffd83dbSDimitry Andric                        .addImm(0)
4884480093f4SDimitry Andric                        .setMIFlags(Flags);
4885480093f4SDimitry Andric 
4886480093f4SDimitry Andric   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
4887480093f4SDimitry Andric 
4888*5f757f3fSDimitry Andric   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
4889480093f4SDimitry Andric                  .addUse(DivScale0.getReg(0))
4890480093f4SDimitry Andric                  .setMIFlags(Flags);
4891480093f4SDimitry Andric 
4892480093f4SDimitry Andric   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
4893480093f4SDimitry Andric   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
4894480093f4SDimitry Andric   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
4895480093f4SDimitry Andric 
4896*5f757f3fSDimitry Andric   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
4897480093f4SDimitry Andric                        .addUse(LHS)
4898480093f4SDimitry Andric                        .addUse(RHS)
48995ffd83dbSDimitry Andric                        .addImm(1)
4900480093f4SDimitry Andric                        .setMIFlags(Flags);
4901480093f4SDimitry Andric 
4902480093f4SDimitry Andric   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
49035ffd83dbSDimitry Andric   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
4904480093f4SDimitry Andric   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
4905480093f4SDimitry Andric 
4906480093f4SDimitry Andric   Register Scale;
4907480093f4SDimitry Andric   if (!ST.hasUsableDivScaleConditionOutput()) {
4908480093f4SDimitry Andric     // Workaround a hardware bug on SI where the condition output from div_scale
4909480093f4SDimitry Andric     // is not usable.
4910480093f4SDimitry Andric 
4911480093f4SDimitry Andric     LLT S32 = LLT::scalar(32);
4912480093f4SDimitry Andric 
4913480093f4SDimitry Andric     auto NumUnmerge = B.buildUnmerge(S32, LHS);
4914480093f4SDimitry Andric     auto DenUnmerge = B.buildUnmerge(S32, RHS);
4915480093f4SDimitry Andric     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
4916480093f4SDimitry Andric     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
4917480093f4SDimitry Andric 
4918480093f4SDimitry Andric     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
4919480093f4SDimitry Andric                               Scale1Unmerge.getReg(1));
4920480093f4SDimitry Andric     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
4921480093f4SDimitry Andric                               Scale0Unmerge.getReg(1));
49225ffd83dbSDimitry Andric     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
4923480093f4SDimitry Andric   } else {
4924480093f4SDimitry Andric     Scale = DivScale1.getReg(1);
4925480093f4SDimitry Andric   }
4926480093f4SDimitry Andric 
4927*5f757f3fSDimitry Andric   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
4928480093f4SDimitry Andric                   .addUse(Fma4.getReg(0))
4929480093f4SDimitry Andric                   .addUse(Fma3.getReg(0))
4930480093f4SDimitry Andric                   .addUse(Mul.getReg(0))
4931480093f4SDimitry Andric                   .addUse(Scale)
4932480093f4SDimitry Andric                   .setMIFlags(Flags);
4933480093f4SDimitry Andric 
4934*5f757f3fSDimitry Andric   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
4935480093f4SDimitry Andric       .addUse(Fmas.getReg(0))
4936480093f4SDimitry Andric       .addUse(RHS)
4937480093f4SDimitry Andric       .addUse(LHS)
4938480093f4SDimitry Andric       .setMIFlags(Flags);
4939480093f4SDimitry Andric 
4940480093f4SDimitry Andric   MI.eraseFromParent();
4941480093f4SDimitry Andric   return true;
4942480093f4SDimitry Andric }
4943480093f4SDimitry Andric 
494406c3fb27SDimitry Andric bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI,
494506c3fb27SDimitry Andric                                          MachineRegisterInfo &MRI,
494606c3fb27SDimitry Andric                                          MachineIRBuilder &B) const {
494706c3fb27SDimitry Andric   Register Res0 = MI.getOperand(0).getReg();
494806c3fb27SDimitry Andric   Register Res1 = MI.getOperand(1).getReg();
494906c3fb27SDimitry Andric   Register Val = MI.getOperand(2).getReg();
495006c3fb27SDimitry Andric   uint16_t Flags = MI.getFlags();
495106c3fb27SDimitry Andric 
495206c3fb27SDimitry Andric   LLT Ty = MRI.getType(Res0);
495306c3fb27SDimitry Andric   LLT InstrExpTy = Ty == LLT::scalar(16) ? LLT::scalar(16) : LLT::scalar(32);
495406c3fb27SDimitry Andric 
4955*5f757f3fSDimitry Andric   auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
495606c3fb27SDimitry Andric                   .addUse(Val)
495706c3fb27SDimitry Andric                   .setMIFlags(Flags);
4958*5f757f3fSDimitry Andric   auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
495906c3fb27SDimitry Andric                  .addUse(Val)
496006c3fb27SDimitry Andric                  .setMIFlags(Flags);
496106c3fb27SDimitry Andric 
496206c3fb27SDimitry Andric   if (ST.hasFractBug()) {
496306c3fb27SDimitry Andric     auto Fabs = B.buildFAbs(Ty, Val);
496406c3fb27SDimitry Andric     auto Inf = B.buildFConstant(Ty, APFloat::getInf(getFltSemanticForLLT(Ty)));
496506c3fb27SDimitry Andric     auto IsFinite =
496606c3fb27SDimitry Andric         B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
496706c3fb27SDimitry Andric     auto Zero = B.buildConstant(InstrExpTy, 0);
496806c3fb27SDimitry Andric     Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
496906c3fb27SDimitry Andric     Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
497006c3fb27SDimitry Andric   }
497106c3fb27SDimitry Andric 
497206c3fb27SDimitry Andric   B.buildCopy(Res0, Mant);
497306c3fb27SDimitry Andric   B.buildSExtOrTrunc(Res1, Exp);
497406c3fb27SDimitry Andric 
497506c3fb27SDimitry Andric   MI.eraseFromParent();
497606c3fb27SDimitry Andric   return true;
497706c3fb27SDimitry Andric }
497806c3fb27SDimitry Andric 
49798bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
49808bcb0991SDimitry Andric                                                  MachineRegisterInfo &MRI,
49818bcb0991SDimitry Andric                                                  MachineIRBuilder &B) const {
49828bcb0991SDimitry Andric   Register Res = MI.getOperand(0).getReg();
49838bcb0991SDimitry Andric   Register LHS = MI.getOperand(2).getReg();
49848bcb0991SDimitry Andric   Register RHS = MI.getOperand(3).getReg();
49858bcb0991SDimitry Andric   uint16_t Flags = MI.getFlags();
49868bcb0991SDimitry Andric 
49878bcb0991SDimitry Andric   LLT S32 = LLT::scalar(32);
49888bcb0991SDimitry Andric   LLT S1 = LLT::scalar(1);
49898bcb0991SDimitry Andric 
49908bcb0991SDimitry Andric   auto Abs = B.buildFAbs(S32, RHS, Flags);
49918bcb0991SDimitry Andric   const APFloat C0Val(1.0f);
49928bcb0991SDimitry Andric 
499306c3fb27SDimitry Andric   auto C0 = B.buildFConstant(S32, 0x1p+96f);
499406c3fb27SDimitry Andric   auto C1 = B.buildFConstant(S32, 0x1p-32f);
499506c3fb27SDimitry Andric   auto C2 = B.buildFConstant(S32, 1.0f);
49968bcb0991SDimitry Andric 
49978bcb0991SDimitry Andric   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
49988bcb0991SDimitry Andric   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
49998bcb0991SDimitry Andric 
50008bcb0991SDimitry Andric   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
50018bcb0991SDimitry Andric 
5002*5f757f3fSDimitry Andric   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
50038bcb0991SDimitry Andric                  .addUse(Mul0.getReg(0))
50048bcb0991SDimitry Andric                  .setMIFlags(Flags);
50058bcb0991SDimitry Andric 
50068bcb0991SDimitry Andric   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
50078bcb0991SDimitry Andric 
50088bcb0991SDimitry Andric   B.buildFMul(Res, Sel, Mul1, Flags);
50098bcb0991SDimitry Andric 
50108bcb0991SDimitry Andric   MI.eraseFromParent();
50118bcb0991SDimitry Andric   return true;
50128bcb0991SDimitry Andric }
50138bcb0991SDimitry Andric 
5014*5f757f3fSDimitry Andric bool AMDGPULegalizerInfo::legalizeFSQRTF16(MachineInstr &MI,
5015*5f757f3fSDimitry Andric                                            MachineRegisterInfo &MRI,
5016*5f757f3fSDimitry Andric                                            MachineIRBuilder &B) const {
5017*5f757f3fSDimitry Andric   // Bypass the correct expansion a standard promotion through G_FSQRT would
5018*5f757f3fSDimitry Andric   // get. The f32 op is accurate enough for the f16 cas.
5019*5f757f3fSDimitry Andric   unsigned Flags = MI.getFlags();
5020*5f757f3fSDimitry Andric   assert(!ST.has16BitInsts());
5021*5f757f3fSDimitry Andric   const LLT F32 = LLT::scalar(32);
5022*5f757f3fSDimitry Andric   auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
5023*5f757f3fSDimitry Andric   auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
5024*5f757f3fSDimitry Andric     .addUse(Ext.getReg(0))
5025*5f757f3fSDimitry Andric     .setMIFlags(Flags);
5026*5f757f3fSDimitry Andric   B.buildFPTrunc(MI.getOperand(0), Log2, Flags);
5027*5f757f3fSDimitry Andric   MI.eraseFromParent();
5028*5f757f3fSDimitry Andric   return true;
5029*5f757f3fSDimitry Andric }
5030*5f757f3fSDimitry Andric 
5031*5f757f3fSDimitry Andric bool AMDGPULegalizerInfo::legalizeFSQRTF32(MachineInstr &MI,
5032*5f757f3fSDimitry Andric                                            MachineRegisterInfo &MRI,
5033*5f757f3fSDimitry Andric                                            MachineIRBuilder &B) const {
5034*5f757f3fSDimitry Andric   MachineFunction &MF = B.getMF();
5035*5f757f3fSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
5036*5f757f3fSDimitry Andric   Register X = MI.getOperand(1).getReg();
5037*5f757f3fSDimitry Andric   const unsigned Flags = MI.getFlags();
5038*5f757f3fSDimitry Andric   const LLT S1 = LLT::scalar(1);
5039*5f757f3fSDimitry Andric   const LLT F32 = LLT::scalar(32);
5040*5f757f3fSDimitry Andric   const LLT I32 = LLT::scalar(32);
5041*5f757f3fSDimitry Andric 
5042*5f757f3fSDimitry Andric   if (allowApproxFunc(MF, Flags)) {
5043*5f757f3fSDimitry Andric     B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({Dst}))
5044*5f757f3fSDimitry Andric       .addUse(X)
5045*5f757f3fSDimitry Andric       .setMIFlags(Flags);
5046*5f757f3fSDimitry Andric     MI.eraseFromParent();
5047*5f757f3fSDimitry Andric     return true;
5048*5f757f3fSDimitry Andric   }
5049*5f757f3fSDimitry Andric 
5050*5f757f3fSDimitry Andric   auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);
5051*5f757f3fSDimitry Andric   auto NeedScale = B.buildFCmp(CmpInst::FCMP_OGT, S1, ScaleThreshold, X, Flags);
5052*5f757f3fSDimitry Andric   auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
5053*5f757f3fSDimitry Andric   auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
5054*5f757f3fSDimitry Andric   auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);
5055*5f757f3fSDimitry Andric 
5056*5f757f3fSDimitry Andric   Register SqrtS = MRI.createGenericVirtualRegister(F32);
5057*5f757f3fSDimitry Andric   if (needsDenormHandlingF32(MF, X, Flags)) {
5058*5f757f3fSDimitry Andric     B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({SqrtS}))
5059*5f757f3fSDimitry Andric       .addUse(SqrtX.getReg(0))
5060*5f757f3fSDimitry Andric       .setMIFlags(Flags);
5061*5f757f3fSDimitry Andric 
5062*5f757f3fSDimitry Andric     auto NegOne = B.buildConstant(I32, -1);
5063*5f757f3fSDimitry Andric     auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);
5064*5f757f3fSDimitry Andric 
5065*5f757f3fSDimitry Andric     auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
5066*5f757f3fSDimitry Andric     auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
5067*5f757f3fSDimitry Andric 
5068*5f757f3fSDimitry Andric     auto PosOne = B.buildConstant(I32, 1);
5069*5f757f3fSDimitry Andric     auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);
5070*5f757f3fSDimitry Andric 
5071*5f757f3fSDimitry Andric     auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
5072*5f757f3fSDimitry Andric     auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
5073*5f757f3fSDimitry Andric 
5074*5f757f3fSDimitry Andric     auto Zero = B.buildFConstant(F32, 0.0f);
5075*5f757f3fSDimitry Andric     auto SqrtVPLE0 = B.buildFCmp(CmpInst::FCMP_OLE, S1, SqrtVP, Zero, Flags);
5076*5f757f3fSDimitry Andric 
5077*5f757f3fSDimitry Andric     SqrtS =
5078*5f757f3fSDimitry Andric         B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);
5079*5f757f3fSDimitry Andric 
5080*5f757f3fSDimitry Andric     auto SqrtVPVSGT0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, SqrtVS, Zero, Flags);
5081*5f757f3fSDimitry Andric     SqrtS =
5082*5f757f3fSDimitry Andric         B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);
5083*5f757f3fSDimitry Andric   } else {
5084*5f757f3fSDimitry Andric     auto SqrtR =
5085*5f757f3fSDimitry Andric         B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
5086*5f757f3fSDimitry Andric     B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);
5087*5f757f3fSDimitry Andric 
5088*5f757f3fSDimitry Andric     auto Half = B.buildFConstant(F32, 0.5f);
5089*5f757f3fSDimitry Andric     auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
5090*5f757f3fSDimitry Andric     auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
5091*5f757f3fSDimitry Andric     auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
5092*5f757f3fSDimitry Andric     SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
5093*5f757f3fSDimitry Andric     SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
5094*5f757f3fSDimitry Andric     auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
5095*5f757f3fSDimitry Andric     auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
5096*5f757f3fSDimitry Andric     SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);
5097*5f757f3fSDimitry Andric   }
5098*5f757f3fSDimitry Andric 
5099*5f757f3fSDimitry Andric   auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);
5100*5f757f3fSDimitry Andric 
5101*5f757f3fSDimitry Andric   auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);
5102*5f757f3fSDimitry Andric 
5103*5f757f3fSDimitry Andric   SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);
5104*5f757f3fSDimitry Andric 
5105*5f757f3fSDimitry Andric   auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
5106*5f757f3fSDimitry Andric   B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);
5107*5f757f3fSDimitry Andric 
5108*5f757f3fSDimitry Andric   MI.eraseFromParent();
5109*5f757f3fSDimitry Andric   return true;
5110*5f757f3fSDimitry Andric }
5111*5f757f3fSDimitry Andric 
5112*5f757f3fSDimitry Andric bool AMDGPULegalizerInfo::legalizeFSQRTF64(MachineInstr &MI,
511306c3fb27SDimitry Andric                                            MachineRegisterInfo &MRI,
511406c3fb27SDimitry Andric                                            MachineIRBuilder &B) const {
511506c3fb27SDimitry Andric   // For double type, the SQRT and RSQ instructions don't have required
511606c3fb27SDimitry Andric   // precision, we apply Goldschmidt's algorithm to improve the result:
511706c3fb27SDimitry Andric   //
511806c3fb27SDimitry Andric   //   y0 = rsq(x)
511906c3fb27SDimitry Andric   //   g0 = x * y0
512006c3fb27SDimitry Andric   //   h0 = 0.5 * y0
512106c3fb27SDimitry Andric   //
512206c3fb27SDimitry Andric   //   r0 = 0.5 - h0 * g0
512306c3fb27SDimitry Andric   //   g1 = g0 * r0 + g0
512406c3fb27SDimitry Andric   //   h1 = h0 * r0 + h0
512506c3fb27SDimitry Andric   //
512606c3fb27SDimitry Andric   //   r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
512706c3fb27SDimitry Andric   //   g2 = g1 * r1 + g1     g2 = d0 * h1 + g1
512806c3fb27SDimitry Andric   //   h2 = h1 * r1 + h1
512906c3fb27SDimitry Andric   //
513006c3fb27SDimitry Andric   //   r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
513106c3fb27SDimitry Andric   //   g3 = g2 * r2 + g2     g3 = d1 * h1 + g2
513206c3fb27SDimitry Andric   //
513306c3fb27SDimitry Andric   //   sqrt(x) = g3
513406c3fb27SDimitry Andric 
513506c3fb27SDimitry Andric   const LLT S1 = LLT::scalar(1);
513606c3fb27SDimitry Andric   const LLT S32 = LLT::scalar(32);
513706c3fb27SDimitry Andric   const LLT F64 = LLT::scalar(64);
513806c3fb27SDimitry Andric 
513906c3fb27SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
514006c3fb27SDimitry Andric   assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");
514106c3fb27SDimitry Andric 
514206c3fb27SDimitry Andric   Register X = MI.getOperand(1).getReg();
514306c3fb27SDimitry Andric   unsigned Flags = MI.getFlags();
514406c3fb27SDimitry Andric 
514506c3fb27SDimitry Andric   auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);
514606c3fb27SDimitry Andric 
514706c3fb27SDimitry Andric   auto ZeroInt = B.buildConstant(S32, 0);
514806c3fb27SDimitry Andric   auto Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant);
514906c3fb27SDimitry Andric 
515006c3fb27SDimitry Andric   // Scale up input if it is too small.
515106c3fb27SDimitry Andric   auto ScaleUpFactor = B.buildConstant(S32, 256);
515206c3fb27SDimitry Andric   auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
515306c3fb27SDimitry Andric   auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags);
515406c3fb27SDimitry Andric 
5155*5f757f3fSDimitry Andric   auto SqrtY =
5156*5f757f3fSDimitry Andric       B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX.getReg(0));
515706c3fb27SDimitry Andric 
515806c3fb27SDimitry Andric   auto Half = B.buildFConstant(F64, 0.5);
515906c3fb27SDimitry Andric   auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
516006c3fb27SDimitry Andric   auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);
516106c3fb27SDimitry Andric 
516206c3fb27SDimitry Andric   auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
516306c3fb27SDimitry Andric   auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);
516406c3fb27SDimitry Andric 
516506c3fb27SDimitry Andric   auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
516606c3fb27SDimitry Andric   auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);
516706c3fb27SDimitry Andric 
516806c3fb27SDimitry Andric   auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
516906c3fb27SDimitry Andric   auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);
517006c3fb27SDimitry Andric 
517106c3fb27SDimitry Andric   auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);
517206c3fb27SDimitry Andric 
517306c3fb27SDimitry Andric   auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
517406c3fb27SDimitry Andric   auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);
517506c3fb27SDimitry Andric 
517606c3fb27SDimitry Andric   auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);
517706c3fb27SDimitry Andric 
517806c3fb27SDimitry Andric   // Scale down the result.
517906c3fb27SDimitry Andric   auto ScaleDownFactor = B.buildConstant(S32, -128);
518006c3fb27SDimitry Andric   auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
518106c3fb27SDimitry Andric   SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags);
518206c3fb27SDimitry Andric 
518306c3fb27SDimitry Andric   // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
518406c3fb27SDimitry Andric   // with finite only or nsz because rsq(+/-0) = +/-inf
518506c3fb27SDimitry Andric 
518606c3fb27SDimitry Andric   // TODO: Check for DAZ and expand to subnormals
518706c3fb27SDimitry Andric   auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
518806c3fb27SDimitry Andric 
518906c3fb27SDimitry Andric   // If x is +INF, +0, or -0, use its original value
519006c3fb27SDimitry Andric   B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);
519106c3fb27SDimitry Andric 
519206c3fb27SDimitry Andric   MI.eraseFromParent();
519306c3fb27SDimitry Andric   return true;
519406c3fb27SDimitry Andric }
519506c3fb27SDimitry Andric 
5196*5f757f3fSDimitry Andric bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI,
5197*5f757f3fSDimitry Andric                                         MachineRegisterInfo &MRI,
5198*5f757f3fSDimitry Andric                                         MachineIRBuilder &B) const {
5199*5f757f3fSDimitry Andric   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
5200*5f757f3fSDimitry Andric   if (Ty == LLT::scalar(32))
5201*5f757f3fSDimitry Andric     return legalizeFSQRTF32(MI, MRI, B);
5202*5f757f3fSDimitry Andric   if (Ty == LLT::scalar(64))
5203*5f757f3fSDimitry Andric     return legalizeFSQRTF64(MI, MRI, B);
5204*5f757f3fSDimitry Andric   if (Ty == LLT::scalar(16))
5205*5f757f3fSDimitry Andric     return legalizeFSQRTF16(MI, MRI, B);
5206*5f757f3fSDimitry Andric   return false;
5207*5f757f3fSDimitry Andric }
5208*5f757f3fSDimitry Andric 
5209e8d8bef9SDimitry Andric // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
5210e8d8bef9SDimitry Andric // FIXME: Why do we handle this one but not other removed instructions?
5211e8d8bef9SDimitry Andric //
5212e8d8bef9SDimitry Andric // Reciprocal square root.  The clamp prevents infinite results, clamping
5213e8d8bef9SDimitry Andric // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
5214e8d8bef9SDimitry Andric // +-max_float.
5215e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
5216e8d8bef9SDimitry Andric                                                     MachineRegisterInfo &MRI,
5217e8d8bef9SDimitry Andric                                                     MachineIRBuilder &B) const {
5218e8d8bef9SDimitry Andric   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
5219e8d8bef9SDimitry Andric     return true;
5220e8d8bef9SDimitry Andric 
5221e8d8bef9SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
5222e8d8bef9SDimitry Andric   Register Src = MI.getOperand(2).getReg();
5223e8d8bef9SDimitry Andric   auto Flags = MI.getFlags();
5224e8d8bef9SDimitry Andric 
5225e8d8bef9SDimitry Andric   LLT Ty = MRI.getType(Dst);
5226e8d8bef9SDimitry Andric 
5227e8d8bef9SDimitry Andric   const fltSemantics *FltSemantics;
5228e8d8bef9SDimitry Andric   if (Ty == LLT::scalar(32))
5229e8d8bef9SDimitry Andric     FltSemantics = &APFloat::IEEEsingle();
5230e8d8bef9SDimitry Andric   else if (Ty == LLT::scalar(64))
5231e8d8bef9SDimitry Andric     FltSemantics = &APFloat::IEEEdouble();
5232e8d8bef9SDimitry Andric   else
5233e8d8bef9SDimitry Andric     return false;
5234e8d8bef9SDimitry Andric 
5235*5f757f3fSDimitry Andric   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
5236e8d8bef9SDimitry Andric                  .addUse(Src)
5237e8d8bef9SDimitry Andric                  .setMIFlags(Flags);
5238e8d8bef9SDimitry Andric 
5239e8d8bef9SDimitry Andric   // We don't need to concern ourselves with the snan handling difference, since
5240e8d8bef9SDimitry Andric   // the rsq quieted (or not) so use the one which will directly select.
5241e8d8bef9SDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5242e8d8bef9SDimitry Andric   const bool UseIEEE = MFI->getMode().IEEE;
5243e8d8bef9SDimitry Andric 
5244e8d8bef9SDimitry Andric   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
5245e8d8bef9SDimitry Andric   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
5246e8d8bef9SDimitry Andric                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
5247e8d8bef9SDimitry Andric 
5248e8d8bef9SDimitry Andric   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
5249e8d8bef9SDimitry Andric 
5250e8d8bef9SDimitry Andric   if (UseIEEE)
5251e8d8bef9SDimitry Andric     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
5252e8d8bef9SDimitry Andric   else
5253e8d8bef9SDimitry Andric     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
5254e8d8bef9SDimitry Andric   MI.eraseFromParent();
5255e8d8bef9SDimitry Andric   return true;
5256e8d8bef9SDimitry Andric }
5257e8d8bef9SDimitry Andric 
5258e8d8bef9SDimitry Andric static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
5259e8d8bef9SDimitry Andric   switch (IID) {
5260e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_ds_fadd:
5261e8d8bef9SDimitry Andric     return AMDGPU::G_ATOMICRMW_FADD;
5262e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_ds_fmin:
5263e8d8bef9SDimitry Andric     return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
5264e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_ds_fmax:
5265e8d8bef9SDimitry Andric     return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
5266e8d8bef9SDimitry Andric   default:
5267e8d8bef9SDimitry Andric     llvm_unreachable("not a DS FP intrinsic");
5268e8d8bef9SDimitry Andric   }
5269e8d8bef9SDimitry Andric }
5270e8d8bef9SDimitry Andric 
5271e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
5272e8d8bef9SDimitry Andric                                                       MachineInstr &MI,
5273e8d8bef9SDimitry Andric                                                       Intrinsic::ID IID) const {
5274e8d8bef9SDimitry Andric   GISelChangeObserver &Observer = Helper.Observer;
5275e8d8bef9SDimitry Andric   Observer.changingInstr(MI);
5276e8d8bef9SDimitry Andric 
5277e8d8bef9SDimitry Andric   MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
5278e8d8bef9SDimitry Andric 
5279e8d8bef9SDimitry Andric   // The remaining operands were used to set fields in the MemOperand on
5280e8d8bef9SDimitry Andric   // construction.
5281e8d8bef9SDimitry Andric   for (int I = 6; I > 3; --I)
528281ad6265SDimitry Andric     MI.removeOperand(I);
5283e8d8bef9SDimitry Andric 
528481ad6265SDimitry Andric   MI.removeOperand(1); // Remove the intrinsic ID.
5285e8d8bef9SDimitry Andric   Observer.changedInstr(MI);
5286e8d8bef9SDimitry Andric   return true;
5287e8d8bef9SDimitry Andric }
5288e8d8bef9SDimitry Andric 
5289e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
5290e8d8bef9SDimitry Andric                                             MachineRegisterInfo &MRI,
5291e8d8bef9SDimitry Andric                                             MachineIRBuilder &B) const {
5292e8d8bef9SDimitry Andric   uint64_t Offset =
5293e8d8bef9SDimitry Andric     ST.getTargetLowering()->getImplicitParameterOffset(
5294e8d8bef9SDimitry Andric       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
5295e8d8bef9SDimitry Andric   LLT DstTy = MRI.getType(DstReg);
5296e8d8bef9SDimitry Andric   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
5297e8d8bef9SDimitry Andric 
5298e8d8bef9SDimitry Andric   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
5299e8d8bef9SDimitry Andric   if (!loadInputValue(KernargPtrReg, B,
5300e8d8bef9SDimitry Andric                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
5301e8d8bef9SDimitry Andric     return false;
5302e8d8bef9SDimitry Andric 
5303e8d8bef9SDimitry Andric   // FIXME: This should be nuw
5304e8d8bef9SDimitry Andric   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
5305e8d8bef9SDimitry Andric   return true;
5306e8d8bef9SDimitry Andric }
5307e8d8bef9SDimitry Andric 
530806c3fb27SDimitry Andric /// To create a buffer resource from a 64-bit pointer, mask off the upper 32
530906c3fb27SDimitry Andric /// bits of the pointer and replace them with the stride argument, then
531006c3fb27SDimitry Andric /// merge_values everything together. In the common case of a raw buffer (the
531106c3fb27SDimitry Andric /// stride component is 0), we can just AND off the upper half.
531206c3fb27SDimitry Andric bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin(
531306c3fb27SDimitry Andric     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
531406c3fb27SDimitry Andric   Register Result = MI.getOperand(0).getReg();
531506c3fb27SDimitry Andric   Register Pointer = MI.getOperand(2).getReg();
531606c3fb27SDimitry Andric   Register Stride = MI.getOperand(3).getReg();
531706c3fb27SDimitry Andric   Register NumRecords = MI.getOperand(4).getReg();
531806c3fb27SDimitry Andric   Register Flags = MI.getOperand(5).getReg();
531906c3fb27SDimitry Andric 
532006c3fb27SDimitry Andric   LLT S32 = LLT::scalar(32);
532106c3fb27SDimitry Andric 
532206c3fb27SDimitry Andric   B.setInsertPt(B.getMBB(), ++B.getInsertPt());
532306c3fb27SDimitry Andric   auto Unmerge = B.buildUnmerge(S32, Pointer);
532406c3fb27SDimitry Andric   Register LowHalf = Unmerge.getReg(0);
532506c3fb27SDimitry Andric   Register HighHalf = Unmerge.getReg(1);
532606c3fb27SDimitry Andric 
532706c3fb27SDimitry Andric   auto AndMask = B.buildConstant(S32, 0x0000ffff);
532806c3fb27SDimitry Andric   auto Masked = B.buildAnd(S32, HighHalf, AndMask);
532906c3fb27SDimitry Andric 
533006c3fb27SDimitry Andric   MachineInstrBuilder NewHighHalf = Masked;
533106c3fb27SDimitry Andric   std::optional<ValueAndVReg> StrideConst =
533206c3fb27SDimitry Andric       getIConstantVRegValWithLookThrough(Stride, MRI);
533306c3fb27SDimitry Andric   if (!StrideConst || !StrideConst->Value.isZero()) {
533406c3fb27SDimitry Andric     MachineInstrBuilder ShiftedStride;
533506c3fb27SDimitry Andric     if (StrideConst) {
533606c3fb27SDimitry Andric       uint32_t StrideVal = StrideConst->Value.getZExtValue();
533706c3fb27SDimitry Andric       uint32_t ShiftedStrideVal = StrideVal << 16;
533806c3fb27SDimitry Andric       ShiftedStride = B.buildConstant(S32, ShiftedStrideVal);
533906c3fb27SDimitry Andric     } else {
534006c3fb27SDimitry Andric       auto ExtStride = B.buildAnyExt(S32, Stride);
534106c3fb27SDimitry Andric       auto ShiftConst = B.buildConstant(S32, 16);
534206c3fb27SDimitry Andric       ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
534306c3fb27SDimitry Andric     }
534406c3fb27SDimitry Andric     NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
534506c3fb27SDimitry Andric   }
534606c3fb27SDimitry Andric   Register NewHighHalfReg = NewHighHalf.getReg(0);
534706c3fb27SDimitry Andric   B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
534806c3fb27SDimitry Andric   MI.eraseFromParent();
534906c3fb27SDimitry Andric   return true;
535006c3fb27SDimitry Andric }
535106c3fb27SDimitry Andric 
53520b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
53530b57cec5SDimitry Andric                                                  MachineRegisterInfo &MRI,
53540b57cec5SDimitry Andric                                                  MachineIRBuilder &B) const {
53550b57cec5SDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
53560b57cec5SDimitry Andric   if (!MFI->isEntryFunction()) {
53570b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
53580b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
53590b57cec5SDimitry Andric   }
53600b57cec5SDimitry Andric 
53610b57cec5SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
5362e8d8bef9SDimitry Andric   if (!getImplicitArgPtr(DstReg, MRI, B))
53630b57cec5SDimitry Andric     return false;
53640b57cec5SDimitry Andric 
53650b57cec5SDimitry Andric   MI.eraseFromParent();
53660b57cec5SDimitry Andric   return true;
53670b57cec5SDimitry Andric }
53680b57cec5SDimitry Andric 
5369fcaf7f86SDimitry Andric bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg,
5370fcaf7f86SDimitry Andric                                          MachineRegisterInfo &MRI,
5371fcaf7f86SDimitry Andric                                          MachineIRBuilder &B) const {
5372fcaf7f86SDimitry Andric   Function &F = B.getMF().getFunction();
5373bdd1243dSDimitry Andric   std::optional<uint32_t> KnownSize =
5374fcaf7f86SDimitry Andric       AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
5375fcaf7f86SDimitry Andric   if (KnownSize.has_value())
5376bdd1243dSDimitry Andric     B.buildConstant(DstReg, *KnownSize);
5377fcaf7f86SDimitry Andric   return false;
5378fcaf7f86SDimitry Andric }
5379fcaf7f86SDimitry Andric 
5380fcaf7f86SDimitry Andric bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI,
5381fcaf7f86SDimitry Andric                                               MachineRegisterInfo &MRI,
5382fcaf7f86SDimitry Andric                                               MachineIRBuilder &B) const {
5383fcaf7f86SDimitry Andric 
5384fcaf7f86SDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5385fcaf7f86SDimitry Andric   if (!MFI->isEntryFunction()) {
5386fcaf7f86SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
5387fcaf7f86SDimitry Andric                                       AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
5388fcaf7f86SDimitry Andric   }
5389fcaf7f86SDimitry Andric 
5390fcaf7f86SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
5391fcaf7f86SDimitry Andric   if (!getLDSKernelId(DstReg, MRI, B))
5392fcaf7f86SDimitry Andric     return false;
5393fcaf7f86SDimitry Andric 
5394fcaf7f86SDimitry Andric   MI.eraseFromParent();
5395fcaf7f86SDimitry Andric   return true;
5396fcaf7f86SDimitry Andric }
5397fcaf7f86SDimitry Andric 
53988bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
53998bcb0991SDimitry Andric                                               MachineRegisterInfo &MRI,
54008bcb0991SDimitry Andric                                               MachineIRBuilder &B,
54018bcb0991SDimitry Andric                                               unsigned AddrSpace) const {
54028bcb0991SDimitry Andric   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
5403e8d8bef9SDimitry Andric   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
5404e8d8bef9SDimitry Andric   Register Hi32 = Unmerge.getReg(1);
5405e8d8bef9SDimitry Andric 
54068bcb0991SDimitry Andric   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
54078bcb0991SDimitry Andric   MI.eraseFromParent();
54088bcb0991SDimitry Andric   return true;
54098bcb0991SDimitry Andric }
54108bcb0991SDimitry Andric 
54115ffd83dbSDimitry Andric // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
54125ffd83dbSDimitry Andric // offset (the offset that is included in bounds checking and swizzling, to be
54135ffd83dbSDimitry Andric // split between the instruction's voffset and immoffset fields) and soffset
54145ffd83dbSDimitry Andric // (the offset that is excluded from bounds checking and swizzling, to go in
54155ffd83dbSDimitry Andric // the instruction's soffset field).  This function takes the first kind of
54165ffd83dbSDimitry Andric // offset and figures out how to split it between voffset and immoffset.
5417fe6060f1SDimitry Andric std::pair<Register, unsigned>
54185ffd83dbSDimitry Andric AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
54195ffd83dbSDimitry Andric                                         Register OrigOffset) const {
5420*5f757f3fSDimitry Andric   const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
54215ffd83dbSDimitry Andric   Register BaseReg;
5422fe6060f1SDimitry Andric   unsigned ImmOffset;
54235ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
5424fe6060f1SDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
54255ffd83dbSDimitry Andric 
5426fe6060f1SDimitry Andric   std::tie(BaseReg, ImmOffset) =
5427fe6060f1SDimitry Andric       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
54285ffd83dbSDimitry Andric 
5429fe6060f1SDimitry Andric   // If BaseReg is a pointer, convert it to int.
5430fe6060f1SDimitry Andric   if (MRI.getType(BaseReg).isPointer())
5431fe6060f1SDimitry Andric     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
54325ffd83dbSDimitry Andric 
543306c3fb27SDimitry Andric   // If the immediate value is too big for the immoffset field, put only bits
543406c3fb27SDimitry Andric   // that would normally fit in the immoffset field. The remaining value that
543506c3fb27SDimitry Andric   // is copied/added for the voffset field is a large power of 2, and it
543606c3fb27SDimitry Andric   // stands more chance of being CSEd with the copy/add for another similar
543706c3fb27SDimitry Andric   // load/store.
543806c3fb27SDimitry Andric   // However, do not do that rounding down if that is a negative
543906c3fb27SDimitry Andric   // number, as it appears to be illegal to have a negative offset in the
544006c3fb27SDimitry Andric   // vgpr, even if adding the immediate offset makes it positive.
54415ffd83dbSDimitry Andric   unsigned Overflow = ImmOffset & ~MaxImm;
54425ffd83dbSDimitry Andric   ImmOffset -= Overflow;
54435ffd83dbSDimitry Andric   if ((int32_t)Overflow < 0) {
54445ffd83dbSDimitry Andric     Overflow += ImmOffset;
54455ffd83dbSDimitry Andric     ImmOffset = 0;
54465ffd83dbSDimitry Andric   }
54475ffd83dbSDimitry Andric 
54485ffd83dbSDimitry Andric   if (Overflow != 0) {
54495ffd83dbSDimitry Andric     if (!BaseReg) {
54505ffd83dbSDimitry Andric       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
54515ffd83dbSDimitry Andric     } else {
54525ffd83dbSDimitry Andric       auto OverflowVal = B.buildConstant(S32, Overflow);
54535ffd83dbSDimitry Andric       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
54545ffd83dbSDimitry Andric     }
54555ffd83dbSDimitry Andric   }
54565ffd83dbSDimitry Andric 
54575ffd83dbSDimitry Andric   if (!BaseReg)
54585ffd83dbSDimitry Andric     BaseReg = B.buildConstant(S32, 0).getReg(0);
54595ffd83dbSDimitry Andric 
5460bdd1243dSDimitry Andric   return std::pair(BaseReg, ImmOffset);
5461fe6060f1SDimitry Andric }
5462fe6060f1SDimitry Andric 
54638bcb0991SDimitry Andric /// Handle register layout difference for f16 images for some subtargets.
54648bcb0991SDimitry Andric Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
54658bcb0991SDimitry Andric                                              MachineRegisterInfo &MRI,
5466e8d8bef9SDimitry Andric                                              Register Reg,
5467e8d8bef9SDimitry Andric                                              bool ImageStore) const {
54688bcb0991SDimitry Andric   const LLT S16 = LLT::scalar(16);
54698bcb0991SDimitry Andric   const LLT S32 = LLT::scalar(32);
54708bcb0991SDimitry Andric   LLT StoreVT = MRI.getType(Reg);
54718bcb0991SDimitry Andric   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
54728bcb0991SDimitry Andric 
5473e8d8bef9SDimitry Andric   if (ST.hasUnpackedD16VMem()) {
54748bcb0991SDimitry Andric     auto Unmerge = B.buildUnmerge(S16, Reg);
54758bcb0991SDimitry Andric 
54768bcb0991SDimitry Andric     SmallVector<Register, 4> WideRegs;
54778bcb0991SDimitry Andric     for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
54788bcb0991SDimitry Andric       WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
54798bcb0991SDimitry Andric 
54808bcb0991SDimitry Andric     int NumElts = StoreVT.getNumElements();
54818bcb0991SDimitry Andric 
5482fe6060f1SDimitry Andric     return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
5483fe6060f1SDimitry Andric         .getReg(0);
54848bcb0991SDimitry Andric   }
54858bcb0991SDimitry Andric 
5486e8d8bef9SDimitry Andric   if (ImageStore && ST.hasImageStoreD16Bug()) {
5487e8d8bef9SDimitry Andric     if (StoreVT.getNumElements() == 2) {
5488e8d8bef9SDimitry Andric       SmallVector<Register, 4> PackedRegs;
5489e8d8bef9SDimitry Andric       Reg = B.buildBitcast(S32, Reg).getReg(0);
5490e8d8bef9SDimitry Andric       PackedRegs.push_back(Reg);
5491e8d8bef9SDimitry Andric       PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
5492fe6060f1SDimitry Andric       return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
5493fe6060f1SDimitry Andric           .getReg(0);
5494e8d8bef9SDimitry Andric     }
5495e8d8bef9SDimitry Andric 
5496e8d8bef9SDimitry Andric     if (StoreVT.getNumElements() == 3) {
5497e8d8bef9SDimitry Andric       SmallVector<Register, 4> PackedRegs;
5498e8d8bef9SDimitry Andric       auto Unmerge = B.buildUnmerge(S16, Reg);
5499e8d8bef9SDimitry Andric       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5500e8d8bef9SDimitry Andric         PackedRegs.push_back(Unmerge.getReg(I));
5501e8d8bef9SDimitry Andric       PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
5502fe6060f1SDimitry Andric       Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
5503fe6060f1SDimitry Andric       return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
5504e8d8bef9SDimitry Andric     }
5505e8d8bef9SDimitry Andric 
5506e8d8bef9SDimitry Andric     if (StoreVT.getNumElements() == 4) {
5507e8d8bef9SDimitry Andric       SmallVector<Register, 4> PackedRegs;
5508fe6060f1SDimitry Andric       Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
5509e8d8bef9SDimitry Andric       auto Unmerge = B.buildUnmerge(S32, Reg);
5510e8d8bef9SDimitry Andric       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5511e8d8bef9SDimitry Andric         PackedRegs.push_back(Unmerge.getReg(I));
5512e8d8bef9SDimitry Andric       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
5513fe6060f1SDimitry Andric       return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
5514fe6060f1SDimitry Andric           .getReg(0);
5515e8d8bef9SDimitry Andric     }
5516e8d8bef9SDimitry Andric 
5517e8d8bef9SDimitry Andric     llvm_unreachable("invalid data type");
5518e8d8bef9SDimitry Andric   }
5519e8d8bef9SDimitry Andric 
55200eae32dcSDimitry Andric   if (StoreVT == LLT::fixed_vector(3, S16)) {
55210eae32dcSDimitry Andric     Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
55220eae32dcSDimitry Andric               .getReg(0);
55230eae32dcSDimitry Andric   }
5524e8d8bef9SDimitry Andric   return Reg;
5525e8d8bef9SDimitry Andric }
5526e8d8bef9SDimitry Andric 
55275ffd83dbSDimitry Andric Register AMDGPULegalizerInfo::fixStoreSourceType(
55285ffd83dbSDimitry Andric   MachineIRBuilder &B, Register VData, bool IsFormat) const {
55295ffd83dbSDimitry Andric   MachineRegisterInfo *MRI = B.getMRI();
55305ffd83dbSDimitry Andric   LLT Ty = MRI->getType(VData);
55318bcb0991SDimitry Andric 
55328bcb0991SDimitry Andric   const LLT S16 = LLT::scalar(16);
55338bcb0991SDimitry Andric 
553406c3fb27SDimitry Andric   // Fixup buffer resources themselves needing to be v4i128.
553506c3fb27SDimitry Andric   if (hasBufferRsrcWorkaround(Ty))
553606c3fb27SDimitry Andric     return castBufferRsrcToV4I32(VData, B);
553706c3fb27SDimitry Andric 
55388bcb0991SDimitry Andric   // Fixup illegal register types for i8 stores.
55398bcb0991SDimitry Andric   if (Ty == LLT::scalar(8) || Ty == S16) {
55408bcb0991SDimitry Andric     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
55415ffd83dbSDimitry Andric     return AnyExt;
55428bcb0991SDimitry Andric   }
55438bcb0991SDimitry Andric 
55448bcb0991SDimitry Andric   if (Ty.isVector()) {
55458bcb0991SDimitry Andric     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
55468bcb0991SDimitry Andric       if (IsFormat)
55475ffd83dbSDimitry Andric         return handleD16VData(B, *MRI, VData);
55485ffd83dbSDimitry Andric     }
55495ffd83dbSDimitry Andric   }
55505ffd83dbSDimitry Andric 
55515ffd83dbSDimitry Andric   return VData;
55525ffd83dbSDimitry Andric }
55535ffd83dbSDimitry Andric 
55545ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
55555ffd83dbSDimitry Andric                                               MachineRegisterInfo &MRI,
55565ffd83dbSDimitry Andric                                               MachineIRBuilder &B,
55575ffd83dbSDimitry Andric                                               bool IsTyped,
55585ffd83dbSDimitry Andric                                               bool IsFormat) const {
55595ffd83dbSDimitry Andric   Register VData = MI.getOperand(1).getReg();
55605ffd83dbSDimitry Andric   LLT Ty = MRI.getType(VData);
55615ffd83dbSDimitry Andric   LLT EltTy = Ty.getScalarType();
55625ffd83dbSDimitry Andric   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
55635ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
55645ffd83dbSDimitry Andric 
55655ffd83dbSDimitry Andric   VData = fixStoreSourceType(B, VData, IsFormat);
556606c3fb27SDimitry Andric   castBufferRsrcArgToV4I32(MI, B, 2);
55675ffd83dbSDimitry Andric   Register RSrc = MI.getOperand(2).getReg();
55685ffd83dbSDimitry Andric 
55695ffd83dbSDimitry Andric   MachineMemOperand *MMO = *MI.memoperands_begin();
55705ffd83dbSDimitry Andric   const int MemSize = MMO->getSize();
55715ffd83dbSDimitry Andric 
55725ffd83dbSDimitry Andric   unsigned ImmOffset;
55735ffd83dbSDimitry Andric 
55745ffd83dbSDimitry Andric   // The typed intrinsics add an immediate after the registers.
55755ffd83dbSDimitry Andric   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
55765ffd83dbSDimitry Andric 
55775ffd83dbSDimitry Andric   // The struct intrinsic variants add one additional operand over raw.
55785ffd83dbSDimitry Andric   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
55795ffd83dbSDimitry Andric   Register VIndex;
55805ffd83dbSDimitry Andric   int OpOffset = 0;
55815ffd83dbSDimitry Andric   if (HasVIndex) {
55825ffd83dbSDimitry Andric     VIndex = MI.getOperand(3).getReg();
55835ffd83dbSDimitry Andric     OpOffset = 1;
5584fe6060f1SDimitry Andric   } else {
5585fe6060f1SDimitry Andric     VIndex = B.buildConstant(S32, 0).getReg(0);
55865ffd83dbSDimitry Andric   }
55875ffd83dbSDimitry Andric 
55885ffd83dbSDimitry Andric   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
55895ffd83dbSDimitry Andric   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
55905ffd83dbSDimitry Andric 
55915ffd83dbSDimitry Andric   unsigned Format = 0;
55925ffd83dbSDimitry Andric   if (IsTyped) {
55935ffd83dbSDimitry Andric     Format = MI.getOperand(5 + OpOffset).getImm();
55945ffd83dbSDimitry Andric     ++OpOffset;
55955ffd83dbSDimitry Andric   }
55965ffd83dbSDimitry Andric 
55975ffd83dbSDimitry Andric   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
55985ffd83dbSDimitry Andric 
5599fe6060f1SDimitry Andric   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
56005ffd83dbSDimitry Andric 
56015ffd83dbSDimitry Andric   unsigned Opc;
56025ffd83dbSDimitry Andric   if (IsTyped) {
56035ffd83dbSDimitry Andric     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
56045ffd83dbSDimitry Andric                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
56055ffd83dbSDimitry Andric   } else if (IsFormat) {
56065ffd83dbSDimitry Andric     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
56075ffd83dbSDimitry Andric                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
56085ffd83dbSDimitry Andric   } else {
56095ffd83dbSDimitry Andric     switch (MemSize) {
56105ffd83dbSDimitry Andric     case 1:
56115ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
56125ffd83dbSDimitry Andric       break;
56135ffd83dbSDimitry Andric     case 2:
56145ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
56155ffd83dbSDimitry Andric       break;
56165ffd83dbSDimitry Andric     default:
56175ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
56185ffd83dbSDimitry Andric       break;
56195ffd83dbSDimitry Andric     }
56205ffd83dbSDimitry Andric   }
56215ffd83dbSDimitry Andric 
56225ffd83dbSDimitry Andric   auto MIB = B.buildInstr(Opc)
56235ffd83dbSDimitry Andric     .addUse(VData)              // vdata
56245ffd83dbSDimitry Andric     .addUse(RSrc)               // rsrc
56255ffd83dbSDimitry Andric     .addUse(VIndex)             // vindex
56265ffd83dbSDimitry Andric     .addUse(VOffset)            // voffset
56275ffd83dbSDimitry Andric     .addUse(SOffset)            // soffset
56285ffd83dbSDimitry Andric     .addImm(ImmOffset);         // offset(imm)
56295ffd83dbSDimitry Andric 
56305ffd83dbSDimitry Andric   if (IsTyped)
56315ffd83dbSDimitry Andric     MIB.addImm(Format);
56325ffd83dbSDimitry Andric 
56335ffd83dbSDimitry Andric   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
56345ffd83dbSDimitry Andric      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
56355ffd83dbSDimitry Andric      .addMemOperand(MMO);
56365ffd83dbSDimitry Andric 
56375ffd83dbSDimitry Andric   MI.eraseFromParent();
56388bcb0991SDimitry Andric   return true;
56398bcb0991SDimitry Andric }
56408bcb0991SDimitry Andric 
5641bdd1243dSDimitry Andric static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
5642bdd1243dSDimitry Andric                             Register VIndex, Register VOffset, Register SOffset,
5643bdd1243dSDimitry Andric                             unsigned ImmOffset, unsigned Format,
5644bdd1243dSDimitry Andric                             unsigned AuxiliaryData, MachineMemOperand *MMO,
5645bdd1243dSDimitry Andric                             bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
5646bdd1243dSDimitry Andric   auto MIB = B.buildInstr(Opc)
5647bdd1243dSDimitry Andric                  .addDef(LoadDstReg) // vdata
5648bdd1243dSDimitry Andric                  .addUse(RSrc)       // rsrc
5649bdd1243dSDimitry Andric                  .addUse(VIndex)     // vindex
5650bdd1243dSDimitry Andric                  .addUse(VOffset)    // voffset
5651bdd1243dSDimitry Andric                  .addUse(SOffset)    // soffset
5652bdd1243dSDimitry Andric                  .addImm(ImmOffset); // offset(imm)
5653bdd1243dSDimitry Andric 
5654bdd1243dSDimitry Andric   if (IsTyped)
5655bdd1243dSDimitry Andric     MIB.addImm(Format);
5656bdd1243dSDimitry Andric 
5657bdd1243dSDimitry Andric   MIB.addImm(AuxiliaryData)       // cachepolicy, swizzled buffer(imm)
5658bdd1243dSDimitry Andric       .addImm(HasVIndex ? -1 : 0) // idxen(imm)
5659bdd1243dSDimitry Andric       .addMemOperand(MMO);
5660bdd1243dSDimitry Andric }
5661bdd1243dSDimitry Andric 
56625ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
56635ffd83dbSDimitry Andric                                              MachineRegisterInfo &MRI,
56645ffd83dbSDimitry Andric                                              MachineIRBuilder &B,
56655ffd83dbSDimitry Andric                                              bool IsFormat,
56665ffd83dbSDimitry Andric                                              bool IsTyped) const {
56675ffd83dbSDimitry Andric   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
56685ffd83dbSDimitry Andric   MachineMemOperand *MMO = *MI.memoperands_begin();
5669fe6060f1SDimitry Andric   const LLT MemTy = MMO->getMemoryType();
56705ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
56715ffd83dbSDimitry Andric 
56725ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
5673bdd1243dSDimitry Andric 
5674bdd1243dSDimitry Andric   Register StatusDst;
5675bdd1243dSDimitry Andric   int OpOffset = 0;
5676bdd1243dSDimitry Andric   assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
5677bdd1243dSDimitry Andric   bool IsTFE = MI.getNumExplicitDefs() == 2;
5678bdd1243dSDimitry Andric   if (IsTFE) {
5679bdd1243dSDimitry Andric     StatusDst = MI.getOperand(1).getReg();
5680bdd1243dSDimitry Andric     ++OpOffset;
5681bdd1243dSDimitry Andric   }
5682bdd1243dSDimitry Andric 
568306c3fb27SDimitry Andric   castBufferRsrcArgToV4I32(MI, B, 2 + OpOffset);
5684bdd1243dSDimitry Andric   Register RSrc = MI.getOperand(2 + OpOffset).getReg();
56855ffd83dbSDimitry Andric 
56865ffd83dbSDimitry Andric   // The typed intrinsics add an immediate after the registers.
56875ffd83dbSDimitry Andric   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
56885ffd83dbSDimitry Andric 
56895ffd83dbSDimitry Andric   // The struct intrinsic variants add one additional operand over raw.
5690bdd1243dSDimitry Andric   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
56915ffd83dbSDimitry Andric   Register VIndex;
56925ffd83dbSDimitry Andric   if (HasVIndex) {
5693bdd1243dSDimitry Andric     VIndex = MI.getOperand(3 + OpOffset).getReg();
5694bdd1243dSDimitry Andric     ++OpOffset;
5695fe6060f1SDimitry Andric   } else {
5696fe6060f1SDimitry Andric     VIndex = B.buildConstant(S32, 0).getReg(0);
56978bcb0991SDimitry Andric   }
56988bcb0991SDimitry Andric 
56995ffd83dbSDimitry Andric   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
57005ffd83dbSDimitry Andric   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
57015ffd83dbSDimitry Andric 
57025ffd83dbSDimitry Andric   unsigned Format = 0;
57035ffd83dbSDimitry Andric   if (IsTyped) {
57045ffd83dbSDimitry Andric     Format = MI.getOperand(5 + OpOffset).getImm();
57055ffd83dbSDimitry Andric     ++OpOffset;
57068bcb0991SDimitry Andric   }
57078bcb0991SDimitry Andric 
57085ffd83dbSDimitry Andric   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
57095ffd83dbSDimitry Andric   unsigned ImmOffset;
57105ffd83dbSDimitry Andric 
57115ffd83dbSDimitry Andric   LLT Ty = MRI.getType(Dst);
571206c3fb27SDimitry Andric   // Make addrspace 8 pointers loads into 4xs32 loads here, so the rest of the
571306c3fb27SDimitry Andric   // logic doesn't have to handle that case.
571406c3fb27SDimitry Andric   if (hasBufferRsrcWorkaround(Ty)) {
571506c3fb27SDimitry Andric     Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0);
571606c3fb27SDimitry Andric     Dst = MI.getOperand(0).getReg();
571706c3fb27SDimitry Andric   }
57185ffd83dbSDimitry Andric   LLT EltTy = Ty.getScalarType();
57195ffd83dbSDimitry Andric   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
57205ffd83dbSDimitry Andric   const bool Unpacked = ST.hasUnpackedD16VMem();
57215ffd83dbSDimitry Andric 
5722fe6060f1SDimitry Andric   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
57235ffd83dbSDimitry Andric 
57245ffd83dbSDimitry Andric   unsigned Opc;
57255ffd83dbSDimitry Andric 
5726bdd1243dSDimitry Andric   // TODO: Support TFE for typed and narrow loads.
57275ffd83dbSDimitry Andric   if (IsTyped) {
5728bdd1243dSDimitry Andric     if (IsTFE)
5729bdd1243dSDimitry Andric       return false;
57305ffd83dbSDimitry Andric     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
57315ffd83dbSDimitry Andric                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
57325ffd83dbSDimitry Andric   } else if (IsFormat) {
5733bdd1243dSDimitry Andric     if (IsD16) {
5734bdd1243dSDimitry Andric       if (IsTFE)
5735bdd1243dSDimitry Andric         return false;
5736bdd1243dSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
57375ffd83dbSDimitry Andric     } else {
5738bdd1243dSDimitry Andric       Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
5739bdd1243dSDimitry Andric                   : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
5740bdd1243dSDimitry Andric     }
5741bdd1243dSDimitry Andric   } else {
5742bdd1243dSDimitry Andric     if (IsTFE)
5743bdd1243dSDimitry Andric       return false;
5744fe6060f1SDimitry Andric     switch (MemTy.getSizeInBits()) {
5745fe6060f1SDimitry Andric     case 8:
57465ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
57475ffd83dbSDimitry Andric       break;
5748fe6060f1SDimitry Andric     case 16:
57495ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
57505ffd83dbSDimitry Andric       break;
57515ffd83dbSDimitry Andric     default:
57525ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
57535ffd83dbSDimitry Andric       break;
57545ffd83dbSDimitry Andric     }
57555ffd83dbSDimitry Andric   }
57565ffd83dbSDimitry Andric 
5757bdd1243dSDimitry Andric   if (IsTFE) {
5758bdd1243dSDimitry Andric     unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
5759bdd1243dSDimitry Andric     unsigned NumLoadDWords = NumValueDWords + 1;
5760bdd1243dSDimitry Andric     LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32);
5761bdd1243dSDimitry Andric     Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
5762bdd1243dSDimitry Andric     buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
5763bdd1243dSDimitry Andric                     Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
5764bdd1243dSDimitry Andric     if (NumValueDWords == 1) {
5765bdd1243dSDimitry Andric       B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
5766bdd1243dSDimitry Andric     } else {
5767bdd1243dSDimitry Andric       SmallVector<Register, 5> LoadElts;
5768bdd1243dSDimitry Andric       for (unsigned I = 0; I != NumValueDWords; ++I)
5769bdd1243dSDimitry Andric         LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
5770bdd1243dSDimitry Andric       LoadElts.push_back(StatusDst);
5771bdd1243dSDimitry Andric       B.buildUnmerge(LoadElts, LoadDstReg);
5772bdd1243dSDimitry Andric       LoadElts.truncate(NumValueDWords);
5773bdd1243dSDimitry Andric       B.buildMergeLikeInstr(Dst, LoadElts);
5774bdd1243dSDimitry Andric     }
5775bdd1243dSDimitry Andric   } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
5776bdd1243dSDimitry Andric              (IsD16 && !Ty.isVector())) {
5777bdd1243dSDimitry Andric     Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
5778bdd1243dSDimitry Andric     buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
5779bdd1243dSDimitry Andric                     Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
57805ffd83dbSDimitry Andric     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
57815ffd83dbSDimitry Andric     B.buildTrunc(Dst, LoadDstReg);
5782bdd1243dSDimitry Andric   } else if (Unpacked && IsD16 && Ty.isVector()) {
5783bdd1243dSDimitry Andric     LLT UnpackedTy = Ty.changeElementSize(32);
5784bdd1243dSDimitry Andric     Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
5785bdd1243dSDimitry Andric     buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
5786bdd1243dSDimitry Andric                     Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
5787bdd1243dSDimitry Andric     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
57885ffd83dbSDimitry Andric     // FIXME: G_TRUNC should work, but legalization currently fails
57895ffd83dbSDimitry Andric     auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
57905ffd83dbSDimitry Andric     SmallVector<Register, 4> Repack;
57915ffd83dbSDimitry Andric     for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
57925ffd83dbSDimitry Andric       Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
5793bdd1243dSDimitry Andric     B.buildMergeLikeInstr(Dst, Repack);
5794bdd1243dSDimitry Andric   } else {
5795bdd1243dSDimitry Andric     buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
5796bdd1243dSDimitry Andric                     AuxiliaryData, MMO, IsTyped, HasVIndex, B);
57975ffd83dbSDimitry Andric   }
57985ffd83dbSDimitry Andric 
57995ffd83dbSDimitry Andric   MI.eraseFromParent();
58005ffd83dbSDimitry Andric   return true;
58015ffd83dbSDimitry Andric }
58025ffd83dbSDimitry Andric 
58035ffd83dbSDimitry Andric static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
58045ffd83dbSDimitry Andric   switch (IntrID) {
58055ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
580606c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
58075ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
580806c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
58095ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
58105ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_add:
581106c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
58125ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_add:
581306c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
58145ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
58155ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
581606c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
58175ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
581806c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
58195ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
58205ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
582106c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
58225ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
582306c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
58245ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
58255ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
582606c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
58275ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
582806c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
58295ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
58305ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
583106c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
58325ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
583306c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
58345ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
58355ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
583606c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
58375ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
583806c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
58395ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
58405ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_and:
584106c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
58425ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_and:
584306c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
58445ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
58455ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_or:
584606c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
58475ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_or:
584806c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
58495ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
58505ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
585106c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
58525ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
585306c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
58545ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
58555ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
585606c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
58575ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
585806c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
58595ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
58605ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
586106c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
58625ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
586306c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
58645ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
58655ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
586606c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
58675ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
586806c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
58695ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
5870e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
587106c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
5872e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
587306c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
5874e8d8bef9SDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
5875fe6060f1SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
587606c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
5877fe6060f1SDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
587806c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
5879fe6060f1SDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
5880fe6060f1SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
588106c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
5882fe6060f1SDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
588306c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
5884fe6060f1SDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
58855ffd83dbSDimitry Andric   default:
58865ffd83dbSDimitry Andric     llvm_unreachable("unhandled atomic opcode");
58875ffd83dbSDimitry Andric   }
58885ffd83dbSDimitry Andric }
58895ffd83dbSDimitry Andric 
58905ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
58915ffd83dbSDimitry Andric                                                MachineIRBuilder &B,
58925ffd83dbSDimitry Andric                                                Intrinsic::ID IID) const {
589306c3fb27SDimitry Andric   const bool IsCmpSwap =
589406c3fb27SDimitry Andric       IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
589506c3fb27SDimitry Andric       IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
589606c3fb27SDimitry Andric       IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
589706c3fb27SDimitry Andric       IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
58985ffd83dbSDimitry Andric 
5899*5f757f3fSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
590006c3fb27SDimitry Andric   // Since we don't have 128-bit atomics, we don't need to handle the case of
590106c3fb27SDimitry Andric   // p8 argmunents to the atomic itself
5902*5f757f3fSDimitry Andric   Register VData = MI.getOperand(2).getReg();
5903*5f757f3fSDimitry Andric 
5904e8d8bef9SDimitry Andric   Register CmpVal;
5905*5f757f3fSDimitry Andric   int OpOffset = 0;
59065ffd83dbSDimitry Andric 
59075ffd83dbSDimitry Andric   if (IsCmpSwap) {
5908*5f757f3fSDimitry Andric     CmpVal = MI.getOperand(3).getReg();
59095ffd83dbSDimitry Andric     ++OpOffset;
59105ffd83dbSDimitry Andric   }
59115ffd83dbSDimitry Andric 
591206c3fb27SDimitry Andric   castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset);
59135ffd83dbSDimitry Andric   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
5914*5f757f3fSDimitry Andric   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
59155ffd83dbSDimitry Andric 
59165ffd83dbSDimitry Andric   // The struct intrinsic variants add one additional operand over raw.
59175ffd83dbSDimitry Andric   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
59185ffd83dbSDimitry Andric   Register VIndex;
59195ffd83dbSDimitry Andric   if (HasVIndex) {
59205ffd83dbSDimitry Andric     VIndex = MI.getOperand(4 + OpOffset).getReg();
59215ffd83dbSDimitry Andric     ++OpOffset;
5922fe6060f1SDimitry Andric   } else {
5923fe6060f1SDimitry Andric     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
59245ffd83dbSDimitry Andric   }
59255ffd83dbSDimitry Andric 
59265ffd83dbSDimitry Andric   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
59275ffd83dbSDimitry Andric   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
59285ffd83dbSDimitry Andric   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
59295ffd83dbSDimitry Andric 
59305ffd83dbSDimitry Andric   MachineMemOperand *MMO = *MI.memoperands_begin();
59315ffd83dbSDimitry Andric 
59325ffd83dbSDimitry Andric   unsigned ImmOffset;
5933fe6060f1SDimitry Andric   std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
59345ffd83dbSDimitry Andric 
5935*5f757f3fSDimitry Andric   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
5936*5f757f3fSDimitry Andric       .addDef(Dst)
5937*5f757f3fSDimitry Andric       .addUse(VData); // vdata
59385ffd83dbSDimitry Andric 
59395ffd83dbSDimitry Andric   if (IsCmpSwap)
59405ffd83dbSDimitry Andric     MIB.addReg(CmpVal);
59415ffd83dbSDimitry Andric 
59425ffd83dbSDimitry Andric   MIB.addUse(RSrc)               // rsrc
59435ffd83dbSDimitry Andric      .addUse(VIndex)             // vindex
59445ffd83dbSDimitry Andric      .addUse(VOffset)            // voffset
59455ffd83dbSDimitry Andric      .addUse(SOffset)            // soffset
59465ffd83dbSDimitry Andric      .addImm(ImmOffset)          // offset(imm)
59475ffd83dbSDimitry Andric      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
59485ffd83dbSDimitry Andric      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
59495ffd83dbSDimitry Andric      .addMemOperand(MMO);
59505ffd83dbSDimitry Andric 
59515ffd83dbSDimitry Andric   MI.eraseFromParent();
59525ffd83dbSDimitry Andric   return true;
59535ffd83dbSDimitry Andric }
59545ffd83dbSDimitry Andric 
5955fe6060f1SDimitry Andric /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
59565ffd83dbSDimitry Andric /// vector with s16 typed elements.
5957fe6060f1SDimitry Andric static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
5958fe6060f1SDimitry Andric                                       SmallVectorImpl<Register> &PackedAddrs,
5959fe6060f1SDimitry Andric                                       unsigned ArgOffset,
5960fe6060f1SDimitry Andric                                       const AMDGPU::ImageDimIntrinsicInfo *Intr,
5961fe6060f1SDimitry Andric                                       bool IsA16, bool IsG16) {
59625ffd83dbSDimitry Andric   const LLT S16 = LLT::scalar(16);
5963fe6060f1SDimitry Andric   const LLT V2S16 = LLT::fixed_vector(2, 16);
5964fe6060f1SDimitry Andric   auto EndIdx = Intr->VAddrEnd;
59655ffd83dbSDimitry Andric 
5966e8d8bef9SDimitry Andric   for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
5967e8d8bef9SDimitry Andric     MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
59685ffd83dbSDimitry Andric     if (!SrcOp.isReg())
59695ffd83dbSDimitry Andric       continue; // _L to _LZ may have eliminated this.
59705ffd83dbSDimitry Andric 
59715ffd83dbSDimitry Andric     Register AddrReg = SrcOp.getReg();
59725ffd83dbSDimitry Andric 
5973fe6060f1SDimitry Andric     if ((I < Intr->GradientStart) ||
5974fe6060f1SDimitry Andric         (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
5975fe6060f1SDimitry Andric         (I >= Intr->CoordStart && !IsA16)) {
59760eae32dcSDimitry Andric       if ((I < Intr->GradientStart) && IsA16 &&
59770eae32dcSDimitry Andric           (B.getMRI()->getType(AddrReg) == S16)) {
597804eeddc0SDimitry Andric         assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
59790eae32dcSDimitry Andric         // Special handling of bias when A16 is on. Bias is of type half but
59800eae32dcSDimitry Andric         // occupies full 32-bit.
59810eae32dcSDimitry Andric         PackedAddrs.push_back(
59820eae32dcSDimitry Andric             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
59830eae32dcSDimitry Andric                 .getReg(0));
59840eae32dcSDimitry Andric       } else {
598504eeddc0SDimitry Andric         assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
598604eeddc0SDimitry Andric                "Bias needs to be converted to 16 bit in A16 mode");
598704eeddc0SDimitry Andric         // Handle any gradient or coordinate operands that should not be packed
59885ffd83dbSDimitry Andric         AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
59895ffd83dbSDimitry Andric         PackedAddrs.push_back(AddrReg);
59900eae32dcSDimitry Andric       }
59915ffd83dbSDimitry Andric     } else {
59925ffd83dbSDimitry Andric       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
59935ffd83dbSDimitry Andric       // derivatives dx/dh and dx/dv are packed with undef.
59945ffd83dbSDimitry Andric       if (((I + 1) >= EndIdx) ||
5995e8d8bef9SDimitry Andric           ((Intr->NumGradients / 2) % 2 == 1 &&
5996e8d8bef9SDimitry Andric            (I == static_cast<unsigned>(Intr->GradientStart +
5997e8d8bef9SDimitry Andric                                        (Intr->NumGradients / 2) - 1) ||
5998e8d8bef9SDimitry Andric             I == static_cast<unsigned>(Intr->GradientStart +
5999e8d8bef9SDimitry Andric                                        Intr->NumGradients - 1))) ||
60005ffd83dbSDimitry Andric           // Check for _L to _LZ optimization
6001e8d8bef9SDimitry Andric           !MI.getOperand(ArgOffset + I + 1).isReg()) {
60025ffd83dbSDimitry Andric         PackedAddrs.push_back(
60035ffd83dbSDimitry Andric             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
60045ffd83dbSDimitry Andric                 .getReg(0));
60055ffd83dbSDimitry Andric       } else {
60065ffd83dbSDimitry Andric         PackedAddrs.push_back(
6007e8d8bef9SDimitry Andric             B.buildBuildVector(
6008e8d8bef9SDimitry Andric                  V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
60095ffd83dbSDimitry Andric                 .getReg(0));
60105ffd83dbSDimitry Andric         ++I;
60115ffd83dbSDimitry Andric       }
60125ffd83dbSDimitry Andric     }
60135ffd83dbSDimitry Andric   }
60145ffd83dbSDimitry Andric }
60155ffd83dbSDimitry Andric 
60165ffd83dbSDimitry Andric /// Convert from separate vaddr components to a single vector address register,
60175ffd83dbSDimitry Andric /// and replace the remaining operands with $noreg.
60185ffd83dbSDimitry Andric static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
60195ffd83dbSDimitry Andric                                      int DimIdx, int NumVAddrs) {
60205ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
6021bdd1243dSDimitry Andric   (void)S32;
60225ffd83dbSDimitry Andric   SmallVector<Register, 8> AddrRegs;
60235ffd83dbSDimitry Andric   for (int I = 0; I != NumVAddrs; ++I) {
60245ffd83dbSDimitry Andric     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
60255ffd83dbSDimitry Andric     if (SrcOp.isReg()) {
60265ffd83dbSDimitry Andric       AddrRegs.push_back(SrcOp.getReg());
60275ffd83dbSDimitry Andric       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
60285ffd83dbSDimitry Andric     }
60295ffd83dbSDimitry Andric   }
60305ffd83dbSDimitry Andric 
60315ffd83dbSDimitry Andric   int NumAddrRegs = AddrRegs.size();
60325ffd83dbSDimitry Andric   if (NumAddrRegs != 1) {
6033fe6060f1SDimitry Andric     auto VAddr =
6034fe6060f1SDimitry Andric         B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
60355ffd83dbSDimitry Andric     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
60365ffd83dbSDimitry Andric   }
60375ffd83dbSDimitry Andric 
60385ffd83dbSDimitry Andric   for (int I = 1; I != NumVAddrs; ++I) {
60395ffd83dbSDimitry Andric     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
60405ffd83dbSDimitry Andric     if (SrcOp.isReg())
60415ffd83dbSDimitry Andric       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
60425ffd83dbSDimitry Andric   }
60435ffd83dbSDimitry Andric }
60445ffd83dbSDimitry Andric 
60455ffd83dbSDimitry Andric /// Rewrite image intrinsics to use register layouts expected by the subtarget.
60465ffd83dbSDimitry Andric ///
60475ffd83dbSDimitry Andric /// Depending on the subtarget, load/store with 16-bit element data need to be
60485ffd83dbSDimitry Andric /// rewritten to use the low half of 32-bit registers, or directly use a packed
60495ffd83dbSDimitry Andric /// layout. 16-bit addresses should also sometimes be packed into 32-bit
60505ffd83dbSDimitry Andric /// registers.
60515ffd83dbSDimitry Andric ///
60525ffd83dbSDimitry Andric /// We don't want to directly select image instructions just yet, but also want
60535ffd83dbSDimitry Andric /// to exposes all register repacking to the legalizer/combiners. We also don't
605481ad6265SDimitry Andric /// want a selected instruction entering RegBankSelect. In order to avoid
60555ffd83dbSDimitry Andric /// defining a multitude of intermediate image instructions, directly hack on
6056349cc55cSDimitry Andric /// the intrinsic's arguments. In cases like a16 addresses, this requires
6057349cc55cSDimitry Andric /// padding now unnecessary arguments with $noreg.
60585ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
6059e8d8bef9SDimitry Andric     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
6060e8d8bef9SDimitry Andric     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
60615ffd83dbSDimitry Andric 
6062bdd1243dSDimitry Andric   const MachineFunction &MF = *MI.getMF();
6063e8d8bef9SDimitry Andric   const unsigned NumDefs = MI.getNumExplicitDefs();
6064e8d8bef9SDimitry Andric   const unsigned ArgOffset = NumDefs + 1;
60655ffd83dbSDimitry Andric   bool IsTFE = NumDefs == 2;
60665ffd83dbSDimitry Andric   // We are only processing the operands of d16 image operations on subtargets
60675ffd83dbSDimitry Andric   // that use the unpacked register layout, or need to repack the TFE result.
60685ffd83dbSDimitry Andric 
60695ffd83dbSDimitry Andric   // TODO: Do we need to guard against already legalized intrinsics?
60705ffd83dbSDimitry Andric   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
6071e8d8bef9SDimitry Andric       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
60725ffd83dbSDimitry Andric 
60735ffd83dbSDimitry Andric   MachineRegisterInfo *MRI = B.getMRI();
60745ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
60755ffd83dbSDimitry Andric   const LLT S16 = LLT::scalar(16);
6076fe6060f1SDimitry Andric   const LLT V2S16 = LLT::fixed_vector(2, 16);
60775ffd83dbSDimitry Andric 
60785ffd83dbSDimitry Andric   unsigned DMask = 0;
607904eeddc0SDimitry Andric   Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
608004eeddc0SDimitry Andric   LLT Ty = MRI->getType(VData);
60815ffd83dbSDimitry Andric 
60825ffd83dbSDimitry Andric   // Check for 16 bit addresses and pack if true.
6083e8d8bef9SDimitry Andric   LLT GradTy =
6084e8d8bef9SDimitry Andric       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
6085e8d8bef9SDimitry Andric   LLT AddrTy =
6086e8d8bef9SDimitry Andric       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
608706c3fb27SDimitry Andric   const bool IsG16 =
608806c3fb27SDimitry Andric       ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
60895ffd83dbSDimitry Andric   const bool IsA16 = AddrTy == S16;
609004eeddc0SDimitry Andric   const bool IsD16 = Ty.getScalarType() == S16;
60915ffd83dbSDimitry Andric 
60925ffd83dbSDimitry Andric   int DMaskLanes = 0;
60935ffd83dbSDimitry Andric   if (!BaseOpcode->Atomic) {
6094e8d8bef9SDimitry Andric     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
60955ffd83dbSDimitry Andric     if (BaseOpcode->Gather4) {
60965ffd83dbSDimitry Andric       DMaskLanes = 4;
60975ffd83dbSDimitry Andric     } else if (DMask != 0) {
6098bdd1243dSDimitry Andric       DMaskLanes = llvm::popcount(DMask);
60995ffd83dbSDimitry Andric     } else if (!IsTFE && !BaseOpcode->Store) {
61005ffd83dbSDimitry Andric       // If dmask is 0, this is a no-op load. This can be eliminated.
61015ffd83dbSDimitry Andric       B.buildUndef(MI.getOperand(0));
61025ffd83dbSDimitry Andric       MI.eraseFromParent();
61035ffd83dbSDimitry Andric       return true;
61045ffd83dbSDimitry Andric     }
61055ffd83dbSDimitry Andric   }
61065ffd83dbSDimitry Andric 
61075ffd83dbSDimitry Andric   Observer.changingInstr(MI);
61085ffd83dbSDimitry Andric   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
61095ffd83dbSDimitry Andric 
611004eeddc0SDimitry Andric   const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
611104eeddc0SDimitry Andric                                      : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
611204eeddc0SDimitry Andric   const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
611304eeddc0SDimitry Andric                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
611404eeddc0SDimitry Andric   unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;
61155ffd83dbSDimitry Andric 
61165ffd83dbSDimitry Andric   // Track that we legalized this
61175ffd83dbSDimitry Andric   MI.setDesc(B.getTII().get(NewOpcode));
61185ffd83dbSDimitry Andric 
61195ffd83dbSDimitry Andric   // Expecting to get an error flag since TFC is on - and dmask is 0 Force
61205ffd83dbSDimitry Andric   // dmask to be at least 1 otherwise the instruction will fail
61215ffd83dbSDimitry Andric   if (IsTFE && DMask == 0) {
61225ffd83dbSDimitry Andric     DMask = 0x1;
61235ffd83dbSDimitry Andric     DMaskLanes = 1;
6124e8d8bef9SDimitry Andric     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
61255ffd83dbSDimitry Andric   }
61265ffd83dbSDimitry Andric 
61275ffd83dbSDimitry Andric   if (BaseOpcode->Atomic) {
61285ffd83dbSDimitry Andric     Register VData0 = MI.getOperand(2).getReg();
61295ffd83dbSDimitry Andric     LLT Ty = MRI->getType(VData0);
61305ffd83dbSDimitry Andric 
61315ffd83dbSDimitry Andric     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
61325ffd83dbSDimitry Andric     if (Ty.isVector())
61335ffd83dbSDimitry Andric       return false;
61345ffd83dbSDimitry Andric 
61355ffd83dbSDimitry Andric     if (BaseOpcode->AtomicX2) {
61365ffd83dbSDimitry Andric       Register VData1 = MI.getOperand(3).getReg();
61375ffd83dbSDimitry Andric       // The two values are packed in one register.
6138fe6060f1SDimitry Andric       LLT PackedTy = LLT::fixed_vector(2, Ty);
61395ffd83dbSDimitry Andric       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
61405ffd83dbSDimitry Andric       MI.getOperand(2).setReg(Concat.getReg(0));
61415ffd83dbSDimitry Andric       MI.getOperand(3).setReg(AMDGPU::NoRegister);
61425ffd83dbSDimitry Andric     }
61435ffd83dbSDimitry Andric   }
61445ffd83dbSDimitry Andric 
6145e8d8bef9SDimitry Andric   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
61465ffd83dbSDimitry Andric 
61475ffd83dbSDimitry Andric   // Rewrite the addressing register layout before doing anything else.
6148fe6060f1SDimitry Andric   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
6149fe6060f1SDimitry Andric     // 16 bit gradients are supported, but are tied to the A16 control
6150fe6060f1SDimitry Andric     // so both gradients and addresses must be 16 bit
61515ffd83dbSDimitry Andric     return false;
6152fe6060f1SDimitry Andric   }
61535ffd83dbSDimitry Andric 
6154fe6060f1SDimitry Andric   if (IsA16 && !ST.hasA16()) {
6155fe6060f1SDimitry Andric     // A16 not supported
6156fe6060f1SDimitry Andric     return false;
6157fe6060f1SDimitry Andric   }
6158fe6060f1SDimitry Andric 
6159*5f757f3fSDimitry Andric   const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->Sampler);
616006c3fb27SDimitry Andric   const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
616106c3fb27SDimitry Andric 
6162fe6060f1SDimitry Andric   if (IsA16 || IsG16) {
6163*5f757f3fSDimitry Andric     // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the
6164*5f757f3fSDimitry Andric     // instructions expect VGPR_32
61655ffd83dbSDimitry Andric     SmallVector<Register, 4> PackedRegs;
61665ffd83dbSDimitry Andric 
6167*5f757f3fSDimitry Andric     packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, IsG16);
61685ffd83dbSDimitry Andric 
61695ffd83dbSDimitry Andric     // See also below in the non-a16 branch
6170bdd1243dSDimitry Andric     const bool UseNSA = ST.hasNSAEncoding() &&
6171bdd1243dSDimitry Andric                         PackedRegs.size() >= ST.getNSAThreshold(MF) &&
617206c3fb27SDimitry Andric                         (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
617306c3fb27SDimitry Andric     const bool UsePartialNSA =
617406c3fb27SDimitry Andric         UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;
61755ffd83dbSDimitry Andric 
617606c3fb27SDimitry Andric     if (UsePartialNSA) {
617706c3fb27SDimitry Andric       // Pack registers that would go over NSAMaxSize into last VAddr register
617806c3fb27SDimitry Andric       LLT PackedAddrTy =
617906c3fb27SDimitry Andric           LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16);
618006c3fb27SDimitry Andric       auto Concat = B.buildConcatVectors(
618106c3fb27SDimitry Andric           PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
618206c3fb27SDimitry Andric       PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
618306c3fb27SDimitry Andric       PackedRegs.resize(NSAMaxSize);
618406c3fb27SDimitry Andric     } else if (!UseNSA && PackedRegs.size() > 1) {
6185fe6060f1SDimitry Andric       LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
61865ffd83dbSDimitry Andric       auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
61875ffd83dbSDimitry Andric       PackedRegs[0] = Concat.getReg(0);
61885ffd83dbSDimitry Andric       PackedRegs.resize(1);
61895ffd83dbSDimitry Andric     }
61905ffd83dbSDimitry Andric 
6191e8d8bef9SDimitry Andric     const unsigned NumPacked = PackedRegs.size();
6192e8d8bef9SDimitry Andric     for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
6193e8d8bef9SDimitry Andric       MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
61945ffd83dbSDimitry Andric       if (!SrcOp.isReg()) {
61955ffd83dbSDimitry Andric         assert(SrcOp.isImm() && SrcOp.getImm() == 0);
61965ffd83dbSDimitry Andric         continue;
61975ffd83dbSDimitry Andric       }
61985ffd83dbSDimitry Andric 
61995ffd83dbSDimitry Andric       assert(SrcOp.getReg() != AMDGPU::NoRegister);
62005ffd83dbSDimitry Andric 
6201e8d8bef9SDimitry Andric       if (I - Intr->VAddrStart < NumPacked)
6202e8d8bef9SDimitry Andric         SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
62035ffd83dbSDimitry Andric       else
62045ffd83dbSDimitry Andric         SrcOp.setReg(AMDGPU::NoRegister);
62055ffd83dbSDimitry Andric     }
62065ffd83dbSDimitry Andric   } else {
62075ffd83dbSDimitry Andric     // If the register allocator cannot place the address registers contiguously
62085ffd83dbSDimitry Andric     // without introducing moves, then using the non-sequential address encoding
62095ffd83dbSDimitry Andric     // is always preferable, since it saves VALU instructions and is usually a
62105ffd83dbSDimitry Andric     // wash in terms of code size or even better.
62115ffd83dbSDimitry Andric     //
62125ffd83dbSDimitry Andric     // However, we currently have no way of hinting to the register allocator
62135ffd83dbSDimitry Andric     // that MIMG addresses should be placed contiguously when it is possible to
62145ffd83dbSDimitry Andric     // do so, so force non-NSA for the common 2-address case as a heuristic.
62155ffd83dbSDimitry Andric     //
62165ffd83dbSDimitry Andric     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
62175ffd83dbSDimitry Andric     // allocation when possible.
621881ad6265SDimitry Andric     //
6219*5f757f3fSDimitry Andric     // Partial NSA is allowed on GFX11+ where the final register is a contiguous
622006c3fb27SDimitry Andric     // set of the remaining addresses.
6221bdd1243dSDimitry Andric     const bool UseNSA = ST.hasNSAEncoding() &&
6222bdd1243dSDimitry Andric                         CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
622306c3fb27SDimitry Andric                         (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
622406c3fb27SDimitry Andric     const bool UsePartialNSA =
622506c3fb27SDimitry Andric         UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
62265ffd83dbSDimitry Andric 
622706c3fb27SDimitry Andric     if (UsePartialNSA) {
622806c3fb27SDimitry Andric       convertImageAddrToPacked(B, MI,
622906c3fb27SDimitry Andric                                ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
623006c3fb27SDimitry Andric                                Intr->NumVAddrs - NSAMaxSize + 1);
623106c3fb27SDimitry Andric     } else if (!UseNSA && Intr->NumVAddrs > 1) {
6232e8d8bef9SDimitry Andric       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
6233e8d8bef9SDimitry Andric                                Intr->NumVAddrs);
62345ffd83dbSDimitry Andric     }
623506c3fb27SDimitry Andric   }
62365ffd83dbSDimitry Andric 
62375ffd83dbSDimitry Andric   int Flags = 0;
62385ffd83dbSDimitry Andric   if (IsA16)
62395ffd83dbSDimitry Andric     Flags |= 1;
62405ffd83dbSDimitry Andric   if (IsG16)
62415ffd83dbSDimitry Andric     Flags |= 2;
62425ffd83dbSDimitry Andric   MI.addOperand(MachineOperand::CreateImm(Flags));
62435ffd83dbSDimitry Andric 
62445ffd83dbSDimitry Andric   if (BaseOpcode->Store) { // No TFE for stores?
62455ffd83dbSDimitry Andric     // TODO: Handle dmask trim
624604eeddc0SDimitry Andric     if (!Ty.isVector() || !IsD16)
62475ffd83dbSDimitry Andric       return true;
62485ffd83dbSDimitry Andric 
6249e8d8bef9SDimitry Andric     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
62505ffd83dbSDimitry Andric     if (RepackedReg != VData) {
62515ffd83dbSDimitry Andric       MI.getOperand(1).setReg(RepackedReg);
62525ffd83dbSDimitry Andric     }
62535ffd83dbSDimitry Andric 
62545ffd83dbSDimitry Andric     return true;
62555ffd83dbSDimitry Andric   }
62565ffd83dbSDimitry Andric 
62575ffd83dbSDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
62585ffd83dbSDimitry Andric   const LLT EltTy = Ty.getScalarType();
62595ffd83dbSDimitry Andric   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
62605ffd83dbSDimitry Andric 
62615ffd83dbSDimitry Andric   // Confirm that the return type is large enough for the dmask specified
62625ffd83dbSDimitry Andric   if (NumElts < DMaskLanes)
62635ffd83dbSDimitry Andric     return false;
62645ffd83dbSDimitry Andric 
62655ffd83dbSDimitry Andric   if (NumElts > 4 || DMaskLanes > 4)
62665ffd83dbSDimitry Andric     return false;
62675ffd83dbSDimitry Andric 
62685ffd83dbSDimitry Andric   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
6269fe6060f1SDimitry Andric   const LLT AdjustedTy =
6270fe6060f1SDimitry Andric       Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
62715ffd83dbSDimitry Andric 
62725ffd83dbSDimitry Andric   // The raw dword aligned data component of the load. The only legal cases
62735ffd83dbSDimitry Andric   // where this matters should be when using the packed D16 format, for
62745ffd83dbSDimitry Andric   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
62755ffd83dbSDimitry Andric   LLT RoundedTy;
62765ffd83dbSDimitry Andric 
6277bdd1243dSDimitry Andric   // S32 vector to cover all data, plus TFE result element.
62785ffd83dbSDimitry Andric   LLT TFETy;
62795ffd83dbSDimitry Andric 
62805ffd83dbSDimitry Andric   // Register type to use for each loaded component. Will be S32 or V2S16.
62815ffd83dbSDimitry Andric   LLT RegTy;
62825ffd83dbSDimitry Andric 
62835ffd83dbSDimitry Andric   if (IsD16 && ST.hasUnpackedD16VMem()) {
6284fe6060f1SDimitry Andric     RoundedTy =
6285fe6060f1SDimitry Andric         LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
6286fe6060f1SDimitry Andric     TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
62875ffd83dbSDimitry Andric     RegTy = S32;
62885ffd83dbSDimitry Andric   } else {
62895ffd83dbSDimitry Andric     unsigned EltSize = EltTy.getSizeInBits();
62905ffd83dbSDimitry Andric     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
62915ffd83dbSDimitry Andric     unsigned RoundedSize = 32 * RoundedElts;
6292fe6060f1SDimitry Andric     RoundedTy = LLT::scalarOrVector(
6293fe6060f1SDimitry Andric         ElementCount::getFixed(RoundedSize / EltSize), EltSize);
6294fe6060f1SDimitry Andric     TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
62955ffd83dbSDimitry Andric     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
62965ffd83dbSDimitry Andric   }
62975ffd83dbSDimitry Andric 
62985ffd83dbSDimitry Andric   // The return type does not need adjustment.
62995ffd83dbSDimitry Andric   // TODO: Should we change s16 case to s32 or <2 x s16>?
63005ffd83dbSDimitry Andric   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
63015ffd83dbSDimitry Andric     return true;
63025ffd83dbSDimitry Andric 
63035ffd83dbSDimitry Andric   Register Dst1Reg;
63045ffd83dbSDimitry Andric 
63055ffd83dbSDimitry Andric   // Insert after the instruction.
63065ffd83dbSDimitry Andric   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
63075ffd83dbSDimitry Andric 
63085ffd83dbSDimitry Andric   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
63095ffd83dbSDimitry Andric   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
63105ffd83dbSDimitry Andric   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
63115ffd83dbSDimitry Andric   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
63125ffd83dbSDimitry Andric 
63135ffd83dbSDimitry Andric   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
63145ffd83dbSDimitry Andric 
63155ffd83dbSDimitry Andric   MI.getOperand(0).setReg(NewResultReg);
63165ffd83dbSDimitry Andric 
63175ffd83dbSDimitry Andric   // In the IR, TFE is supposed to be used with a 2 element struct return
6318349cc55cSDimitry Andric   // type. The instruction really returns these two values in one contiguous
63195ffd83dbSDimitry Andric   // register, with one additional dword beyond the loaded data. Rewrite the
63205ffd83dbSDimitry Andric   // return type to use a single register result.
63215ffd83dbSDimitry Andric 
63225ffd83dbSDimitry Andric   if (IsTFE) {
63235ffd83dbSDimitry Andric     Dst1Reg = MI.getOperand(1).getReg();
63245ffd83dbSDimitry Andric     if (MRI->getType(Dst1Reg) != S32)
63255ffd83dbSDimitry Andric       return false;
63265ffd83dbSDimitry Andric 
63275ffd83dbSDimitry Andric     // TODO: Make sure the TFE operand bit is set.
632881ad6265SDimitry Andric     MI.removeOperand(1);
63295ffd83dbSDimitry Andric 
63305ffd83dbSDimitry Andric     // Handle the easy case that requires no repack instructions.
63315ffd83dbSDimitry Andric     if (Ty == S32) {
63325ffd83dbSDimitry Andric       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
63335ffd83dbSDimitry Andric       return true;
63345ffd83dbSDimitry Andric     }
63355ffd83dbSDimitry Andric   }
63365ffd83dbSDimitry Andric 
63375ffd83dbSDimitry Andric   // Now figure out how to copy the new result register back into the old
63385ffd83dbSDimitry Andric   // result.
63395ffd83dbSDimitry Andric   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
63405ffd83dbSDimitry Andric 
63415ffd83dbSDimitry Andric   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
63425ffd83dbSDimitry Andric 
63435ffd83dbSDimitry Andric   if (ResultNumRegs == 1) {
63445ffd83dbSDimitry Andric     assert(!IsTFE);
63455ffd83dbSDimitry Andric     ResultRegs[0] = NewResultReg;
63465ffd83dbSDimitry Andric   } else {
63475ffd83dbSDimitry Andric     // We have to repack into a new vector of some kind.
63485ffd83dbSDimitry Andric     for (int I = 0; I != NumDataRegs; ++I)
63495ffd83dbSDimitry Andric       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
63505ffd83dbSDimitry Andric     B.buildUnmerge(ResultRegs, NewResultReg);
63515ffd83dbSDimitry Andric 
63525ffd83dbSDimitry Andric     // Drop the final TFE element to get the data part. The TFE result is
63535ffd83dbSDimitry Andric     // directly written to the right place already.
63545ffd83dbSDimitry Andric     if (IsTFE)
63555ffd83dbSDimitry Andric       ResultRegs.resize(NumDataRegs);
63565ffd83dbSDimitry Andric   }
63575ffd83dbSDimitry Andric 
63585ffd83dbSDimitry Andric   // For an s16 scalar result, we form an s32 result with a truncate regardless
63595ffd83dbSDimitry Andric   // of packed vs. unpacked.
63605ffd83dbSDimitry Andric   if (IsD16 && !Ty.isVector()) {
63615ffd83dbSDimitry Andric     B.buildTrunc(DstReg, ResultRegs[0]);
63625ffd83dbSDimitry Andric     return true;
63635ffd83dbSDimitry Andric   }
63645ffd83dbSDimitry Andric 
63655ffd83dbSDimitry Andric   // Avoid a build/concat_vector of 1 entry.
63665ffd83dbSDimitry Andric   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
63675ffd83dbSDimitry Andric     B.buildBitcast(DstReg, ResultRegs[0]);
63685ffd83dbSDimitry Andric     return true;
63695ffd83dbSDimitry Andric   }
63705ffd83dbSDimitry Andric 
63715ffd83dbSDimitry Andric   assert(Ty.isVector());
63725ffd83dbSDimitry Andric 
63735ffd83dbSDimitry Andric   if (IsD16) {
63745ffd83dbSDimitry Andric     // For packed D16 results with TFE enabled, all the data components are
63755ffd83dbSDimitry Andric     // S32. Cast back to the expected type.
63765ffd83dbSDimitry Andric     //
63775ffd83dbSDimitry Andric     // TODO: We don't really need to use load s32 elements. We would only need one
63785ffd83dbSDimitry Andric     // cast for the TFE result if a multiple of v2s16 was used.
63795ffd83dbSDimitry Andric     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
63805ffd83dbSDimitry Andric       for (Register &Reg : ResultRegs)
63815ffd83dbSDimitry Andric         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
63825ffd83dbSDimitry Andric     } else if (ST.hasUnpackedD16VMem()) {
63835ffd83dbSDimitry Andric       for (Register &Reg : ResultRegs)
63845ffd83dbSDimitry Andric         Reg = B.buildTrunc(S16, Reg).getReg(0);
63855ffd83dbSDimitry Andric     }
63865ffd83dbSDimitry Andric   }
63875ffd83dbSDimitry Andric 
63885ffd83dbSDimitry Andric   auto padWithUndef = [&](LLT Ty, int NumElts) {
63895ffd83dbSDimitry Andric     if (NumElts == 0)
63905ffd83dbSDimitry Andric       return;
63915ffd83dbSDimitry Andric     Register Undef = B.buildUndef(Ty).getReg(0);
63925ffd83dbSDimitry Andric     for (int I = 0; I != NumElts; ++I)
63935ffd83dbSDimitry Andric       ResultRegs.push_back(Undef);
63945ffd83dbSDimitry Andric   };
63955ffd83dbSDimitry Andric 
63965ffd83dbSDimitry Andric   // Pad out any elements eliminated due to the dmask.
63975ffd83dbSDimitry Andric   LLT ResTy = MRI->getType(ResultRegs[0]);
63985ffd83dbSDimitry Andric   if (!ResTy.isVector()) {
63995ffd83dbSDimitry Andric     padWithUndef(ResTy, NumElts - ResultRegs.size());
64005ffd83dbSDimitry Andric     B.buildBuildVector(DstReg, ResultRegs);
64015ffd83dbSDimitry Andric     return true;
64025ffd83dbSDimitry Andric   }
64035ffd83dbSDimitry Andric 
64045ffd83dbSDimitry Andric   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
64055ffd83dbSDimitry Andric   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
64065ffd83dbSDimitry Andric 
64075ffd83dbSDimitry Andric   // Deal with the one annoying legal case.
6408fe6060f1SDimitry Andric   const LLT V3S16 = LLT::fixed_vector(3, 16);
64095ffd83dbSDimitry Andric   if (Ty == V3S16) {
64100eae32dcSDimitry Andric     if (IsTFE) {
64110eae32dcSDimitry Andric       if (ResultRegs.size() == 1) {
64120eae32dcSDimitry Andric         NewResultReg = ResultRegs[0];
64130eae32dcSDimitry Andric       } else if (ResultRegs.size() == 2) {
64140eae32dcSDimitry Andric         LLT V4S16 = LLT::fixed_vector(4, 16);
64150eae32dcSDimitry Andric         NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
64160eae32dcSDimitry Andric       } else {
64170eae32dcSDimitry Andric         return false;
64180eae32dcSDimitry Andric       }
64190eae32dcSDimitry Andric     }
64200eae32dcSDimitry Andric 
64210eae32dcSDimitry Andric     if (MRI->getType(DstReg).getNumElements() <
64220eae32dcSDimitry Andric         MRI->getType(NewResultReg).getNumElements()) {
64230eae32dcSDimitry Andric       B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
64240eae32dcSDimitry Andric     } else {
64250eae32dcSDimitry Andric       B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
64260eae32dcSDimitry Andric     }
64275ffd83dbSDimitry Andric     return true;
64285ffd83dbSDimitry Andric   }
64295ffd83dbSDimitry Andric 
64305ffd83dbSDimitry Andric   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
64315ffd83dbSDimitry Andric   B.buildConcatVectors(DstReg, ResultRegs);
64325ffd83dbSDimitry Andric   return true;
64335ffd83dbSDimitry Andric }
64345ffd83dbSDimitry Andric 
64355ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeSBufferLoad(
6436e8d8bef9SDimitry Andric   LegalizerHelper &Helper, MachineInstr &MI) const {
6437e8d8bef9SDimitry Andric   MachineIRBuilder &B = Helper.MIRBuilder;
6438e8d8bef9SDimitry Andric   GISelChangeObserver &Observer = Helper.Observer;
6439e8d8bef9SDimitry Andric 
64405ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
64415ffd83dbSDimitry Andric   LLT Ty = B.getMRI()->getType(Dst);
64425ffd83dbSDimitry Andric   unsigned Size = Ty.getSizeInBits();
64435ffd83dbSDimitry Andric   MachineFunction &MF = B.getMF();
64445ffd83dbSDimitry Andric 
64455ffd83dbSDimitry Andric   Observer.changingInstr(MI);
64465ffd83dbSDimitry Andric 
644706c3fb27SDimitry Andric   // Handle needing to s.buffer.load() a p8 value.
644806c3fb27SDimitry Andric   if (hasBufferRsrcWorkaround(Ty)) {
644906c3fb27SDimitry Andric     Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0);
645006c3fb27SDimitry Andric     B.setInsertPt(B.getMBB(), MI);
645106c3fb27SDimitry Andric   }
6452fe6060f1SDimitry Andric   if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
6453e8d8bef9SDimitry Andric     Ty = getBitcastRegisterType(Ty);
6454e8d8bef9SDimitry Andric     Helper.bitcastDst(MI, Ty, 0);
6455e8d8bef9SDimitry Andric     B.setInsertPt(B.getMBB(), MI);
6456e8d8bef9SDimitry Andric   }
6457e8d8bef9SDimitry Andric 
64585ffd83dbSDimitry Andric   // FIXME: We don't really need this intermediate instruction. The intrinsic
64595ffd83dbSDimitry Andric   // should be fixed to have a memory operand. Since it's readnone, we're not
64605ffd83dbSDimitry Andric   // allowed to add one.
64615ffd83dbSDimitry Andric   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
646281ad6265SDimitry Andric   MI.removeOperand(1); // Remove intrinsic ID
64635ffd83dbSDimitry Andric 
64645ffd83dbSDimitry Andric   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
64655ffd83dbSDimitry Andric   // TODO: Should this use datalayout alignment?
64665ffd83dbSDimitry Andric   const unsigned MemSize = (Size + 7) / 8;
64675ffd83dbSDimitry Andric   const Align MemAlign(4);
64685ffd83dbSDimitry Andric   MachineMemOperand *MMO = MF.getMachineMemOperand(
64695ffd83dbSDimitry Andric       MachinePointerInfo(),
64705ffd83dbSDimitry Andric       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
64715ffd83dbSDimitry Andric           MachineMemOperand::MOInvariant,
64725ffd83dbSDimitry Andric       MemSize, MemAlign);
64735ffd83dbSDimitry Andric   MI.addMemOperand(MF, MMO);
64745ffd83dbSDimitry Andric 
6475*5f757f3fSDimitry Andric   // If we don't have 96-bit result scalar loads, widening to 128-bit should
64765ffd83dbSDimitry Andric   // always be legal. We may need to restore this to a 96-bit result if it turns
64775ffd83dbSDimitry Andric   // out this needs to be converted to a vector load during RegBankSelect.
6478*5f757f3fSDimitry Andric   if (!isPowerOf2_32(Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) {
64795ffd83dbSDimitry Andric     if (Ty.isVector())
64805ffd83dbSDimitry Andric       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
64815ffd83dbSDimitry Andric     else
64825ffd83dbSDimitry Andric       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
64835ffd83dbSDimitry Andric   }
64845ffd83dbSDimitry Andric 
64855ffd83dbSDimitry Andric   Observer.changedInstr(MI);
64865ffd83dbSDimitry Andric   return true;
64875ffd83dbSDimitry Andric }
64885ffd83dbSDimitry Andric 
6489e8d8bef9SDimitry Andric // TODO: Move to selection
64905ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
64910b57cec5SDimitry Andric                                                 MachineRegisterInfo &MRI,
64920b57cec5SDimitry Andric                                                 MachineIRBuilder &B) const {
6493fe6060f1SDimitry Andric   if (!ST.isTrapHandlerEnabled() ||
6494fe6060f1SDimitry Andric       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
6495fe6060f1SDimitry Andric     return legalizeTrapEndpgm(MI, MRI, B);
6496fe6060f1SDimitry Andric 
649706c3fb27SDimitry Andric   return ST.supportsGetDoorbellID() ?
649806c3fb27SDimitry Andric          legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B);
6499fe6060f1SDimitry Andric }
6500fe6060f1SDimitry Andric 
6501fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
6502fe6060f1SDimitry Andric     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
650306c3fb27SDimitry Andric   const DebugLoc &DL = MI.getDebugLoc();
650406c3fb27SDimitry Andric   MachineBasicBlock &BB = B.getMBB();
650506c3fb27SDimitry Andric   MachineFunction *MF = BB.getParent();
650606c3fb27SDimitry Andric 
650706c3fb27SDimitry Andric   if (BB.succ_empty() && std::next(MI.getIterator()) == BB.end()) {
650806c3fb27SDimitry Andric     BuildMI(BB, BB.end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
650906c3fb27SDimitry Andric       .addImm(0);
651006c3fb27SDimitry Andric     MI.eraseFromParent();
651106c3fb27SDimitry Andric     return true;
651206c3fb27SDimitry Andric   }
651306c3fb27SDimitry Andric 
651406c3fb27SDimitry Andric   // We need a block split to make the real endpgm a terminator. We also don't
651506c3fb27SDimitry Andric   // want to break phis in successor blocks, so we can't just delete to the
651606c3fb27SDimitry Andric   // end of the block.
651706c3fb27SDimitry Andric   BB.splitAt(MI, false /*UpdateLiveIns*/);
651806c3fb27SDimitry Andric   MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
651906c3fb27SDimitry Andric   MF->push_back(TrapBB);
652006c3fb27SDimitry Andric   BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
652106c3fb27SDimitry Andric     .addImm(0);
652206c3fb27SDimitry Andric   BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
652306c3fb27SDimitry Andric     .addMBB(TrapBB);
652406c3fb27SDimitry Andric 
652506c3fb27SDimitry Andric   BB.addSuccessor(TrapBB);
6526fe6060f1SDimitry Andric   MI.eraseFromParent();
6527fe6060f1SDimitry Andric   return true;
6528fe6060f1SDimitry Andric }
6529fe6060f1SDimitry Andric 
6530fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
6531fe6060f1SDimitry Andric     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
653281ad6265SDimitry Andric   MachineFunction &MF = B.getMF();
653381ad6265SDimitry Andric   const LLT S64 = LLT::scalar(64);
653481ad6265SDimitry Andric 
653581ad6265SDimitry Andric   Register SGPR01(AMDGPU::SGPR0_SGPR1);
653681ad6265SDimitry Andric   // For code object version 5, queue_ptr is passed through implicit kernarg.
653706c3fb27SDimitry Andric   if (AMDGPU::getCodeObjectVersion(*MF.getFunction().getParent()) >=
653806c3fb27SDimitry Andric       AMDGPU::AMDHSA_COV5) {
653981ad6265SDimitry Andric     AMDGPUTargetLowering::ImplicitParameter Param =
654081ad6265SDimitry Andric         AMDGPUTargetLowering::QUEUE_PTR;
654181ad6265SDimitry Andric     uint64_t Offset =
654281ad6265SDimitry Andric         ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
654381ad6265SDimitry Andric 
654481ad6265SDimitry Andric     Register KernargPtrReg = MRI.createGenericVirtualRegister(
654581ad6265SDimitry Andric         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
654681ad6265SDimitry Andric 
654781ad6265SDimitry Andric     if (!loadInputValue(KernargPtrReg, B,
654881ad6265SDimitry Andric                         AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
654981ad6265SDimitry Andric       return false;
655081ad6265SDimitry Andric 
655181ad6265SDimitry Andric     // TODO: can we be smarter about machine pointer info?
655281ad6265SDimitry Andric     MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
655381ad6265SDimitry Andric     MachineMemOperand *MMO = MF.getMachineMemOperand(
655481ad6265SDimitry Andric         PtrInfo,
655581ad6265SDimitry Andric         MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
655681ad6265SDimitry Andric             MachineMemOperand::MOInvariant,
655781ad6265SDimitry Andric         LLT::scalar(64), commonAlignment(Align(64), Offset));
655881ad6265SDimitry Andric 
655981ad6265SDimitry Andric     // Pointer address
656081ad6265SDimitry Andric     Register LoadAddr = MRI.createGenericVirtualRegister(
656181ad6265SDimitry Andric         LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
656281ad6265SDimitry Andric     B.buildPtrAdd(LoadAddr, KernargPtrReg,
656381ad6265SDimitry Andric                   B.buildConstant(LLT::scalar(64), Offset).getReg(0));
656481ad6265SDimitry Andric     // Load address
656581ad6265SDimitry Andric     Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
656681ad6265SDimitry Andric     B.buildCopy(SGPR01, Temp);
656781ad6265SDimitry Andric     B.buildInstr(AMDGPU::S_TRAP)
656881ad6265SDimitry Andric         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
656981ad6265SDimitry Andric         .addReg(SGPR01, RegState::Implicit);
657081ad6265SDimitry Andric     MI.eraseFromParent();
657181ad6265SDimitry Andric     return true;
657281ad6265SDimitry Andric   }
657381ad6265SDimitry Andric 
65745ffd83dbSDimitry Andric   // Pass queue pointer to trap handler as input, and insert trap instruction
65755ffd83dbSDimitry Andric   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
6576e8d8bef9SDimitry Andric   Register LiveIn =
6577e8d8bef9SDimitry Andric     MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
6578e8d8bef9SDimitry Andric   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
65795ffd83dbSDimitry Andric     return false;
6580e8d8bef9SDimitry Andric 
65815ffd83dbSDimitry Andric   B.buildCopy(SGPR01, LiveIn);
65825ffd83dbSDimitry Andric   B.buildInstr(AMDGPU::S_TRAP)
6583fe6060f1SDimitry Andric       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
65845ffd83dbSDimitry Andric       .addReg(SGPR01, RegState::Implicit);
6585fe6060f1SDimitry Andric 
6586fe6060f1SDimitry Andric   MI.eraseFromParent();
6587fe6060f1SDimitry Andric   return true;
65885ffd83dbSDimitry Andric }
65895ffd83dbSDimitry Andric 
6590fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeTrapHsa(
6591fe6060f1SDimitry Andric     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6592fe6060f1SDimitry Andric   B.buildInstr(AMDGPU::S_TRAP)
6593fe6060f1SDimitry Andric       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
65945ffd83dbSDimitry Andric   MI.eraseFromParent();
65955ffd83dbSDimitry Andric   return true;
65965ffd83dbSDimitry Andric }
65975ffd83dbSDimitry Andric 
65985ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
65995ffd83dbSDimitry Andric     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
6600349cc55cSDimitry Andric   // Is non-HSA path or trap-handler disabled? Then, report a warning
66015ffd83dbSDimitry Andric   // accordingly
6602fe6060f1SDimitry Andric   if (!ST.isTrapHandlerEnabled() ||
6603fe6060f1SDimitry Andric       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
66045ffd83dbSDimitry Andric     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
66055ffd83dbSDimitry Andric                                      "debugtrap handler not supported",
66065ffd83dbSDimitry Andric                                      MI.getDebugLoc(), DS_Warning);
66075ffd83dbSDimitry Andric     LLVMContext &Ctx = B.getMF().getFunction().getContext();
66085ffd83dbSDimitry Andric     Ctx.diagnose(NoTrap);
66095ffd83dbSDimitry Andric   } else {
66105ffd83dbSDimitry Andric     // Insert debug-trap instruction
6611fe6060f1SDimitry Andric     B.buildInstr(AMDGPU::S_TRAP)
6612fe6060f1SDimitry Andric         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
66135ffd83dbSDimitry Andric   }
66145ffd83dbSDimitry Andric 
66155ffd83dbSDimitry Andric   MI.eraseFromParent();
66165ffd83dbSDimitry Andric   return true;
66175ffd83dbSDimitry Andric }
66185ffd83dbSDimitry Andric 
6619e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
6620e8d8bef9SDimitry Andric                                                MachineIRBuilder &B) const {
6621e8d8bef9SDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
6622e8d8bef9SDimitry Andric   const LLT S16 = LLT::scalar(16);
6623e8d8bef9SDimitry Andric   const LLT S32 = LLT::scalar(32);
662481ad6265SDimitry Andric   const LLT V2S16 = LLT::fixed_vector(2, 16);
662581ad6265SDimitry Andric   const LLT V3S32 = LLT::fixed_vector(3, 32);
6626e8d8bef9SDimitry Andric 
6627e8d8bef9SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
6628e8d8bef9SDimitry Andric   Register NodePtr = MI.getOperand(2).getReg();
6629e8d8bef9SDimitry Andric   Register RayExtent = MI.getOperand(3).getReg();
6630e8d8bef9SDimitry Andric   Register RayOrigin = MI.getOperand(4).getReg();
6631e8d8bef9SDimitry Andric   Register RayDir = MI.getOperand(5).getReg();
6632e8d8bef9SDimitry Andric   Register RayInvDir = MI.getOperand(6).getReg();
6633e8d8bef9SDimitry Andric   Register TDescr = MI.getOperand(7).getReg();
6634e8d8bef9SDimitry Andric 
6635fe6060f1SDimitry Andric   if (!ST.hasGFX10_AEncoding()) {
6636fe6060f1SDimitry Andric     DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
6637fe6060f1SDimitry Andric                                         "intrinsic not supported on subtarget",
6638fe6060f1SDimitry Andric                                         MI.getDebugLoc());
6639fe6060f1SDimitry Andric     B.getMF().getFunction().getContext().diagnose(BadIntrin);
6640fe6060f1SDimitry Andric     return false;
6641fe6060f1SDimitry Andric   }
6642fe6060f1SDimitry Andric 
6643*5f757f3fSDimitry Andric   const bool IsGFX11 = AMDGPU::isGFX11(ST);
664481ad6265SDimitry Andric   const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
6645*5f757f3fSDimitry Andric   const bool IsGFX12Plus = AMDGPU::isGFX12Plus(ST);
6646349cc55cSDimitry Andric   const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
6647349cc55cSDimitry Andric   const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
6648349cc55cSDimitry Andric   const unsigned NumVDataDwords = 4;
6649349cc55cSDimitry Andric   const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
665081ad6265SDimitry Andric   const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
6651*5f757f3fSDimitry Andric   const bool UseNSA =
6652*5f757f3fSDimitry Andric       IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
6653*5f757f3fSDimitry Andric 
6654349cc55cSDimitry Andric   const unsigned BaseOpcodes[2][2] = {
6655349cc55cSDimitry Andric       {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
6656349cc55cSDimitry Andric       {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
6657349cc55cSDimitry Andric        AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
6658349cc55cSDimitry Andric   int Opcode;
6659349cc55cSDimitry Andric   if (UseNSA) {
666081ad6265SDimitry Andric     Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
6661*5f757f3fSDimitry Andric                                    IsGFX12Plus ? AMDGPU::MIMGEncGfx12
6662*5f757f3fSDimitry Andric                                    : IsGFX11   ? AMDGPU::MIMGEncGfx11NSA
666381ad6265SDimitry Andric                                                : AMDGPU::MIMGEncGfx10NSA,
6664349cc55cSDimitry Andric                                    NumVDataDwords, NumVAddrDwords);
6665349cc55cSDimitry Andric   } else {
6666*5f757f3fSDimitry Andric     assert(!IsGFX12Plus);
6667*5f757f3fSDimitry Andric     Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
6668*5f757f3fSDimitry Andric                                    IsGFX11 ? AMDGPU::MIMGEncGfx11Default
6669*5f757f3fSDimitry Andric                                            : AMDGPU::MIMGEncGfx10Default,
6670bdd1243dSDimitry Andric                                    NumVDataDwords, NumVAddrDwords);
6671349cc55cSDimitry Andric   }
6672349cc55cSDimitry Andric   assert(Opcode != -1);
6673e8d8bef9SDimitry Andric 
6674e8d8bef9SDimitry Andric   SmallVector<Register, 12> Ops;
667581ad6265SDimitry Andric   if (UseNSA && IsGFX11Plus) {
667681ad6265SDimitry Andric     auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
667781ad6265SDimitry Andric       auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
6678bdd1243dSDimitry Andric       auto Merged = B.buildMergeLikeInstr(
667981ad6265SDimitry Andric           V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
668081ad6265SDimitry Andric       Ops.push_back(Merged.getReg(0));
668181ad6265SDimitry Andric     };
668281ad6265SDimitry Andric 
668381ad6265SDimitry Andric     Ops.push_back(NodePtr);
668481ad6265SDimitry Andric     Ops.push_back(RayExtent);
668581ad6265SDimitry Andric     packLanes(RayOrigin);
668681ad6265SDimitry Andric 
668781ad6265SDimitry Andric     if (IsA16) {
668881ad6265SDimitry Andric       auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
668981ad6265SDimitry Andric       auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
6690bdd1243dSDimitry Andric       auto MergedDir = B.buildMergeLikeInstr(
669181ad6265SDimitry Andric           V3S32,
6692bdd1243dSDimitry Andric           {B.buildBitcast(
6693bdd1243dSDimitry Andric                 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
669481ad6265SDimitry Andric                                                    UnmergeRayDir.getReg(0)}))
669581ad6265SDimitry Andric                .getReg(0),
6696bdd1243dSDimitry Andric            B.buildBitcast(
6697bdd1243dSDimitry Andric                 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
669881ad6265SDimitry Andric                                                    UnmergeRayDir.getReg(1)}))
669981ad6265SDimitry Andric                .getReg(0),
6700bdd1243dSDimitry Andric            B.buildBitcast(
6701bdd1243dSDimitry Andric                 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
670281ad6265SDimitry Andric                                                    UnmergeRayDir.getReg(2)}))
670381ad6265SDimitry Andric                .getReg(0)});
670481ad6265SDimitry Andric       Ops.push_back(MergedDir.getReg(0));
670581ad6265SDimitry Andric     } else {
670681ad6265SDimitry Andric       packLanes(RayDir);
670781ad6265SDimitry Andric       packLanes(RayInvDir);
670881ad6265SDimitry Andric     }
670981ad6265SDimitry Andric   } else {
6710e8d8bef9SDimitry Andric     if (Is64) {
6711e8d8bef9SDimitry Andric       auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
6712e8d8bef9SDimitry Andric       Ops.push_back(Unmerge.getReg(0));
6713e8d8bef9SDimitry Andric       Ops.push_back(Unmerge.getReg(1));
6714e8d8bef9SDimitry Andric     } else {
6715e8d8bef9SDimitry Andric       Ops.push_back(NodePtr);
6716e8d8bef9SDimitry Andric     }
6717e8d8bef9SDimitry Andric     Ops.push_back(RayExtent);
6718e8d8bef9SDimitry Andric 
6719e8d8bef9SDimitry Andric     auto packLanes = [&Ops, &S32, &B](Register Src) {
67200eae32dcSDimitry Andric       auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
6721e8d8bef9SDimitry Andric       Ops.push_back(Unmerge.getReg(0));
6722e8d8bef9SDimitry Andric       Ops.push_back(Unmerge.getReg(1));
6723e8d8bef9SDimitry Andric       Ops.push_back(Unmerge.getReg(2));
6724e8d8bef9SDimitry Andric     };
6725e8d8bef9SDimitry Andric 
6726e8d8bef9SDimitry Andric     packLanes(RayOrigin);
6727e8d8bef9SDimitry Andric     if (IsA16) {
67280eae32dcSDimitry Andric       auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
67290eae32dcSDimitry Andric       auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
6730e8d8bef9SDimitry Andric       Register R1 = MRI.createGenericVirtualRegister(S32);
6731e8d8bef9SDimitry Andric       Register R2 = MRI.createGenericVirtualRegister(S32);
6732e8d8bef9SDimitry Andric       Register R3 = MRI.createGenericVirtualRegister(S32);
6733bdd1243dSDimitry Andric       B.buildMergeLikeInstr(R1,
6734bdd1243dSDimitry Andric                             {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
6735bdd1243dSDimitry Andric       B.buildMergeLikeInstr(
6736bdd1243dSDimitry Andric           R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
6737bdd1243dSDimitry Andric       B.buildMergeLikeInstr(
6738bdd1243dSDimitry Andric           R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
6739e8d8bef9SDimitry Andric       Ops.push_back(R1);
6740e8d8bef9SDimitry Andric       Ops.push_back(R2);
6741e8d8bef9SDimitry Andric       Ops.push_back(R3);
6742e8d8bef9SDimitry Andric     } else {
6743e8d8bef9SDimitry Andric       packLanes(RayDir);
6744e8d8bef9SDimitry Andric       packLanes(RayInvDir);
6745e8d8bef9SDimitry Andric     }
674681ad6265SDimitry Andric   }
6747e8d8bef9SDimitry Andric 
6748349cc55cSDimitry Andric   if (!UseNSA) {
6749349cc55cSDimitry Andric     // Build a single vector containing all the operands so far prepared.
6750349cc55cSDimitry Andric     LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
6751bdd1243dSDimitry Andric     Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
6752349cc55cSDimitry Andric     Ops.clear();
6753349cc55cSDimitry Andric     Ops.push_back(MergedOps);
6754349cc55cSDimitry Andric   }
6755349cc55cSDimitry Andric 
6756e8d8bef9SDimitry Andric   auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
6757e8d8bef9SDimitry Andric     .addDef(DstReg)
6758e8d8bef9SDimitry Andric     .addImm(Opcode);
6759e8d8bef9SDimitry Andric 
6760e8d8bef9SDimitry Andric   for (Register R : Ops) {
6761e8d8bef9SDimitry Andric     MIB.addUse(R);
6762e8d8bef9SDimitry Andric   }
6763e8d8bef9SDimitry Andric 
6764e8d8bef9SDimitry Andric   MIB.addUse(TDescr)
6765e8d8bef9SDimitry Andric      .addImm(IsA16 ? 1 : 0)
6766e8d8bef9SDimitry Andric      .cloneMemRefs(MI);
6767e8d8bef9SDimitry Andric 
6768e8d8bef9SDimitry Andric   MI.eraseFromParent();
6769e8d8bef9SDimitry Andric   return true;
6770e8d8bef9SDimitry Andric }
6771e8d8bef9SDimitry Andric 
677281ad6265SDimitry Andric bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
677381ad6265SDimitry Andric                                                MachineIRBuilder &B) const {
677481ad6265SDimitry Andric   unsigned Opc;
677581ad6265SDimitry Andric   int RoundMode = MI.getOperand(2).getImm();
677681ad6265SDimitry Andric 
677781ad6265SDimitry Andric   if (RoundMode == (int)RoundingMode::TowardPositive)
677881ad6265SDimitry Andric     Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
677981ad6265SDimitry Andric   else if (RoundMode == (int)RoundingMode::TowardNegative)
678081ad6265SDimitry Andric     Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
678181ad6265SDimitry Andric   else
678281ad6265SDimitry Andric     return false;
678381ad6265SDimitry Andric 
678481ad6265SDimitry Andric   B.buildInstr(Opc)
678581ad6265SDimitry Andric       .addDef(MI.getOperand(0).getReg())
678681ad6265SDimitry Andric       .addUse(MI.getOperand(1).getReg());
678781ad6265SDimitry Andric 
678804eeddc0SDimitry Andric   MI.eraseFromParent();
678981ad6265SDimitry Andric 
679004eeddc0SDimitry Andric   return true;
679104eeddc0SDimitry Andric }
679204eeddc0SDimitry Andric 
6793*5f757f3fSDimitry Andric bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI,
6794*5f757f3fSDimitry Andric                                             MachineIRBuilder &B) const {
6795*5f757f3fSDimitry Andric   const SITargetLowering *TLI = ST.getTargetLowering();
6796*5f757f3fSDimitry Andric   Register StackPtr = TLI->getStackPointerRegisterToSaveRestore();
6797*5f757f3fSDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
6798*5f757f3fSDimitry Andric   B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
6799*5f757f3fSDimitry Andric   MI.eraseFromParent();
6800*5f757f3fSDimitry Andric   return true;
6801*5f757f3fSDimitry Andric }
6802*5f757f3fSDimitry Andric 
68035ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
68045ffd83dbSDimitry Andric                                             MachineInstr &MI) const {
68055ffd83dbSDimitry Andric   MachineIRBuilder &B = Helper.MIRBuilder;
68065ffd83dbSDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
68075ffd83dbSDimitry Andric 
68080b57cec5SDimitry Andric   // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
6809*5f757f3fSDimitry Andric   auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
6810480093f4SDimitry Andric   switch (IntrID) {
6811480093f4SDimitry Andric   case Intrinsic::amdgcn_if:
6812480093f4SDimitry Andric   case Intrinsic::amdgcn_else: {
6813480093f4SDimitry Andric     MachineInstr *Br = nullptr;
68145ffd83dbSDimitry Andric     MachineBasicBlock *UncondBrTarget = nullptr;
6815e8d8bef9SDimitry Andric     bool Negated = false;
6816e8d8bef9SDimitry Andric     if (MachineInstr *BrCond =
6817e8d8bef9SDimitry Andric             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
68180b57cec5SDimitry Andric       const SIRegisterInfo *TRI
68190b57cec5SDimitry Andric         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
68200b57cec5SDimitry Andric 
68210b57cec5SDimitry Andric       Register Def = MI.getOperand(1).getReg();
68220b57cec5SDimitry Andric       Register Use = MI.getOperand(3).getReg();
6823480093f4SDimitry Andric 
68245ffd83dbSDimitry Andric       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
6825e8d8bef9SDimitry Andric 
6826e8d8bef9SDimitry Andric       if (Negated)
6827e8d8bef9SDimitry Andric         std::swap(CondBrTarget, UncondBrTarget);
6828e8d8bef9SDimitry Andric 
68295ffd83dbSDimitry Andric       B.setInsertPt(B.getMBB(), BrCond->getIterator());
6830480093f4SDimitry Andric       if (IntrID == Intrinsic::amdgcn_if) {
68310b57cec5SDimitry Andric         B.buildInstr(AMDGPU::SI_IF)
68320b57cec5SDimitry Andric           .addDef(Def)
68330b57cec5SDimitry Andric           .addUse(Use)
68345ffd83dbSDimitry Andric           .addMBB(UncondBrTarget);
6835480093f4SDimitry Andric       } else {
6836480093f4SDimitry Andric         B.buildInstr(AMDGPU::SI_ELSE)
6837480093f4SDimitry Andric             .addDef(Def)
6838480093f4SDimitry Andric             .addUse(Use)
6839e8d8bef9SDimitry Andric             .addMBB(UncondBrTarget);
6840480093f4SDimitry Andric       }
6841480093f4SDimitry Andric 
68425ffd83dbSDimitry Andric       if (Br) {
68435ffd83dbSDimitry Andric         Br->getOperand(0).setMBB(CondBrTarget);
68445ffd83dbSDimitry Andric       } else {
68455ffd83dbSDimitry Andric         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
68465ffd83dbSDimitry Andric         // since we're swapping branch targets it needs to be reinserted.
68475ffd83dbSDimitry Andric         // FIXME: IRTranslator should probably not do this
68485ffd83dbSDimitry Andric         B.buildBr(*CondBrTarget);
68495ffd83dbSDimitry Andric       }
68500b57cec5SDimitry Andric 
68510b57cec5SDimitry Andric       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
68520b57cec5SDimitry Andric       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
68530b57cec5SDimitry Andric       MI.eraseFromParent();
68540b57cec5SDimitry Andric       BrCond->eraseFromParent();
68550b57cec5SDimitry Andric       return true;
68560b57cec5SDimitry Andric     }
68570b57cec5SDimitry Andric 
68580b57cec5SDimitry Andric     return false;
68590b57cec5SDimitry Andric   }
68600b57cec5SDimitry Andric   case Intrinsic::amdgcn_loop: {
6861480093f4SDimitry Andric     MachineInstr *Br = nullptr;
68625ffd83dbSDimitry Andric     MachineBasicBlock *UncondBrTarget = nullptr;
6863e8d8bef9SDimitry Andric     bool Negated = false;
6864e8d8bef9SDimitry Andric     if (MachineInstr *BrCond =
6865e8d8bef9SDimitry Andric             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
68660b57cec5SDimitry Andric       const SIRegisterInfo *TRI
68670b57cec5SDimitry Andric         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
68680b57cec5SDimitry Andric 
68695ffd83dbSDimitry Andric       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
68700b57cec5SDimitry Andric       Register Reg = MI.getOperand(2).getReg();
68715ffd83dbSDimitry Andric 
6872e8d8bef9SDimitry Andric       if (Negated)
6873e8d8bef9SDimitry Andric         std::swap(CondBrTarget, UncondBrTarget);
6874e8d8bef9SDimitry Andric 
68755ffd83dbSDimitry Andric       B.setInsertPt(B.getMBB(), BrCond->getIterator());
68760b57cec5SDimitry Andric       B.buildInstr(AMDGPU::SI_LOOP)
68770b57cec5SDimitry Andric         .addUse(Reg)
68785ffd83dbSDimitry Andric         .addMBB(UncondBrTarget);
68795ffd83dbSDimitry Andric 
68805ffd83dbSDimitry Andric       if (Br)
68815ffd83dbSDimitry Andric         Br->getOperand(0).setMBB(CondBrTarget);
68825ffd83dbSDimitry Andric       else
68835ffd83dbSDimitry Andric         B.buildBr(*CondBrTarget);
68845ffd83dbSDimitry Andric 
68850b57cec5SDimitry Andric       MI.eraseFromParent();
68860b57cec5SDimitry Andric       BrCond->eraseFromParent();
68870b57cec5SDimitry Andric       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
68880b57cec5SDimitry Andric       return true;
68890b57cec5SDimitry Andric     }
68900b57cec5SDimitry Andric 
68910b57cec5SDimitry Andric     return false;
68920b57cec5SDimitry Andric   }
689306c3fb27SDimitry Andric   case Intrinsic::amdgcn_make_buffer_rsrc:
689406c3fb27SDimitry Andric     return legalizePointerAsRsrcIntrin(MI, MRI, B);
68950b57cec5SDimitry Andric   case Intrinsic::amdgcn_kernarg_segment_ptr:
68965ffd83dbSDimitry Andric     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
68975ffd83dbSDimitry Andric       // This only makes sense to call in a kernel, so just lower to null.
68985ffd83dbSDimitry Andric       B.buildConstant(MI.getOperand(0).getReg(), 0);
68995ffd83dbSDimitry Andric       MI.eraseFromParent();
69005ffd83dbSDimitry Andric       return true;
69015ffd83dbSDimitry Andric     }
69025ffd83dbSDimitry Andric 
69030b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(
69040b57cec5SDimitry Andric       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
69050b57cec5SDimitry Andric   case Intrinsic::amdgcn_implicitarg_ptr:
69060b57cec5SDimitry Andric     return legalizeImplicitArgPtr(MI, MRI, B);
69070b57cec5SDimitry Andric   case Intrinsic::amdgcn_workitem_id_x:
690881ad6265SDimitry Andric     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
69090b57cec5SDimitry Andric                                        AMDGPUFunctionArgInfo::WORKITEM_ID_X);
69100b57cec5SDimitry Andric   case Intrinsic::amdgcn_workitem_id_y:
691181ad6265SDimitry Andric     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
69120b57cec5SDimitry Andric                                        AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
69130b57cec5SDimitry Andric   case Intrinsic::amdgcn_workitem_id_z:
691481ad6265SDimitry Andric     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
69150b57cec5SDimitry Andric                                        AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
69160b57cec5SDimitry Andric   case Intrinsic::amdgcn_workgroup_id_x:
69170b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
69180b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
69190b57cec5SDimitry Andric   case Intrinsic::amdgcn_workgroup_id_y:
69200b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
69210b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
69220b57cec5SDimitry Andric   case Intrinsic::amdgcn_workgroup_id_z:
69230b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
69240b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
6925fcaf7f86SDimitry Andric   case Intrinsic::amdgcn_lds_kernel_id:
6926fcaf7f86SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
6927fcaf7f86SDimitry Andric                                       AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
69280b57cec5SDimitry Andric   case Intrinsic::amdgcn_dispatch_ptr:
69290b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
69300b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
69310b57cec5SDimitry Andric   case Intrinsic::amdgcn_queue_ptr:
69320b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
69330b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
69340b57cec5SDimitry Andric   case Intrinsic::amdgcn_implicit_buffer_ptr:
69350b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(
69360b57cec5SDimitry Andric       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
69370b57cec5SDimitry Andric   case Intrinsic::amdgcn_dispatch_id:
69380b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
69390b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
694081ad6265SDimitry Andric   case Intrinsic::r600_read_ngroups_x:
694181ad6265SDimitry Andric     // TODO: Emit error for hsa
694281ad6265SDimitry Andric     return legalizeKernargMemParameter(MI, B,
694381ad6265SDimitry Andric                                        SI::KernelInputOffsets::NGROUPS_X);
694481ad6265SDimitry Andric   case Intrinsic::r600_read_ngroups_y:
694581ad6265SDimitry Andric     return legalizeKernargMemParameter(MI, B,
694681ad6265SDimitry Andric                                        SI::KernelInputOffsets::NGROUPS_Y);
694781ad6265SDimitry Andric   case Intrinsic::r600_read_ngroups_z:
694881ad6265SDimitry Andric     return legalizeKernargMemParameter(MI, B,
694981ad6265SDimitry Andric                                        SI::KernelInputOffsets::NGROUPS_Z);
695081ad6265SDimitry Andric   case Intrinsic::r600_read_local_size_x:
695181ad6265SDimitry Andric     // TODO: Could insert G_ASSERT_ZEXT from s16
695281ad6265SDimitry Andric     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X);
695381ad6265SDimitry Andric   case Intrinsic::r600_read_local_size_y:
695481ad6265SDimitry Andric     // TODO: Could insert G_ASSERT_ZEXT from s16
695581ad6265SDimitry Andric     return legalizeKernargMemParameter(MI, B,  SI::KernelInputOffsets::LOCAL_SIZE_Y);
695681ad6265SDimitry Andric     // TODO: Could insert G_ASSERT_ZEXT from s16
695781ad6265SDimitry Andric   case Intrinsic::r600_read_local_size_z:
695881ad6265SDimitry Andric     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z);
695981ad6265SDimitry Andric   case Intrinsic::r600_read_global_size_x:
696081ad6265SDimitry Andric     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X);
696181ad6265SDimitry Andric   case Intrinsic::r600_read_global_size_y:
696281ad6265SDimitry Andric     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y);
696381ad6265SDimitry Andric   case Intrinsic::r600_read_global_size_z:
696481ad6265SDimitry Andric     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z);
69658bcb0991SDimitry Andric   case Intrinsic::amdgcn_fdiv_fast:
69668bcb0991SDimitry Andric     return legalizeFDIVFastIntrin(MI, MRI, B);
69678bcb0991SDimitry Andric   case Intrinsic::amdgcn_is_shared:
69688bcb0991SDimitry Andric     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
69698bcb0991SDimitry Andric   case Intrinsic::amdgcn_is_private:
69708bcb0991SDimitry Andric     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
69718bcb0991SDimitry Andric   case Intrinsic::amdgcn_wavefrontsize: {
69728bcb0991SDimitry Andric     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
69738bcb0991SDimitry Andric     MI.eraseFromParent();
69748bcb0991SDimitry Andric     return true;
69758bcb0991SDimitry Andric   }
69765ffd83dbSDimitry Andric   case Intrinsic::amdgcn_s_buffer_load:
6977e8d8bef9SDimitry Andric     return legalizeSBufferLoad(Helper, MI);
69788bcb0991SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_store:
697906c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_store:
69805ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_store:
698106c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_store:
69825ffd83dbSDimitry Andric     return legalizeBufferStore(MI, MRI, B, false, false);
69838bcb0991SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_store_format:
698406c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
69855ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_store_format:
698606c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
69875ffd83dbSDimitry Andric     return legalizeBufferStore(MI, MRI, B, false, true);
69885ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_tbuffer_store:
698906c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
69905ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_tbuffer_store:
699106c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
69925ffd83dbSDimitry Andric     return legalizeBufferStore(MI, MRI, B, true, true);
69935ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_load:
699406c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_load:
69955ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_load:
699606c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_load:
69975ffd83dbSDimitry Andric     return legalizeBufferLoad(MI, MRI, B, false, false);
69985ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_load_format:
699906c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
70005ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_load_format:
700106c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
70025ffd83dbSDimitry Andric     return legalizeBufferLoad(MI, MRI, B, true, false);
70035ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_tbuffer_load:
700406c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
70055ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_tbuffer_load:
700606c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
70075ffd83dbSDimitry Andric     return legalizeBufferLoad(MI, MRI, B, true, true);
70085ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
700906c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
70105ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
701106c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
70125ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_add:
701306c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
70145ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_add:
701506c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
70165ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
701706c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
70185ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
701906c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
70205ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
702106c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
70225ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
702306c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
70245ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
702506c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
70265ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
702706c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
70285ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
702906c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
70305ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
703106c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
70325ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
703306c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
70345ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
703506c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
70365ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_and:
703706c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
70385ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_and:
703906c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
70405ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_or:
704106c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
70425ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_or:
704306c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
70445ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
704506c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
70465ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
704706c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
70485ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
704906c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
70505ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
705106c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
70525ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
705306c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
70545ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
705506c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
70565ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
705706c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
70585ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
705906c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
7060fe6060f1SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
706106c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
7062fe6060f1SDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
706306c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
7064fe6060f1SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
706506c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
7066fe6060f1SDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
706706c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
706804eeddc0SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
706906c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
7070bdd1243dSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
707106c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
707204eeddc0SDimitry Andric     return legalizeBufferAtomic(MI, B, IntrID);
70735ffd83dbSDimitry Andric   case Intrinsic::trap:
70745ffd83dbSDimitry Andric     return legalizeTrapIntrinsic(MI, MRI, B);
70755ffd83dbSDimitry Andric   case Intrinsic::debugtrap:
70765ffd83dbSDimitry Andric     return legalizeDebugTrapIntrinsic(MI, MRI, B);
7077e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_rsq_clamp:
7078e8d8bef9SDimitry Andric     return legalizeRsqClampIntrinsic(MI, MRI, B);
7079e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_ds_fadd:
7080e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_ds_fmin:
7081e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_ds_fmax:
7082e8d8bef9SDimitry Andric     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
7083e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_image_bvh_intersect_ray:
7084e8d8bef9SDimitry Andric     return legalizeBVHIntrinsic(MI, B);
708506c3fb27SDimitry Andric   case Intrinsic::amdgcn_fmed3: {
708606c3fb27SDimitry Andric     GISelChangeObserver &Observer = Helper.Observer;
708706c3fb27SDimitry Andric 
708806c3fb27SDimitry Andric     // FIXME: This is to workaround the inability of tablegen match combiners to
708906c3fb27SDimitry Andric     // match intrinsics in patterns.
709006c3fb27SDimitry Andric     Observer.changingInstr(MI);
709106c3fb27SDimitry Andric     MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
709206c3fb27SDimitry Andric     MI.removeOperand(1);
709306c3fb27SDimitry Andric     Observer.changedInstr(MI);
709406c3fb27SDimitry Andric     return true;
709506c3fb27SDimitry Andric   }
70965ffd83dbSDimitry Andric   default: {
70975ffd83dbSDimitry Andric     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
70985ffd83dbSDimitry Andric             AMDGPU::getImageDimIntrinsicInfo(IntrID))
70995ffd83dbSDimitry Andric       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
71000b57cec5SDimitry Andric     return true;
71010b57cec5SDimitry Andric   }
71025ffd83dbSDimitry Andric   }
71030b57cec5SDimitry Andric 
71040b57cec5SDimitry Andric   return true;
71050b57cec5SDimitry Andric }
7106