xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (revision 0fca6ea1d4eea4c934cfff25ac9ee8ad6fe95583)
10b57cec5SDimitry Andric //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
20b57cec5SDimitry Andric //
30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
60b57cec5SDimitry Andric //
70b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
80b57cec5SDimitry Andric /// \file
90b57cec5SDimitry Andric /// This file implements the targeting of the Machinelegalizer class for
100b57cec5SDimitry Andric /// AMDGPU.
110b57cec5SDimitry Andric /// \todo This should be generated by TableGen.
120b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
130b57cec5SDimitry Andric 
#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Support/MathExtras.h"
360b57cec5SDimitry Andric 
370b57cec5SDimitry Andric #define DEBUG_TYPE "amdgpu-legalinfo"
380b57cec5SDimitry Andric 
390b57cec5SDimitry Andric using namespace llvm;
400b57cec5SDimitry Andric using namespace LegalizeActions;
410b57cec5SDimitry Andric using namespace LegalizeMutations;
420b57cec5SDimitry Andric using namespace LegalityPredicates;
435ffd83dbSDimitry Andric using namespace MIPatternMatch;
440b57cec5SDimitry Andric 
455ffd83dbSDimitry Andric // Hack until load/store selection patterns support any tuple of legal types.
465ffd83dbSDimitry Andric static cl::opt<bool> EnableNewLegality(
475ffd83dbSDimitry Andric   "amdgpu-global-isel-new-legality",
485ffd83dbSDimitry Andric   cl::desc("Use GlobalISel desired legality, rather than try to use"
495ffd83dbSDimitry Andric            "rules compatible with selection patterns"),
505ffd83dbSDimitry Andric   cl::init(false),
515ffd83dbSDimitry Andric   cl::ReallyHidden);
520b57cec5SDimitry Andric 
535ffd83dbSDimitry Andric static constexpr unsigned MaxRegisterSize = 1024;
545ffd83dbSDimitry Andric 
555ffd83dbSDimitry Andric // Round the number of elements to the next power of two elements
getPow2VectorType(LLT Ty)565ffd83dbSDimitry Andric static LLT getPow2VectorType(LLT Ty) {
575ffd83dbSDimitry Andric   unsigned NElts = Ty.getNumElements();
585ffd83dbSDimitry Andric   unsigned Pow2NElts = 1 <<  Log2_32_Ceil(NElts);
59fe6060f1SDimitry Andric   return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
600b57cec5SDimitry Andric }
610b57cec5SDimitry Andric 
625ffd83dbSDimitry Andric // Round the number of bits to the next power of two bits
getPow2ScalarType(LLT Ty)635ffd83dbSDimitry Andric static LLT getPow2ScalarType(LLT Ty) {
645ffd83dbSDimitry Andric   unsigned Bits = Ty.getSizeInBits();
655ffd83dbSDimitry Andric   unsigned Pow2Bits = 1 <<  Log2_32_Ceil(Bits);
665ffd83dbSDimitry Andric   return LLT::scalar(Pow2Bits);
678bcb0991SDimitry Andric }
688bcb0991SDimitry Andric 
69349cc55cSDimitry Andric /// \returns true if this is an odd sized vector which should widen by adding an
70e8d8bef9SDimitry Andric /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
71e8d8bef9SDimitry Andric /// excludes s1 vectors, which should always be scalarized.
isSmallOddVector(unsigned TypeIdx)720b57cec5SDimitry Andric static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
730b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
740b57cec5SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
75e8d8bef9SDimitry Andric     if (!Ty.isVector())
76e8d8bef9SDimitry Andric       return false;
77e8d8bef9SDimitry Andric 
78e8d8bef9SDimitry Andric     const LLT EltTy = Ty.getElementType();
79e8d8bef9SDimitry Andric     const unsigned EltSize = EltTy.getSizeInBits();
80e8d8bef9SDimitry Andric     return Ty.getNumElements() % 2 != 0 &&
81e8d8bef9SDimitry Andric            EltSize > 1 && EltSize < 32 &&
828bcb0991SDimitry Andric            Ty.getSizeInBits() % 32 != 0;
838bcb0991SDimitry Andric   };
848bcb0991SDimitry Andric }
858bcb0991SDimitry Andric 
sizeIsMultipleOf32(unsigned TypeIdx)86e8d8bef9SDimitry Andric static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
87e8d8bef9SDimitry Andric   return [=](const LegalityQuery &Query) {
88e8d8bef9SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
89e8d8bef9SDimitry Andric     return Ty.getSizeInBits() % 32 == 0;
90e8d8bef9SDimitry Andric   };
91e8d8bef9SDimitry Andric }
92e8d8bef9SDimitry Andric 
isWideVec16(unsigned TypeIdx)938bcb0991SDimitry Andric static LegalityPredicate isWideVec16(unsigned TypeIdx) {
948bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
958bcb0991SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
968bcb0991SDimitry Andric     const LLT EltTy = Ty.getScalarType();
978bcb0991SDimitry Andric     return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
980b57cec5SDimitry Andric   };
990b57cec5SDimitry Andric }
1000b57cec5SDimitry Andric 
oneMoreElement(unsigned TypeIdx)1010b57cec5SDimitry Andric static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
1020b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
1030b57cec5SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
1040b57cec5SDimitry Andric     const LLT EltTy = Ty.getElementType();
105bdd1243dSDimitry Andric     return std::pair(TypeIdx,
106fe6060f1SDimitry Andric                      LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
1070b57cec5SDimitry Andric   };
1080b57cec5SDimitry Andric }
1090b57cec5SDimitry Andric 
fewerEltsToSize64Vector(unsigned TypeIdx)1100b57cec5SDimitry Andric static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
1110b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
1120b57cec5SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
1130b57cec5SDimitry Andric     const LLT EltTy = Ty.getElementType();
1140b57cec5SDimitry Andric     unsigned Size = Ty.getSizeInBits();
1150b57cec5SDimitry Andric     unsigned Pieces = (Size + 63) / 64;
1160b57cec5SDimitry Andric     unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
117bdd1243dSDimitry Andric     return std::pair(TypeIdx, LLT::scalarOrVector(
118bdd1243dSDimitry Andric                                   ElementCount::getFixed(NewNumElts), EltTy));
1190b57cec5SDimitry Andric   };
1200b57cec5SDimitry Andric }
1210b57cec5SDimitry Andric 
1228bcb0991SDimitry Andric // Increase the number of vector elements to reach the next multiple of 32-bit
1238bcb0991SDimitry Andric // type.
moreEltsToNext32Bit(unsigned TypeIdx)1248bcb0991SDimitry Andric static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
1258bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
1268bcb0991SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
1278bcb0991SDimitry Andric 
1288bcb0991SDimitry Andric     const LLT EltTy = Ty.getElementType();
1298bcb0991SDimitry Andric     const int Size = Ty.getSizeInBits();
1308bcb0991SDimitry Andric     const int EltSize = EltTy.getSizeInBits();
1318bcb0991SDimitry Andric     const int NextMul32 = (Size + 31) / 32;
1328bcb0991SDimitry Andric 
1338bcb0991SDimitry Andric     assert(EltSize < 32);
1348bcb0991SDimitry Andric 
1358bcb0991SDimitry Andric     const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
136bdd1243dSDimitry Andric     return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
1378bcb0991SDimitry Andric   };
1388bcb0991SDimitry Andric }
1398bcb0991SDimitry Andric 
// Increase the number of vector elements to reach the next legal RegClass.
static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const unsigned NumElts = Ty.getNumElements();
    const unsigned EltSize = Ty.getElementType().getSizeInBits();
    // Search upper bound: no register class exceeds MaxRegisterSize bits.
    const unsigned MaxNumElts = MaxRegisterSize / EltSize;

    assert(EltSize == 32 || EltSize == 64);
    assert(Ty.getSizeInBits() < MaxRegisterSize);

    unsigned NewNumElts;
    // Find the nearest legal RegClass that is larger than the current type.
    // getSGPRClassForBitWidth returns null for widths with no matching class,
    // so keep growing the element count until it succeeds (or the bound is
    // reached, in which case MaxNumElts elements are used).
    for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
      if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize))
        break;
    }

    return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltSize));
  };
}
16106c3fb27SDimitry Andric 
// Map a buffer resource type (or a vector of them) to its s128 scalar
// equivalent, preserving the element count for vectors.
static LLT getBufferRsrcScalarType(const LLT Ty) {
  if (Ty.isVector())
    return LLT::vector(Ty.getElementCount(), LLT::scalar(128));
  return LLT::scalar(128);
}
16806c3fb27SDimitry Andric 
// Map a buffer resource type (or a vector of them) to the v4s32 register form
// the underlying operations take (4 dwords per resource).
static LLT getBufferRsrcRegisterType(const LLT Ty) {
  if (Ty.isVector())
    return LLT::fixed_vector(Ty.getElementCount().getFixedValue() * 4,
                             LLT::scalar(32));
  return LLT::fixed_vector(4, LLT::scalar(32));
}
17506c3fb27SDimitry Andric 
// Pick the register-shaped type a value of type Ty should be bitcast to:
// anything up to 32 bits becomes one scalar, larger sizes become dwords.
static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  // <2 x s8> -> s16
  // <4 x s8> -> s32
  if (Size <= 32)
    return LLT::scalar(Size);

  return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
}
187e8d8bef9SDimitry Andric 
bitcastToRegisterType(unsigned TypeIdx)188e8d8bef9SDimitry Andric static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
189e8d8bef9SDimitry Andric   return [=](const LegalityQuery &Query) {
190e8d8bef9SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
191bdd1243dSDimitry Andric     return std::pair(TypeIdx, getBitcastRegisterType(Ty));
192e8d8bef9SDimitry Andric   };
193e8d8bef9SDimitry Andric }
194e8d8bef9SDimitry Andric 
bitcastToVectorElement32(unsigned TypeIdx)195e8d8bef9SDimitry Andric static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
196e8d8bef9SDimitry Andric   return [=](const LegalityQuery &Query) {
197e8d8bef9SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
198e8d8bef9SDimitry Andric     unsigned Size = Ty.getSizeInBits();
199e8d8bef9SDimitry Andric     assert(Size % 32 == 0);
200bdd1243dSDimitry Andric     return std::pair(
201fe6060f1SDimitry Andric         TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
2025ffd83dbSDimitry Andric   };
2035ffd83dbSDimitry Andric }
2045ffd83dbSDimitry Andric 
vectorSmallerThan(unsigned TypeIdx,unsigned Size)2058bcb0991SDimitry Andric static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
2068bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
2078bcb0991SDimitry Andric     const LLT QueryTy = Query.Types[TypeIdx];
2088bcb0991SDimitry Andric     return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
2098bcb0991SDimitry Andric   };
2108bcb0991SDimitry Andric }
2118bcb0991SDimitry Andric 
vectorWiderThan(unsigned TypeIdx,unsigned Size)2120b57cec5SDimitry Andric static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
2130b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
2140b57cec5SDimitry Andric     const LLT QueryTy = Query.Types[TypeIdx];
2150b57cec5SDimitry Andric     return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
2160b57cec5SDimitry Andric   };
2170b57cec5SDimitry Andric }
2180b57cec5SDimitry Andric 
numElementsNotEven(unsigned TypeIdx)2190b57cec5SDimitry Andric static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
2200b57cec5SDimitry Andric   return [=](const LegalityQuery &Query) {
2210b57cec5SDimitry Andric     const LLT QueryTy = Query.Types[TypeIdx];
2220b57cec5SDimitry Andric     return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
2230b57cec5SDimitry Andric   };
2240b57cec5SDimitry Andric }
2250b57cec5SDimitry Andric 
isRegisterSize(unsigned Size)2265ffd83dbSDimitry Andric static bool isRegisterSize(unsigned Size) {
2275ffd83dbSDimitry Andric   return Size % 32 == 0 && Size <= MaxRegisterSize;
2285ffd83dbSDimitry Andric }
2295ffd83dbSDimitry Andric 
isRegisterVectorElementType(LLT EltTy)2305ffd83dbSDimitry Andric static bool isRegisterVectorElementType(LLT EltTy) {
2315ffd83dbSDimitry Andric   const int EltSize = EltTy.getSizeInBits();
2325ffd83dbSDimitry Andric   return EltSize == 16 || EltSize % 32 == 0;
2335ffd83dbSDimitry Andric }
2345ffd83dbSDimitry Andric 
isRegisterVectorType(LLT Ty)2355ffd83dbSDimitry Andric static bool isRegisterVectorType(LLT Ty) {
2360b57cec5SDimitry Andric   const int EltSize = Ty.getElementType().getSizeInBits();
2370b57cec5SDimitry Andric   return EltSize == 32 || EltSize == 64 ||
2380b57cec5SDimitry Andric          (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
2390b57cec5SDimitry Andric          EltSize == 128 || EltSize == 256;
2400b57cec5SDimitry Andric }
2410b57cec5SDimitry Andric 
242*0fca6ea1SDimitry Andric // TODO: replace all uses of isRegisterType with isRegisterClassType
isRegisterType(LLT Ty)2435ffd83dbSDimitry Andric static bool isRegisterType(LLT Ty) {
2445ffd83dbSDimitry Andric   if (!isRegisterSize(Ty.getSizeInBits()))
2455ffd83dbSDimitry Andric     return false;
2465ffd83dbSDimitry Andric 
2475ffd83dbSDimitry Andric   if (Ty.isVector())
2485ffd83dbSDimitry Andric     return isRegisterVectorType(Ty);
2495ffd83dbSDimitry Andric 
2505ffd83dbSDimitry Andric   return true;
2515ffd83dbSDimitry Andric }
2525ffd83dbSDimitry Andric 
2535ffd83dbSDimitry Andric // Any combination of 32 or 64-bit elements up the maximum register size, and
2545ffd83dbSDimitry Andric // multiples of v2s16.
isRegisterType(unsigned TypeIdx)2555ffd83dbSDimitry Andric static LegalityPredicate isRegisterType(unsigned TypeIdx) {
2565ffd83dbSDimitry Andric   return [=](const LegalityQuery &Query) {
2575ffd83dbSDimitry Andric     return isRegisterType(Query.Types[TypeIdx]);
2588bcb0991SDimitry Andric   };
2598bcb0991SDimitry Andric }
2608bcb0991SDimitry Andric 
26106c3fb27SDimitry Andric // RegisterType that doesn't have a corresponding RegClass.
262*0fca6ea1SDimitry Andric // TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
263*0fca6ea1SDimitry Andric // should be removed.
isIllegalRegisterType(unsigned TypeIdx)26406c3fb27SDimitry Andric static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx) {
26506c3fb27SDimitry Andric   return [=](const LegalityQuery &Query) {
26606c3fb27SDimitry Andric     LLT Ty = Query.Types[TypeIdx];
26706c3fb27SDimitry Andric     return isRegisterType(Ty) &&
26806c3fb27SDimitry Andric            !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits());
26906c3fb27SDimitry Andric   };
27006c3fb27SDimitry Andric }
27106c3fb27SDimitry Andric 
elementTypeIsLegal(unsigned TypeIdx)2725ffd83dbSDimitry Andric static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
2738bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
2745ffd83dbSDimitry Andric     const LLT QueryTy = Query.Types[TypeIdx];
2755ffd83dbSDimitry Andric     if (!QueryTy.isVector())
2765ffd83dbSDimitry Andric       return false;
2775ffd83dbSDimitry Andric     const LLT EltTy = QueryTy.getElementType();
2785ffd83dbSDimitry Andric     return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
2798bcb0991SDimitry Andric   };
2808bcb0991SDimitry Andric }
2818bcb0991SDimitry Andric 
// Scalar LLTs used throughout the legality rules below.
static const LLT S1 = LLT::scalar(1);
static const LLT S8 = LLT::scalar(8);
static const LLT S16 = LLT::scalar(16);
static const LLT S32 = LLT::scalar(32);
static const LLT F32 = LLT::float32();
static const LLT S64 = LLT::scalar(64);
static const LLT F64 = LLT::float64();
static const LLT S96 = LLT::scalar(96);
static const LLT S128 = LLT::scalar(128);
static const LLT S160 = LLT::scalar(160);
static const LLT S224 = LLT::scalar(224);
static const LLT S256 = LLT::scalar(256);
static const LLT S512 = LLT::scalar(512);
// Widest scalar that fits the largest register tuple (MaxRegisterSize bits).
static const LLT MaxScalar = LLT::scalar(MaxRegisterSize);

// 8- and 16-bit element vectors.
static const LLT V2S8 = LLT::fixed_vector(2, 8);
static const LLT V2S16 = LLT::fixed_vector(2, 16);
static const LLT V4S16 = LLT::fixed_vector(4, 16);
static const LLT V6S16 = LLT::fixed_vector(6, 16);
static const LLT V8S16 = LLT::fixed_vector(8, 16);
static const LLT V10S16 = LLT::fixed_vector(10, 16);
static const LLT V12S16 = LLT::fixed_vector(12, 16);
static const LLT V16S16 = LLT::fixed_vector(16, 16);

static const LLT V2F16 = LLT::fixed_vector(2, LLT::float16());
static const LLT V2BF16 = V2F16; // FIXME

// 32-bit element vectors.
static const LLT V2S32 = LLT::fixed_vector(2, 32);
static const LLT V3S32 = LLT::fixed_vector(3, 32);
static const LLT V4S32 = LLT::fixed_vector(4, 32);
static const LLT V5S32 = LLT::fixed_vector(5, 32);
static const LLT V6S32 = LLT::fixed_vector(6, 32);
static const LLT V7S32 = LLT::fixed_vector(7, 32);
static const LLT V8S32 = LLT::fixed_vector(8, 32);
static const LLT V9S32 = LLT::fixed_vector(9, 32);
static const LLT V10S32 = LLT::fixed_vector(10, 32);
static const LLT V11S32 = LLT::fixed_vector(11, 32);
static const LLT V12S32 = LLT::fixed_vector(12, 32);
static const LLT V16S32 = LLT::fixed_vector(16, 32);
static const LLT V32S32 = LLT::fixed_vector(32, 32);

// 64-bit element vectors.
static const LLT V2S64 = LLT::fixed_vector(2, 64);
static const LLT V3S64 = LLT::fixed_vector(3, 64);
static const LLT V4S64 = LLT::fixed_vector(4, 64);
static const LLT V5S64 = LLT::fixed_vector(5, 64);
static const LLT V6S64 = LLT::fixed_vector(6, 64);
static const LLT V7S64 = LLT::fixed_vector(7, 64);
static const LLT V8S64 = LLT::fixed_vector(8, 64);
static const LLT V16S64 = LLT::fixed_vector(16, 64);

static const LLT V2S128 = LLT::fixed_vector(2, 128);
static const LLT V4S128 = LLT::fixed_vector(4, 128);

// Type lists consulted by isRegisterClassType below.
static std::initializer_list<LLT> AllScalarTypes = {S32,  S64,  S96,  S128,
                                                    S160, S224, S256, S512};

// NOTE(review): V2S128/V4S128 are grouped with the 16-bit vectors here —
// confirm this grouping is intended rather than an oversight.
static std::initializer_list<LLT> AllS16Vectors{
    V2S16, V4S16, V6S16, V8S16, V10S16, V12S16, V16S16, V2S128, V4S128};

static std::initializer_list<LLT> AllS32Vectors = {
    V2S32, V3S32,  V4S32,  V5S32,  V6S32,  V7S32, V8S32,
    V9S32, V10S32, V11S32, V12S32, V16S32, V32S32};

static std::initializer_list<LLT> AllS64Vectors = {V2S64, V3S64, V4S64, V5S64,
                                                   V6S64, V7S64, V8S64, V16S64};
347*0fca6ea1SDimitry Andric 
348*0fca6ea1SDimitry Andric // Checks whether a type is in the list of legal register types.
isRegisterClassType(LLT Ty)349*0fca6ea1SDimitry Andric static bool isRegisterClassType(LLT Ty) {
350*0fca6ea1SDimitry Andric   if (Ty.isPointerOrPointerVector())
351*0fca6ea1SDimitry Andric     Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits()));
352*0fca6ea1SDimitry Andric 
353*0fca6ea1SDimitry Andric   return is_contained(AllS32Vectors, Ty) || is_contained(AllS64Vectors, Ty) ||
354*0fca6ea1SDimitry Andric          is_contained(AllScalarTypes, Ty) || is_contained(AllS16Vectors, Ty);
355*0fca6ea1SDimitry Andric }
356*0fca6ea1SDimitry Andric 
isRegisterClassType(unsigned TypeIdx)357*0fca6ea1SDimitry Andric static LegalityPredicate isRegisterClassType(unsigned TypeIdx) {
358*0fca6ea1SDimitry Andric   return [TypeIdx](const LegalityQuery &Query) {
359*0fca6ea1SDimitry Andric     return isRegisterClassType(Query.Types[TypeIdx]);
360*0fca6ea1SDimitry Andric   };
361*0fca6ea1SDimitry Andric }
362*0fca6ea1SDimitry Andric 
363fe6060f1SDimitry Andric // If we have a truncating store or an extending load with a data size larger
364fe6060f1SDimitry Andric // than 32-bits, we need to reduce to a 32-bit type.
isWideScalarExtLoadTruncStore(unsigned TypeIdx)365fe6060f1SDimitry Andric static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
3668bcb0991SDimitry Andric   return [=](const LegalityQuery &Query) {
3678bcb0991SDimitry Andric     const LLT Ty = Query.Types[TypeIdx];
3688bcb0991SDimitry Andric     return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
369fe6060f1SDimitry Andric            Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
3700b57cec5SDimitry Andric   };
3710b57cec5SDimitry Andric }
3720b57cec5SDimitry Andric 
// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
/// \return the maximum legal access size in bits for address space \p AS on
/// subtarget \p ST. \p IsLoad widens the global/constant answer; \p IsAtomic
/// widens the flat-address fallback.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad, bool IsAtomic) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return ST.enableFlatScratch() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
  case AMDGPUAS::BUFFER_RESOURCE:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // FIXME: Flat addresses may contextually need to be split to 32-bit parts
    // if they may alias scratch depending on the subtarget.  This needs to be
    // moved to custom handling to use addressMayBeAccessedAsPrivate
    return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
  }
}
4025ffd83dbSDimitry Andric 
/// \return true if the load/store described by \p Query has a legal size,
/// extension behavior, and alignment on \p ST. Type index 0 is the register
/// (data) type; type index 1 is the pointer type.
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
  uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // Do not handle extending vector loads.
  if (Ty.isVector() && MemSize != RegSize)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough, but
  // we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  // Reject accesses wider than the address space supports; atomicity feeds
  // the flat-address fallback inside maxSizeForAddrSpace.
  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
                                    Query.MMODescrs[0].Ordering !=
                                        AtomicOrdering::NotAtomic))
    return false;

  // Only these memory widths are directly selectable.
  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  // Under-aligned accesses are legal only if the target allows misaligned
  // access of this size in this address space.
  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}
4705ffd83dbSDimitry Andric 
47106c3fb27SDimitry Andric // The newer buffer intrinsic forms take their resource arguments as
47206c3fb27SDimitry Andric // pointers in address space 8, aka s128 values. However, in order to not break
47306c3fb27SDimitry Andric // SelectionDAG, the underlying operations have to continue to take v4i32
47406c3fb27SDimitry Andric // arguments. Therefore, we convert resource pointers - or vectors of them
47506c3fb27SDimitry Andric // to integer values here.
hasBufferRsrcWorkaround(const LLT Ty)47606c3fb27SDimitry Andric static bool hasBufferRsrcWorkaround(const LLT Ty) {
47706c3fb27SDimitry Andric   if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
47806c3fb27SDimitry Andric     return true;
47906c3fb27SDimitry Andric   if (Ty.isVector()) {
48006c3fb27SDimitry Andric     const LLT ElemTy = Ty.getElementType();
48106c3fb27SDimitry Andric     return hasBufferRsrcWorkaround(ElemTy);
48206c3fb27SDimitry Andric   }
48306c3fb27SDimitry Andric   return false;
48406c3fb27SDimitry Andric }
48506c3fb27SDimitry Andric 
4865ffd83dbSDimitry Andric // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
4875ffd83dbSDimitry Andric // workaround this. Eventually it should ignore the type for loads and only care
4885ffd83dbSDimitry Andric // about the size. Return true in cases where we will workaround this for now by
4895ffd83dbSDimitry Andric // bitcasting.
loadStoreBitcastWorkaround(const LLT Ty)4905ffd83dbSDimitry Andric static bool loadStoreBitcastWorkaround(const LLT Ty) {
4915ffd83dbSDimitry Andric   if (EnableNewLegality)
4925ffd83dbSDimitry Andric     return false;
4935ffd83dbSDimitry Andric 
4945ffd83dbSDimitry Andric   const unsigned Size = Ty.getSizeInBits();
4955ffd83dbSDimitry Andric   if (Size <= 64)
4965ffd83dbSDimitry Andric     return false;
49706c3fb27SDimitry Andric   // Address space 8 pointers get their own workaround.
49806c3fb27SDimitry Andric   if (hasBufferRsrcWorkaround(Ty))
49906c3fb27SDimitry Andric     return false;
5005ffd83dbSDimitry Andric   if (!Ty.isVector())
5015ffd83dbSDimitry Andric     return true;
502e8d8bef9SDimitry Andric 
503*0fca6ea1SDimitry Andric   if (Ty.isPointerVector())
504e8d8bef9SDimitry Andric     return true;
505e8d8bef9SDimitry Andric 
506*0fca6ea1SDimitry Andric   unsigned EltSize = Ty.getScalarSizeInBits();
5075ffd83dbSDimitry Andric   return EltSize != 32 && EltSize != 64;
5085ffd83dbSDimitry Andric }
5095ffd83dbSDimitry Andric 
isLoadStoreLegal(const GCNSubtarget & ST,const LegalityQuery & Query)510fe6060f1SDimitry Andric static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
5115ffd83dbSDimitry Andric   const LLT Ty = Query.Types[0];
512fe6060f1SDimitry Andric   return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
51306c3fb27SDimitry Andric          !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
5145ffd83dbSDimitry Andric }
5155ffd83dbSDimitry Andric 
516e8d8bef9SDimitry Andric /// Return true if a load or store of the type should be lowered with a bitcast
517e8d8bef9SDimitry Andric /// to a different type.
shouldBitcastLoadStoreType(const GCNSubtarget & ST,const LLT Ty,const LLT MemTy)518e8d8bef9SDimitry Andric static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
519fe6060f1SDimitry Andric                                        const LLT MemTy) {
520fe6060f1SDimitry Andric   const unsigned MemSizeInBits = MemTy.getSizeInBits();
521e8d8bef9SDimitry Andric   const unsigned Size = Ty.getSizeInBits();
522e8d8bef9SDimitry Andric   if (Size != MemSizeInBits)
523e8d8bef9SDimitry Andric     return Size <= 32 && Ty.isVector();
524e8d8bef9SDimitry Andric 
525e8d8bef9SDimitry Andric   if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
526e8d8bef9SDimitry Andric     return true;
527fe6060f1SDimitry Andric 
528fe6060f1SDimitry Andric   // Don't try to handle bitcasting vector ext loads for now.
529fe6060f1SDimitry Andric   return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
530fe6060f1SDimitry Andric          (Size <= 32 || isRegisterSize(Size)) &&
531e8d8bef9SDimitry Andric          !isRegisterVectorElementType(Ty.getElementType());
532e8d8bef9SDimitry Andric }
533e8d8bef9SDimitry Andric 
534e8d8bef9SDimitry Andric /// Return true if we should legalize a load by widening an odd sized memory
535e8d8bef9SDimitry Andric /// access up to the alignment. Note this case when the memory access itself
536e8d8bef9SDimitry Andric /// changes, not the size of the result register.
shouldWidenLoad(const GCNSubtarget & ST,LLT MemoryTy,uint64_t AlignInBits,unsigned AddrSpace,unsigned Opcode)537fe6060f1SDimitry Andric static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
53804eeddc0SDimitry Andric                             uint64_t AlignInBits, unsigned AddrSpace,
539e8d8bef9SDimitry Andric                             unsigned Opcode) {
540fe6060f1SDimitry Andric   unsigned SizeInBits = MemoryTy.getSizeInBits();
541e8d8bef9SDimitry Andric   // We don't want to widen cases that are naturally legal.
542e8d8bef9SDimitry Andric   if (isPowerOf2_32(SizeInBits))
543e8d8bef9SDimitry Andric     return false;
544e8d8bef9SDimitry Andric 
545e8d8bef9SDimitry Andric   // If we have 96-bit memory operations, we shouldn't touch them. Note we may
5465f757f3fSDimitry Andric   // end up widening these for a scalar load during RegBankSelect, if we don't
5475f757f3fSDimitry Andric   // have 96-bit scalar loads.
548e8d8bef9SDimitry Andric   if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
549e8d8bef9SDimitry Andric     return false;
550e8d8bef9SDimitry Andric 
55106c3fb27SDimitry Andric   if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
552e8d8bef9SDimitry Andric     return false;
553e8d8bef9SDimitry Andric 
554e8d8bef9SDimitry Andric   // A load is known dereferenceable up to the alignment, so it's legal to widen
555e8d8bef9SDimitry Andric   // to it.
556e8d8bef9SDimitry Andric   //
557e8d8bef9SDimitry Andric   // TODO: Could check dereferenceable for less aligned cases.
558e8d8bef9SDimitry Andric   unsigned RoundedSize = NextPowerOf2(SizeInBits);
559e8d8bef9SDimitry Andric   if (AlignInBits < RoundedSize)
560e8d8bef9SDimitry Andric     return false;
561e8d8bef9SDimitry Andric 
562e8d8bef9SDimitry Andric   // Do not widen if it would introduce a slow unaligned load.
563e8d8bef9SDimitry Andric   const SITargetLowering *TLI = ST.getTargetLowering();
564bdd1243dSDimitry Andric   unsigned Fast = 0;
565e8d8bef9SDimitry Andric   return TLI->allowsMisalignedMemoryAccessesImpl(
566e8d8bef9SDimitry Andric              RoundedSize, AddrSpace, Align(AlignInBits / 8),
567e8d8bef9SDimitry Andric              MachineMemOperand::MOLoad, &Fast) &&
568e8d8bef9SDimitry Andric          Fast;
569e8d8bef9SDimitry Andric }
570e8d8bef9SDimitry Andric 
shouldWidenLoad(const GCNSubtarget & ST,const LegalityQuery & Query,unsigned Opcode)571e8d8bef9SDimitry Andric static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
572e8d8bef9SDimitry Andric                             unsigned Opcode) {
573e8d8bef9SDimitry Andric   if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
574e8d8bef9SDimitry Andric     return false;
575e8d8bef9SDimitry Andric 
576fe6060f1SDimitry Andric   return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
577e8d8bef9SDimitry Andric                          Query.MMODescrs[0].AlignInBits,
578e8d8bef9SDimitry Andric                          Query.Types[1].getAddressSpace(), Opcode);
579e8d8bef9SDimitry Andric }
580e8d8bef9SDimitry Andric 
/// Mutates IR (typically a load instruction) to use a <4 x s32> as the initial
/// type of the operand `idx` and then to transform it to a `p8` via bitcasts
/// and inttoptr. In addition, handle vectors of p8. Returns the new type.
///
/// The conversion instructions are inserted immediately *after* \p MI, and
/// \p MI's operand is repointed at a fresh v4i32-typed (or wider, for
/// vectors of p8) virtual register; the original pointer-typed register is
/// then rebuilt from that register by the inserted merge/inttoptr.
static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
                                   MachineRegisterInfo &MRI, unsigned Idx) {
  MachineOperand &MO = MI.getOperand(Idx);

  const LLT PointerTy = MRI.getType(MO.getReg());

  // Paranoidly prevent us from doing this multiple times.
  if (!hasBufferRsrcWorkaround(PointerTy))
    return PointerTy;

  const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
  const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
  if (!PointerTy.isVector()) {
    // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
    const unsigned NumParts = PointerTy.getSizeInBits() / 32;
    const LLT S32 = LLT::scalar(32);

    Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
    std::array<Register, 4> VectorElems;
    // Insert the reconstruction sequence after MI so it can consume the
    // register MI will now populate.
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    for (unsigned I = 0; I < NumParts; ++I)
      VectorElems[I] =
          B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
    // Re-define the original p8 register from the extracted dwords...
    B.buildMergeValues(MO, VectorElems);
    // ...and make MI operate on the raw <4 x s32> register instead.
    MO.setReg(VectorReg);
    return VectorTy;
  }
  // Vector-of-p8 path: <N x 4 x s32> register -> wide scalar int -> <N x p8>.
  Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
  auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
  B.buildIntToPtr(MO, Scalar);
  MO.setReg(BitcastReg);

  return VectorTy;
}
61906c3fb27SDimitry Andric 
62006c3fb27SDimitry Andric /// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
62106c3fb27SDimitry Andric /// the form in which the value must be in order to be passed to the low-level
62206c3fb27SDimitry Andric /// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
62306c3fb27SDimitry Andric /// needed in order to account for the fact that we can't define a register
62406c3fb27SDimitry Andric /// class for s128 without breaking SelectionDAG.
castBufferRsrcToV4I32(Register Pointer,MachineIRBuilder & B)62506c3fb27SDimitry Andric static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) {
62606c3fb27SDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
62706c3fb27SDimitry Andric   const LLT PointerTy = MRI.getType(Pointer);
62806c3fb27SDimitry Andric   const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
62906c3fb27SDimitry Andric   const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
63006c3fb27SDimitry Andric 
63106c3fb27SDimitry Andric   if (!PointerTy.isVector()) {
63206c3fb27SDimitry Andric     // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
63306c3fb27SDimitry Andric     SmallVector<Register, 4> PointerParts;
63406c3fb27SDimitry Andric     const unsigned NumParts = PointerTy.getSizeInBits() / 32;
63506c3fb27SDimitry Andric     auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
63606c3fb27SDimitry Andric     for (unsigned I = 0; I < NumParts; ++I)
63706c3fb27SDimitry Andric       PointerParts.push_back(Unmerged.getReg(I));
63806c3fb27SDimitry Andric     return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
63906c3fb27SDimitry Andric   }
64006c3fb27SDimitry Andric   Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
64106c3fb27SDimitry Andric   return B.buildBitcast(VectorTy, Scalar).getReg(0);
64206c3fb27SDimitry Andric }
64306c3fb27SDimitry Andric 
castBufferRsrcArgToV4I32(MachineInstr & MI,MachineIRBuilder & B,unsigned Idx)64406c3fb27SDimitry Andric static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B,
64506c3fb27SDimitry Andric                                      unsigned Idx) {
64606c3fb27SDimitry Andric   MachineOperand &MO = MI.getOperand(Idx);
64706c3fb27SDimitry Andric 
64806c3fb27SDimitry Andric   const LLT PointerTy = B.getMRI()->getType(MO.getReg());
64906c3fb27SDimitry Andric   // Paranoidly prevent us from doing this multiple times.
65006c3fb27SDimitry Andric   if (!hasBufferRsrcWorkaround(PointerTy))
65106c3fb27SDimitry Andric     return;
65206c3fb27SDimitry Andric   MO.setReg(castBufferRsrcToV4I32(MO.getReg(), B));
65306c3fb27SDimitry Andric }
65406c3fb27SDimitry Andric 
AMDGPULegalizerInfo(const GCNSubtarget & ST_,const GCNTargetMachine & TM)6550b57cec5SDimitry Andric AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
6560b57cec5SDimitry Andric                                          const GCNTargetMachine &TM)
6570b57cec5SDimitry Andric   :  ST(ST_) {
6580b57cec5SDimitry Andric   using namespace TargetOpcode;
6590b57cec5SDimitry Andric 
6600b57cec5SDimitry Andric   auto GetAddrSpacePtr = [&TM](unsigned AS) {
6610b57cec5SDimitry Andric     return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
6620b57cec5SDimitry Andric   };
6630b57cec5SDimitry Andric 
6640b57cec5SDimitry Andric   const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
6650b57cec5SDimitry Andric   const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
6668bcb0991SDimitry Andric   const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
6670b57cec5SDimitry Andric   const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
6688bcb0991SDimitry Andric   const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
6690b57cec5SDimitry Andric   const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
6700b57cec5SDimitry Andric   const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
67106c3fb27SDimitry Andric   const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
67206c3fb27SDimitry Andric   const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
6735f757f3fSDimitry Andric   const LLT BufferStridedPtr =
6745f757f3fSDimitry Andric       GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);
6750b57cec5SDimitry Andric 
6760b57cec5SDimitry Andric   const LLT CodePtr = FlatPtr;
6770b57cec5SDimitry Andric 
6780b57cec5SDimitry Andric   const std::initializer_list<LLT> AddrSpaces64 = {
6790b57cec5SDimitry Andric     GlobalPtr, ConstantPtr, FlatPtr
6800b57cec5SDimitry Andric   };
6810b57cec5SDimitry Andric 
6820b57cec5SDimitry Andric   const std::initializer_list<LLT> AddrSpaces32 = {
6838bcb0991SDimitry Andric     LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
6840b57cec5SDimitry Andric   };
6850b57cec5SDimitry Andric 
68606c3fb27SDimitry Andric   const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
68706c3fb27SDimitry Andric 
6880b57cec5SDimitry Andric   const std::initializer_list<LLT> FPTypesBase = {
6890b57cec5SDimitry Andric     S32, S64
6900b57cec5SDimitry Andric   };
6910b57cec5SDimitry Andric 
6920b57cec5SDimitry Andric   const std::initializer_list<LLT> FPTypes16 = {
6930b57cec5SDimitry Andric     S32, S64, S16
6940b57cec5SDimitry Andric   };
6950b57cec5SDimitry Andric 
6960b57cec5SDimitry Andric   const std::initializer_list<LLT> FPTypesPK16 = {
6970b57cec5SDimitry Andric     S32, S64, S16, V2S16
6980b57cec5SDimitry Andric   };
6990b57cec5SDimitry Andric 
7005ffd83dbSDimitry Andric   const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
7015ffd83dbSDimitry Andric 
702fe6060f1SDimitry Andric   // s1 for VCC branches, s32 for SCC branches.
703fe6060f1SDimitry Andric   getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
7040b57cec5SDimitry Andric 
7050b57cec5SDimitry Andric   // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
7060b57cec5SDimitry Andric   // elements for v3s16
7070b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_PHI)
708e8d8bef9SDimitry Andric       .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
7090b57cec5SDimitry Andric       .legalFor(AllS32Vectors)
7100b57cec5SDimitry Andric       .legalFor(AllS64Vectors)
7110b57cec5SDimitry Andric       .legalFor(AddrSpaces64)
7120b57cec5SDimitry Andric       .legalFor(AddrSpaces32)
71306c3fb27SDimitry Andric       .legalFor(AddrSpaces128)
714e8d8bef9SDimitry Andric       .legalIf(isPointer(0))
715e8d8bef9SDimitry Andric       .clampScalar(0, S16, S256)
7160b57cec5SDimitry Andric       .widenScalarToNextPow2(0, 32)
7170b57cec5SDimitry Andric       .clampMaxNumElements(0, S32, 16)
7180b57cec5SDimitry Andric       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
719e8d8bef9SDimitry Andric       .scalarize(0);
7200b57cec5SDimitry Andric 
721e8d8bef9SDimitry Andric   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
722e8d8bef9SDimitry Andric     // Full set of gfx9 features.
7235f757f3fSDimitry Andric     if (ST.hasScalarAddSub64()) {
7245f757f3fSDimitry Andric       getActionDefinitionsBuilder({G_ADD, G_SUB})
7255f757f3fSDimitry Andric           .legalFor({S64, S32, S16, V2S16})
7265f757f3fSDimitry Andric           .clampMaxNumElementsStrict(0, S16, 2)
7275f757f3fSDimitry Andric           .scalarize(0)
7285f757f3fSDimitry Andric           .minScalar(0, S16)
7295f757f3fSDimitry Andric           .widenScalarToNextMultipleOf(0, 32)
7305f757f3fSDimitry Andric           .maxScalar(0, S32);
7315f757f3fSDimitry Andric     } else {
73281ad6265SDimitry Andric       getActionDefinitionsBuilder({G_ADD, G_SUB})
7335ffd83dbSDimitry Andric           .legalFor({S32, S16, V2S16})
7340eae32dcSDimitry Andric           .clampMaxNumElementsStrict(0, S16, 2)
73581ad6265SDimitry Andric           .scalarize(0)
73681ad6265SDimitry Andric           .minScalar(0, S16)
737349cc55cSDimitry Andric           .widenScalarToNextMultipleOf(0, 32)
73881ad6265SDimitry Andric           .maxScalar(0, S32);
7395f757f3fSDimitry Andric     }
74081ad6265SDimitry Andric 
7411db9f3b2SDimitry Andric     if (ST.hasScalarSMulU64()) {
7421db9f3b2SDimitry Andric       getActionDefinitionsBuilder(G_MUL)
7431db9f3b2SDimitry Andric           .legalFor({S64, S32, S16, V2S16})
7441db9f3b2SDimitry Andric           .clampMaxNumElementsStrict(0, S16, 2)
7451db9f3b2SDimitry Andric           .scalarize(0)
7461db9f3b2SDimitry Andric           .minScalar(0, S16)
7471db9f3b2SDimitry Andric           .widenScalarToNextMultipleOf(0, 32)
7481db9f3b2SDimitry Andric           .custom();
7491db9f3b2SDimitry Andric     } else {
75081ad6265SDimitry Andric       getActionDefinitionsBuilder(G_MUL)
75181ad6265SDimitry Andric           .legalFor({S32, S16, V2S16})
75281ad6265SDimitry Andric           .clampMaxNumElementsStrict(0, S16, 2)
75381ad6265SDimitry Andric           .scalarize(0)
75481ad6265SDimitry Andric           .minScalar(0, S16)
75581ad6265SDimitry Andric           .widenScalarToNextMultipleOf(0, 32)
75681ad6265SDimitry Andric           .custom();
7571db9f3b2SDimitry Andric     }
75881ad6265SDimitry Andric     assert(ST.hasMad64_32());
759e8d8bef9SDimitry Andric 
760e8d8bef9SDimitry Andric     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
761e8d8bef9SDimitry Andric       .legalFor({S32, S16, V2S16}) // Clamp modifier
762e8d8bef9SDimitry Andric       .minScalarOrElt(0, S16)
7630eae32dcSDimitry Andric       .clampMaxNumElementsStrict(0, S16, 2)
764e8d8bef9SDimitry Andric       .scalarize(0)
765e8d8bef9SDimitry Andric       .widenScalarToNextPow2(0, 32)
766e8d8bef9SDimitry Andric       .lower();
7675ffd83dbSDimitry Andric   } else if (ST.has16BitInsts()) {
76881ad6265SDimitry Andric     getActionDefinitionsBuilder({G_ADD, G_SUB})
7690b57cec5SDimitry Andric       .legalFor({S32, S16})
770349cc55cSDimitry Andric       .minScalar(0, S16)
771349cc55cSDimitry Andric       .widenScalarToNextMultipleOf(0, 32)
772349cc55cSDimitry Andric       .maxScalar(0, S32)
773349cc55cSDimitry Andric       .scalarize(0);
774e8d8bef9SDimitry Andric 
77581ad6265SDimitry Andric     getActionDefinitionsBuilder(G_MUL)
77681ad6265SDimitry Andric       .legalFor({S32, S16})
77781ad6265SDimitry Andric       .scalarize(0)
77881ad6265SDimitry Andric       .minScalar(0, S16)
77981ad6265SDimitry Andric       .widenScalarToNextMultipleOf(0, 32)
78081ad6265SDimitry Andric       .custom();
78181ad6265SDimitry Andric     assert(ST.hasMad64_32());
78281ad6265SDimitry Andric 
783e8d8bef9SDimitry Andric     // Technically the saturating operations require clamp bit support, but this
784e8d8bef9SDimitry Andric     // was introduced at the same time as 16-bit operations.
785e8d8bef9SDimitry Andric     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
786e8d8bef9SDimitry Andric       .legalFor({S32, S16}) // Clamp modifier
787e8d8bef9SDimitry Andric       .minScalar(0, S16)
788e8d8bef9SDimitry Andric       .scalarize(0)
789e8d8bef9SDimitry Andric       .widenScalarToNextPow2(0, 16)
790e8d8bef9SDimitry Andric       .lower();
791e8d8bef9SDimitry Andric 
792e8d8bef9SDimitry Andric     // We're just lowering this, but it helps get a better result to try to
793e8d8bef9SDimitry Andric     // coerce to the desired type first.
794e8d8bef9SDimitry Andric     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
795e8d8bef9SDimitry Andric       .minScalar(0, S16)
796e8d8bef9SDimitry Andric       .scalarize(0)
797e8d8bef9SDimitry Andric       .lower();
7980b57cec5SDimitry Andric   } else {
79981ad6265SDimitry Andric     getActionDefinitionsBuilder({G_ADD, G_SUB})
8000b57cec5SDimitry Andric       .legalFor({S32})
801349cc55cSDimitry Andric       .widenScalarToNextMultipleOf(0, 32)
8020b57cec5SDimitry Andric       .clampScalar(0, S32, S32)
8030b57cec5SDimitry Andric       .scalarize(0);
804e8d8bef9SDimitry Andric 
80581ad6265SDimitry Andric     auto &Mul = getActionDefinitionsBuilder(G_MUL)
80681ad6265SDimitry Andric       .legalFor({S32})
80781ad6265SDimitry Andric       .scalarize(0)
80881ad6265SDimitry Andric       .minScalar(0, S32)
80981ad6265SDimitry Andric       .widenScalarToNextMultipleOf(0, 32);
81081ad6265SDimitry Andric 
81181ad6265SDimitry Andric     if (ST.hasMad64_32())
81281ad6265SDimitry Andric       Mul.custom();
81381ad6265SDimitry Andric     else
81481ad6265SDimitry Andric       Mul.maxScalar(0, S32);
81581ad6265SDimitry Andric 
816e8d8bef9SDimitry Andric     if (ST.hasIntClamp()) {
817e8d8bef9SDimitry Andric       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
818e8d8bef9SDimitry Andric         .legalFor({S32}) // Clamp modifier.
819e8d8bef9SDimitry Andric         .scalarize(0)
820e8d8bef9SDimitry Andric         .minScalarOrElt(0, S32)
821e8d8bef9SDimitry Andric         .lower();
822e8d8bef9SDimitry Andric     } else {
823e8d8bef9SDimitry Andric       // Clamp bit support was added in VI, along with 16-bit operations.
824e8d8bef9SDimitry Andric       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
825e8d8bef9SDimitry Andric         .minScalar(0, S32)
826e8d8bef9SDimitry Andric         .scalarize(0)
827e8d8bef9SDimitry Andric         .lower();
8280b57cec5SDimitry Andric     }
8290b57cec5SDimitry Andric 
830e8d8bef9SDimitry Andric     // FIXME: DAG expansion gets better results. The widening uses the smaller
831e8d8bef9SDimitry Andric     // range values and goes for the min/max lowering directly.
832e8d8bef9SDimitry Andric     getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
833e8d8bef9SDimitry Andric       .minScalar(0, S32)
834e8d8bef9SDimitry Andric       .scalarize(0)
835e8d8bef9SDimitry Andric       .lower();
836e8d8bef9SDimitry Andric   }
837e8d8bef9SDimitry Andric 
838fe6060f1SDimitry Andric   getActionDefinitionsBuilder(
839fe6060f1SDimitry Andric       {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
8405ffd83dbSDimitry Andric       .customFor({S32, S64})
841480093f4SDimitry Andric       .clampScalar(0, S32, S64)
842480093f4SDimitry Andric       .widenScalarToNextPow2(0, 32)
843480093f4SDimitry Andric       .scalarize(0);
844480093f4SDimitry Andric 
845e8d8bef9SDimitry Andric   auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
8460b57cec5SDimitry Andric                    .legalFor({S32})
847349cc55cSDimitry Andric                    .maxScalar(0, S32);
848e8d8bef9SDimitry Andric 
849e8d8bef9SDimitry Andric   if (ST.hasVOP3PInsts()) {
850e8d8bef9SDimitry Andric     Mulh
851e8d8bef9SDimitry Andric       .clampMaxNumElements(0, S8, 2)
852e8d8bef9SDimitry Andric       .lowerFor({V2S8});
853e8d8bef9SDimitry Andric   }
854e8d8bef9SDimitry Andric 
855e8d8bef9SDimitry Andric   Mulh
856e8d8bef9SDimitry Andric     .scalarize(0)
857e8d8bef9SDimitry Andric     .lower();
8580b57cec5SDimitry Andric 
8590b57cec5SDimitry Andric   // Report legal for any types we can handle anywhere. For the cases only legal
8600b57cec5SDimitry Andric   // on the SALU, RegBankSelect will be able to re-legalize.
8610b57cec5SDimitry Andric   getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
8620b57cec5SDimitry Andric     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
8630b57cec5SDimitry Andric     .clampScalar(0, S32, S64)
8640b57cec5SDimitry Andric     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
8658bcb0991SDimitry Andric     .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
8660b57cec5SDimitry Andric     .widenScalarToNextPow2(0)
8670b57cec5SDimitry Andric     .scalarize(0);
8680b57cec5SDimitry Andric 
869bdd1243dSDimitry Andric   getActionDefinitionsBuilder(
870bdd1243dSDimitry Andric       {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
871480093f4SDimitry Andric       .legalFor({{S32, S1}, {S32, S32}})
872bdd1243dSDimitry Andric       .clampScalar(0, S32, S32)
873bdd1243dSDimitry Andric       .scalarize(0);
8740b57cec5SDimitry Andric 
8750b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_BITCAST)
8760b57cec5SDimitry Andric       // Don't worry about the size constraint.
877*0fca6ea1SDimitry Andric       .legalIf(all(isRegisterClassType(0), isRegisterClassType(1)))
8785ffd83dbSDimitry Andric       .lower();
8790b57cec5SDimitry Andric 
8800b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_CONSTANT)
8818bcb0991SDimitry Andric     .legalFor({S1, S32, S64, S16, GlobalPtr,
8820b57cec5SDimitry Andric                LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
883e8d8bef9SDimitry Andric     .legalIf(isPointer(0))
8840b57cec5SDimitry Andric     .clampScalar(0, S32, S64)
885e8d8bef9SDimitry Andric     .widenScalarToNextPow2(0);
8860b57cec5SDimitry Andric 
8875ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_FCONSTANT)
8885ffd83dbSDimitry Andric     .legalFor({S32, S64, S16})
8895ffd83dbSDimitry Andric     .clampScalar(0, S16, S64);
8908bcb0991SDimitry Andric 
8915ffd83dbSDimitry Andric   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
8925ffd83dbSDimitry Andric       .legalIf(isRegisterType(0))
8935ffd83dbSDimitry Andric       // s1 and s16 are special cases because they have legal operations on
8945ffd83dbSDimitry Andric       // them, but don't really occupy registers in the normal way.
8955ffd83dbSDimitry Andric       .legalFor({S1, S16})
8965ffd83dbSDimitry Andric       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
8975ffd83dbSDimitry Andric       .clampScalarOrElt(0, S32, MaxScalar)
8985ffd83dbSDimitry Andric       .widenScalarToNextPow2(0, 32)
8995ffd83dbSDimitry Andric       .clampMaxNumElements(0, S32, 16);
9005ffd83dbSDimitry Andric 
901fe6060f1SDimitry Andric   getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
9025ffd83dbSDimitry Andric 
9035ffd83dbSDimitry Andric   // If the amount is divergent, we have to do a wave reduction to get the
9045ffd83dbSDimitry Andric   // maximum value, so this is expanded during RegBankSelect.
9055ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_DYN_STACKALLOC)
9065ffd83dbSDimitry Andric     .legalFor({{PrivatePtr, S32}});
9075ffd83dbSDimitry Andric 
9085f757f3fSDimitry Andric   getActionDefinitionsBuilder(G_STACKSAVE)
9095f757f3fSDimitry Andric     .customFor({PrivatePtr});
9105f757f3fSDimitry Andric   getActionDefinitionsBuilder(G_STACKRESTORE)
9115f757f3fSDimitry Andric     .legalFor({PrivatePtr});
9125f757f3fSDimitry Andric 
913*0fca6ea1SDimitry Andric   getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV}).customFor({S64});
914*0fca6ea1SDimitry Andric 
9155ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_GLOBAL_VALUE)
916e8d8bef9SDimitry Andric     .customIf(typeIsNot(0, PrivatePtr));
917e8d8bef9SDimitry Andric 
918fe6060f1SDimitry Andric   getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
9190b57cec5SDimitry Andric 
9200b57cec5SDimitry Andric   auto &FPOpActions = getActionDefinitionsBuilder(
921bdd1243dSDimitry Andric     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
922bdd1243dSDimitry Andric       G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
9230b57cec5SDimitry Andric     .legalFor({S32, S64});
9248bcb0991SDimitry Andric   auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
9258bcb0991SDimitry Andric     .customFor({S32, S64});
9268bcb0991SDimitry Andric   auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
9278bcb0991SDimitry Andric     .customFor({S32, S64});
9280b57cec5SDimitry Andric 
9290b57cec5SDimitry Andric   if (ST.has16BitInsts()) {
9300b57cec5SDimitry Andric     if (ST.hasVOP3PInsts())
9310b57cec5SDimitry Andric       FPOpActions.legalFor({S16, V2S16});
9320b57cec5SDimitry Andric     else
9330b57cec5SDimitry Andric       FPOpActions.legalFor({S16});
9348bcb0991SDimitry Andric 
9358bcb0991SDimitry Andric     TrigActions.customFor({S16});
9368bcb0991SDimitry Andric     FDIVActions.customFor({S16});
9370b57cec5SDimitry Andric   }
9380b57cec5SDimitry Andric 
9395f757f3fSDimitry Andric   if (ST.hasPackedFP32Ops()) {
9405f757f3fSDimitry Andric     FPOpActions.legalFor({V2S32});
9415f757f3fSDimitry Andric     FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
9425f757f3fSDimitry Andric   }
9435f757f3fSDimitry Andric 
9440b57cec5SDimitry Andric   auto &MinNumMaxNum = getActionDefinitionsBuilder({
9450b57cec5SDimitry Andric       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
9460b57cec5SDimitry Andric 
9470b57cec5SDimitry Andric   if (ST.hasVOP3PInsts()) {
9480b57cec5SDimitry Andric     MinNumMaxNum.customFor(FPTypesPK16)
949480093f4SDimitry Andric       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
9500b57cec5SDimitry Andric       .clampMaxNumElements(0, S16, 2)
9510b57cec5SDimitry Andric       .clampScalar(0, S16, S64)
9520b57cec5SDimitry Andric       .scalarize(0);
9530b57cec5SDimitry Andric   } else if (ST.has16BitInsts()) {
9540b57cec5SDimitry Andric     MinNumMaxNum.customFor(FPTypes16)
9550b57cec5SDimitry Andric       .clampScalar(0, S16, S64)
9560b57cec5SDimitry Andric       .scalarize(0);
9570b57cec5SDimitry Andric   } else {
9580b57cec5SDimitry Andric     MinNumMaxNum.customFor(FPTypesBase)
9590b57cec5SDimitry Andric       .clampScalar(0, S32, S64)
9600b57cec5SDimitry Andric       .scalarize(0);
9610b57cec5SDimitry Andric   }
9620b57cec5SDimitry Andric 
9630b57cec5SDimitry Andric   if (ST.hasVOP3PInsts())
9640eae32dcSDimitry Andric     FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
9658bcb0991SDimitry Andric 
9660b57cec5SDimitry Andric   FPOpActions
9670b57cec5SDimitry Andric     .scalarize(0)
9680b57cec5SDimitry Andric     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
9690b57cec5SDimitry Andric 
9708bcb0991SDimitry Andric   TrigActions
9718bcb0991SDimitry Andric     .scalarize(0)
9728bcb0991SDimitry Andric     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
9738bcb0991SDimitry Andric 
9748bcb0991SDimitry Andric   FDIVActions
9758bcb0991SDimitry Andric     .scalarize(0)
9768bcb0991SDimitry Andric     .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
9778bcb0991SDimitry Andric 
9788bcb0991SDimitry Andric   getActionDefinitionsBuilder({G_FNEG, G_FABS})
9798bcb0991SDimitry Andric     .legalFor(FPTypesPK16)
9800eae32dcSDimitry Andric     .clampMaxNumElementsStrict(0, S16, 2)
9818bcb0991SDimitry Andric     .scalarize(0)
9828bcb0991SDimitry Andric     .clampScalar(0, S16, S64);
9838bcb0991SDimitry Andric 
9840b57cec5SDimitry Andric   if (ST.has16BitInsts()) {
98506c3fb27SDimitry Andric     getActionDefinitionsBuilder(G_FSQRT)
9865f757f3fSDimitry Andric       .legalFor({S16})
9875f757f3fSDimitry Andric       .customFor({S32, S64})
98806c3fb27SDimitry Andric       .scalarize(0)
9895f757f3fSDimitry Andric       .unsupported();
99006c3fb27SDimitry Andric     getActionDefinitionsBuilder(G_FFLOOR)
9910b57cec5SDimitry Andric       .legalFor({S32, S64, S16})
9920b57cec5SDimitry Andric       .scalarize(0)
9930b57cec5SDimitry Andric       .clampScalar(0, S16, S64);
99406c3fb27SDimitry Andric 
99506c3fb27SDimitry Andric     getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
99606c3fb27SDimitry Andric       .legalFor({{S32, S32}, {S64, S32}, {S16, S16}})
99706c3fb27SDimitry Andric       .scalarize(0)
99806c3fb27SDimitry Andric       .maxScalarIf(typeIs(0, S16), 1, S16)
99906c3fb27SDimitry Andric       .clampScalar(1, S32, S32)
100006c3fb27SDimitry Andric       .lower();
100106c3fb27SDimitry Andric 
100206c3fb27SDimitry Andric     getActionDefinitionsBuilder(G_FFREXP)
100306c3fb27SDimitry Andric       .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
100406c3fb27SDimitry Andric       .scalarize(0)
100506c3fb27SDimitry Andric       .lower();
10060b57cec5SDimitry Andric   } else {
10075ffd83dbSDimitry Andric     getActionDefinitionsBuilder(G_FSQRT)
10085f757f3fSDimitry Andric       .customFor({S32, S64, S16})
10095ffd83dbSDimitry Andric       .scalarize(0)
10105f757f3fSDimitry Andric       .unsupported();
10115f757f3fSDimitry Andric 
10125ffd83dbSDimitry Andric 
10135ffd83dbSDimitry Andric     if (ST.hasFractBug()) {
10145ffd83dbSDimitry Andric       getActionDefinitionsBuilder(G_FFLOOR)
10155ffd83dbSDimitry Andric         .customFor({S64})
10165ffd83dbSDimitry Andric         .legalFor({S32, S64})
10175ffd83dbSDimitry Andric         .scalarize(0)
10185ffd83dbSDimitry Andric         .clampScalar(0, S32, S64);
10195ffd83dbSDimitry Andric     } else {
10205ffd83dbSDimitry Andric       getActionDefinitionsBuilder(G_FFLOOR)
10210b57cec5SDimitry Andric         .legalFor({S32, S64})
10220b57cec5SDimitry Andric         .scalarize(0)
10230b57cec5SDimitry Andric         .clampScalar(0, S32, S64);
10240b57cec5SDimitry Andric     }
102506c3fb27SDimitry Andric 
102606c3fb27SDimitry Andric     getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
102706c3fb27SDimitry Andric       .legalFor({{S32, S32}, {S64, S32}})
102806c3fb27SDimitry Andric       .scalarize(0)
102906c3fb27SDimitry Andric       .clampScalar(0, S32, S64)
103006c3fb27SDimitry Andric       .clampScalar(1, S32, S32)
103106c3fb27SDimitry Andric       .lower();
103206c3fb27SDimitry Andric 
103306c3fb27SDimitry Andric     getActionDefinitionsBuilder(G_FFREXP)
103406c3fb27SDimitry Andric       .customFor({{S32, S32}, {S64, S32}})
103506c3fb27SDimitry Andric       .scalarize(0)
103606c3fb27SDimitry Andric       .minScalar(0, S32)
103706c3fb27SDimitry Andric       .clampScalar(1, S32, S32)
103806c3fb27SDimitry Andric       .lower();
10395ffd83dbSDimitry Andric   }
10400b57cec5SDimitry Andric 
10410b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_FPTRUNC)
10420b57cec5SDimitry Andric     .legalFor({{S32, S64}, {S16, S32}})
10435ffd83dbSDimitry Andric     .scalarize(0)
10445ffd83dbSDimitry Andric     .lower();
10450b57cec5SDimitry Andric 
10460b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_FPEXT)
10470b57cec5SDimitry Andric     .legalFor({{S64, S32}, {S32, S16}})
1048e8d8bef9SDimitry Andric     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
10490b57cec5SDimitry Andric     .scalarize(0);
10500b57cec5SDimitry Andric 
1051bdd1243dSDimitry Andric   auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
105281ad6265SDimitry Andric   if (ST.has16BitInsts()) {
105381ad6265SDimitry Andric     FSubActions
105481ad6265SDimitry Andric       // Use actual fsub instruction
105581ad6265SDimitry Andric       .legalFor({S32, S16})
105681ad6265SDimitry Andric       // Must use fadd + fneg
105781ad6265SDimitry Andric       .lowerFor({S64, V2S16});
105881ad6265SDimitry Andric   } else {
105981ad6265SDimitry Andric     FSubActions
10600b57cec5SDimitry Andric       // Use actual fsub instruction
10610b57cec5SDimitry Andric       .legalFor({S32})
10620b57cec5SDimitry Andric       // Must use fadd + fneg
106381ad6265SDimitry Andric       .lowerFor({S64, S16, V2S16});
106481ad6265SDimitry Andric   }
106581ad6265SDimitry Andric 
106681ad6265SDimitry Andric   FSubActions
10670b57cec5SDimitry Andric     .scalarize(0)
10680b57cec5SDimitry Andric     .clampScalar(0, S32, S64);
10690b57cec5SDimitry Andric 
10708bcb0991SDimitry Andric   // Whether this is legal depends on the floating point mode for the function.
10718bcb0991SDimitry Andric   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
10725ffd83dbSDimitry Andric   if (ST.hasMadF16() && ST.hasMadMacF32Insts())
10738bcb0991SDimitry Andric     FMad.customFor({S32, S16});
10745ffd83dbSDimitry Andric   else if (ST.hasMadMacF32Insts())
10758bcb0991SDimitry Andric     FMad.customFor({S32});
10765ffd83dbSDimitry Andric   else if (ST.hasMadF16())
10775ffd83dbSDimitry Andric     FMad.customFor({S16});
10788bcb0991SDimitry Andric   FMad.scalarize(0)
10798bcb0991SDimitry Andric       .lower();
10808bcb0991SDimitry Andric 
1081e8d8bef9SDimitry Andric   auto &FRem = getActionDefinitionsBuilder(G_FREM);
1082e8d8bef9SDimitry Andric   if (ST.has16BitInsts()) {
1083e8d8bef9SDimitry Andric     FRem.customFor({S16, S32, S64});
1084e8d8bef9SDimitry Andric   } else {
1085e8d8bef9SDimitry Andric     FRem.minScalar(0, S32)
1086e8d8bef9SDimitry Andric         .customFor({S32, S64});
1087e8d8bef9SDimitry Andric   }
1088e8d8bef9SDimitry Andric   FRem.scalarize(0);
1089e8d8bef9SDimitry Andric 
10905ffd83dbSDimitry Andric   // TODO: Do we need to clamp maximum bitwidth?
10915ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_TRUNC)
10925ffd83dbSDimitry Andric     .legalIf(isScalar(0))
10935ffd83dbSDimitry Andric     .legalFor({{V2S16, V2S32}})
10945ffd83dbSDimitry Andric     .clampMaxNumElements(0, S16, 2)
10955ffd83dbSDimitry Andric     // Avoid scalarizing in cases that should be truly illegal. In unresolvable
10965ffd83dbSDimitry Andric     // situations (like an invalid implicit use), we don't want to infinite loop
10975ffd83dbSDimitry Andric     // in the legalizer.
10985ffd83dbSDimitry Andric     .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
10995ffd83dbSDimitry Andric     .alwaysLegal();
11005ffd83dbSDimitry Andric 
11010b57cec5SDimitry Andric   getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
11020b57cec5SDimitry Andric     .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
11035ffd83dbSDimitry Andric                {S32, S1}, {S64, S1}, {S16, S1}})
1104480093f4SDimitry Andric     .scalarize(0)
11055ffd83dbSDimitry Andric     .clampScalar(0, S32, S64)
11065ffd83dbSDimitry Andric     .widenScalarToNextPow2(1, 32);
11070b57cec5SDimitry Andric 
11088bcb0991SDimitry Andric   // TODO: Split s1->s64 during regbankselect for VALU.
11098bcb0991SDimitry Andric   auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
1110480093f4SDimitry Andric                     .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
1111480093f4SDimitry Andric                     .lowerIf(typeIs(1, S1))
1112349cc55cSDimitry Andric                     .customFor({{S32, S64}, {S64, S64}});
11138bcb0991SDimitry Andric   if (ST.has16BitInsts())
11148bcb0991SDimitry Andric     IToFP.legalFor({{S16, S16}});
11158bcb0991SDimitry Andric   IToFP.clampScalar(1, S32, S64)
1116e8d8bef9SDimitry Andric        .minScalar(0, S32)
11175ffd83dbSDimitry Andric        .scalarize(0)
11185ffd83dbSDimitry Andric        .widenScalarToNextPow2(1);
11190b57cec5SDimitry Andric 
11208bcb0991SDimitry Andric   auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
11215ffd83dbSDimitry Andric     .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
1122fe6060f1SDimitry Andric     .customFor({{S64, S32}, {S64, S64}})
1123e8d8bef9SDimitry Andric     .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
11248bcb0991SDimitry Andric   if (ST.has16BitInsts())
11258bcb0991SDimitry Andric     FPToI.legalFor({{S16, S16}});
11268bcb0991SDimitry Andric   else
11278bcb0991SDimitry Andric     FPToI.minScalar(1, S32);
11288bcb0991SDimitry Andric 
11298bcb0991SDimitry Andric   FPToI.minScalar(0, S32)
1130fe6060f1SDimitry Andric        .widenScalarToNextPow2(0, 32)
11315ffd83dbSDimitry Andric        .scalarize(0)
11325ffd83dbSDimitry Andric        .lower();
11330b57cec5SDimitry Andric 
113481ad6265SDimitry Andric   getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
113581ad6265SDimitry Andric       .customFor({S16, S32})
113681ad6265SDimitry Andric       .scalarize(0)
113781ad6265SDimitry Andric       .lower();
113881ad6265SDimitry Andric 
11395f757f3fSDimitry Andric   // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
11405f757f3fSDimitry Andric   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1141480093f4SDimitry Andric       .scalarize(0)
1142480093f4SDimitry Andric       .lower();
11430b57cec5SDimitry Andric 
1144480093f4SDimitry Andric   if (ST.has16BitInsts()) {
11455f757f3fSDimitry Andric     getActionDefinitionsBuilder(
11465f757f3fSDimitry Andric         {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1147480093f4SDimitry Andric         .legalFor({S16, S32, S64})
1148480093f4SDimitry Andric         .clampScalar(0, S16, S64)
1149480093f4SDimitry Andric         .scalarize(0);
1150480093f4SDimitry Andric   } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
11515f757f3fSDimitry Andric     getActionDefinitionsBuilder(
11525f757f3fSDimitry Andric         {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
11530b57cec5SDimitry Andric         .legalFor({S32, S64})
11540b57cec5SDimitry Andric         .clampScalar(0, S32, S64)
11550b57cec5SDimitry Andric         .scalarize(0);
11560b57cec5SDimitry Andric   } else {
11575f757f3fSDimitry Andric     getActionDefinitionsBuilder(
11585f757f3fSDimitry Andric         {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
11590b57cec5SDimitry Andric         .legalFor({S32})
11600b57cec5SDimitry Andric         .customFor({S64})
11610b57cec5SDimitry Andric         .clampScalar(0, S32, S64)
11620b57cec5SDimitry Andric         .scalarize(0);
11630b57cec5SDimitry Andric   }
11640b57cec5SDimitry Andric 
1165480093f4SDimitry Andric   getActionDefinitionsBuilder(G_PTR_ADD)
11665f757f3fSDimitry Andric       .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr})
1167e8d8bef9SDimitry Andric       .legalIf(all(isPointer(0), sameSize(0, 1)))
1168e8d8bef9SDimitry Andric       .scalarize(0)
1169e8d8bef9SDimitry Andric       .scalarSameSizeAs(1, 0);
11700b57cec5SDimitry Andric 
11715ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_PTRMASK)
1172e8d8bef9SDimitry Andric     .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
1173e8d8bef9SDimitry Andric     .scalarSameSizeAs(1, 0)
11745ffd83dbSDimitry Andric     .scalarize(0);
11750b57cec5SDimitry Andric 
11760b57cec5SDimitry Andric   auto &CmpBuilder =
11770b57cec5SDimitry Andric     getActionDefinitionsBuilder(G_ICMP)
1178480093f4SDimitry Andric     // The compare output type differs based on the register bank of the output,
1179480093f4SDimitry Andric     // so make both s1 and s32 legal.
1180480093f4SDimitry Andric     //
1181480093f4SDimitry Andric     // Scalar compares producing output in scc will be promoted to s32, as that
1182480093f4SDimitry Andric     // is the allocatable register type that will be needed for the copy from
1183480093f4SDimitry Andric     // scc. This will be promoted during RegBankSelect, and we assume something
1184480093f4SDimitry Andric     // before that won't try to use s32 result types.
1185480093f4SDimitry Andric     //
1186480093f4SDimitry Andric     // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
1187480093f4SDimitry Andric     // bank.
11880b57cec5SDimitry Andric     .legalForCartesianProduct(
11890b57cec5SDimitry Andric       {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1190480093f4SDimitry Andric     .legalForCartesianProduct(
1191480093f4SDimitry Andric       {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
11920b57cec5SDimitry Andric   if (ST.has16BitInsts()) {
11930b57cec5SDimitry Andric     CmpBuilder.legalFor({{S1, S16}});
11940b57cec5SDimitry Andric   }
11950b57cec5SDimitry Andric 
11960b57cec5SDimitry Andric   CmpBuilder
11970b57cec5SDimitry Andric     .widenScalarToNextPow2(1)
11980b57cec5SDimitry Andric     .clampScalar(1, S32, S64)
11990b57cec5SDimitry Andric     .scalarize(0)
1200480093f4SDimitry Andric     .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
12010b57cec5SDimitry Andric 
12025f757f3fSDimitry Andric   auto &FCmpBuilder =
12035f757f3fSDimitry Andric       getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct(
12045f757f3fSDimitry Andric           {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
12055f757f3fSDimitry Andric 
12065f757f3fSDimitry Andric   if (ST.hasSALUFloatInsts())
12075f757f3fSDimitry Andric     FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});
12085f757f3fSDimitry Andric 
12095f757f3fSDimitry Andric   FCmpBuilder
12100b57cec5SDimitry Andric     .widenScalarToNextPow2(1)
12110b57cec5SDimitry Andric     .clampScalar(1, S32, S64)
12120b57cec5SDimitry Andric     .scalarize(0);
12130b57cec5SDimitry Andric 
12145ffd83dbSDimitry Andric   // FIXME: fpow has a selection pattern that should move to custom lowering.
121506c3fb27SDimitry Andric   auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
12165ffd83dbSDimitry Andric   if (ST.has16BitInsts())
12175ffd83dbSDimitry Andric     ExpOps.customFor({{S32}, {S16}});
12185ffd83dbSDimitry Andric   else
12195ffd83dbSDimitry Andric     ExpOps.customFor({S32});
12205ffd83dbSDimitry Andric   ExpOps.clampScalar(0, MinScalarFPTy, S32)
12210b57cec5SDimitry Andric         .scalarize(0);
12220b57cec5SDimitry Andric 
1223e8d8bef9SDimitry Andric   getActionDefinitionsBuilder(G_FPOWI)
1224e8d8bef9SDimitry Andric     .clampScalar(0, MinScalarFPTy, S32)
1225e8d8bef9SDimitry Andric     .lower();
1226e8d8bef9SDimitry Andric 
122706c3fb27SDimitry Andric   auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2});
122806c3fb27SDimitry Andric   Log2Ops.customFor({S32});
122906c3fb27SDimitry Andric   if (ST.has16BitInsts())
123006c3fb27SDimitry Andric     Log2Ops.legalFor({S16});
123106c3fb27SDimitry Andric   else
123206c3fb27SDimitry Andric     Log2Ops.customFor({S16});
123306c3fb27SDimitry Andric   Log2Ops.scalarize(0)
123406c3fb27SDimitry Andric     .lower();
123506c3fb27SDimitry Andric 
12365f757f3fSDimitry Andric   auto &LogOps =
12375f757f3fSDimitry Andric       getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
123806c3fb27SDimitry Andric   LogOps.customFor({S32, S16});
123906c3fb27SDimitry Andric   LogOps.clampScalar(0, MinScalarFPTy, S32)
124006c3fb27SDimitry Andric         .scalarize(0);
124106c3fb27SDimitry Andric 
12420b57cec5SDimitry Andric   // The 64-bit versions produce 32-bit results, but only on the SALU.
12435ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_CTPOP)
12440b57cec5SDimitry Andric     .legalFor({{S32, S32}, {S32, S64}})
12450b57cec5SDimitry Andric     .clampScalar(0, S32, S32)
124604eeddc0SDimitry Andric     .widenScalarToNextPow2(1, 32)
12470b57cec5SDimitry Andric     .clampScalar(1, S32, S64)
12480b57cec5SDimitry Andric     .scalarize(0)
124904eeddc0SDimitry Andric     .widenScalarToNextPow2(0, 32);
125004eeddc0SDimitry Andric 
1251bdd1243dSDimitry Andric   // If no 16 bit instr is available, lower into different instructions.
1252bdd1243dSDimitry Andric   if (ST.has16BitInsts())
1253bdd1243dSDimitry Andric     getActionDefinitionsBuilder(G_IS_FPCLASS)
1254bdd1243dSDimitry Andric         .legalForCartesianProduct({S1}, FPTypes16)
1255bdd1243dSDimitry Andric         .widenScalarToNextPow2(1)
1256bdd1243dSDimitry Andric         .scalarize(0)
1257bdd1243dSDimitry Andric         .lower();
1258bdd1243dSDimitry Andric   else
1259bdd1243dSDimitry Andric     getActionDefinitionsBuilder(G_IS_FPCLASS)
1260bdd1243dSDimitry Andric         .legalForCartesianProduct({S1}, FPTypesBase)
1261bdd1243dSDimitry Andric         .lowerFor({S1, S16})
1262bdd1243dSDimitry Andric         .widenScalarToNextPow2(1)
1263bdd1243dSDimitry Andric         .scalarize(0)
1264bdd1243dSDimitry Andric         .lower();
12650b57cec5SDimitry Andric 
12665ffd83dbSDimitry Andric   // The hardware instructions return a different result on 0 than the generic
12675ffd83dbSDimitry Andric   // instructions expect. The hardware produces -1, but these produce the
12685ffd83dbSDimitry Andric   // bitwidth.
12695ffd83dbSDimitry Andric   getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
12705ffd83dbSDimitry Andric     .scalarize(0)
12715ffd83dbSDimitry Andric     .clampScalar(0, S32, S32)
12725ffd83dbSDimitry Andric     .clampScalar(1, S32, S64)
12735ffd83dbSDimitry Andric     .widenScalarToNextPow2(0, 32)
12745ffd83dbSDimitry Andric     .widenScalarToNextPow2(1, 32)
1275349cc55cSDimitry Andric     .custom();
12765ffd83dbSDimitry Andric 
12775ffd83dbSDimitry Andric   // The 64-bit versions produce 32-bit results, but only on the SALU.
1278*0fca6ea1SDimitry Andric   getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF)
1279*0fca6ea1SDimitry Andric       .legalFor({{S32, S32}, {S32, S64}})
1280*0fca6ea1SDimitry Andric       .customIf(scalarNarrowerThan(1, 32))
1281*0fca6ea1SDimitry Andric       .clampScalar(0, S32, S32)
1282*0fca6ea1SDimitry Andric       .clampScalar(1, S32, S64)
1283*0fca6ea1SDimitry Andric       .scalarize(0)
1284*0fca6ea1SDimitry Andric       .widenScalarToNextPow2(0, 32)
1285*0fca6ea1SDimitry Andric       .widenScalarToNextPow2(1, 32);
1286*0fca6ea1SDimitry Andric 
1287*0fca6ea1SDimitry Andric   getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF)
12885ffd83dbSDimitry Andric       .legalFor({{S32, S32}, {S32, S64}})
12895ffd83dbSDimitry Andric       .clampScalar(0, S32, S32)
12905ffd83dbSDimitry Andric       .clampScalar(1, S32, S64)
12915ffd83dbSDimitry Andric       .scalarize(0)
12925ffd83dbSDimitry Andric       .widenScalarToNextPow2(0, 32)
12935ffd83dbSDimitry Andric       .widenScalarToNextPow2(1, 32);
12945ffd83dbSDimitry Andric 
1295fe6060f1SDimitry Andric   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1296fe6060f1SDimitry Andric   // RegBankSelect.
12975ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_BITREVERSE)
1298fe6060f1SDimitry Andric     .legalFor({S32, S64})
1299fe6060f1SDimitry Andric     .clampScalar(0, S32, S64)
1300fe6060f1SDimitry Andric     .scalarize(0)
1301fe6060f1SDimitry Andric     .widenScalarToNextPow2(0);
13020b57cec5SDimitry Andric 
13030b57cec5SDimitry Andric   if (ST.has16BitInsts()) {
13045ffd83dbSDimitry Andric     getActionDefinitionsBuilder(G_BSWAP)
13055ffd83dbSDimitry Andric       .legalFor({S16, S32, V2S16})
13060eae32dcSDimitry Andric       .clampMaxNumElementsStrict(0, S16, 2)
13075ffd83dbSDimitry Andric       // FIXME: Fixing non-power-of-2 before clamp is workaround for
13085ffd83dbSDimitry Andric       // narrowScalar limitation.
13095ffd83dbSDimitry Andric       .widenScalarToNextPow2(0)
13105ffd83dbSDimitry Andric       .clampScalar(0, S16, S32)
13115ffd83dbSDimitry Andric       .scalarize(0);
13125ffd83dbSDimitry Andric 
13130b57cec5SDimitry Andric     if (ST.hasVOP3PInsts()) {
1314fe6060f1SDimitry Andric       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
13150b57cec5SDimitry Andric         .legalFor({S32, S16, V2S16})
13160b57cec5SDimitry Andric         .clampMaxNumElements(0, S16, 2)
13175ffd83dbSDimitry Andric         .minScalar(0, S16)
13180b57cec5SDimitry Andric         .widenScalarToNextPow2(0)
13195ffd83dbSDimitry Andric         .scalarize(0)
13205ffd83dbSDimitry Andric         .lower();
13210b57cec5SDimitry Andric     } else {
1322fe6060f1SDimitry Andric       getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
13230b57cec5SDimitry Andric         .legalFor({S32, S16})
13240b57cec5SDimitry Andric         .widenScalarToNextPow2(0)
13255ffd83dbSDimitry Andric         .minScalar(0, S16)
13265ffd83dbSDimitry Andric         .scalarize(0)
13275ffd83dbSDimitry Andric         .lower();
13280b57cec5SDimitry Andric     }
13290b57cec5SDimitry Andric   } else {
13305ffd83dbSDimitry Andric     // TODO: Should have same legality without v_perm_b32
13315ffd83dbSDimitry Andric     getActionDefinitionsBuilder(G_BSWAP)
13325ffd83dbSDimitry Andric       .legalFor({S32})
13335ffd83dbSDimitry Andric       .lowerIf(scalarNarrowerThan(0, 32))
13345ffd83dbSDimitry Andric       // FIXME: Fixing non-power-of-2 before clamp is workaround for
13355ffd83dbSDimitry Andric       // narrowScalar limitation.
13365ffd83dbSDimitry Andric       .widenScalarToNextPow2(0)
13375ffd83dbSDimitry Andric       .maxScalar(0, S32)
13385ffd83dbSDimitry Andric       .scalarize(0)
13395ffd83dbSDimitry Andric       .lower();
13405ffd83dbSDimitry Andric 
1341fe6060f1SDimitry Andric     getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
13420b57cec5SDimitry Andric       .legalFor({S32})
13435ffd83dbSDimitry Andric       .minScalar(0, S32)
13440b57cec5SDimitry Andric       .widenScalarToNextPow2(0)
13455ffd83dbSDimitry Andric       .scalarize(0)
13465ffd83dbSDimitry Andric       .lower();
13470b57cec5SDimitry Andric   }
13480b57cec5SDimitry Andric 
13490b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_INTTOPTR)
13500b57cec5SDimitry Andric       // List the common cases
13510b57cec5SDimitry Andric       .legalForCartesianProduct(AddrSpaces64, {S64})
13520b57cec5SDimitry Andric       .legalForCartesianProduct(AddrSpaces32, {S32})
13530b57cec5SDimitry Andric       .scalarize(0)
13540b57cec5SDimitry Andric       // Accept any address space as long as the size matches
13550b57cec5SDimitry Andric       .legalIf(sameSize(0, 1))
13560b57cec5SDimitry Andric       .widenScalarIf(smallerThan(1, 0),
13570b57cec5SDimitry Andric                      [](const LegalityQuery &Query) {
1358bdd1243dSDimitry Andric                        return std::pair(
1359bdd1243dSDimitry Andric                            1, LLT::scalar(Query.Types[0].getSizeInBits()));
13600b57cec5SDimitry Andric                      })
1361bdd1243dSDimitry Andric       .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) {
1362bdd1243dSDimitry Andric         return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
13630b57cec5SDimitry Andric       });
13640b57cec5SDimitry Andric 
13650b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_PTRTOINT)
13660b57cec5SDimitry Andric       // List the common cases
13670b57cec5SDimitry Andric       .legalForCartesianProduct(AddrSpaces64, {S64})
13680b57cec5SDimitry Andric       .legalForCartesianProduct(AddrSpaces32, {S32})
13690b57cec5SDimitry Andric       .scalarize(0)
13700b57cec5SDimitry Andric       // Accept any address space as long as the size matches
13710b57cec5SDimitry Andric       .legalIf(sameSize(0, 1))
13720b57cec5SDimitry Andric       .widenScalarIf(smallerThan(0, 1),
13730b57cec5SDimitry Andric                      [](const LegalityQuery &Query) {
1374bdd1243dSDimitry Andric                        return std::pair(
1375bdd1243dSDimitry Andric                            0, LLT::scalar(Query.Types[1].getSizeInBits()));
13760b57cec5SDimitry Andric                      })
1377bdd1243dSDimitry Andric       .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
1378bdd1243dSDimitry Andric         return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
13790b57cec5SDimitry Andric       });
13800b57cec5SDimitry Andric 
13810b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
13820b57cec5SDimitry Andric     .scalarize(0)
13830b57cec5SDimitry Andric     .custom();
13840b57cec5SDimitry Andric 
13855ffd83dbSDimitry Andric   const auto needToSplitMemOp = [=](const LegalityQuery &Query,
13865ffd83dbSDimitry Andric                                     bool IsLoad) -> bool {
13878bcb0991SDimitry Andric     const LLT DstTy = Query.Types[0];
13888bcb0991SDimitry Andric 
13898bcb0991SDimitry Andric     // Split vector extloads.
1390fe6060f1SDimitry Andric     unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1391480093f4SDimitry Andric 
13928bcb0991SDimitry Andric     if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
13938bcb0991SDimitry Andric       return true;
13948bcb0991SDimitry Andric 
13958bcb0991SDimitry Andric     const LLT PtrTy = Query.Types[1];
13968bcb0991SDimitry Andric     unsigned AS = PtrTy.getAddressSpace();
139706c3fb27SDimitry Andric     if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
139806c3fb27SDimitry Andric                                       Query.MMODescrs[0].Ordering !=
139906c3fb27SDimitry Andric                                           AtomicOrdering::NotAtomic))
14008bcb0991SDimitry Andric       return true;
14018bcb0991SDimitry Andric 
14028bcb0991SDimitry Andric     // Catch weird sized loads that don't evenly divide into the access sizes
14038bcb0991SDimitry Andric     // TODO: May be able to widen depending on alignment etc.
14045ffd83dbSDimitry Andric     unsigned NumRegs = (MemSize + 31) / 32;
14055ffd83dbSDimitry Andric     if (NumRegs == 3) {
14065ffd83dbSDimitry Andric       if (!ST.hasDwordx3LoadStores())
14078bcb0991SDimitry Andric         return true;
14085ffd83dbSDimitry Andric     } else {
14095ffd83dbSDimitry Andric       // If the alignment allows, these should have been widened.
14105ffd83dbSDimitry Andric       if (!isPowerOf2_32(NumRegs))
14115ffd83dbSDimitry Andric         return true;
14125ffd83dbSDimitry Andric     }
14138bcb0991SDimitry Andric 
14148bcb0991SDimitry Andric     return false;
14158bcb0991SDimitry Andric   };
14168bcb0991SDimitry Andric 
1417e8d8bef9SDimitry Andric   unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1418e8d8bef9SDimitry Andric   unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1419e8d8bef9SDimitry Andric   unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
14208bcb0991SDimitry Andric 
14218bcb0991SDimitry Andric   // TODO: Refine based on subtargets which support unaligned access or 128-bit
14228bcb0991SDimitry Andric   // LDS
14238bcb0991SDimitry Andric   // TODO: Unsupported flat for SI.
14248bcb0991SDimitry Andric 
14258bcb0991SDimitry Andric   for (unsigned Op : {G_LOAD, G_STORE}) {
14268bcb0991SDimitry Andric     const bool IsStore = Op == G_STORE;
14278bcb0991SDimitry Andric 
14288bcb0991SDimitry Andric     auto &Actions = getActionDefinitionsBuilder(Op);
14295ffd83dbSDimitry Andric     // Explicitly list some common cases.
14305ffd83dbSDimitry Andric     // TODO: Does this help compile time at all?
1431fe6060f1SDimitry Andric     Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1432fe6060f1SDimitry Andric                                       {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1433fe6060f1SDimitry Andric                                       {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1434fe6060f1SDimitry Andric                                       {S64, GlobalPtr, S64, GlobalAlign32},
1435fe6060f1SDimitry Andric                                       {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1436fe6060f1SDimitry Andric                                       {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1437fe6060f1SDimitry Andric                                       {S32, GlobalPtr, S8, GlobalAlign8},
1438fe6060f1SDimitry Andric                                       {S32, GlobalPtr, S16, GlobalAlign16},
14398bcb0991SDimitry Andric 
1440fe6060f1SDimitry Andric                                       {S32, LocalPtr, S32, 32},
1441fe6060f1SDimitry Andric                                       {S64, LocalPtr, S64, 32},
1442fe6060f1SDimitry Andric                                       {V2S32, LocalPtr, V2S32, 32},
1443fe6060f1SDimitry Andric                                       {S32, LocalPtr, S8, 8},
1444fe6060f1SDimitry Andric                                       {S32, LocalPtr, S16, 16},
1445fe6060f1SDimitry Andric                                       {V2S16, LocalPtr, S32, 32},
14468bcb0991SDimitry Andric 
1447fe6060f1SDimitry Andric                                       {S32, PrivatePtr, S32, 32},
1448fe6060f1SDimitry Andric                                       {S32, PrivatePtr, S8, 8},
1449fe6060f1SDimitry Andric                                       {S32, PrivatePtr, S16, 16},
1450fe6060f1SDimitry Andric                                       {V2S16, PrivatePtr, S32, 32},
14518bcb0991SDimitry Andric 
1452fe6060f1SDimitry Andric                                       {S32, ConstantPtr, S32, GlobalAlign32},
1453fe6060f1SDimitry Andric                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1454fe6060f1SDimitry Andric                                       {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1455fe6060f1SDimitry Andric                                       {S64, ConstantPtr, S64, GlobalAlign32},
1456fe6060f1SDimitry Andric                                       {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
14575ffd83dbSDimitry Andric     Actions.legalIf(
14585ffd83dbSDimitry Andric       [=](const LegalityQuery &Query) -> bool {
1459fe6060f1SDimitry Andric         return isLoadStoreLegal(ST, Query);
14605ffd83dbSDimitry Andric       });
14615ffd83dbSDimitry Andric 
146206c3fb27SDimitry Andric     // The custom pointers (fat pointers, buffer resources) don't work with load
146306c3fb27SDimitry Andric     // and store at this level. Fat pointers should have been lowered to
146406c3fb27SDimitry Andric     // intrinsics before the translation to MIR.
14655f757f3fSDimitry Andric     Actions.unsupportedIf(
14665f757f3fSDimitry Andric         typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
146706c3fb27SDimitry Andric 
146806c3fb27SDimitry Andric     // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
146906c3fb27SDimitry Andric     // ptrtoint. This is needed to account for the fact that we can't have i128
147006c3fb27SDimitry Andric     // as a register class for SelectionDAG reasons.
147106c3fb27SDimitry Andric     Actions.customIf([=](const LegalityQuery &Query) -> bool {
147206c3fb27SDimitry Andric       return hasBufferRsrcWorkaround(Query.Types[0]);
147306c3fb27SDimitry Andric     });
147406c3fb27SDimitry Andric 
14755ffd83dbSDimitry Andric     // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
14765ffd83dbSDimitry Andric     // 64-bits.
14775ffd83dbSDimitry Andric     //
14785ffd83dbSDimitry Andric     // TODO: Should generalize bitcast action into coerce, which will also cover
14795ffd83dbSDimitry Andric     // inserting addrspacecasts.
14805ffd83dbSDimitry Andric     Actions.customIf(typeIs(1, Constant32Ptr));
14815ffd83dbSDimitry Andric 
14825ffd83dbSDimitry Andric     // Turn any illegal element vectors into something easier to deal
14835ffd83dbSDimitry Andric     // with. These will ultimately produce 32-bit scalar shifts to extract the
14845ffd83dbSDimitry Andric     // parts anyway.
14855ffd83dbSDimitry Andric     //
14865ffd83dbSDimitry Andric     // For odd 16-bit element vectors, prefer to split those into pieces with
14875ffd83dbSDimitry Andric     // 16-bit vector parts.
14885ffd83dbSDimitry Andric     Actions.bitcastIf(
14895ffd83dbSDimitry Andric       [=](const LegalityQuery &Query) -> bool {
1490e8d8bef9SDimitry Andric         return shouldBitcastLoadStoreType(ST, Query.Types[0],
1491fe6060f1SDimitry Andric                                           Query.MMODescrs[0].MemoryTy);
14925ffd83dbSDimitry Andric       }, bitcastToRegisterType(0));
14935ffd83dbSDimitry Andric 
1494e8d8bef9SDimitry Andric     if (!IsStore) {
1495e8d8bef9SDimitry Andric       // Widen suitably aligned loads by loading extra bytes. The standard
1496e8d8bef9SDimitry Andric       // legalization actions can't properly express widening memory operands.
1497e8d8bef9SDimitry Andric       Actions.customIf([=](const LegalityQuery &Query) -> bool {
1498e8d8bef9SDimitry Andric         return shouldWidenLoad(ST, Query, G_LOAD);
1499e8d8bef9SDimitry Andric       });
1500e8d8bef9SDimitry Andric     }
1501e8d8bef9SDimitry Andric 
1502e8d8bef9SDimitry Andric     // FIXME: load/store narrowing should be moved to lower action
15038bcb0991SDimitry Andric     Actions
15048bcb0991SDimitry Andric         .narrowScalarIf(
15058bcb0991SDimitry Andric             [=](const LegalityQuery &Query) -> bool {
15065ffd83dbSDimitry Andric               return !Query.Types[0].isVector() &&
15075ffd83dbSDimitry Andric                      needToSplitMemOp(Query, Op == G_LOAD);
15088bcb0991SDimitry Andric             },
15098bcb0991SDimitry Andric             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
15108bcb0991SDimitry Andric               const LLT DstTy = Query.Types[0];
15118bcb0991SDimitry Andric               const LLT PtrTy = Query.Types[1];
15128bcb0991SDimitry Andric 
15138bcb0991SDimitry Andric               const unsigned DstSize = DstTy.getSizeInBits();
1514fe6060f1SDimitry Andric               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
15158bcb0991SDimitry Andric 
15168bcb0991SDimitry Andric               // Split extloads.
15178bcb0991SDimitry Andric               if (DstSize > MemSize)
1518bdd1243dSDimitry Andric                 return std::pair(0, LLT::scalar(MemSize));
15198bcb0991SDimitry Andric 
152006c3fb27SDimitry Andric               unsigned MaxSize = maxSizeForAddrSpace(
152106c3fb27SDimitry Andric                   ST, PtrTy.getAddressSpace(), Op == G_LOAD,
152206c3fb27SDimitry Andric                   Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
15238bcb0991SDimitry Andric               if (MemSize > MaxSize)
1524bdd1243dSDimitry Andric                 return std::pair(0, LLT::scalar(MaxSize));
15258bcb0991SDimitry Andric 
152604eeddc0SDimitry Andric               uint64_t Align = Query.MMODescrs[0].AlignInBits;
1527bdd1243dSDimitry Andric               return std::pair(0, LLT::scalar(Align));
15288bcb0991SDimitry Andric             })
15298bcb0991SDimitry Andric         .fewerElementsIf(
15308bcb0991SDimitry Andric             [=](const LegalityQuery &Query) -> bool {
15315ffd83dbSDimitry Andric               return Query.Types[0].isVector() &&
15325ffd83dbSDimitry Andric                      needToSplitMemOp(Query, Op == G_LOAD);
15338bcb0991SDimitry Andric             },
15348bcb0991SDimitry Andric             [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
15358bcb0991SDimitry Andric               const LLT DstTy = Query.Types[0];
15368bcb0991SDimitry Andric               const LLT PtrTy = Query.Types[1];
15378bcb0991SDimitry Andric 
15388bcb0991SDimitry Andric               LLT EltTy = DstTy.getElementType();
153906c3fb27SDimitry Andric               unsigned MaxSize = maxSizeForAddrSpace(
154006c3fb27SDimitry Andric                   ST, PtrTy.getAddressSpace(), Op == G_LOAD,
154106c3fb27SDimitry Andric                   Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
15425ffd83dbSDimitry Andric 
15435ffd83dbSDimitry Andric               // FIXME: Handle widened to power of 2 results better. This ends
15445ffd83dbSDimitry Andric               // up scalarizing.
15455ffd83dbSDimitry Andric               // FIXME: 3 element stores scalarized on SI
15468bcb0991SDimitry Andric 
15478bcb0991SDimitry Andric               // Split if it's too large for the address space.
1548fe6060f1SDimitry Andric               unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1549fe6060f1SDimitry Andric               if (MemSize > MaxSize) {
15508bcb0991SDimitry Andric                 unsigned NumElts = DstTy.getNumElements();
15515ffd83dbSDimitry Andric                 unsigned EltSize = EltTy.getSizeInBits();
15525ffd83dbSDimitry Andric 
15535ffd83dbSDimitry Andric                 if (MaxSize % EltSize == 0) {
1554bdd1243dSDimitry Andric                   return std::pair(
1555fe6060f1SDimitry Andric                       0, LLT::scalarOrVector(
1556fe6060f1SDimitry Andric                              ElementCount::getFixed(MaxSize / EltSize), EltTy));
15575ffd83dbSDimitry Andric                 }
15585ffd83dbSDimitry Andric 
1559fe6060f1SDimitry Andric                 unsigned NumPieces = MemSize / MaxSize;
15608bcb0991SDimitry Andric 
15618bcb0991SDimitry Andric                 // FIXME: Refine when odd breakdowns handled
15628bcb0991SDimitry Andric                 // The scalars will need to be re-legalized.
15638bcb0991SDimitry Andric                 if (NumPieces == 1 || NumPieces >= NumElts ||
15648bcb0991SDimitry Andric                     NumElts % NumPieces != 0)
1565bdd1243dSDimitry Andric                   return std::pair(0, EltTy);
15668bcb0991SDimitry Andric 
1567bdd1243dSDimitry Andric                 return std::pair(0,
1568bdd1243dSDimitry Andric                                  LLT::fixed_vector(NumElts / NumPieces, EltTy));
15698bcb0991SDimitry Andric               }
15708bcb0991SDimitry Andric 
15715ffd83dbSDimitry Andric               // FIXME: We could probably handle weird extending loads better.
15725ffd83dbSDimitry Andric               if (DstTy.getSizeInBits() > MemSize)
1573bdd1243dSDimitry Andric                 return std::pair(0, EltTy);
15745ffd83dbSDimitry Andric 
15755ffd83dbSDimitry Andric               unsigned EltSize = EltTy.getSizeInBits();
15765ffd83dbSDimitry Andric               unsigned DstSize = DstTy.getSizeInBits();
15775ffd83dbSDimitry Andric               if (!isPowerOf2_32(DstSize)) {
15785ffd83dbSDimitry Andric                 // We're probably decomposing an odd sized store. Try to split
15795ffd83dbSDimitry Andric                 // to the widest type. TODO: Account for alignment. As-is it
15805ffd83dbSDimitry Andric                 // should be OK, since the new parts will be further legalized.
158106c3fb27SDimitry Andric                 unsigned FloorSize = llvm::bit_floor(DstSize);
1582bdd1243dSDimitry Andric                 return std::pair(
1583fe6060f1SDimitry Andric                     0, LLT::scalarOrVector(
1584fe6060f1SDimitry Andric                            ElementCount::getFixed(FloorSize / EltSize), EltTy));
15855ffd83dbSDimitry Andric               }
15865ffd83dbSDimitry Andric 
15878bcb0991SDimitry Andric               // May need relegalization for the scalars.
1588bdd1243dSDimitry Andric               return std::pair(0, EltTy);
15898bcb0991SDimitry Andric             })
1590fe6060f1SDimitry Andric     .minScalar(0, S32)
1591fe6060f1SDimitry Andric     .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
15928bcb0991SDimitry Andric     .widenScalarToNextPow2(0)
1593e8d8bef9SDimitry Andric     .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1594e8d8bef9SDimitry Andric     .lower();
15958bcb0991SDimitry Andric   }
15960b57cec5SDimitry Andric 
1597fe6060f1SDimitry Andric   // FIXME: Unaligned accesses not lowered.
15980b57cec5SDimitry Andric   auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1599fe6060f1SDimitry Andric                        .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1600fe6060f1SDimitry Andric                                                   {S32, GlobalPtr, S16, 2 * 8},
1601fe6060f1SDimitry Andric                                                   {S32, LocalPtr, S8, 8},
1602fe6060f1SDimitry Andric                                                   {S32, LocalPtr, S16, 16},
1603fe6060f1SDimitry Andric                                                   {S32, PrivatePtr, S8, 8},
1604fe6060f1SDimitry Andric                                                   {S32, PrivatePtr, S16, 16},
1605fe6060f1SDimitry Andric                                                   {S32, ConstantPtr, S8, 8},
1606fe6060f1SDimitry Andric                                                   {S32, ConstantPtr, S16, 2 * 8}})
1607fe6060f1SDimitry Andric                        .legalIf(
1608fe6060f1SDimitry Andric                          [=](const LegalityQuery &Query) -> bool {
1609fe6060f1SDimitry Andric                            return isLoadStoreLegal(ST, Query);
1610fe6060f1SDimitry Andric                          });
1611fe6060f1SDimitry Andric 
16120b57cec5SDimitry Andric   if (ST.hasFlatAddressSpace()) {
16138bcb0991SDimitry Andric     ExtLoads.legalForTypesWithMemDesc(
1614fe6060f1SDimitry Andric         {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
16150b57cec5SDimitry Andric   }
16160b57cec5SDimitry Andric 
1617fe6060f1SDimitry Andric   // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1618fe6060f1SDimitry Andric   // 64-bits.
1619fe6060f1SDimitry Andric   //
1620fe6060f1SDimitry Andric   // TODO: Should generalize bitcast action into coerce, which will also cover
1621fe6060f1SDimitry Andric   // inserting addrspacecasts.
1622fe6060f1SDimitry Andric   ExtLoads.customIf(typeIs(1, Constant32Ptr));
1623fe6060f1SDimitry Andric 
16240b57cec5SDimitry Andric   ExtLoads.clampScalar(0, S32, S32)
16250b57cec5SDimitry Andric           .widenScalarToNextPow2(0)
16260b57cec5SDimitry Andric           .lower();
16270b57cec5SDimitry Andric 
16280b57cec5SDimitry Andric   auto &Atomics = getActionDefinitionsBuilder(
16290b57cec5SDimitry Andric     {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
16300b57cec5SDimitry Andric      G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
16310b57cec5SDimitry Andric      G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
163206c3fb27SDimitry Andric      G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
16330b57cec5SDimitry Andric     .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1634e8d8bef9SDimitry Andric                {S64, GlobalPtr}, {S64, LocalPtr},
1635e8d8bef9SDimitry Andric                {S32, RegionPtr}, {S64, RegionPtr}});
16360b57cec5SDimitry Andric   if (ST.hasFlatAddressSpace()) {
16370b57cec5SDimitry Andric     Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
16380b57cec5SDimitry Andric   }
16390b57cec5SDimitry Andric 
1640*0fca6ea1SDimitry Andric   // TODO: v2bf16 operations, and fat buffer pointer support.
1641fe6060f1SDimitry Andric   auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1642*0fca6ea1SDimitry Andric   if (ST.hasLDSFPAtomicAddF32()) {
1643fe6060f1SDimitry Andric     Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1644*0fca6ea1SDimitry Andric     if (ST.hasLdsAtomicAddF64())
1645fe6060f1SDimitry Andric       Atomic.legalFor({{S64, LocalPtr}});
164606c3fb27SDimitry Andric     if (ST.hasAtomicDsPkAdd16Insts())
1647*0fca6ea1SDimitry Andric       Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}});
16485ffd83dbSDimitry Andric   }
1649fe6060f1SDimitry Andric   if (ST.hasAtomicFaddInsts())
1650fe6060f1SDimitry Andric     Atomic.legalFor({{S32, GlobalPtr}});
1651bdd1243dSDimitry Andric   if (ST.hasFlatAtomicFaddF32Inst())
1652bdd1243dSDimitry Andric     Atomic.legalFor({{S32, FlatPtr}});
16538bcb0991SDimitry Andric 
165404eeddc0SDimitry Andric   if (ST.hasGFX90AInsts()) {
165504eeddc0SDimitry Andric     // These are legal with some caveats, and should have undergone expansion in
165604eeddc0SDimitry Andric     // the IR in most situations
165704eeddc0SDimitry Andric     // TODO: Move atomic expansion into legalizer
165804eeddc0SDimitry Andric     Atomic.legalFor({
165904eeddc0SDimitry Andric         {S32, GlobalPtr},
166004eeddc0SDimitry Andric         {S64, GlobalPtr},
166104eeddc0SDimitry Andric         {S64, FlatPtr}
166204eeddc0SDimitry Andric       });
166304eeddc0SDimitry Andric   }
166404eeddc0SDimitry Andric 
1665*0fca6ea1SDimitry Andric   if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
1666*0fca6ea1SDimitry Andric       ST.hasAtomicBufferGlobalPkAddF16Insts())
1667*0fca6ea1SDimitry Andric     Atomic.legalFor({{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
1668*0fca6ea1SDimitry Andric   if (ST.hasAtomicGlobalPkAddBF16Inst())
1669*0fca6ea1SDimitry Andric     Atomic.legalFor({{V2BF16, GlobalPtr}});
1670*0fca6ea1SDimitry Andric   if (ST.hasAtomicFlatPkAdd16Insts())
1671*0fca6ea1SDimitry Andric     Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
1672*0fca6ea1SDimitry Andric 
1673*0fca6ea1SDimitry Andric 
1674*0fca6ea1SDimitry Andric   // Most of the legalization work here is done by AtomicExpand. We could
1675*0fca6ea1SDimitry Andric   // probably use a simpler legality rule that just assumes anything is OK.
1676*0fca6ea1SDimitry Andric   auto &AtomicFMinFMax =
1677*0fca6ea1SDimitry Andric     getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
1678*0fca6ea1SDimitry Andric     .legalFor({{F32, LocalPtr}, {F64, LocalPtr}});
1679*0fca6ea1SDimitry Andric 
1680*0fca6ea1SDimitry Andric   if (ST.hasAtomicFMinFMaxF32GlobalInsts())
1681*0fca6ea1SDimitry Andric     AtomicFMinFMax.legalFor({{F32, GlobalPtr},{F32, BufferFatPtr}});
1682*0fca6ea1SDimitry Andric   if (ST.hasAtomicFMinFMaxF64GlobalInsts())
1683*0fca6ea1SDimitry Andric     AtomicFMinFMax.legalFor({{F64, GlobalPtr}, {F64, BufferFatPtr}});
1684*0fca6ea1SDimitry Andric   if (ST.hasAtomicFMinFMaxF32FlatInsts())
1685*0fca6ea1SDimitry Andric     AtomicFMinFMax.legalFor({F32, FlatPtr});
1686*0fca6ea1SDimitry Andric   if (ST.hasAtomicFMinFMaxF64FlatInsts())
1687*0fca6ea1SDimitry Andric     AtomicFMinFMax.legalFor({F64, FlatPtr});
1688*0fca6ea1SDimitry Andric 
1689480093f4SDimitry Andric   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1690480093f4SDimitry Andric   // demarshalling
1691480093f4SDimitry Andric   getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1692480093f4SDimitry Andric     .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1693480093f4SDimitry Andric                 {S32, FlatPtr}, {S64, FlatPtr}})
1694480093f4SDimitry Andric     .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1695480093f4SDimitry Andric                {S32, RegionPtr}, {S64, RegionPtr}});
16960b57cec5SDimitry Andric   // TODO: Pointer types, any 32-bit or 64-bit vector
1697480093f4SDimitry Andric 
1698480093f4SDimitry Andric   // Condition should be s32 for scalar, s1 for vector.
16990b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_SELECT)
1700fe6060f1SDimitry Andric       .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1701fe6060f1SDimitry Andric                                  LocalPtr, FlatPtr, PrivatePtr,
1702fe6060f1SDimitry Andric                                  LLT::fixed_vector(2, LocalPtr),
1703fe6060f1SDimitry Andric                                  LLT::fixed_vector(2, PrivatePtr)},
1704fe6060f1SDimitry Andric                                 {S1, S32})
17050b57cec5SDimitry Andric       .clampScalar(0, S16, S64)
17065ffd83dbSDimitry Andric       .scalarize(1)
17070b57cec5SDimitry Andric       .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
17080b57cec5SDimitry Andric       .fewerElementsIf(numElementsNotEven(0), scalarize(0))
17090b57cec5SDimitry Andric       .clampMaxNumElements(0, S32, 2)
17100b57cec5SDimitry Andric       .clampMaxNumElements(0, LocalPtr, 2)
17110b57cec5SDimitry Andric       .clampMaxNumElements(0, PrivatePtr, 2)
17120b57cec5SDimitry Andric       .scalarize(0)
17130b57cec5SDimitry Andric       .widenScalarToNextPow2(0)
1714480093f4SDimitry Andric       .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
17150b57cec5SDimitry Andric 
17160b57cec5SDimitry Andric   // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
17170b57cec5SDimitry Andric   // be more flexible with the shift amount type.
17180b57cec5SDimitry Andric   auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
17190b57cec5SDimitry Andric     .legalFor({{S32, S32}, {S64, S32}});
17200b57cec5SDimitry Andric   if (ST.has16BitInsts()) {
17210b57cec5SDimitry Andric     if (ST.hasVOP3PInsts()) {
17225ffd83dbSDimitry Andric       Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
17230b57cec5SDimitry Andric             .clampMaxNumElements(0, S16, 2);
17240b57cec5SDimitry Andric     } else
17255ffd83dbSDimitry Andric       Shifts.legalFor({{S16, S16}});
17260b57cec5SDimitry Andric 
17275ffd83dbSDimitry Andric     // TODO: Support 16-bit shift amounts for all types
17285ffd83dbSDimitry Andric     Shifts.widenScalarIf(
17295ffd83dbSDimitry Andric       [=](const LegalityQuery &Query) {
17305ffd83dbSDimitry Andric         // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
17315ffd83dbSDimitry Andric         // 32-bit amount.
17325ffd83dbSDimitry Andric         const LLT ValTy = Query.Types[0];
17335ffd83dbSDimitry Andric         const LLT AmountTy = Query.Types[1];
17345ffd83dbSDimitry Andric         return ValTy.getSizeInBits() <= 16 &&
17355ffd83dbSDimitry Andric                AmountTy.getSizeInBits() < 16;
17365ffd83dbSDimitry Andric       }, changeTo(1, S16));
17375ffd83dbSDimitry Andric     Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1738480093f4SDimitry Andric     Shifts.clampScalar(1, S32, S32);
17390b57cec5SDimitry Andric     Shifts.widenScalarToNextPow2(0, 16);
174004eeddc0SDimitry Andric     Shifts.clampScalar(0, S16, S64);
1741e8d8bef9SDimitry Andric 
1742e8d8bef9SDimitry Andric     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1743e8d8bef9SDimitry Andric       .minScalar(0, S16)
1744e8d8bef9SDimitry Andric       .scalarize(0)
1745e8d8bef9SDimitry Andric       .lower();
17460b57cec5SDimitry Andric   } else {
17470b57cec5SDimitry Andric     // Make sure we legalize the shift amount type first, as the general
17480b57cec5SDimitry Andric     // expansion for the shifted type will produce much worse code if it hasn't
17490b57cec5SDimitry Andric     // been truncated already.
17500b57cec5SDimitry Andric     Shifts.clampScalar(1, S32, S32);
17510b57cec5SDimitry Andric     Shifts.widenScalarToNextPow2(0, 32);
175204eeddc0SDimitry Andric     Shifts.clampScalar(0, S32, S64);
1753e8d8bef9SDimitry Andric 
1754e8d8bef9SDimitry Andric     getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1755e8d8bef9SDimitry Andric       .minScalar(0, S32)
1756e8d8bef9SDimitry Andric       .scalarize(0)
1757e8d8bef9SDimitry Andric       .lower();
17580b57cec5SDimitry Andric   }
17590b57cec5SDimitry Andric   Shifts.scalarize(0);
17600b57cec5SDimitry Andric 
17610b57cec5SDimitry Andric   for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
17620b57cec5SDimitry Andric     unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
17630b57cec5SDimitry Andric     unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
17640b57cec5SDimitry Andric     unsigned IdxTypeIdx = 2;
17650b57cec5SDimitry Andric 
17660b57cec5SDimitry Andric     getActionDefinitionsBuilder(Op)
17670b57cec5SDimitry Andric       .customIf([=](const LegalityQuery &Query) {
17680b57cec5SDimitry Andric           const LLT EltTy = Query.Types[EltTypeIdx];
17690b57cec5SDimitry Andric           const LLT VecTy = Query.Types[VecTypeIdx];
17700b57cec5SDimitry Andric           const LLT IdxTy = Query.Types[IdxTypeIdx];
1771e8d8bef9SDimitry Andric           const unsigned EltSize = EltTy.getSizeInBits();
177206c3fb27SDimitry Andric           const bool isLegalVecType =
177306c3fb27SDimitry Andric               !!SIRegisterInfo::getSGPRClassForBitWidth(VecTy.getSizeInBits());
177406c3fb27SDimitry Andric           // Address space 8 pointers are 128-bit wide values, but the logic
177506c3fb27SDimitry Andric           // below will try to bitcast them to 2N x s64, which will fail.
177606c3fb27SDimitry Andric           // Therefore, as an intermediate step, wrap extracts/insertions from a
177706c3fb27SDimitry Andric           // ptrtoint-ing the vector and scalar arguments (or inttoptring the
177806c3fb27SDimitry Andric           // extraction result) in order to produce a vector operation that can
177906c3fb27SDimitry Andric           // be handled by the logic below.
178006c3fb27SDimitry Andric           if (EltTy.isPointer() && EltSize > 64)
178106c3fb27SDimitry Andric             return true;
1782e8d8bef9SDimitry Andric           return (EltSize == 32 || EltSize == 64) &&
17830b57cec5SDimitry Andric                   VecTy.getSizeInBits() % 32 == 0 &&
17845ffd83dbSDimitry Andric                   VecTy.getSizeInBits() <= MaxRegisterSize &&
178506c3fb27SDimitry Andric                   IdxTy.getSizeInBits() == 32 &&
178606c3fb27SDimitry Andric                   isLegalVecType;
17870b57cec5SDimitry Andric         })
1788e8d8bef9SDimitry Andric       .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1789e8d8bef9SDimitry Andric                  bitcastToVectorElement32(VecTypeIdx))
1790e8d8bef9SDimitry Andric       //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1791e8d8bef9SDimitry Andric       .bitcastIf(
1792e8d8bef9SDimitry Andric         all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
1793e8d8bef9SDimitry Andric         [=](const LegalityQuery &Query) {
1794e8d8bef9SDimitry Andric           // For > 64-bit element types, try to turn this into a 64-bit
1795e8d8bef9SDimitry Andric           // element vector since we may be able to do better indexing
1796e8d8bef9SDimitry Andric           // if this is scalar. If not, fall back to 32.
1797e8d8bef9SDimitry Andric           const LLT EltTy = Query.Types[EltTypeIdx];
1798e8d8bef9SDimitry Andric           const LLT VecTy = Query.Types[VecTypeIdx];
1799e8d8bef9SDimitry Andric           const unsigned DstEltSize = EltTy.getSizeInBits();
1800e8d8bef9SDimitry Andric           const unsigned VecSize = VecTy.getSizeInBits();
1801e8d8bef9SDimitry Andric 
1802e8d8bef9SDimitry Andric           const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
1803bdd1243dSDimitry Andric           return std::pair(
1804fe6060f1SDimitry Andric               VecTypeIdx,
1805fe6060f1SDimitry Andric               LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize));
1806e8d8bef9SDimitry Andric         })
18070b57cec5SDimitry Andric       .clampScalar(EltTypeIdx, S32, S64)
18080b57cec5SDimitry Andric       .clampScalar(VecTypeIdx, S32, S64)
1809e8d8bef9SDimitry Andric       .clampScalar(IdxTypeIdx, S32, S32)
1810e8d8bef9SDimitry Andric       .clampMaxNumElements(VecTypeIdx, S32, 32)
1811e8d8bef9SDimitry Andric       // TODO: Clamp elements for 64-bit vectors?
181206c3fb27SDimitry Andric       .moreElementsIf(
181306c3fb27SDimitry Andric         isIllegalRegisterType(VecTypeIdx),
181406c3fb27SDimitry Andric         moreElementsToNextExistingRegClass(VecTypeIdx))
1815e8d8bef9SDimitry Andric       // It should only be necessary with variable indexes.
1816e8d8bef9SDimitry Andric       // As a last resort, lower to the stack
1817e8d8bef9SDimitry Andric       .lower();
18180b57cec5SDimitry Andric   }
18190b57cec5SDimitry Andric 
18200b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
18210b57cec5SDimitry Andric     .unsupportedIf([=](const LegalityQuery &Query) {
18220b57cec5SDimitry Andric         const LLT &EltTy = Query.Types[1].getElementType();
18230b57cec5SDimitry Andric         return Query.Types[0] != EltTy;
18240b57cec5SDimitry Andric       });
18250b57cec5SDimitry Andric 
18260b57cec5SDimitry Andric   for (unsigned Op : {G_EXTRACT, G_INSERT}) {
18270b57cec5SDimitry Andric     unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
18280b57cec5SDimitry Andric     unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
18290b57cec5SDimitry Andric 
18300b57cec5SDimitry Andric     // FIXME: Doesn't handle extract of illegal sizes.
18310b57cec5SDimitry Andric     getActionDefinitionsBuilder(Op)
18328bcb0991SDimitry Andric       .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
18330eae32dcSDimitry Andric       .lowerIf([=](const LegalityQuery &Query) {
18340eae32dcSDimitry Andric           // Sub-vector(or single element) insert and extract.
18350eae32dcSDimitry Andric           // TODO: verify immediate offset here since lower only works with
18360eae32dcSDimitry Andric           // whole elements.
18370eae32dcSDimitry Andric           const LLT BigTy = Query.Types[BigTyIdx];
18380eae32dcSDimitry Andric           return BigTy.isVector();
18390eae32dcSDimitry Andric         })
18408bcb0991SDimitry Andric       // FIXME: Multiples of 16 should not be legal.
18410b57cec5SDimitry Andric       .legalIf([=](const LegalityQuery &Query) {
18420b57cec5SDimitry Andric           const LLT BigTy = Query.Types[BigTyIdx];
18430b57cec5SDimitry Andric           const LLT LitTy = Query.Types[LitTyIdx];
18440b57cec5SDimitry Andric           return (BigTy.getSizeInBits() % 32 == 0) &&
18450b57cec5SDimitry Andric                  (LitTy.getSizeInBits() % 16 == 0);
18460b57cec5SDimitry Andric         })
18470b57cec5SDimitry Andric       .widenScalarIf(
18480b57cec5SDimitry Andric         [=](const LegalityQuery &Query) {
18490b57cec5SDimitry Andric           const LLT BigTy = Query.Types[BigTyIdx];
18500b57cec5SDimitry Andric           return (BigTy.getScalarSizeInBits() < 16);
18510b57cec5SDimitry Andric         },
18520b57cec5SDimitry Andric         LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
18530b57cec5SDimitry Andric       .widenScalarIf(
18540b57cec5SDimitry Andric         [=](const LegalityQuery &Query) {
18550b57cec5SDimitry Andric           const LLT LitTy = Query.Types[LitTyIdx];
18560b57cec5SDimitry Andric           return (LitTy.getScalarSizeInBits() < 16);
18570b57cec5SDimitry Andric         },
18580b57cec5SDimitry Andric         LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
18590b57cec5SDimitry Andric       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
18600b57cec5SDimitry Andric       .widenScalarToNextPow2(BigTyIdx, 32);
18610b57cec5SDimitry Andric 
18620b57cec5SDimitry Andric   }
18630b57cec5SDimitry Andric 
18648bcb0991SDimitry Andric   auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
18650b57cec5SDimitry Andric     .legalForCartesianProduct(AllS32Vectors, {S32})
18660b57cec5SDimitry Andric     .legalForCartesianProduct(AllS64Vectors, {S64})
18678bcb0991SDimitry Andric     .clampNumElements(0, V16S32, V32S32)
18688bcb0991SDimitry Andric     .clampNumElements(0, V2S64, V16S64)
186906c3fb27SDimitry Andric     .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16))
187006c3fb27SDimitry Andric     .moreElementsIf(
187106c3fb27SDimitry Andric       isIllegalRegisterType(0),
187206c3fb27SDimitry Andric       moreElementsToNextExistingRegClass(0));
18738bcb0991SDimitry Andric 
18748bcb0991SDimitry Andric   if (ST.hasScalarPackInsts()) {
18755ffd83dbSDimitry Andric     BuildVector
18765ffd83dbSDimitry Andric       // FIXME: Should probably widen s1 vectors straight to s32
18775ffd83dbSDimitry Andric       .minScalarOrElt(0, S16)
1878bdd1243dSDimitry Andric       .minScalar(1, S16);
18795ffd83dbSDimitry Andric 
18808bcb0991SDimitry Andric     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
18818bcb0991SDimitry Andric       .legalFor({V2S16, S32})
18828bcb0991SDimitry Andric       .lower();
18838bcb0991SDimitry Andric   } else {
18845ffd83dbSDimitry Andric     BuildVector.customFor({V2S16, S16});
18855ffd83dbSDimitry Andric     BuildVector.minScalarOrElt(0, S32);
18865ffd83dbSDimitry Andric 
18878bcb0991SDimitry Andric     getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
18885ffd83dbSDimitry Andric       .customFor({V2S16, S32})
18898bcb0991SDimitry Andric       .lower();
18908bcb0991SDimitry Andric   }
18918bcb0991SDimitry Andric 
18925ffd83dbSDimitry Andric   BuildVector.legalIf(isRegisterType(0));
18935ffd83dbSDimitry Andric 
18945ffd83dbSDimitry Andric   // FIXME: Clamp maximum size
18950b57cec5SDimitry Andric   getActionDefinitionsBuilder(G_CONCAT_VECTORS)
1896e8d8bef9SDimitry Andric     .legalIf(all(isRegisterType(0), isRegisterType(1)))
1897e8d8bef9SDimitry Andric     .clampMaxNumElements(0, S32, 32)
1898e8d8bef9SDimitry Andric     .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
1899e8d8bef9SDimitry Andric     .clampMaxNumElements(0, S16, 64);
19000b57cec5SDimitry Andric 
19018bcb0991SDimitry Andric   getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
19028bcb0991SDimitry Andric 
19030b57cec5SDimitry Andric   // Merge/Unmerge
19040b57cec5SDimitry Andric   for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
19050b57cec5SDimitry Andric     unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
19060b57cec5SDimitry Andric     unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
19070b57cec5SDimitry Andric 
19080b57cec5SDimitry Andric     auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
19095ffd83dbSDimitry Andric       const LLT Ty = Query.Types[TypeIdx];
19100b57cec5SDimitry Andric       if (Ty.isVector()) {
19110b57cec5SDimitry Andric         const LLT &EltTy = Ty.getElementType();
19125ffd83dbSDimitry Andric         if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
19130b57cec5SDimitry Andric           return true;
191406c3fb27SDimitry Andric         if (!llvm::has_single_bit<uint32_t>(EltTy.getSizeInBits()))
19150b57cec5SDimitry Andric           return true;
19160b57cec5SDimitry Andric       }
19170b57cec5SDimitry Andric       return false;
19180b57cec5SDimitry Andric     };
19190b57cec5SDimitry Andric 
19208bcb0991SDimitry Andric     auto &Builder = getActionDefinitionsBuilder(Op)
1921e8d8bef9SDimitry Andric       .legalIf(all(isRegisterType(0), isRegisterType(1)))
19225ffd83dbSDimitry Andric       .lowerFor({{S16, V2S16}})
19235ffd83dbSDimitry Andric       .lowerIf([=](const LegalityQuery &Query) {
19245ffd83dbSDimitry Andric           const LLT BigTy = Query.Types[BigTyIdx];
19255ffd83dbSDimitry Andric           return BigTy.getSizeInBits() == 32;
19265ffd83dbSDimitry Andric         })
19275ffd83dbSDimitry Andric       // Try to widen to s16 first for small types.
19285ffd83dbSDimitry Andric       // TODO: Only do this on targets with legal s16 shifts
19295ffd83dbSDimitry Andric       .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
19300b57cec5SDimitry Andric       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
19318bcb0991SDimitry Andric       .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
19328bcb0991SDimitry Andric       .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
19338bcb0991SDimitry Andric                            elementTypeIs(1, S16)),
19348bcb0991SDimitry Andric                        changeTo(1, V2S16))
19355ffd83dbSDimitry Andric       // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
19365ffd83dbSDimitry Andric       // worth considering the multiples of 64 since 2*192 and 2*384 are not
19375ffd83dbSDimitry Andric       // valid.
19385ffd83dbSDimitry Andric       .clampScalar(LitTyIdx, S32, S512)
19395ffd83dbSDimitry Andric       .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
19400b57cec5SDimitry Andric       // Break up vectors with weird elements into scalars
19410b57cec5SDimitry Andric       .fewerElementsIf(
19425ffd83dbSDimitry Andric         [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
19430b57cec5SDimitry Andric         scalarize(0))
19440b57cec5SDimitry Andric       .fewerElementsIf(
19455ffd83dbSDimitry Andric         [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
19460b57cec5SDimitry Andric         scalarize(1))
19475ffd83dbSDimitry Andric       .clampScalar(BigTyIdx, S32, MaxScalar);
19488bcb0991SDimitry Andric 
19498bcb0991SDimitry Andric     if (Op == G_MERGE_VALUES) {
19508bcb0991SDimitry Andric       Builder.widenScalarIf(
19518bcb0991SDimitry Andric         // TODO: Use 16-bit shifts if legal for 8-bit values?
19520b57cec5SDimitry Andric         [=](const LegalityQuery &Query) {
19538bcb0991SDimitry Andric           const LLT Ty = Query.Types[LitTyIdx];
19548bcb0991SDimitry Andric           return Ty.getSizeInBits() < 32;
19558bcb0991SDimitry Andric         },
19568bcb0991SDimitry Andric         changeTo(LitTyIdx, S32));
19578bcb0991SDimitry Andric     }
19588bcb0991SDimitry Andric 
19598bcb0991SDimitry Andric     Builder.widenScalarIf(
19608bcb0991SDimitry Andric       [=](const LegalityQuery &Query) {
19618bcb0991SDimitry Andric         const LLT Ty = Query.Types[BigTyIdx];
196206c3fb27SDimitry Andric         return Ty.getSizeInBits() % 16 != 0;
19630b57cec5SDimitry Andric       },
19640b57cec5SDimitry Andric       [=](const LegalityQuery &Query) {
19650b57cec5SDimitry Andric         // Pick the next power of 2, or a multiple of 64 over 128.
19660b57cec5SDimitry Andric         // Whichever is smaller.
19670b57cec5SDimitry Andric         const LLT &Ty = Query.Types[BigTyIdx];
19680b57cec5SDimitry Andric         unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
19690b57cec5SDimitry Andric         if (NewSizeInBits >= 256) {
19700b57cec5SDimitry Andric           unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
19710b57cec5SDimitry Andric           if (RoundedTo < NewSizeInBits)
19720b57cec5SDimitry Andric             NewSizeInBits = RoundedTo;
19730b57cec5SDimitry Andric         }
1974bdd1243dSDimitry Andric         return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
19750b57cec5SDimitry Andric       })
19760b57cec5SDimitry Andric       // Any vectors left are the wrong size. Scalarize them.
19770b57cec5SDimitry Andric       .scalarize(0)
19780b57cec5SDimitry Andric       .scalarize(1);
19790b57cec5SDimitry Andric   }
19800b57cec5SDimitry Andric 
19815ffd83dbSDimitry Andric   // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
19825ffd83dbSDimitry Andric   // RegBankSelect.
19835ffd83dbSDimitry Andric   auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
19845ffd83dbSDimitry Andric     .legalFor({{S32}, {S64}});
19858bcb0991SDimitry Andric 
19865ffd83dbSDimitry Andric   if (ST.hasVOP3PInsts()) {
19875ffd83dbSDimitry Andric     SextInReg.lowerFor({{V2S16}})
19885ffd83dbSDimitry Andric       // Prefer to reduce vector widths for 16-bit vectors before lowering, to
19895ffd83dbSDimitry Andric       // get more vector shift opportunities, since we'll get those when
19905ffd83dbSDimitry Andric       // expanded.
19910eae32dcSDimitry Andric       .clampMaxNumElementsStrict(0, S16, 2);
19925ffd83dbSDimitry Andric   } else if (ST.has16BitInsts()) {
19935ffd83dbSDimitry Andric     SextInReg.lowerFor({{S32}, {S64}, {S16}});
19945ffd83dbSDimitry Andric   } else {
19955ffd83dbSDimitry Andric     // Prefer to promote to s32 before lowering if we don't have 16-bit
19965ffd83dbSDimitry Andric     // shifts. This avoid a lot of intermediate truncate and extend operations.
19975ffd83dbSDimitry Andric     SextInReg.lowerFor({{S32}, {S64}});
19985ffd83dbSDimitry Andric   }
19995ffd83dbSDimitry Andric 
20005ffd83dbSDimitry Andric   SextInReg
20015ffd83dbSDimitry Andric     .scalarize(0)
20025ffd83dbSDimitry Andric     .clampScalar(0, S32, S64)
20035ffd83dbSDimitry Andric     .lower();
20045ffd83dbSDimitry Andric 
2005349cc55cSDimitry Andric   getActionDefinitionsBuilder({G_ROTR, G_ROTL})
2006349cc55cSDimitry Andric     .scalarize(0)
2007349cc55cSDimitry Andric     .lower();
2008349cc55cSDimitry Andric 
2009fe6060f1SDimitry Andric   // TODO: Only Try to form v2s16 with legal packed instructions.
20105ffd83dbSDimitry Andric   getActionDefinitionsBuilder(G_FSHR)
20115ffd83dbSDimitry Andric     .legalFor({{S32, S32}})
2012fe6060f1SDimitry Andric     .lowerFor({{V2S16, V2S16}})
20130eae32dcSDimitry Andric     .clampMaxNumElementsStrict(0, S16, 2)
20145ffd83dbSDimitry Andric     .scalarize(0)
20155ffd83dbSDimitry Andric     .lower();
2016480093f4SDimitry Andric 
2017fe6060f1SDimitry Andric   if (ST.hasVOP3PInsts()) {
2018fe6060f1SDimitry Andric     getActionDefinitionsBuilder(G_FSHL)
2019fe6060f1SDimitry Andric       .lowerFor({{V2S16, V2S16}})
20200eae32dcSDimitry Andric       .clampMaxNumElementsStrict(0, S16, 2)
2021fe6060f1SDimitry Andric       .scalarize(0)
2022fe6060f1SDimitry Andric       .lower();
2023fe6060f1SDimitry Andric   } else {
2024fe6060f1SDimitry Andric     getActionDefinitionsBuilder(G_FSHL)
2025fe6060f1SDimitry Andric       .scalarize(0)
2026fe6060f1SDimitry Andric       .lower();
2027fe6060f1SDimitry Andric   }
2028fe6060f1SDimitry Andric 
2029480093f4SDimitry Andric   getActionDefinitionsBuilder(G_READCYCLECOUNTER)
2030480093f4SDimitry Andric     .legalFor({S64});
2031480093f4SDimitry Andric 
2032*0fca6ea1SDimitry Andric   getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({S64});
2033*0fca6ea1SDimitry Andric 
2034e8d8bef9SDimitry Andric   getActionDefinitionsBuilder(G_FENCE)
2035e8d8bef9SDimitry Andric     .alwaysLegal();
2036e8d8bef9SDimitry Andric 
2037fe6060f1SDimitry Andric   getActionDefinitionsBuilder({G_SMULO, G_UMULO})
2038fe6060f1SDimitry Andric       .scalarize(0)
2039fe6060f1SDimitry Andric       .minScalar(0, S32)
2040fe6060f1SDimitry Andric       .lower();
2041fe6060f1SDimitry Andric 
2042fe6060f1SDimitry Andric   getActionDefinitionsBuilder({G_SBFX, G_UBFX})
2043fe6060f1SDimitry Andric       .legalFor({{S32, S32}, {S64, S32}})
2044fe6060f1SDimitry Andric       .clampScalar(1, S32, S32)
2045fe6060f1SDimitry Andric       .clampScalar(0, S32, S64)
2046fe6060f1SDimitry Andric       .widenScalarToNextPow2(0)
2047fe6060f1SDimitry Andric       .scalarize(0);
2048fe6060f1SDimitry Andric 
20495f757f3fSDimitry Andric   getActionDefinitionsBuilder(
20505f757f3fSDimitry Andric       {// TODO: Verify V_BFI_B32 is generated from expanded bit ops
20515ffd83dbSDimitry Andric        G_FCOPYSIGN,
20525ffd83dbSDimitry Andric 
20535f757f3fSDimitry Andric        G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
20545f757f3fSDimitry Andric        G_READ_REGISTER, G_WRITE_REGISTER,
20555ffd83dbSDimitry Andric 
20565f757f3fSDimitry Andric        G_SADDO, G_SSUBO})
20575f757f3fSDimitry Andric       .lower();
20585ffd83dbSDimitry Andric 
20595f757f3fSDimitry Andric   if (ST.hasIEEEMinMax()) {
20605f757f3fSDimitry Andric     getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
20615f757f3fSDimitry Andric         .legalFor(FPTypesPK16)
20625f757f3fSDimitry Andric         .clampMaxNumElements(0, S16, 2)
20635f757f3fSDimitry Andric         .scalarize(0);
20645f757f3fSDimitry Andric   } else {
20655ffd83dbSDimitry Andric     // TODO: Implement
20665f757f3fSDimitry Andric     getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
20675f757f3fSDimitry Andric   }
20685ffd83dbSDimitry Andric 
2069349cc55cSDimitry Andric   getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
2070349cc55cSDimitry Andric       .lower();
2071349cc55cSDimitry Andric 
2072*0fca6ea1SDimitry Andric   getActionDefinitionsBuilder({G_TRAP, G_DEBUGTRAP}).custom();
2073*0fca6ea1SDimitry Andric 
2074480093f4SDimitry Andric   getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
20755ffd83dbSDimitry Andric         G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
2076480093f4SDimitry Andric         G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
2077480093f4SDimitry Andric     .unsupported();
2078480093f4SDimitry Andric 
20795f757f3fSDimitry Andric   getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal();
20805f757f3fSDimitry Andric 
2081fe6060f1SDimitry Andric   getLegacyLegalizerInfo().computeTables();
20820b57cec5SDimitry Andric   verify(*ST.getInstrInfo());
20830b57cec5SDimitry Andric }
20840b57cec5SDimitry Andric 
/// Central dispatcher for every opcode marked .custom() / .customFor() in the
/// constructor. Routes the instruction to the matching legalize* helper; on
/// success the helper replaces or erases \p MI itself. Returns false for any
/// opcode with no custom handler here.
bool AMDGPULegalizerInfo::legalizeCustom(
    LegalizerHelper &Helper, MachineInstr &MI,
    LostDebugLocObserver &LocObserver) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
    return legalizeFroundeven(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_FREM:
    return legalizeFrem(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  // The trailing bool selects signed (true) vs unsigned (false) conversion.
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(Helper, MI);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_ZEXTLOAD:
    return legalizeLoad(Helper, MI);
  case TargetOpcode::G_STORE:
    return legalizeStore(Helper, MI);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_FFREXP:
    return legalizeFFREXP(MI, MRI, B);
  case TargetOpcode::G_FSQRT:
    return legalizeFSQRT(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_UDIVREM:
    return legalizeUnsignedDIV_REM(MI, MRI, B);
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_SDIVREM:
    return legalizeSignedDIV_REM(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  case TargetOpcode::G_FLOG2:
    return legalizeFlog2(MI, B);
  case TargetOpcode::G_FLOG:
  case TargetOpcode::G_FLOG10:
    return legalizeFlogCommon(MI, B);
  case TargetOpcode::G_FEXP2:
    return legalizeFExp2(MI, B);
  case TargetOpcode::G_FEXP:
  case TargetOpcode::G_FEXP10:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FPOW:
    return legalizeFPow(MI, B);
  case TargetOpcode::G_FFLOOR:
    return legalizeFFloor(MI, MRI, B);
  case TargetOpcode::G_BUILD_VECTOR:
  case TargetOpcode::G_BUILD_VECTOR_TRUNC:
    return legalizeBuildVector(MI, MRI, B);
  case TargetOpcode::G_MUL:
    return legalizeMul(Helper, MI);
  case TargetOpcode::G_CTLZ:
  case TargetOpcode::G_CTTZ:
    return legalizeCTLZ_CTTZ(MI, MRI, B);
  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
    return legalizeCTLZ_ZERO_UNDEF(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
    return legalizeFPTruncRound(MI, B);
  case TargetOpcode::G_STACKSAVE:
    return legalizeStackSave(MI, B);
  case TargetOpcode::G_GET_FPENV:
    return legalizeGetFPEnv(MI, MRI, B);
  case TargetOpcode::G_SET_FPENV:
    return legalizeSetFPEnv(MI, MRI, B);
  case TargetOpcode::G_TRAP:
    return legalizeTrap(MI, MRI, B);
  case TargetOpcode::G_DEBUGTRAP:
    return legalizeDebugTrap(MI, MRI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}
21900b57cec5SDimitry Andric 
/// Materialize the aperture (high 32 bits of the 64-bit flat address) for the
/// given segment address space (\p AS must be LOCAL or PRIVATE). Three
/// strategies, tried in order:
///   1. Subtargets with aperture registers: read SRC_SHARED_BASE /
///      SRC_PRIVATE_BASE and take the high half.
///   2. Code object v5+: load the base from the implicit kernel arguments.
///   3. Otherwise: load the aperture_base_hi field out of the amd_queue_t
///      pointed to by the queue pointer.
/// Returns an invalid Register if the needed input value cannot be loaded.
Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // Note: this register is somewhat broken. When used as a 32-bit operand,
    // it only returns zeroes. The real value is in the upper 32 bits.
    // Thus, we must extract the high 32 bits.
    const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
                                       ? AMDGPU::SRC_SHARED_BASE
                                       : AMDGPU::SRC_PRIVATE_BASE;
    // FIXME: It would be more natural to emit a COPY here, but then copy
    // coalescing would kick in and it would think it's okay to use the "HI"
    // subregister (instead of extracting the HI 32 bits) which is an artificial
    // (unusable) register.
    //  Register TableGen definitions would need an overhaul to get rid of the
    //  artificial "HI" aperture registers and prevent this kind of issue from
    //  happening.
    Register Dst = MRI.createGenericVirtualRegister(S64);
    MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
    B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
    // The aperture is the high half of the 64-bit register.
    return B.buildUnmerge(S32, Dst).getReg(1);
  }

  // TODO: can we be smarter about machine pointer info?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  Register LoadAddr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
  // For code object version 5, private_base and shared_base are passed through
  // implicit kernargs.
  if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
      AMDGPU::AMDHSA_COV5) {
    AMDGPUTargetLowering::ImplicitParameter Param =
        AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
                                      : AMDGPUTargetLowering::PRIVATE_BASE;
    // Byte offset of the base within the implicit kernarg block.
    uint64_t Offset =
        ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);

    Register KernargPtrReg = MRI.createGenericVirtualRegister(
        LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

    if (!loadInputValue(KernargPtrReg, B,
                        AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
      return Register();

    // The kernarg block is invariant for the function's lifetime, so the
    // load can be freely reordered/CSE'd.
    MachineMemOperand *MMO = MF.getMachineMemOperand(
        PtrInfo,
        MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
            MachineMemOperand::MOInvariant,
        LLT::scalar(32), commonAlignment(Align(64), Offset));

    // Pointer address
    B.buildPtrAdd(LoadAddr, KernargPtrReg,
                  B.buildConstant(LLT::scalar(64), Offset).getReg(0));
    // Load address
    return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo,
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      LLT::scalar(32), commonAlignment(Align(64), StructOffset));

  B.buildPtrAdd(LoadAddr, QueuePtr,
                B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}
22760b57cec5SDimitry Andric 
227704eeddc0SDimitry Andric /// Return true if the value is a known valid address, such that a null check is
227804eeddc0SDimitry Andric /// not necessary.
isKnownNonNull(Register Val,MachineRegisterInfo & MRI,const AMDGPUTargetMachine & TM,unsigned AddrSpace)227904eeddc0SDimitry Andric static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
228004eeddc0SDimitry Andric                            const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
228104eeddc0SDimitry Andric   MachineInstr *Def = MRI.getVRegDef(Val);
228204eeddc0SDimitry Andric   switch (Def->getOpcode()) {
228304eeddc0SDimitry Andric   case AMDGPU::G_FRAME_INDEX:
228404eeddc0SDimitry Andric   case AMDGPU::G_GLOBAL_VALUE:
228504eeddc0SDimitry Andric   case AMDGPU::G_BLOCK_ADDR:
228604eeddc0SDimitry Andric     return true;
228704eeddc0SDimitry Andric   case AMDGPU::G_CONSTANT: {
228804eeddc0SDimitry Andric     const ConstantInt *CI = Def->getOperand(1).getCImm();
228904eeddc0SDimitry Andric     return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
229004eeddc0SDimitry Andric   }
229104eeddc0SDimitry Andric   default:
229204eeddc0SDimitry Andric     return false;
229304eeddc0SDimitry Andric   }
229404eeddc0SDimitry Andric 
229504eeddc0SDimitry Andric   return false;
229604eeddc0SDimitry Andric }
229704eeddc0SDimitry Andric 
/// Lower an address-space cast to explicit pointer arithmetic / null handling.
/// Handles flat<->segment (LOCAL/PRIVATE) casts, 32-bit constant address
/// casts, and no-op casts; anything else is diagnosed as invalid and replaced
/// with undef.
bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  // MI can either be a G_ADDRSPACE_CAST or a
  // G_INTRINSIC @llvm.amdgcn.addrspacecast.nonnull
  assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
         (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
                                     Intrinsic::amdgcn_addrspacecast_nonnull));

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  // Intrinsic form carries the intrinsic ID in operand 1, so the source
  // pointer is operand 2 there; operand 1 for a plain G_ADDRSPACE_CAST.
  Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
                                     : MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  // Casts the target considers no-ops become plain bitcasts in place.
  if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  // flat -> local/private: keep the low 32 bits; a null flat pointer must map
  // to the segment's null value unless the source is provably non-null.
  if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
      (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
       DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
    // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
    // G_ADDRSPACE_CAST we need to guess.
    if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
      // Extract low 32-bits of the pointer.
      B.buildExtract(Dst, Src, 0);
      MI.eraseFromParent();
      return true;
    }

    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    // Extract low 32-bits of the pointer.
    auto PtrLo32 = B.buildExtract(DstTy, Src, 0);

    // Select the segment null when the flat source is null.
    auto CmpRes =
        B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  // local/private -> flat: pair the 32-bit segment offset with the segment's
  // aperture (high half), then null-check unless provably non-null.
  if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
      (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
       SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
    Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
    if (!ApertureReg.isValid())
      return false;

    // Coerce the type of the low half of the result so we can use merge_values.
    Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);

    // TODO: Should we allow mismatched types but matching sizes in merges to
    // avoid the ptrtoint?
    auto BuildPtr = B.buildMergeLikeInstr(DstTy, {SrcAsInt, ApertureReg});

    // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
    // G_ADDRSPACE_CAST we need to guess.
    if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
      B.buildCopy(Dst, BuildPtr);
      MI.eraseFromParent();
      return true;
    }

    auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
    auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

    // Null segment pointers must become the flat null value.
    auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
                              SegmentNull.getReg(0));

    B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);

    MI.eraseFromParent();
    return true;
  }

  // 64-bit pointer -> 32-bit constant address space: drop the high half.
  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
      SrcTy.getSizeInBits() == 64) {
    // Truncate.
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  // 32-bit constant address space -> 64-bit pointer: rebuild the high half
  // from the function's recorded high address bits.
  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
      DstTy.getSizeInBits() == 64) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();
    auto PtrLo = B.buildPtrToInt(S32, Src);
    auto HighAddr = B.buildConstant(S32, AddrHiVal);
    B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
    MI.eraseFromParent();
    return true;
  }

  // Anything else is not a supported cast: emit a diagnostic and produce
  // undef rather than failing legalization outright.
  DiagnosticInfoUnsupported InvalidAddrSpaceCast(
      MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());

  LLVMContext &Ctx = MF.getFunction().getContext();
  Ctx.diagnose(InvalidAddrSpaceCast);
  B.buildUndef(Dst);
  MI.eraseFromParent();
  return true;
}
legalizeFroundeven(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const24215f757f3fSDimitry Andric bool AMDGPULegalizerInfo::legalizeFroundeven(MachineInstr &MI,
24225f757f3fSDimitry Andric                                              MachineRegisterInfo &MRI,
24238bcb0991SDimitry Andric                                              MachineIRBuilder &B) const {
24240b57cec5SDimitry Andric   Register Src = MI.getOperand(1).getReg();
24250b57cec5SDimitry Andric   LLT Ty = MRI.getType(Src);
24260b57cec5SDimitry Andric   assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
24270b57cec5SDimitry Andric 
24280b57cec5SDimitry Andric   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
24290b57cec5SDimitry Andric   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
24300b57cec5SDimitry Andric 
24318bcb0991SDimitry Andric   auto C1 = B.buildFConstant(Ty, C1Val);
24328bcb0991SDimitry Andric   auto CopySign = B.buildFCopysign(Ty, C1, Src);
24330b57cec5SDimitry Andric 
24340b57cec5SDimitry Andric   // TODO: Should this propagate fast-math-flags?
24358bcb0991SDimitry Andric   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
24368bcb0991SDimitry Andric   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
24370b57cec5SDimitry Andric 
24388bcb0991SDimitry Andric   auto C2 = B.buildFConstant(Ty, C2Val);
24398bcb0991SDimitry Andric   auto Fabs = B.buildFAbs(Ty, Src);
24400b57cec5SDimitry Andric 
24418bcb0991SDimitry Andric   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
24428bcb0991SDimitry Andric   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2443e8d8bef9SDimitry Andric   MI.eraseFromParent();
24440b57cec5SDimitry Andric   return true;
24450b57cec5SDimitry Andric }
24460b57cec5SDimitry Andric 
legalizeFceil(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const24470b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeFceil(
24480b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
24490b57cec5SDimitry Andric   MachineIRBuilder &B) const {
24500b57cec5SDimitry Andric 
24510b57cec5SDimitry Andric   const LLT S1 = LLT::scalar(1);
24520b57cec5SDimitry Andric   const LLT S64 = LLT::scalar(64);
24530b57cec5SDimitry Andric 
24540b57cec5SDimitry Andric   Register Src = MI.getOperand(1).getReg();
24550b57cec5SDimitry Andric   assert(MRI.getType(Src) == S64);
24560b57cec5SDimitry Andric 
24570b57cec5SDimitry Andric   // result = trunc(src)
24580b57cec5SDimitry Andric   // if (src > 0.0 && src != result)
24590b57cec5SDimitry Andric   //   result += 1.0
24600b57cec5SDimitry Andric 
24615ffd83dbSDimitry Andric   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
24620b57cec5SDimitry Andric 
24630b57cec5SDimitry Andric   const auto Zero = B.buildFConstant(S64, 0.0);
24640b57cec5SDimitry Andric   const auto One = B.buildFConstant(S64, 1.0);
24650b57cec5SDimitry Andric   auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
24660b57cec5SDimitry Andric   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
24670b57cec5SDimitry Andric   auto And = B.buildAnd(S1, Lt0, NeTrunc);
24680b57cec5SDimitry Andric   auto Add = B.buildSelect(S64, And, One, Zero);
24690b57cec5SDimitry Andric 
24700b57cec5SDimitry Andric   // TODO: Should this propagate fast-math-flags?
24710b57cec5SDimitry Andric   B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
247204eeddc0SDimitry Andric   MI.eraseFromParent();
24730b57cec5SDimitry Andric   return true;
24740b57cec5SDimitry Andric }
24750b57cec5SDimitry Andric 
legalizeFrem(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const2476e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeFrem(
2477e8d8bef9SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
2478e8d8bef9SDimitry Andric   MachineIRBuilder &B) const {
2479e8d8bef9SDimitry Andric     Register DstReg = MI.getOperand(0).getReg();
2480e8d8bef9SDimitry Andric     Register Src0Reg = MI.getOperand(1).getReg();
2481e8d8bef9SDimitry Andric     Register Src1Reg = MI.getOperand(2).getReg();
2482e8d8bef9SDimitry Andric     auto Flags = MI.getFlags();
2483e8d8bef9SDimitry Andric     LLT Ty = MRI.getType(DstReg);
2484e8d8bef9SDimitry Andric 
2485e8d8bef9SDimitry Andric     auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2486e8d8bef9SDimitry Andric     auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2487e8d8bef9SDimitry Andric     auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2488e8d8bef9SDimitry Andric     B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2489e8d8bef9SDimitry Andric     MI.eraseFromParent();
2490e8d8bef9SDimitry Andric     return true;
2491e8d8bef9SDimitry Andric }
2492e8d8bef9SDimitry Andric 
extractF64Exponent(Register Hi,MachineIRBuilder & B)2493e8d8bef9SDimitry Andric static MachineInstrBuilder extractF64Exponent(Register Hi,
24940b57cec5SDimitry Andric                                               MachineIRBuilder &B) {
24950b57cec5SDimitry Andric   const unsigned FractBits = 52;
24960b57cec5SDimitry Andric   const unsigned ExpBits = 11;
24970b57cec5SDimitry Andric   LLT S32 = LLT::scalar(32);
24980b57cec5SDimitry Andric 
24990b57cec5SDimitry Andric   auto Const0 = B.buildConstant(S32, FractBits - 32);
25000b57cec5SDimitry Andric   auto Const1 = B.buildConstant(S32, ExpBits);
25010b57cec5SDimitry Andric 
25025f757f3fSDimitry Andric   auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
2503e8d8bef9SDimitry Andric                      .addUse(Hi)
25040b57cec5SDimitry Andric                      .addUse(Const0.getReg(0))
25050b57cec5SDimitry Andric                      .addUse(Const1.getReg(0));
25060b57cec5SDimitry Andric 
25070b57cec5SDimitry Andric   return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
25080b57cec5SDimitry Andric }
25090b57cec5SDimitry Andric 
// Lower G_INTRINSIC_TRUNC for f64 by clearing fractional mantissa bits,
// selected from the unbiased exponent of the input.
bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});

  // Arithmetic-shift the fraction mask right by the exponent: the bits that
  // remain set are the fractional bits that must be cleared for this
  // magnitude. Inverting gives the keep-mask applied to the source.
  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  // Exponent < 0: |src| < 1, so the result is just the (signed) zero.
  // Exponent > 51: every mantissa bit is integral, so src is already exact.
  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
}
25540b57cec5SDimitry Andric 
// Lower G_SITOFP/G_UITOFP with a 64-bit integer source by splitting it into
// 32-bit halves: for f64 results, convert each half and recombine with
// ldexp; for f32 results, pre-normalize so no significant bits are lost.
bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  auto ThirtyTwo = B.buildConstant(S32, 32);

  if (MRI.getType(Dst) == S64) {
    // result = hi_converted * 2^32 + lo_converted. Only the high half
    // carries the sign; the low half is always converted as unsigned.
    auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
                        : B.buildUITOFP(S64, Unmerge.getReg(1));

    auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
    auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);

    // TODO: Should this propagate fast-math-flags?
    B.buildFAdd(Dst, LdExp, CvtLo);
    MI.eraseFromParent();
    return true;
  }

  assert(MRI.getType(Dst) == S32);

  auto One = B.buildConstant(S32, 1);

  // Compute how far the source can be shifted left so the significant bits
  // land in the high word before the 32-bit conversion.
  MachineInstrBuilder ShAmt;
  if (Signed) {
    auto ThirtyOne = B.buildConstant(S32, 31);
    auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
    auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
    auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
    // amdgcn_sffbh counts leading bits matching the sign bit; clamp the
    // resulting shift so significant bits are never shifted out.
    auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
                  .addUse(Unmerge.getReg(1));
    auto LS2 = B.buildSub(S32, LS, One);
    ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
  } else
    ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
  auto Norm = B.buildShl(S64, Src, ShAmt);
  auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
  // Fold any nonzero low bits into bit 0 of the high word as a sticky bit
  // so rounding of the 32-bit conversion stays correct.
  auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
  auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
  auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
  // Undo the normalization shift in the floating-point domain.
  auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
  B.buildFLdexp(Dst, FVal, Scale);
  MI.eraseFromParent();
  return true;
}
2609349cc55cSDimitry Andric 
26105ffd83dbSDimitry Andric // TODO: Copied from DAG implementation. Verify logic and document how this
26115ffd83dbSDimitry Andric // actually works.
legalizeFPTOI(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B,bool Signed) const2612fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2613fe6060f1SDimitry Andric                                         MachineRegisterInfo &MRI,
2614fe6060f1SDimitry Andric                                         MachineIRBuilder &B,
2615fe6060f1SDimitry Andric                                         bool Signed) const {
26165ffd83dbSDimitry Andric 
26175ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
26185ffd83dbSDimitry Andric   Register Src = MI.getOperand(1).getReg();
26195ffd83dbSDimitry Andric 
26205ffd83dbSDimitry Andric   const LLT S64 = LLT::scalar(64);
26215ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
26225ffd83dbSDimitry Andric 
2623fe6060f1SDimitry Andric   const LLT SrcLT = MRI.getType(Src);
2624fe6060f1SDimitry Andric   assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
26255ffd83dbSDimitry Andric 
26265ffd83dbSDimitry Andric   unsigned Flags = MI.getFlags();
26275ffd83dbSDimitry Andric 
2628fe6060f1SDimitry Andric   // The basic idea of converting a floating point number into a pair of 32-bit
2629fe6060f1SDimitry Andric   // integers is illustrated as follows:
2630fe6060f1SDimitry Andric   //
2631fe6060f1SDimitry Andric   //     tf := trunc(val);
2632fe6060f1SDimitry Andric   //    hif := floor(tf * 2^-32);
2633fe6060f1SDimitry Andric   //    lof := tf - hif * 2^32; // lof is always positive due to floor.
2634fe6060f1SDimitry Andric   //     hi := fptoi(hif);
2635fe6060f1SDimitry Andric   //     lo := fptoi(lof);
2636fe6060f1SDimitry Andric   //
2637fe6060f1SDimitry Andric   auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2638fe6060f1SDimitry Andric   MachineInstrBuilder Sign;
2639fe6060f1SDimitry Andric   if (Signed && SrcLT == S32) {
2640fe6060f1SDimitry Andric     // However, a 32-bit floating point number has only 23 bits mantissa and
2641fe6060f1SDimitry Andric     // it's not enough to hold all the significant bits of `lof` if val is
2642fe6060f1SDimitry Andric     // negative. To avoid the loss of precision, We need to take the absolute
2643fe6060f1SDimitry Andric     // value after truncating and flip the result back based on the original
2644fe6060f1SDimitry Andric     // signedness.
2645fe6060f1SDimitry Andric     Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2646fe6060f1SDimitry Andric     Trunc = B.buildFAbs(S32, Trunc, Flags);
2647fe6060f1SDimitry Andric   }
2648fe6060f1SDimitry Andric   MachineInstrBuilder K0, K1;
2649fe6060f1SDimitry Andric   if (SrcLT == S64) {
265006c3fb27SDimitry Andric     K0 = B.buildFConstant(
265106c3fb27SDimitry Andric         S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
265206c3fb27SDimitry Andric     K1 = B.buildFConstant(
265306c3fb27SDimitry Andric         S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2654fe6060f1SDimitry Andric   } else {
265506c3fb27SDimitry Andric     K0 = B.buildFConstant(
265606c3fb27SDimitry Andric         S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
265706c3fb27SDimitry Andric     K1 = B.buildFConstant(
265806c3fb27SDimitry Andric         S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2659fe6060f1SDimitry Andric   }
26605ffd83dbSDimitry Andric 
2661fe6060f1SDimitry Andric   auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2662fe6060f1SDimitry Andric   auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2663fe6060f1SDimitry Andric   auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
26645ffd83dbSDimitry Andric 
2665fe6060f1SDimitry Andric   auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2666fe6060f1SDimitry Andric                                      : B.buildFPTOUI(S32, FloorMul);
26675ffd83dbSDimitry Andric   auto Lo = B.buildFPTOUI(S32, Fma);
26685ffd83dbSDimitry Andric 
2669fe6060f1SDimitry Andric   if (Signed && SrcLT == S32) {
2670fe6060f1SDimitry Andric     // Flip the result based on the signedness, which is either all 0s or 1s.
2671bdd1243dSDimitry Andric     Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
2672fe6060f1SDimitry Andric     // r := xor({lo, hi}, sign) - sign;
2673bdd1243dSDimitry Andric     B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
2674bdd1243dSDimitry Andric                Sign);
2675fe6060f1SDimitry Andric   } else
2676bdd1243dSDimitry Andric     B.buildMergeLikeInstr(Dst, {Lo, Hi});
26775ffd83dbSDimitry Andric   MI.eraseFromParent();
26785ffd83dbSDimitry Andric 
26795ffd83dbSDimitry Andric   return true;
26805ffd83dbSDimitry Andric }
26815ffd83dbSDimitry Andric 
legalizeMinNumMaxNum(LegalizerHelper & Helper,MachineInstr & MI) const26825ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
26835ffd83dbSDimitry Andric                                                MachineInstr &MI) const {
26845ffd83dbSDimitry Andric   MachineFunction &MF = Helper.MIRBuilder.getMF();
26850b57cec5SDimitry Andric   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
26860b57cec5SDimitry Andric 
26870b57cec5SDimitry Andric   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
26880b57cec5SDimitry Andric                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
26890b57cec5SDimitry Andric 
26900b57cec5SDimitry Andric   // With ieee_mode disabled, the instructions have the correct behavior
26910b57cec5SDimitry Andric   // already for G_FMINNUM/G_FMAXNUM
26920b57cec5SDimitry Andric   if (!MFI->getMode().IEEE)
26930b57cec5SDimitry Andric     return !IsIEEEOp;
26940b57cec5SDimitry Andric 
26950b57cec5SDimitry Andric   if (IsIEEEOp)
26960b57cec5SDimitry Andric     return true;
26970b57cec5SDimitry Andric 
26980b57cec5SDimitry Andric   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
26990b57cec5SDimitry Andric }
27000b57cec5SDimitry Andric 
// Legalize G_EXTRACT_VECTOR_ELT: fold constant indices into an unmerge +
// copy (or undef when out of bounds); dynamic indices are left for
// register-indexing selection.
bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Dst));

  // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
  // but we can't go directly to that logic because you can't bitcast a vector
  // of pointers to a vector of integers. Therefore, introduce an intermediate
  // vector of integers using ptrtoint (and inttoptr on the output) in order to
  // drive the legalization forward.
  if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
    LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
    LLT IntVecTy = VecTy.changeElementType(IntTy);

    auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
    auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
    B.buildIntToPtr(Dst, IntElt);

    MI.eraseFromParent();
    return true;
  }

  // FIXME: Artifact combiner probably should have replaced the truncated
  // constant before this, so we shouldn't need
  // getIConstantVRegValWithLookThrough.
  std::optional<ValueAndVReg> MaybeIdxVal =
      getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
  if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
    return true;
  const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();

  // In-bounds constant index: pick the element out of an unmerge.
  // Out-of-bounds constant index: the result is undefined.
  if (IdxVal < VecTy.getNumElements()) {
    auto Unmerge = B.buildUnmerge(EltTy, Vec);
    B.buildCopy(Dst, Unmerge.getReg(IdxVal));
  } else {
    B.buildUndef(Dst);
  }

  MI.eraseFromParent();
  return true;
}
27510b57cec5SDimitry Andric 
// Legalize G_INSERT_VECTOR_ELT: fold constant indices into an unmerge /
// re-merge with the element substituted (or undef when out of bounds);
// dynamic indices are left for register-indexing selection.
bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();
  Register Ins = MI.getOperand(2).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Ins));

  // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
  // but we can't go directly to that logic because you can't bitcast a vector
  // of pointers to a vector of integers. Therefore, make the pointer vector
  // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
  // new value, and then inttoptr the result vector back. This will then allow
  // the rest of legalization to take over.
  if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
    LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
    LLT IntVecTy = VecTy.changeElementType(IntTy);

    auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
    auto IntIns = B.buildPtrToInt(IntTy, Ins);
    auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
                                                 MI.getOperand(3));
    B.buildIntToPtr(Dst, IntVecDest);
    MI.eraseFromParent();
    return true;
  }

  // FIXME: Artifact combiner probably should have replaced the truncated
  // constant before this, so we shouldn't need
  // getIConstantVRegValWithLookThrough.
  std::optional<ValueAndVReg> MaybeIdxVal =
      getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
  if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
    return true;

  const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();

  // In-bounds constant index: split the vector into scalars, replace the
  // selected element, and merge back. Out of bounds: result is undefined.
  unsigned NumElts = VecTy.getNumElements();
  if (IdxVal < NumElts) {
    SmallVector<Register, 8> SrcRegs;
    for (unsigned i = 0; i < NumElts; ++i)
      SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
    B.buildUnmerge(SrcRegs, Vec);

    SrcRegs[IdxVal] = MI.getOperand(2).getReg();
    B.buildMergeLikeInstr(Dst, SrcRegs);
  } else {
    B.buildUndef(Dst);
  }

  MI.eraseFromParent();
  return true;
}
28120b57cec5SDimitry Andric 
legalizeSinCos(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const28138bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeSinCos(
28148bcb0991SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
28158bcb0991SDimitry Andric   MachineIRBuilder &B) const {
28168bcb0991SDimitry Andric 
28178bcb0991SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
28188bcb0991SDimitry Andric   Register SrcReg = MI.getOperand(1).getReg();
28198bcb0991SDimitry Andric   LLT Ty = MRI.getType(DstReg);
28208bcb0991SDimitry Andric   unsigned Flags = MI.getFlags();
28218bcb0991SDimitry Andric 
28228bcb0991SDimitry Andric   Register TrigVal;
28235ffd83dbSDimitry Andric   auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
28248bcb0991SDimitry Andric   if (ST.hasTrigReducedRange()) {
28258bcb0991SDimitry Andric     auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
28265f757f3fSDimitry Andric     TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
28278bcb0991SDimitry Andric                   .addUse(MulVal.getReg(0))
28285f757f3fSDimitry Andric                   .setMIFlags(Flags)
28295f757f3fSDimitry Andric                   .getReg(0);
28308bcb0991SDimitry Andric   } else
28318bcb0991SDimitry Andric     TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
28328bcb0991SDimitry Andric 
28338bcb0991SDimitry Andric   Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
28348bcb0991SDimitry Andric     Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
28355f757f3fSDimitry Andric   B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg))
28368bcb0991SDimitry Andric       .addUse(TrigVal)
28378bcb0991SDimitry Andric       .setMIFlags(Flags);
28388bcb0991SDimitry Andric   MI.eraseFromParent();
28398bcb0991SDimitry Andric   return true;
28408bcb0991SDimitry Andric }
28418bcb0991SDimitry Andric 
// Emit a pc-relative address computation for \p GV into \p DstReg using
// SI_PC_ADD_REL_OFFSET. \p GAFlags selects the relocation kind for the low
// half; the matching high-half flag is GAFlags + 1.
bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
                                                  MachineIRBuilder &B,
                                                  const GlobalValue *GV,
                                                  int64_t Offset,
                                                  unsigned GAFlags) const {
  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
  // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
  // to the following code sequence:
  //
  // For constant address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol
  //   s_addc_u32 s1, s1, 0
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   a fixup or relocation is emitted to replace $symbol with a literal
  //   constant, which is a pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // For global address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   fixups or relocations are emitted to replace $symbol@*@lo and
  //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
  //   which is a 64-bit pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.

  LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);

  // The pc-relative computation is always 64-bit wide; for a 32-bit result
  // compute into a temporary and extract the low half at the end.
  Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
    B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

  MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
    .addDef(PCReg);

  MIB.addGlobalAddress(GV, Offset, GAFlags);
  if (GAFlags == SIInstrInfo::MO_NONE)
    MIB.addImm(0);
  else
    // GAFlags + 1 is the corresponding @hi relocation variant.
    MIB.addGlobalAddress(GV, Offset, GAFlags + 1);

  if (!B.getMRI()->getRegClassOrNull(PCReg))
    B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

  if (PtrTy.getSizeInBits() == 32)
    B.buildExtract(DstReg, PCReg, 0);
  return true;
}
28938bcb0991SDimitry Andric 
28945f757f3fSDimitry Andric // Emit a ABS32_LO / ABS32_HI relocation stub.
buildAbsGlobalAddress(Register DstReg,LLT PtrTy,MachineIRBuilder & B,const GlobalValue * GV,MachineRegisterInfo & MRI) const28955f757f3fSDimitry Andric void AMDGPULegalizerInfo::buildAbsGlobalAddress(
28965f757f3fSDimitry Andric     Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
28975f757f3fSDimitry Andric     MachineRegisterInfo &MRI) const {
28985f757f3fSDimitry Andric   bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;
28995f757f3fSDimitry Andric 
29005f757f3fSDimitry Andric   LLT S32 = LLT::scalar(32);
29015f757f3fSDimitry Andric 
29025f757f3fSDimitry Andric   // Use the destination directly, if and only if we store the lower address
29035f757f3fSDimitry Andric   // part only and we don't have a register class being set.
29045f757f3fSDimitry Andric   Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
29055f757f3fSDimitry Andric                         ? DstReg
29065f757f3fSDimitry Andric                         : MRI.createGenericVirtualRegister(S32);
29075f757f3fSDimitry Andric 
29085f757f3fSDimitry Andric   if (!MRI.getRegClassOrNull(AddrLo))
29095f757f3fSDimitry Andric     MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
29105f757f3fSDimitry Andric 
29115f757f3fSDimitry Andric   // Write the lower half.
29125f757f3fSDimitry Andric   B.buildInstr(AMDGPU::S_MOV_B32)
29135f757f3fSDimitry Andric       .addDef(AddrLo)
29145f757f3fSDimitry Andric       .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
29155f757f3fSDimitry Andric 
29165f757f3fSDimitry Andric   // If required, write the upper half as well.
29175f757f3fSDimitry Andric   if (RequiresHighHalf) {
29185f757f3fSDimitry Andric     assert(PtrTy.getSizeInBits() == 64 &&
29195f757f3fSDimitry Andric            "Must provide a 64-bit pointer type!");
29205f757f3fSDimitry Andric 
29215f757f3fSDimitry Andric     Register AddrHi = MRI.createGenericVirtualRegister(S32);
29225f757f3fSDimitry Andric     MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
29235f757f3fSDimitry Andric 
29245f757f3fSDimitry Andric     B.buildInstr(AMDGPU::S_MOV_B32)
29255f757f3fSDimitry Andric         .addDef(AddrHi)
29265f757f3fSDimitry Andric         .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_HI);
29275f757f3fSDimitry Andric 
29285f757f3fSDimitry Andric     // Use the destination directly, if and only if we don't have a register
29295f757f3fSDimitry Andric     // class being set.
29305f757f3fSDimitry Andric     Register AddrDst = !MRI.getRegClassOrNull(DstReg)
29315f757f3fSDimitry Andric                            ? DstReg
29325f757f3fSDimitry Andric                            : MRI.createGenericVirtualRegister(LLT::scalar(64));
29335f757f3fSDimitry Andric 
29345f757f3fSDimitry Andric     if (!MRI.getRegClassOrNull(AddrDst))
29355f757f3fSDimitry Andric       MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
29365f757f3fSDimitry Andric 
29375f757f3fSDimitry Andric     B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
29385f757f3fSDimitry Andric 
29395f757f3fSDimitry Andric     // If we created a new register for the destination, cast the result into
29405f757f3fSDimitry Andric     // the final output.
29415f757f3fSDimitry Andric     if (AddrDst != DstReg)
29425f757f3fSDimitry Andric       B.buildCast(DstReg, AddrDst);
29435f757f3fSDimitry Andric   } else if (AddrLo != DstReg) {
29445f757f3fSDimitry Andric     // If we created a new register for the destination, cast the result into
29455f757f3fSDimitry Andric     // the final output.
29465f757f3fSDimitry Andric     B.buildCast(DstReg, AddrLo);
29475f757f3fSDimitry Andric   }
29485f757f3fSDimitry Andric }
29495f757f3fSDimitry Andric 
// Custom legalization for G_GLOBAL_VALUE. Picks an addressing strategy based
// on the address space of the result pointer and the target OS/relocation
// model: LDS globals become frame-info-allocated constants (with diagnostics
// for unsupported uses), PAL/Mesa use absolute addresses, and everything else
// uses PC-relative addressing, either directly or through a GOT load.
// Returns true on success (the instruction is either erased and replaced, or
// annotated in place).
bool AMDGPULegalizerInfo::legalizeGlobalValue(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned AS = Ty.getAddressSpace();

  const GlobalValue *GV = MI.getOperand(1).getGlobal();
  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    // LDS can only be materialized in (module-)entry functions; the named
    // "llvm.amdgcn.module.lds" global is the one exception carried through
    // the module LDS lowering.
    if (!MFI->isModuleEntryFunction() &&
        GV->getName() != "llvm.amdgcn.module.lds") {
      const Function &Fn = MF.getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
        Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
        DS_Warning);
      Fn.getContext().diagnose(BadLDSDecl);

      // We currently don't have a way to correctly allocate LDS objects that
      // aren't directly associated with a kernel. We do force inlining of
      // functions that use local objects. However, if these dead functions are
      // not eliminated, we don't want a compile time error. Just emit a warning
      // and a trap, since there should be no callable path here.
      B.buildTrap();
      B.buildUndef(DstReg);
      MI.eraseFromParent();
      return true;
    }

    // TODO: We could emit code to handle the initialization somewhere.
    // We ignore the initializer for now and legalize it to allow selection.
    // The initializer will anyway get errored out during assembly emission.
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->shouldUseLDSConstAddress(GV)) {
      // Keep the G_GLOBAL_VALUE and tag the operand for an absolute low-32
      // relocation instead of folding to a constant address.
      MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
      return true; // Leave in place;
    }

    if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
      Type *Ty = GV->getValueType();
      // HIP uses an unsized array `extern __shared__ T s[]` or similar
      // zero-sized type in other languages to declare the dynamic shared
      // memory which size is not known at the compile time. They will be
      // allocated by the runtime and placed directly after the static
      // allocated ones. They all share the same offset.
      if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
        // Adjust alignment for that dynamic shared memory array.
        MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV));
        LLT S32 = LLT::scalar(32);
        // The dynamic LDS block starts right after the statically allocated
        // LDS, i.e. at amdgcn.groupstaticsize.
        auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
        B.buildIntToPtr(DstReg, Sz);
        MI.eraseFromParent();
        return true;
      }
    }

    // Statically allocate the LDS global and replace the address with the
    // resulting constant offset.
    B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
                                                   *cast<GlobalVariable>(GV)));
    MI.eraseFromParent();
    return true;
  }

  // PAL and Mesa use absolute addressing for globals.
  if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
    buildAbsGlobalAddress(DstReg, Ty, B, GV, MRI);
    MI.eraseFromParent();
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  // Otherwise go through the GOT: compute the GOT slot address PC-relatively
  // and load the real address from it (invariant, dereferenceable load).
  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
      MachinePointerInfo::getGOT(MF),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      LoadTy, Align(8));

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
  } else
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  return true;
}
30568bcb0991SDimitry Andric 
widenToNextPowerOf2(LLT Ty)3057e8d8bef9SDimitry Andric static LLT widenToNextPowerOf2(LLT Ty) {
3058e8d8bef9SDimitry Andric   if (Ty.isVector())
3059fe6060f1SDimitry Andric     return Ty.changeElementCount(
3060fe6060f1SDimitry Andric         ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
3061e8d8bef9SDimitry Andric   return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
3062e8d8bef9SDimitry Andric }
3063e8d8bef9SDimitry Andric 
// Custom legalization for loads: rewrites 32-bit constant-address-space
// pointers to 64-bit ones, works around buffer-resource result types, and
// widens under-aligned / non-power-of-2 G_LOADs to a wider power-of-2 load
// followed by a trunc/extract. Returns true if the instruction was changed.
bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
                                       MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();
  GISelChangeObserver &Observer = Helper.Observer;

  Register PtrReg = MI.getOperand(1).getReg();
  LLT PtrTy = MRI.getType(PtrReg);
  unsigned AddrSpace = PtrTy.getAddressSpace();

  if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Promote the 32-bit constant pointer to the 64-bit constant address
    // space and retarget the load at the cast.
    LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
    auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
    Observer.changingInstr(MI);
    MI.getOperand(1).setReg(Cast.getReg(0));
    Observer.changedInstr(MI);
    return true;
  }

  // The remaining transforms only apply to plain G_LOAD (not extloads etc.).
  if (MI.getOpcode() != AMDGPU::G_LOAD)
    return false;

  Register ValReg = MI.getOperand(0).getReg();
  LLT ValTy = MRI.getType(ValReg);

  if (hasBufferRsrcWorkaround(ValTy)) {
    Observer.changingInstr(MI);
    castBufferRsrcFromV4I32(MI, B, MRI, 0);
    Observer.changedInstr(MI);
    return true;
  }

  MachineMemOperand *MMO = *MI.memoperands_begin();
  const unsigned ValSize = ValTy.getSizeInBits();
  const LLT MemTy = MMO->getMemoryType();
  const Align MemAlign = MMO->getAlign();
  const unsigned MemSize = MemTy.getSizeInBits();
  const uint64_t AlignInBits = 8 * MemAlign.value();

  // Widen non-power-of-2 loads to the alignment if needed
  if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
    const unsigned WideMemSize = PowerOf2Ceil(MemSize);

    // This was already the correct extending load result type, so just adjust
    // the memory type.
    if (WideMemSize == ValSize) {
      MachineFunction &MF = B.getMF();

      MachineMemOperand *WideMMO =
          MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
      Observer.changingInstr(MI);
      MI.setMemRefs(MF, {WideMMO});
      Observer.changedInstr(MI);
      return true;
    }

    // Don't bother handling edge case that should probably never be produced.
    if (ValSize > WideMemSize)
      return false;

    LLT WideTy = widenToNextPowerOf2(ValTy);

    Register WideLoad;
    if (!WideTy.isVector()) {
      // Scalar: load wide, then truncate down to the requested width.
      WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
      B.buildTrunc(ValReg, WideLoad).getReg(0);
    } else {
      // Extract the subvector.

      if (isRegisterType(ValTy)) {
        // If this a case where G_EXTRACT is legal, use it.
        // (e.g. <3 x s32> -> <4 x s32>)
        WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
        B.buildExtract(ValReg, WideLoad, 0);
      } else {
        // For cases where the widened type isn't a nice register value, unmerge
        // from a widened register (e.g. <3 x s16> -> <4 x s16>)
        WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
        B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
      }
    }

    MI.eraseFromParent();
    return true;
  }

  return false;
}
3152e8d8bef9SDimitry Andric 
legalizeStore(LegalizerHelper & Helper,MachineInstr & MI) const315306c3fb27SDimitry Andric bool AMDGPULegalizerInfo::legalizeStore(LegalizerHelper &Helper,
315406c3fb27SDimitry Andric                                         MachineInstr &MI) const {
315506c3fb27SDimitry Andric   MachineIRBuilder &B = Helper.MIRBuilder;
315606c3fb27SDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
315706c3fb27SDimitry Andric   GISelChangeObserver &Observer = Helper.Observer;
315806c3fb27SDimitry Andric 
315906c3fb27SDimitry Andric   Register DataReg = MI.getOperand(0).getReg();
316006c3fb27SDimitry Andric   LLT DataTy = MRI.getType(DataReg);
316106c3fb27SDimitry Andric 
316206c3fb27SDimitry Andric   if (hasBufferRsrcWorkaround(DataTy)) {
316306c3fb27SDimitry Andric     Observer.changingInstr(MI);
316406c3fb27SDimitry Andric     castBufferRsrcArgToV4I32(MI, B, 0);
316506c3fb27SDimitry Andric     Observer.changedInstr(MI);
316606c3fb27SDimitry Andric     return true;
316706c3fb27SDimitry Andric   }
316806c3fb27SDimitry Andric   return false;
316906c3fb27SDimitry Andric }
317006c3fb27SDimitry Andric 
legalizeFMad(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const31718bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFMad(
31728bcb0991SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
31738bcb0991SDimitry Andric   MachineIRBuilder &B) const {
31748bcb0991SDimitry Andric   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
31758bcb0991SDimitry Andric   assert(Ty.isScalar());
31768bcb0991SDimitry Andric 
3177480093f4SDimitry Andric   MachineFunction &MF = B.getMF();
3178480093f4SDimitry Andric   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
3179480093f4SDimitry Andric 
31808bcb0991SDimitry Andric   // TODO: Always legal with future ftz flag.
31815ffd83dbSDimitry Andric   // FIXME: Do we need just output?
31825f757f3fSDimitry Andric   if (Ty == LLT::float32() &&
318306c3fb27SDimitry Andric       MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign())
31848bcb0991SDimitry Andric     return true;
31855f757f3fSDimitry Andric   if (Ty == LLT::float16() &&
318606c3fb27SDimitry Andric       MFI->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign())
31878bcb0991SDimitry Andric     return true;
31888bcb0991SDimitry Andric 
31898bcb0991SDimitry Andric   MachineIRBuilder HelperBuilder(MI);
31908bcb0991SDimitry Andric   GISelObserverWrapper DummyObserver;
31918bcb0991SDimitry Andric   LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
31928bcb0991SDimitry Andric   return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
31938bcb0991SDimitry Andric }
31948bcb0991SDimitry Andric 
legalizeAtomicCmpXChg(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const3195480093f4SDimitry Andric bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
3196480093f4SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3197480093f4SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
3198480093f4SDimitry Andric   Register PtrReg = MI.getOperand(1).getReg();
3199480093f4SDimitry Andric   Register CmpVal = MI.getOperand(2).getReg();
3200480093f4SDimitry Andric   Register NewVal = MI.getOperand(3).getReg();
3201480093f4SDimitry Andric 
3202e8d8bef9SDimitry Andric   assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
3203480093f4SDimitry Andric          "this should not have been custom lowered");
3204480093f4SDimitry Andric 
3205480093f4SDimitry Andric   LLT ValTy = MRI.getType(CmpVal);
3206fe6060f1SDimitry Andric   LLT VecTy = LLT::fixed_vector(2, ValTy);
3207480093f4SDimitry Andric 
3208480093f4SDimitry Andric   Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
3209480093f4SDimitry Andric 
3210480093f4SDimitry Andric   B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3211480093f4SDimitry Andric     .addDef(DstReg)
3212480093f4SDimitry Andric     .addUse(PtrReg)
3213480093f4SDimitry Andric     .addUse(PackedVal)
3214480093f4SDimitry Andric     .setMemRefs(MI.memoperands());
3215480093f4SDimitry Andric 
3216480093f4SDimitry Andric   MI.eraseFromParent();
3217480093f4SDimitry Andric   return true;
3218480093f4SDimitry Andric }
3219480093f4SDimitry Andric 
322006c3fb27SDimitry Andric /// Return true if it's known that \p Src can never be an f32 denormal value.
valueIsKnownNeverF32Denorm(const MachineRegisterInfo & MRI,Register Src)322106c3fb27SDimitry Andric static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI,
322206c3fb27SDimitry Andric                                        Register Src) {
32235f757f3fSDimitry Andric   const MachineInstr *DefMI = MRI.getVRegDef(Src);
32245f757f3fSDimitry Andric   switch (DefMI->getOpcode()) {
32255f757f3fSDimitry Andric   case TargetOpcode::G_INTRINSIC: {
32265f757f3fSDimitry Andric     switch (cast<GIntrinsic>(DefMI)->getIntrinsicID()) {
32275f757f3fSDimitry Andric     case Intrinsic::amdgcn_frexp_mant:
32285f757f3fSDimitry Andric       return true;
32295f757f3fSDimitry Andric     default:
32305f757f3fSDimitry Andric       break;
32315f757f3fSDimitry Andric     }
32325f757f3fSDimitry Andric 
32335f757f3fSDimitry Andric     break;
32345f757f3fSDimitry Andric   }
32355f757f3fSDimitry Andric   case TargetOpcode::G_FFREXP: {
32365f757f3fSDimitry Andric     if (DefMI->getOperand(0).getReg() == Src)
32375f757f3fSDimitry Andric       return true;
32385f757f3fSDimitry Andric     break;
32395f757f3fSDimitry Andric   }
32405f757f3fSDimitry Andric   case TargetOpcode::G_FPEXT: {
32415f757f3fSDimitry Andric     return MRI.getType(DefMI->getOperand(1).getReg()) == LLT::scalar(16);
32425f757f3fSDimitry Andric   }
32435f757f3fSDimitry Andric   default:
32445f757f3fSDimitry Andric     return false;
32455f757f3fSDimitry Andric   }
32465f757f3fSDimitry Andric 
324706c3fb27SDimitry Andric   return false;
324806c3fb27SDimitry Andric }
324906c3fb27SDimitry Andric 
allowApproxFunc(const MachineFunction & MF,unsigned Flags)325006c3fb27SDimitry Andric static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
325106c3fb27SDimitry Andric   if (Flags & MachineInstr::FmAfn)
325206c3fb27SDimitry Andric     return true;
325306c3fb27SDimitry Andric   const auto &Options = MF.getTarget().Options;
325406c3fb27SDimitry Andric   return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
325506c3fb27SDimitry Andric }
325606c3fb27SDimitry Andric 
needsDenormHandlingF32(const MachineFunction & MF,Register Src,unsigned Flags)325706c3fb27SDimitry Andric static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src,
325806c3fb27SDimitry Andric                                    unsigned Flags) {
325906c3fb27SDimitry Andric   return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src) &&
326006c3fb27SDimitry Andric          MF.getDenormalMode(APFloat::IEEEsingle()).Input !=
326106c3fb27SDimitry Andric              DenormalMode::PreserveSign;
326206c3fb27SDimitry Andric }
326306c3fb27SDimitry Andric 
326406c3fb27SDimitry Andric std::pair<Register, Register>
getScaledLogInput(MachineIRBuilder & B,Register Src,unsigned Flags) const326506c3fb27SDimitry Andric AMDGPULegalizerInfo::getScaledLogInput(MachineIRBuilder &B, Register Src,
326606c3fb27SDimitry Andric                                        unsigned Flags) const {
32678a4dda33SDimitry Andric   if (!needsDenormHandlingF32(B.getMF(), Src, Flags))
326806c3fb27SDimitry Andric     return {};
326906c3fb27SDimitry Andric 
327006c3fb27SDimitry Andric   const LLT F32 = LLT::scalar(32);
327106c3fb27SDimitry Andric   auto SmallestNormal = B.buildFConstant(
327206c3fb27SDimitry Andric       F32, APFloat::getSmallestNormalized(APFloat::IEEEsingle()));
327306c3fb27SDimitry Andric   auto IsLtSmallestNormal =
327406c3fb27SDimitry Andric       B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal);
327506c3fb27SDimitry Andric 
327606c3fb27SDimitry Andric   auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
327706c3fb27SDimitry Andric   auto One = B.buildFConstant(F32, 1.0);
327806c3fb27SDimitry Andric   auto ScaleFactor =
327906c3fb27SDimitry Andric       B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
328006c3fb27SDimitry Andric   auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);
328106c3fb27SDimitry Andric 
328206c3fb27SDimitry Andric   return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
328306c3fb27SDimitry Andric }
328406c3fb27SDimitry Andric 
// Custom legalization for G_FLOG2 in terms of the amdgcn_log intrinsic.
bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI,
                                        MachineIRBuilder &B) const {
  // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
  // If we have to handle denormals, scale up the input and adjust the result.

  // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
  // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Flags = MI.getFlags();

  if (Ty == LLT::scalar(16)) {
    const LLT F32 = LLT::scalar(32);
    // Nothing in half is a denormal when promoted to f32.
    auto Ext = B.buildFPExt(F32, Src, Flags);
    auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
                    .addUse(Ext.getReg(0))
                    .setMIFlags(Flags);
    B.buildFPTrunc(Dst, Log2, Flags);
    MI.eraseFromParent();
    return true;
  }

  assert(Ty == LLT::scalar(32));

  auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
  if (!ScaledInput) {
    // No denormal handling needed; use the hardware log directly.
    B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
        .addUse(Src)
        .setMIFlags(Flags);
    MI.eraseFromParent();
    return true;
  }

  auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
                  .addUse(ScaledInput)
                  .setMIFlags(Flags);

  // Undo the 2^32 pre-scale: log2(x * 2^32) == log2(x) + 32, so subtract 32
  // when the input was scaled.
  auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
  auto Zero = B.buildFConstant(Ty, 0.0);
  auto ResultOffset =
      B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
  B.buildFSub(Dst, Log2, ResultOffset, Flags);

  MI.eraseFromParent();
  return true;
}
333406c3fb27SDimitry Andric 
getMad(MachineIRBuilder & B,LLT Ty,Register X,Register Y,Register Z,unsigned Flags)333506c3fb27SDimitry Andric static Register getMad(MachineIRBuilder &B, LLT Ty, Register X, Register Y,
333606c3fb27SDimitry Andric                        Register Z, unsigned Flags) {
333706c3fb27SDimitry Andric   auto FMul = B.buildFMul(Ty, X, Y, Flags);
333806c3fb27SDimitry Andric   return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);
333906c3fb27SDimitry Andric }
334006c3fb27SDimitry Andric 
// Shared legalization for G_FLOG and G_FLOG10: computes log2 via the
// amdgcn_log intrinsic and converts the base with extended-precision
// constants (split high/low or via FMA), handling denormal pre-scaling and
// non-finite inputs.
bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
                                             MachineIRBuilder &B) const {
  const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
  assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);

  MachineRegisterInfo &MRI = *B.getMRI();
  Register Dst = MI.getOperand(0).getReg();
  Register X = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();
  const LLT Ty = MRI.getType(X);
  MachineFunction &MF = B.getMF();

  const LLT F32 = LLT::scalar(32);
  const LLT F16 = LLT::scalar(16);

  const AMDGPUTargetMachine &TM =
      static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  // Fast path: f16, or approximate math is allowed (afn flag or module-wide
  // unsafe/approx options) -- use the cheaper "unsafe" expansion.
  if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn) ||
      TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) {
    if (Ty == F16 && !ST.has16BitInsts()) {
      // No native f16: promote to f32, expand there, truncate back.
      Register LogVal = MRI.createGenericVirtualRegister(F32);
      auto PromoteSrc = B.buildFPExt(F32, X);
      legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), IsLog10, Flags);
      B.buildFPTrunc(Dst, LogVal);
    } else {
      legalizeFlogUnsafe(B, Dst, X, IsLog10, Flags);
    }

    MI.eraseFromParent();
    return true;
  }

  auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags);
  if (ScaledInput)
    X = ScaledInput;

  auto Y =
      B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);

  Register R;
  if (ST.hasFastFMAF32()) {
    // c+cc are ln(2)/ln(10) to more than 49 bits
    const float c_log10 = 0x1.344134p-2f;
    const float cc_log10 = 0x1.09f79ep-26f;

    // c + cc is ln(2) to more than 49 bits
    const float c_log = 0x1.62e42ep-1f;
    const float cc_log = 0x1.efa39ep-25f;

    auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
    auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);

    // R = Y*C refined with two FMAs to recover the low-order bits of the
    // constant product.
    R = B.buildFMul(Ty, Y, C, Flags).getReg(0);
    auto NegR = B.buildFNeg(Ty, R, Flags);
    auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags);
    auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags);
    R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0);
  } else {
    // ch+ct is ln(2)/ln(10) to more than 36 bits
    const float ch_log10 = 0x1.344000p-2f;
    const float ct_log10 = 0x1.3509f6p-18f;

    // ch + ct is ln(2) to more than 36 bits
    const float ch_log = 0x1.62e000p-1f;
    const float ct_log = 0x1.0bfbe8p-15f;

    auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
    auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);

    // Split Y into a high part (low mantissa bits masked off) and the
    // remainder, then accumulate the partial products via mul+add.
    auto MaskConst = B.buildConstant(Ty, 0xfffff000);
    auto YH = B.buildAnd(Ty, Y, MaskConst);
    auto YT = B.buildFSub(Ty, Y, YH, Flags);
    auto YTCT = B.buildFMul(Ty, YT, CT, Flags);

    Register Mad0 =
        getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags);
    Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, Flags);
    R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags);
  }

  const bool IsFiniteOnly =
      (MI.getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) &&
      (MI.getFlag(MachineInstr::FmNoInfs) || TM.Options.NoInfsFPMath);

  if (!IsFiniteOnly) {
    // Expand isfinite(x) => fabs(x) < inf
    auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
    auto Fabs = B.buildFAbs(Ty, Y);
    auto IsFinite =
        B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
    // Pass inf/nan through unchanged (select the raw log result Y).
    R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);
  }

  if (ScaledInput) {
    // Compensate for the 2^32 pre-scale: subtract 32*log(2)/log(base)
    // (32*log10(2) for log10, 32*ln(2) for log) when the input was scaled.
    auto Zero = B.buildFConstant(Ty, 0.0);
    auto ShiftK =
        B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
    auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
    B.buildFSub(Dst, R, Shift, Flags);
  } else {
    B.buildCopy(Dst, R);
  }

  MI.eraseFromParent();
  return true;
}
344806c3fb27SDimitry Andric 
legalizeFlogUnsafe(MachineIRBuilder & B,Register Dst,Register Src,bool IsLog10,unsigned Flags) const344906c3fb27SDimitry Andric bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst,
34508a4dda33SDimitry Andric                                              Register Src, bool IsLog10,
345106c3fb27SDimitry Andric                                              unsigned Flags) const {
34528a4dda33SDimitry Andric   const double Log2BaseInverted =
34538a4dda33SDimitry Andric       IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
34548a4dda33SDimitry Andric 
345506c3fb27SDimitry Andric   LLT Ty = B.getMRI()->getType(Dst);
34568a4dda33SDimitry Andric 
34578a4dda33SDimitry Andric   if (Ty == LLT::scalar(32)) {
34588a4dda33SDimitry Andric     auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
34598a4dda33SDimitry Andric     if (ScaledInput) {
34605f757f3fSDimitry Andric       auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
34618a4dda33SDimitry Andric                         .addUse(Src)
34628a4dda33SDimitry Andric                         .setMIFlags(Flags);
34638a4dda33SDimitry Andric       auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
34648a4dda33SDimitry Andric       auto Zero = B.buildFConstant(Ty, 0.0);
34658a4dda33SDimitry Andric       auto ResultOffset =
34668a4dda33SDimitry Andric           B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
34678a4dda33SDimitry Andric       auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);
34688a4dda33SDimitry Andric 
34698a4dda33SDimitry Andric       if (ST.hasFastFMAF32())
34708a4dda33SDimitry Andric         B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
34718a4dda33SDimitry Andric       else {
34728a4dda33SDimitry Andric         auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
34738a4dda33SDimitry Andric         B.buildFAdd(Dst, Mul, ResultOffset, Flags);
34748a4dda33SDimitry Andric       }
34758a4dda33SDimitry Andric 
34768a4dda33SDimitry Andric       return true;
34778a4dda33SDimitry Andric     }
34788a4dda33SDimitry Andric   }
34798a4dda33SDimitry Andric 
348006c3fb27SDimitry Andric   auto Log2Operand = Ty == LLT::scalar(16)
348106c3fb27SDimitry Andric                          ? B.buildFLog2(Ty, Src, Flags)
34825f757f3fSDimitry Andric                          : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
348306c3fb27SDimitry Andric                                .addUse(Src)
348406c3fb27SDimitry Andric                                .setMIFlags(Flags);
348506c3fb27SDimitry Andric   auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
348606c3fb27SDimitry Andric   B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
348706c3fb27SDimitry Andric   return true;
348806c3fb27SDimitry Andric }
348906c3fb27SDimitry Andric 
// Custom lowering for G_FEXP2 on f16/f32 using the hardware v_exp
// instruction, with input/result scaling when f32 denormal results must be
// produced correctly.
bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI,
                                        MachineIRBuilder &B) const {
  // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
  // If we have to handle denormals, scale up the input and adjust the result.

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);
  const LLT F16 = LLT::scalar(16);
  const LLT F32 = LLT::scalar(32);

  if (Ty == F16) {
    // Nothing in half is a denormal when promoted to f32.
    // Widen to f32, use the hardware exp2, and truncate back.
    auto Ext = B.buildFPExt(F32, Src, Flags);
    // NOTE: local is named Log2 but holds the exp2 intrinsic result.
    auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
                    .addUse(Ext.getReg(0))
                    .setMIFlags(Flags);
    B.buildFPTrunc(Dst, Log2, Flags);
    MI.eraseFromParent();
    return true;
  }

  assert(Ty == F32);

  // If denormal results don't need to be correct, the raw instruction is
  // sufficient.
  if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) {
    B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
        .addUse(Src)
        .setMIFlags(Flags);
    MI.eraseFromParent();
    return true;
  }

  // bool needs_scaling = x < -0x1.f80000p+6f;
  // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);

  // Inputs this negative would produce a denormal/flushed result; add 64 to
  // the exponent argument and compensate by multiplying the result by 2^-64.
  // -nextafter(128.0, -1)
  auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
  auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src,
                                  RangeCheckConst, Flags);

  auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
  auto Zero = B.buildFConstant(Ty, 0.0);
  auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
  auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);

  auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                  .addUse(AddInput.getReg(0))
                  .setMIFlags(Flags);

  // Undo the 2^64 scale on the lanes that needed it.
  auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
  auto One = B.buildFConstant(Ty, 1.0);
  auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
  B.buildFMul(Dst, Exp2, ResultScale, Flags);
  MI.eraseFromParent();
  return true;
}
354706c3fb27SDimitry Andric 
// Approximate expansion of exp: exp(x) = exp2(x * log2(e)).
//
// For f32 where denormal results must be correct, the input is biased upward
// by 64 before the hardware exp2 and the result rescaled by exp(-64)
// afterwards, on the lanes below the underflow threshold.
//
// NOTE(review): the conversion constant is always log2(e); if any caller
// routes a base-10 exp (G_FEXP10) through here, the result would be wrong —
// confirm callers only use this for natural exp / pre-scaled inputs.
bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
                                             Register X, unsigned Flags) const {
  LLT Ty = B.getMRI()->getType(Dst);
  LLT F32 = LLT::scalar(32);

  // Simple path: no denormal concerns (f16, or f32 with flushing allowed).
  if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
    auto Log2E = B.buildFConstant(Ty, numbers::log2e);
    auto Mul = B.buildFMul(Ty, X, Log2E, Flags);

    if (Ty == F32) {
      B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
        .addUse(Mul.getReg(0))
        .setMIFlags(Flags);
    } else {
      // f16 exp2 is handled by the generic opcode.
      B.buildFExp2(Dst, Mul.getReg(0), Flags);
    }

    return true;
  }

  // Denormal path: inputs below ~-87.3 (where exp(x) underflows to a
  // denormal) are shifted up by 64 before the exp2.
  auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
  auto NeedsScaling =
      B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags);
  auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
  auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
  auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);

  auto Log2E = B.buildFConstant(Ty, numbers::log2e);
  auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);

  auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
    .addUse(ExpInput.getReg(0))
    .setMIFlags(Flags);

  // 0x1.969d48p-93f ~= exp(-64): undo the +64 input bias on scaled lanes.
  auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
  auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
  B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
  return true;
}
35875f757f3fSDimitry Andric 
// Custom lowering for G_FEXP / G_FEXP10 on f16/f32.
//
// f16 promotes to f32 (half values can't be f32 denormals). f32 either uses
// the fast approximate expansion (when afn is allowed), or the accurate
// table-free argument-reduction algorithm documented inline below, with
// explicit underflow/overflow clamping of the result.
bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
                                       MachineIRBuilder &B) const {
  Register Dst = MI.getOperand(0).getReg();
  Register X = MI.getOperand(1).getReg();
  const unsigned Flags = MI.getFlags();
  MachineFunction &MF = B.getMF();
  MachineRegisterInfo &MRI = *B.getMRI();
  LLT Ty = MRI.getType(Dst);
  const LLT F16 = LLT::scalar(16);
  const LLT F32 = LLT::scalar(32);
  // Selects the exp10 constant set in the reduction below.
  const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;

  if (Ty == F16) {
    // v_exp_f16 (fmul x, log2e)
    if (allowApproxFunc(MF, Flags)) {
      // TODO: Does this really require fast?
      // NOTE(review): legalizeFExpUnsafe always converts via log2(e), which
      // looks wrong for the G_FEXP10 case — confirm exp10 f16 never reaches
      // this path.
      legalizeFExpUnsafe(B, Dst, X, Flags);
      MI.eraseFromParent();
      return true;
    }

    // exp(f16 x) ->
    //   fptrunc (v_exp_f32 (fmul (fpext x), log2e))

    // Nothing in half is a denormal when promoted to f32.
    auto Ext = B.buildFPExt(F32, X, Flags);
    Register Lowered = MRI.createGenericVirtualRegister(F32);
    legalizeFExpUnsafe(B, Lowered, Ext.getReg(0), Flags);
    B.buildFPTrunc(Dst, Lowered, Flags);
    MI.eraseFromParent();
    return true;
  }

  assert(Ty == F32);

  // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
  // library behavior. Also, is known-not-daz source sufficient?
  if (allowApproxFunc(MF, Flags)) {
    legalizeFExpUnsafe(B, Dst, X, Flags);
    MI.eraseFromParent();
    return true;
  }

  //    Algorithm:
  //
  //    e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
  //
  //    x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
  //    n = 64*m + j,   0 <= j < 64
  //
  //    e^x = 2^((64*m + j + f)/64)
  //        = (2^m) * (2^(j/64)) * 2^(f/64)
  //        = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
  //
  //    f = x*(64/ln(2)) - n
  //    r = f*(ln(2)/64) = x - n*(ln(2)/64)
  //
  //    e^x = (2^m) * (2^(j/64)) * e^r
  //
  //    (2^(j/64)) is precomputed
  //
  //    e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
  //    e^r = 1 + q
  //
  //    q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
  //
  //    e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
  const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
  // PH/PL: high and low parts of x * log2(base), computed in extra precision.
  Register PH, PL;

  if (ST.hasFastFMAF32()) {
    // Split the conversion constant into c (+ cc residual) and use FMAs to
    // recover the low part of the product exactly.
    const float c_exp = numbers::log2ef;
    const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
    const float c_exp10 = 0x1.a934f0p+1f;
    const float cc_exp10 = 0x1.2f346ep-24f;

    auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
    PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
    auto NegPH = B.buildFNeg(Ty, PH, Flags);
    auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);

    auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
    PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);
  } else {
    // No fast FMA: split x itself into high/low halves by masking the
    // mantissa, and combine partial products with mad.
    const float ch_exp = 0x1.714000p+0f;
    const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits

    const float ch_exp10 = 0x1.a92000p+1f;
    const float cl_exp10 = 0x1.4f0978p-11f;

    auto MaskConst = B.buildConstant(Ty, 0xfffff000);
    auto XH = B.buildAnd(Ty, X, MaskConst);
    auto XL = B.buildFSub(Ty, X, XH, Flags);

    auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
    PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);

    auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
    auto XLCL = B.buildFMul(Ty, XL, CL, Flags);

    Register Mad0 =
        getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
    PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
  }

  // E = round-to-nearest-even integer part of the scaled exponent.
  auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);

  // It is unsafe to contract this fsub into the PH multiply.
  auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
  auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);
  auto IntE = B.buildFPTOSI(LLT::scalar(32), E);

  // exp2 of the fractional remainder, then scale by 2^E via ldexp.
  auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                  .addUse(A.getReg(0))
                  .setMIFlags(Flags);
  auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);

  // Clamp inputs past the underflow bound of the base to exactly 0.
  auto UnderflowCheckConst =
      B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
  auto Zero = B.buildFConstant(Ty, 0.0);
  auto Underflow =
      B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst);

  R = B.buildSelect(Ty, Underflow, Zero, R);

  const auto &Options = MF.getTarget().Options;

  // Clamp inputs past the overflow bound to +inf, unless infinities are
  // assumed absent.
  if (!(Flags & MachineInstr::FmNoInfs) && !Options.NoInfsFPMath) {
    auto OverflowCheckConst =
        B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);

    auto Overflow =
        B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst);
    auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
    R = B.buildSelect(Ty, Overflow, Inf, R, Flags);
  }

  B.buildCopy(Dst, R);
  MI.eraseFromParent();
  return true;
}
37295ffd83dbSDimitry Andric 
legalizeFPow(MachineInstr & MI,MachineIRBuilder & B) const37305ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
37315ffd83dbSDimitry Andric                                        MachineIRBuilder &B) const {
37325ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
37335ffd83dbSDimitry Andric   Register Src0 = MI.getOperand(1).getReg();
37345ffd83dbSDimitry Andric   Register Src1 = MI.getOperand(2).getReg();
37355ffd83dbSDimitry Andric   unsigned Flags = MI.getFlags();
37365ffd83dbSDimitry Andric   LLT Ty = B.getMRI()->getType(Dst);
37375f757f3fSDimitry Andric   const LLT F16 = LLT::float16();
37385f757f3fSDimitry Andric   const LLT F32 = LLT::float32();
37395ffd83dbSDimitry Andric 
37405f757f3fSDimitry Andric   if (Ty == F32) {
37415f757f3fSDimitry Andric     auto Log = B.buildFLog2(F32, Src0, Flags);
37425f757f3fSDimitry Andric     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
37435ffd83dbSDimitry Andric                    .addUse(Log.getReg(0))
37445ffd83dbSDimitry Andric                    .addUse(Src1)
37455ffd83dbSDimitry Andric                    .setMIFlags(Flags);
37465ffd83dbSDimitry Andric     B.buildFExp2(Dst, Mul, Flags);
37475f757f3fSDimitry Andric   } else if (Ty == F16) {
37485ffd83dbSDimitry Andric     // There's no f16 fmul_legacy, so we need to convert for it.
37495f757f3fSDimitry Andric     auto Log = B.buildFLog2(F16, Src0, Flags);
37505f757f3fSDimitry Andric     auto Ext0 = B.buildFPExt(F32, Log, Flags);
37515f757f3fSDimitry Andric     auto Ext1 = B.buildFPExt(F32, Src1, Flags);
37525f757f3fSDimitry Andric     auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
37535ffd83dbSDimitry Andric                    .addUse(Ext0.getReg(0))
37545ffd83dbSDimitry Andric                    .addUse(Ext1.getReg(0))
37555ffd83dbSDimitry Andric                    .setMIFlags(Flags);
37565f757f3fSDimitry Andric     B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);
37575ffd83dbSDimitry Andric   } else
37585ffd83dbSDimitry Andric     return false;
37595ffd83dbSDimitry Andric 
37605ffd83dbSDimitry Andric   MI.eraseFromParent();
37615ffd83dbSDimitry Andric   return true;
37625ffd83dbSDimitry Andric }
37635ffd83dbSDimitry Andric 
37645ffd83dbSDimitry Andric // Find a source register, ignoring any possible source modifiers.
stripAnySourceMods(Register OrigSrc,MachineRegisterInfo & MRI)37655ffd83dbSDimitry Andric static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
37665ffd83dbSDimitry Andric   Register ModSrc = OrigSrc;
37675ffd83dbSDimitry Andric   if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
37685ffd83dbSDimitry Andric     ModSrc = SrcFNeg->getOperand(1).getReg();
37695ffd83dbSDimitry Andric     if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
37705ffd83dbSDimitry Andric       ModSrc = SrcFAbs->getOperand(1).getReg();
37715ffd83dbSDimitry Andric   } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
37725ffd83dbSDimitry Andric     ModSrc = SrcFAbs->getOperand(1).getReg();
37735ffd83dbSDimitry Andric   return ModSrc;
37745ffd83dbSDimitry Andric }
37755ffd83dbSDimitry Andric 
legalizeFFloor(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const37765ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
37775ffd83dbSDimitry Andric                                          MachineRegisterInfo &MRI,
37785ffd83dbSDimitry Andric                                          MachineIRBuilder &B) const {
37795ffd83dbSDimitry Andric 
37805ffd83dbSDimitry Andric   const LLT S1 = LLT::scalar(1);
37815f757f3fSDimitry Andric   const LLT F64 = LLT::float64();
37825ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
37835ffd83dbSDimitry Andric   Register OrigSrc = MI.getOperand(1).getReg();
37845ffd83dbSDimitry Andric   unsigned Flags = MI.getFlags();
37855f757f3fSDimitry Andric   assert(ST.hasFractBug() && MRI.getType(Dst) == F64 &&
37865ffd83dbSDimitry Andric          "this should not have been custom lowered");
37875ffd83dbSDimitry Andric 
37885ffd83dbSDimitry Andric   // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
37895ffd83dbSDimitry Andric   // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
37905ffd83dbSDimitry Andric   // efficient way to implement it is using V_FRACT_F64. The workaround for the
37915ffd83dbSDimitry Andric   // V_FRACT bug is:
37925ffd83dbSDimitry Andric   //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
37935ffd83dbSDimitry Andric   //
37945ffd83dbSDimitry Andric   // Convert floor(x) to (x - fract(x))
37955ffd83dbSDimitry Andric 
37965f757f3fSDimitry Andric   auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})
37975ffd83dbSDimitry Andric                    .addUse(OrigSrc)
37985ffd83dbSDimitry Andric                    .setMIFlags(Flags);
37995ffd83dbSDimitry Andric 
38005ffd83dbSDimitry Andric   // Give source modifier matching some assistance before obscuring a foldable
38015ffd83dbSDimitry Andric   // pattern.
38025ffd83dbSDimitry Andric 
38035ffd83dbSDimitry Andric   // TODO: We can avoid the neg on the fract? The input sign to fract
38045ffd83dbSDimitry Andric   // shouldn't matter?
38055ffd83dbSDimitry Andric   Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
38065ffd83dbSDimitry Andric 
380706c3fb27SDimitry Andric   auto Const =
38085f757f3fSDimitry Andric       B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff));
38095ffd83dbSDimitry Andric 
38105f757f3fSDimitry Andric   Register Min = MRI.createGenericVirtualRegister(F64);
38115ffd83dbSDimitry Andric 
38125ffd83dbSDimitry Andric   // We don't need to concern ourselves with the snan handling difference, so
38135ffd83dbSDimitry Andric   // use the one which will directly select.
38145ffd83dbSDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
38155ffd83dbSDimitry Andric   if (MFI->getMode().IEEE)
38165ffd83dbSDimitry Andric     B.buildFMinNumIEEE(Min, Fract, Const, Flags);
38175ffd83dbSDimitry Andric   else
38185ffd83dbSDimitry Andric     B.buildFMinNum(Min, Fract, Const, Flags);
38195ffd83dbSDimitry Andric 
38205ffd83dbSDimitry Andric   Register CorrectedFract = Min;
38215ffd83dbSDimitry Andric   if (!MI.getFlag(MachineInstr::FmNoNans)) {
38225ffd83dbSDimitry Andric     auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
38235f757f3fSDimitry Andric     CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);
38245ffd83dbSDimitry Andric   }
38255ffd83dbSDimitry Andric 
38265f757f3fSDimitry Andric   auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
38275ffd83dbSDimitry Andric   B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
38285ffd83dbSDimitry Andric 
38295ffd83dbSDimitry Andric   MI.eraseFromParent();
38305ffd83dbSDimitry Andric   return true;
38315ffd83dbSDimitry Andric }
38325ffd83dbSDimitry Andric 
38335ffd83dbSDimitry Andric // Turn an illegal packed v2s16 build vector into bit operations.
38345ffd83dbSDimitry Andric // TODO: This should probably be a bitcast action in LegalizerHelper.
legalizeBuildVector(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const38355ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeBuildVector(
38365ffd83dbSDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
38375ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
38385ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
3839bdd1243dSDimitry Andric   const LLT S16 = LLT::scalar(16);
3840fe6060f1SDimitry Andric   assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
38415ffd83dbSDimitry Andric 
38425ffd83dbSDimitry Andric   Register Src0 = MI.getOperand(1).getReg();
38435ffd83dbSDimitry Andric   Register Src1 = MI.getOperand(2).getReg();
38445ffd83dbSDimitry Andric 
3845bdd1243dSDimitry Andric   if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
3846bdd1243dSDimitry Andric     assert(MRI.getType(Src0) == S32);
3847bdd1243dSDimitry Andric     Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
3848bdd1243dSDimitry Andric     Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
3849bdd1243dSDimitry Andric   }
3850bdd1243dSDimitry Andric 
3851bdd1243dSDimitry Andric   auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
38525ffd83dbSDimitry Andric   B.buildBitcast(Dst, Merge);
38535ffd83dbSDimitry Andric 
38545ffd83dbSDimitry Andric   MI.eraseFromParent();
38555ffd83dbSDimitry Andric   return true;
38565ffd83dbSDimitry Andric }
38575ffd83dbSDimitry Andric 
385881ad6265SDimitry Andric // Build a big integer multiply or multiply-add using MAD_64_32 instructions.
385981ad6265SDimitry Andric //
386081ad6265SDimitry Andric // Source and accumulation registers must all be 32-bits.
386181ad6265SDimitry Andric //
386281ad6265SDimitry Andric // TODO: When the multiply is uniform, we should produce a code sequence
386381ad6265SDimitry Andric // that is better suited to instruction selection on the SALU. Instead of
386481ad6265SDimitry Andric // the outer loop going over parts of the result, the outer loop should go
386581ad6265SDimitry Andric // over parts of one of the factors. This should result in instruction
386681ad6265SDimitry Andric // selection that makes full use of S_ADDC_U32 instructions.
buildMultiply(LegalizerHelper & Helper,MutableArrayRef<Register> Accum,ArrayRef<Register> Src0,ArrayRef<Register> Src1,bool UsePartialMad64_32,bool SeparateOddAlignedProducts) const386706c3fb27SDimitry Andric void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
386806c3fb27SDimitry Andric                                         MutableArrayRef<Register> Accum,
386906c3fb27SDimitry Andric                                         ArrayRef<Register> Src0,
387006c3fb27SDimitry Andric                                         ArrayRef<Register> Src1,
387106c3fb27SDimitry Andric                                         bool UsePartialMad64_32,
387206c3fb27SDimitry Andric                                         bool SeparateOddAlignedProducts) const {
387381ad6265SDimitry Andric   // Use (possibly empty) vectors of S1 registers to represent the set of
387481ad6265SDimitry Andric   // carries from one pair of positions to the next.
387581ad6265SDimitry Andric   using Carry = SmallVector<Register, 2>;
387681ad6265SDimitry Andric 
387781ad6265SDimitry Andric   MachineIRBuilder &B = Helper.MIRBuilder;
387806c3fb27SDimitry Andric   GISelKnownBits &KB = *Helper.getKnownBits();
387981ad6265SDimitry Andric 
388081ad6265SDimitry Andric   const LLT S1 = LLT::scalar(1);
388181ad6265SDimitry Andric   const LLT S32 = LLT::scalar(32);
388281ad6265SDimitry Andric   const LLT S64 = LLT::scalar(64);
388381ad6265SDimitry Andric 
388481ad6265SDimitry Andric   Register Zero32;
388581ad6265SDimitry Andric   Register Zero64;
388681ad6265SDimitry Andric 
388781ad6265SDimitry Andric   auto getZero32 = [&]() -> Register {
388881ad6265SDimitry Andric     if (!Zero32)
388981ad6265SDimitry Andric       Zero32 = B.buildConstant(S32, 0).getReg(0);
389081ad6265SDimitry Andric     return Zero32;
389181ad6265SDimitry Andric   };
389281ad6265SDimitry Andric   auto getZero64 = [&]() -> Register {
389381ad6265SDimitry Andric     if (!Zero64)
389481ad6265SDimitry Andric       Zero64 = B.buildConstant(S64, 0).getReg(0);
389581ad6265SDimitry Andric     return Zero64;
389681ad6265SDimitry Andric   };
389781ad6265SDimitry Andric 
389806c3fb27SDimitry Andric   SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
389906c3fb27SDimitry Andric   for (unsigned i = 0; i < Src0.size(); ++i) {
390006c3fb27SDimitry Andric     Src0KnownZeros.push_back(KB.getKnownBits(Src0[i]).isZero());
390106c3fb27SDimitry Andric     Src1KnownZeros.push_back(KB.getKnownBits(Src1[i]).isZero());
390206c3fb27SDimitry Andric   }
390306c3fb27SDimitry Andric 
390481ad6265SDimitry Andric   // Merge the given carries into the 32-bit LocalAccum, which is modified
390581ad6265SDimitry Andric   // in-place.
390681ad6265SDimitry Andric   //
390781ad6265SDimitry Andric   // Returns the carry-out, which is a single S1 register or null.
390881ad6265SDimitry Andric   auto mergeCarry =
390981ad6265SDimitry Andric       [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
391081ad6265SDimitry Andric         if (CarryIn.empty())
391181ad6265SDimitry Andric           return Register();
391281ad6265SDimitry Andric 
391381ad6265SDimitry Andric         bool HaveCarryOut = true;
391481ad6265SDimitry Andric         Register CarryAccum;
391581ad6265SDimitry Andric         if (CarryIn.size() == 1) {
391681ad6265SDimitry Andric           if (!LocalAccum) {
391781ad6265SDimitry Andric             LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
391881ad6265SDimitry Andric             return Register();
391981ad6265SDimitry Andric           }
392081ad6265SDimitry Andric 
392181ad6265SDimitry Andric           CarryAccum = getZero32();
392281ad6265SDimitry Andric         } else {
392381ad6265SDimitry Andric           CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
392481ad6265SDimitry Andric           for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
392581ad6265SDimitry Andric             CarryAccum =
392681ad6265SDimitry Andric                 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
392781ad6265SDimitry Andric                     .getReg(0);
392881ad6265SDimitry Andric           }
392981ad6265SDimitry Andric 
393081ad6265SDimitry Andric           if (!LocalAccum) {
393181ad6265SDimitry Andric             LocalAccum = getZero32();
393281ad6265SDimitry Andric             HaveCarryOut = false;
393381ad6265SDimitry Andric           }
393481ad6265SDimitry Andric         }
393581ad6265SDimitry Andric 
393681ad6265SDimitry Andric         auto Add =
393781ad6265SDimitry Andric             B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
393881ad6265SDimitry Andric         LocalAccum = Add.getReg(0);
393981ad6265SDimitry Andric         return HaveCarryOut ? Add.getReg(1) : Register();
394081ad6265SDimitry Andric       };
394181ad6265SDimitry Andric 
394281ad6265SDimitry Andric   // Build a multiply-add chain to compute
394381ad6265SDimitry Andric   //
394481ad6265SDimitry Andric   //   LocalAccum + (partial products at DstIndex)
394581ad6265SDimitry Andric   //       + (opportunistic subset of CarryIn)
394681ad6265SDimitry Andric   //
394781ad6265SDimitry Andric   // LocalAccum is an array of one or two 32-bit registers that are updated
394881ad6265SDimitry Andric   // in-place. The incoming registers may be null.
394981ad6265SDimitry Andric   //
395081ad6265SDimitry Andric   // In some edge cases, carry-ins can be consumed "for free". In that case,
395181ad6265SDimitry Andric   // the consumed carry bits are removed from CarryIn in-place.
395281ad6265SDimitry Andric   auto buildMadChain =
395381ad6265SDimitry Andric       [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
395481ad6265SDimitry Andric           -> Carry {
395581ad6265SDimitry Andric         assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
395681ad6265SDimitry Andric                (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
395781ad6265SDimitry Andric 
395881ad6265SDimitry Andric         Carry CarryOut;
395981ad6265SDimitry Andric         unsigned j0 = 0;
396081ad6265SDimitry Andric 
396181ad6265SDimitry Andric         // Use plain 32-bit multiplication for the most significant part of the
396281ad6265SDimitry Andric         // result by default.
396381ad6265SDimitry Andric         if (LocalAccum.size() == 1 &&
396481ad6265SDimitry Andric             (!UsePartialMad64_32 || !CarryIn.empty())) {
396581ad6265SDimitry Andric           do {
396606c3fb27SDimitry Andric             // Skip multiplication if one of the operands is 0
396781ad6265SDimitry Andric             unsigned j1 = DstIndex - j0;
396806c3fb27SDimitry Andric             if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
396906c3fb27SDimitry Andric               ++j0;
397006c3fb27SDimitry Andric               continue;
397106c3fb27SDimitry Andric             }
397281ad6265SDimitry Andric             auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
397306c3fb27SDimitry Andric             if (!LocalAccum[0] || KB.getKnownBits(LocalAccum[0]).isZero()) {
397481ad6265SDimitry Andric               LocalAccum[0] = Mul.getReg(0);
397581ad6265SDimitry Andric             } else {
397681ad6265SDimitry Andric               if (CarryIn.empty()) {
397781ad6265SDimitry Andric                 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
397881ad6265SDimitry Andric               } else {
397981ad6265SDimitry Andric                 LocalAccum[0] =
398081ad6265SDimitry Andric                     B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
398181ad6265SDimitry Andric                         .getReg(0);
398281ad6265SDimitry Andric                 CarryIn.pop_back();
398381ad6265SDimitry Andric               }
398481ad6265SDimitry Andric             }
398581ad6265SDimitry Andric             ++j0;
398681ad6265SDimitry Andric           } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
398781ad6265SDimitry Andric         }
398881ad6265SDimitry Andric 
398981ad6265SDimitry Andric         // Build full 64-bit multiplies.
399081ad6265SDimitry Andric         if (j0 <= DstIndex) {
399181ad6265SDimitry Andric           bool HaveSmallAccum = false;
399281ad6265SDimitry Andric           Register Tmp;
399381ad6265SDimitry Andric 
399481ad6265SDimitry Andric           if (LocalAccum[0]) {
399581ad6265SDimitry Andric             if (LocalAccum.size() == 1) {
399681ad6265SDimitry Andric               Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
399781ad6265SDimitry Andric               HaveSmallAccum = true;
399881ad6265SDimitry Andric             } else if (LocalAccum[1]) {
3999bdd1243dSDimitry Andric               Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
400081ad6265SDimitry Andric               HaveSmallAccum = false;
400181ad6265SDimitry Andric             } else {
400281ad6265SDimitry Andric               Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
400381ad6265SDimitry Andric               HaveSmallAccum = true;
400481ad6265SDimitry Andric             }
400581ad6265SDimitry Andric           } else {
400681ad6265SDimitry Andric             assert(LocalAccum.size() == 1 || !LocalAccum[1]);
400781ad6265SDimitry Andric             Tmp = getZero64();
400881ad6265SDimitry Andric             HaveSmallAccum = true;
400981ad6265SDimitry Andric           }
401081ad6265SDimitry Andric 
401181ad6265SDimitry Andric           do {
401281ad6265SDimitry Andric             unsigned j1 = DstIndex - j0;
401306c3fb27SDimitry Andric             if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
401406c3fb27SDimitry Andric               ++j0;
401506c3fb27SDimitry Andric               continue;
401606c3fb27SDimitry Andric             }
401781ad6265SDimitry Andric             auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
401881ad6265SDimitry Andric                                     {Src0[j0], Src1[j1], Tmp});
401981ad6265SDimitry Andric             Tmp = Mad.getReg(0);
402081ad6265SDimitry Andric             if (!HaveSmallAccum)
402181ad6265SDimitry Andric               CarryOut.push_back(Mad.getReg(1));
402281ad6265SDimitry Andric             HaveSmallAccum = false;
402306c3fb27SDimitry Andric 
402481ad6265SDimitry Andric             ++j0;
402581ad6265SDimitry Andric           } while (j0 <= DstIndex);
402681ad6265SDimitry Andric 
402781ad6265SDimitry Andric           auto Unmerge = B.buildUnmerge(S32, Tmp);
402881ad6265SDimitry Andric           LocalAccum[0] = Unmerge.getReg(0);
402981ad6265SDimitry Andric           if (LocalAccum.size() > 1)
403081ad6265SDimitry Andric             LocalAccum[1] = Unmerge.getReg(1);
403181ad6265SDimitry Andric         }
403281ad6265SDimitry Andric 
403381ad6265SDimitry Andric         return CarryOut;
403481ad6265SDimitry Andric       };
403581ad6265SDimitry Andric 
403681ad6265SDimitry Andric   // Outer multiply loop, iterating over destination parts from least
403781ad6265SDimitry Andric   // significant to most significant parts.
403881ad6265SDimitry Andric   //
403981ad6265SDimitry Andric   // The columns of the following diagram correspond to the destination parts
404081ad6265SDimitry Andric   // affected by one iteration of the outer loop (ignoring boundary
404181ad6265SDimitry Andric   // conditions).
404281ad6265SDimitry Andric   //
404381ad6265SDimitry Andric   //   Dest index relative to 2 * i:      1 0 -1
404481ad6265SDimitry Andric   //                                      ------
404581ad6265SDimitry Andric   //   Carries from previous iteration:     e o
404681ad6265SDimitry Andric   //   Even-aligned partial product sum:  E E .
404781ad6265SDimitry Andric   //   Odd-aligned partial product sum:     O O
404881ad6265SDimitry Andric   //
404981ad6265SDimitry Andric   // 'o' is OddCarry, 'e' is EvenCarry.
405081ad6265SDimitry Andric   // EE and OO are computed from partial products via buildMadChain and use
405181ad6265SDimitry Andric   // accumulation where possible and appropriate.
405281ad6265SDimitry Andric   //
405381ad6265SDimitry Andric   Register SeparateOddCarry;
405481ad6265SDimitry Andric   Carry EvenCarry;
405581ad6265SDimitry Andric   Carry OddCarry;
405681ad6265SDimitry Andric 
405781ad6265SDimitry Andric   for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
405881ad6265SDimitry Andric     Carry OddCarryIn = std::move(OddCarry);
405981ad6265SDimitry Andric     Carry EvenCarryIn = std::move(EvenCarry);
406081ad6265SDimitry Andric     OddCarry.clear();
406181ad6265SDimitry Andric     EvenCarry.clear();
406281ad6265SDimitry Andric 
406381ad6265SDimitry Andric     // Partial products at offset 2 * i.
406481ad6265SDimitry Andric     if (2 * i < Accum.size()) {
406581ad6265SDimitry Andric       auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
406681ad6265SDimitry Andric       EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
406781ad6265SDimitry Andric     }
406881ad6265SDimitry Andric 
406981ad6265SDimitry Andric     // Partial products at offset 2 * i - 1.
407081ad6265SDimitry Andric     if (i > 0) {
407181ad6265SDimitry Andric       if (!SeparateOddAlignedProducts) {
407281ad6265SDimitry Andric         auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
407381ad6265SDimitry Andric         OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
407481ad6265SDimitry Andric       } else {
407581ad6265SDimitry Andric         bool IsHighest = 2 * i >= Accum.size();
407681ad6265SDimitry Andric         Register SeparateOddOut[2];
4077bdd1243dSDimitry Andric         auto LocalAccum = MutableArrayRef(SeparateOddOut)
407881ad6265SDimitry Andric                               .take_front(IsHighest ? 1 : 2);
407981ad6265SDimitry Andric         OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
408081ad6265SDimitry Andric 
408181ad6265SDimitry Andric         MachineInstr *Lo;
408281ad6265SDimitry Andric 
408381ad6265SDimitry Andric         if (i == 1) {
408481ad6265SDimitry Andric           if (!IsHighest)
408581ad6265SDimitry Andric             Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
408681ad6265SDimitry Andric           else
408781ad6265SDimitry Andric             Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
408881ad6265SDimitry Andric         } else {
408981ad6265SDimitry Andric           Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
409081ad6265SDimitry Andric                             SeparateOddCarry);
409181ad6265SDimitry Andric         }
409281ad6265SDimitry Andric         Accum[2 * i - 1] = Lo->getOperand(0).getReg();
409381ad6265SDimitry Andric 
409481ad6265SDimitry Andric         if (!IsHighest) {
409581ad6265SDimitry Andric           auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
409681ad6265SDimitry Andric                                 Lo->getOperand(1).getReg());
409781ad6265SDimitry Andric           Accum[2 * i] = Hi.getReg(0);
409881ad6265SDimitry Andric           SeparateOddCarry = Hi.getReg(1);
409981ad6265SDimitry Andric         }
410081ad6265SDimitry Andric       }
410181ad6265SDimitry Andric     }
410281ad6265SDimitry Andric 
410381ad6265SDimitry Andric     // Add in the carries from the previous iteration
410481ad6265SDimitry Andric     if (i > 0) {
410581ad6265SDimitry Andric       if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
410681ad6265SDimitry Andric         EvenCarryIn.push_back(CarryOut);
410781ad6265SDimitry Andric 
410881ad6265SDimitry Andric       if (2 * i < Accum.size()) {
410981ad6265SDimitry Andric         if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
411081ad6265SDimitry Andric           OddCarry.push_back(CarryOut);
411181ad6265SDimitry Andric       }
411281ad6265SDimitry Andric     }
411381ad6265SDimitry Andric   }
411481ad6265SDimitry Andric }
411581ad6265SDimitry Andric 
411681ad6265SDimitry Andric // Custom narrowing of wide multiplies using wide multiply-add instructions.
411781ad6265SDimitry Andric //
411881ad6265SDimitry Andric // TODO: If the multiply is followed by an addition, we should attempt to
411981ad6265SDimitry Andric // integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
legalizeMul(LegalizerHelper & Helper,MachineInstr & MI) const412081ad6265SDimitry Andric bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
412181ad6265SDimitry Andric                                       MachineInstr &MI) const {
412281ad6265SDimitry Andric   assert(ST.hasMad64_32());
412381ad6265SDimitry Andric   assert(MI.getOpcode() == TargetOpcode::G_MUL);
412481ad6265SDimitry Andric 
412581ad6265SDimitry Andric   MachineIRBuilder &B = Helper.MIRBuilder;
412681ad6265SDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
412781ad6265SDimitry Andric 
412881ad6265SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
412981ad6265SDimitry Andric   Register Src0 = MI.getOperand(1).getReg();
413081ad6265SDimitry Andric   Register Src1 = MI.getOperand(2).getReg();
413181ad6265SDimitry Andric 
413281ad6265SDimitry Andric   LLT Ty = MRI.getType(DstReg);
413381ad6265SDimitry Andric   assert(Ty.isScalar());
413481ad6265SDimitry Andric 
413581ad6265SDimitry Andric   unsigned Size = Ty.getSizeInBits();
413681ad6265SDimitry Andric   unsigned NumParts = Size / 32;
413781ad6265SDimitry Andric   assert((Size % 32) == 0);
413881ad6265SDimitry Andric   assert(NumParts >= 2);
413981ad6265SDimitry Andric 
414081ad6265SDimitry Andric   // Whether to use MAD_64_32 for partial products whose high half is
414181ad6265SDimitry Andric   // discarded. This avoids some ADD instructions but risks false dependency
414281ad6265SDimitry Andric   // stalls on some subtargets in some cases.
414381ad6265SDimitry Andric   const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
414481ad6265SDimitry Andric 
414581ad6265SDimitry Andric   // Whether to compute odd-aligned partial products separately. This is
414681ad6265SDimitry Andric   // advisable on subtargets where the accumulator of MAD_64_32 must be placed
414781ad6265SDimitry Andric   // in an even-aligned VGPR.
414881ad6265SDimitry Andric   const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
414981ad6265SDimitry Andric 
415081ad6265SDimitry Andric   LLT S32 = LLT::scalar(32);
415181ad6265SDimitry Andric   SmallVector<Register, 2> Src0Parts, Src1Parts;
415281ad6265SDimitry Andric   for (unsigned i = 0; i < NumParts; ++i) {
415381ad6265SDimitry Andric     Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
415481ad6265SDimitry Andric     Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
415581ad6265SDimitry Andric   }
415681ad6265SDimitry Andric   B.buildUnmerge(Src0Parts, Src0);
415781ad6265SDimitry Andric   B.buildUnmerge(Src1Parts, Src1);
415881ad6265SDimitry Andric 
415981ad6265SDimitry Andric   SmallVector<Register, 2> AccumRegs(NumParts);
416081ad6265SDimitry Andric   buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
416181ad6265SDimitry Andric                 SeparateOddAlignedProducts);
416281ad6265SDimitry Andric 
4163bdd1243dSDimitry Andric   B.buildMergeLikeInstr(DstReg, AccumRegs);
416481ad6265SDimitry Andric   MI.eraseFromParent();
416581ad6265SDimitry Andric   return true;
416681ad6265SDimitry Andric }
416781ad6265SDimitry Andric 
4168349cc55cSDimitry Andric // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
4169349cc55cSDimitry Andric // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
4170349cc55cSDimitry Andric // case with a single min instruction instead of a compare+select.
legalizeCTLZ_CTTZ(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const4171349cc55cSDimitry Andric bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
4172349cc55cSDimitry Andric                                             MachineRegisterInfo &MRI,
4173349cc55cSDimitry Andric                                             MachineIRBuilder &B) const {
4174349cc55cSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
4175349cc55cSDimitry Andric   Register Src = MI.getOperand(1).getReg();
4176349cc55cSDimitry Andric   LLT DstTy = MRI.getType(Dst);
4177349cc55cSDimitry Andric   LLT SrcTy = MRI.getType(Src);
4178349cc55cSDimitry Andric 
4179349cc55cSDimitry Andric   unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
4180349cc55cSDimitry Andric                         ? AMDGPU::G_AMDGPU_FFBH_U32
4181349cc55cSDimitry Andric                         : AMDGPU::G_AMDGPU_FFBL_B32;
4182349cc55cSDimitry Andric   auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
4183349cc55cSDimitry Andric   B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
4184349cc55cSDimitry Andric 
4185349cc55cSDimitry Andric   MI.eraseFromParent();
4186349cc55cSDimitry Andric   return true;
4187349cc55cSDimitry Andric }
4188349cc55cSDimitry Andric 
legalizeCTLZ_ZERO_UNDEF(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const4189*0fca6ea1SDimitry Andric bool AMDGPULegalizerInfo::legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI,
4190*0fca6ea1SDimitry Andric                                                   MachineRegisterInfo &MRI,
4191*0fca6ea1SDimitry Andric                                                   MachineIRBuilder &B) const {
4192*0fca6ea1SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
4193*0fca6ea1SDimitry Andric   Register Src = MI.getOperand(1).getReg();
4194*0fca6ea1SDimitry Andric   LLT SrcTy = MRI.getType(Src);
4195*0fca6ea1SDimitry Andric   TypeSize NumBits = SrcTy.getSizeInBits();
4196*0fca6ea1SDimitry Andric 
4197*0fca6ea1SDimitry Andric   assert(NumBits < 32u);
4198*0fca6ea1SDimitry Andric 
4199*0fca6ea1SDimitry Andric   auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
4200*0fca6ea1SDimitry Andric   auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
4201*0fca6ea1SDimitry Andric   auto Shift = B.buildShl(S32, Extend, ShiftAmt);
4202*0fca6ea1SDimitry Andric   auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
4203*0fca6ea1SDimitry Andric   B.buildTrunc(Dst, Ctlz);
4204*0fca6ea1SDimitry Andric   MI.eraseFromParent();
4205*0fca6ea1SDimitry Andric   return true;
4206*0fca6ea1SDimitry Andric }
4207*0fca6ea1SDimitry Andric 
4208e8d8bef9SDimitry Andric // Check that this is a G_XOR x, -1
isNot(const MachineRegisterInfo & MRI,const MachineInstr & MI)4209e8d8bef9SDimitry Andric static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
4210e8d8bef9SDimitry Andric   if (MI.getOpcode() != TargetOpcode::G_XOR)
4211e8d8bef9SDimitry Andric     return false;
4212349cc55cSDimitry Andric   auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
4213e8d8bef9SDimitry Andric   return ConstVal && *ConstVal == -1;
4214e8d8bef9SDimitry Andric }
4215e8d8bef9SDimitry Andric 
42160b57cec5SDimitry Andric // Return the use branch instruction, otherwise null if the usage is invalid.
4217e8d8bef9SDimitry Andric static MachineInstr *
verifyCFIntrinsic(MachineInstr & MI,MachineRegisterInfo & MRI,MachineInstr * & Br,MachineBasicBlock * & UncondBrTarget,bool & Negated)4218e8d8bef9SDimitry Andric verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
4219e8d8bef9SDimitry Andric                   MachineBasicBlock *&UncondBrTarget, bool &Negated) {
42200b57cec5SDimitry Andric   Register CondDef = MI.getOperand(0).getReg();
42210b57cec5SDimitry Andric   if (!MRI.hasOneNonDBGUse(CondDef))
42220b57cec5SDimitry Andric     return nullptr;
42230b57cec5SDimitry Andric 
42245ffd83dbSDimitry Andric   MachineBasicBlock *Parent = MI.getParent();
4225e8d8bef9SDimitry Andric   MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
4226e8d8bef9SDimitry Andric 
4227e8d8bef9SDimitry Andric   if (isNot(MRI, *UseMI)) {
4228e8d8bef9SDimitry Andric     Register NegatedCond = UseMI->getOperand(0).getReg();
4229e8d8bef9SDimitry Andric     if (!MRI.hasOneNonDBGUse(NegatedCond))
4230e8d8bef9SDimitry Andric       return nullptr;
4231e8d8bef9SDimitry Andric 
4232e8d8bef9SDimitry Andric     // We're deleting the def of this value, so we need to remove it.
4233349cc55cSDimitry Andric     eraseInstr(*UseMI, MRI);
4234e8d8bef9SDimitry Andric 
4235e8d8bef9SDimitry Andric     UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
4236e8d8bef9SDimitry Andric     Negated = true;
4237e8d8bef9SDimitry Andric   }
4238e8d8bef9SDimitry Andric 
4239e8d8bef9SDimitry Andric   if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
4240480093f4SDimitry Andric     return nullptr;
4241480093f4SDimitry Andric 
42425ffd83dbSDimitry Andric   // Make sure the cond br is followed by a G_BR, or is the last instruction.
4243e8d8bef9SDimitry Andric   MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
42445ffd83dbSDimitry Andric   if (Next == Parent->end()) {
42455ffd83dbSDimitry Andric     MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
42465ffd83dbSDimitry Andric     if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
42475ffd83dbSDimitry Andric       return nullptr;
42485ffd83dbSDimitry Andric     UncondBrTarget = &*NextMBB;
42495ffd83dbSDimitry Andric   } else {
4250480093f4SDimitry Andric     if (Next->getOpcode() != AMDGPU::G_BR)
4251480093f4SDimitry Andric       return nullptr;
4252480093f4SDimitry Andric     Br = &*Next;
42535ffd83dbSDimitry Andric     UncondBrTarget = Br->getOperand(0).getMBB();
4254480093f4SDimitry Andric   }
4255480093f4SDimitry Andric 
4256e8d8bef9SDimitry Andric   return UseMI;
42570b57cec5SDimitry Andric }
42580b57cec5SDimitry Andric 
loadInputValue(Register DstReg,MachineIRBuilder & B,const ArgDescriptor * Arg,const TargetRegisterClass * ArgRC,LLT ArgTy) const42590b57cec5SDimitry Andric bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
4260e8d8bef9SDimitry Andric                                          const ArgDescriptor *Arg,
4261e8d8bef9SDimitry Andric                                          const TargetRegisterClass *ArgRC,
4262e8d8bef9SDimitry Andric                                          LLT ArgTy) const {
4263e8d8bef9SDimitry Andric   MCRegister SrcReg = Arg->getRegister();
4264e8d8bef9SDimitry Andric   assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
42655ffd83dbSDimitry Andric   assert(DstReg.isVirtual() && "Virtual register expected");
42660b57cec5SDimitry Andric 
426704eeddc0SDimitry Andric   Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
426804eeddc0SDimitry Andric                                              *ArgRC, B.getDebugLoc(), ArgTy);
42690b57cec5SDimitry Andric   if (Arg->isMasked()) {
42700b57cec5SDimitry Andric     // TODO: Should we try to emit this once in the entry block?
42710b57cec5SDimitry Andric     const LLT S32 = LLT::scalar(32);
42720b57cec5SDimitry Andric     const unsigned Mask = Arg->getMask();
427306c3fb27SDimitry Andric     const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
42740b57cec5SDimitry Andric 
42758bcb0991SDimitry Andric     Register AndMaskSrc = LiveIn;
42768bcb0991SDimitry Andric 
427704eeddc0SDimitry Andric     // TODO: Avoid clearing the high bits if we know workitem id y/z are always
427804eeddc0SDimitry Andric     // 0.
42798bcb0991SDimitry Andric     if (Shift != 0) {
42800b57cec5SDimitry Andric       auto ShiftAmt = B.buildConstant(S32, Shift);
42818bcb0991SDimitry Andric       AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
42828bcb0991SDimitry Andric     }
42838bcb0991SDimitry Andric 
42848bcb0991SDimitry Andric     B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
42855ffd83dbSDimitry Andric   } else {
42860b57cec5SDimitry Andric     B.buildCopy(DstReg, LiveIn);
42870b57cec5SDimitry Andric   }
42880b57cec5SDimitry Andric 
42890b57cec5SDimitry Andric   return true;
42900b57cec5SDimitry Andric }
42910b57cec5SDimitry Andric 
loadInputValue(Register DstReg,MachineIRBuilder & B,AMDGPUFunctionArgInfo::PreloadedValue ArgType) const4292e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::loadInputValue(
4293e8d8bef9SDimitry Andric     Register DstReg, MachineIRBuilder &B,
4294e8d8bef9SDimitry Andric     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4295e8d8bef9SDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4296b3edf446SDimitry Andric   const ArgDescriptor *Arg = nullptr;
4297e8d8bef9SDimitry Andric   const TargetRegisterClass *ArgRC;
4298e8d8bef9SDimitry Andric   LLT ArgTy;
4299b3edf446SDimitry Andric 
4300b3edf446SDimitry Andric   CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
4301b3edf446SDimitry Andric   const ArgDescriptor WorkGroupIDX =
4302b3edf446SDimitry Andric       ArgDescriptor::createRegister(AMDGPU::TTMP9);
4303b3edf446SDimitry Andric   // If GridZ is not programmed in an entry function then the hardware will set
4304b3edf446SDimitry Andric   // it to all zeros, so there is no need to mask the GridY value in the low
4305b3edf446SDimitry Andric   // order bits.
4306b3edf446SDimitry Andric   const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
4307b3edf446SDimitry Andric       AMDGPU::TTMP7,
4308b3edf446SDimitry Andric       AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
4309b3edf446SDimitry Andric   const ArgDescriptor WorkGroupIDZ =
4310b3edf446SDimitry Andric       ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
4311*0fca6ea1SDimitry Andric   if (ST.hasArchitectedSGPRs() &&
4312*0fca6ea1SDimitry Andric       (AMDGPU::isCompute(CC) || CC == CallingConv::AMDGPU_Gfx)) {
4313b3edf446SDimitry Andric     switch (ArgType) {
4314b3edf446SDimitry Andric     case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
4315b3edf446SDimitry Andric       Arg = &WorkGroupIDX;
4316b3edf446SDimitry Andric       ArgRC = &AMDGPU::SReg_32RegClass;
4317b3edf446SDimitry Andric       ArgTy = LLT::scalar(32);
4318b3edf446SDimitry Andric       break;
4319b3edf446SDimitry Andric     case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
4320b3edf446SDimitry Andric       Arg = &WorkGroupIDY;
4321b3edf446SDimitry Andric       ArgRC = &AMDGPU::SReg_32RegClass;
4322b3edf446SDimitry Andric       ArgTy = LLT::scalar(32);
4323b3edf446SDimitry Andric       break;
4324b3edf446SDimitry Andric     case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
4325b3edf446SDimitry Andric       Arg = &WorkGroupIDZ;
4326b3edf446SDimitry Andric       ArgRC = &AMDGPU::SReg_32RegClass;
4327b3edf446SDimitry Andric       ArgTy = LLT::scalar(32);
4328b3edf446SDimitry Andric       break;
4329b3edf446SDimitry Andric     default:
4330b3edf446SDimitry Andric       break;
4331b3edf446SDimitry Andric     }
4332b3edf446SDimitry Andric   }
4333b3edf446SDimitry Andric 
4334b3edf446SDimitry Andric   if (!Arg)
4335e8d8bef9SDimitry Andric     std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4336e8d8bef9SDimitry Andric 
4337349cc55cSDimitry Andric   if (!Arg) {
4338349cc55cSDimitry Andric     if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
4339349cc55cSDimitry Andric       // The intrinsic may appear when we have a 0 sized kernarg segment, in which
4340349cc55cSDimitry Andric       // case the pointer argument may be missing and we use null.
4341349cc55cSDimitry Andric       B.buildConstant(DstReg, 0);
4342349cc55cSDimitry Andric       return true;
4343349cc55cSDimitry Andric     }
4344349cc55cSDimitry Andric 
4345349cc55cSDimitry Andric     // It's undefined behavior if a function marked with the amdgpu-no-*
4346349cc55cSDimitry Andric     // attributes uses the corresponding intrinsic.
4347349cc55cSDimitry Andric     B.buildUndef(DstReg);
4348349cc55cSDimitry Andric     return true;
4349349cc55cSDimitry Andric   }
4350349cc55cSDimitry Andric 
4351e8d8bef9SDimitry Andric   if (!Arg->isRegister() || !Arg->getRegister().isValid())
4352e8d8bef9SDimitry Andric     return false; // TODO: Handle these
4353e8d8bef9SDimitry Andric   return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
4354e8d8bef9SDimitry Andric }
4355e8d8bef9SDimitry Andric 
legalizePreloadedArgIntrin(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B,AMDGPUFunctionArgInfo::PreloadedValue ArgType) const43560b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
43575ffd83dbSDimitry Andric     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
43580b57cec5SDimitry Andric     AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4359e8d8bef9SDimitry Andric   if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
43605ffd83dbSDimitry Andric     return false;
43615ffd83dbSDimitry Andric 
43620b57cec5SDimitry Andric   MI.eraseFromParent();
43630b57cec5SDimitry Andric   return true;
43640b57cec5SDimitry Andric }
43650b57cec5SDimitry Andric 
replaceWithConstant(MachineIRBuilder & B,MachineInstr & MI,int64_t C)436681ad6265SDimitry Andric static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
436781ad6265SDimitry Andric                                 int64_t C) {
436881ad6265SDimitry Andric   B.buildConstant(MI.getOperand(0).getReg(), C);
436981ad6265SDimitry Andric   MI.eraseFromParent();
437081ad6265SDimitry Andric   return true;
437181ad6265SDimitry Andric }
437281ad6265SDimitry Andric 
legalizeWorkitemIDIntrinsic(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B,unsigned Dim,AMDGPUFunctionArgInfo::PreloadedValue ArgType) const437381ad6265SDimitry Andric bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
437481ad6265SDimitry Andric     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
437581ad6265SDimitry Andric     unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
437681ad6265SDimitry Andric   unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
437781ad6265SDimitry Andric   if (MaxID == 0)
437881ad6265SDimitry Andric     return replaceWithConstant(B, MI, 0);
437981ad6265SDimitry Andric 
438081ad6265SDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
438181ad6265SDimitry Andric   const ArgDescriptor *Arg;
438281ad6265SDimitry Andric   const TargetRegisterClass *ArgRC;
438381ad6265SDimitry Andric   LLT ArgTy;
438481ad6265SDimitry Andric   std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
438581ad6265SDimitry Andric 
438681ad6265SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
438781ad6265SDimitry Andric   if (!Arg) {
438881ad6265SDimitry Andric     // It's undefined behavior if a function marked with the amdgpu-no-*
438981ad6265SDimitry Andric     // attributes uses the corresponding intrinsic.
439081ad6265SDimitry Andric     B.buildUndef(DstReg);
439181ad6265SDimitry Andric     MI.eraseFromParent();
439281ad6265SDimitry Andric     return true;
439381ad6265SDimitry Andric   }
439481ad6265SDimitry Andric 
439581ad6265SDimitry Andric   if (Arg->isMasked()) {
439681ad6265SDimitry Andric     // Don't bother inserting AssertZext for packed IDs since we're emitting the
439781ad6265SDimitry Andric     // masking operations anyway.
439881ad6265SDimitry Andric     //
439981ad6265SDimitry Andric     // TODO: We could assert the top bit is 0 for the source copy.
440081ad6265SDimitry Andric     if (!loadInputValue(DstReg, B, ArgType))
440181ad6265SDimitry Andric       return false;
440281ad6265SDimitry Andric   } else {
440381ad6265SDimitry Andric     Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
440481ad6265SDimitry Andric     if (!loadInputValue(TmpReg, B, ArgType))
440581ad6265SDimitry Andric       return false;
4406bdd1243dSDimitry Andric     B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID));
440781ad6265SDimitry Andric   }
440881ad6265SDimitry Andric 
440981ad6265SDimitry Andric   MI.eraseFromParent();
441081ad6265SDimitry Andric   return true;
441181ad6265SDimitry Andric }
441281ad6265SDimitry Andric 
getKernargParameterPtr(MachineIRBuilder & B,int64_t Offset) const441381ad6265SDimitry Andric Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
441481ad6265SDimitry Andric                                                      int64_t Offset) const {
441581ad6265SDimitry Andric   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
441681ad6265SDimitry Andric   Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
441781ad6265SDimitry Andric 
441881ad6265SDimitry Andric   // TODO: If we passed in the base kernel offset we could have a better
441981ad6265SDimitry Andric   // alignment than 4, but we don't really need it.
442081ad6265SDimitry Andric   if (!loadInputValue(KernArgReg, B,
442181ad6265SDimitry Andric                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
442281ad6265SDimitry Andric     llvm_unreachable("failed to find kernarg segment ptr");
442381ad6265SDimitry Andric 
442481ad6265SDimitry Andric   auto COffset = B.buildConstant(LLT::scalar(64), Offset);
442581ad6265SDimitry Andric   // TODO: Should get nuw
442681ad6265SDimitry Andric   return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
442781ad6265SDimitry Andric }
442881ad6265SDimitry Andric 
/// Legalize a value that's loaded from kernel arguments. This is only used by
/// legacy intrinsics.
///
/// Replaces \p MI with a 32-bit invariant load from the kernarg segment at
/// \p Offset and erases the original instruction. Always returns true.
bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
                                                      MachineIRBuilder &B,
                                                      uint64_t Offset,
                                                      Align Alignment) const {
  Register DstReg = MI.getOperand(0).getReg();

  // Only 32-bit kernarg parameters are handled by this path.
  assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
         "unexpected kernarg parameter type");

  Register Ptr = getKernargParameterPtr(B, Offset);
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  // NOTE(review): the Alignment parameter is unused; the load is always
  // emitted with Align(4) — confirm whether callers ever pass a larger
  // alignment that could be used here.
  B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
              MachineMemOperand::MODereferenceable |
                  MachineMemOperand::MOInvariant);
  MI.eraseFromParent();
  return true;
}
444881ad6265SDimitry Andric 
legalizeFDIV(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const44498bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
44508bcb0991SDimitry Andric                                        MachineRegisterInfo &MRI,
44518bcb0991SDimitry Andric                                        MachineIRBuilder &B) const {
4452480093f4SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
4453480093f4SDimitry Andric   LLT DstTy = MRI.getType(Dst);
4454480093f4SDimitry Andric   LLT S16 = LLT::scalar(16);
4455480093f4SDimitry Andric   LLT S32 = LLT::scalar(32);
4456480093f4SDimitry Andric   LLT S64 = LLT::scalar(64);
44578bcb0991SDimitry Andric 
4458480093f4SDimitry Andric   if (DstTy == S16)
4459480093f4SDimitry Andric     return legalizeFDIV16(MI, MRI, B);
4460480093f4SDimitry Andric   if (DstTy == S32)
4461480093f4SDimitry Andric     return legalizeFDIV32(MI, MRI, B);
4462480093f4SDimitry Andric   if (DstTy == S64)
4463480093f4SDimitry Andric     return legalizeFDIV64(MI, MRI, B);
4464480093f4SDimitry Andric 
44658bcb0991SDimitry Andric   return false;
44668bcb0991SDimitry Andric }
44678bcb0991SDimitry Andric 
/// Emit the 32-bit unsigned division/remainder expansion. Either of
/// \p DstDivReg / \p DstRemReg may be a null register, in which case that
/// result is not written (the corresponding select is skipped).
void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
                                                        Register DstDivReg,
                                                        Register DstRemReg,
                                                        Register X,
                                                        Register Y) const {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);

  // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
  // algorithm used here.

  // Initial estimate of inv(y). 0x4f7ffffe is a float slightly below 2**32,
  // scaling the rcp result into unsigned fixed-point range.
  auto FloatY = B.buildUITOFP(S32, Y);
  auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
  auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
  auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
  auto Z = B.buildFPTOUI(S32, ScaledY);

  // One round of UNR (Newton-Raphson) refinement of the integer reciprocal.
  auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
  auto NegYZ = B.buildMul(S32, NegY, Z);
  Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));

  // Quotient/remainder estimate: q = hi(x * z), r = x - q * y.
  auto Q = B.buildUMulH(S32, X, Z);
  auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));

  // First quotient/remainder refinement: if r >= y, bump q and reduce r.
  auto One = B.buildConstant(S32, 1);
  auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
  if (DstDivReg)
    Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
  R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);

  // Second quotient/remainder refinement, writing the final results.
  Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
  if (DstDivReg)
    B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);

  if (DstRemReg)
    B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
}
45105ffd83dbSDimitry Andric 
// Build integer reciprocal sequence around V_RCP_IFLAG_F32
//
// Return lo, hi of result
//
// %cvt.lo = G_UITOFP Val.lo
// %cvt.hi = G_UITOFP Val.hi
// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
// %rcp = G_AMDGPU_RCP_IFLAG %mad
// %mul1 = G_FMUL %rcp, 0x5f7ffffc
// %mul2 = G_FMUL %mul1, 2**(-32)
// %trunc = G_INTRINSIC_TRUNC %mul2
// %mad2 = G_FMAD %trunc, -(2**32), %mul1
// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
                                                       Register Val) {
  const LLT S32 = LLT::scalar(32);
  // Split the 64-bit input into 32-bit halves.
  auto Unmerge = B.buildUnmerge(S32, Val);

  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));

  // Recombine as a float: hi * 2**32 + lo (0x4f800000 == 2**32 as f32).
  auto Mad = B.buildFMAD(
      S32, CvtHi, // 2**32
      B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);

  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
  // 0x5f7ffffc is a float slightly below 2**64, scaling the reciprocal into
  // fixed-point range.
  auto Mul1 = B.buildFMul(
      S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc)));

  // 2**(-32)
  auto Mul2 = B.buildFMul(
      S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000)));
  auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);

  // -(2**32)
  auto Mad2 = B.buildFMAD(
      S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)),
      Mul1);

  // Low half comes from the residue (mad2), high half from the truncation.
  auto ResultLo = B.buildFPTOUI(S32, Mad2);
  auto ResultHi = B.buildFPTOUI(S32, Trunc);

  return {ResultLo.getReg(0), ResultHi.getReg(0)};
}
45555ffd83dbSDimitry Andric 
/// Emit the 64-bit unsigned division/remainder expansion. Either of
/// \p DstDivReg / \p DstRemReg may be a null register, in which case that
/// result is not written.
void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
                                                        Register DstDivReg,
                                                        Register DstRemReg,
                                                        Register Numer,
                                                        Register Denom) const {
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S1 = LLT::scalar(1);
  Register RcpLo, RcpHi;

  // Fixed-point reciprocal estimate of the denominator (as two 32-bit
  // halves), produced by the V_RCP_IFLAG_F32-based sequence above.
  std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);

  auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});

  auto Zero64 = B.buildConstant(S64, 0);
  auto NegDenom = B.buildSub(S64, Zero64, Denom);

  // First refinement round of the reciprocal, carried out in 32-bit pieces
  // with explicit carry propagation (uaddo/uadde).
  auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
  auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);

  auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
  Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
  Register MulHi1_Hi = UnmergeMulHi1.getReg(1);

  auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
  auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
  auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});

  // Second refinement round.
  auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
  auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
  auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
  Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
  Register MulHi2_Hi = UnmergeMulHi2.getReg(1);

  auto Zero32 = B.buildConstant(S32, 0);
  auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
  auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
  auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});

  auto UnmergeNumer = B.buildUnmerge(S32, Numer);
  Register NumerLo = UnmergeNumer.getReg(0);
  Register NumerHi = UnmergeNumer.getReg(1);

  // Initial quotient estimate (MulHi3) and its remainder:
  // Sub1 = Numer - Denom * MulHi3.
  auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
  auto Mul3 = B.buildMul(S64, Denom, MulHi3);
  auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
  Register Mul3_Lo = UnmergeMul3.getReg(0);
  Register Mul3_Hi = UnmergeMul3.getReg(1);
  auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
  auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
  // Sub1_Mi is the high half computed *without* the low borrow; the borrow
  // is folded in later when Sub2/Sub3 are formed below.
  auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
  auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});

  auto UnmergeDenom = B.buildUnmerge(S32, Denom);
  Register DenomLo = UnmergeDenom.getReg(0);
  Register DenomHi = UnmergeDenom.getReg(1);

  // C3 != 0 iff Sub1 >= Denom: a 64-bit unsigned compare assembled from
  // 32-bit piecewise compares (use the low compare when the highs are equal).
  auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
  auto C1 = B.buildSExt(S32, CmpHi);

  auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
  auto C2 = B.buildSExt(S32, CmpLo);

  auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
  auto C3 = B.buildSelect(S32, CmpEq, C2, C1);

  // TODO: Here and below portions of the code can be enclosed into if/endif.
  // Currently control flow is unconditional and we have 4 selects after
  // potential endif to substitute PHIs.

  // if C3 != 0 ...
  // First correction step: Sub2 = Sub1 - Denom, Add3 = quotient + 1.
  auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
  auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
  auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
  auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});

  auto One64 = B.buildConstant(S64, 1);
  auto Add3 = B.buildAdd(S64, MulHi3, One64);

  // C6 != 0 iff Sub2 >= Denom, i.e. a second correction step is needed.
  auto C4 =
      B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
  auto C5 =
      B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
  auto C6 = B.buildSelect(
      S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);

  // if (C6 != 0)
  // Second correction step: Sub3 = Sub2 - Denom, Add4 = quotient + 2.
  auto Add4 = B.buildAdd(S64, Add3, One64);
  auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);

  auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
  auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
  auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});

  // endif C6
  // endif C3

  // Select the final quotient/remainder according to how many correction
  // steps applied (0, 1, or 2).
  if (DstDivReg) {
    auto Sel1 = B.buildSelect(
        S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
    B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
                  Sel1, MulHi3);
  }

  if (DstRemReg) {
    auto Sel2 = B.buildSelect(
        S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
    B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
                  Sel2, Sub1);
  }
}
46675ffd83dbSDimitry Andric 
legalizeUnsignedDIV_REM(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const4668fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
46695ffd83dbSDimitry Andric                                                   MachineRegisterInfo &MRI,
46705ffd83dbSDimitry Andric                                                   MachineIRBuilder &B) const {
4671fe6060f1SDimitry Andric   Register DstDivReg, DstRemReg;
4672fe6060f1SDimitry Andric   switch (MI.getOpcode()) {
4673fe6060f1SDimitry Andric   default:
4674fe6060f1SDimitry Andric     llvm_unreachable("Unexpected opcode!");
4675fe6060f1SDimitry Andric   case AMDGPU::G_UDIV: {
4676fe6060f1SDimitry Andric     DstDivReg = MI.getOperand(0).getReg();
4677fe6060f1SDimitry Andric     break;
4678fe6060f1SDimitry Andric   }
4679fe6060f1SDimitry Andric   case AMDGPU::G_UREM: {
4680fe6060f1SDimitry Andric     DstRemReg = MI.getOperand(0).getReg();
4681fe6060f1SDimitry Andric     break;
4682fe6060f1SDimitry Andric   }
4683fe6060f1SDimitry Andric   case AMDGPU::G_UDIVREM: {
4684fe6060f1SDimitry Andric     DstDivReg = MI.getOperand(0).getReg();
4685fe6060f1SDimitry Andric     DstRemReg = MI.getOperand(1).getReg();
4686fe6060f1SDimitry Andric     break;
4687fe6060f1SDimitry Andric   }
4688fe6060f1SDimitry Andric   }
4689fe6060f1SDimitry Andric 
46905ffd83dbSDimitry Andric   const LLT S64 = LLT::scalar(64);
46915ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
4692fe6060f1SDimitry Andric   const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
4693fe6060f1SDimitry Andric   Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
4694fe6060f1SDimitry Andric   Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
4695fe6060f1SDimitry Andric   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
46965ffd83dbSDimitry Andric 
46975ffd83dbSDimitry Andric   if (Ty == S32)
4698fe6060f1SDimitry Andric     legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
46995ffd83dbSDimitry Andric   else if (Ty == S64)
4700fe6060f1SDimitry Andric     legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
47015ffd83dbSDimitry Andric   else
47025ffd83dbSDimitry Andric     return false;
47035ffd83dbSDimitry Andric 
47045ffd83dbSDimitry Andric   MI.eraseFromParent();
47055ffd83dbSDimitry Andric   return true;
47065ffd83dbSDimitry Andric }
47075ffd83dbSDimitry Andric 
/// Legalize G_SDIV / G_SREM / G_SDIVREM by reducing to the unsigned
/// expansion: take absolute values of both operands, divide unsigned into
/// temporaries, then restore the correct signs on the results.
bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
                                                MachineRegisterInfo &MRI,
                                                MachineIRBuilder &B) const {
  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  // Only 32- and 64-bit scalar divisions are expanded here.
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  if (Ty != S32 && Ty != S64)
    return false;

  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
  Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
  Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();

  // Sign masks via arithmetic shift: all-ones if negative, zero otherwise.
  auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
  auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
  auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);

  // abs(x) computed branchlessly as (x + sign) ^ sign.
  LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);

  LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);

  // Decide which results this opcode produces; unsigned results land in
  // temporaries before sign restoration.
  Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unexpected opcode!");
  case AMDGPU::G_SDIV: {
    DstDivReg = MI.getOperand(0).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);
    break;
  }
  case AMDGPU::G_SREM: {
    DstRemReg = MI.getOperand(0).getReg();
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);
    break;
  }
  case AMDGPU::G_SDIVREM: {
    DstDivReg = MI.getOperand(0).getReg();
    DstRemReg = MI.getOperand(1).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);
    break;
  }
  }

  if (Ty == S32)
    legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
  else
    legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);

  // Quotient is negative iff operand signs differ; conditional negation is
  // done branchlessly as (x ^ sign) - sign.
  if (DstDivReg) {
    auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
    auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
    B.buildSub(DstDivReg, SignXor, Sign);
  }

  if (DstRemReg) {
    auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
    auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
    B.buildSub(DstRemReg, SignXor, Sign);
  }

  MI.eraseFromParent();
  return true;
}
47755ffd83dbSDimitry Andric 
/// Try to lower G_FDIV to an rcp-based multiply. Returns false when the
/// required fast-math flags are absent, in which case the caller falls back
/// to the precise expansion.
bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
  uint16_t Flags = MI.getFlags();
  LLT ResTy = MRI.getType(Res);

  const MachineFunction &MF = B.getMF();
  // 'afn' on the instruction or global unsafe-fp-math permits the
  // inaccurate rcp-based lowering.
  bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn) ||
                            MF.getTarget().Options.UnsafeFPMath;

  // Special-case a constant numerator of +/-1.0, which maps directly to rcp.
  if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
    if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
      return false;

    // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
    // the CI documentation has a worst case error of 1 ulp.
    // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
    // use it as long as we aren't trying to use denormals.
    //
    // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.

    // 1 / x -> RCP(x)
    if (CLHS->isExactlyValue(1.0)) {
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
          .addUse(RHS)
          .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }

    // -1 / x -> RCP( FNEG(x) )
    if (CLHS->isExactlyValue(-1.0)) {
      auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
          .addUse(FNeg.getReg(0))
          .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }
  }

  // For f16 require afn or arcp.
  // For f32 require afn.
  if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
                              !MI.getFlag(MachineInstr::FmArcp)))
    return false;

  // x / y -> x * (1.0 / y)
  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
                 .addUse(RHS)
                 .setMIFlags(Flags);
  B.buildFMul(Res, LHS, RCP, Flags);

  MI.eraseFromParent();
  return true;
}
48378bcb0991SDimitry Andric 
legalizeFastUnsafeFDIV64(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const4838e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
4839e8d8bef9SDimitry Andric                                                    MachineRegisterInfo &MRI,
4840e8d8bef9SDimitry Andric                                                    MachineIRBuilder &B) const {
4841e8d8bef9SDimitry Andric   Register Res = MI.getOperand(0).getReg();
4842e8d8bef9SDimitry Andric   Register X = MI.getOperand(1).getReg();
4843e8d8bef9SDimitry Andric   Register Y = MI.getOperand(2).getReg();
4844e8d8bef9SDimitry Andric   uint16_t Flags = MI.getFlags();
4845e8d8bef9SDimitry Andric   LLT ResTy = MRI.getType(Res);
4846e8d8bef9SDimitry Andric 
4847e8d8bef9SDimitry Andric   const MachineFunction &MF = B.getMF();
4848e8d8bef9SDimitry Andric   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
4849e8d8bef9SDimitry Andric                             MI.getFlag(MachineInstr::FmAfn);
4850e8d8bef9SDimitry Andric 
4851e8d8bef9SDimitry Andric   if (!AllowInaccurateRcp)
48528bcb0991SDimitry Andric     return false;
4853e8d8bef9SDimitry Andric 
4854e8d8bef9SDimitry Andric   auto NegY = B.buildFNeg(ResTy, Y);
4855e8d8bef9SDimitry Andric   auto One = B.buildFConstant(ResTy, 1.0);
4856e8d8bef9SDimitry Andric 
48575f757f3fSDimitry Andric   auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
4858e8d8bef9SDimitry Andric                .addUse(Y)
4859e8d8bef9SDimitry Andric                .setMIFlags(Flags);
4860e8d8bef9SDimitry Andric 
4861e8d8bef9SDimitry Andric   auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
4862e8d8bef9SDimitry Andric   R = B.buildFMA(ResTy, Tmp0, R, R);
4863e8d8bef9SDimitry Andric 
4864e8d8bef9SDimitry Andric   auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
4865e8d8bef9SDimitry Andric   R = B.buildFMA(ResTy, Tmp1, R, R);
4866e8d8bef9SDimitry Andric 
4867e8d8bef9SDimitry Andric   auto Ret = B.buildFMul(ResTy, X, R);
4868e8d8bef9SDimitry Andric   auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
4869e8d8bef9SDimitry Andric 
4870e8d8bef9SDimitry Andric   B.buildFMA(Res, Tmp2, R, Ret);
4871e8d8bef9SDimitry Andric   MI.eraseFromParent();
4872e8d8bef9SDimitry Andric   return true;
48738bcb0991SDimitry Andric }
48748bcb0991SDimitry Andric 
/// Legalize an f16 G_FDIV: try the fast rcp-based path first; otherwise
/// compute the quotient in f32 precision and post-process the truncated
/// result with amdgcn.div.fixup.
bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  if (legalizeFastUnsafeFDIV(MI, MRI, B))
    return true;

  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S16 = LLT::scalar(16);
  LLT S32 = LLT::scalar(32);

  // Promote both operands to f32 and form the quotient via a 32-bit rcp.
  auto LHSExt = B.buildFPExt(S32, LHS, Flags);
  auto RHSExt = B.buildFPExt(S32, RHS, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                 .addUse(RHSExt.getReg(0))
                 .setMIFlags(Flags);

  auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
  auto RDst = B.buildFPTrunc(S16, QUOT, Flags);

  // div.fixup takes the estimate plus the original f16 operands; presumably
  // it corrects special cases (inf/NaN/denormal) — confirm against ISA docs.
  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
      .addUse(RDst.getReg(0))
      .addUse(RHS)
      .addUse(LHS)
      .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}
4909480093f4SDimitry Andric 
// Encodes hwreg(HW_REG_MODE, offset 4, width 2): the FP32 denorm-mode bits
// within the MODE register, used as the S_SETREG destination field below.
static constexpr unsigned SPDenormModeBitField =
    AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 4, 2);

// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
// to enable denorm mode. When 'Enable' is false, disable denorm mode.
static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B,
                               const GCNSubtarget &ST,
                               SIModeRegisterDefaults Mode) {
  // Set SP denorm mode to this value.
  unsigned SPDenormMode =
    Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();

  if (ST.hasDenormModeInst()) {
    // Preserve default FP64FP16 denorm mode while updating FP32 mode.
    uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();

    // FP32 bits occupy the low two bits; DP/FP16 bits go in bits [3:2].
    uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
    B.buildInstr(AMDGPU::S_DENORM_MODE)
      .addImm(NewDenormModeValue);

  } else {
    // Targets without S_DENORM_MODE update the MODE register via S_SETREG.
    B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
      .addImm(SPDenormMode)
      .addImm(SPDenormModeBitField);
  }
}
4936480093f4SDimitry Andric 
// Lower f32 G_FDIV to the full sequence built around amdgcn.div.scale /
// amdgcn.div.fmas / amdgcn.div.fixup, temporarily enabling FP32 denormals
// around the FMA refinement steps when the function's mode flushes them.
bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  // Prefer the cheap approximate expansion when fast-math flags allow it.
  if (legalizeFastUnsafeFDIV(MI, MRI, B))
    return true;

  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  SIModeRegisterDefaults Mode = MFI->getMode();

  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S32, 1.0f);

  // div.scale pre-scales the operands into a safe range; each call also
  // produces an S1 flag (second result), and the numerator's flag is fed to
  // div.fmas below.
  auto DenominatorScaled =
      B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
          .addUse(LHS)
          .addUse(RHS)
          .addImm(0)
          .setMIFlags(Flags);
  auto NumeratorScaled =
      B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
          .addUse(LHS)
          .addUse(RHS)
          .addImm(1)
          .setMIFlags(Flags);

  // Initial reciprocal estimate of the scaled denominator.
  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                       .addUse(DenominatorScaled.getReg(0))
                       .setMIFlags(Flags);
  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);

  const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE();
  const bool HasDynamicDenormals =
      (Mode.FP32Denormals.Input == DenormalMode::Dynamic) ||
      (Mode.FP32Denormals.Output == DenormalMode::Dynamic);

  // If denormals are not already IEEE, turn them on for the refinement; with a
  // dynamic mode, first save the current field so it can be restored exactly.
  Register SavedSPDenormMode;
  if (!PreservesDenormals) {
    if (HasDynamicDenormals) {
      SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      B.buildInstr(AMDGPU::S_GETREG_B32)
          .addDef(SavedSPDenormMode)
          .addImm(SPDenormModeBitField);
    }
    toggleSPDenormMode(true, B, ST, Mode);
  }

  // FMA refinement of the reciprocal (Fma0/Fma1) and then of the quotient
  // (Mul..Fma4), matching the standard div.fmas input pattern.
  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

  // Restore the previous denormal mode (exactly, when it was dynamic).
  if (!PreservesDenormals) {
    if (HasDynamicDenormals) {
      assert(SavedSPDenormMode);
      B.buildInstr(AMDGPU::S_SETREG_B32)
          .addReg(SavedSPDenormMode)
          .addImm(SPDenormModeBitField);
    } else
      toggleSPDenormMode(false, B, ST, Mode);
  }

  // div.fmas combines the refined terms, steered by the div.scale flag.
  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
                  .addUse(Fma4.getReg(0))
                  .addUse(Fma1.getReg(0))
                  .addUse(Fma3.getReg(0))
                  .addUse(NumeratorScaled.getReg(1))
                  .setMIFlags(Flags);

  // Final fixup against the original (unscaled) operands.
  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
      .addUse(Fmas.getReg(0))
      .addUse(RHS)
      .addUse(LHS)
      .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}
5023480093f4SDimitry Andric 
// Lower f64 G_FDIV via amdgcn.div.scale / amdgcn.div.fmas / amdgcn.div.fixup
// with Newton-Raphson FMA refinement of the reciprocal and quotient.
bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  // Prefer the cheap approximate expansion when fast-math flags allow it.
  if (legalizeFastUnsafeFDIV64(MI, MRI, B))
    return true;

  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S64 = LLT::scalar(64);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S64, 1.0);

  // Scale the denominator (imm 0 selects the denominator variant).
  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
                       .addUse(LHS)
                       .addUse(RHS)
                       .addImm(0)
                       .setMIFlags(Flags);

  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);

  // Initial reciprocal estimate of the scaled denominator.
  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
                 .addUse(DivScale0.getReg(0))
                 .setMIFlags(Flags);

  // Refine the reciprocal with FMA steps.
  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);

  // Scale the numerator (imm 1 selects the numerator variant).
  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
                       .addUse(LHS)
                       .addUse(RHS)
                       .addImm(1)
                       .setMIFlags(Flags);

  // Form the quotient estimate and its residual correction term.
  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);

  Register Scale;
  if (!ST.hasUsableDivScaleConditionOutput()) {
    // Workaround a hardware bug on SI where the condition output from div_scale
    // is not usable.

    LLT S32 = LLT::scalar(32);

    // Reconstruct the flag from the high 32-bit halves: compare each input's
    // high half against the corresponding scaled value's high half, then XOR.
    auto NumUnmerge = B.buildUnmerge(S32, LHS);
    auto DenUnmerge = B.buildUnmerge(S32, RHS);
    auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
    auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);

    auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
                              Scale1Unmerge.getReg(1));
    auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
                              Scale0Unmerge.getReg(1));
    Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
  } else {
    Scale = DivScale1.getReg(1);
  }

  // div.fmas combines the refined terms, steered by the (possibly
  // reconstructed) div.scale flag.
  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
                  .addUse(Fma4.getReg(0))
                  .addUse(Fma3.getReg(0))
                  .addUse(Mul.getReg(0))
                  .addUse(Scale)
                  .setMIFlags(Flags);

  // Final fixup against the original (unscaled) operands.
  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
      .addUse(Fmas.getReg(0))
      .addUse(RHS)
      .addUse(LHS)
      .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}
5104480093f4SDimitry Andric 
legalizeFFREXP(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const510506c3fb27SDimitry Andric bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI,
510606c3fb27SDimitry Andric                                          MachineRegisterInfo &MRI,
510706c3fb27SDimitry Andric                                          MachineIRBuilder &B) const {
510806c3fb27SDimitry Andric   Register Res0 = MI.getOperand(0).getReg();
510906c3fb27SDimitry Andric   Register Res1 = MI.getOperand(1).getReg();
511006c3fb27SDimitry Andric   Register Val = MI.getOperand(2).getReg();
511106c3fb27SDimitry Andric   uint16_t Flags = MI.getFlags();
511206c3fb27SDimitry Andric 
511306c3fb27SDimitry Andric   LLT Ty = MRI.getType(Res0);
511406c3fb27SDimitry Andric   LLT InstrExpTy = Ty == LLT::scalar(16) ? LLT::scalar(16) : LLT::scalar(32);
511506c3fb27SDimitry Andric 
51165f757f3fSDimitry Andric   auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
511706c3fb27SDimitry Andric                   .addUse(Val)
511806c3fb27SDimitry Andric                   .setMIFlags(Flags);
51195f757f3fSDimitry Andric   auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
512006c3fb27SDimitry Andric                  .addUse(Val)
512106c3fb27SDimitry Andric                  .setMIFlags(Flags);
512206c3fb27SDimitry Andric 
512306c3fb27SDimitry Andric   if (ST.hasFractBug()) {
512406c3fb27SDimitry Andric     auto Fabs = B.buildFAbs(Ty, Val);
512506c3fb27SDimitry Andric     auto Inf = B.buildFConstant(Ty, APFloat::getInf(getFltSemanticForLLT(Ty)));
512606c3fb27SDimitry Andric     auto IsFinite =
512706c3fb27SDimitry Andric         B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
512806c3fb27SDimitry Andric     auto Zero = B.buildConstant(InstrExpTy, 0);
512906c3fb27SDimitry Andric     Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
513006c3fb27SDimitry Andric     Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
513106c3fb27SDimitry Andric   }
513206c3fb27SDimitry Andric 
513306c3fb27SDimitry Andric   B.buildCopy(Res0, Mant);
513406c3fb27SDimitry Andric   B.buildSExtOrTrunc(Res1, Exp);
513506c3fb27SDimitry Andric 
513606c3fb27SDimitry Andric   MI.eraseFromParent();
513706c3fb27SDimitry Andric   return true;
513806c3fb27SDimitry Andric }
513906c3fb27SDimitry Andric 
legalizeFDIVFastIntrin(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const51408bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
51418bcb0991SDimitry Andric                                                  MachineRegisterInfo &MRI,
51428bcb0991SDimitry Andric                                                  MachineIRBuilder &B) const {
51438bcb0991SDimitry Andric   Register Res = MI.getOperand(0).getReg();
51448bcb0991SDimitry Andric   Register LHS = MI.getOperand(2).getReg();
51458bcb0991SDimitry Andric   Register RHS = MI.getOperand(3).getReg();
51468bcb0991SDimitry Andric   uint16_t Flags = MI.getFlags();
51478bcb0991SDimitry Andric 
51488bcb0991SDimitry Andric   LLT S32 = LLT::scalar(32);
51498bcb0991SDimitry Andric   LLT S1 = LLT::scalar(1);
51508bcb0991SDimitry Andric 
51518bcb0991SDimitry Andric   auto Abs = B.buildFAbs(S32, RHS, Flags);
51528bcb0991SDimitry Andric   const APFloat C0Val(1.0f);
51538bcb0991SDimitry Andric 
515406c3fb27SDimitry Andric   auto C0 = B.buildFConstant(S32, 0x1p+96f);
515506c3fb27SDimitry Andric   auto C1 = B.buildFConstant(S32, 0x1p-32f);
515606c3fb27SDimitry Andric   auto C2 = B.buildFConstant(S32, 1.0f);
51578bcb0991SDimitry Andric 
51588bcb0991SDimitry Andric   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
51598bcb0991SDimitry Andric   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
51608bcb0991SDimitry Andric 
51618bcb0991SDimitry Andric   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
51628bcb0991SDimitry Andric 
51635f757f3fSDimitry Andric   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
51648bcb0991SDimitry Andric                  .addUse(Mul0.getReg(0))
51658bcb0991SDimitry Andric                  .setMIFlags(Flags);
51668bcb0991SDimitry Andric 
51678bcb0991SDimitry Andric   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
51688bcb0991SDimitry Andric 
51698bcb0991SDimitry Andric   B.buildFMul(Res, Sel, Mul1, Flags);
51708bcb0991SDimitry Andric 
51718bcb0991SDimitry Andric   MI.eraseFromParent();
51728bcb0991SDimitry Andric   return true;
51738bcb0991SDimitry Andric }
51748bcb0991SDimitry Andric 
legalizeFSQRTF16(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const51755f757f3fSDimitry Andric bool AMDGPULegalizerInfo::legalizeFSQRTF16(MachineInstr &MI,
51765f757f3fSDimitry Andric                                            MachineRegisterInfo &MRI,
51775f757f3fSDimitry Andric                                            MachineIRBuilder &B) const {
51785f757f3fSDimitry Andric   // Bypass the correct expansion a standard promotion through G_FSQRT would
51795f757f3fSDimitry Andric   // get. The f32 op is accurate enough for the f16 cas.
51805f757f3fSDimitry Andric   unsigned Flags = MI.getFlags();
51815f757f3fSDimitry Andric   assert(!ST.has16BitInsts());
51825f757f3fSDimitry Andric   const LLT F32 = LLT::scalar(32);
51835f757f3fSDimitry Andric   auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
51845f757f3fSDimitry Andric   auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
51855f757f3fSDimitry Andric     .addUse(Ext.getReg(0))
51865f757f3fSDimitry Andric     .setMIFlags(Flags);
51875f757f3fSDimitry Andric   B.buildFPTrunc(MI.getOperand(0), Log2, Flags);
51885f757f3fSDimitry Andric   MI.eraseFromParent();
51895f757f3fSDimitry Andric   return true;
51905f757f3fSDimitry Andric }
51915f757f3fSDimitry Andric 
// Lower f32 G_FSQRT: scale tiny inputs up by 2^32, compute sqrt (with a
// +/-1-ulp correction when denormal handling is required, or an rsq-based
// FMA refinement otherwise), then scale back down by 2^-16 and pass
// zero/+inf inputs through unchanged.
bool AMDGPULegalizerInfo::legalizeFSQRTF32(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  Register Dst = MI.getOperand(0).getReg();
  Register X = MI.getOperand(1).getReg();
  const unsigned Flags = MI.getFlags();
  const LLT S1 = LLT::scalar(1);
  const LLT F32 = LLT::scalar(32);
  // Same 32-bit scalar type; named separately for the integer bit-pattern
  // operations below.
  const LLT I32 = LLT::scalar(32);

  // With approximate-function semantics the raw hardware sqrt suffices.
  if (allowApproxFunc(MF, Flags)) {
    B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({Dst}))
      .addUse(X)
      .setMIFlags(Flags);
    MI.eraseFromParent();
    return true;
  }

  // Inputs below 2^-96 are scaled up by 2^32 (note operand order: the
  // compare is threshold > X); the result is rescaled by 2^-16 at the end
  // (sqrt halves the exponent).
  auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);
  auto NeedScale = B.buildFCmp(CmpInst::FCMP_OGT, S1, ScaleThreshold, X, Flags);
  auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
  auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
  auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);

  Register SqrtS = MRI.createGenericVirtualRegister(F32);
  if (needsDenormHandlingF32(MF, X, Flags)) {
    // Start from the hardware sqrt, then test the two neighboring values
    // (obtained by stepping the raw bit pattern by -1/+1) and pick whichever
    // makes the FMA residual s*s - x come out on the correct side of zero.
    B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({SqrtS}))
      .addUse(SqrtX.getReg(0))
      .setMIFlags(Flags);

    auto NegOne = B.buildConstant(I32, -1);
    auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);

    // Residual for the next-lower candidate: x - down*s.
    auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
    auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);

    auto PosOne = B.buildConstant(I32, 1);
    auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);

    // Residual for the next-higher candidate: x - up*s.
    auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
    auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);

    auto Zero = B.buildFConstant(F32, 0.0f);
    auto SqrtVPLE0 = B.buildFCmp(CmpInst::FCMP_OLE, S1, SqrtVP, Zero, Flags);

    // If the down-residual is <= 0 the estimate was too high: step down.
    SqrtS =
        B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);

    // If the up-residual is > 0 the estimate was too low: step up.
    auto SqrtVPVSGT0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, SqrtVS, Zero, Flags);
    SqrtS =
        B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);
  } else {
    // Denormals are flushed anyway: start from s = x * rsq(x) and refine
    // with an FMA iteration (h = rsq/2 tracks the half-reciprocal).
    auto SqrtR =
        B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
    B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);

    auto Half = B.buildFConstant(F32, 0.5f);
    auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
    auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
    // e = 0.5 - h*s, then fold the correction into both h and s.
    auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
    SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
    SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
    auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
    // Final residual step: s += (x - s*s) * h.
    auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
    SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);
  }

  // Undo the input scaling: 2^-16 on the result matches 2^32 on the input.
  auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);

  auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);

  SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);

  // sqrt(+/-0) = +/-0 and sqrt(+inf) = +inf: pass the (scaled) input through.
  auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
  B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);

  MI.eraseFromParent();
  return true;
}
52725f757f3fSDimitry Andric 
bool AMDGPULegalizerInfo::legalizeFSQRTF64(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           MachineIRBuilder &B) const {
  // For double type, the SQRT and RSQ instructions don't have required
  // precision, we apply Goldschmidt's algorithm to improve the result:
  //
  //   y0 = rsq(x)
  //   g0 = x * y0
  //   h0 = 0.5 * y0
  //
  //   r0 = 0.5 - h0 * g0
  //   g1 = g0 * r0 + g0
  //   h1 = h0 * r0 + h0
  //
  //   r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
  //   g2 = g1 * r1 + g1     g2 = d0 * h1 + g1
  //   h2 = h1 * r1 + h1
  //
  //   r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
  //   g3 = g2 * r2 + g2     g3 = d1 * h1 + g2
  //
  //   sqrt(x) = g3

  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT F64 = LLT::scalar(64);

  Register Dst = MI.getOperand(0).getReg();
  assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");

  Register X = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();

  // Inputs below 2^-767 are scaled up to keep the iteration out of the
  // subnormal range.
  auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);

  auto ZeroInt = B.buildConstant(S32, 0);
  auto Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant);

  // Scale up input if it is too small.
  auto ScaleUpFactor = B.buildConstant(S32, 256);
  auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
  auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags);

  auto SqrtY =
      B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX.getReg(0));

  // h0 = 0.5 * y0, g0 = x * y0.
  auto Half = B.buildFConstant(F64, 0.5);
  auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
  auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);

  // r0 = 0.5 - h0 * g0.
  auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
  auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);

  // g1 = g0 * r0 + g0, h1 = h0 * r0 + h0.
  auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
  auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);

  // d0 = x - g1 * g1.
  auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
  auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);

  // g2 = d0 * h1 + g1.
  auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);

  // d1 = x - g2 * g2.
  auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
  auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);

  // g3 = d1 * h1 + g2.
  auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);

  // Scale down the result.
  // 2^-128 on the result corresponds to the 2^256 input scaling, since
  // sqrt halves the exponent.
  auto ScaleDownFactor = B.buildConstant(S32, -128);
  auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
  SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags);

  // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
  // with finite only or nsz because rsq(+/-0) = +/-inf

  // TODO: Check for DAZ and expand to subnormals
  auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);

  // If x is +INF, +0, or -0, use its original value
  B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);

  MI.eraseFromParent();
  return true;
}
535606c3fb27SDimitry Andric 
legalizeFSQRT(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const53575f757f3fSDimitry Andric bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI,
53585f757f3fSDimitry Andric                                         MachineRegisterInfo &MRI,
53595f757f3fSDimitry Andric                                         MachineIRBuilder &B) const {
53605f757f3fSDimitry Andric   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
53615f757f3fSDimitry Andric   if (Ty == LLT::scalar(32))
53625f757f3fSDimitry Andric     return legalizeFSQRTF32(MI, MRI, B);
53635f757f3fSDimitry Andric   if (Ty == LLT::scalar(64))
53645f757f3fSDimitry Andric     return legalizeFSQRTF64(MI, MRI, B);
53655f757f3fSDimitry Andric   if (Ty == LLT::scalar(16))
53665f757f3fSDimitry Andric     return legalizeFSQRTF16(MI, MRI, B);
53675f757f3fSDimitry Andric   return false;
53685f757f3fSDimitry Andric }
53695f757f3fSDimitry Andric 
5370e8d8bef9SDimitry Andric // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
5371e8d8bef9SDimitry Andric // FIXME: Why do we handle this one but not other removed instructions?
5372e8d8bef9SDimitry Andric //
5373e8d8bef9SDimitry Andric // Reciprocal square root.  The clamp prevents infinite results, clamping
5374e8d8bef9SDimitry Andric // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
5375e8d8bef9SDimitry Andric // +-max_float.
legalizeRsqClampIntrinsic(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const5376e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
5377e8d8bef9SDimitry Andric                                                     MachineRegisterInfo &MRI,
5378e8d8bef9SDimitry Andric                                                     MachineIRBuilder &B) const {
5379e8d8bef9SDimitry Andric   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
5380e8d8bef9SDimitry Andric     return true;
5381e8d8bef9SDimitry Andric 
5382e8d8bef9SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
5383e8d8bef9SDimitry Andric   Register Src = MI.getOperand(2).getReg();
5384e8d8bef9SDimitry Andric   auto Flags = MI.getFlags();
5385e8d8bef9SDimitry Andric 
5386e8d8bef9SDimitry Andric   LLT Ty = MRI.getType(Dst);
5387e8d8bef9SDimitry Andric 
5388e8d8bef9SDimitry Andric   const fltSemantics *FltSemantics;
5389e8d8bef9SDimitry Andric   if (Ty == LLT::scalar(32))
5390e8d8bef9SDimitry Andric     FltSemantics = &APFloat::IEEEsingle();
5391e8d8bef9SDimitry Andric   else if (Ty == LLT::scalar(64))
5392e8d8bef9SDimitry Andric     FltSemantics = &APFloat::IEEEdouble();
5393e8d8bef9SDimitry Andric   else
5394e8d8bef9SDimitry Andric     return false;
5395e8d8bef9SDimitry Andric 
53965f757f3fSDimitry Andric   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
5397e8d8bef9SDimitry Andric                  .addUse(Src)
5398e8d8bef9SDimitry Andric                  .setMIFlags(Flags);
5399e8d8bef9SDimitry Andric 
5400e8d8bef9SDimitry Andric   // We don't need to concern ourselves with the snan handling difference, since
5401e8d8bef9SDimitry Andric   // the rsq quieted (or not) so use the one which will directly select.
5402e8d8bef9SDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5403e8d8bef9SDimitry Andric   const bool UseIEEE = MFI->getMode().IEEE;
5404e8d8bef9SDimitry Andric 
5405e8d8bef9SDimitry Andric   auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
5406e8d8bef9SDimitry Andric   auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
5407e8d8bef9SDimitry Andric                             B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
5408e8d8bef9SDimitry Andric 
5409e8d8bef9SDimitry Andric   auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
5410e8d8bef9SDimitry Andric 
5411e8d8bef9SDimitry Andric   if (UseIEEE)
5412e8d8bef9SDimitry Andric     B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
5413e8d8bef9SDimitry Andric   else
5414e8d8bef9SDimitry Andric     B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
5415e8d8bef9SDimitry Andric   MI.eraseFromParent();
5416e8d8bef9SDimitry Andric   return true;
5417e8d8bef9SDimitry Andric }
5418e8d8bef9SDimitry Andric 
5419*0fca6ea1SDimitry Andric // TODO: Fix pointer type handling
legalizeLaneOp(LegalizerHelper & Helper,MachineInstr & MI,Intrinsic::ID IID) const5420*0fca6ea1SDimitry Andric bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
5421e8d8bef9SDimitry Andric                                          MachineInstr &MI,
5422e8d8bef9SDimitry Andric                                          Intrinsic::ID IID) const {
5423e8d8bef9SDimitry Andric 
5424*0fca6ea1SDimitry Andric   MachineIRBuilder &B = Helper.MIRBuilder;
5425*0fca6ea1SDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
5426e8d8bef9SDimitry Andric 
5427*0fca6ea1SDimitry Andric   bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
5428*0fca6ea1SDimitry Andric                       IID == Intrinsic::amdgcn_permlanex16;
5429e8d8bef9SDimitry Andric 
5430*0fca6ea1SDimitry Andric   auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
5431*0fca6ea1SDimitry Andric                                       Register Src2, LLT VT) -> Register {
5432*0fca6ea1SDimitry Andric     auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
5433*0fca6ea1SDimitry Andric     switch (IID) {
5434*0fca6ea1SDimitry Andric     case Intrinsic::amdgcn_readfirstlane:
5435*0fca6ea1SDimitry Andric     case Intrinsic::amdgcn_permlane64:
5436*0fca6ea1SDimitry Andric       return LaneOp.getReg(0);
5437*0fca6ea1SDimitry Andric     case Intrinsic::amdgcn_readlane:
5438*0fca6ea1SDimitry Andric       return LaneOp.addUse(Src1).getReg(0);
5439*0fca6ea1SDimitry Andric     case Intrinsic::amdgcn_writelane:
5440*0fca6ea1SDimitry Andric       return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
5441*0fca6ea1SDimitry Andric     case Intrinsic::amdgcn_permlane16:
5442*0fca6ea1SDimitry Andric     case Intrinsic::amdgcn_permlanex16: {
5443*0fca6ea1SDimitry Andric       Register Src3 = MI.getOperand(5).getReg();
5444*0fca6ea1SDimitry Andric       Register Src4 = MI.getOperand(6).getImm();
5445*0fca6ea1SDimitry Andric       Register Src5 = MI.getOperand(7).getImm();
5446*0fca6ea1SDimitry Andric       return LaneOp.addUse(Src1)
5447*0fca6ea1SDimitry Andric           .addUse(Src2)
5448*0fca6ea1SDimitry Andric           .addUse(Src3)
5449*0fca6ea1SDimitry Andric           .addImm(Src4)
5450*0fca6ea1SDimitry Andric           .addImm(Src5)
5451*0fca6ea1SDimitry Andric           .getReg(0);
5452*0fca6ea1SDimitry Andric     }
5453*0fca6ea1SDimitry Andric     default:
5454*0fca6ea1SDimitry Andric       llvm_unreachable("unhandled lane op");
5455*0fca6ea1SDimitry Andric     }
5456*0fca6ea1SDimitry Andric   };
5457*0fca6ea1SDimitry Andric 
5458*0fca6ea1SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
5459*0fca6ea1SDimitry Andric   Register Src0 = MI.getOperand(2).getReg();
5460*0fca6ea1SDimitry Andric   Register Src1, Src2;
5461*0fca6ea1SDimitry Andric   if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
5462*0fca6ea1SDimitry Andric       IsPermLane16) {
5463*0fca6ea1SDimitry Andric     Src1 = MI.getOperand(3).getReg();
5464*0fca6ea1SDimitry Andric     if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
5465*0fca6ea1SDimitry Andric       Src2 = MI.getOperand(4).getReg();
5466*0fca6ea1SDimitry Andric     }
5467*0fca6ea1SDimitry Andric   }
5468*0fca6ea1SDimitry Andric 
5469*0fca6ea1SDimitry Andric   LLT Ty = MRI.getType(DstReg);
5470*0fca6ea1SDimitry Andric   unsigned Size = Ty.getSizeInBits();
5471*0fca6ea1SDimitry Andric 
5472*0fca6ea1SDimitry Andric   if (Size == 32) {
5473*0fca6ea1SDimitry Andric     // Already legal
5474*0fca6ea1SDimitry Andric     return true;
5475*0fca6ea1SDimitry Andric   }
5476*0fca6ea1SDimitry Andric 
5477*0fca6ea1SDimitry Andric   if (Size < 32) {
5478*0fca6ea1SDimitry Andric     Src0 = B.buildAnyExt(S32, Src0).getReg(0);
5479*0fca6ea1SDimitry Andric 
5480*0fca6ea1SDimitry Andric     if (IsPermLane16)
5481*0fca6ea1SDimitry Andric       Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0);
5482*0fca6ea1SDimitry Andric 
5483*0fca6ea1SDimitry Andric     if (IID == Intrinsic::amdgcn_writelane)
5484*0fca6ea1SDimitry Andric       Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0);
5485*0fca6ea1SDimitry Andric 
5486*0fca6ea1SDimitry Andric     Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
5487*0fca6ea1SDimitry Andric     B.buildTrunc(DstReg, LaneOpDst);
5488*0fca6ea1SDimitry Andric     MI.eraseFromParent();
5489*0fca6ea1SDimitry Andric     return true;
5490*0fca6ea1SDimitry Andric   }
5491*0fca6ea1SDimitry Andric 
5492*0fca6ea1SDimitry Andric   if (Size % 32 != 0)
5493*0fca6ea1SDimitry Andric     return false;
5494*0fca6ea1SDimitry Andric 
5495*0fca6ea1SDimitry Andric   LLT PartialResTy = S32;
5496*0fca6ea1SDimitry Andric   if (Ty.isVector()) {
5497*0fca6ea1SDimitry Andric     LLT EltTy = Ty.getElementType();
5498*0fca6ea1SDimitry Andric     switch (EltTy.getSizeInBits()) {
5499*0fca6ea1SDimitry Andric     case 16:
5500*0fca6ea1SDimitry Andric       PartialResTy = Ty.changeElementCount(ElementCount::getFixed(2));
5501*0fca6ea1SDimitry Andric       break;
5502*0fca6ea1SDimitry Andric     case 32:
5503*0fca6ea1SDimitry Andric       PartialResTy = EltTy;
5504*0fca6ea1SDimitry Andric       break;
5505*0fca6ea1SDimitry Andric     default:
5506*0fca6ea1SDimitry Andric       // Handle all other cases via S32 pieces;
5507*0fca6ea1SDimitry Andric       break;
5508*0fca6ea1SDimitry Andric     }
5509*0fca6ea1SDimitry Andric   }
5510*0fca6ea1SDimitry Andric 
5511*0fca6ea1SDimitry Andric   SmallVector<Register, 2> PartialRes;
5512*0fca6ea1SDimitry Andric   unsigned NumParts = Size / 32;
5513*0fca6ea1SDimitry Andric   MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
5514*0fca6ea1SDimitry Andric   MachineInstrBuilder Src1Parts, Src2Parts;
5515*0fca6ea1SDimitry Andric 
5516*0fca6ea1SDimitry Andric   if (IsPermLane16)
5517*0fca6ea1SDimitry Andric     Src1Parts = B.buildUnmerge(PartialResTy, Src1);
5518*0fca6ea1SDimitry Andric 
5519*0fca6ea1SDimitry Andric   if (IID == Intrinsic::amdgcn_writelane)
5520*0fca6ea1SDimitry Andric     Src2Parts = B.buildUnmerge(PartialResTy, Src2);
5521*0fca6ea1SDimitry Andric 
5522*0fca6ea1SDimitry Andric   for (unsigned i = 0; i < NumParts; ++i) {
5523*0fca6ea1SDimitry Andric     Src0 = Src0Parts.getReg(i);
5524*0fca6ea1SDimitry Andric 
5525*0fca6ea1SDimitry Andric     if (IsPermLane16)
5526*0fca6ea1SDimitry Andric       Src1 = Src1Parts.getReg(i);
5527*0fca6ea1SDimitry Andric 
5528*0fca6ea1SDimitry Andric     if (IID == Intrinsic::amdgcn_writelane)
5529*0fca6ea1SDimitry Andric       Src2 = Src2Parts.getReg(i);
5530*0fca6ea1SDimitry Andric 
5531*0fca6ea1SDimitry Andric     PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
5532*0fca6ea1SDimitry Andric   }
5533*0fca6ea1SDimitry Andric 
5534*0fca6ea1SDimitry Andric   B.buildMergeLikeInstr(DstReg, PartialRes);
5535*0fca6ea1SDimitry Andric   MI.eraseFromParent();
5536e8d8bef9SDimitry Andric   return true;
5537e8d8bef9SDimitry Andric }
5538e8d8bef9SDimitry Andric 
getImplicitArgPtr(Register DstReg,MachineRegisterInfo & MRI,MachineIRBuilder & B) const5539e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
5540e8d8bef9SDimitry Andric                                             MachineRegisterInfo &MRI,
5541e8d8bef9SDimitry Andric                                             MachineIRBuilder &B) const {
5542e8d8bef9SDimitry Andric   uint64_t Offset =
5543e8d8bef9SDimitry Andric     ST.getTargetLowering()->getImplicitParameterOffset(
5544e8d8bef9SDimitry Andric       B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
5545e8d8bef9SDimitry Andric   LLT DstTy = MRI.getType(DstReg);
5546e8d8bef9SDimitry Andric   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
5547e8d8bef9SDimitry Andric 
5548e8d8bef9SDimitry Andric   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
5549e8d8bef9SDimitry Andric   if (!loadInputValue(KernargPtrReg, B,
5550e8d8bef9SDimitry Andric                       AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
5551e8d8bef9SDimitry Andric     return false;
5552e8d8bef9SDimitry Andric 
5553e8d8bef9SDimitry Andric   // FIXME: This should be nuw
5554e8d8bef9SDimitry Andric   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
5555e8d8bef9SDimitry Andric   return true;
5556e8d8bef9SDimitry Andric }
5557e8d8bef9SDimitry Andric 
555806c3fb27SDimitry Andric /// To create a buffer resource from a 64-bit pointer, mask off the upper 32
555906c3fb27SDimitry Andric /// bits of the pointer and replace them with the stride argument, then
556006c3fb27SDimitry Andric /// merge_values everything together. In the common case of a raw buffer (the
556106c3fb27SDimitry Andric /// stride component is 0), we can just AND off the upper half.
legalizePointerAsRsrcIntrin(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const556206c3fb27SDimitry Andric bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin(
556306c3fb27SDimitry Andric     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
556406c3fb27SDimitry Andric   Register Result = MI.getOperand(0).getReg();
556506c3fb27SDimitry Andric   Register Pointer = MI.getOperand(2).getReg();
556606c3fb27SDimitry Andric   Register Stride = MI.getOperand(3).getReg();
556706c3fb27SDimitry Andric   Register NumRecords = MI.getOperand(4).getReg();
556806c3fb27SDimitry Andric   Register Flags = MI.getOperand(5).getReg();
556906c3fb27SDimitry Andric 
557006c3fb27SDimitry Andric   LLT S32 = LLT::scalar(32);
557106c3fb27SDimitry Andric 
557206c3fb27SDimitry Andric   B.setInsertPt(B.getMBB(), ++B.getInsertPt());
557306c3fb27SDimitry Andric   auto Unmerge = B.buildUnmerge(S32, Pointer);
557406c3fb27SDimitry Andric   Register LowHalf = Unmerge.getReg(0);
557506c3fb27SDimitry Andric   Register HighHalf = Unmerge.getReg(1);
557606c3fb27SDimitry Andric 
557706c3fb27SDimitry Andric   auto AndMask = B.buildConstant(S32, 0x0000ffff);
557806c3fb27SDimitry Andric   auto Masked = B.buildAnd(S32, HighHalf, AndMask);
557906c3fb27SDimitry Andric 
558006c3fb27SDimitry Andric   MachineInstrBuilder NewHighHalf = Masked;
558106c3fb27SDimitry Andric   std::optional<ValueAndVReg> StrideConst =
558206c3fb27SDimitry Andric       getIConstantVRegValWithLookThrough(Stride, MRI);
558306c3fb27SDimitry Andric   if (!StrideConst || !StrideConst->Value.isZero()) {
558406c3fb27SDimitry Andric     MachineInstrBuilder ShiftedStride;
558506c3fb27SDimitry Andric     if (StrideConst) {
558606c3fb27SDimitry Andric       uint32_t StrideVal = StrideConst->Value.getZExtValue();
558706c3fb27SDimitry Andric       uint32_t ShiftedStrideVal = StrideVal << 16;
558806c3fb27SDimitry Andric       ShiftedStride = B.buildConstant(S32, ShiftedStrideVal);
558906c3fb27SDimitry Andric     } else {
559006c3fb27SDimitry Andric       auto ExtStride = B.buildAnyExt(S32, Stride);
559106c3fb27SDimitry Andric       auto ShiftConst = B.buildConstant(S32, 16);
559206c3fb27SDimitry Andric       ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
559306c3fb27SDimitry Andric     }
559406c3fb27SDimitry Andric     NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
559506c3fb27SDimitry Andric   }
559606c3fb27SDimitry Andric   Register NewHighHalfReg = NewHighHalf.getReg(0);
559706c3fb27SDimitry Andric   B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
559806c3fb27SDimitry Andric   MI.eraseFromParent();
559906c3fb27SDimitry Andric   return true;
560006c3fb27SDimitry Andric }
560106c3fb27SDimitry Andric 
legalizeImplicitArgPtr(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const56020b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
56030b57cec5SDimitry Andric                                                  MachineRegisterInfo &MRI,
56040b57cec5SDimitry Andric                                                  MachineIRBuilder &B) const {
56050b57cec5SDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
56060b57cec5SDimitry Andric   if (!MFI->isEntryFunction()) {
56070b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
56080b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
56090b57cec5SDimitry Andric   }
56100b57cec5SDimitry Andric 
56110b57cec5SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
5612e8d8bef9SDimitry Andric   if (!getImplicitArgPtr(DstReg, MRI, B))
56130b57cec5SDimitry Andric     return false;
56140b57cec5SDimitry Andric 
56150b57cec5SDimitry Andric   MI.eraseFromParent();
56160b57cec5SDimitry Andric   return true;
56170b57cec5SDimitry Andric }
56180b57cec5SDimitry Andric 
/// Materialize the LDS kernel id into \p DstReg as a constant when it is
/// known from function metadata.
bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  Function &F = B.getMF().getFunction();
  // Look the id up in the function's metadata; absent metadata means the id
  // is not known at compile time.
  std::optional<uint32_t> KnownSize =
      AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
  if (KnownSize.has_value())
    B.buildConstant(DstReg, *KnownSize);
  // NOTE(review): false is returned even when the constant was emitted above,
  // so the caller (legalizeLDSKernelId) never erases the intrinsic in that
  // case — confirm whether this should be `return KnownSize.has_value();`.
  return false;
}
5629fcaf7f86SDimitry Andric 
legalizeLDSKernelId(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const5630fcaf7f86SDimitry Andric bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI,
5631fcaf7f86SDimitry Andric                                               MachineRegisterInfo &MRI,
5632fcaf7f86SDimitry Andric                                               MachineIRBuilder &B) const {
5633fcaf7f86SDimitry Andric 
5634fcaf7f86SDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5635fcaf7f86SDimitry Andric   if (!MFI->isEntryFunction()) {
5636fcaf7f86SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
5637fcaf7f86SDimitry Andric                                       AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
5638fcaf7f86SDimitry Andric   }
5639fcaf7f86SDimitry Andric 
5640fcaf7f86SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
5641fcaf7f86SDimitry Andric   if (!getLDSKernelId(DstReg, MRI, B))
5642fcaf7f86SDimitry Andric     return false;
5643fcaf7f86SDimitry Andric 
5644fcaf7f86SDimitry Andric   MI.eraseFromParent();
5645fcaf7f86SDimitry Andric   return true;
5646fcaf7f86SDimitry Andric }
5647fcaf7f86SDimitry Andric 
legalizeIsAddrSpace(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B,unsigned AddrSpace) const56488bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
56498bcb0991SDimitry Andric                                               MachineRegisterInfo &MRI,
56508bcb0991SDimitry Andric                                               MachineIRBuilder &B,
56518bcb0991SDimitry Andric                                               unsigned AddrSpace) const {
56528bcb0991SDimitry Andric   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
5653e8d8bef9SDimitry Andric   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
5654e8d8bef9SDimitry Andric   Register Hi32 = Unmerge.getReg(1);
5655e8d8bef9SDimitry Andric 
56568bcb0991SDimitry Andric   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
56578bcb0991SDimitry Andric   MI.eraseFromParent();
56588bcb0991SDimitry Andric   return true;
56598bcb0991SDimitry Andric }
56608bcb0991SDimitry Andric 
56615ffd83dbSDimitry Andric // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
56625ffd83dbSDimitry Andric // offset (the offset that is included in bounds checking and swizzling, to be
56635ffd83dbSDimitry Andric // split between the instruction's voffset and immoffset fields) and soffset
56645ffd83dbSDimitry Andric // (the offset that is excluded from bounds checking and swizzling, to go in
56655ffd83dbSDimitry Andric // the instruction's soffset field).  This function takes the first kind of
56665ffd83dbSDimitry Andric // offset and figures out how to split it between voffset and immoffset.
5667fe6060f1SDimitry Andric std::pair<Register, unsigned>
splitBufferOffsets(MachineIRBuilder & B,Register OrigOffset) const56685ffd83dbSDimitry Andric AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
56695ffd83dbSDimitry Andric                                         Register OrigOffset) const {
56705f757f3fSDimitry Andric   const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
56715ffd83dbSDimitry Andric   Register BaseReg;
5672fe6060f1SDimitry Andric   unsigned ImmOffset;
56735ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
5674fe6060f1SDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
56755ffd83dbSDimitry Andric 
5676fe6060f1SDimitry Andric   std::tie(BaseReg, ImmOffset) =
5677fe6060f1SDimitry Andric       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
56785ffd83dbSDimitry Andric 
5679fe6060f1SDimitry Andric   // If BaseReg is a pointer, convert it to int.
5680fe6060f1SDimitry Andric   if (MRI.getType(BaseReg).isPointer())
5681fe6060f1SDimitry Andric     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
56825ffd83dbSDimitry Andric 
568306c3fb27SDimitry Andric   // If the immediate value is too big for the immoffset field, put only bits
568406c3fb27SDimitry Andric   // that would normally fit in the immoffset field. The remaining value that
568506c3fb27SDimitry Andric   // is copied/added for the voffset field is a large power of 2, and it
568606c3fb27SDimitry Andric   // stands more chance of being CSEd with the copy/add for another similar
568706c3fb27SDimitry Andric   // load/store.
568806c3fb27SDimitry Andric   // However, do not do that rounding down if that is a negative
568906c3fb27SDimitry Andric   // number, as it appears to be illegal to have a negative offset in the
569006c3fb27SDimitry Andric   // vgpr, even if adding the immediate offset makes it positive.
56915ffd83dbSDimitry Andric   unsigned Overflow = ImmOffset & ~MaxImm;
56925ffd83dbSDimitry Andric   ImmOffset -= Overflow;
56935ffd83dbSDimitry Andric   if ((int32_t)Overflow < 0) {
56945ffd83dbSDimitry Andric     Overflow += ImmOffset;
56955ffd83dbSDimitry Andric     ImmOffset = 0;
56965ffd83dbSDimitry Andric   }
56975ffd83dbSDimitry Andric 
56985ffd83dbSDimitry Andric   if (Overflow != 0) {
56995ffd83dbSDimitry Andric     if (!BaseReg) {
57005ffd83dbSDimitry Andric       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
57015ffd83dbSDimitry Andric     } else {
57025ffd83dbSDimitry Andric       auto OverflowVal = B.buildConstant(S32, Overflow);
57035ffd83dbSDimitry Andric       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
57045ffd83dbSDimitry Andric     }
57055ffd83dbSDimitry Andric   }
57065ffd83dbSDimitry Andric 
57075ffd83dbSDimitry Andric   if (!BaseReg)
57085ffd83dbSDimitry Andric     BaseReg = B.buildConstant(S32, 0).getReg(0);
57095ffd83dbSDimitry Andric 
5710bdd1243dSDimitry Andric   return std::pair(BaseReg, ImmOffset);
5711fe6060f1SDimitry Andric }
5712fe6060f1SDimitry Andric 
/// Handle register layout difference for f16 images for some subtargets.
///
/// \p Reg is a vector of s16 elements (asserted below). Returns a register
/// holding the same data repacked into the layout the subtarget's memory
/// instructions expect; may return \p Reg unchanged.
Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
                                             MachineRegisterInfo &MRI,
                                             Register Reg,
                                             bool ImageStore) const {
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  LLT StoreVT = MRI.getType(Reg);
  assert(StoreVT.isVector() && StoreVT.getElementType() == S16);

  // Unpacked D16 subtargets: each 16-bit element occupies its own 32-bit
  // register, so any-extend every element and rebuild as <N x s32>.
  if (ST.hasUnpackedD16VMem()) {
    auto Unmerge = B.buildUnmerge(S16, Reg);

    SmallVector<Register, 4> WideRegs;
    for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
      WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));

    int NumElts = StoreVT.getNumElements();

    return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
        .getReg(0);
  }

  // Subtargets with the image-store D16 bug: pad the packed data out with
  // undef to the next even number of 32-bit registers.
  if (ImageStore && ST.hasImageStoreD16Bug()) {
    // <2 x s16>: bitcast to one s32 and pad with a second undef s32.
    if (StoreVT.getNumElements() == 2) {
      SmallVector<Register, 4> PackedRegs;
      Reg = B.buildBitcast(S32, Reg).getReg(0);
      PackedRegs.push_back(Reg);
      PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
      return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
          .getReg(0);
    }

    // <3 x s16>: pad with undef s16s to <6 x s16>, then view as <3 x s32>.
    if (StoreVT.getNumElements() == 3) {
      SmallVector<Register, 4> PackedRegs;
      auto Unmerge = B.buildUnmerge(S16, Reg);
      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
        PackedRegs.push_back(Unmerge.getReg(I));
      PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
      Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
      return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
    }

    // <4 x s16>: view as <2 x s32> and pad with undef s32s to <4 x s32>.
    if (StoreVT.getNumElements() == 4) {
      SmallVector<Register, 4> PackedRegs;
      Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
      auto Unmerge = B.buildUnmerge(S32, Reg);
      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
        PackedRegs.push_back(Unmerge.getReg(I));
      PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
      return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
          .getReg(0);
    }

    llvm_unreachable("invalid data type");
  }

  // <3 x s16> is not a legal store type; widen it to <4 x s16> with undef.
  if (StoreVT == LLT::fixed_vector(3, S16)) {
    Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
              .getReg(0);
  }
  return Reg;
}
5776e8d8bef9SDimitry Andric 
fixStoreSourceType(MachineIRBuilder & B,Register VData,bool IsFormat) const57775ffd83dbSDimitry Andric Register AMDGPULegalizerInfo::fixStoreSourceType(
57785ffd83dbSDimitry Andric   MachineIRBuilder &B, Register VData, bool IsFormat) const {
57795ffd83dbSDimitry Andric   MachineRegisterInfo *MRI = B.getMRI();
57805ffd83dbSDimitry Andric   LLT Ty = MRI->getType(VData);
57818bcb0991SDimitry Andric 
57828bcb0991SDimitry Andric   const LLT S16 = LLT::scalar(16);
57838bcb0991SDimitry Andric 
578406c3fb27SDimitry Andric   // Fixup buffer resources themselves needing to be v4i128.
578506c3fb27SDimitry Andric   if (hasBufferRsrcWorkaround(Ty))
578606c3fb27SDimitry Andric     return castBufferRsrcToV4I32(VData, B);
578706c3fb27SDimitry Andric 
57888bcb0991SDimitry Andric   // Fixup illegal register types for i8 stores.
57898bcb0991SDimitry Andric   if (Ty == LLT::scalar(8) || Ty == S16) {
57908bcb0991SDimitry Andric     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
57915ffd83dbSDimitry Andric     return AnyExt;
57928bcb0991SDimitry Andric   }
57938bcb0991SDimitry Andric 
57948bcb0991SDimitry Andric   if (Ty.isVector()) {
57958bcb0991SDimitry Andric     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
57968bcb0991SDimitry Andric       if (IsFormat)
57975ffd83dbSDimitry Andric         return handleD16VData(B, *MRI, VData);
57985ffd83dbSDimitry Andric     }
57995ffd83dbSDimitry Andric   }
58005ffd83dbSDimitry Andric 
58015ffd83dbSDimitry Andric   return VData;
58025ffd83dbSDimitry Andric }
58035ffd83dbSDimitry Andric 
/// Lower raw/struct buffer-store intrinsics (plain, format, and typed
/// variants) into the corresponding G_AMDGPU_BUFFER_STORE* /
/// G_AMDGPU_TBUFFER_STORE* pseudo-instructions.
///
/// Intrinsic operand layout consumed here: (0) intrinsic id implicit via MI,
/// (1) vdata, (2) rsrc, [(3) vindex for struct variants], then voffset,
/// soffset, [format for typed variants], and auxiliary data.
bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              bool IsTyped,
                                              bool IsFormat) const {
  Register VData = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(VData);
  LLT EltTy = Ty.getScalarType();
  // D16 only applies to the format variants with 16-bit elements.
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const LLT S32 = LLT::scalar(32);

  // Legalize the data operand's register type, and the rsrc operand if the
  // buffer-resource workaround applies.
  VData = fixStoreSourceType(B, VData, IsFormat);
  castBufferRsrcArgToV4I32(MI, B, 2);
  Register RSrc = MI.getOperand(2).getReg();

  MachineMemOperand *MMO = *MI.memoperands_begin();
  const int MemSize = MMO->getSize().getValue();

  unsigned ImmOffset;

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(3).getReg();
    OpOffset = 1;
  } else {
    // Raw variants still need a vindex operand on the pseudo; use 0.
    VIndex = B.buildConstant(S32, 0).getReg(0);
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();

  // Split the combined offset between the voffset register and the
  // immediate-offset field.
  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);

  // Select the pseudo opcode: typed and format variants are distinguished by
  // D16; plain stores are distinguished by memory access size.
  unsigned Opc;
  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
  } else {
    switch (MemSize) {
    case 1:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
      break;
    case 2:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
      break;
    }
  }

  auto MIB = B.buildInstr(Opc)
    .addUse(VData)              // vdata
    .addUse(RSrc)               // rsrc
    .addUse(VIndex)             // vindex
    .addUse(VOffset)            // voffset
    .addUse(SOffset)            // soffset
    .addImm(ImmOffset);         // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}
58908bcb0991SDimitry Andric 
buildBufferLoad(unsigned Opc,Register LoadDstReg,Register RSrc,Register VIndex,Register VOffset,Register SOffset,unsigned ImmOffset,unsigned Format,unsigned AuxiliaryData,MachineMemOperand * MMO,bool IsTyped,bool HasVIndex,MachineIRBuilder & B)5891bdd1243dSDimitry Andric static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
5892bdd1243dSDimitry Andric                             Register VIndex, Register VOffset, Register SOffset,
5893bdd1243dSDimitry Andric                             unsigned ImmOffset, unsigned Format,
5894bdd1243dSDimitry Andric                             unsigned AuxiliaryData, MachineMemOperand *MMO,
5895bdd1243dSDimitry Andric                             bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
5896bdd1243dSDimitry Andric   auto MIB = B.buildInstr(Opc)
5897bdd1243dSDimitry Andric                  .addDef(LoadDstReg) // vdata
5898bdd1243dSDimitry Andric                  .addUse(RSrc)       // rsrc
5899bdd1243dSDimitry Andric                  .addUse(VIndex)     // vindex
5900bdd1243dSDimitry Andric                  .addUse(VOffset)    // voffset
5901bdd1243dSDimitry Andric                  .addUse(SOffset)    // soffset
5902bdd1243dSDimitry Andric                  .addImm(ImmOffset); // offset(imm)
5903bdd1243dSDimitry Andric 
5904bdd1243dSDimitry Andric   if (IsTyped)
5905bdd1243dSDimitry Andric     MIB.addImm(Format);
5906bdd1243dSDimitry Andric 
5907bdd1243dSDimitry Andric   MIB.addImm(AuxiliaryData)       // cachepolicy, swizzled buffer(imm)
5908bdd1243dSDimitry Andric       .addImm(HasVIndex ? -1 : 0) // idxen(imm)
5909bdd1243dSDimitry Andric       .addMemOperand(MMO);
5910bdd1243dSDimitry Andric }
5911bdd1243dSDimitry Andric 
/// Legalize a raw/struct buffer load intrinsic into a generic
/// G_AMDGPU_BUFFER_LOAD* / G_AMDGPU_TBUFFER_LOAD* pseudo-instruction.
///
/// Decodes the intrinsic operand layout (optional TFE status def, rsrc,
/// optional vindex for struct variants, voffset, soffset, an optional format
/// immediate for typed variants, and the auxiliary/cachepolicy immediate),
/// selects the pseudo opcode from the memory access size and D16-ness, and
/// emits any repacking required for narrow, unpacked-D16, or TFE results.
///
/// \param IsFormat true for the *_format_* load intrinsics.
/// \param IsTyped  true for the typed (tbuffer) load intrinsics.
/// \returns true if the instruction was legalized; false for TFE combinations
///          that are not supported (typed or D16-format loads).
bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
                                             MachineRegisterInfo &MRI,
                                             MachineIRBuilder &B,
                                             bool IsFormat,
                                             bool IsTyped) const {
  // FIXME: Verifier should enforce 1 MMO for these intrinsics.
  MachineMemOperand *MMO = *MI.memoperands_begin();
  const LLT MemTy = MMO->getMemoryType();
  const LLT S32 = LLT::scalar(32);

  Register Dst = MI.getOperand(0).getReg();

  // A second explicit def means this is a TFE load that also produces a
  // status value; all subsequent operand indices shift by one (OpOffset).
  Register StatusDst;
  int OpOffset = 0;
  assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
  bool IsTFE = MI.getNumExplicitDefs() == 2;
  if (IsTFE) {
    StatusDst = MI.getOperand(1).getReg();
    ++OpOffset;
  }

  // Rewrite a p8 rsrc operand into <4 x s32> so the selection logic below
  // only ever sees the vector form.
  castBufferRsrcArgToV4I32(MI, B, 2 + OpOffset);
  Register RSrc = MI.getOperand(2 + OpOffset).getReg();

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
  Register VIndex;
  if (HasVIndex) {
    VIndex = MI.getOperand(3 + OpOffset).getReg();
    ++OpOffset;
  } else {
    // Raw variants have no vindex; materialize a zero for the pseudo.
    VIndex = B.buildConstant(S32, 0).getReg(0);
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
  unsigned ImmOffset;

  LLT Ty = MRI.getType(Dst);
  // Make loads of addrspace 8 pointers into 4xs32 loads here, so the rest of
  // the logic doesn't have to handle that case.
  if (hasBufferRsrcWorkaround(Ty)) {
    Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0);
    Dst = MI.getOperand(0).getReg();
  }
  LLT EltTy = Ty.getScalarType();
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const bool Unpacked = ST.hasUnpackedD16VMem();

  // Split the voffset into a register part and a fold-able immediate part.
  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);

  unsigned Opc;

  // TODO: Support TFE for typed and narrow loads.
  if (IsTyped) {
    if (IsTFE)
      return false;
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
  } else if (IsFormat) {
    if (IsD16) {
      if (IsTFE)
        return false;
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
    } else {
      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
    }
  } else {
    // Plain buffer loads: pick the opcode from the memory access width.
    switch (MemTy.getSizeInBits()) {
    case 8:
      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
      break;
    case 16:
      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
      break;
    default:
      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD;
      break;
    }
  }

  if (IsTFE) {
    // TFE loads produce the value dwords plus one trailing status dword in a
    // single vector; load it whole, then unmerge value and status.
    unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
    unsigned NumLoadDWords = NumValueDWords + 1;
    LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32);
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
    buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    if (MemTy.getSizeInBits() < 32) {
      // Sub-dword result: extract the 32-bit value dword, then truncate.
      Register ExtDst = B.getMRI()->createGenericVirtualRegister(S32);
      B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
      B.buildTrunc(Dst, ExtDst);
    } else if (NumValueDWords == 1) {
      B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
    } else {
      // Multi-dword result: unmerge into scratch dwords + status, then
      // re-merge only the value dwords into Dst.
      SmallVector<Register, 5> LoadElts;
      for (unsigned I = 0; I != NumValueDWords; ++I)
        LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
      LoadElts.push_back(StatusDst);
      B.buildUnmerge(LoadElts, LoadDstReg);
      LoadElts.truncate(NumValueDWords);
      B.buildMergeLikeInstr(Dst, LoadElts);
    }
  } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
             (IsD16 && !Ty.isVector())) {
    // Narrow scalar result: load into a full dword and truncate after the
    // load (insert point moved past the new load first).
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
    buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    B.buildTrunc(Dst, LoadDstReg);
  } else if (Unpacked && IsD16 && Ty.isVector()) {
    // Unpacked D16 subtargets return one 16-bit element per dword; widen the
    // load type, then repack the low halves into the requested vector.
    LLT UnpackedTy = Ty.changeElementSize(32);
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
    buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    // FIXME: G_TRUNC should work, but legalization currently fails
    auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
    SmallVector<Register, 4> Repack;
    for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
      Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
    B.buildMergeLikeInstr(Dst, Repack);
  } else {
    // Result type matches the load directly; no repacking needed.
    buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
                    AuxiliaryData, MMO, IsTyped, HasVIndex, B);
  }

  MI.eraseFromParent();
  return true;
}
60575ffd83dbSDimitry Andric 
/// Map a raw/struct buffer atomic intrinsic ID — including the pointer-rsrc
/// (_ptr_) variants — to the corresponding generic G_AMDGPU_BUFFER_ATOMIC_*
/// pseudo opcode. Asserts on any intrinsic that is not a buffer atomic.
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
  // Note: cond_sub_u32 has no _ptr_ variants here.
  case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
  default:
    llvm_unreachable("unhandled atomic opcode");
  }
}
61475ffd83dbSDimitry Andric 
/// Legalize a raw/struct buffer atomic intrinsic into the corresponding
/// G_AMDGPU_BUFFER_ATOMIC_* pseudo-instruction.
///
/// Decodes the intrinsic operand layout (vdata, optional compare value for
/// cmpswap, rsrc, optional vindex for struct variants, voffset, soffset, and
/// the auxiliary/cachepolicy immediate) and rebuilds the instruction with the
/// canonical pseudo operand order.
bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
                                               MachineIRBuilder &B,
                                               Intrinsic::ID IID) const {
  // cmpswap variants carry one extra source operand (the compare value),
  // which shifts all subsequent operand indices by one (OpOffset).
  const bool IsCmpSwap =
      IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;

  Register Dst = MI.getOperand(0).getReg();
  // Since we don't have 128-bit atomics, we don't need to handle the case of
  // p8 arguments to the atomic itself
  Register VData = MI.getOperand(2).getReg();

  Register CmpVal;
  int OpOffset = 0;

  if (IsCmpSwap) {
    CmpVal = MI.getOperand(3).getReg();
    ++OpOffset;
  }

  // Rewrite a p8 rsrc operand into <4 x s32> before reading it.
  castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset);
  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
  const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  if (HasVIndex) {
    VIndex = MI.getOperand(4 + OpOffset).getReg();
    ++OpOffset;
  } else {
    // Raw variants have no vindex; materialize a zero for the pseudo.
    VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
  }

  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();

  MachineMemOperand *MMO = *MI.memoperands_begin();

  // Split the voffset into a register part and a fold-able immediate part.
  unsigned ImmOffset;
  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);

  auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
      .addDef(Dst)
      .addUse(VData); // vdata

  if (IsCmpSwap)
    MIB.addReg(CmpVal);

  MIB.addUse(RSrc)               // rsrc
     .addUse(VIndex)             // vindex
     .addUse(VOffset)            // voffset
     .addUse(SOffset)            // soffset
     .addImm(ImmOffset)          // offset(imm)
     .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}
62125ffd83dbSDimitry Andric 
/// Pack the 16-bit image address operands of \p MI into dword-sized <2 x s16>
/// registers, appending the packed registers to \p PackedAddrs.
///
/// Gradient operands are packed when \p IsG16, coordinate operands when
/// \p IsA16. Operands that must remain 32-bit are bitcast to <2 x s16>
/// unchanged, except for a 16-bit bias argument in A16 mode, which is padded
/// with undef to occupy a full dword.
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
                                      SmallVectorImpl<Register> &PackedAddrs,
                                      unsigned ArgOffset,
                                      const AMDGPU::ImageDimIntrinsicInfo *Intr,
                                      bool IsA16, bool IsG16) {
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  auto EndIdx = Intr->VAddrEnd;

  for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
    MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
    if (!SrcOp.isReg())
      continue; // _L to _LZ may have eliminated this.

    Register AddrReg = SrcOp.getReg();

    // First branch: operands that are NOT being converted to 16-bit
    // (pre-gradient args, gradients without G16, coordinates without A16).
    if ((I < Intr->GradientStart) ||
        (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
        (I >= Intr->CoordStart && !IsA16)) {
      if ((I < Intr->GradientStart) && IsA16 &&
          (B.getMRI()->getType(AddrReg) == S16)) {
        assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
        // Special handling of bias when A16 is on. Bias is of type half but
        // occupies full 32-bit.
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
               "Bias needs to be converted to 16 bit in A16 mode");
        // Handle any gradient or coordinate operands that should not be packed
        AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
        PackedAddrs.push_back(AddrReg);
      }
    } else {
      // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
      // derivatives dx/dh and dx/dv are packed with undef.
      if (((I + 1) >= EndIdx) ||
          ((Intr->NumGradients / 2) % 2 == 1 &&
           (I == static_cast<unsigned>(Intr->GradientStart +
                                       (Intr->NumGradients / 2) - 1) ||
            I == static_cast<unsigned>(Intr->GradientStart +
                                       Intr->NumGradients - 1))) ||
          // Check for _L to _LZ optimization
          !MI.getOperand(ArgOffset + I + 1).isReg()) {
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        // Pack this operand with the next one into a single dword, and skip
        // the next operand in the outer loop.
        PackedAddrs.push_back(
            B.buildBuildVector(
                 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
                .getReg(0));
        ++I;
      }
    }
  }
}
62735ffd83dbSDimitry Andric 
62745ffd83dbSDimitry Andric /// Convert from separate vaddr components to a single vector address register,
62755ffd83dbSDimitry Andric /// and replace the remaining operands with $noreg.
convertImageAddrToPacked(MachineIRBuilder & B,MachineInstr & MI,int DimIdx,int NumVAddrs)62765ffd83dbSDimitry Andric static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
62775ffd83dbSDimitry Andric                                      int DimIdx, int NumVAddrs) {
62785ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
6279bdd1243dSDimitry Andric   (void)S32;
62805ffd83dbSDimitry Andric   SmallVector<Register, 8> AddrRegs;
62815ffd83dbSDimitry Andric   for (int I = 0; I != NumVAddrs; ++I) {
62825ffd83dbSDimitry Andric     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
62835ffd83dbSDimitry Andric     if (SrcOp.isReg()) {
62845ffd83dbSDimitry Andric       AddrRegs.push_back(SrcOp.getReg());
62855ffd83dbSDimitry Andric       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
62865ffd83dbSDimitry Andric     }
62875ffd83dbSDimitry Andric   }
62885ffd83dbSDimitry Andric 
62895ffd83dbSDimitry Andric   int NumAddrRegs = AddrRegs.size();
62905ffd83dbSDimitry Andric   if (NumAddrRegs != 1) {
6291fe6060f1SDimitry Andric     auto VAddr =
6292fe6060f1SDimitry Andric         B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
62935ffd83dbSDimitry Andric     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
62945ffd83dbSDimitry Andric   }
62955ffd83dbSDimitry Andric 
62965ffd83dbSDimitry Andric   for (int I = 1; I != NumVAddrs; ++I) {
62975ffd83dbSDimitry Andric     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
62985ffd83dbSDimitry Andric     if (SrcOp.isReg())
62995ffd83dbSDimitry Andric       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
63005ffd83dbSDimitry Andric   }
63015ffd83dbSDimitry Andric }
63025ffd83dbSDimitry Andric 
63035ffd83dbSDimitry Andric /// Rewrite image intrinsics to use register layouts expected by the subtarget.
63045ffd83dbSDimitry Andric ///
63055ffd83dbSDimitry Andric /// Depending on the subtarget, load/store with 16-bit element data need to be
63065ffd83dbSDimitry Andric /// rewritten to use the low half of 32-bit registers, or directly use a packed
63075ffd83dbSDimitry Andric /// layout. 16-bit addresses should also sometimes be packed into 32-bit
63085ffd83dbSDimitry Andric /// registers.
63095ffd83dbSDimitry Andric ///
63105ffd83dbSDimitry Andric /// We don't want to directly select image instructions just yet, but also want
/// to expose all register repacking to the legalizer/combiners. We also don't
631281ad6265SDimitry Andric /// want a selected instruction entering RegBankSelect. In order to avoid
63135ffd83dbSDimitry Andric /// defining a multitude of intermediate image instructions, directly hack on
6314349cc55cSDimitry Andric /// the intrinsic's arguments. In cases like a16 addresses, this requires
6315349cc55cSDimitry Andric /// padding now unnecessary arguments with $noreg.
legalizeImageIntrinsic(MachineInstr & MI,MachineIRBuilder & B,GISelChangeObserver & Observer,const AMDGPU::ImageDimIntrinsicInfo * Intr) const63165ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
6317e8d8bef9SDimitry Andric     MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
6318e8d8bef9SDimitry Andric     const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
63195ffd83dbSDimitry Andric 
6320bdd1243dSDimitry Andric   const MachineFunction &MF = *MI.getMF();
6321e8d8bef9SDimitry Andric   const unsigned NumDefs = MI.getNumExplicitDefs();
6322e8d8bef9SDimitry Andric   const unsigned ArgOffset = NumDefs + 1;
63235ffd83dbSDimitry Andric   bool IsTFE = NumDefs == 2;
63245ffd83dbSDimitry Andric   // We are only processing the operands of d16 image operations on subtargets
63255ffd83dbSDimitry Andric   // that use the unpacked register layout, or need to repack the TFE result.
63265ffd83dbSDimitry Andric 
63275ffd83dbSDimitry Andric   // TODO: Do we need to guard against already legalized intrinsics?
63285ffd83dbSDimitry Andric   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
6329e8d8bef9SDimitry Andric       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
63305ffd83dbSDimitry Andric 
63315ffd83dbSDimitry Andric   MachineRegisterInfo *MRI = B.getMRI();
63325ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
63335ffd83dbSDimitry Andric   const LLT S16 = LLT::scalar(16);
6334fe6060f1SDimitry Andric   const LLT V2S16 = LLT::fixed_vector(2, 16);
63355ffd83dbSDimitry Andric 
63365ffd83dbSDimitry Andric   unsigned DMask = 0;
6337*0fca6ea1SDimitry Andric   Register VData;
6338*0fca6ea1SDimitry Andric   LLT Ty;
6339*0fca6ea1SDimitry Andric 
6340*0fca6ea1SDimitry Andric   if (!BaseOpcode->NoReturn || BaseOpcode->Store) {
6341*0fca6ea1SDimitry Andric     VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
6342*0fca6ea1SDimitry Andric     Ty = MRI->getType(VData);
6343*0fca6ea1SDimitry Andric   }
63445ffd83dbSDimitry Andric 
63457a6dacacSDimitry Andric   const bool IsAtomicPacked16Bit =
63467a6dacacSDimitry Andric       (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
63477a6dacacSDimitry Andric        BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
63487a6dacacSDimitry Andric 
63495ffd83dbSDimitry Andric   // Check for 16 bit addresses and pack if true.
6350e8d8bef9SDimitry Andric   LLT GradTy =
6351e8d8bef9SDimitry Andric       MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
6352e8d8bef9SDimitry Andric   LLT AddrTy =
6353e8d8bef9SDimitry Andric       MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
635406c3fb27SDimitry Andric   const bool IsG16 =
635506c3fb27SDimitry Andric       ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
63565ffd83dbSDimitry Andric   const bool IsA16 = AddrTy == S16;
63577a6dacacSDimitry Andric   const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;
63585ffd83dbSDimitry Andric 
63595ffd83dbSDimitry Andric   int DMaskLanes = 0;
63605ffd83dbSDimitry Andric   if (!BaseOpcode->Atomic) {
6361e8d8bef9SDimitry Andric     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
63625ffd83dbSDimitry Andric     if (BaseOpcode->Gather4) {
63635ffd83dbSDimitry Andric       DMaskLanes = 4;
63645ffd83dbSDimitry Andric     } else if (DMask != 0) {
6365bdd1243dSDimitry Andric       DMaskLanes = llvm::popcount(DMask);
63665ffd83dbSDimitry Andric     } else if (!IsTFE && !BaseOpcode->Store) {
63675ffd83dbSDimitry Andric       // If dmask is 0, this is a no-op load. This can be eliminated.
63685ffd83dbSDimitry Andric       B.buildUndef(MI.getOperand(0));
63695ffd83dbSDimitry Andric       MI.eraseFromParent();
63705ffd83dbSDimitry Andric       return true;
63715ffd83dbSDimitry Andric     }
63725ffd83dbSDimitry Andric   }
63735ffd83dbSDimitry Andric 
63745ffd83dbSDimitry Andric   Observer.changingInstr(MI);
63755ffd83dbSDimitry Andric   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
63765ffd83dbSDimitry Andric 
637704eeddc0SDimitry Andric   const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
637804eeddc0SDimitry Andric                                      : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
637904eeddc0SDimitry Andric   const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
638004eeddc0SDimitry Andric                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
6381*0fca6ea1SDimitry Andric   unsigned NewOpcode = LoadOpcode;
6382*0fca6ea1SDimitry Andric   if (BaseOpcode->Store)
6383*0fca6ea1SDimitry Andric     NewOpcode = StoreOpcode;
6384*0fca6ea1SDimitry Andric   else if (BaseOpcode->NoReturn)
6385*0fca6ea1SDimitry Andric     NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;
63865ffd83dbSDimitry Andric 
63875ffd83dbSDimitry Andric   // Track that we legalized this
63885ffd83dbSDimitry Andric   MI.setDesc(B.getTII().get(NewOpcode));
63895ffd83dbSDimitry Andric 
63905ffd83dbSDimitry Andric   // Expecting to get an error flag since TFC is on - and dmask is 0 Force
63915ffd83dbSDimitry Andric   // dmask to be at least 1 otherwise the instruction will fail
63925ffd83dbSDimitry Andric   if (IsTFE && DMask == 0) {
63935ffd83dbSDimitry Andric     DMask = 0x1;
63945ffd83dbSDimitry Andric     DMaskLanes = 1;
6395e8d8bef9SDimitry Andric     MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
63965ffd83dbSDimitry Andric   }
63975ffd83dbSDimitry Andric 
63985ffd83dbSDimitry Andric   if (BaseOpcode->Atomic) {
63995ffd83dbSDimitry Andric     Register VData0 = MI.getOperand(2).getReg();
64005ffd83dbSDimitry Andric     LLT Ty = MRI->getType(VData0);
64015ffd83dbSDimitry Andric 
64025ffd83dbSDimitry Andric     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
64037a6dacacSDimitry Andric     if (Ty.isVector() && !IsAtomicPacked16Bit)
64045ffd83dbSDimitry Andric       return false;
64055ffd83dbSDimitry Andric 
64065ffd83dbSDimitry Andric     if (BaseOpcode->AtomicX2) {
64075ffd83dbSDimitry Andric       Register VData1 = MI.getOperand(3).getReg();
64085ffd83dbSDimitry Andric       // The two values are packed in one register.
6409fe6060f1SDimitry Andric       LLT PackedTy = LLT::fixed_vector(2, Ty);
64105ffd83dbSDimitry Andric       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
64115ffd83dbSDimitry Andric       MI.getOperand(2).setReg(Concat.getReg(0));
64125ffd83dbSDimitry Andric       MI.getOperand(3).setReg(AMDGPU::NoRegister);
64135ffd83dbSDimitry Andric     }
64145ffd83dbSDimitry Andric   }
64155ffd83dbSDimitry Andric 
6416e8d8bef9SDimitry Andric   unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
64175ffd83dbSDimitry Andric 
64185ffd83dbSDimitry Andric   // Rewrite the addressing register layout before doing anything else.
6419fe6060f1SDimitry Andric   if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
6420fe6060f1SDimitry Andric     // 16 bit gradients are supported, but are tied to the A16 control
6421fe6060f1SDimitry Andric     // so both gradients and addresses must be 16 bit
64225ffd83dbSDimitry Andric     return false;
6423fe6060f1SDimitry Andric   }
64245ffd83dbSDimitry Andric 
6425fe6060f1SDimitry Andric   if (IsA16 && !ST.hasA16()) {
6426fe6060f1SDimitry Andric     // A16 not supported
6427fe6060f1SDimitry Andric     return false;
6428fe6060f1SDimitry Andric   }
6429fe6060f1SDimitry Andric 
64305f757f3fSDimitry Andric   const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->Sampler);
643106c3fb27SDimitry Andric   const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
643206c3fb27SDimitry Andric 
6433fe6060f1SDimitry Andric   if (IsA16 || IsG16) {
64345f757f3fSDimitry Andric     // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the
64355f757f3fSDimitry Andric     // instructions expect VGPR_32
64365ffd83dbSDimitry Andric     SmallVector<Register, 4> PackedRegs;
64375ffd83dbSDimitry Andric 
64385f757f3fSDimitry Andric     packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, IsG16);
64395ffd83dbSDimitry Andric 
64405ffd83dbSDimitry Andric     // See also below in the non-a16 branch
6441bdd1243dSDimitry Andric     const bool UseNSA = ST.hasNSAEncoding() &&
6442bdd1243dSDimitry Andric                         PackedRegs.size() >= ST.getNSAThreshold(MF) &&
644306c3fb27SDimitry Andric                         (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
644406c3fb27SDimitry Andric     const bool UsePartialNSA =
644506c3fb27SDimitry Andric         UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;
64465ffd83dbSDimitry Andric 
644706c3fb27SDimitry Andric     if (UsePartialNSA) {
644806c3fb27SDimitry Andric       // Pack registers that would go over NSAMaxSize into last VAddr register
644906c3fb27SDimitry Andric       LLT PackedAddrTy =
645006c3fb27SDimitry Andric           LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16);
645106c3fb27SDimitry Andric       auto Concat = B.buildConcatVectors(
645206c3fb27SDimitry Andric           PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
645306c3fb27SDimitry Andric       PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
645406c3fb27SDimitry Andric       PackedRegs.resize(NSAMaxSize);
645506c3fb27SDimitry Andric     } else if (!UseNSA && PackedRegs.size() > 1) {
6456fe6060f1SDimitry Andric       LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
64575ffd83dbSDimitry Andric       auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
64585ffd83dbSDimitry Andric       PackedRegs[0] = Concat.getReg(0);
64595ffd83dbSDimitry Andric       PackedRegs.resize(1);
64605ffd83dbSDimitry Andric     }
64615ffd83dbSDimitry Andric 
6462e8d8bef9SDimitry Andric     const unsigned NumPacked = PackedRegs.size();
6463e8d8bef9SDimitry Andric     for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
6464e8d8bef9SDimitry Andric       MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
64655ffd83dbSDimitry Andric       if (!SrcOp.isReg()) {
64665ffd83dbSDimitry Andric         assert(SrcOp.isImm() && SrcOp.getImm() == 0);
64675ffd83dbSDimitry Andric         continue;
64685ffd83dbSDimitry Andric       }
64695ffd83dbSDimitry Andric 
64705ffd83dbSDimitry Andric       assert(SrcOp.getReg() != AMDGPU::NoRegister);
64715ffd83dbSDimitry Andric 
6472e8d8bef9SDimitry Andric       if (I - Intr->VAddrStart < NumPacked)
6473e8d8bef9SDimitry Andric         SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
64745ffd83dbSDimitry Andric       else
64755ffd83dbSDimitry Andric         SrcOp.setReg(AMDGPU::NoRegister);
64765ffd83dbSDimitry Andric     }
64775ffd83dbSDimitry Andric   } else {
64785ffd83dbSDimitry Andric     // If the register allocator cannot place the address registers contiguously
64795ffd83dbSDimitry Andric     // without introducing moves, then using the non-sequential address encoding
64805ffd83dbSDimitry Andric     // is always preferable, since it saves VALU instructions and is usually a
64815ffd83dbSDimitry Andric     // wash in terms of code size or even better.
64825ffd83dbSDimitry Andric     //
64835ffd83dbSDimitry Andric     // However, we currently have no way of hinting to the register allocator
64845ffd83dbSDimitry Andric     // that MIMG addresses should be placed contiguously when it is possible to
64855ffd83dbSDimitry Andric     // do so, so force non-NSA for the common 2-address case as a heuristic.
64865ffd83dbSDimitry Andric     //
64875ffd83dbSDimitry Andric     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
64885ffd83dbSDimitry Andric     // allocation when possible.
648981ad6265SDimitry Andric     //
64905f757f3fSDimitry Andric     // Partial NSA is allowed on GFX11+ where the final register is a contiguous
649106c3fb27SDimitry Andric     // set of the remaining addresses.
6492bdd1243dSDimitry Andric     const bool UseNSA = ST.hasNSAEncoding() &&
6493bdd1243dSDimitry Andric                         CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
649406c3fb27SDimitry Andric                         (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
649506c3fb27SDimitry Andric     const bool UsePartialNSA =
649606c3fb27SDimitry Andric         UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
64975ffd83dbSDimitry Andric 
649806c3fb27SDimitry Andric     if (UsePartialNSA) {
649906c3fb27SDimitry Andric       convertImageAddrToPacked(B, MI,
650006c3fb27SDimitry Andric                                ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
650106c3fb27SDimitry Andric                                Intr->NumVAddrs - NSAMaxSize + 1);
650206c3fb27SDimitry Andric     } else if (!UseNSA && Intr->NumVAddrs > 1) {
6503e8d8bef9SDimitry Andric       convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
6504e8d8bef9SDimitry Andric                                Intr->NumVAddrs);
65055ffd83dbSDimitry Andric     }
650606c3fb27SDimitry Andric   }
65075ffd83dbSDimitry Andric 
65085ffd83dbSDimitry Andric   int Flags = 0;
65095ffd83dbSDimitry Andric   if (IsA16)
65105ffd83dbSDimitry Andric     Flags |= 1;
65115ffd83dbSDimitry Andric   if (IsG16)
65125ffd83dbSDimitry Andric     Flags |= 2;
65135ffd83dbSDimitry Andric   MI.addOperand(MachineOperand::CreateImm(Flags));
65145ffd83dbSDimitry Andric 
6515*0fca6ea1SDimitry Andric   if (BaseOpcode->NoReturn) { // No TFE for stores?
65165ffd83dbSDimitry Andric     // TODO: Handle dmask trim
651704eeddc0SDimitry Andric     if (!Ty.isVector() || !IsD16)
65185ffd83dbSDimitry Andric       return true;
65195ffd83dbSDimitry Andric 
6520e8d8bef9SDimitry Andric     Register RepackedReg = handleD16VData(B, *MRI, VData, true);
65215ffd83dbSDimitry Andric     if (RepackedReg != VData) {
65225ffd83dbSDimitry Andric       MI.getOperand(1).setReg(RepackedReg);
65235ffd83dbSDimitry Andric     }
65245ffd83dbSDimitry Andric 
65255ffd83dbSDimitry Andric     return true;
65265ffd83dbSDimitry Andric   }
65275ffd83dbSDimitry Andric 
65285ffd83dbSDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
65295ffd83dbSDimitry Andric   const LLT EltTy = Ty.getScalarType();
65305ffd83dbSDimitry Andric   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
65315ffd83dbSDimitry Andric 
65325ffd83dbSDimitry Andric   // Confirm that the return type is large enough for the dmask specified
65335ffd83dbSDimitry Andric   if (NumElts < DMaskLanes)
65345ffd83dbSDimitry Andric     return false;
65355ffd83dbSDimitry Andric 
65365ffd83dbSDimitry Andric   if (NumElts > 4 || DMaskLanes > 4)
65375ffd83dbSDimitry Andric     return false;
65385ffd83dbSDimitry Andric 
65397a6dacacSDimitry Andric   // Image atomic instructions are using DMask to specify how many bits
65407a6dacacSDimitry Andric   // input/output data will have. 32-bits (s32, v2s16) or 64-bits (s64, v4s16).
65417a6dacacSDimitry Andric   // DMaskLanes for image atomic has default value '0'.
65427a6dacacSDimitry Andric   // We must be sure that atomic variants (especially packed) will not be
65437a6dacacSDimitry Andric   // truncated from v2s16 or v4s16 to s16 type.
65447a6dacacSDimitry Andric   //
65457a6dacacSDimitry Andric   // ChangeElementCount will be needed for image load where Ty is always scalar.
65465ffd83dbSDimitry Andric   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
6547fe6060f1SDimitry Andric   const LLT AdjustedTy =
65487a6dacacSDimitry Andric       DMaskLanes == 0
65497a6dacacSDimitry Andric           ? Ty
65507a6dacacSDimitry Andric           : Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
65515ffd83dbSDimitry Andric 
65525ffd83dbSDimitry Andric   // The raw dword aligned data component of the load. The only legal cases
65535ffd83dbSDimitry Andric   // where this matters should be when using the packed D16 format, for
65545ffd83dbSDimitry Andric   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
65555ffd83dbSDimitry Andric   LLT RoundedTy;
65565ffd83dbSDimitry Andric 
6557bdd1243dSDimitry Andric   // S32 vector to cover all data, plus TFE result element.
65585ffd83dbSDimitry Andric   LLT TFETy;
65595ffd83dbSDimitry Andric 
65605ffd83dbSDimitry Andric   // Register type to use for each loaded component. Will be S32 or V2S16.
65615ffd83dbSDimitry Andric   LLT RegTy;
65625ffd83dbSDimitry Andric 
65635ffd83dbSDimitry Andric   if (IsD16 && ST.hasUnpackedD16VMem()) {
6564fe6060f1SDimitry Andric     RoundedTy =
6565fe6060f1SDimitry Andric         LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
6566fe6060f1SDimitry Andric     TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
65675ffd83dbSDimitry Andric     RegTy = S32;
65685ffd83dbSDimitry Andric   } else {
65695ffd83dbSDimitry Andric     unsigned EltSize = EltTy.getSizeInBits();
65705ffd83dbSDimitry Andric     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
65715ffd83dbSDimitry Andric     unsigned RoundedSize = 32 * RoundedElts;
6572fe6060f1SDimitry Andric     RoundedTy = LLT::scalarOrVector(
6573fe6060f1SDimitry Andric         ElementCount::getFixed(RoundedSize / EltSize), EltSize);
6574fe6060f1SDimitry Andric     TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
65755ffd83dbSDimitry Andric     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
65765ffd83dbSDimitry Andric   }
65775ffd83dbSDimitry Andric 
65785ffd83dbSDimitry Andric   // The return type does not need adjustment.
65795ffd83dbSDimitry Andric   // TODO: Should we change s16 case to s32 or <2 x s16>?
65805ffd83dbSDimitry Andric   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
65815ffd83dbSDimitry Andric     return true;
65825ffd83dbSDimitry Andric 
65835ffd83dbSDimitry Andric   Register Dst1Reg;
65845ffd83dbSDimitry Andric 
65855ffd83dbSDimitry Andric   // Insert after the instruction.
65865ffd83dbSDimitry Andric   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
65875ffd83dbSDimitry Andric 
65885ffd83dbSDimitry Andric   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
65895ffd83dbSDimitry Andric   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
65905ffd83dbSDimitry Andric   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
65915ffd83dbSDimitry Andric   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
65925ffd83dbSDimitry Andric 
65935ffd83dbSDimitry Andric   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
65945ffd83dbSDimitry Andric 
65955ffd83dbSDimitry Andric   MI.getOperand(0).setReg(NewResultReg);
65965ffd83dbSDimitry Andric 
65975ffd83dbSDimitry Andric   // In the IR, TFE is supposed to be used with a 2 element struct return
6598349cc55cSDimitry Andric   // type. The instruction really returns these two values in one contiguous
65995ffd83dbSDimitry Andric   // register, with one additional dword beyond the loaded data. Rewrite the
66005ffd83dbSDimitry Andric   // return type to use a single register result.
66015ffd83dbSDimitry Andric 
66025ffd83dbSDimitry Andric   if (IsTFE) {
66035ffd83dbSDimitry Andric     Dst1Reg = MI.getOperand(1).getReg();
66045ffd83dbSDimitry Andric     if (MRI->getType(Dst1Reg) != S32)
66055ffd83dbSDimitry Andric       return false;
66065ffd83dbSDimitry Andric 
66075ffd83dbSDimitry Andric     // TODO: Make sure the TFE operand bit is set.
660881ad6265SDimitry Andric     MI.removeOperand(1);
66095ffd83dbSDimitry Andric 
66105ffd83dbSDimitry Andric     // Handle the easy case that requires no repack instructions.
66115ffd83dbSDimitry Andric     if (Ty == S32) {
66125ffd83dbSDimitry Andric       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
66135ffd83dbSDimitry Andric       return true;
66145ffd83dbSDimitry Andric     }
66155ffd83dbSDimitry Andric   }
66165ffd83dbSDimitry Andric 
66175ffd83dbSDimitry Andric   // Now figure out how to copy the new result register back into the old
66185ffd83dbSDimitry Andric   // result.
66195ffd83dbSDimitry Andric   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
66205ffd83dbSDimitry Andric 
66215ffd83dbSDimitry Andric   const int NumDataRegs = IsTFE ? ResultNumRegs - 1  : ResultNumRegs;
66225ffd83dbSDimitry Andric 
66235ffd83dbSDimitry Andric   if (ResultNumRegs == 1) {
66245ffd83dbSDimitry Andric     assert(!IsTFE);
66255ffd83dbSDimitry Andric     ResultRegs[0] = NewResultReg;
66265ffd83dbSDimitry Andric   } else {
66275ffd83dbSDimitry Andric     // We have to repack into a new vector of some kind.
66285ffd83dbSDimitry Andric     for (int I = 0; I != NumDataRegs; ++I)
66295ffd83dbSDimitry Andric       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
66305ffd83dbSDimitry Andric     B.buildUnmerge(ResultRegs, NewResultReg);
66315ffd83dbSDimitry Andric 
66325ffd83dbSDimitry Andric     // Drop the final TFE element to get the data part. The TFE result is
66335ffd83dbSDimitry Andric     // directly written to the right place already.
66345ffd83dbSDimitry Andric     if (IsTFE)
66355ffd83dbSDimitry Andric       ResultRegs.resize(NumDataRegs);
66365ffd83dbSDimitry Andric   }
66375ffd83dbSDimitry Andric 
66385ffd83dbSDimitry Andric   // For an s16 scalar result, we form an s32 result with a truncate regardless
66395ffd83dbSDimitry Andric   // of packed vs. unpacked.
66405ffd83dbSDimitry Andric   if (IsD16 && !Ty.isVector()) {
66415ffd83dbSDimitry Andric     B.buildTrunc(DstReg, ResultRegs[0]);
66425ffd83dbSDimitry Andric     return true;
66435ffd83dbSDimitry Andric   }
66445ffd83dbSDimitry Andric 
66455ffd83dbSDimitry Andric   // Avoid a build/concat_vector of 1 entry.
66465ffd83dbSDimitry Andric   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
66475ffd83dbSDimitry Andric     B.buildBitcast(DstReg, ResultRegs[0]);
66485ffd83dbSDimitry Andric     return true;
66495ffd83dbSDimitry Andric   }
66505ffd83dbSDimitry Andric 
66515ffd83dbSDimitry Andric   assert(Ty.isVector());
66525ffd83dbSDimitry Andric 
66535ffd83dbSDimitry Andric   if (IsD16) {
66545ffd83dbSDimitry Andric     // For packed D16 results with TFE enabled, all the data components are
66555ffd83dbSDimitry Andric     // S32. Cast back to the expected type.
66565ffd83dbSDimitry Andric     //
66575ffd83dbSDimitry Andric     // TODO: We don't really need to use load s32 elements. We would only need one
66585ffd83dbSDimitry Andric     // cast for the TFE result if a multiple of v2s16 was used.
66595ffd83dbSDimitry Andric     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
66605ffd83dbSDimitry Andric       for (Register &Reg : ResultRegs)
66615ffd83dbSDimitry Andric         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
66625ffd83dbSDimitry Andric     } else if (ST.hasUnpackedD16VMem()) {
66635ffd83dbSDimitry Andric       for (Register &Reg : ResultRegs)
66645ffd83dbSDimitry Andric         Reg = B.buildTrunc(S16, Reg).getReg(0);
66655ffd83dbSDimitry Andric     }
66665ffd83dbSDimitry Andric   }
66675ffd83dbSDimitry Andric 
66685ffd83dbSDimitry Andric   auto padWithUndef = [&](LLT Ty, int NumElts) {
66695ffd83dbSDimitry Andric     if (NumElts == 0)
66705ffd83dbSDimitry Andric       return;
66715ffd83dbSDimitry Andric     Register Undef = B.buildUndef(Ty).getReg(0);
66725ffd83dbSDimitry Andric     for (int I = 0; I != NumElts; ++I)
66735ffd83dbSDimitry Andric       ResultRegs.push_back(Undef);
66745ffd83dbSDimitry Andric   };
66755ffd83dbSDimitry Andric 
66765ffd83dbSDimitry Andric   // Pad out any elements eliminated due to the dmask.
66775ffd83dbSDimitry Andric   LLT ResTy = MRI->getType(ResultRegs[0]);
66785ffd83dbSDimitry Andric   if (!ResTy.isVector()) {
66795ffd83dbSDimitry Andric     padWithUndef(ResTy, NumElts - ResultRegs.size());
66805ffd83dbSDimitry Andric     B.buildBuildVector(DstReg, ResultRegs);
66815ffd83dbSDimitry Andric     return true;
66825ffd83dbSDimitry Andric   }
66835ffd83dbSDimitry Andric 
66845ffd83dbSDimitry Andric   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
66855ffd83dbSDimitry Andric   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
66865ffd83dbSDimitry Andric 
66875ffd83dbSDimitry Andric   // Deal with the one annoying legal case.
6688fe6060f1SDimitry Andric   const LLT V3S16 = LLT::fixed_vector(3, 16);
66895ffd83dbSDimitry Andric   if (Ty == V3S16) {
66900eae32dcSDimitry Andric     if (IsTFE) {
66910eae32dcSDimitry Andric       if (ResultRegs.size() == 1) {
66920eae32dcSDimitry Andric         NewResultReg = ResultRegs[0];
66930eae32dcSDimitry Andric       } else if (ResultRegs.size() == 2) {
66940eae32dcSDimitry Andric         LLT V4S16 = LLT::fixed_vector(4, 16);
66950eae32dcSDimitry Andric         NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
66960eae32dcSDimitry Andric       } else {
66970eae32dcSDimitry Andric         return false;
66980eae32dcSDimitry Andric       }
66990eae32dcSDimitry Andric     }
67000eae32dcSDimitry Andric 
67010eae32dcSDimitry Andric     if (MRI->getType(DstReg).getNumElements() <
67020eae32dcSDimitry Andric         MRI->getType(NewResultReg).getNumElements()) {
67030eae32dcSDimitry Andric       B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
67040eae32dcSDimitry Andric     } else {
67050eae32dcSDimitry Andric       B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
67060eae32dcSDimitry Andric     }
67075ffd83dbSDimitry Andric     return true;
67085ffd83dbSDimitry Andric   }
67095ffd83dbSDimitry Andric 
67105ffd83dbSDimitry Andric   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
67115ffd83dbSDimitry Andric   B.buildConcatVectors(DstReg, ResultRegs);
67125ffd83dbSDimitry Andric   return true;
67135ffd83dbSDimitry Andric }
67145ffd83dbSDimitry Andric 
// Legalize an s.buffer.load intrinsic: rewrite it into the target
// G_AMDGPU_S_BUFFER_LOAD* pseudo, attach the memory operand the intrinsic is
// missing, and adjust the result type so later stages only see legal,
// power-of-two-sized destinations.
bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
                                              MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  GISelChangeObserver &Observer = Helper.Observer;

  Register OrigDst = MI.getOperand(0).getReg();
  Register Dst;
  LLT Ty = B.getMRI()->getType(OrigDst);
  unsigned Size = Ty.getSizeInBits();
  MachineFunction &MF = B.getMF();
  unsigned Opc = 0;
  // Sub-dword results use the dedicated UBYTE/USHORT pseudos when the
  // subtarget has scalar subword loads; those pseudos define a full 32-bit
  // register, so a fresh temporary def is needed here.
  if (Size < 32 && ST.hasScalarSubwordLoads()) {
    assert(Size == 8 || Size == 16);
    Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
                    : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
    // The 8-bit and 16-bit scalar buffer load instructions have 32-bit
    // destination register.
    Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
  } else {
    Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
    Dst = OrigDst;
  }

  Observer.changingInstr(MI);

  // Handle needing to s.buffer.load() a p8 value.
  if (hasBufferRsrcWorkaround(Ty)) {
    Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0);
    B.setInsertPt(B.getMBB(), MI);
  }
  // If this result type should instead be accessed as a scalar of the same
  // total width, rewrite the destination through a bitcast.
  if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
    Ty = getBitcastRegisterType(Ty);
    Helper.bitcastDst(MI, Ty, 0);
    // bitcastDst inserts before MI; restore the insert point to MI itself.
    B.setInsertPt(B.getMBB(), MI);
  }

  // FIXME: We don't really need this intermediate instruction. The intrinsic
  // should be fixed to have a memory operand. Since it's readnone, we're not
  // allowed to add one.
  MI.setDesc(B.getTII().get(Opc));
  MI.removeOperand(1); // Remove intrinsic ID

  // FIXME: When intrinsic definition is fixed, this should have an MMO already.
  const unsigned MemSize = (Size + 7) / 8;
  const Align MemAlign = B.getDataLayout().getABITypeAlign(
      getTypeForLLT(Ty, MF.getFunction().getContext()));
  // Mark the access invariant/dereferenceable: a buffer descriptor load from
  // constant memory, so it may be freely reordered and hoisted.
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      MemSize, MemAlign);
  MI.addMemOperand(MF, MMO);
  // For the subword case the pseudo wrote a fresh 32-bit register; truncate
  // it back down to the width the original intrinsic returned.
  if (Dst != OrigDst) {
    MI.getOperand(0).setReg(Dst);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    B.buildTrunc(OrigDst, Dst);
  }

  // If we don't have 96-bit result scalar loads, widening to 128-bit should
  // always be legal. We may need to restore this to a 96-bit result if it turns
  // out this needs to be converted to a vector load during RegBankSelect.
  if (!isPowerOf2_32(Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) {
    if (Ty.isVector())
      Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
    else
      Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
  }

  Observer.changedInstr(MI);
  return true;
}
67865ffd83dbSDimitry Andric 
6787e8d8bef9SDimitry Andric // TODO: Move to selection
legalizeTrap(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const6788*0fca6ea1SDimitry Andric bool AMDGPULegalizerInfo::legalizeTrap(MachineInstr &MI,
67890b57cec5SDimitry Andric                                        MachineRegisterInfo &MRI,
67900b57cec5SDimitry Andric                                        MachineIRBuilder &B) const {
6791fe6060f1SDimitry Andric   if (!ST.isTrapHandlerEnabled() ||
6792fe6060f1SDimitry Andric       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
6793fe6060f1SDimitry Andric     return legalizeTrapEndpgm(MI, MRI, B);
6794fe6060f1SDimitry Andric 
679506c3fb27SDimitry Andric   return ST.supportsGetDoorbellID() ?
679606c3fb27SDimitry Andric          legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B);
6797fe6060f1SDimitry Andric }
6798fe6060f1SDimitry Andric 
legalizeTrapEndpgm(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const6799fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
6800fe6060f1SDimitry Andric     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
680106c3fb27SDimitry Andric   const DebugLoc &DL = MI.getDebugLoc();
680206c3fb27SDimitry Andric   MachineBasicBlock &BB = B.getMBB();
680306c3fb27SDimitry Andric   MachineFunction *MF = BB.getParent();
680406c3fb27SDimitry Andric 
680506c3fb27SDimitry Andric   if (BB.succ_empty() && std::next(MI.getIterator()) == BB.end()) {
680606c3fb27SDimitry Andric     BuildMI(BB, BB.end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
680706c3fb27SDimitry Andric       .addImm(0);
680806c3fb27SDimitry Andric     MI.eraseFromParent();
680906c3fb27SDimitry Andric     return true;
681006c3fb27SDimitry Andric   }
681106c3fb27SDimitry Andric 
681206c3fb27SDimitry Andric   // We need a block split to make the real endpgm a terminator. We also don't
681306c3fb27SDimitry Andric   // want to break phis in successor blocks, so we can't just delete to the
681406c3fb27SDimitry Andric   // end of the block.
681506c3fb27SDimitry Andric   BB.splitAt(MI, false /*UpdateLiveIns*/);
681606c3fb27SDimitry Andric   MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
681706c3fb27SDimitry Andric   MF->push_back(TrapBB);
681806c3fb27SDimitry Andric   BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
681906c3fb27SDimitry Andric     .addImm(0);
682006c3fb27SDimitry Andric   BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
682106c3fb27SDimitry Andric     .addMBB(TrapBB);
682206c3fb27SDimitry Andric 
682306c3fb27SDimitry Andric   BB.addSuccessor(TrapBB);
6824fe6060f1SDimitry Andric   MI.eraseFromParent();
6825fe6060f1SDimitry Andric   return true;
6826fe6060f1SDimitry Andric }
6827fe6060f1SDimitry Andric 
// Lower a trap for the AMDHSA trap-handler ABI on subtargets that cannot
// query the doorbell ID directly (see legalizeTrap's dispatch): the handler
// receives the queue pointer in SGPR0_SGPR1, so materialize that pointer —
// from the implicit kernarg on COV5+, otherwise from the preloaded QUEUE_PTR
// input — then emit S_TRAP with the register as an implicit use.
bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const LLT S64 = LLT::scalar(64);

  Register SGPR01(AMDGPU::SGPR0_SGPR1);
  // For code object version 5, queue_ptr is passed through implicit kernarg.
  if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
      AMDGPU::AMDHSA_COV5) {
    AMDGPUTargetLowering::ImplicitParameter Param =
        AMDGPUTargetLowering::QUEUE_PTR;
    // Byte offset of the queue pointer within the implicit kernarg area.
    uint64_t Offset =
        ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);

    Register KernargPtrReg = MRI.createGenericVirtualRegister(
        LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

    // Fail legalization if the kernarg segment pointer isn't available.
    if (!loadInputValue(KernargPtrReg, B,
                        AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
      return false;

    // TODO: can we be smarter about machine pointer info?
    MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
    // Kernarg contents never change during the dispatch, so the load is
    // invariant and dereferenceable.
    MachineMemOperand *MMO = MF.getMachineMemOperand(
        PtrInfo,
        MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
            MachineMemOperand::MOInvariant,
        LLT::scalar(64), commonAlignment(Align(64), Offset));

    // Pointer address
    Register LoadAddr = MRI.createGenericVirtualRegister(
        LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
    B.buildPtrAdd(LoadAddr, KernargPtrReg,
                  B.buildConstant(LLT::scalar(64), Offset).getReg(0));
    // Load address
    Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
    // Place the queue pointer where the trap handler expects it, then trap.
    B.buildCopy(SGPR01, Temp);
    B.buildInstr(AMDGPU::S_TRAP)
        .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
        .addReg(SGPR01, RegState::Implicit);
    MI.eraseFromParent();
    return true;
  }

  // Pass queue pointer to trap handler as input, and insert trap instruction
  // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
  Register LiveIn =
    MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
  if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
    return false;

  B.buildCopy(SGPR01, LiveIn);
  B.buildInstr(AMDGPU::S_TRAP)
      .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
      .addReg(SGPR01, RegState::Implicit);

  MI.eraseFromParent();
  return true;
}
68875ffd83dbSDimitry Andric 
legalizeTrapHsa(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const6888*0fca6ea1SDimitry Andric bool AMDGPULegalizerInfo::legalizeTrapHsa(MachineInstr &MI,
6889*0fca6ea1SDimitry Andric                                           MachineRegisterInfo &MRI,
6890*0fca6ea1SDimitry Andric                                           MachineIRBuilder &B) const {
6891*0fca6ea1SDimitry Andric   // We need to simulate the 's_trap 2' instruction on targets that run in
6892*0fca6ea1SDimitry Andric   // PRIV=1 (where it is treated as a nop).
6893*0fca6ea1SDimitry Andric   if (ST.hasPrivEnabledTrap2NopBug()) {
6894*0fca6ea1SDimitry Andric     ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI,
6895*0fca6ea1SDimitry Andric                                            MI.getDebugLoc());
6896*0fca6ea1SDimitry Andric     MI.eraseFromParent();
6897*0fca6ea1SDimitry Andric     return true;
6898*0fca6ea1SDimitry Andric   }
6899*0fca6ea1SDimitry Andric 
6900fe6060f1SDimitry Andric   B.buildInstr(AMDGPU::S_TRAP)
6901fe6060f1SDimitry Andric       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
69025ffd83dbSDimitry Andric   MI.eraseFromParent();
69035ffd83dbSDimitry Andric   return true;
69045ffd83dbSDimitry Andric }
69055ffd83dbSDimitry Andric 
legalizeDebugTrap(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const6906*0fca6ea1SDimitry Andric bool AMDGPULegalizerInfo::legalizeDebugTrap(MachineInstr &MI,
6907*0fca6ea1SDimitry Andric                                             MachineRegisterInfo &MRI,
6908*0fca6ea1SDimitry Andric                                             MachineIRBuilder &B) const {
6909349cc55cSDimitry Andric   // Is non-HSA path or trap-handler disabled? Then, report a warning
69105ffd83dbSDimitry Andric   // accordingly
6911fe6060f1SDimitry Andric   if (!ST.isTrapHandlerEnabled() ||
6912fe6060f1SDimitry Andric       ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
69135ffd83dbSDimitry Andric     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
69145ffd83dbSDimitry Andric                                      "debugtrap handler not supported",
69155ffd83dbSDimitry Andric                                      MI.getDebugLoc(), DS_Warning);
69165ffd83dbSDimitry Andric     LLVMContext &Ctx = B.getMF().getFunction().getContext();
69175ffd83dbSDimitry Andric     Ctx.diagnose(NoTrap);
69185ffd83dbSDimitry Andric   } else {
69195ffd83dbSDimitry Andric     // Insert debug-trap instruction
6920fe6060f1SDimitry Andric     B.buildInstr(AMDGPU::S_TRAP)
6921fe6060f1SDimitry Andric         .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
69225ffd83dbSDimitry Andric   }
69235ffd83dbSDimitry Andric 
69245ffd83dbSDimitry Andric   MI.eraseFromParent();
69255ffd83dbSDimitry Andric   return true;
69265ffd83dbSDimitry Andric }
69275ffd83dbSDimitry Andric 
// Legalize the BVH ray-intersection intrinsic: repack its scalar and vector
// operands into the VGPR layout the chosen IMAGE_BVH*_INTERSECT_RAY MIMG
// opcode expects, then emit G_AMDGPU_INTRIN_BVH_INTERSECT_RAY. Emits a
// diagnostic and returns false on subtargets without the GFX10_A encoding.
bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
                                               MachineIRBuilder &B) const {
  MachineRegisterInfo &MRI = *B.getMRI();
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  const LLT V3S32 = LLT::fixed_vector(3, 32);

  // Operand 1 is the intrinsic ID; the ray description starts at operand 2.
  Register DstReg = MI.getOperand(0).getReg();
  Register NodePtr = MI.getOperand(2).getReg();
  Register RayExtent = MI.getOperand(3).getReg();
  Register RayOrigin = MI.getOperand(4).getReg();
  Register RayDir = MI.getOperand(5).getReg();
  Register RayInvDir = MI.getOperand(6).getReg();
  Register TDescr = MI.getOperand(7).getReg();

  if (!ST.hasGFX10_AEncoding()) {
    DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
                                        "intrinsic not supported on subtarget",
                                        MI.getDebugLoc());
    B.getMF().getFunction().getContext().diagnose(BadIntrin);
    return false;
  }

  const bool IsGFX11 = AMDGPU::isGFX11(ST);
  const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
  const bool IsGFX12Plus = AMDGPU::isGFX12Plus(ST);
  // A16 variant: ray direction / inverse-direction elements are 16-bit.
  const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
  // 64-bit vs 32-bit BVH node pointer selects the BVH64 opcodes.
  const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
  const unsigned NumVDataDwords = 4;
  const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
  // On GFX11+ the address operands are pre-packed into wider registers, so
  // the NSA operand count differs from the raw dword count.
  const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
  // GFX12+ always takes the NSA-style path; earlier targets use NSA only when
  // the encoding exists and the address count fits its limit.
  const bool UseNSA =
      IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());

  // Indexed as BaseOpcodes[Is64][IsA16].
  const unsigned BaseOpcodes[2][2] = {
      {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
      {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
       AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
  int Opcode;
  if (UseNSA) {
    Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
                                   IsGFX12Plus ? AMDGPU::MIMGEncGfx12
                                   : IsGFX11   ? AMDGPU::MIMGEncGfx11NSA
                                               : AMDGPU::MIMGEncGfx10NSA,
                                   NumVDataDwords, NumVAddrDwords);
  } else {
    // There is no non-NSA encoding on GFX12+.
    assert(!IsGFX12Plus);
    Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
                                   IsGFX11 ? AMDGPU::MIMGEncGfx11Default
                                           : AMDGPU::MIMGEncGfx10Default,
                                   NumVDataDwords, NumVAddrDwords);
  }
  assert(Opcode != -1);

  SmallVector<Register, 12> Ops;
  if (UseNSA && IsGFX11Plus) {
    // GFX11+ NSA form: each 3-element source is carried as one v3s32 operand.
    auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
      auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
      auto Merged = B.buildMergeLikeInstr(
          V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
      Ops.push_back(Merged.getReg(0));
    };

    Ops.push_back(NodePtr);
    Ops.push_back(RayExtent);
    packLanes(RayOrigin);

    if (IsA16) {
      // A16: interleave inverse-direction and direction halves, pairing
      // (inv_dir[i], dir[i]) into each packed 2x16 word, then bitcast the
      // three pairs to s32 and merge them into one v3s32 operand.
      auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
      auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
      auto MergedDir = B.buildMergeLikeInstr(
          V3S32,
          {B.buildBitcast(
                S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
                                                   UnmergeRayDir.getReg(0)}))
               .getReg(0),
           B.buildBitcast(
                S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
                                                   UnmergeRayDir.getReg(1)}))
               .getReg(0),
           B.buildBitcast(
                S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
                                                   UnmergeRayDir.getReg(2)}))
               .getReg(0)});
      Ops.push_back(MergedDir.getReg(0));
    } else {
      packLanes(RayDir);
      packLanes(RayInvDir);
    }
  } else {
    // Pre-GFX11 (or non-NSA) form: operands are flat 32-bit dwords.
    if (Is64) {
      // Split the 64-bit node pointer into two dwords.
      auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
      Ops.push_back(Unmerge.getReg(0));
      Ops.push_back(Unmerge.getReg(1));
    } else {
      Ops.push_back(NodePtr);
    }
    Ops.push_back(RayExtent);

    // Append each element of a 3 x s32 source as its own dword operand.
    auto packLanes = [&Ops, &S32, &B](Register Src) {
      auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
      Ops.push_back(Unmerge.getReg(0));
      Ops.push_back(Unmerge.getReg(1));
      Ops.push_back(Unmerge.getReg(2));
    };

    packLanes(RayOrigin);
    if (IsA16) {
      // A16: tightly pack the six 16-bit dir/inv-dir elements into three
      // dwords as (dir0,dir1) (dir2,inv0) (inv1,inv2).
      auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
      auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
      Register R1 = MRI.createGenericVirtualRegister(S32);
      Register R2 = MRI.createGenericVirtualRegister(S32);
      Register R3 = MRI.createGenericVirtualRegister(S32);
      B.buildMergeLikeInstr(R1,
                            {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
      B.buildMergeLikeInstr(
          R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
      B.buildMergeLikeInstr(
          R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
      Ops.push_back(R1);
      Ops.push_back(R2);
      Ops.push_back(R3);
    } else {
      packLanes(RayDir);
      packLanes(RayInvDir);
    }
  }

  if (!UseNSA) {
    // Build a single vector containing all the operands so far prepared.
    LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
    Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
    Ops.clear();
    Ops.push_back(MergedOps);
  }

  auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
    .addDef(DstReg)
    .addImm(Opcode);

  for (Register R : Ops) {
    MIB.addUse(R);
  }

  // Trailing operands: texture descriptor, an A16 flag immediate, and the
  // original memory operands.
  MIB.addUse(TDescr)
     .addImm(IsA16 ? 1 : 0)
     .cloneMemRefs(MI);

  MI.eraseFromParent();
  return true;
}
7080e8d8bef9SDimitry Andric 
legalizeFPTruncRound(MachineInstr & MI,MachineIRBuilder & B) const708181ad6265SDimitry Andric bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
708281ad6265SDimitry Andric                                                MachineIRBuilder &B) const {
708381ad6265SDimitry Andric   unsigned Opc;
708481ad6265SDimitry Andric   int RoundMode = MI.getOperand(2).getImm();
708581ad6265SDimitry Andric 
708681ad6265SDimitry Andric   if (RoundMode == (int)RoundingMode::TowardPositive)
708781ad6265SDimitry Andric     Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
708881ad6265SDimitry Andric   else if (RoundMode == (int)RoundingMode::TowardNegative)
708981ad6265SDimitry Andric     Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
709081ad6265SDimitry Andric   else
709181ad6265SDimitry Andric     return false;
709281ad6265SDimitry Andric 
709381ad6265SDimitry Andric   B.buildInstr(Opc)
709481ad6265SDimitry Andric       .addDef(MI.getOperand(0).getReg())
709581ad6265SDimitry Andric       .addUse(MI.getOperand(1).getReg());
709681ad6265SDimitry Andric 
709704eeddc0SDimitry Andric   MI.eraseFromParent();
709881ad6265SDimitry Andric 
709904eeddc0SDimitry Andric   return true;
710004eeddc0SDimitry Andric }
710104eeddc0SDimitry Andric 
legalizeStackSave(MachineInstr & MI,MachineIRBuilder & B) const71025f757f3fSDimitry Andric bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI,
71035f757f3fSDimitry Andric                                             MachineIRBuilder &B) const {
71045f757f3fSDimitry Andric   const SITargetLowering *TLI = ST.getTargetLowering();
71055f757f3fSDimitry Andric   Register StackPtr = TLI->getStackPointerRegisterToSaveRestore();
71065f757f3fSDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
71075f757f3fSDimitry Andric   B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
71085f757f3fSDimitry Andric   MI.eraseFromParent();
71095f757f3fSDimitry Andric   return true;
71105f757f3fSDimitry Andric }
71115f757f3fSDimitry Andric 
legalizeWaveID(MachineInstr & MI,MachineIRBuilder & B) const7112b3edf446SDimitry Andric bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI,
7113b3edf446SDimitry Andric                                          MachineIRBuilder &B) const {
7114b3edf446SDimitry Andric   // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
7115b3edf446SDimitry Andric   if (!ST.hasArchitectedSGPRs())
7116b3edf446SDimitry Andric     return false;
7117b3edf446SDimitry Andric   LLT S32 = LLT::scalar(32);
7118b3edf446SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
7119b3edf446SDimitry Andric   auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8));
7120b3edf446SDimitry Andric   auto LSB = B.buildConstant(S32, 25);
7121b3edf446SDimitry Andric   auto Width = B.buildConstant(S32, 5);
7122b3edf446SDimitry Andric   B.buildUbfx(DstReg, TTMP8, LSB, Width);
7123b3edf446SDimitry Andric   MI.eraseFromParent();
7124b3edf446SDimitry Andric   return true;
7125b3edf446SDimitry Andric }
7126b3edf446SDimitry Andric 
// Hardware-register field encodings used by the FP-environment lowering
// below: hwreg(MODE, 0, 23) for the FP mode bits and hwreg(TRAPSTS, 0, 5)
// for the trap-status bits.
static constexpr unsigned FPEnvModeBitField =
    AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 0, 23);

static constexpr unsigned FPEnvTrapBitField =
    AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_TRAPSTS, 0, 5);
7132*0fca6ea1SDimitry Andric 
legalizeGetFPEnv(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const7133*0fca6ea1SDimitry Andric bool AMDGPULegalizerInfo::legalizeGetFPEnv(MachineInstr &MI,
7134*0fca6ea1SDimitry Andric                                            MachineRegisterInfo &MRI,
7135*0fca6ea1SDimitry Andric                                            MachineIRBuilder &B) const {
7136*0fca6ea1SDimitry Andric   Register Src = MI.getOperand(0).getReg();
7137*0fca6ea1SDimitry Andric   if (MRI.getType(Src) != S64)
7138*0fca6ea1SDimitry Andric     return false;
7139*0fca6ea1SDimitry Andric 
7140*0fca6ea1SDimitry Andric   auto ModeReg =
7141*0fca6ea1SDimitry Andric       B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
7142*0fca6ea1SDimitry Andric                        /*HasSideEffects=*/true, /*isConvergent=*/false)
7143*0fca6ea1SDimitry Andric           .addImm(FPEnvModeBitField);
7144*0fca6ea1SDimitry Andric   auto TrapReg =
7145*0fca6ea1SDimitry Andric       B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
7146*0fca6ea1SDimitry Andric                        /*HasSideEffects=*/true, /*isConvergent=*/false)
7147*0fca6ea1SDimitry Andric           .addImm(FPEnvTrapBitField);
7148*0fca6ea1SDimitry Andric   B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
7149*0fca6ea1SDimitry Andric   MI.eraseFromParent();
7150*0fca6ea1SDimitry Andric   return true;
7151*0fca6ea1SDimitry Andric }
7152*0fca6ea1SDimitry Andric 
legalizeSetFPEnv(MachineInstr & MI,MachineRegisterInfo & MRI,MachineIRBuilder & B) const7153*0fca6ea1SDimitry Andric bool AMDGPULegalizerInfo::legalizeSetFPEnv(MachineInstr &MI,
7154*0fca6ea1SDimitry Andric                                            MachineRegisterInfo &MRI,
7155*0fca6ea1SDimitry Andric                                            MachineIRBuilder &B) const {
7156*0fca6ea1SDimitry Andric   Register Src = MI.getOperand(0).getReg();
7157*0fca6ea1SDimitry Andric   if (MRI.getType(Src) != S64)
7158*0fca6ea1SDimitry Andric     return false;
7159*0fca6ea1SDimitry Andric 
7160*0fca6ea1SDimitry Andric   auto Unmerge = B.buildUnmerge({S32, S32}, MI.getOperand(0));
7161*0fca6ea1SDimitry Andric   B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
7162*0fca6ea1SDimitry Andric                    /*HasSideEffects=*/true, /*isConvergent=*/false)
7163*0fca6ea1SDimitry Andric       .addImm(static_cast<int16_t>(FPEnvModeBitField))
7164*0fca6ea1SDimitry Andric       .addReg(Unmerge.getReg(0));
7165*0fca6ea1SDimitry Andric   B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
7166*0fca6ea1SDimitry Andric                    /*HasSideEffects=*/true, /*isConvergent=*/false)
7167*0fca6ea1SDimitry Andric       .addImm(static_cast<int16_t>(FPEnvTrapBitField))
7168*0fca6ea1SDimitry Andric       .addReg(Unmerge.getReg(1));
7169*0fca6ea1SDimitry Andric   MI.eraseFromParent();
7170*0fca6ea1SDimitry Andric   return true;
7171*0fca6ea1SDimitry Andric }
7172*0fca6ea1SDimitry Andric 
legalizeIntrinsic(LegalizerHelper & Helper,MachineInstr & MI) const71735ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
71745ffd83dbSDimitry Andric                                             MachineInstr &MI) const {
71755ffd83dbSDimitry Andric   MachineIRBuilder &B = Helper.MIRBuilder;
71765ffd83dbSDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
71775ffd83dbSDimitry Andric 
71780b57cec5SDimitry Andric   // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
71795f757f3fSDimitry Andric   auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
7180480093f4SDimitry Andric   switch (IntrID) {
7181480093f4SDimitry Andric   case Intrinsic::amdgcn_if:
7182480093f4SDimitry Andric   case Intrinsic::amdgcn_else: {
7183480093f4SDimitry Andric     MachineInstr *Br = nullptr;
71845ffd83dbSDimitry Andric     MachineBasicBlock *UncondBrTarget = nullptr;
7185e8d8bef9SDimitry Andric     bool Negated = false;
7186e8d8bef9SDimitry Andric     if (MachineInstr *BrCond =
7187e8d8bef9SDimitry Andric             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
71880b57cec5SDimitry Andric       const SIRegisterInfo *TRI
71890b57cec5SDimitry Andric         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
71900b57cec5SDimitry Andric 
71910b57cec5SDimitry Andric       Register Def = MI.getOperand(1).getReg();
71920b57cec5SDimitry Andric       Register Use = MI.getOperand(3).getReg();
7193480093f4SDimitry Andric 
71945ffd83dbSDimitry Andric       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
7195e8d8bef9SDimitry Andric 
7196e8d8bef9SDimitry Andric       if (Negated)
7197e8d8bef9SDimitry Andric         std::swap(CondBrTarget, UncondBrTarget);
7198e8d8bef9SDimitry Andric 
71995ffd83dbSDimitry Andric       B.setInsertPt(B.getMBB(), BrCond->getIterator());
7200480093f4SDimitry Andric       if (IntrID == Intrinsic::amdgcn_if) {
72010b57cec5SDimitry Andric         B.buildInstr(AMDGPU::SI_IF)
72020b57cec5SDimitry Andric           .addDef(Def)
72030b57cec5SDimitry Andric           .addUse(Use)
72045ffd83dbSDimitry Andric           .addMBB(UncondBrTarget);
7205480093f4SDimitry Andric       } else {
7206480093f4SDimitry Andric         B.buildInstr(AMDGPU::SI_ELSE)
7207480093f4SDimitry Andric             .addDef(Def)
7208480093f4SDimitry Andric             .addUse(Use)
7209e8d8bef9SDimitry Andric             .addMBB(UncondBrTarget);
7210480093f4SDimitry Andric       }
7211480093f4SDimitry Andric 
72125ffd83dbSDimitry Andric       if (Br) {
72135ffd83dbSDimitry Andric         Br->getOperand(0).setMBB(CondBrTarget);
72145ffd83dbSDimitry Andric       } else {
72155ffd83dbSDimitry Andric         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
72165ffd83dbSDimitry Andric         // since we're swapping branch targets it needs to be reinserted.
72175ffd83dbSDimitry Andric         // FIXME: IRTranslator should probably not do this
72185ffd83dbSDimitry Andric         B.buildBr(*CondBrTarget);
72195ffd83dbSDimitry Andric       }
72200b57cec5SDimitry Andric 
72210b57cec5SDimitry Andric       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
72220b57cec5SDimitry Andric       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
72230b57cec5SDimitry Andric       MI.eraseFromParent();
72240b57cec5SDimitry Andric       BrCond->eraseFromParent();
72250b57cec5SDimitry Andric       return true;
72260b57cec5SDimitry Andric     }
72270b57cec5SDimitry Andric 
72280b57cec5SDimitry Andric     return false;
72290b57cec5SDimitry Andric   }
72300b57cec5SDimitry Andric   case Intrinsic::amdgcn_loop: {
7231480093f4SDimitry Andric     MachineInstr *Br = nullptr;
72325ffd83dbSDimitry Andric     MachineBasicBlock *UncondBrTarget = nullptr;
7233e8d8bef9SDimitry Andric     bool Negated = false;
7234e8d8bef9SDimitry Andric     if (MachineInstr *BrCond =
7235e8d8bef9SDimitry Andric             verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
72360b57cec5SDimitry Andric       const SIRegisterInfo *TRI
72370b57cec5SDimitry Andric         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
72380b57cec5SDimitry Andric 
72395ffd83dbSDimitry Andric       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
72400b57cec5SDimitry Andric       Register Reg = MI.getOperand(2).getReg();
72415ffd83dbSDimitry Andric 
7242e8d8bef9SDimitry Andric       if (Negated)
7243e8d8bef9SDimitry Andric         std::swap(CondBrTarget, UncondBrTarget);
7244e8d8bef9SDimitry Andric 
72455ffd83dbSDimitry Andric       B.setInsertPt(B.getMBB(), BrCond->getIterator());
72460b57cec5SDimitry Andric       B.buildInstr(AMDGPU::SI_LOOP)
72470b57cec5SDimitry Andric         .addUse(Reg)
72485ffd83dbSDimitry Andric         .addMBB(UncondBrTarget);
72495ffd83dbSDimitry Andric 
72505ffd83dbSDimitry Andric       if (Br)
72515ffd83dbSDimitry Andric         Br->getOperand(0).setMBB(CondBrTarget);
72525ffd83dbSDimitry Andric       else
72535ffd83dbSDimitry Andric         B.buildBr(*CondBrTarget);
72545ffd83dbSDimitry Andric 
72550b57cec5SDimitry Andric       MI.eraseFromParent();
72560b57cec5SDimitry Andric       BrCond->eraseFromParent();
72570b57cec5SDimitry Andric       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
72580b57cec5SDimitry Andric       return true;
72590b57cec5SDimitry Andric     }
72600b57cec5SDimitry Andric 
72610b57cec5SDimitry Andric     return false;
72620b57cec5SDimitry Andric   }
7263*0fca6ea1SDimitry Andric   case Intrinsic::amdgcn_addrspacecast_nonnull:
7264*0fca6ea1SDimitry Andric     return legalizeAddrSpaceCast(MI, MRI, B);
726506c3fb27SDimitry Andric   case Intrinsic::amdgcn_make_buffer_rsrc:
726606c3fb27SDimitry Andric     return legalizePointerAsRsrcIntrin(MI, MRI, B);
72670b57cec5SDimitry Andric   case Intrinsic::amdgcn_kernarg_segment_ptr:
72685ffd83dbSDimitry Andric     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
72695ffd83dbSDimitry Andric       // This only makes sense to call in a kernel, so just lower to null.
72705ffd83dbSDimitry Andric       B.buildConstant(MI.getOperand(0).getReg(), 0);
72715ffd83dbSDimitry Andric       MI.eraseFromParent();
72725ffd83dbSDimitry Andric       return true;
72735ffd83dbSDimitry Andric     }
72745ffd83dbSDimitry Andric 
72750b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(
72760b57cec5SDimitry Andric       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
72770b57cec5SDimitry Andric   case Intrinsic::amdgcn_implicitarg_ptr:
72780b57cec5SDimitry Andric     return legalizeImplicitArgPtr(MI, MRI, B);
72790b57cec5SDimitry Andric   case Intrinsic::amdgcn_workitem_id_x:
728081ad6265SDimitry Andric     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
72810b57cec5SDimitry Andric                                        AMDGPUFunctionArgInfo::WORKITEM_ID_X);
72820b57cec5SDimitry Andric   case Intrinsic::amdgcn_workitem_id_y:
728381ad6265SDimitry Andric     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
72840b57cec5SDimitry Andric                                        AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
72850b57cec5SDimitry Andric   case Intrinsic::amdgcn_workitem_id_z:
728681ad6265SDimitry Andric     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
72870b57cec5SDimitry Andric                                        AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
72880b57cec5SDimitry Andric   case Intrinsic::amdgcn_workgroup_id_x:
72890b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
72900b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
72910b57cec5SDimitry Andric   case Intrinsic::amdgcn_workgroup_id_y:
72920b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
72930b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
72940b57cec5SDimitry Andric   case Intrinsic::amdgcn_workgroup_id_z:
72950b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
72960b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
7297b3edf446SDimitry Andric   case Intrinsic::amdgcn_wave_id:
7298b3edf446SDimitry Andric     return legalizeWaveID(MI, B);
7299fcaf7f86SDimitry Andric   case Intrinsic::amdgcn_lds_kernel_id:
7300fcaf7f86SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
7301fcaf7f86SDimitry Andric                                       AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
73020b57cec5SDimitry Andric   case Intrinsic::amdgcn_dispatch_ptr:
73030b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
73040b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
73050b57cec5SDimitry Andric   case Intrinsic::amdgcn_queue_ptr:
73060b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
73070b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
73080b57cec5SDimitry Andric   case Intrinsic::amdgcn_implicit_buffer_ptr:
73090b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(
73100b57cec5SDimitry Andric       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
73110b57cec5SDimitry Andric   case Intrinsic::amdgcn_dispatch_id:
73120b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
73130b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
731481ad6265SDimitry Andric   case Intrinsic::r600_read_ngroups_x:
731581ad6265SDimitry Andric     // TODO: Emit error for hsa
731681ad6265SDimitry Andric     return legalizeKernargMemParameter(MI, B,
731781ad6265SDimitry Andric                                        SI::KernelInputOffsets::NGROUPS_X);
731881ad6265SDimitry Andric   case Intrinsic::r600_read_ngroups_y:
731981ad6265SDimitry Andric     return legalizeKernargMemParameter(MI, B,
732081ad6265SDimitry Andric                                        SI::KernelInputOffsets::NGROUPS_Y);
732181ad6265SDimitry Andric   case Intrinsic::r600_read_ngroups_z:
732281ad6265SDimitry Andric     return legalizeKernargMemParameter(MI, B,
732381ad6265SDimitry Andric                                        SI::KernelInputOffsets::NGROUPS_Z);
732481ad6265SDimitry Andric   case Intrinsic::r600_read_local_size_x:
732581ad6265SDimitry Andric     // TODO: Could insert G_ASSERT_ZEXT from s16
732681ad6265SDimitry Andric     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X);
732781ad6265SDimitry Andric   case Intrinsic::r600_read_local_size_y:
732881ad6265SDimitry Andric     // TODO: Could insert G_ASSERT_ZEXT from s16
732981ad6265SDimitry Andric     return legalizeKernargMemParameter(MI, B,  SI::KernelInputOffsets::LOCAL_SIZE_Y);
733081ad6265SDimitry Andric     // TODO: Could insert G_ASSERT_ZEXT from s16
733181ad6265SDimitry Andric   case Intrinsic::r600_read_local_size_z:
733281ad6265SDimitry Andric     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z);
733381ad6265SDimitry Andric   case Intrinsic::r600_read_global_size_x:
733481ad6265SDimitry Andric     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X);
733581ad6265SDimitry Andric   case Intrinsic::r600_read_global_size_y:
733681ad6265SDimitry Andric     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y);
733781ad6265SDimitry Andric   case Intrinsic::r600_read_global_size_z:
733881ad6265SDimitry Andric     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z);
73398bcb0991SDimitry Andric   case Intrinsic::amdgcn_fdiv_fast:
73408bcb0991SDimitry Andric     return legalizeFDIVFastIntrin(MI, MRI, B);
73418bcb0991SDimitry Andric   case Intrinsic::amdgcn_is_shared:
73428bcb0991SDimitry Andric     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
73438bcb0991SDimitry Andric   case Intrinsic::amdgcn_is_private:
73448bcb0991SDimitry Andric     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
73458bcb0991SDimitry Andric   case Intrinsic::amdgcn_wavefrontsize: {
73468bcb0991SDimitry Andric     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
73478bcb0991SDimitry Andric     MI.eraseFromParent();
73488bcb0991SDimitry Andric     return true;
73498bcb0991SDimitry Andric   }
73505ffd83dbSDimitry Andric   case Intrinsic::amdgcn_s_buffer_load:
7351e8d8bef9SDimitry Andric     return legalizeSBufferLoad(Helper, MI);
73528bcb0991SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_store:
735306c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_store:
73545ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_store:
735506c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_store:
73565ffd83dbSDimitry Andric     return legalizeBufferStore(MI, MRI, B, false, false);
73578bcb0991SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_store_format:
735806c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
73595ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_store_format:
736006c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
73615ffd83dbSDimitry Andric     return legalizeBufferStore(MI, MRI, B, false, true);
73625ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_tbuffer_store:
736306c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
73645ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_tbuffer_store:
736506c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
73665ffd83dbSDimitry Andric     return legalizeBufferStore(MI, MRI, B, true, true);
73675ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_load:
736806c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_load:
7369*0fca6ea1SDimitry Andric   case Intrinsic::amdgcn_raw_atomic_buffer_load:
7370*0fca6ea1SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
73715ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_load:
737206c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_load:
73735ffd83dbSDimitry Andric     return legalizeBufferLoad(MI, MRI, B, false, false);
73745ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_load_format:
737506c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
73765ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_load_format:
737706c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
73785ffd83dbSDimitry Andric     return legalizeBufferLoad(MI, MRI, B, true, false);
73795ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_tbuffer_load:
738006c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
73815ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_tbuffer_load:
738206c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
73835ffd83dbSDimitry Andric     return legalizeBufferLoad(MI, MRI, B, true, true);
73845ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
738506c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
73865ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
738706c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
73885ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_add:
738906c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
73905ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_add:
739106c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
73925ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
739306c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
73945ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
739506c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
73965ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
739706c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
73985ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
739906c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
74005ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
740106c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
74025ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
740306c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
74045ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
740506c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
74065ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
740706c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
74085ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
740906c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
74105ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
741106c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
74125ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_and:
741306c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
74145ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_and:
741506c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
74165ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_or:
741706c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
74185ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_or:
741906c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
74205ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
742106c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
74225ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
742306c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
74245ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
742506c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
74265ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
742706c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
74285ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
742906c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
74305ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
743106c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
74325ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
743306c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
74345ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
743506c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
7436fe6060f1SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
743706c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
7438fe6060f1SDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
743906c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
7440fe6060f1SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
744106c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
7442fe6060f1SDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
744306c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
744404eeddc0SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
744506c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
7446bdd1243dSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
744706c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
744804eeddc0SDimitry Andric     return legalizeBufferAtomic(MI, B, IntrID);
7449e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_rsq_clamp:
7450e8d8bef9SDimitry Andric     return legalizeRsqClampIntrinsic(MI, MRI, B);
7451e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_image_bvh_intersect_ray:
7452e8d8bef9SDimitry Andric     return legalizeBVHIntrinsic(MI, B);
7453b3edf446SDimitry Andric   case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
7454b3edf446SDimitry Andric   case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
7455b3edf446SDimitry Andric   case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
7456b3edf446SDimitry Andric   case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
7457b3edf446SDimitry Andric   case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
7458b3edf446SDimitry Andric   case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
7459b3edf446SDimitry Andric   case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
7460b3edf446SDimitry Andric   case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
7461b3edf446SDimitry Andric     Register Index = MI.getOperand(5).getReg();
7462b3edf446SDimitry Andric     LLT S32 = LLT::scalar(32);
7463b3edf446SDimitry Andric     if (MRI.getType(Index) != S32)
7464b3edf446SDimitry Andric       MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
7465b3edf446SDimitry Andric     return true;
7466b3edf446SDimitry Andric   }
7467b3edf446SDimitry Andric   case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
7468b3edf446SDimitry Andric   case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
7469b3edf446SDimitry Andric   case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
7470b3edf446SDimitry Andric     Register Index = MI.getOperand(7).getReg();
7471b3edf446SDimitry Andric     LLT S32 = LLT::scalar(32);
7472b3edf446SDimitry Andric     if (MRI.getType(Index) != S32)
7473b3edf446SDimitry Andric       MI.getOperand(7).setReg(B.buildAnyExt(S32, Index).getReg(0));
7474b3edf446SDimitry Andric     return true;
7475b3edf446SDimitry Andric   }
747606c3fb27SDimitry Andric   case Intrinsic::amdgcn_fmed3: {
747706c3fb27SDimitry Andric     GISelChangeObserver &Observer = Helper.Observer;
747806c3fb27SDimitry Andric 
747906c3fb27SDimitry Andric     // FIXME: This is to workaround the inability of tablegen match combiners to
748006c3fb27SDimitry Andric     // match intrinsics in patterns.
748106c3fb27SDimitry Andric     Observer.changingInstr(MI);
748206c3fb27SDimitry Andric     MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
748306c3fb27SDimitry Andric     MI.removeOperand(1);
748406c3fb27SDimitry Andric     Observer.changedInstr(MI);
748506c3fb27SDimitry Andric     return true;
748606c3fb27SDimitry Andric   }
7487*0fca6ea1SDimitry Andric   case Intrinsic::amdgcn_readlane:
7488*0fca6ea1SDimitry Andric   case Intrinsic::amdgcn_writelane:
7489*0fca6ea1SDimitry Andric   case Intrinsic::amdgcn_readfirstlane:
7490*0fca6ea1SDimitry Andric   case Intrinsic::amdgcn_permlane16:
7491*0fca6ea1SDimitry Andric   case Intrinsic::amdgcn_permlanex16:
7492*0fca6ea1SDimitry Andric   case Intrinsic::amdgcn_permlane64:
7493*0fca6ea1SDimitry Andric     return legalizeLaneOp(Helper, MI, IntrID);
74945ffd83dbSDimitry Andric   default: {
74955ffd83dbSDimitry Andric     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
74965ffd83dbSDimitry Andric             AMDGPU::getImageDimIntrinsicInfo(IntrID))
74975ffd83dbSDimitry Andric       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
74980b57cec5SDimitry Andric     return true;
74990b57cec5SDimitry Andric   }
75005ffd83dbSDimitry Andric   }
75010b57cec5SDimitry Andric 
75020b57cec5SDimitry Andric   return true;
75030b57cec5SDimitry Andric }
7504