10b57cec5SDimitry Andric //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==// 20b57cec5SDimitry Andric // 30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information. 50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 60b57cec5SDimitry Andric // 70b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 80b57cec5SDimitry Andric /// \file 90b57cec5SDimitry Andric /// This file implements the targeting of the Machinelegalizer class for 100b57cec5SDimitry Andric /// AMDGPU. 110b57cec5SDimitry Andric /// \todo This should be generated by TableGen. 120b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 130b57cec5SDimitry Andric 145ffd83dbSDimitry Andric #include "AMDGPULegalizerInfo.h" 158bcb0991SDimitry Andric 160b57cec5SDimitry Andric #include "AMDGPU.h" 175ffd83dbSDimitry Andric #include "AMDGPUGlobalISelUtils.h" 18e8d8bef9SDimitry Andric #include "AMDGPUInstrInfo.h" 190b57cec5SDimitry Andric #include "AMDGPUTargetMachine.h" 200b57cec5SDimitry Andric #include "SIMachineFunctionInfo.h" 21fe6060f1SDimitry Andric #include "Utils/AMDGPUBaseInfo.h" 225ffd83dbSDimitry Andric #include "llvm/ADT/ScopeExit.h" 23fe6060f1SDimitry Andric #include "llvm/BinaryFormat/ELF.h" 240b57cec5SDimitry Andric #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" 255ffd83dbSDimitry Andric #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" 26e8d8bef9SDimitry Andric #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" 2706c3fb27SDimitry Andric #include "llvm/CodeGen/GlobalISel/Utils.h" 288bcb0991SDimitry Andric #include "llvm/IR/DiagnosticInfo.h" 29e8d8bef9SDimitry Andric #include "llvm/IR/IntrinsicsAMDGPU.h" 3081ad6265SDimitry Andric #include "llvm/IR/IntrinsicsR600.h" 310b57cec5SDimitry Andric 
320b57cec5SDimitry Andric #define DEBUG_TYPE "amdgpu-legalinfo" 330b57cec5SDimitry Andric 340b57cec5SDimitry Andric using namespace llvm; 350b57cec5SDimitry Andric using namespace LegalizeActions; 360b57cec5SDimitry Andric using namespace LegalizeMutations; 370b57cec5SDimitry Andric using namespace LegalityPredicates; 385ffd83dbSDimitry Andric using namespace MIPatternMatch; 390b57cec5SDimitry Andric 405ffd83dbSDimitry Andric // Hack until load/store selection patterns support any tuple of legal types. 415ffd83dbSDimitry Andric static cl::opt<bool> EnableNewLegality( 425ffd83dbSDimitry Andric "amdgpu-global-isel-new-legality", 435ffd83dbSDimitry Andric cl::desc("Use GlobalISel desired legality, rather than try to use" 445ffd83dbSDimitry Andric "rules compatible with selection patterns"), 455ffd83dbSDimitry Andric cl::init(false), 465ffd83dbSDimitry Andric cl::ReallyHidden); 470b57cec5SDimitry Andric 485ffd83dbSDimitry Andric static constexpr unsigned MaxRegisterSize = 1024; 495ffd83dbSDimitry Andric 505ffd83dbSDimitry Andric // Round the number of elements to the next power of two elements 515ffd83dbSDimitry Andric static LLT getPow2VectorType(LLT Ty) { 525ffd83dbSDimitry Andric unsigned NElts = Ty.getNumElements(); 535ffd83dbSDimitry Andric unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts); 54fe6060f1SDimitry Andric return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts)); 550b57cec5SDimitry Andric } 560b57cec5SDimitry Andric 575ffd83dbSDimitry Andric // Round the number of bits to the next power of two bits 585ffd83dbSDimitry Andric static LLT getPow2ScalarType(LLT Ty) { 595ffd83dbSDimitry Andric unsigned Bits = Ty.getSizeInBits(); 605ffd83dbSDimitry Andric unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits); 615ffd83dbSDimitry Andric return LLT::scalar(Pow2Bits); 628bcb0991SDimitry Andric } 638bcb0991SDimitry Andric 64349cc55cSDimitry Andric /// \returns true if this is an odd sized vector which should widen by adding an 65e8d8bef9SDimitry Andric /// additional 
element. This is mostly to handle <3 x s16> -> <4 x s16>. This 66e8d8bef9SDimitry Andric /// excludes s1 vectors, which should always be scalarized. 670b57cec5SDimitry Andric static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { 680b57cec5SDimitry Andric return [=](const LegalityQuery &Query) { 690b57cec5SDimitry Andric const LLT Ty = Query.Types[TypeIdx]; 70e8d8bef9SDimitry Andric if (!Ty.isVector()) 71e8d8bef9SDimitry Andric return false; 72e8d8bef9SDimitry Andric 73e8d8bef9SDimitry Andric const LLT EltTy = Ty.getElementType(); 74e8d8bef9SDimitry Andric const unsigned EltSize = EltTy.getSizeInBits(); 75e8d8bef9SDimitry Andric return Ty.getNumElements() % 2 != 0 && 76e8d8bef9SDimitry Andric EltSize > 1 && EltSize < 32 && 778bcb0991SDimitry Andric Ty.getSizeInBits() % 32 != 0; 788bcb0991SDimitry Andric }; 798bcb0991SDimitry Andric } 808bcb0991SDimitry Andric 81e8d8bef9SDimitry Andric static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) { 82e8d8bef9SDimitry Andric return [=](const LegalityQuery &Query) { 83e8d8bef9SDimitry Andric const LLT Ty = Query.Types[TypeIdx]; 84e8d8bef9SDimitry Andric return Ty.getSizeInBits() % 32 == 0; 85e8d8bef9SDimitry Andric }; 86e8d8bef9SDimitry Andric } 87e8d8bef9SDimitry Andric 888bcb0991SDimitry Andric static LegalityPredicate isWideVec16(unsigned TypeIdx) { 898bcb0991SDimitry Andric return [=](const LegalityQuery &Query) { 908bcb0991SDimitry Andric const LLT Ty = Query.Types[TypeIdx]; 918bcb0991SDimitry Andric const LLT EltTy = Ty.getScalarType(); 928bcb0991SDimitry Andric return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; 930b57cec5SDimitry Andric }; 940b57cec5SDimitry Andric } 950b57cec5SDimitry Andric 960b57cec5SDimitry Andric static LegalizeMutation oneMoreElement(unsigned TypeIdx) { 970b57cec5SDimitry Andric return [=](const LegalityQuery &Query) { 980b57cec5SDimitry Andric const LLT Ty = Query.Types[TypeIdx]; 990b57cec5SDimitry Andric const LLT EltTy = Ty.getElementType(); 100bdd1243dSDimitry 
Andric return std::pair(TypeIdx, 101fe6060f1SDimitry Andric LLT::fixed_vector(Ty.getNumElements() + 1, EltTy)); 1020b57cec5SDimitry Andric }; 1030b57cec5SDimitry Andric } 1040b57cec5SDimitry Andric 1050b57cec5SDimitry Andric static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { 1060b57cec5SDimitry Andric return [=](const LegalityQuery &Query) { 1070b57cec5SDimitry Andric const LLT Ty = Query.Types[TypeIdx]; 1080b57cec5SDimitry Andric const LLT EltTy = Ty.getElementType(); 1090b57cec5SDimitry Andric unsigned Size = Ty.getSizeInBits(); 1100b57cec5SDimitry Andric unsigned Pieces = (Size + 63) / 64; 1110b57cec5SDimitry Andric unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; 112bdd1243dSDimitry Andric return std::pair(TypeIdx, LLT::scalarOrVector( 113bdd1243dSDimitry Andric ElementCount::getFixed(NewNumElts), EltTy)); 1140b57cec5SDimitry Andric }; 1150b57cec5SDimitry Andric } 1160b57cec5SDimitry Andric 1178bcb0991SDimitry Andric // Increase the number of vector elements to reach the next multiple of 32-bit 1188bcb0991SDimitry Andric // type. 
1198bcb0991SDimitry Andric static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 1208bcb0991SDimitry Andric return [=](const LegalityQuery &Query) { 1218bcb0991SDimitry Andric const LLT Ty = Query.Types[TypeIdx]; 1228bcb0991SDimitry Andric 1238bcb0991SDimitry Andric const LLT EltTy = Ty.getElementType(); 1248bcb0991SDimitry Andric const int Size = Ty.getSizeInBits(); 1258bcb0991SDimitry Andric const int EltSize = EltTy.getSizeInBits(); 1268bcb0991SDimitry Andric const int NextMul32 = (Size + 31) / 32; 1278bcb0991SDimitry Andric 1288bcb0991SDimitry Andric assert(EltSize < 32); 1298bcb0991SDimitry Andric 1308bcb0991SDimitry Andric const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 131bdd1243dSDimitry Andric return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy)); 1328bcb0991SDimitry Andric }; 1338bcb0991SDimitry Andric } 1348bcb0991SDimitry Andric 13506c3fb27SDimitry Andric // Increase the number of vector elements to reach the next legal RegClass. 13606c3fb27SDimitry Andric static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) { 13706c3fb27SDimitry Andric return [=](const LegalityQuery &Query) { 13806c3fb27SDimitry Andric const LLT Ty = Query.Types[TypeIdx]; 13906c3fb27SDimitry Andric const unsigned NumElts = Ty.getNumElements(); 14006c3fb27SDimitry Andric const unsigned EltSize = Ty.getElementType().getSizeInBits(); 14106c3fb27SDimitry Andric const unsigned MaxNumElts = MaxRegisterSize / EltSize; 14206c3fb27SDimitry Andric 14306c3fb27SDimitry Andric assert(EltSize == 32 || EltSize == 64); 14406c3fb27SDimitry Andric assert(Ty.getSizeInBits() < MaxRegisterSize); 14506c3fb27SDimitry Andric 14606c3fb27SDimitry Andric unsigned NewNumElts; 14706c3fb27SDimitry Andric // Find the nearest legal RegClass that is larger than the current type. 
14806c3fb27SDimitry Andric for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) { 14906c3fb27SDimitry Andric if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize)) 15006c3fb27SDimitry Andric break; 15106c3fb27SDimitry Andric } 15206c3fb27SDimitry Andric 15306c3fb27SDimitry Andric return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltSize)); 15406c3fb27SDimitry Andric }; 15506c3fb27SDimitry Andric } 15606c3fb27SDimitry Andric 15706c3fb27SDimitry Andric static LLT getBufferRsrcScalarType(const LLT Ty) { 15806c3fb27SDimitry Andric if (!Ty.isVector()) 15906c3fb27SDimitry Andric return LLT::scalar(128); 16006c3fb27SDimitry Andric const ElementCount NumElems = Ty.getElementCount(); 16106c3fb27SDimitry Andric return LLT::vector(NumElems, LLT::scalar(128)); 16206c3fb27SDimitry Andric } 16306c3fb27SDimitry Andric 16406c3fb27SDimitry Andric static LLT getBufferRsrcRegisterType(const LLT Ty) { 16506c3fb27SDimitry Andric if (!Ty.isVector()) 16606c3fb27SDimitry Andric return LLT::fixed_vector(4, LLT::scalar(32)); 16706c3fb27SDimitry Andric const unsigned NumElems = Ty.getElementCount().getFixedValue(); 16806c3fb27SDimitry Andric return LLT::fixed_vector(NumElems * 4, LLT::scalar(32)); 16906c3fb27SDimitry Andric } 17006c3fb27SDimitry Andric 171e8d8bef9SDimitry Andric static LLT getBitcastRegisterType(const LLT Ty) { 172e8d8bef9SDimitry Andric const unsigned Size = Ty.getSizeInBits(); 1735ffd83dbSDimitry Andric 1745ffd83dbSDimitry Andric if (Size <= 32) { 1755ffd83dbSDimitry Andric // <2 x s8> -> s16 1765ffd83dbSDimitry Andric // <4 x s8> -> s32 177e8d8bef9SDimitry Andric return LLT::scalar(Size); 178e8d8bef9SDimitry Andric } 1795ffd83dbSDimitry Andric 180fe6060f1SDimitry Andric return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32); 181e8d8bef9SDimitry Andric } 182e8d8bef9SDimitry Andric 183e8d8bef9SDimitry Andric static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) { 184e8d8bef9SDimitry Andric return [=](const LegalityQuery 
&Query) { 185e8d8bef9SDimitry Andric const LLT Ty = Query.Types[TypeIdx]; 186bdd1243dSDimitry Andric return std::pair(TypeIdx, getBitcastRegisterType(Ty)); 187e8d8bef9SDimitry Andric }; 188e8d8bef9SDimitry Andric } 189e8d8bef9SDimitry Andric 190e8d8bef9SDimitry Andric static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) { 191e8d8bef9SDimitry Andric return [=](const LegalityQuery &Query) { 192e8d8bef9SDimitry Andric const LLT Ty = Query.Types[TypeIdx]; 193e8d8bef9SDimitry Andric unsigned Size = Ty.getSizeInBits(); 194e8d8bef9SDimitry Andric assert(Size % 32 == 0); 195bdd1243dSDimitry Andric return std::pair( 196fe6060f1SDimitry Andric TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32)); 1975ffd83dbSDimitry Andric }; 1985ffd83dbSDimitry Andric } 1995ffd83dbSDimitry Andric 2008bcb0991SDimitry Andric static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 2018bcb0991SDimitry Andric return [=](const LegalityQuery &Query) { 2028bcb0991SDimitry Andric const LLT QueryTy = Query.Types[TypeIdx]; 2038bcb0991SDimitry Andric return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 2048bcb0991SDimitry Andric }; 2058bcb0991SDimitry Andric } 2068bcb0991SDimitry Andric 2070b57cec5SDimitry Andric static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 2080b57cec5SDimitry Andric return [=](const LegalityQuery &Query) { 2090b57cec5SDimitry Andric const LLT QueryTy = Query.Types[TypeIdx]; 2100b57cec5SDimitry Andric return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; 2110b57cec5SDimitry Andric }; 2120b57cec5SDimitry Andric } 2130b57cec5SDimitry Andric 2140b57cec5SDimitry Andric static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { 2150b57cec5SDimitry Andric return [=](const LegalityQuery &Query) { 2160b57cec5SDimitry Andric const LLT QueryTy = Query.Types[TypeIdx]; 2170b57cec5SDimitry Andric return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; 2180b57cec5SDimitry Andric }; 
2190b57cec5SDimitry Andric } 2200b57cec5SDimitry Andric 2215ffd83dbSDimitry Andric static bool isRegisterSize(unsigned Size) { 2225ffd83dbSDimitry Andric return Size % 32 == 0 && Size <= MaxRegisterSize; 2235ffd83dbSDimitry Andric } 2245ffd83dbSDimitry Andric 2255ffd83dbSDimitry Andric static bool isRegisterVectorElementType(LLT EltTy) { 2265ffd83dbSDimitry Andric const int EltSize = EltTy.getSizeInBits(); 2275ffd83dbSDimitry Andric return EltSize == 16 || EltSize % 32 == 0; 2285ffd83dbSDimitry Andric } 2295ffd83dbSDimitry Andric 2305ffd83dbSDimitry Andric static bool isRegisterVectorType(LLT Ty) { 2310b57cec5SDimitry Andric const int EltSize = Ty.getElementType().getSizeInBits(); 2320b57cec5SDimitry Andric return EltSize == 32 || EltSize == 64 || 2330b57cec5SDimitry Andric (EltSize == 16 && Ty.getNumElements() % 2 == 0) || 2340b57cec5SDimitry Andric EltSize == 128 || EltSize == 256; 2350b57cec5SDimitry Andric } 2360b57cec5SDimitry Andric 2375ffd83dbSDimitry Andric static bool isRegisterType(LLT Ty) { 2385ffd83dbSDimitry Andric if (!isRegisterSize(Ty.getSizeInBits())) 2395ffd83dbSDimitry Andric return false; 2405ffd83dbSDimitry Andric 2415ffd83dbSDimitry Andric if (Ty.isVector()) 2425ffd83dbSDimitry Andric return isRegisterVectorType(Ty); 2435ffd83dbSDimitry Andric 2445ffd83dbSDimitry Andric return true; 2455ffd83dbSDimitry Andric } 2465ffd83dbSDimitry Andric 2475ffd83dbSDimitry Andric // Any combination of 32 or 64-bit elements up the maximum register size, and 2485ffd83dbSDimitry Andric // multiples of v2s16. 2495ffd83dbSDimitry Andric static LegalityPredicate isRegisterType(unsigned TypeIdx) { 2505ffd83dbSDimitry Andric return [=](const LegalityQuery &Query) { 2515ffd83dbSDimitry Andric return isRegisterType(Query.Types[TypeIdx]); 2528bcb0991SDimitry Andric }; 2538bcb0991SDimitry Andric } 2548bcb0991SDimitry Andric 25506c3fb27SDimitry Andric // RegisterType that doesn't have a corresponding RegClass. 
25606c3fb27SDimitry Andric static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx) { 25706c3fb27SDimitry Andric return [=](const LegalityQuery &Query) { 25806c3fb27SDimitry Andric LLT Ty = Query.Types[TypeIdx]; 25906c3fb27SDimitry Andric return isRegisterType(Ty) && 26006c3fb27SDimitry Andric !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits()); 26106c3fb27SDimitry Andric }; 26206c3fb27SDimitry Andric } 26306c3fb27SDimitry Andric 2645ffd83dbSDimitry Andric static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { 2658bcb0991SDimitry Andric return [=](const LegalityQuery &Query) { 2665ffd83dbSDimitry Andric const LLT QueryTy = Query.Types[TypeIdx]; 2675ffd83dbSDimitry Andric if (!QueryTy.isVector()) 2685ffd83dbSDimitry Andric return false; 2695ffd83dbSDimitry Andric const LLT EltTy = QueryTy.getElementType(); 2705ffd83dbSDimitry Andric return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32; 2718bcb0991SDimitry Andric }; 2728bcb0991SDimitry Andric } 2738bcb0991SDimitry Andric 274fe6060f1SDimitry Andric // If we have a truncating store or an extending load with a data size larger 275fe6060f1SDimitry Andric // than 32-bits, we need to reduce to a 32-bit type. 276fe6060f1SDimitry Andric static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) { 2778bcb0991SDimitry Andric return [=](const LegalityQuery &Query) { 2788bcb0991SDimitry Andric const LLT Ty = Query.Types[TypeIdx]; 2798bcb0991SDimitry Andric return !Ty.isVector() && Ty.getSizeInBits() > 32 && 280fe6060f1SDimitry Andric Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits(); 2810b57cec5SDimitry Andric }; 2820b57cec5SDimitry Andric } 2830b57cec5SDimitry Andric 2845ffd83dbSDimitry Andric // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 2855ffd83dbSDimitry Andric // handle some operations by just promoting the register during 2865ffd83dbSDimitry Andric // selection. There are also d16 loads on GFX9+ which preserve the high bits. 
2875ffd83dbSDimitry Andric static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, 28806c3fb27SDimitry Andric bool IsLoad, bool IsAtomic) { 2895ffd83dbSDimitry Andric switch (AS) { 2905ffd83dbSDimitry Andric case AMDGPUAS::PRIVATE_ADDRESS: 2915ffd83dbSDimitry Andric // FIXME: Private element size. 292e8d8bef9SDimitry Andric return ST.enableFlatScratch() ? 128 : 32; 2935ffd83dbSDimitry Andric case AMDGPUAS::LOCAL_ADDRESS: 2945ffd83dbSDimitry Andric return ST.useDS128() ? 128 : 64; 2955ffd83dbSDimitry Andric case AMDGPUAS::GLOBAL_ADDRESS: 2965ffd83dbSDimitry Andric case AMDGPUAS::CONSTANT_ADDRESS: 2975ffd83dbSDimitry Andric case AMDGPUAS::CONSTANT_ADDRESS_32BIT: 29806c3fb27SDimitry Andric case AMDGPUAS::BUFFER_RESOURCE: 2995ffd83dbSDimitry Andric // Treat constant and global as identical. SMRD loads are sometimes usable for 3005ffd83dbSDimitry Andric // global loads (ideally constant address space should be eliminated) 3015ffd83dbSDimitry Andric // depending on the context. Legality cannot be context dependent, but 3025ffd83dbSDimitry Andric // RegBankSelect can split the load as necessary depending on the pointer 3035ffd83dbSDimitry Andric // register bank/uniformity and if the memory is invariant or not written in a 3045ffd83dbSDimitry Andric // kernel. 3055ffd83dbSDimitry Andric return IsLoad ? 512 : 128; 3065ffd83dbSDimitry Andric default: 30706c3fb27SDimitry Andric // FIXME: Flat addresses may contextually need to be split to 32-bit parts 30806c3fb27SDimitry Andric // if they may alias scratch depending on the subtarget. This needs to be 30906c3fb27SDimitry Andric // moved to custom handling to use addressMayBeAccessedAsPrivate 31006c3fb27SDimitry Andric return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 
128 : 32; 3115ffd83dbSDimitry Andric } 3125ffd83dbSDimitry Andric } 3135ffd83dbSDimitry Andric 3145ffd83dbSDimitry Andric static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, 315fe6060f1SDimitry Andric const LegalityQuery &Query) { 3165ffd83dbSDimitry Andric const LLT Ty = Query.Types[0]; 3175ffd83dbSDimitry Andric 3185ffd83dbSDimitry Andric // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD 319fe6060f1SDimitry Andric const bool IsLoad = Query.Opcode != AMDGPU::G_STORE; 3205ffd83dbSDimitry Andric 3215ffd83dbSDimitry Andric unsigned RegSize = Ty.getSizeInBits(); 32204eeddc0SDimitry Andric uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 32304eeddc0SDimitry Andric uint64_t AlignBits = Query.MMODescrs[0].AlignInBits; 3245ffd83dbSDimitry Andric unsigned AS = Query.Types[1].getAddressSpace(); 3255ffd83dbSDimitry Andric 3265ffd83dbSDimitry Andric // All of these need to be custom lowered to cast the pointer operand. 3275ffd83dbSDimitry Andric if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) 3285ffd83dbSDimitry Andric return false; 3295ffd83dbSDimitry Andric 330fe6060f1SDimitry Andric // Do not handle extending vector loads. 331fe6060f1SDimitry Andric if (Ty.isVector() && MemSize != RegSize) 332fe6060f1SDimitry Andric return false; 333fe6060f1SDimitry Andric 3345ffd83dbSDimitry Andric // TODO: We should be able to widen loads if the alignment is high enough, but 3355ffd83dbSDimitry Andric // we also need to modify the memory access size. 3365ffd83dbSDimitry Andric #if 0 3375ffd83dbSDimitry Andric // Accept widening loads based on alignment. 3385ffd83dbSDimitry Andric if (IsLoad && MemSize < Size) 3395ffd83dbSDimitry Andric MemSize = std::max(MemSize, Align); 3405ffd83dbSDimitry Andric #endif 3415ffd83dbSDimitry Andric 3425ffd83dbSDimitry Andric // Only 1-byte and 2-byte to 32-bit extloads are valid. 
3435ffd83dbSDimitry Andric if (MemSize != RegSize && RegSize != 32) 3445ffd83dbSDimitry Andric return false; 3455ffd83dbSDimitry Andric 34606c3fb27SDimitry Andric if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad, 34706c3fb27SDimitry Andric Query.MMODescrs[0].Ordering != 34806c3fb27SDimitry Andric AtomicOrdering::NotAtomic)) 3495ffd83dbSDimitry Andric return false; 3505ffd83dbSDimitry Andric 3515ffd83dbSDimitry Andric switch (MemSize) { 3525ffd83dbSDimitry Andric case 8: 3535ffd83dbSDimitry Andric case 16: 3545ffd83dbSDimitry Andric case 32: 3555ffd83dbSDimitry Andric case 64: 3565ffd83dbSDimitry Andric case 128: 3575ffd83dbSDimitry Andric break; 3585ffd83dbSDimitry Andric case 96: 3595ffd83dbSDimitry Andric if (!ST.hasDwordx3LoadStores()) 3605ffd83dbSDimitry Andric return false; 3615ffd83dbSDimitry Andric break; 3625ffd83dbSDimitry Andric case 256: 3635ffd83dbSDimitry Andric case 512: 3645ffd83dbSDimitry Andric // These may contextually need to be broken down. 3655ffd83dbSDimitry Andric break; 3665ffd83dbSDimitry Andric default: 3675ffd83dbSDimitry Andric return false; 3685ffd83dbSDimitry Andric } 3695ffd83dbSDimitry Andric 3705ffd83dbSDimitry Andric assert(RegSize >= MemSize); 3715ffd83dbSDimitry Andric 372e8d8bef9SDimitry Andric if (AlignBits < MemSize) { 3735ffd83dbSDimitry Andric const SITargetLowering *TLI = ST.getTargetLowering(); 374e8d8bef9SDimitry Andric if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, 375e8d8bef9SDimitry Andric Align(AlignBits / 8))) 3765ffd83dbSDimitry Andric return false; 3775ffd83dbSDimitry Andric } 3785ffd83dbSDimitry Andric 3795ffd83dbSDimitry Andric return true; 3805ffd83dbSDimitry Andric } 3815ffd83dbSDimitry Andric 38206c3fb27SDimitry Andric // The newer buffer intrinsic forms take their resource arguments as 38306c3fb27SDimitry Andric // pointers in address space 8, aka s128 values. 
However, in order to not break 38406c3fb27SDimitry Andric // SelectionDAG, the underlying operations have to continue to take v4i32 38506c3fb27SDimitry Andric // arguments. Therefore, we convert resource pointers - or vectors of them 38606c3fb27SDimitry Andric // to integer values here. 38706c3fb27SDimitry Andric static bool hasBufferRsrcWorkaround(const LLT Ty) { 38806c3fb27SDimitry Andric if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE) 38906c3fb27SDimitry Andric return true; 39006c3fb27SDimitry Andric if (Ty.isVector()) { 39106c3fb27SDimitry Andric const LLT ElemTy = Ty.getElementType(); 39206c3fb27SDimitry Andric return hasBufferRsrcWorkaround(ElemTy); 39306c3fb27SDimitry Andric } 39406c3fb27SDimitry Andric return false; 39506c3fb27SDimitry Andric } 39606c3fb27SDimitry Andric 3975ffd83dbSDimitry Andric // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so 3985ffd83dbSDimitry Andric // workaround this. Eventually it should ignore the type for loads and only care 3995ffd83dbSDimitry Andric // about the size. Return true in cases where we will workaround this for now by 4005ffd83dbSDimitry Andric // bitcasting. 4015ffd83dbSDimitry Andric static bool loadStoreBitcastWorkaround(const LLT Ty) { 4025ffd83dbSDimitry Andric if (EnableNewLegality) 4035ffd83dbSDimitry Andric return false; 4045ffd83dbSDimitry Andric 4055ffd83dbSDimitry Andric const unsigned Size = Ty.getSizeInBits(); 4065ffd83dbSDimitry Andric if (Size <= 64) 4075ffd83dbSDimitry Andric return false; 40806c3fb27SDimitry Andric // Address space 8 pointers get their own workaround. 
40906c3fb27SDimitry Andric if (hasBufferRsrcWorkaround(Ty)) 41006c3fb27SDimitry Andric return false; 4115ffd83dbSDimitry Andric if (!Ty.isVector()) 4125ffd83dbSDimitry Andric return true; 413e8d8bef9SDimitry Andric 414e8d8bef9SDimitry Andric LLT EltTy = Ty.getElementType(); 415e8d8bef9SDimitry Andric if (EltTy.isPointer()) 416e8d8bef9SDimitry Andric return true; 417e8d8bef9SDimitry Andric 418e8d8bef9SDimitry Andric unsigned EltSize = EltTy.getSizeInBits(); 4195ffd83dbSDimitry Andric return EltSize != 32 && EltSize != 64; 4205ffd83dbSDimitry Andric } 4215ffd83dbSDimitry Andric 422fe6060f1SDimitry Andric static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) { 4235ffd83dbSDimitry Andric const LLT Ty = Query.Types[0]; 424fe6060f1SDimitry Andric return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) && 42506c3fb27SDimitry Andric !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty); 4265ffd83dbSDimitry Andric } 4275ffd83dbSDimitry Andric 428e8d8bef9SDimitry Andric /// Return true if a load or store of the type should be lowered with a bitcast 429e8d8bef9SDimitry Andric /// to a different type. 430e8d8bef9SDimitry Andric static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, 431fe6060f1SDimitry Andric const LLT MemTy) { 432fe6060f1SDimitry Andric const unsigned MemSizeInBits = MemTy.getSizeInBits(); 433e8d8bef9SDimitry Andric const unsigned Size = Ty.getSizeInBits(); 434e8d8bef9SDimitry Andric if (Size != MemSizeInBits) 435e8d8bef9SDimitry Andric return Size <= 32 && Ty.isVector(); 436e8d8bef9SDimitry Andric 437e8d8bef9SDimitry Andric if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty)) 438e8d8bef9SDimitry Andric return true; 439fe6060f1SDimitry Andric 440fe6060f1SDimitry Andric // Don't try to handle bitcasting vector ext loads for now. 
441fe6060f1SDimitry Andric return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) && 442fe6060f1SDimitry Andric (Size <= 32 || isRegisterSize(Size)) && 443e8d8bef9SDimitry Andric !isRegisterVectorElementType(Ty.getElementType()); 444e8d8bef9SDimitry Andric } 445e8d8bef9SDimitry Andric 446e8d8bef9SDimitry Andric /// Return true if we should legalize a load by widening an odd sized memory 447e8d8bef9SDimitry Andric /// access up to the alignment. Note this case when the memory access itself 448e8d8bef9SDimitry Andric /// changes, not the size of the result register. 449fe6060f1SDimitry Andric static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy, 45004eeddc0SDimitry Andric uint64_t AlignInBits, unsigned AddrSpace, 451e8d8bef9SDimitry Andric unsigned Opcode) { 452fe6060f1SDimitry Andric unsigned SizeInBits = MemoryTy.getSizeInBits(); 453e8d8bef9SDimitry Andric // We don't want to widen cases that are naturally legal. 454e8d8bef9SDimitry Andric if (isPowerOf2_32(SizeInBits)) 455e8d8bef9SDimitry Andric return false; 456e8d8bef9SDimitry Andric 457e8d8bef9SDimitry Andric // If we have 96-bit memory operations, we shouldn't touch them. Note we may 458e8d8bef9SDimitry Andric // end up widening these for a scalar load during RegBankSelect, since there 459e8d8bef9SDimitry Andric // aren't 96-bit scalar loads. 460e8d8bef9SDimitry Andric if (SizeInBits == 96 && ST.hasDwordx3LoadStores()) 461e8d8bef9SDimitry Andric return false; 462e8d8bef9SDimitry Andric 46306c3fb27SDimitry Andric if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false)) 464e8d8bef9SDimitry Andric return false; 465e8d8bef9SDimitry Andric 466e8d8bef9SDimitry Andric // A load is known dereferenceable up to the alignment, so it's legal to widen 467e8d8bef9SDimitry Andric // to it. 468e8d8bef9SDimitry Andric // 469e8d8bef9SDimitry Andric // TODO: Could check dereferenceable for less aligned cases. 
470e8d8bef9SDimitry Andric unsigned RoundedSize = NextPowerOf2(SizeInBits); 471e8d8bef9SDimitry Andric if (AlignInBits < RoundedSize) 472e8d8bef9SDimitry Andric return false; 473e8d8bef9SDimitry Andric 474e8d8bef9SDimitry Andric // Do not widen if it would introduce a slow unaligned load. 475e8d8bef9SDimitry Andric const SITargetLowering *TLI = ST.getTargetLowering(); 476bdd1243dSDimitry Andric unsigned Fast = 0; 477e8d8bef9SDimitry Andric return TLI->allowsMisalignedMemoryAccessesImpl( 478e8d8bef9SDimitry Andric RoundedSize, AddrSpace, Align(AlignInBits / 8), 479e8d8bef9SDimitry Andric MachineMemOperand::MOLoad, &Fast) && 480e8d8bef9SDimitry Andric Fast; 481e8d8bef9SDimitry Andric } 482e8d8bef9SDimitry Andric 483e8d8bef9SDimitry Andric static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query, 484e8d8bef9SDimitry Andric unsigned Opcode) { 485e8d8bef9SDimitry Andric if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic) 486e8d8bef9SDimitry Andric return false; 487e8d8bef9SDimitry Andric 488fe6060f1SDimitry Andric return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy, 489e8d8bef9SDimitry Andric Query.MMODescrs[0].AlignInBits, 490e8d8bef9SDimitry Andric Query.Types[1].getAddressSpace(), Opcode); 491e8d8bef9SDimitry Andric } 492e8d8bef9SDimitry Andric 49306c3fb27SDimitry Andric /// Mutates IR (typicaly a load instruction) to use a <4 x s32> as the initial 49406c3fb27SDimitry Andric /// type of the operand `idx` and then to transform it to a `p8` via bitcasts 49506c3fb27SDimitry Andric /// and inttoptr. In addition, handle vectors of p8. Returns the new type. 
49606c3fb27SDimitry Andric static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B, 49706c3fb27SDimitry Andric MachineRegisterInfo &MRI, unsigned Idx) { 49806c3fb27SDimitry Andric MachineOperand &MO = MI.getOperand(Idx); 49906c3fb27SDimitry Andric 50006c3fb27SDimitry Andric const LLT PointerTy = MRI.getType(MO.getReg()); 50106c3fb27SDimitry Andric 50206c3fb27SDimitry Andric // Paranoidly prevent us from doing this multiple times. 50306c3fb27SDimitry Andric if (!hasBufferRsrcWorkaround(PointerTy)) 50406c3fb27SDimitry Andric return PointerTy; 50506c3fb27SDimitry Andric 50606c3fb27SDimitry Andric const LLT ScalarTy = getBufferRsrcScalarType(PointerTy); 50706c3fb27SDimitry Andric const LLT VectorTy = getBufferRsrcRegisterType(PointerTy); 50806c3fb27SDimitry Andric if (!PointerTy.isVector()) { 50906c3fb27SDimitry Andric // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8) 51006c3fb27SDimitry Andric const unsigned NumParts = PointerTy.getSizeInBits() / 32; 51106c3fb27SDimitry Andric const LLT S32 = LLT::scalar(32); 51206c3fb27SDimitry Andric 51306c3fb27SDimitry Andric Register VectorReg = MRI.createGenericVirtualRegister(VectorTy); 51406c3fb27SDimitry Andric std::array<Register, 4> VectorElems; 51506c3fb27SDimitry Andric B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 51606c3fb27SDimitry Andric for (unsigned I = 0; I < NumParts; ++I) 51706c3fb27SDimitry Andric VectorElems[I] = 51806c3fb27SDimitry Andric B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0); 51906c3fb27SDimitry Andric B.buildMergeValues(MO, VectorElems); 52006c3fb27SDimitry Andric MO.setReg(VectorReg); 52106c3fb27SDimitry Andric return VectorTy; 52206c3fb27SDimitry Andric } 52306c3fb27SDimitry Andric Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy); 52406c3fb27SDimitry Andric B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 52506c3fb27SDimitry Andric auto Scalar = B.buildBitcast(ScalarTy, BitcastReg); 52606c3fb27SDimitry Andric B.buildIntToPtr(MO, Scalar); 
52706c3fb27SDimitry Andric MO.setReg(BitcastReg); 52806c3fb27SDimitry Andric 52906c3fb27SDimitry Andric return VectorTy; 53006c3fb27SDimitry Andric } 53106c3fb27SDimitry Andric 53206c3fb27SDimitry Andric /// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is 53306c3fb27SDimitry Andric /// the form in which the value must be in order to be passed to the low-level 53406c3fb27SDimitry Andric /// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is 53506c3fb27SDimitry Andric /// needed in order to account for the fact that we can't define a register 53606c3fb27SDimitry Andric /// class for s128 without breaking SelectionDAG. 53706c3fb27SDimitry Andric static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) { 53806c3fb27SDimitry Andric MachineRegisterInfo &MRI = *B.getMRI(); 53906c3fb27SDimitry Andric const LLT PointerTy = MRI.getType(Pointer); 54006c3fb27SDimitry Andric const LLT ScalarTy = getBufferRsrcScalarType(PointerTy); 54106c3fb27SDimitry Andric const LLT VectorTy = getBufferRsrcRegisterType(PointerTy); 54206c3fb27SDimitry Andric 54306c3fb27SDimitry Andric if (!PointerTy.isVector()) { 54406c3fb27SDimitry Andric // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32) 54506c3fb27SDimitry Andric SmallVector<Register, 4> PointerParts; 54606c3fb27SDimitry Andric const unsigned NumParts = PointerTy.getSizeInBits() / 32; 54706c3fb27SDimitry Andric auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer); 54806c3fb27SDimitry Andric for (unsigned I = 0; I < NumParts; ++I) 54906c3fb27SDimitry Andric PointerParts.push_back(Unmerged.getReg(I)); 55006c3fb27SDimitry Andric return B.buildBuildVector(VectorTy, PointerParts).getReg(0); 55106c3fb27SDimitry Andric } 55206c3fb27SDimitry Andric Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0); 55306c3fb27SDimitry Andric return B.buildBitcast(VectorTy, Scalar).getReg(0); 55406c3fb27SDimitry Andric } 55506c3fb27SDimitry Andric 55606c3fb27SDimitry Andric 
/// Replace operand \p Idx of \p MI, a use of a buffer-resource (address
/// space 8) pointer, with the equivalent 4xs32-based value produced by
/// castBufferRsrcToV4I32. No-op when the operand's type does not need the
/// workaround.
static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B,
                                     unsigned Idx) {
  MachineOperand &MO = MI.getOperand(Idx);

  const LLT PointerTy = B.getMRI()->getType(MO.getReg());
  // Paranoidly prevent us from doing this multiple times.
  if (!hasBufferRsrcWorkaround(PointerTy))
    return;
  MO.setReg(castBufferRsrcToV4I32(MO.getReg(), B));
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  :  ST(ST_) {
  using namespace TargetOpcode;

  // Build an LLT pointer for address space \p AS at the target's pointer
  // width for that space.
  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  // Shorthand scalar types used throughout the legality rules below.
  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);

  // 16-bit element vectors.
  const LLT V2S8 = LLT::fixed_vector(2, 8);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  const LLT V4S16 = LLT::fixed_vector(4, 16);

  const LLT V2S32 = LLT::fixed_vector(2, 32);
591fe6060f1SDimitry Andric const LLT V3S32 = LLT::fixed_vector(3, 32); 592fe6060f1SDimitry Andric const LLT V4S32 = LLT::fixed_vector(4, 32); 593fe6060f1SDimitry Andric const LLT V5S32 = LLT::fixed_vector(5, 32); 594fe6060f1SDimitry Andric const LLT V6S32 = LLT::fixed_vector(6, 32); 595fe6060f1SDimitry Andric const LLT V7S32 = LLT::fixed_vector(7, 32); 596fe6060f1SDimitry Andric const LLT V8S32 = LLT::fixed_vector(8, 32); 597fe6060f1SDimitry Andric const LLT V9S32 = LLT::fixed_vector(9, 32); 598fe6060f1SDimitry Andric const LLT V10S32 = LLT::fixed_vector(10, 32); 599fe6060f1SDimitry Andric const LLT V11S32 = LLT::fixed_vector(11, 32); 600fe6060f1SDimitry Andric const LLT V12S32 = LLT::fixed_vector(12, 32); 601fe6060f1SDimitry Andric const LLT V13S32 = LLT::fixed_vector(13, 32); 602fe6060f1SDimitry Andric const LLT V14S32 = LLT::fixed_vector(14, 32); 603fe6060f1SDimitry Andric const LLT V15S32 = LLT::fixed_vector(15, 32); 604fe6060f1SDimitry Andric const LLT V16S32 = LLT::fixed_vector(16, 32); 605fe6060f1SDimitry Andric const LLT V32S32 = LLT::fixed_vector(32, 32); 6060b57cec5SDimitry Andric 607fe6060f1SDimitry Andric const LLT V2S64 = LLT::fixed_vector(2, 64); 608fe6060f1SDimitry Andric const LLT V3S64 = LLT::fixed_vector(3, 64); 609fe6060f1SDimitry Andric const LLT V4S64 = LLT::fixed_vector(4, 64); 610fe6060f1SDimitry Andric const LLT V5S64 = LLT::fixed_vector(5, 64); 611fe6060f1SDimitry Andric const LLT V6S64 = LLT::fixed_vector(6, 64); 612fe6060f1SDimitry Andric const LLT V7S64 = LLT::fixed_vector(7, 64); 613fe6060f1SDimitry Andric const LLT V8S64 = LLT::fixed_vector(8, 64); 614fe6060f1SDimitry Andric const LLT V16S64 = LLT::fixed_vector(16, 64); 6150b57cec5SDimitry Andric 6160b57cec5SDimitry Andric std::initializer_list<LLT> AllS32Vectors = 6170b57cec5SDimitry Andric {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 6188bcb0991SDimitry Andric V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 6190b57cec5SDimitry Andric 
std::initializer_list<LLT> AllS64Vectors = 6208bcb0991SDimitry Andric {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 6210b57cec5SDimitry Andric 6220b57cec5SDimitry Andric const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 6230b57cec5SDimitry Andric const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 6248bcb0991SDimitry Andric const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 6250b57cec5SDimitry Andric const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 6268bcb0991SDimitry Andric const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 6270b57cec5SDimitry Andric const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 6280b57cec5SDimitry Andric const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 62906c3fb27SDimitry Andric const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER); 63006c3fb27SDimitry Andric const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE); 6310b57cec5SDimitry Andric 6320b57cec5SDimitry Andric const LLT CodePtr = FlatPtr; 6330b57cec5SDimitry Andric 6340b57cec5SDimitry Andric const std::initializer_list<LLT> AddrSpaces64 = { 6350b57cec5SDimitry Andric GlobalPtr, ConstantPtr, FlatPtr 6360b57cec5SDimitry Andric }; 6370b57cec5SDimitry Andric 6380b57cec5SDimitry Andric const std::initializer_list<LLT> AddrSpaces32 = { 6398bcb0991SDimitry Andric LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 6400b57cec5SDimitry Andric }; 6410b57cec5SDimitry Andric 64206c3fb27SDimitry Andric const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr}; 64306c3fb27SDimitry Andric 6440b57cec5SDimitry Andric const std::initializer_list<LLT> FPTypesBase = { 6450b57cec5SDimitry Andric S32, S64 6460b57cec5SDimitry Andric }; 6470b57cec5SDimitry Andric 6480b57cec5SDimitry Andric const std::initializer_list<LLT> FPTypes16 = { 6490b57cec5SDimitry Andric S32, S64, S16 6500b57cec5SDimitry Andric }; 6510b57cec5SDimitry Andric 6520b57cec5SDimitry Andric 
const std::initializer_list<LLT> FPTypesPK16 = { 6530b57cec5SDimitry Andric S32, S64, S16, V2S16 6540b57cec5SDimitry Andric }; 6550b57cec5SDimitry Andric 6565ffd83dbSDimitry Andric const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 6575ffd83dbSDimitry Andric 658fe6060f1SDimitry Andric // s1 for VCC branches, s32 for SCC branches. 659fe6060f1SDimitry Andric getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32}); 6600b57cec5SDimitry Andric 6610b57cec5SDimitry Andric // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 6620b57cec5SDimitry Andric // elements for v3s16 6630b57cec5SDimitry Andric getActionDefinitionsBuilder(G_PHI) 664e8d8bef9SDimitry Andric .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256}) 6650b57cec5SDimitry Andric .legalFor(AllS32Vectors) 6660b57cec5SDimitry Andric .legalFor(AllS64Vectors) 6670b57cec5SDimitry Andric .legalFor(AddrSpaces64) 6680b57cec5SDimitry Andric .legalFor(AddrSpaces32) 66906c3fb27SDimitry Andric .legalFor(AddrSpaces128) 670e8d8bef9SDimitry Andric .legalIf(isPointer(0)) 671e8d8bef9SDimitry Andric .clampScalar(0, S16, S256) 6720b57cec5SDimitry Andric .widenScalarToNextPow2(0, 32) 6730b57cec5SDimitry Andric .clampMaxNumElements(0, S32, 16) 6740b57cec5SDimitry Andric .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 675e8d8bef9SDimitry Andric .scalarize(0); 6760b57cec5SDimitry Andric 677e8d8bef9SDimitry Andric if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { 678e8d8bef9SDimitry Andric // Full set of gfx9 features. 
67981ad6265SDimitry Andric getActionDefinitionsBuilder({G_ADD, G_SUB}) 6805ffd83dbSDimitry Andric .legalFor({S32, S16, V2S16}) 6810eae32dcSDimitry Andric .clampMaxNumElementsStrict(0, S16, 2) 68281ad6265SDimitry Andric .scalarize(0) 68381ad6265SDimitry Andric .minScalar(0, S16) 684349cc55cSDimitry Andric .widenScalarToNextMultipleOf(0, 32) 68581ad6265SDimitry Andric .maxScalar(0, S32); 68681ad6265SDimitry Andric 68781ad6265SDimitry Andric getActionDefinitionsBuilder(G_MUL) 68881ad6265SDimitry Andric .legalFor({S32, S16, V2S16}) 68981ad6265SDimitry Andric .clampMaxNumElementsStrict(0, S16, 2) 69081ad6265SDimitry Andric .scalarize(0) 69181ad6265SDimitry Andric .minScalar(0, S16) 69281ad6265SDimitry Andric .widenScalarToNextMultipleOf(0, 32) 69381ad6265SDimitry Andric .custom(); 69481ad6265SDimitry Andric assert(ST.hasMad64_32()); 695e8d8bef9SDimitry Andric 696e8d8bef9SDimitry Andric getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) 697e8d8bef9SDimitry Andric .legalFor({S32, S16, V2S16}) // Clamp modifier 698e8d8bef9SDimitry Andric .minScalarOrElt(0, S16) 6990eae32dcSDimitry Andric .clampMaxNumElementsStrict(0, S16, 2) 700e8d8bef9SDimitry Andric .scalarize(0) 701e8d8bef9SDimitry Andric .widenScalarToNextPow2(0, 32) 702e8d8bef9SDimitry Andric .lower(); 7035ffd83dbSDimitry Andric } else if (ST.has16BitInsts()) { 70481ad6265SDimitry Andric getActionDefinitionsBuilder({G_ADD, G_SUB}) 7050b57cec5SDimitry Andric .legalFor({S32, S16}) 706349cc55cSDimitry Andric .minScalar(0, S16) 707349cc55cSDimitry Andric .widenScalarToNextMultipleOf(0, 32) 708349cc55cSDimitry Andric .maxScalar(0, S32) 709349cc55cSDimitry Andric .scalarize(0); 710e8d8bef9SDimitry Andric 71181ad6265SDimitry Andric getActionDefinitionsBuilder(G_MUL) 71281ad6265SDimitry Andric .legalFor({S32, S16}) 71381ad6265SDimitry Andric .scalarize(0) 71481ad6265SDimitry Andric .minScalar(0, S16) 71581ad6265SDimitry Andric .widenScalarToNextMultipleOf(0, 32) 71681ad6265SDimitry Andric .custom(); 
71781ad6265SDimitry Andric assert(ST.hasMad64_32()); 71881ad6265SDimitry Andric 719e8d8bef9SDimitry Andric // Technically the saturating operations require clamp bit support, but this 720e8d8bef9SDimitry Andric // was introduced at the same time as 16-bit operations. 721e8d8bef9SDimitry Andric getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 722e8d8bef9SDimitry Andric .legalFor({S32, S16}) // Clamp modifier 723e8d8bef9SDimitry Andric .minScalar(0, S16) 724e8d8bef9SDimitry Andric .scalarize(0) 725e8d8bef9SDimitry Andric .widenScalarToNextPow2(0, 16) 726e8d8bef9SDimitry Andric .lower(); 727e8d8bef9SDimitry Andric 728e8d8bef9SDimitry Andric // We're just lowering this, but it helps get a better result to try to 729e8d8bef9SDimitry Andric // coerce to the desired type first. 730e8d8bef9SDimitry Andric getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 731e8d8bef9SDimitry Andric .minScalar(0, S16) 732e8d8bef9SDimitry Andric .scalarize(0) 733e8d8bef9SDimitry Andric .lower(); 7340b57cec5SDimitry Andric } else { 73581ad6265SDimitry Andric getActionDefinitionsBuilder({G_ADD, G_SUB}) 7360b57cec5SDimitry Andric .legalFor({S32}) 737349cc55cSDimitry Andric .widenScalarToNextMultipleOf(0, 32) 7380b57cec5SDimitry Andric .clampScalar(0, S32, S32) 7390b57cec5SDimitry Andric .scalarize(0); 740e8d8bef9SDimitry Andric 74181ad6265SDimitry Andric auto &Mul = getActionDefinitionsBuilder(G_MUL) 74281ad6265SDimitry Andric .legalFor({S32}) 74381ad6265SDimitry Andric .scalarize(0) 74481ad6265SDimitry Andric .minScalar(0, S32) 74581ad6265SDimitry Andric .widenScalarToNextMultipleOf(0, 32); 74681ad6265SDimitry Andric 74781ad6265SDimitry Andric if (ST.hasMad64_32()) 74881ad6265SDimitry Andric Mul.custom(); 74981ad6265SDimitry Andric else 75081ad6265SDimitry Andric Mul.maxScalar(0, S32); 75181ad6265SDimitry Andric 752e8d8bef9SDimitry Andric if (ST.hasIntClamp()) { 753e8d8bef9SDimitry Andric getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 754e8d8bef9SDimitry Andric .legalFor({S32}) // 
Clamp modifier. 755e8d8bef9SDimitry Andric .scalarize(0) 756e8d8bef9SDimitry Andric .minScalarOrElt(0, S32) 757e8d8bef9SDimitry Andric .lower(); 758e8d8bef9SDimitry Andric } else { 759e8d8bef9SDimitry Andric // Clamp bit support was added in VI, along with 16-bit operations. 760e8d8bef9SDimitry Andric getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 761e8d8bef9SDimitry Andric .minScalar(0, S32) 762e8d8bef9SDimitry Andric .scalarize(0) 763e8d8bef9SDimitry Andric .lower(); 7640b57cec5SDimitry Andric } 7650b57cec5SDimitry Andric 766e8d8bef9SDimitry Andric // FIXME: DAG expansion gets better results. The widening uses the smaller 767e8d8bef9SDimitry Andric // range values and goes for the min/max lowering directly. 768e8d8bef9SDimitry Andric getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 769e8d8bef9SDimitry Andric .minScalar(0, S32) 770e8d8bef9SDimitry Andric .scalarize(0) 771e8d8bef9SDimitry Andric .lower(); 772e8d8bef9SDimitry Andric } 773e8d8bef9SDimitry Andric 774fe6060f1SDimitry Andric getActionDefinitionsBuilder( 775fe6060f1SDimitry Andric {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM}) 7765ffd83dbSDimitry Andric .customFor({S32, S64}) 777480093f4SDimitry Andric .clampScalar(0, S32, S64) 778480093f4SDimitry Andric .widenScalarToNextPow2(0, 32) 779480093f4SDimitry Andric .scalarize(0); 780480093f4SDimitry Andric 781e8d8bef9SDimitry Andric auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 7820b57cec5SDimitry Andric .legalFor({S32}) 783349cc55cSDimitry Andric .maxScalar(0, S32); 784e8d8bef9SDimitry Andric 785e8d8bef9SDimitry Andric if (ST.hasVOP3PInsts()) { 786e8d8bef9SDimitry Andric Mulh 787e8d8bef9SDimitry Andric .clampMaxNumElements(0, S8, 2) 788e8d8bef9SDimitry Andric .lowerFor({V2S8}); 789e8d8bef9SDimitry Andric } 790e8d8bef9SDimitry Andric 791e8d8bef9SDimitry Andric Mulh 792e8d8bef9SDimitry Andric .scalarize(0) 793e8d8bef9SDimitry Andric .lower(); 7940b57cec5SDimitry Andric 7950b57cec5SDimitry Andric // Report legal for any types we 
can handle anywhere. For the cases only legal 7960b57cec5SDimitry Andric // on the SALU, RegBankSelect will be able to re-legalize. 7970b57cec5SDimitry Andric getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 7980b57cec5SDimitry Andric .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 7990b57cec5SDimitry Andric .clampScalar(0, S32, S64) 8000b57cec5SDimitry Andric .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 8018bcb0991SDimitry Andric .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 8020b57cec5SDimitry Andric .widenScalarToNextPow2(0) 8030b57cec5SDimitry Andric .scalarize(0); 8040b57cec5SDimitry Andric 805bdd1243dSDimitry Andric getActionDefinitionsBuilder( 806bdd1243dSDimitry Andric {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 807480093f4SDimitry Andric .legalFor({{S32, S1}, {S32, S32}}) 808bdd1243dSDimitry Andric .clampScalar(0, S32, S32) 809bdd1243dSDimitry Andric .scalarize(0); 8100b57cec5SDimitry Andric 8110b57cec5SDimitry Andric getActionDefinitionsBuilder(G_BITCAST) 8120b57cec5SDimitry Andric // Don't worry about the size constraint. 
8138bcb0991SDimitry Andric .legalIf(all(isRegisterType(0), isRegisterType(1))) 8145ffd83dbSDimitry Andric .lower(); 8150b57cec5SDimitry Andric 8160b57cec5SDimitry Andric 8170b57cec5SDimitry Andric getActionDefinitionsBuilder(G_CONSTANT) 8188bcb0991SDimitry Andric .legalFor({S1, S32, S64, S16, GlobalPtr, 8190b57cec5SDimitry Andric LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 820e8d8bef9SDimitry Andric .legalIf(isPointer(0)) 8210b57cec5SDimitry Andric .clampScalar(0, S32, S64) 822e8d8bef9SDimitry Andric .widenScalarToNextPow2(0); 8230b57cec5SDimitry Andric 8245ffd83dbSDimitry Andric getActionDefinitionsBuilder(G_FCONSTANT) 8255ffd83dbSDimitry Andric .legalFor({S32, S64, S16}) 8265ffd83dbSDimitry Andric .clampScalar(0, S16, S64); 8278bcb0991SDimitry Andric 8285ffd83dbSDimitry Andric getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 8295ffd83dbSDimitry Andric .legalIf(isRegisterType(0)) 8305ffd83dbSDimitry Andric // s1 and s16 are special cases because they have legal operations on 8315ffd83dbSDimitry Andric // them, but don't really occupy registers in the normal way. 8325ffd83dbSDimitry Andric .legalFor({S1, S16}) 8335ffd83dbSDimitry Andric .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 8345ffd83dbSDimitry Andric .clampScalarOrElt(0, S32, MaxScalar) 8355ffd83dbSDimitry Andric .widenScalarToNextPow2(0, 32) 8365ffd83dbSDimitry Andric .clampMaxNumElements(0, S32, 16); 8375ffd83dbSDimitry Andric 838fe6060f1SDimitry Andric getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr}); 8395ffd83dbSDimitry Andric 8405ffd83dbSDimitry Andric // If the amount is divergent, we have to do a wave reduction to get the 8415ffd83dbSDimitry Andric // maximum value, so this is expanded during RegBankSelect. 
8425ffd83dbSDimitry Andric getActionDefinitionsBuilder(G_DYN_STACKALLOC) 8435ffd83dbSDimitry Andric .legalFor({{PrivatePtr, S32}}); 8445ffd83dbSDimitry Andric 8455ffd83dbSDimitry Andric getActionDefinitionsBuilder(G_GLOBAL_VALUE) 846e8d8bef9SDimitry Andric .customIf(typeIsNot(0, PrivatePtr)); 847e8d8bef9SDimitry Andric 848fe6060f1SDimitry Andric getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr}); 8490b57cec5SDimitry Andric 8500b57cec5SDimitry Andric auto &FPOpActions = getActionDefinitionsBuilder( 851bdd1243dSDimitry Andric { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE, 852bdd1243dSDimitry Andric G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA}) 8530b57cec5SDimitry Andric .legalFor({S32, S64}); 8548bcb0991SDimitry Andric auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 8558bcb0991SDimitry Andric .customFor({S32, S64}); 8568bcb0991SDimitry Andric auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 8578bcb0991SDimitry Andric .customFor({S32, S64}); 8580b57cec5SDimitry Andric 8590b57cec5SDimitry Andric if (ST.has16BitInsts()) { 8600b57cec5SDimitry Andric if (ST.hasVOP3PInsts()) 8610b57cec5SDimitry Andric FPOpActions.legalFor({S16, V2S16}); 8620b57cec5SDimitry Andric else 8630b57cec5SDimitry Andric FPOpActions.legalFor({S16}); 8648bcb0991SDimitry Andric 8658bcb0991SDimitry Andric TrigActions.customFor({S16}); 8668bcb0991SDimitry Andric FDIVActions.customFor({S16}); 8670b57cec5SDimitry Andric } 8680b57cec5SDimitry Andric 8690b57cec5SDimitry Andric auto &MinNumMaxNum = getActionDefinitionsBuilder({ 8700b57cec5SDimitry Andric G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 8710b57cec5SDimitry Andric 8720b57cec5SDimitry Andric if (ST.hasVOP3PInsts()) { 8730b57cec5SDimitry Andric MinNumMaxNum.customFor(FPTypesPK16) 874480093f4SDimitry Andric .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 8750b57cec5SDimitry Andric .clampMaxNumElements(0, S16, 2) 8760b57cec5SDimitry Andric .clampScalar(0, S16, S64) 8770b57cec5SDimitry Andric 
.scalarize(0); 8780b57cec5SDimitry Andric } else if (ST.has16BitInsts()) { 8790b57cec5SDimitry Andric MinNumMaxNum.customFor(FPTypes16) 8800b57cec5SDimitry Andric .clampScalar(0, S16, S64) 8810b57cec5SDimitry Andric .scalarize(0); 8820b57cec5SDimitry Andric } else { 8830b57cec5SDimitry Andric MinNumMaxNum.customFor(FPTypesBase) 8840b57cec5SDimitry Andric .clampScalar(0, S32, S64) 8850b57cec5SDimitry Andric .scalarize(0); 8860b57cec5SDimitry Andric } 8870b57cec5SDimitry Andric 8880b57cec5SDimitry Andric if (ST.hasVOP3PInsts()) 8890eae32dcSDimitry Andric FPOpActions.clampMaxNumElementsStrict(0, S16, 2); 8908bcb0991SDimitry Andric 8910b57cec5SDimitry Andric FPOpActions 8920b57cec5SDimitry Andric .scalarize(0) 8930b57cec5SDimitry Andric .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 8940b57cec5SDimitry Andric 8958bcb0991SDimitry Andric TrigActions 8968bcb0991SDimitry Andric .scalarize(0) 8978bcb0991SDimitry Andric .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 8988bcb0991SDimitry Andric 8998bcb0991SDimitry Andric FDIVActions 9008bcb0991SDimitry Andric .scalarize(0) 9018bcb0991SDimitry Andric .clampScalar(0, ST.has16BitInsts() ? 
S16 : S32, S64); 9028bcb0991SDimitry Andric 9038bcb0991SDimitry Andric getActionDefinitionsBuilder({G_FNEG, G_FABS}) 9048bcb0991SDimitry Andric .legalFor(FPTypesPK16) 9050eae32dcSDimitry Andric .clampMaxNumElementsStrict(0, S16, 2) 9068bcb0991SDimitry Andric .scalarize(0) 9078bcb0991SDimitry Andric .clampScalar(0, S16, S64); 9088bcb0991SDimitry Andric 9090b57cec5SDimitry Andric if (ST.has16BitInsts()) { 91006c3fb27SDimitry Andric getActionDefinitionsBuilder(G_FSQRT) 91106c3fb27SDimitry Andric .legalFor({S32, S16}) 91206c3fb27SDimitry Andric .customFor({S64}) 91306c3fb27SDimitry Andric .scalarize(0) 91406c3fb27SDimitry Andric .clampScalar(0, S16, S64); 91506c3fb27SDimitry Andric getActionDefinitionsBuilder(G_FFLOOR) 9160b57cec5SDimitry Andric .legalFor({S32, S64, S16}) 9170b57cec5SDimitry Andric .scalarize(0) 9180b57cec5SDimitry Andric .clampScalar(0, S16, S64); 91906c3fb27SDimitry Andric 92006c3fb27SDimitry Andric getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP}) 92106c3fb27SDimitry Andric .legalFor({{S32, S32}, {S64, S32}, {S16, S16}}) 92206c3fb27SDimitry Andric .scalarize(0) 92306c3fb27SDimitry Andric .maxScalarIf(typeIs(0, S16), 1, S16) 92406c3fb27SDimitry Andric .clampScalar(1, S32, S32) 92506c3fb27SDimitry Andric .lower(); 92606c3fb27SDimitry Andric 92706c3fb27SDimitry Andric getActionDefinitionsBuilder(G_FFREXP) 92806c3fb27SDimitry Andric .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}}) 92906c3fb27SDimitry Andric .scalarize(0) 93006c3fb27SDimitry Andric .lower(); 9310b57cec5SDimitry Andric } else { 9325ffd83dbSDimitry Andric getActionDefinitionsBuilder(G_FSQRT) 93306c3fb27SDimitry Andric .legalFor({S32}) 93406c3fb27SDimitry Andric .customFor({S64}) 9355ffd83dbSDimitry Andric .scalarize(0) 9365ffd83dbSDimitry Andric .clampScalar(0, S32, S64); 9375ffd83dbSDimitry Andric 9385ffd83dbSDimitry Andric if (ST.hasFractBug()) { 9395ffd83dbSDimitry Andric getActionDefinitionsBuilder(G_FFLOOR) 9405ffd83dbSDimitry Andric .customFor({S64}) 
9415ffd83dbSDimitry Andric .legalFor({S32, S64}) 9425ffd83dbSDimitry Andric .scalarize(0) 9435ffd83dbSDimitry Andric .clampScalar(0, S32, S64); 9445ffd83dbSDimitry Andric } else { 9455ffd83dbSDimitry Andric getActionDefinitionsBuilder(G_FFLOOR) 9460b57cec5SDimitry Andric .legalFor({S32, S64}) 9470b57cec5SDimitry Andric .scalarize(0) 9480b57cec5SDimitry Andric .clampScalar(0, S32, S64); 9490b57cec5SDimitry Andric } 95006c3fb27SDimitry Andric 95106c3fb27SDimitry Andric getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP}) 95206c3fb27SDimitry Andric .legalFor({{S32, S32}, {S64, S32}}) 95306c3fb27SDimitry Andric .scalarize(0) 95406c3fb27SDimitry Andric .clampScalar(0, S32, S64) 95506c3fb27SDimitry Andric .clampScalar(1, S32, S32) 95606c3fb27SDimitry Andric .lower(); 95706c3fb27SDimitry Andric 95806c3fb27SDimitry Andric getActionDefinitionsBuilder(G_FFREXP) 95906c3fb27SDimitry Andric .customFor({{S32, S32}, {S64, S32}}) 96006c3fb27SDimitry Andric .scalarize(0) 96106c3fb27SDimitry Andric .minScalar(0, S32) 96206c3fb27SDimitry Andric .clampScalar(1, S32, S32) 96306c3fb27SDimitry Andric .lower(); 9645ffd83dbSDimitry Andric } 9650b57cec5SDimitry Andric 9660b57cec5SDimitry Andric getActionDefinitionsBuilder(G_FPTRUNC) 9670b57cec5SDimitry Andric .legalFor({{S32, S64}, {S16, S32}}) 9685ffd83dbSDimitry Andric .scalarize(0) 9695ffd83dbSDimitry Andric .lower(); 9700b57cec5SDimitry Andric 9710b57cec5SDimitry Andric getActionDefinitionsBuilder(G_FPEXT) 9720b57cec5SDimitry Andric .legalFor({{S64, S32}, {S32, S16}}) 973e8d8bef9SDimitry Andric .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) 9740b57cec5SDimitry Andric .scalarize(0); 9750b57cec5SDimitry Andric 976bdd1243dSDimitry Andric auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB}); 97781ad6265SDimitry Andric if (ST.has16BitInsts()) { 97881ad6265SDimitry Andric FSubActions 97981ad6265SDimitry Andric // Use actual fsub instruction 98081ad6265SDimitry Andric .legalFor({S32, S16}) 98181ad6265SDimitry Andric 
// Must use fadd + fneg 98281ad6265SDimitry Andric .lowerFor({S64, V2S16}); 98381ad6265SDimitry Andric } else { 98481ad6265SDimitry Andric FSubActions 9850b57cec5SDimitry Andric // Use actual fsub instruction 9860b57cec5SDimitry Andric .legalFor({S32}) 9870b57cec5SDimitry Andric // Must use fadd + fneg 98881ad6265SDimitry Andric .lowerFor({S64, S16, V2S16}); 98981ad6265SDimitry Andric } 99081ad6265SDimitry Andric 99181ad6265SDimitry Andric FSubActions 9920b57cec5SDimitry Andric .scalarize(0) 9930b57cec5SDimitry Andric .clampScalar(0, S32, S64); 9940b57cec5SDimitry Andric 9958bcb0991SDimitry Andric // Whether this is legal depends on the floating point mode for the function. 9968bcb0991SDimitry Andric auto &FMad = getActionDefinitionsBuilder(G_FMAD); 9975ffd83dbSDimitry Andric if (ST.hasMadF16() && ST.hasMadMacF32Insts()) 9988bcb0991SDimitry Andric FMad.customFor({S32, S16}); 9995ffd83dbSDimitry Andric else if (ST.hasMadMacF32Insts()) 10008bcb0991SDimitry Andric FMad.customFor({S32}); 10015ffd83dbSDimitry Andric else if (ST.hasMadF16()) 10025ffd83dbSDimitry Andric FMad.customFor({S16}); 10038bcb0991SDimitry Andric FMad.scalarize(0) 10048bcb0991SDimitry Andric .lower(); 10058bcb0991SDimitry Andric 1006e8d8bef9SDimitry Andric auto &FRem = getActionDefinitionsBuilder(G_FREM); 1007e8d8bef9SDimitry Andric if (ST.has16BitInsts()) { 1008e8d8bef9SDimitry Andric FRem.customFor({S16, S32, S64}); 1009e8d8bef9SDimitry Andric } else { 1010e8d8bef9SDimitry Andric FRem.minScalar(0, S32) 1011e8d8bef9SDimitry Andric .customFor({S32, S64}); 1012e8d8bef9SDimitry Andric } 1013e8d8bef9SDimitry Andric FRem.scalarize(0); 1014e8d8bef9SDimitry Andric 10155ffd83dbSDimitry Andric // TODO: Do we need to clamp maximum bitwidth? 
10165ffd83dbSDimitry Andric getActionDefinitionsBuilder(G_TRUNC) 10175ffd83dbSDimitry Andric .legalIf(isScalar(0)) 10185ffd83dbSDimitry Andric .legalFor({{V2S16, V2S32}}) 10195ffd83dbSDimitry Andric .clampMaxNumElements(0, S16, 2) 10205ffd83dbSDimitry Andric // Avoid scalarizing in cases that should be truly illegal. In unresolvable 10215ffd83dbSDimitry Andric // situations (like an invalid implicit use), we don't want to infinite loop 10225ffd83dbSDimitry Andric // in the legalizer. 10235ffd83dbSDimitry Andric .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 10245ffd83dbSDimitry Andric .alwaysLegal(); 10255ffd83dbSDimitry Andric 10260b57cec5SDimitry Andric getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 10270b57cec5SDimitry Andric .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 10285ffd83dbSDimitry Andric {S32, S1}, {S64, S1}, {S16, S1}}) 1029480093f4SDimitry Andric .scalarize(0) 10305ffd83dbSDimitry Andric .clampScalar(0, S32, S64) 10315ffd83dbSDimitry Andric .widenScalarToNextPow2(1, 32); 10320b57cec5SDimitry Andric 10338bcb0991SDimitry Andric // TODO: Split s1->s64 during regbankselect for VALU. 
10348bcb0991SDimitry Andric auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 1035480093f4SDimitry Andric .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 1036480093f4SDimitry Andric .lowerIf(typeIs(1, S1)) 1037349cc55cSDimitry Andric .customFor({{S32, S64}, {S64, S64}}); 10388bcb0991SDimitry Andric if (ST.has16BitInsts()) 10398bcb0991SDimitry Andric IToFP.legalFor({{S16, S16}}); 10408bcb0991SDimitry Andric IToFP.clampScalar(1, S32, S64) 1041e8d8bef9SDimitry Andric .minScalar(0, S32) 10425ffd83dbSDimitry Andric .scalarize(0) 10435ffd83dbSDimitry Andric .widenScalarToNextPow2(1); 10440b57cec5SDimitry Andric 10458bcb0991SDimitry Andric auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 10465ffd83dbSDimitry Andric .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 1047fe6060f1SDimitry Andric .customFor({{S64, S32}, {S64, S64}}) 1048e8d8bef9SDimitry Andric .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); 10498bcb0991SDimitry Andric if (ST.has16BitInsts()) 10508bcb0991SDimitry Andric FPToI.legalFor({{S16, S16}}); 10518bcb0991SDimitry Andric else 10528bcb0991SDimitry Andric FPToI.minScalar(1, S32); 10538bcb0991SDimitry Andric 10548bcb0991SDimitry Andric FPToI.minScalar(0, S32) 1055fe6060f1SDimitry Andric .widenScalarToNextPow2(0, 32) 10565ffd83dbSDimitry Andric .scalarize(0) 10575ffd83dbSDimitry Andric .lower(); 10580b57cec5SDimitry Andric 105981ad6265SDimitry Andric getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND) 106081ad6265SDimitry Andric .customFor({S16, S32}) 106181ad6265SDimitry Andric .scalarize(0) 106281ad6265SDimitry Andric .lower(); 106381ad6265SDimitry Andric 1064e8d8bef9SDimitry Andric // Lower roundeven into G_FRINT 1065e8d8bef9SDimitry Andric getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN}) 1066480093f4SDimitry Andric .scalarize(0) 1067480093f4SDimitry Andric .lower(); 10680b57cec5SDimitry Andric 1069480093f4SDimitry Andric if (ST.has16BitInsts()) { 1070480093f4SDimitry Andric 
getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 1071480093f4SDimitry Andric .legalFor({S16, S32, S64}) 1072480093f4SDimitry Andric .clampScalar(0, S16, S64) 1073480093f4SDimitry Andric .scalarize(0); 1074480093f4SDimitry Andric } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 10750b57cec5SDimitry Andric getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 10760b57cec5SDimitry Andric .legalFor({S32, S64}) 10770b57cec5SDimitry Andric .clampScalar(0, S32, S64) 10780b57cec5SDimitry Andric .scalarize(0); 10790b57cec5SDimitry Andric } else { 10800b57cec5SDimitry Andric getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 10810b57cec5SDimitry Andric .legalFor({S32}) 10820b57cec5SDimitry Andric .customFor({S64}) 10830b57cec5SDimitry Andric .clampScalar(0, S32, S64) 10840b57cec5SDimitry Andric .scalarize(0); 10850b57cec5SDimitry Andric } 10860b57cec5SDimitry Andric 1087480093f4SDimitry Andric getActionDefinitionsBuilder(G_PTR_ADD) 108806c3fb27SDimitry Andric .unsupportedFor({BufferFatPtr, RsrcPtr}) 1089e8d8bef9SDimitry Andric .legalIf(all(isPointer(0), sameSize(0, 1))) 1090e8d8bef9SDimitry Andric .scalarize(0) 1091e8d8bef9SDimitry Andric .scalarSameSizeAs(1, 0); 10920b57cec5SDimitry Andric 10935ffd83dbSDimitry Andric getActionDefinitionsBuilder(G_PTRMASK) 1094e8d8bef9SDimitry Andric .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) 1095e8d8bef9SDimitry Andric .scalarSameSizeAs(1, 0) 10965ffd83dbSDimitry Andric .scalarize(0); 10970b57cec5SDimitry Andric 10980b57cec5SDimitry Andric auto &CmpBuilder = 10990b57cec5SDimitry Andric getActionDefinitionsBuilder(G_ICMP) 1100480093f4SDimitry Andric // The compare output type differs based on the register bank of the output, 1101480093f4SDimitry Andric // so make both s1 and s32 legal. 
1102480093f4SDimitry Andric // 1103480093f4SDimitry Andric // Scalar compares producing output in scc will be promoted to s32, as that 1104480093f4SDimitry Andric // is the allocatable register type that will be needed for the copy from 1105480093f4SDimitry Andric // scc. This will be promoted during RegBankSelect, and we assume something 1106480093f4SDimitry Andric // before that won't try to use s32 result types. 1107480093f4SDimitry Andric // 1108480093f4SDimitry Andric // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 1109480093f4SDimitry Andric // bank. 11100b57cec5SDimitry Andric .legalForCartesianProduct( 11110b57cec5SDimitry Andric {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 1112480093f4SDimitry Andric .legalForCartesianProduct( 1113480093f4SDimitry Andric {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 11140b57cec5SDimitry Andric if (ST.has16BitInsts()) { 11150b57cec5SDimitry Andric CmpBuilder.legalFor({{S1, S16}}); 11160b57cec5SDimitry Andric } 11170b57cec5SDimitry Andric 11180b57cec5SDimitry Andric CmpBuilder 11190b57cec5SDimitry Andric .widenScalarToNextPow2(1) 11200b57cec5SDimitry Andric .clampScalar(1, S32, S64) 11210b57cec5SDimitry Andric .scalarize(0) 1122480093f4SDimitry Andric .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 11230b57cec5SDimitry Andric 11240b57cec5SDimitry Andric getActionDefinitionsBuilder(G_FCMP) 11250b57cec5SDimitry Andric .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 11260b57cec5SDimitry Andric .widenScalarToNextPow2(1) 11270b57cec5SDimitry Andric .clampScalar(1, S32, S64) 11280b57cec5SDimitry Andric .scalarize(0); 11290b57cec5SDimitry Andric 11305ffd83dbSDimitry Andric // FIXME: fpow has a selection pattern that should move to custom lowering. 
113106c3fb27SDimitry Andric auto &ExpOps = getActionDefinitionsBuilder(G_FPOW); 11325ffd83dbSDimitry Andric if (ST.has16BitInsts()) 11335ffd83dbSDimitry Andric ExpOps.customFor({{S32}, {S16}}); 11345ffd83dbSDimitry Andric else 11355ffd83dbSDimitry Andric ExpOps.customFor({S32}); 11365ffd83dbSDimitry Andric ExpOps.clampScalar(0, MinScalarFPTy, S32) 11370b57cec5SDimitry Andric .scalarize(0); 11380b57cec5SDimitry Andric 1139e8d8bef9SDimitry Andric getActionDefinitionsBuilder(G_FPOWI) 1140e8d8bef9SDimitry Andric .clampScalar(0, MinScalarFPTy, S32) 1141e8d8bef9SDimitry Andric .lower(); 1142e8d8bef9SDimitry Andric 114306c3fb27SDimitry Andric auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2}); 114406c3fb27SDimitry Andric Log2Ops.customFor({S32}); 114506c3fb27SDimitry Andric if (ST.has16BitInsts()) 114606c3fb27SDimitry Andric Log2Ops.legalFor({S16}); 114706c3fb27SDimitry Andric else 114806c3fb27SDimitry Andric Log2Ops.customFor({S16}); 114906c3fb27SDimitry Andric Log2Ops.scalarize(0) 115006c3fb27SDimitry Andric .lower(); 115106c3fb27SDimitry Andric 115206c3fb27SDimitry Andric auto &LogOps = getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP}); 115306c3fb27SDimitry Andric LogOps.customFor({S32, S16}); 115406c3fb27SDimitry Andric LogOps.clampScalar(0, MinScalarFPTy, S32) 115506c3fb27SDimitry Andric .scalarize(0); 115606c3fb27SDimitry Andric 11570b57cec5SDimitry Andric // The 64-bit versions produce 32-bit results, but only on the SALU. 11585ffd83dbSDimitry Andric getActionDefinitionsBuilder(G_CTPOP) 11590b57cec5SDimitry Andric .legalFor({{S32, S32}, {S32, S64}}) 11600b57cec5SDimitry Andric .clampScalar(0, S32, S32) 116104eeddc0SDimitry Andric .widenScalarToNextPow2(1, 32) 11620b57cec5SDimitry Andric .clampScalar(1, S32, S64) 11630b57cec5SDimitry Andric .scalarize(0) 116404eeddc0SDimitry Andric .widenScalarToNextPow2(0, 32); 116504eeddc0SDimitry Andric 1166bdd1243dSDimitry Andric // If no 16 bit instr is available, lower into different instructions. 
1167bdd1243dSDimitry Andric if (ST.has16BitInsts()) 1168bdd1243dSDimitry Andric getActionDefinitionsBuilder(G_IS_FPCLASS) 1169bdd1243dSDimitry Andric .legalForCartesianProduct({S1}, FPTypes16) 1170bdd1243dSDimitry Andric .widenScalarToNextPow2(1) 1171bdd1243dSDimitry Andric .scalarize(0) 1172bdd1243dSDimitry Andric .lower(); 1173bdd1243dSDimitry Andric else 1174bdd1243dSDimitry Andric getActionDefinitionsBuilder(G_IS_FPCLASS) 1175bdd1243dSDimitry Andric .legalForCartesianProduct({S1}, FPTypesBase) 1176bdd1243dSDimitry Andric .lowerFor({S1, S16}) 1177bdd1243dSDimitry Andric .widenScalarToNextPow2(1) 1178bdd1243dSDimitry Andric .scalarize(0) 1179bdd1243dSDimitry Andric .lower(); 11800b57cec5SDimitry Andric 11815ffd83dbSDimitry Andric // The hardware instructions return a different result on 0 than the generic 11825ffd83dbSDimitry Andric // instructions expect. The hardware produces -1, but these produce the 11835ffd83dbSDimitry Andric // bitwidth. 11845ffd83dbSDimitry Andric getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 11855ffd83dbSDimitry Andric .scalarize(0) 11865ffd83dbSDimitry Andric .clampScalar(0, S32, S32) 11875ffd83dbSDimitry Andric .clampScalar(1, S32, S64) 11885ffd83dbSDimitry Andric .widenScalarToNextPow2(0, 32) 11895ffd83dbSDimitry Andric .widenScalarToNextPow2(1, 32) 1190349cc55cSDimitry Andric .custom(); 11915ffd83dbSDimitry Andric 11925ffd83dbSDimitry Andric // The 64-bit versions produce 32-bit results, but only on the SALU. 
11935ffd83dbSDimitry Andric getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 11945ffd83dbSDimitry Andric .legalFor({{S32, S32}, {S32, S64}}) 11955ffd83dbSDimitry Andric .clampScalar(0, S32, S32) 11965ffd83dbSDimitry Andric .clampScalar(1, S32, S64) 11975ffd83dbSDimitry Andric .scalarize(0) 11985ffd83dbSDimitry Andric .widenScalarToNextPow2(0, 32) 11995ffd83dbSDimitry Andric .widenScalarToNextPow2(1, 32); 12005ffd83dbSDimitry Andric 1201fe6060f1SDimitry Andric // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1202fe6060f1SDimitry Andric // RegBankSelect. 12035ffd83dbSDimitry Andric getActionDefinitionsBuilder(G_BITREVERSE) 1204fe6060f1SDimitry Andric .legalFor({S32, S64}) 1205fe6060f1SDimitry Andric .clampScalar(0, S32, S64) 1206fe6060f1SDimitry Andric .scalarize(0) 1207fe6060f1SDimitry Andric .widenScalarToNextPow2(0); 12080b57cec5SDimitry Andric 12090b57cec5SDimitry Andric if (ST.has16BitInsts()) { 12105ffd83dbSDimitry Andric getActionDefinitionsBuilder(G_BSWAP) 12115ffd83dbSDimitry Andric .legalFor({S16, S32, V2S16}) 12120eae32dcSDimitry Andric .clampMaxNumElementsStrict(0, S16, 2) 12135ffd83dbSDimitry Andric // FIXME: Fixing non-power-of-2 before clamp is workaround for 12145ffd83dbSDimitry Andric // narrowScalar limitation. 
12155ffd83dbSDimitry Andric .widenScalarToNextPow2(0) 12165ffd83dbSDimitry Andric .clampScalar(0, S16, S32) 12175ffd83dbSDimitry Andric .scalarize(0); 12185ffd83dbSDimitry Andric 12190b57cec5SDimitry Andric if (ST.hasVOP3PInsts()) { 1220fe6060f1SDimitry Andric getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 12210b57cec5SDimitry Andric .legalFor({S32, S16, V2S16}) 12220b57cec5SDimitry Andric .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 12230b57cec5SDimitry Andric .clampMaxNumElements(0, S16, 2) 12245ffd83dbSDimitry Andric .minScalar(0, S16) 12250b57cec5SDimitry Andric .widenScalarToNextPow2(0) 12265ffd83dbSDimitry Andric .scalarize(0) 12275ffd83dbSDimitry Andric .lower(); 12280b57cec5SDimitry Andric } else { 1229fe6060f1SDimitry Andric getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 12300b57cec5SDimitry Andric .legalFor({S32, S16}) 12310b57cec5SDimitry Andric .widenScalarToNextPow2(0) 12325ffd83dbSDimitry Andric .minScalar(0, S16) 12335ffd83dbSDimitry Andric .scalarize(0) 12345ffd83dbSDimitry Andric .lower(); 12350b57cec5SDimitry Andric } 12360b57cec5SDimitry Andric } else { 12375ffd83dbSDimitry Andric // TODO: Should have same legality without v_perm_b32 12385ffd83dbSDimitry Andric getActionDefinitionsBuilder(G_BSWAP) 12395ffd83dbSDimitry Andric .legalFor({S32}) 12405ffd83dbSDimitry Andric .lowerIf(scalarNarrowerThan(0, 32)) 12415ffd83dbSDimitry Andric // FIXME: Fixing non-power-of-2 before clamp is workaround for 12425ffd83dbSDimitry Andric // narrowScalar limitation. 
12435ffd83dbSDimitry Andric .widenScalarToNextPow2(0) 12445ffd83dbSDimitry Andric .maxScalar(0, S32) 12455ffd83dbSDimitry Andric .scalarize(0) 12465ffd83dbSDimitry Andric .lower(); 12475ffd83dbSDimitry Andric 1248fe6060f1SDimitry Andric getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 12490b57cec5SDimitry Andric .legalFor({S32}) 12505ffd83dbSDimitry Andric .minScalar(0, S32) 12510b57cec5SDimitry Andric .widenScalarToNextPow2(0) 12525ffd83dbSDimitry Andric .scalarize(0) 12535ffd83dbSDimitry Andric .lower(); 12540b57cec5SDimitry Andric } 12550b57cec5SDimitry Andric 12560b57cec5SDimitry Andric getActionDefinitionsBuilder(G_INTTOPTR) 12570b57cec5SDimitry Andric // List the common cases 12580b57cec5SDimitry Andric .legalForCartesianProduct(AddrSpaces64, {S64}) 12590b57cec5SDimitry Andric .legalForCartesianProduct(AddrSpaces32, {S32}) 12600b57cec5SDimitry Andric .scalarize(0) 12610b57cec5SDimitry Andric // Accept any address space as long as the size matches 12620b57cec5SDimitry Andric .legalIf(sameSize(0, 1)) 12630b57cec5SDimitry Andric .widenScalarIf(smallerThan(1, 0), 12640b57cec5SDimitry Andric [](const LegalityQuery &Query) { 1265bdd1243dSDimitry Andric return std::pair( 1266bdd1243dSDimitry Andric 1, LLT::scalar(Query.Types[0].getSizeInBits())); 12670b57cec5SDimitry Andric }) 1268bdd1243dSDimitry Andric .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) { 1269bdd1243dSDimitry Andric return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 12700b57cec5SDimitry Andric }); 12710b57cec5SDimitry Andric 12720b57cec5SDimitry Andric getActionDefinitionsBuilder(G_PTRTOINT) 12730b57cec5SDimitry Andric // List the common cases 12740b57cec5SDimitry Andric .legalForCartesianProduct(AddrSpaces64, {S64}) 12750b57cec5SDimitry Andric .legalForCartesianProduct(AddrSpaces32, {S32}) 12760b57cec5SDimitry Andric .scalarize(0) 12770b57cec5SDimitry Andric // Accept any address space as long as the size matches 12780b57cec5SDimitry Andric 
.legalIf(sameSize(0, 1)) 12790b57cec5SDimitry Andric .widenScalarIf(smallerThan(0, 1), 12800b57cec5SDimitry Andric [](const LegalityQuery &Query) { 1281bdd1243dSDimitry Andric return std::pair( 1282bdd1243dSDimitry Andric 0, LLT::scalar(Query.Types[1].getSizeInBits())); 12830b57cec5SDimitry Andric }) 1284bdd1243dSDimitry Andric .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) { 1285bdd1243dSDimitry Andric return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 12860b57cec5SDimitry Andric }); 12870b57cec5SDimitry Andric 12880b57cec5SDimitry Andric getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 12890b57cec5SDimitry Andric .scalarize(0) 12900b57cec5SDimitry Andric .custom(); 12910b57cec5SDimitry Andric 12925ffd83dbSDimitry Andric const auto needToSplitMemOp = [=](const LegalityQuery &Query, 12935ffd83dbSDimitry Andric bool IsLoad) -> bool { 12948bcb0991SDimitry Andric const LLT DstTy = Query.Types[0]; 12958bcb0991SDimitry Andric 12968bcb0991SDimitry Andric // Split vector extloads. 1297fe6060f1SDimitry Andric unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1298480093f4SDimitry Andric 12998bcb0991SDimitry Andric if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 13008bcb0991SDimitry Andric return true; 13018bcb0991SDimitry Andric 13028bcb0991SDimitry Andric const LLT PtrTy = Query.Types[1]; 13038bcb0991SDimitry Andric unsigned AS = PtrTy.getAddressSpace(); 130406c3fb27SDimitry Andric if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad, 130506c3fb27SDimitry Andric Query.MMODescrs[0].Ordering != 130606c3fb27SDimitry Andric AtomicOrdering::NotAtomic)) 13078bcb0991SDimitry Andric return true; 13088bcb0991SDimitry Andric 13098bcb0991SDimitry Andric // Catch weird sized loads that don't evenly divide into the access sizes 13108bcb0991SDimitry Andric // TODO: May be able to widen depending on alignment etc. 
13115ffd83dbSDimitry Andric unsigned NumRegs = (MemSize + 31) / 32; 13125ffd83dbSDimitry Andric if (NumRegs == 3) { 13135ffd83dbSDimitry Andric if (!ST.hasDwordx3LoadStores()) 13148bcb0991SDimitry Andric return true; 13155ffd83dbSDimitry Andric } else { 13165ffd83dbSDimitry Andric // If the alignment allows, these should have been widened. 13175ffd83dbSDimitry Andric if (!isPowerOf2_32(NumRegs)) 13185ffd83dbSDimitry Andric return true; 13195ffd83dbSDimitry Andric } 13208bcb0991SDimitry Andric 13218bcb0991SDimitry Andric return false; 13228bcb0991SDimitry Andric }; 13238bcb0991SDimitry Andric 1324e8d8bef9SDimitry Andric unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32; 1325e8d8bef9SDimitry Andric unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16; 1326e8d8bef9SDimitry Andric unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8; 13278bcb0991SDimitry Andric 13288bcb0991SDimitry Andric // TODO: Refine based on subtargets which support unaligned access or 128-bit 13298bcb0991SDimitry Andric // LDS 13308bcb0991SDimitry Andric // TODO: Unsupported flat for SI. 13318bcb0991SDimitry Andric 13328bcb0991SDimitry Andric for (unsigned Op : {G_LOAD, G_STORE}) { 13338bcb0991SDimitry Andric const bool IsStore = Op == G_STORE; 13348bcb0991SDimitry Andric 13358bcb0991SDimitry Andric auto &Actions = getActionDefinitionsBuilder(Op); 13365ffd83dbSDimitry Andric // Explicitly list some common cases. 13375ffd83dbSDimitry Andric // TODO: Does this help compile time at all? 
1338fe6060f1SDimitry Andric Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32}, 1339fe6060f1SDimitry Andric {V2S32, GlobalPtr, V2S32, GlobalAlign32}, 1340fe6060f1SDimitry Andric {V4S32, GlobalPtr, V4S32, GlobalAlign32}, 1341fe6060f1SDimitry Andric {S64, GlobalPtr, S64, GlobalAlign32}, 1342fe6060f1SDimitry Andric {V2S64, GlobalPtr, V2S64, GlobalAlign32}, 1343fe6060f1SDimitry Andric {V2S16, GlobalPtr, V2S16, GlobalAlign32}, 1344fe6060f1SDimitry Andric {S32, GlobalPtr, S8, GlobalAlign8}, 1345fe6060f1SDimitry Andric {S32, GlobalPtr, S16, GlobalAlign16}, 13468bcb0991SDimitry Andric 1347fe6060f1SDimitry Andric {S32, LocalPtr, S32, 32}, 1348fe6060f1SDimitry Andric {S64, LocalPtr, S64, 32}, 1349fe6060f1SDimitry Andric {V2S32, LocalPtr, V2S32, 32}, 1350fe6060f1SDimitry Andric {S32, LocalPtr, S8, 8}, 1351fe6060f1SDimitry Andric {S32, LocalPtr, S16, 16}, 1352fe6060f1SDimitry Andric {V2S16, LocalPtr, S32, 32}, 13538bcb0991SDimitry Andric 1354fe6060f1SDimitry Andric {S32, PrivatePtr, S32, 32}, 1355fe6060f1SDimitry Andric {S32, PrivatePtr, S8, 8}, 1356fe6060f1SDimitry Andric {S32, PrivatePtr, S16, 16}, 1357fe6060f1SDimitry Andric {V2S16, PrivatePtr, S32, 32}, 13588bcb0991SDimitry Andric 1359fe6060f1SDimitry Andric {S32, ConstantPtr, S32, GlobalAlign32}, 1360fe6060f1SDimitry Andric {V2S32, ConstantPtr, V2S32, GlobalAlign32}, 1361fe6060f1SDimitry Andric {V4S32, ConstantPtr, V4S32, GlobalAlign32}, 1362fe6060f1SDimitry Andric {S64, ConstantPtr, S64, GlobalAlign32}, 1363fe6060f1SDimitry Andric {V2S32, ConstantPtr, V2S32, GlobalAlign32}}); 13645ffd83dbSDimitry Andric Actions.legalIf( 13655ffd83dbSDimitry Andric [=](const LegalityQuery &Query) -> bool { 1366fe6060f1SDimitry Andric return isLoadStoreLegal(ST, Query); 13675ffd83dbSDimitry Andric }); 13685ffd83dbSDimitry Andric 136906c3fb27SDimitry Andric // The custom pointers (fat pointers, buffer resources) don't work with load 137006c3fb27SDimitry Andric // and store at this level. 
Fat pointers should have been lowered to 137106c3fb27SDimitry Andric // intrinsics before the translation to MIR. 137206c3fb27SDimitry Andric Actions.unsupportedIf(typeInSet(1, {BufferFatPtr, RsrcPtr})); 137306c3fb27SDimitry Andric 137406c3fb27SDimitry Andric // Address space 8 pointers are handled by a 4xs32 load, bitcast, and 137506c3fb27SDimitry Andric // ptrtoint. This is needed to account for the fact that we can't have i128 137606c3fb27SDimitry Andric // as a register class for SelectionDAG reasons. 137706c3fb27SDimitry Andric Actions.customIf([=](const LegalityQuery &Query) -> bool { 137806c3fb27SDimitry Andric return hasBufferRsrcWorkaround(Query.Types[0]); 137906c3fb27SDimitry Andric }); 138006c3fb27SDimitry Andric 13815ffd83dbSDimitry Andric // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 13825ffd83dbSDimitry Andric // 64-bits. 13835ffd83dbSDimitry Andric // 13845ffd83dbSDimitry Andric // TODO: Should generalize bitcast action into coerce, which will also cover 13855ffd83dbSDimitry Andric // inserting addrspacecasts. 13865ffd83dbSDimitry Andric Actions.customIf(typeIs(1, Constant32Ptr)); 13875ffd83dbSDimitry Andric 13885ffd83dbSDimitry Andric // Turn any illegal element vectors into something easier to deal 13895ffd83dbSDimitry Andric // with. These will ultimately produce 32-bit scalar shifts to extract the 13905ffd83dbSDimitry Andric // parts anyway. 13915ffd83dbSDimitry Andric // 13925ffd83dbSDimitry Andric // For odd 16-bit element vectors, prefer to split those into pieces with 13935ffd83dbSDimitry Andric // 16-bit vector parts. 
13945ffd83dbSDimitry Andric Actions.bitcastIf( 13955ffd83dbSDimitry Andric [=](const LegalityQuery &Query) -> bool { 1396e8d8bef9SDimitry Andric return shouldBitcastLoadStoreType(ST, Query.Types[0], 1397fe6060f1SDimitry Andric Query.MMODescrs[0].MemoryTy); 13985ffd83dbSDimitry Andric }, bitcastToRegisterType(0)); 13995ffd83dbSDimitry Andric 1400e8d8bef9SDimitry Andric if (!IsStore) { 1401e8d8bef9SDimitry Andric // Widen suitably aligned loads by loading extra bytes. The standard 1402e8d8bef9SDimitry Andric // legalization actions can't properly express widening memory operands. 1403e8d8bef9SDimitry Andric Actions.customIf([=](const LegalityQuery &Query) -> bool { 1404e8d8bef9SDimitry Andric return shouldWidenLoad(ST, Query, G_LOAD); 1405e8d8bef9SDimitry Andric }); 1406e8d8bef9SDimitry Andric } 1407e8d8bef9SDimitry Andric 1408e8d8bef9SDimitry Andric // FIXME: load/store narrowing should be moved to lower action 14098bcb0991SDimitry Andric Actions 14108bcb0991SDimitry Andric .narrowScalarIf( 14118bcb0991SDimitry Andric [=](const LegalityQuery &Query) -> bool { 14125ffd83dbSDimitry Andric return !Query.Types[0].isVector() && 14135ffd83dbSDimitry Andric needToSplitMemOp(Query, Op == G_LOAD); 14148bcb0991SDimitry Andric }, 14158bcb0991SDimitry Andric [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 14168bcb0991SDimitry Andric const LLT DstTy = Query.Types[0]; 14178bcb0991SDimitry Andric const LLT PtrTy = Query.Types[1]; 14188bcb0991SDimitry Andric 14198bcb0991SDimitry Andric const unsigned DstSize = DstTy.getSizeInBits(); 1420fe6060f1SDimitry Andric unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 14218bcb0991SDimitry Andric 14228bcb0991SDimitry Andric // Split extloads. 
14238bcb0991SDimitry Andric if (DstSize > MemSize) 1424bdd1243dSDimitry Andric return std::pair(0, LLT::scalar(MemSize)); 14258bcb0991SDimitry Andric 142606c3fb27SDimitry Andric unsigned MaxSize = maxSizeForAddrSpace( 142706c3fb27SDimitry Andric ST, PtrTy.getAddressSpace(), Op == G_LOAD, 142806c3fb27SDimitry Andric Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic); 14298bcb0991SDimitry Andric if (MemSize > MaxSize) 1430bdd1243dSDimitry Andric return std::pair(0, LLT::scalar(MaxSize)); 14318bcb0991SDimitry Andric 143204eeddc0SDimitry Andric uint64_t Align = Query.MMODescrs[0].AlignInBits; 1433bdd1243dSDimitry Andric return std::pair(0, LLT::scalar(Align)); 14348bcb0991SDimitry Andric }) 14358bcb0991SDimitry Andric .fewerElementsIf( 14368bcb0991SDimitry Andric [=](const LegalityQuery &Query) -> bool { 14375ffd83dbSDimitry Andric return Query.Types[0].isVector() && 14385ffd83dbSDimitry Andric needToSplitMemOp(Query, Op == G_LOAD); 14398bcb0991SDimitry Andric }, 14408bcb0991SDimitry Andric [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 14418bcb0991SDimitry Andric const LLT DstTy = Query.Types[0]; 14428bcb0991SDimitry Andric const LLT PtrTy = Query.Types[1]; 14438bcb0991SDimitry Andric 14448bcb0991SDimitry Andric LLT EltTy = DstTy.getElementType(); 144506c3fb27SDimitry Andric unsigned MaxSize = maxSizeForAddrSpace( 144606c3fb27SDimitry Andric ST, PtrTy.getAddressSpace(), Op == G_LOAD, 144706c3fb27SDimitry Andric Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic); 14485ffd83dbSDimitry Andric 14495ffd83dbSDimitry Andric // FIXME: Handle widened to power of 2 results better. This ends 14505ffd83dbSDimitry Andric // up scalarizing. 14515ffd83dbSDimitry Andric // FIXME: 3 element stores scalarized on SI 14528bcb0991SDimitry Andric 14538bcb0991SDimitry Andric // Split if it's too large for the address space. 
1454fe6060f1SDimitry Andric unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1455fe6060f1SDimitry Andric if (MemSize > MaxSize) { 14568bcb0991SDimitry Andric unsigned NumElts = DstTy.getNumElements(); 14575ffd83dbSDimitry Andric unsigned EltSize = EltTy.getSizeInBits(); 14585ffd83dbSDimitry Andric 14595ffd83dbSDimitry Andric if (MaxSize % EltSize == 0) { 1460bdd1243dSDimitry Andric return std::pair( 1461fe6060f1SDimitry Andric 0, LLT::scalarOrVector( 1462fe6060f1SDimitry Andric ElementCount::getFixed(MaxSize / EltSize), EltTy)); 14635ffd83dbSDimitry Andric } 14645ffd83dbSDimitry Andric 1465fe6060f1SDimitry Andric unsigned NumPieces = MemSize / MaxSize; 14668bcb0991SDimitry Andric 14678bcb0991SDimitry Andric // FIXME: Refine when odd breakdowns handled 14688bcb0991SDimitry Andric // The scalars will need to be re-legalized. 14698bcb0991SDimitry Andric if (NumPieces == 1 || NumPieces >= NumElts || 14708bcb0991SDimitry Andric NumElts % NumPieces != 0) 1471bdd1243dSDimitry Andric return std::pair(0, EltTy); 14728bcb0991SDimitry Andric 1473bdd1243dSDimitry Andric return std::pair(0, 1474bdd1243dSDimitry Andric LLT::fixed_vector(NumElts / NumPieces, EltTy)); 14758bcb0991SDimitry Andric } 14768bcb0991SDimitry Andric 14775ffd83dbSDimitry Andric // FIXME: We could probably handle weird extending loads better. 14785ffd83dbSDimitry Andric if (DstTy.getSizeInBits() > MemSize) 1479bdd1243dSDimitry Andric return std::pair(0, EltTy); 14805ffd83dbSDimitry Andric 14815ffd83dbSDimitry Andric unsigned EltSize = EltTy.getSizeInBits(); 14825ffd83dbSDimitry Andric unsigned DstSize = DstTy.getSizeInBits(); 14835ffd83dbSDimitry Andric if (!isPowerOf2_32(DstSize)) { 14845ffd83dbSDimitry Andric // We're probably decomposing an odd sized store. Try to split 14855ffd83dbSDimitry Andric // to the widest type. TODO: Account for alignment. As-is it 14865ffd83dbSDimitry Andric // should be OK, since the new parts will be further legalized. 
148706c3fb27SDimitry Andric unsigned FloorSize = llvm::bit_floor(DstSize); 1488bdd1243dSDimitry Andric return std::pair( 1489fe6060f1SDimitry Andric 0, LLT::scalarOrVector( 1490fe6060f1SDimitry Andric ElementCount::getFixed(FloorSize / EltSize), EltTy)); 14915ffd83dbSDimitry Andric } 14925ffd83dbSDimitry Andric 14938bcb0991SDimitry Andric // May need relegalization for the scalars. 1494bdd1243dSDimitry Andric return std::pair(0, EltTy); 14958bcb0991SDimitry Andric }) 1496fe6060f1SDimitry Andric .minScalar(0, S32) 1497fe6060f1SDimitry Andric .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32)) 14988bcb0991SDimitry Andric .widenScalarToNextPow2(0) 1499e8d8bef9SDimitry Andric .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) 1500e8d8bef9SDimitry Andric .lower(); 15018bcb0991SDimitry Andric } 15020b57cec5SDimitry Andric 1503fe6060f1SDimitry Andric // FIXME: Unaligned accesses not lowered. 15040b57cec5SDimitry Andric auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1505fe6060f1SDimitry Andric .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8}, 1506fe6060f1SDimitry Andric {S32, GlobalPtr, S16, 2 * 8}, 1507fe6060f1SDimitry Andric {S32, LocalPtr, S8, 8}, 1508fe6060f1SDimitry Andric {S32, LocalPtr, S16, 16}, 1509fe6060f1SDimitry Andric {S32, PrivatePtr, S8, 8}, 1510fe6060f1SDimitry Andric {S32, PrivatePtr, S16, 16}, 1511fe6060f1SDimitry Andric {S32, ConstantPtr, S8, 8}, 1512fe6060f1SDimitry Andric {S32, ConstantPtr, S16, 2 * 8}}) 1513fe6060f1SDimitry Andric .legalIf( 1514fe6060f1SDimitry Andric [=](const LegalityQuery &Query) -> bool { 1515fe6060f1SDimitry Andric return isLoadStoreLegal(ST, Query); 1516fe6060f1SDimitry Andric }); 1517fe6060f1SDimitry Andric 15180b57cec5SDimitry Andric if (ST.hasFlatAddressSpace()) { 15198bcb0991SDimitry Andric ExtLoads.legalForTypesWithMemDesc( 1520fe6060f1SDimitry Andric {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}}); 15210b57cec5SDimitry Andric } 15220b57cec5SDimitry Andric 
1523fe6060f1SDimitry Andric // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1524fe6060f1SDimitry Andric // 64-bits. 1525fe6060f1SDimitry Andric // 1526fe6060f1SDimitry Andric // TODO: Should generalize bitcast action into coerce, which will also cover 1527fe6060f1SDimitry Andric // inserting addrspacecasts. 1528fe6060f1SDimitry Andric ExtLoads.customIf(typeIs(1, Constant32Ptr)); 1529fe6060f1SDimitry Andric 15300b57cec5SDimitry Andric ExtLoads.clampScalar(0, S32, S32) 15310b57cec5SDimitry Andric .widenScalarToNextPow2(0) 15320b57cec5SDimitry Andric .lower(); 15330b57cec5SDimitry Andric 15340b57cec5SDimitry Andric auto &Atomics = getActionDefinitionsBuilder( 15350b57cec5SDimitry Andric {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 15360b57cec5SDimitry Andric G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 15370b57cec5SDimitry Andric G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 153806c3fb27SDimitry Andric G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP}) 15390b57cec5SDimitry Andric .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1540e8d8bef9SDimitry Andric {S64, GlobalPtr}, {S64, LocalPtr}, 1541e8d8bef9SDimitry Andric {S32, RegionPtr}, {S64, RegionPtr}}); 15420b57cec5SDimitry Andric if (ST.hasFlatAddressSpace()) { 15430b57cec5SDimitry Andric Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 15440b57cec5SDimitry Andric } 15450b57cec5SDimitry Andric 1546fe6060f1SDimitry Andric auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD); 1547349cc55cSDimitry Andric if (ST.hasLDSFPAtomicAdd()) { 1548fe6060f1SDimitry Andric Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); 1549fe6060f1SDimitry Andric if (ST.hasGFX90AInsts()) 1550fe6060f1SDimitry Andric Atomic.legalFor({{S64, LocalPtr}}); 155106c3fb27SDimitry Andric if (ST.hasAtomicDsPkAdd16Insts()) 155281ad6265SDimitry Andric Atomic.legalFor({{V2S16, LocalPtr}}); 15535ffd83dbSDimitry Andric } 1554fe6060f1SDimitry Andric if (ST.hasAtomicFaddInsts()) 
1555fe6060f1SDimitry Andric Atomic.legalFor({{S32, GlobalPtr}}); 1556bdd1243dSDimitry Andric if (ST.hasFlatAtomicFaddF32Inst()) 1557bdd1243dSDimitry Andric Atomic.legalFor({{S32, FlatPtr}}); 15588bcb0991SDimitry Andric 155904eeddc0SDimitry Andric if (ST.hasGFX90AInsts()) { 156004eeddc0SDimitry Andric // These are legal with some caveats, and should have undergone expansion in 156104eeddc0SDimitry Andric // the IR in most situations 156204eeddc0SDimitry Andric // TODO: Move atomic expansion into legalizer 156304eeddc0SDimitry Andric Atomic.legalFor({ 156404eeddc0SDimitry Andric {S32, GlobalPtr}, 156504eeddc0SDimitry Andric {S64, GlobalPtr}, 156604eeddc0SDimitry Andric {S64, FlatPtr} 156704eeddc0SDimitry Andric }); 156804eeddc0SDimitry Andric } 156904eeddc0SDimitry Andric 1570480093f4SDimitry Andric // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1571480093f4SDimitry Andric // demarshalling 1572480093f4SDimitry Andric getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1573480093f4SDimitry Andric .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1574480093f4SDimitry Andric {S32, FlatPtr}, {S64, FlatPtr}}) 1575480093f4SDimitry Andric .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1576480093f4SDimitry Andric {S32, RegionPtr}, {S64, RegionPtr}}); 15770b57cec5SDimitry Andric // TODO: Pointer types, any 32-bit or 64-bit vector 1578480093f4SDimitry Andric 1579480093f4SDimitry Andric // Condition should be s32 for scalar, s1 for vector. 
15800b57cec5SDimitry Andric getActionDefinitionsBuilder(G_SELECT) 1581fe6060f1SDimitry Andric .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr, 1582fe6060f1SDimitry Andric LocalPtr, FlatPtr, PrivatePtr, 1583fe6060f1SDimitry Andric LLT::fixed_vector(2, LocalPtr), 1584fe6060f1SDimitry Andric LLT::fixed_vector(2, PrivatePtr)}, 1585fe6060f1SDimitry Andric {S1, S32}) 15860b57cec5SDimitry Andric .clampScalar(0, S16, S64) 15875ffd83dbSDimitry Andric .scalarize(1) 15880b57cec5SDimitry Andric .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 15890b57cec5SDimitry Andric .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 15900b57cec5SDimitry Andric .clampMaxNumElements(0, S32, 2) 15910b57cec5SDimitry Andric .clampMaxNumElements(0, LocalPtr, 2) 15920b57cec5SDimitry Andric .clampMaxNumElements(0, PrivatePtr, 2) 15930b57cec5SDimitry Andric .scalarize(0) 15940b57cec5SDimitry Andric .widenScalarToNextPow2(0) 1595480093f4SDimitry Andric .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 15960b57cec5SDimitry Andric 15970b57cec5SDimitry Andric // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 15980b57cec5SDimitry Andric // be more flexible with the shift amount type. 
15990b57cec5SDimitry Andric auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 16000b57cec5SDimitry Andric .legalFor({{S32, S32}, {S64, S32}}); 16010b57cec5SDimitry Andric if (ST.has16BitInsts()) { 16020b57cec5SDimitry Andric if (ST.hasVOP3PInsts()) { 16035ffd83dbSDimitry Andric Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 16040b57cec5SDimitry Andric .clampMaxNumElements(0, S16, 2); 16050b57cec5SDimitry Andric } else 16065ffd83dbSDimitry Andric Shifts.legalFor({{S16, S16}}); 16070b57cec5SDimitry Andric 16085ffd83dbSDimitry Andric // TODO: Support 16-bit shift amounts for all types 16095ffd83dbSDimitry Andric Shifts.widenScalarIf( 16105ffd83dbSDimitry Andric [=](const LegalityQuery &Query) { 16115ffd83dbSDimitry Andric // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 16125ffd83dbSDimitry Andric // 32-bit amount. 16135ffd83dbSDimitry Andric const LLT ValTy = Query.Types[0]; 16145ffd83dbSDimitry Andric const LLT AmountTy = Query.Types[1]; 16155ffd83dbSDimitry Andric return ValTy.getSizeInBits() <= 16 && 16165ffd83dbSDimitry Andric AmountTy.getSizeInBits() < 16; 16175ffd83dbSDimitry Andric }, changeTo(1, S16)); 16185ffd83dbSDimitry Andric Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1619480093f4SDimitry Andric Shifts.clampScalar(1, S32, S32); 16200b57cec5SDimitry Andric Shifts.widenScalarToNextPow2(0, 16); 162104eeddc0SDimitry Andric Shifts.clampScalar(0, S16, S64); 1622e8d8bef9SDimitry Andric 1623e8d8bef9SDimitry Andric getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1624e8d8bef9SDimitry Andric .minScalar(0, S16) 1625e8d8bef9SDimitry Andric .scalarize(0) 1626e8d8bef9SDimitry Andric .lower(); 16270b57cec5SDimitry Andric } else { 16280b57cec5SDimitry Andric // Make sure we legalize the shift amount type first, as the general 16290b57cec5SDimitry Andric // expansion for the shifted type will produce much worse code if it hasn't 16300b57cec5SDimitry Andric // been truncated already. 
16310b57cec5SDimitry Andric Shifts.clampScalar(1, S32, S32); 16320b57cec5SDimitry Andric Shifts.widenScalarToNextPow2(0, 32); 163304eeddc0SDimitry Andric Shifts.clampScalar(0, S32, S64); 1634e8d8bef9SDimitry Andric 1635e8d8bef9SDimitry Andric getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1636e8d8bef9SDimitry Andric .minScalar(0, S32) 1637e8d8bef9SDimitry Andric .scalarize(0) 1638e8d8bef9SDimitry Andric .lower(); 16390b57cec5SDimitry Andric } 16400b57cec5SDimitry Andric Shifts.scalarize(0); 16410b57cec5SDimitry Andric 16420b57cec5SDimitry Andric for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 16430b57cec5SDimitry Andric unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 16440b57cec5SDimitry Andric unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 16450b57cec5SDimitry Andric unsigned IdxTypeIdx = 2; 16460b57cec5SDimitry Andric 16470b57cec5SDimitry Andric getActionDefinitionsBuilder(Op) 16480b57cec5SDimitry Andric .customIf([=](const LegalityQuery &Query) { 16490b57cec5SDimitry Andric const LLT EltTy = Query.Types[EltTypeIdx]; 16500b57cec5SDimitry Andric const LLT VecTy = Query.Types[VecTypeIdx]; 16510b57cec5SDimitry Andric const LLT IdxTy = Query.Types[IdxTypeIdx]; 1652e8d8bef9SDimitry Andric const unsigned EltSize = EltTy.getSizeInBits(); 165306c3fb27SDimitry Andric const bool isLegalVecType = 165406c3fb27SDimitry Andric !!SIRegisterInfo::getSGPRClassForBitWidth(VecTy.getSizeInBits()); 165506c3fb27SDimitry Andric // Address space 8 pointers are 128-bit wide values, but the logic 165606c3fb27SDimitry Andric // below will try to bitcast them to 2N x s64, which will fail. 165706c3fb27SDimitry Andric // Therefore, as an intermediate step, wrap extracts/insertions from a 165806c3fb27SDimitry Andric // ptrtoint-ing the vector and scalar arguments (or inttoptring the 165906c3fb27SDimitry Andric // extraction result) in order to produce a vector operation that can 166006c3fb27SDimitry Andric // be handled by the logic below. 
166106c3fb27SDimitry Andric if (EltTy.isPointer() && EltSize > 64) 166206c3fb27SDimitry Andric return true; 1663e8d8bef9SDimitry Andric return (EltSize == 32 || EltSize == 64) && 16640b57cec5SDimitry Andric VecTy.getSizeInBits() % 32 == 0 && 16655ffd83dbSDimitry Andric VecTy.getSizeInBits() <= MaxRegisterSize && 166606c3fb27SDimitry Andric IdxTy.getSizeInBits() == 32 && 166706c3fb27SDimitry Andric isLegalVecType; 16680b57cec5SDimitry Andric }) 1669e8d8bef9SDimitry Andric .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)), 1670e8d8bef9SDimitry Andric bitcastToVectorElement32(VecTypeIdx)) 1671e8d8bef9SDimitry Andric //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1)) 1672e8d8bef9SDimitry Andric .bitcastIf( 1673e8d8bef9SDimitry Andric all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)), 1674e8d8bef9SDimitry Andric [=](const LegalityQuery &Query) { 1675e8d8bef9SDimitry Andric // For > 64-bit element types, try to turn this into a 64-bit 1676e8d8bef9SDimitry Andric // element vector since we may be able to do better indexing 1677e8d8bef9SDimitry Andric // if this is scalar. If not, fall back to 32. 1678e8d8bef9SDimitry Andric const LLT EltTy = Query.Types[EltTypeIdx]; 1679e8d8bef9SDimitry Andric const LLT VecTy = Query.Types[VecTypeIdx]; 1680e8d8bef9SDimitry Andric const unsigned DstEltSize = EltTy.getSizeInBits(); 1681e8d8bef9SDimitry Andric const unsigned VecSize = VecTy.getSizeInBits(); 1682e8d8bef9SDimitry Andric 1683e8d8bef9SDimitry Andric const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 
64 : 32; 1684bdd1243dSDimitry Andric return std::pair( 1685fe6060f1SDimitry Andric VecTypeIdx, 1686fe6060f1SDimitry Andric LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize)); 1687e8d8bef9SDimitry Andric }) 16880b57cec5SDimitry Andric .clampScalar(EltTypeIdx, S32, S64) 16890b57cec5SDimitry Andric .clampScalar(VecTypeIdx, S32, S64) 1690e8d8bef9SDimitry Andric .clampScalar(IdxTypeIdx, S32, S32) 1691e8d8bef9SDimitry Andric .clampMaxNumElements(VecTypeIdx, S32, 32) 1692e8d8bef9SDimitry Andric // TODO: Clamp elements for 64-bit vectors? 169306c3fb27SDimitry Andric .moreElementsIf( 169406c3fb27SDimitry Andric isIllegalRegisterType(VecTypeIdx), 169506c3fb27SDimitry Andric moreElementsToNextExistingRegClass(VecTypeIdx)) 1696e8d8bef9SDimitry Andric // It should only be necessary with variable indexes. 1697e8d8bef9SDimitry Andric // As a last resort, lower to the stack 1698e8d8bef9SDimitry Andric .lower(); 16990b57cec5SDimitry Andric } 17000b57cec5SDimitry Andric 17010b57cec5SDimitry Andric getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 17020b57cec5SDimitry Andric .unsupportedIf([=](const LegalityQuery &Query) { 17030b57cec5SDimitry Andric const LLT &EltTy = Query.Types[1].getElementType(); 17040b57cec5SDimitry Andric return Query.Types[0] != EltTy; 17050b57cec5SDimitry Andric }); 17060b57cec5SDimitry Andric 17070b57cec5SDimitry Andric for (unsigned Op : {G_EXTRACT, G_INSERT}) { 17080b57cec5SDimitry Andric unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 17090b57cec5SDimitry Andric unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 17100b57cec5SDimitry Andric 17110b57cec5SDimitry Andric // FIXME: Doesn't handle extract of illegal sizes. 17120b57cec5SDimitry Andric getActionDefinitionsBuilder(Op) 17138bcb0991SDimitry Andric .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 17140eae32dcSDimitry Andric .lowerIf([=](const LegalityQuery &Query) { 17150eae32dcSDimitry Andric // Sub-vector(or single element) insert and extract. 
17160eae32dcSDimitry Andric // TODO: verify immediate offset here since lower only works with 17170eae32dcSDimitry Andric // whole elements. 17180eae32dcSDimitry Andric const LLT BigTy = Query.Types[BigTyIdx]; 17190eae32dcSDimitry Andric return BigTy.isVector(); 17200eae32dcSDimitry Andric }) 17218bcb0991SDimitry Andric // FIXME: Multiples of 16 should not be legal. 17220b57cec5SDimitry Andric .legalIf([=](const LegalityQuery &Query) { 17230b57cec5SDimitry Andric const LLT BigTy = Query.Types[BigTyIdx]; 17240b57cec5SDimitry Andric const LLT LitTy = Query.Types[LitTyIdx]; 17250b57cec5SDimitry Andric return (BigTy.getSizeInBits() % 32 == 0) && 17260b57cec5SDimitry Andric (LitTy.getSizeInBits() % 16 == 0); 17270b57cec5SDimitry Andric }) 17280b57cec5SDimitry Andric .widenScalarIf( 17290b57cec5SDimitry Andric [=](const LegalityQuery &Query) { 17300b57cec5SDimitry Andric const LLT BigTy = Query.Types[BigTyIdx]; 17310b57cec5SDimitry Andric return (BigTy.getScalarSizeInBits() < 16); 17320b57cec5SDimitry Andric }, 17330b57cec5SDimitry Andric LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) 17340b57cec5SDimitry Andric .widenScalarIf( 17350b57cec5SDimitry Andric [=](const LegalityQuery &Query) { 17360b57cec5SDimitry Andric const LLT LitTy = Query.Types[LitTyIdx]; 17370b57cec5SDimitry Andric return (LitTy.getScalarSizeInBits() < 16); 17380b57cec5SDimitry Andric }, 17390b57cec5SDimitry Andric LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) 17400b57cec5SDimitry Andric .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 17410b57cec5SDimitry Andric .widenScalarToNextPow2(BigTyIdx, 32); 17420b57cec5SDimitry Andric 17430b57cec5SDimitry Andric } 17440b57cec5SDimitry Andric 17458bcb0991SDimitry Andric auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) 17460b57cec5SDimitry Andric .legalForCartesianProduct(AllS32Vectors, {S32}) 17470b57cec5SDimitry Andric .legalForCartesianProduct(AllS64Vectors, {S64}) 17488bcb0991SDimitry Andric 
.clampNumElements(0, V16S32, V32S32) 17498bcb0991SDimitry Andric .clampNumElements(0, V2S64, V16S64) 175006c3fb27SDimitry Andric .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)) 175106c3fb27SDimitry Andric .moreElementsIf( 175206c3fb27SDimitry Andric isIllegalRegisterType(0), 175306c3fb27SDimitry Andric moreElementsToNextExistingRegClass(0)); 17548bcb0991SDimitry Andric 17558bcb0991SDimitry Andric if (ST.hasScalarPackInsts()) { 17565ffd83dbSDimitry Andric BuildVector 17575ffd83dbSDimitry Andric // FIXME: Should probably widen s1 vectors straight to s32 17585ffd83dbSDimitry Andric .minScalarOrElt(0, S16) 1759bdd1243dSDimitry Andric .minScalar(1, S16); 17605ffd83dbSDimitry Andric 17618bcb0991SDimitry Andric getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 17628bcb0991SDimitry Andric .legalFor({V2S16, S32}) 17638bcb0991SDimitry Andric .lower(); 17648bcb0991SDimitry Andric } else { 17655ffd83dbSDimitry Andric BuildVector.customFor({V2S16, S16}); 17665ffd83dbSDimitry Andric BuildVector.minScalarOrElt(0, S32); 17675ffd83dbSDimitry Andric 17688bcb0991SDimitry Andric getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 17695ffd83dbSDimitry Andric .customFor({V2S16, S32}) 17708bcb0991SDimitry Andric .lower(); 17718bcb0991SDimitry Andric } 17728bcb0991SDimitry Andric 17735ffd83dbSDimitry Andric BuildVector.legalIf(isRegisterType(0)); 17745ffd83dbSDimitry Andric 17755ffd83dbSDimitry Andric // FIXME: Clamp maximum size 17760b57cec5SDimitry Andric getActionDefinitionsBuilder(G_CONCAT_VECTORS) 1777e8d8bef9SDimitry Andric .legalIf(all(isRegisterType(0), isRegisterType(1))) 1778e8d8bef9SDimitry Andric .clampMaxNumElements(0, S32, 32) 1779e8d8bef9SDimitry Andric .clampMaxNumElements(1, S16, 2) // TODO: Make 4? 
1780e8d8bef9SDimitry Andric .clampMaxNumElements(0, S16, 64); 17810b57cec5SDimitry Andric 17828bcb0991SDimitry Andric getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 17838bcb0991SDimitry Andric 17840b57cec5SDimitry Andric // Merge/Unmerge 17850b57cec5SDimitry Andric for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 17860b57cec5SDimitry Andric unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 17870b57cec5SDimitry Andric unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 17880b57cec5SDimitry Andric 17890b57cec5SDimitry Andric auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 17905ffd83dbSDimitry Andric const LLT Ty = Query.Types[TypeIdx]; 17910b57cec5SDimitry Andric if (Ty.isVector()) { 17920b57cec5SDimitry Andric const LLT &EltTy = Ty.getElementType(); 17935ffd83dbSDimitry Andric if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) 17940b57cec5SDimitry Andric return true; 179506c3fb27SDimitry Andric if (!llvm::has_single_bit<uint32_t>(EltTy.getSizeInBits())) 17960b57cec5SDimitry Andric return true; 17970b57cec5SDimitry Andric } 17980b57cec5SDimitry Andric return false; 17990b57cec5SDimitry Andric }; 18000b57cec5SDimitry Andric 18018bcb0991SDimitry Andric auto &Builder = getActionDefinitionsBuilder(Op) 1802e8d8bef9SDimitry Andric .legalIf(all(isRegisterType(0), isRegisterType(1))) 18035ffd83dbSDimitry Andric .lowerFor({{S16, V2S16}}) 18045ffd83dbSDimitry Andric .lowerIf([=](const LegalityQuery &Query) { 18055ffd83dbSDimitry Andric const LLT BigTy = Query.Types[BigTyIdx]; 18065ffd83dbSDimitry Andric return BigTy.getSizeInBits() == 32; 18075ffd83dbSDimitry Andric }) 18085ffd83dbSDimitry Andric // Try to widen to s16 first for small types. 
18095ffd83dbSDimitry Andric // TODO: Only do this on targets with legal s16 shifts 18105ffd83dbSDimitry Andric .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) 18110b57cec5SDimitry Andric .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 18128bcb0991SDimitry Andric .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 18138bcb0991SDimitry Andric .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 18148bcb0991SDimitry Andric elementTypeIs(1, S16)), 18158bcb0991SDimitry Andric changeTo(1, V2S16)) 18165ffd83dbSDimitry Andric // Clamp the little scalar to s8-s256 and make it a power of 2. It's not 18175ffd83dbSDimitry Andric // worth considering the multiples of 64 since 2*192 and 2*384 are not 18185ffd83dbSDimitry Andric // valid. 18195ffd83dbSDimitry Andric .clampScalar(LitTyIdx, S32, S512) 18205ffd83dbSDimitry Andric .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 18210b57cec5SDimitry Andric // Break up vectors with weird elements into scalars 18220b57cec5SDimitry Andric .fewerElementsIf( 18235ffd83dbSDimitry Andric [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, 18240b57cec5SDimitry Andric scalarize(0)) 18250b57cec5SDimitry Andric .fewerElementsIf( 18265ffd83dbSDimitry Andric [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, 18270b57cec5SDimitry Andric scalarize(1)) 18285ffd83dbSDimitry Andric .clampScalar(BigTyIdx, S32, MaxScalar); 18298bcb0991SDimitry Andric 18308bcb0991SDimitry Andric if (Op == G_MERGE_VALUES) { 18318bcb0991SDimitry Andric Builder.widenScalarIf( 18328bcb0991SDimitry Andric // TODO: Use 16-bit shifts if legal for 8-bit values? 
18330b57cec5SDimitry Andric [=](const LegalityQuery &Query) { 18348bcb0991SDimitry Andric const LLT Ty = Query.Types[LitTyIdx]; 18358bcb0991SDimitry Andric return Ty.getSizeInBits() < 32; 18368bcb0991SDimitry Andric }, 18378bcb0991SDimitry Andric changeTo(LitTyIdx, S32)); 18388bcb0991SDimitry Andric } 18398bcb0991SDimitry Andric 18408bcb0991SDimitry Andric Builder.widenScalarIf( 18418bcb0991SDimitry Andric [=](const LegalityQuery &Query) { 18428bcb0991SDimitry Andric const LLT Ty = Query.Types[BigTyIdx]; 184306c3fb27SDimitry Andric return Ty.getSizeInBits() % 16 != 0; 18440b57cec5SDimitry Andric }, 18450b57cec5SDimitry Andric [=](const LegalityQuery &Query) { 18460b57cec5SDimitry Andric // Pick the next power of 2, or a multiple of 64 over 128. 18470b57cec5SDimitry Andric // Whichever is smaller. 18480b57cec5SDimitry Andric const LLT &Ty = Query.Types[BigTyIdx]; 18490b57cec5SDimitry Andric unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 18500b57cec5SDimitry Andric if (NewSizeInBits >= 256) { 18510b57cec5SDimitry Andric unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 18520b57cec5SDimitry Andric if (RoundedTo < NewSizeInBits) 18530b57cec5SDimitry Andric NewSizeInBits = RoundedTo; 18540b57cec5SDimitry Andric } 1855bdd1243dSDimitry Andric return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 18560b57cec5SDimitry Andric }) 18570b57cec5SDimitry Andric // Any vectors left are the wrong size. Scalarize them. 18580b57cec5SDimitry Andric .scalarize(0) 18590b57cec5SDimitry Andric .scalarize(1); 18600b57cec5SDimitry Andric } 18610b57cec5SDimitry Andric 18625ffd83dbSDimitry Andric // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 18635ffd83dbSDimitry Andric // RegBankSelect. 
18645ffd83dbSDimitry Andric auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG) 18655ffd83dbSDimitry Andric .legalFor({{S32}, {S64}}); 18668bcb0991SDimitry Andric 18675ffd83dbSDimitry Andric if (ST.hasVOP3PInsts()) { 18685ffd83dbSDimitry Andric SextInReg.lowerFor({{V2S16}}) 18695ffd83dbSDimitry Andric // Prefer to reduce vector widths for 16-bit vectors before lowering, to 18705ffd83dbSDimitry Andric // get more vector shift opportunities, since we'll get those when 18715ffd83dbSDimitry Andric // expanded. 18720eae32dcSDimitry Andric .clampMaxNumElementsStrict(0, S16, 2); 18735ffd83dbSDimitry Andric } else if (ST.has16BitInsts()) { 18745ffd83dbSDimitry Andric SextInReg.lowerFor({{S32}, {S64}, {S16}}); 18755ffd83dbSDimitry Andric } else { 18765ffd83dbSDimitry Andric // Prefer to promote to s32 before lowering if we don't have 16-bit 18775ffd83dbSDimitry Andric // shifts. This avoid a lot of intermediate truncate and extend operations. 18785ffd83dbSDimitry Andric SextInReg.lowerFor({{S32}, {S64}}); 18795ffd83dbSDimitry Andric } 18805ffd83dbSDimitry Andric 18815ffd83dbSDimitry Andric SextInReg 18825ffd83dbSDimitry Andric .scalarize(0) 18835ffd83dbSDimitry Andric .clampScalar(0, S32, S64) 18845ffd83dbSDimitry Andric .lower(); 18855ffd83dbSDimitry Andric 1886349cc55cSDimitry Andric getActionDefinitionsBuilder({G_ROTR, G_ROTL}) 1887349cc55cSDimitry Andric .scalarize(0) 1888349cc55cSDimitry Andric .lower(); 1889349cc55cSDimitry Andric 1890fe6060f1SDimitry Andric // TODO: Only Try to form v2s16 with legal packed instructions. 
18915ffd83dbSDimitry Andric getActionDefinitionsBuilder(G_FSHR) 18925ffd83dbSDimitry Andric .legalFor({{S32, S32}}) 1893fe6060f1SDimitry Andric .lowerFor({{V2S16, V2S16}}) 18940eae32dcSDimitry Andric .clampMaxNumElementsStrict(0, S16, 2) 18955ffd83dbSDimitry Andric .scalarize(0) 18965ffd83dbSDimitry Andric .lower(); 1897480093f4SDimitry Andric 1898fe6060f1SDimitry Andric if (ST.hasVOP3PInsts()) { 1899fe6060f1SDimitry Andric getActionDefinitionsBuilder(G_FSHL) 1900fe6060f1SDimitry Andric .lowerFor({{V2S16, V2S16}}) 19010eae32dcSDimitry Andric .clampMaxNumElementsStrict(0, S16, 2) 1902fe6060f1SDimitry Andric .scalarize(0) 1903fe6060f1SDimitry Andric .lower(); 1904fe6060f1SDimitry Andric } else { 1905fe6060f1SDimitry Andric getActionDefinitionsBuilder(G_FSHL) 1906fe6060f1SDimitry Andric .scalarize(0) 1907fe6060f1SDimitry Andric .lower(); 1908fe6060f1SDimitry Andric } 1909fe6060f1SDimitry Andric 1910480093f4SDimitry Andric getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1911480093f4SDimitry Andric .legalFor({S64}); 1912480093f4SDimitry Andric 1913e8d8bef9SDimitry Andric getActionDefinitionsBuilder(G_FENCE) 1914e8d8bef9SDimitry Andric .alwaysLegal(); 1915e8d8bef9SDimitry Andric 1916fe6060f1SDimitry Andric getActionDefinitionsBuilder({G_SMULO, G_UMULO}) 1917fe6060f1SDimitry Andric .scalarize(0) 1918fe6060f1SDimitry Andric .minScalar(0, S32) 1919fe6060f1SDimitry Andric .lower(); 1920fe6060f1SDimitry Andric 1921fe6060f1SDimitry Andric getActionDefinitionsBuilder({G_SBFX, G_UBFX}) 1922fe6060f1SDimitry Andric .legalFor({{S32, S32}, {S64, S32}}) 1923fe6060f1SDimitry Andric .clampScalar(1, S32, S32) 1924fe6060f1SDimitry Andric .clampScalar(0, S32, S64) 1925fe6060f1SDimitry Andric .widenScalarToNextPow2(0) 1926fe6060f1SDimitry Andric .scalarize(0); 1927fe6060f1SDimitry Andric 19285ffd83dbSDimitry Andric getActionDefinitionsBuilder({ 19295ffd83dbSDimitry Andric // TODO: Verify V_BFI_B32 is generated from expanded bit ops 19305ffd83dbSDimitry Andric G_FCOPYSIGN, 
      G_ATOMIC_CMPXCHG_WITH_SUCCESS,
      G_ATOMICRMW_NAND,
      G_ATOMICRMW_FSUB,
      G_READ_REGISTER,
      G_WRITE_REGISTER,

      G_SADDO, G_SSUBO,

      // TODO: Implement
      G_FMINIMUM, G_FMAXIMUM}).lower();

  // Memory intrinsics are always expanded to explicit load/store sequences.
  getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
      .lower();

  // These operations are never produced for AMDGPU.
  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
                               G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
                               G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
      .unsupported();

  getLegacyLegalizerInfo().computeTables();
  verify(*ST.getInstrInfo());
}

/// Entry point for operations the rule tables above marked custom(). Each
/// case forwards to a dedicated helper; returning false reports that no
/// custom lowering was applied for this opcode.
bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                         MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_FREM:
    return legalizeFrem(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(Helper, MI);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_ZEXTLOAD:
    return legalizeLoad(Helper, MI);
  case TargetOpcode::G_STORE:
    return legalizeStore(Helper, MI);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_FFREXP:
    return legalizeFFREXP(MI, MRI, B);
  case TargetOpcode::G_FSQRT:
    return legalizeFSQRT(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_UDIVREM:
    return legalizeUnsignedDIV_REM(MI, MRI, B);
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_SDIVREM:
    return legalizeSignedDIV_REM(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  case TargetOpcode::G_FLOG2:
    return legalizeFlog2(MI, B);
  case TargetOpcode::G_FLOG:
  case TargetOpcode::G_FLOG10:
    return legalizeFlogCommon(MI, B);
  case TargetOpcode::G_FEXP2:
    return legalizeFExp2(MI, B);
  case TargetOpcode::G_FEXP:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FPOW:
    return legalizeFPow(MI, B);
  case TargetOpcode::G_FFLOOR:
    return legalizeFFloor(MI, MRI, B);
  case TargetOpcode::G_BUILD_VECTOR:
  case TargetOpcode::G_BUILD_VECTOR_TRUNC:
    return legalizeBuildVector(MI, MRI, B);
  case TargetOpcode::G_MUL:
    return legalizeMul(Helper, MI);
  case TargetOpcode::G_CTLZ:
  case TargetOpcode::G_CTTZ:
    return legalizeCTLZ_CTTZ(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
    return legalizeFPTruncRound(MI, B);
  default:
    return false;
  }

  // Every case above returns; control must not fall out of the switch.
  llvm_unreachable("expected switch to return");
}

/// Produce a 32-bit value holding the high half of the aperture base for the
/// LOCAL or PRIVATE segment \p AS. Returns an invalid Register on failure
/// (e.g. when a required input argument cannot be materialized).
Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // Note: this register is somewhat broken. When used as a 32-bit operand,
    // it only returns zeroes. The real value is in the upper 32 bits.
    // Thus, we must emit extract the high 32 bits.
    const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
                                       ?
                                       AMDGPU::SRC_SHARED_BASE
                                       : AMDGPU::SRC_PRIVATE_BASE;
    // FIXME: It would be more natural to emit a COPY here, but then copy
    // coalescing would kick in and it would think it's okay to use the "HI"
    // subregister (instead of extracting the HI 32 bits) which is an artificial
    // (unusable) register.
    // Register TableGen definitions would need an overhaul to get rid of the
    // artificial "HI" aperture registers and prevent this kind of issue from
    // happening.
    // Materialize the full 64-bit aperture register, then keep only the high
    // 32 bits (the second piece of the unmerge).
    Register Dst = MRI.createGenericVirtualRegister(S64);
    MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
    B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
    return B.buildUnmerge(S32, Dst).getReg(1);
  }

  // TODO: can we be smarter about machine pointer info?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  Register LoadAddr = MRI.createGenericVirtualRegister(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
  // For code object version 5, private_base and shared_base are passed through
  // implicit kernargs.
  if (AMDGPU::getCodeObjectVersion(*MF.getFunction().getParent()) >=
      AMDGPU::AMDHSA_COV5) {
    AMDGPUTargetLowering::ImplicitParameter Param =
        AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
                                      : AMDGPUTargetLowering::PRIVATE_BASE;
    uint64_t Offset =
        ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);

    Register KernargPtrReg = MRI.createGenericVirtualRegister(
        LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

    // This path needs the kernarg segment pointer; report failure with an
    // invalid register if it cannot be loaded.
    if (!loadInputValue(KernargPtrReg, B,
                        AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
      return Register();

    // Invariant, dereferenceable 32-bit load from the implicit kernargs.
    MachineMemOperand *MMO = MF.getMachineMemOperand(
        PtrInfo,
        MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
            MachineMemOperand::MOInvariant,
        LLT::scalar(32), commonAlignment(Align(64), Offset));

    // Pointer address
    B.buildPtrAdd(LoadAddr, KernargPtrReg,
                  B.buildConstant(LLT::scalar(64), Offset).getReg(0));
    // Load address
    return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
  }

  // Pre-COV5 fallback: read the aperture out of the queue descriptor.
  Register QueuePtr = MRI.createGenericVirtualRegister(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
21200b57cec5SDimitry Andric uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 21210b57cec5SDimitry Andric 21220b57cec5SDimitry Andric MachineMemOperand *MMO = MF.getMachineMemOperand( 21230b57cec5SDimitry Andric PtrInfo, 21245ffd83dbSDimitry Andric MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 21250b57cec5SDimitry Andric MachineMemOperand::MOInvariant, 2126fe6060f1SDimitry Andric LLT::scalar(32), commonAlignment(Align(64), StructOffset)); 21270b57cec5SDimitry Andric 212881ad6265SDimitry Andric B.buildPtrAdd(LoadAddr, QueuePtr, 212981ad6265SDimitry Andric B.buildConstant(LLT::scalar(64), StructOffset).getReg(0)); 21305ffd83dbSDimitry Andric return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 21310b57cec5SDimitry Andric } 21320b57cec5SDimitry Andric 213304eeddc0SDimitry Andric /// Return true if the value is a known valid address, such that a null check is 213404eeddc0SDimitry Andric /// not necessary. 213504eeddc0SDimitry Andric static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, 213604eeddc0SDimitry Andric const AMDGPUTargetMachine &TM, unsigned AddrSpace) { 213704eeddc0SDimitry Andric MachineInstr *Def = MRI.getVRegDef(Val); 213804eeddc0SDimitry Andric switch (Def->getOpcode()) { 213904eeddc0SDimitry Andric case AMDGPU::G_FRAME_INDEX: 214004eeddc0SDimitry Andric case AMDGPU::G_GLOBAL_VALUE: 214104eeddc0SDimitry Andric case AMDGPU::G_BLOCK_ADDR: 214204eeddc0SDimitry Andric return true; 214304eeddc0SDimitry Andric case AMDGPU::G_CONSTANT: { 214404eeddc0SDimitry Andric const ConstantInt *CI = Def->getOperand(1).getCImm(); 214504eeddc0SDimitry Andric return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace); 214604eeddc0SDimitry Andric } 214704eeddc0SDimitry Andric default: 214804eeddc0SDimitry Andric return false; 214904eeddc0SDimitry Andric } 215004eeddc0SDimitry Andric 215104eeddc0SDimitry Andric return false; 215204eeddc0SDimitry Andric } 215304eeddc0SDimitry Andric 21540b57cec5SDimitry Andric bool 
AMDGPULegalizerInfo::legalizeAddrSpaceCast( 21550b57cec5SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 21568bcb0991SDimitry Andric MachineIRBuilder &B) const { 21578bcb0991SDimitry Andric MachineFunction &MF = B.getMF(); 21580b57cec5SDimitry Andric 21598bcb0991SDimitry Andric const LLT S32 = LLT::scalar(32); 21600b57cec5SDimitry Andric Register Dst = MI.getOperand(0).getReg(); 21610b57cec5SDimitry Andric Register Src = MI.getOperand(1).getReg(); 21620b57cec5SDimitry Andric 21630b57cec5SDimitry Andric LLT DstTy = MRI.getType(Dst); 21640b57cec5SDimitry Andric LLT SrcTy = MRI.getType(Src); 21650b57cec5SDimitry Andric unsigned DestAS = DstTy.getAddressSpace(); 21660b57cec5SDimitry Andric unsigned SrcAS = SrcTy.getAddressSpace(); 21670b57cec5SDimitry Andric 21680b57cec5SDimitry Andric // TODO: Avoid reloading from the queue ptr for each cast, or at least each 21690b57cec5SDimitry Andric // vector element. 21700b57cec5SDimitry Andric assert(!DstTy.isVector()); 21710b57cec5SDimitry Andric 21720b57cec5SDimitry Andric const AMDGPUTargetMachine &TM 21730b57cec5SDimitry Andric = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 21740b57cec5SDimitry Andric 2175e8d8bef9SDimitry Andric if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { 21768bcb0991SDimitry Andric MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 21778bcb0991SDimitry Andric return true; 21788bcb0991SDimitry Andric } 21798bcb0991SDimitry Andric 218081ad6265SDimitry Andric if (SrcAS == AMDGPUAS::FLAT_ADDRESS && 218181ad6265SDimitry Andric (DestAS == AMDGPUAS::LOCAL_ADDRESS || 218281ad6265SDimitry Andric DestAS == AMDGPUAS::PRIVATE_ADDRESS)) { 218304eeddc0SDimitry Andric if (isKnownNonNull(Src, MRI, TM, SrcAS)) { 218404eeddc0SDimitry Andric // Extract low 32-bits of the pointer. 
218504eeddc0SDimitry Andric B.buildExtract(Dst, Src, 0); 218604eeddc0SDimitry Andric MI.eraseFromParent(); 218704eeddc0SDimitry Andric return true; 218804eeddc0SDimitry Andric } 218904eeddc0SDimitry Andric 21900b57cec5SDimitry Andric unsigned NullVal = TM.getNullPointerValue(DestAS); 21910b57cec5SDimitry Andric 21928bcb0991SDimitry Andric auto SegmentNull = B.buildConstant(DstTy, NullVal); 21938bcb0991SDimitry Andric auto FlatNull = B.buildConstant(SrcTy, 0); 21940b57cec5SDimitry Andric 21950b57cec5SDimitry Andric // Extract low 32-bits of the pointer. 21965ffd83dbSDimitry Andric auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 21970b57cec5SDimitry Andric 21985ffd83dbSDimitry Andric auto CmpRes = 21995ffd83dbSDimitry Andric B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 22008bcb0991SDimitry Andric B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 22010b57cec5SDimitry Andric 22020b57cec5SDimitry Andric MI.eraseFromParent(); 22030b57cec5SDimitry Andric return true; 22040b57cec5SDimitry Andric } 22050b57cec5SDimitry Andric 220681ad6265SDimitry Andric if (DestAS == AMDGPUAS::FLAT_ADDRESS && 220781ad6265SDimitry Andric (SrcAS == AMDGPUAS::LOCAL_ADDRESS || 220881ad6265SDimitry Andric SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) { 22098bcb0991SDimitry Andric Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 22108bcb0991SDimitry Andric if (!ApertureReg.isValid()) 22118bcb0991SDimitry Andric return false; 22120b57cec5SDimitry Andric 22130b57cec5SDimitry Andric // Coerce the type of the low half of the result so we can use merge_values. 22145ffd83dbSDimitry Andric Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 22150b57cec5SDimitry Andric 22160b57cec5SDimitry Andric // TODO: Should we allow mismatched types but matching sizes in merges to 22170b57cec5SDimitry Andric // avoid the ptrtoint? 
2218bdd1243dSDimitry Andric auto BuildPtr = B.buildMergeLikeInstr(DstTy, {SrcAsInt, ApertureReg}); 221904eeddc0SDimitry Andric 222004eeddc0SDimitry Andric if (isKnownNonNull(Src, MRI, TM, SrcAS)) { 222104eeddc0SDimitry Andric B.buildCopy(Dst, BuildPtr); 222204eeddc0SDimitry Andric MI.eraseFromParent(); 222304eeddc0SDimitry Andric return true; 222404eeddc0SDimitry Andric } 222504eeddc0SDimitry Andric 222604eeddc0SDimitry Andric auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 222704eeddc0SDimitry Andric auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 222804eeddc0SDimitry Andric 222981ad6265SDimitry Andric auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, 223081ad6265SDimitry Andric SegmentNull.getReg(0)); 223104eeddc0SDimitry Andric 22325ffd83dbSDimitry Andric B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 22330b57cec5SDimitry Andric 22340b57cec5SDimitry Andric MI.eraseFromParent(); 22350b57cec5SDimitry Andric return true; 22360b57cec5SDimitry Andric } 22370b57cec5SDimitry Andric 223881ad6265SDimitry Andric if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && 223981ad6265SDimitry Andric SrcTy.getSizeInBits() == 64) { 224081ad6265SDimitry Andric // Truncate. 
224181ad6265SDimitry Andric B.buildExtract(Dst, Src, 0); 224281ad6265SDimitry Andric MI.eraseFromParent(); 224381ad6265SDimitry Andric return true; 224481ad6265SDimitry Andric } 224581ad6265SDimitry Andric 224681ad6265SDimitry Andric if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && 224781ad6265SDimitry Andric DstTy.getSizeInBits() == 64) { 224881ad6265SDimitry Andric const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 224981ad6265SDimitry Andric uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 2250bdd1243dSDimitry Andric auto PtrLo = B.buildPtrToInt(S32, Src); 2251bdd1243dSDimitry Andric auto HighAddr = B.buildConstant(S32, AddrHiVal); 2252bdd1243dSDimitry Andric B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr}); 225381ad6265SDimitry Andric MI.eraseFromParent(); 225481ad6265SDimitry Andric return true; 225581ad6265SDimitry Andric } 225681ad6265SDimitry Andric 225781ad6265SDimitry Andric DiagnosticInfoUnsupported InvalidAddrSpaceCast( 225881ad6265SDimitry Andric MF.getFunction(), "invalid addrspacecast", B.getDebugLoc()); 225981ad6265SDimitry Andric 226081ad6265SDimitry Andric LLVMContext &Ctx = MF.getFunction().getContext(); 226181ad6265SDimitry Andric Ctx.diagnose(InvalidAddrSpaceCast); 226281ad6265SDimitry Andric B.buildUndef(Dst); 226381ad6265SDimitry Andric MI.eraseFromParent(); 226481ad6265SDimitry Andric return true; 226581ad6265SDimitry Andric } 226681ad6265SDimitry Andric 22670b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeFrint( 22680b57cec5SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 22698bcb0991SDimitry Andric MachineIRBuilder &B) const { 22700b57cec5SDimitry Andric Register Src = MI.getOperand(1).getReg(); 22710b57cec5SDimitry Andric LLT Ty = MRI.getType(Src); 22720b57cec5SDimitry Andric assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 22730b57cec5SDimitry Andric 22740b57cec5SDimitry Andric APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 22750b57cec5SDimitry Andric APFloat C2Val(APFloat::IEEEdouble(), 
"0x1.fffffffffffffp+51"); 22760b57cec5SDimitry Andric 22778bcb0991SDimitry Andric auto C1 = B.buildFConstant(Ty, C1Val); 22788bcb0991SDimitry Andric auto CopySign = B.buildFCopysign(Ty, C1, Src); 22790b57cec5SDimitry Andric 22800b57cec5SDimitry Andric // TODO: Should this propagate fast-math-flags? 22818bcb0991SDimitry Andric auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 22828bcb0991SDimitry Andric auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 22830b57cec5SDimitry Andric 22848bcb0991SDimitry Andric auto C2 = B.buildFConstant(Ty, C2Val); 22858bcb0991SDimitry Andric auto Fabs = B.buildFAbs(Ty, Src); 22860b57cec5SDimitry Andric 22878bcb0991SDimitry Andric auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 22888bcb0991SDimitry Andric B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 2289e8d8bef9SDimitry Andric MI.eraseFromParent(); 22900b57cec5SDimitry Andric return true; 22910b57cec5SDimitry Andric } 22920b57cec5SDimitry Andric 22930b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeFceil( 22940b57cec5SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 22950b57cec5SDimitry Andric MachineIRBuilder &B) const { 22960b57cec5SDimitry Andric 22970b57cec5SDimitry Andric const LLT S1 = LLT::scalar(1); 22980b57cec5SDimitry Andric const LLT S64 = LLT::scalar(64); 22990b57cec5SDimitry Andric 23000b57cec5SDimitry Andric Register Src = MI.getOperand(1).getReg(); 23010b57cec5SDimitry Andric assert(MRI.getType(Src) == S64); 23020b57cec5SDimitry Andric 23030b57cec5SDimitry Andric // result = trunc(src) 23040b57cec5SDimitry Andric // if (src > 0.0 && src != result) 23050b57cec5SDimitry Andric // result += 1.0 23060b57cec5SDimitry Andric 23075ffd83dbSDimitry Andric auto Trunc = B.buildIntrinsicTrunc(S64, Src); 23080b57cec5SDimitry Andric 23090b57cec5SDimitry Andric const auto Zero = B.buildFConstant(S64, 0.0); 23100b57cec5SDimitry Andric const auto One = B.buildFConstant(S64, 1.0); 23110b57cec5SDimitry Andric auto Lt0 = 
B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 23120b57cec5SDimitry Andric auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 23130b57cec5SDimitry Andric auto And = B.buildAnd(S1, Lt0, NeTrunc); 23140b57cec5SDimitry Andric auto Add = B.buildSelect(S64, And, One, Zero); 23150b57cec5SDimitry Andric 23160b57cec5SDimitry Andric // TODO: Should this propagate fast-math-flags? 23170b57cec5SDimitry Andric B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 231804eeddc0SDimitry Andric MI.eraseFromParent(); 23190b57cec5SDimitry Andric return true; 23200b57cec5SDimitry Andric } 23210b57cec5SDimitry Andric 2322e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeFrem( 2323e8d8bef9SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 2324e8d8bef9SDimitry Andric MachineIRBuilder &B) const { 2325e8d8bef9SDimitry Andric Register DstReg = MI.getOperand(0).getReg(); 2326e8d8bef9SDimitry Andric Register Src0Reg = MI.getOperand(1).getReg(); 2327e8d8bef9SDimitry Andric Register Src1Reg = MI.getOperand(2).getReg(); 2328e8d8bef9SDimitry Andric auto Flags = MI.getFlags(); 2329e8d8bef9SDimitry Andric LLT Ty = MRI.getType(DstReg); 2330e8d8bef9SDimitry Andric 2331e8d8bef9SDimitry Andric auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 2332e8d8bef9SDimitry Andric auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 2333e8d8bef9SDimitry Andric auto Neg = B.buildFNeg(Ty, Trunc, Flags); 2334e8d8bef9SDimitry Andric B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 2335e8d8bef9SDimitry Andric MI.eraseFromParent(); 2336e8d8bef9SDimitry Andric return true; 2337e8d8bef9SDimitry Andric } 2338e8d8bef9SDimitry Andric 2339e8d8bef9SDimitry Andric static MachineInstrBuilder extractF64Exponent(Register Hi, 23400b57cec5SDimitry Andric MachineIRBuilder &B) { 23410b57cec5SDimitry Andric const unsigned FractBits = 52; 23420b57cec5SDimitry Andric const unsigned ExpBits = 11; 23430b57cec5SDimitry Andric LLT S32 = LLT::scalar(32); 23440b57cec5SDimitry Andric 23450b57cec5SDimitry Andric 
auto Const0 = B.buildConstant(S32, FractBits - 32); 23460b57cec5SDimitry Andric auto Const1 = B.buildConstant(S32, ExpBits); 23470b57cec5SDimitry Andric 23480b57cec5SDimitry Andric auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 2349e8d8bef9SDimitry Andric .addUse(Hi) 23500b57cec5SDimitry Andric .addUse(Const0.getReg(0)) 23510b57cec5SDimitry Andric .addUse(Const1.getReg(0)); 23520b57cec5SDimitry Andric 23530b57cec5SDimitry Andric return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 23540b57cec5SDimitry Andric } 23550b57cec5SDimitry Andric 23560b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 23570b57cec5SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 23580b57cec5SDimitry Andric MachineIRBuilder &B) const { 23590b57cec5SDimitry Andric const LLT S1 = LLT::scalar(1); 23600b57cec5SDimitry Andric const LLT S32 = LLT::scalar(32); 23610b57cec5SDimitry Andric const LLT S64 = LLT::scalar(64); 23620b57cec5SDimitry Andric 23630b57cec5SDimitry Andric Register Src = MI.getOperand(1).getReg(); 23640b57cec5SDimitry Andric assert(MRI.getType(Src) == S64); 23650b57cec5SDimitry Andric 23660b57cec5SDimitry Andric // TODO: Should this use extract since the low half is unused? 23670b57cec5SDimitry Andric auto Unmerge = B.buildUnmerge({S32, S32}, Src); 23680b57cec5SDimitry Andric Register Hi = Unmerge.getReg(1); 23690b57cec5SDimitry Andric 23700b57cec5SDimitry Andric // Extract the upper half, since this is where we will find the sign and 23710b57cec5SDimitry Andric // exponent. 23720b57cec5SDimitry Andric auto Exp = extractF64Exponent(Hi, B); 23730b57cec5SDimitry Andric 23740b57cec5SDimitry Andric const unsigned FractBits = 52; 23750b57cec5SDimitry Andric 23760b57cec5SDimitry Andric // Extract the sign bit. 
23770b57cec5SDimitry Andric const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 23780b57cec5SDimitry Andric auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 23790b57cec5SDimitry Andric 23800b57cec5SDimitry Andric const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 23810b57cec5SDimitry Andric 23820b57cec5SDimitry Andric const auto Zero32 = B.buildConstant(S32, 0); 23830b57cec5SDimitry Andric 23840b57cec5SDimitry Andric // Extend back to 64-bits. 2385bdd1243dSDimitry Andric auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit}); 23860b57cec5SDimitry Andric 23870b57cec5SDimitry Andric auto Shr = B.buildAShr(S64, FractMask, Exp); 23880b57cec5SDimitry Andric auto Not = B.buildNot(S64, Shr); 23890b57cec5SDimitry Andric auto Tmp0 = B.buildAnd(S64, Src, Not); 23900b57cec5SDimitry Andric auto FiftyOne = B.buildConstant(S32, FractBits - 1); 23910b57cec5SDimitry Andric 23920b57cec5SDimitry Andric auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 23930b57cec5SDimitry Andric auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 23940b57cec5SDimitry Andric 23950b57cec5SDimitry Andric auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 23960b57cec5SDimitry Andric B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 2397e8d8bef9SDimitry Andric MI.eraseFromParent(); 23980b57cec5SDimitry Andric return true; 23990b57cec5SDimitry Andric } 24000b57cec5SDimitry Andric 24010b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeITOFP( 24020b57cec5SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 24030b57cec5SDimitry Andric MachineIRBuilder &B, bool Signed) const { 24040b57cec5SDimitry Andric 24050b57cec5SDimitry Andric Register Dst = MI.getOperand(0).getReg(); 24060b57cec5SDimitry Andric Register Src = MI.getOperand(1).getReg(); 24070b57cec5SDimitry Andric 24080b57cec5SDimitry Andric const LLT S64 = LLT::scalar(64); 24090b57cec5SDimitry Andric const LLT S32 = LLT::scalar(32); 
24100b57cec5SDimitry Andric 2411349cc55cSDimitry Andric assert(MRI.getType(Src) == S64); 24120b57cec5SDimitry Andric 24130b57cec5SDimitry Andric auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2414349cc55cSDimitry Andric auto ThirtyTwo = B.buildConstant(S32, 32); 24150b57cec5SDimitry Andric 2416349cc55cSDimitry Andric if (MRI.getType(Dst) == S64) { 2417349cc55cSDimitry Andric auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1)) 2418349cc55cSDimitry Andric : B.buildUITOFP(S64, Unmerge.getReg(1)); 24190b57cec5SDimitry Andric 24200b57cec5SDimitry Andric auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 242106c3fb27SDimitry Andric auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo); 24220b57cec5SDimitry Andric 24230b57cec5SDimitry Andric // TODO: Should this propagate fast-math-flags? 24240b57cec5SDimitry Andric B.buildFAdd(Dst, LdExp, CvtLo); 24250b57cec5SDimitry Andric MI.eraseFromParent(); 24260b57cec5SDimitry Andric return true; 24270b57cec5SDimitry Andric } 24280b57cec5SDimitry Andric 2429349cc55cSDimitry Andric assert(MRI.getType(Dst) == S32); 2430349cc55cSDimitry Andric 2431349cc55cSDimitry Andric auto One = B.buildConstant(S32, 1); 2432349cc55cSDimitry Andric 2433349cc55cSDimitry Andric MachineInstrBuilder ShAmt; 2434349cc55cSDimitry Andric if (Signed) { 2435349cc55cSDimitry Andric auto ThirtyOne = B.buildConstant(S32, 31); 2436349cc55cSDimitry Andric auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1)); 2437349cc55cSDimitry Andric auto OppositeSign = B.buildAShr(S32, X, ThirtyOne); 2438349cc55cSDimitry Andric auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign); 2439349cc55cSDimitry Andric auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32}, 2440349cc55cSDimitry Andric /*HasSideEffects=*/false) 2441349cc55cSDimitry Andric .addUse(Unmerge.getReg(1)); 2442349cc55cSDimitry Andric auto LS2 = B.buildSub(S32, LS, One); 2443349cc55cSDimitry Andric ShAmt = B.buildUMin(S32, LS2, MaxShAmt); 2444349cc55cSDimitry Andric } else 
2445349cc55cSDimitry Andric ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1)); 2446349cc55cSDimitry Andric auto Norm = B.buildShl(S64, Src, ShAmt); 2447349cc55cSDimitry Andric auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm); 2448349cc55cSDimitry Andric auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0)); 2449349cc55cSDimitry Andric auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust); 2450349cc55cSDimitry Andric auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2); 2451349cc55cSDimitry Andric auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt); 245206c3fb27SDimitry Andric B.buildFLdexp(Dst, FVal, Scale); 2453349cc55cSDimitry Andric MI.eraseFromParent(); 2454349cc55cSDimitry Andric return true; 2455349cc55cSDimitry Andric } 2456349cc55cSDimitry Andric 24575ffd83dbSDimitry Andric // TODO: Copied from DAG implementation. Verify logic and document how this 24585ffd83dbSDimitry Andric // actually works. 2459fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI, 2460fe6060f1SDimitry Andric MachineRegisterInfo &MRI, 2461fe6060f1SDimitry Andric MachineIRBuilder &B, 2462fe6060f1SDimitry Andric bool Signed) const { 24635ffd83dbSDimitry Andric 24645ffd83dbSDimitry Andric Register Dst = MI.getOperand(0).getReg(); 24655ffd83dbSDimitry Andric Register Src = MI.getOperand(1).getReg(); 24665ffd83dbSDimitry Andric 24675ffd83dbSDimitry Andric const LLT S64 = LLT::scalar(64); 24685ffd83dbSDimitry Andric const LLT S32 = LLT::scalar(32); 24695ffd83dbSDimitry Andric 2470fe6060f1SDimitry Andric const LLT SrcLT = MRI.getType(Src); 2471fe6060f1SDimitry Andric assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64); 24725ffd83dbSDimitry Andric 24735ffd83dbSDimitry Andric unsigned Flags = MI.getFlags(); 24745ffd83dbSDimitry Andric 2475fe6060f1SDimitry Andric // The basic idea of converting a floating point number into a pair of 32-bit 2476fe6060f1SDimitry Andric // integers is illustrated as follows: 2477fe6060f1SDimitry Andric // 
2478fe6060f1SDimitry Andric // tf := trunc(val); 2479fe6060f1SDimitry Andric // hif := floor(tf * 2^-32); 2480fe6060f1SDimitry Andric // lof := tf - hif * 2^32; // lof is always positive due to floor. 2481fe6060f1SDimitry Andric // hi := fptoi(hif); 2482fe6060f1SDimitry Andric // lo := fptoi(lof); 2483fe6060f1SDimitry Andric // 2484fe6060f1SDimitry Andric auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags); 2485fe6060f1SDimitry Andric MachineInstrBuilder Sign; 2486fe6060f1SDimitry Andric if (Signed && SrcLT == S32) { 2487fe6060f1SDimitry Andric // However, a 32-bit floating point number has only 23 bits mantissa and 2488fe6060f1SDimitry Andric // it's not enough to hold all the significant bits of `lof` if val is 2489fe6060f1SDimitry Andric // negative. To avoid the loss of precision, We need to take the absolute 2490fe6060f1SDimitry Andric // value after truncating and flip the result back based on the original 2491fe6060f1SDimitry Andric // signedness. 2492fe6060f1SDimitry Andric Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31)); 2493fe6060f1SDimitry Andric Trunc = B.buildFAbs(S32, Trunc, Flags); 2494fe6060f1SDimitry Andric } 2495fe6060f1SDimitry Andric MachineInstrBuilder K0, K1; 2496fe6060f1SDimitry Andric if (SrcLT == S64) { 249706c3fb27SDimitry Andric K0 = B.buildFConstant( 249806c3fb27SDimitry Andric S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000))); 249906c3fb27SDimitry Andric K1 = B.buildFConstant( 250006c3fb27SDimitry Andric S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000))); 2501fe6060f1SDimitry Andric } else { 250206c3fb27SDimitry Andric K0 = B.buildFConstant( 250306c3fb27SDimitry Andric S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000))); 250406c3fb27SDimitry Andric K1 = B.buildFConstant( 250506c3fb27SDimitry Andric S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000))); 2506fe6060f1SDimitry Andric } 25075ffd83dbSDimitry Andric 2508fe6060f1SDimitry Andric auto Mul = B.buildFMul(SrcLT, Trunc, K0, 
Flags); 2509fe6060f1SDimitry Andric auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags); 2510fe6060f1SDimitry Andric auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags); 25115ffd83dbSDimitry Andric 2512fe6060f1SDimitry Andric auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul) 2513fe6060f1SDimitry Andric : B.buildFPTOUI(S32, FloorMul); 25145ffd83dbSDimitry Andric auto Lo = B.buildFPTOUI(S32, Fma); 25155ffd83dbSDimitry Andric 2516fe6060f1SDimitry Andric if (Signed && SrcLT == S32) { 2517fe6060f1SDimitry Andric // Flip the result based on the signedness, which is either all 0s or 1s. 2518bdd1243dSDimitry Andric Sign = B.buildMergeLikeInstr(S64, {Sign, Sign}); 2519fe6060f1SDimitry Andric // r := xor({lo, hi}, sign) - sign; 2520bdd1243dSDimitry Andric B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign), 2521bdd1243dSDimitry Andric Sign); 2522fe6060f1SDimitry Andric } else 2523bdd1243dSDimitry Andric B.buildMergeLikeInstr(Dst, {Lo, Hi}); 25245ffd83dbSDimitry Andric MI.eraseFromParent(); 25255ffd83dbSDimitry Andric 25265ffd83dbSDimitry Andric return true; 25275ffd83dbSDimitry Andric } 25285ffd83dbSDimitry Andric 25295ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 25305ffd83dbSDimitry Andric MachineInstr &MI) const { 25315ffd83dbSDimitry Andric MachineFunction &MF = Helper.MIRBuilder.getMF(); 25320b57cec5SDimitry Andric const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 25330b57cec5SDimitry Andric 25340b57cec5SDimitry Andric const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 25350b57cec5SDimitry Andric MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 25360b57cec5SDimitry Andric 25370b57cec5SDimitry Andric // With ieee_mode disabled, the instructions have the correct behavior 25380b57cec5SDimitry Andric // already for G_FMINNUM/G_FMAXNUM 25390b57cec5SDimitry Andric if (!MFI->getMode().IEEE) 25400b57cec5SDimitry Andric return !IsIEEEOp; 25410b57cec5SDimitry 
Andric 25420b57cec5SDimitry Andric if (IsIEEEOp) 25430b57cec5SDimitry Andric return true; 25440b57cec5SDimitry Andric 25450b57cec5SDimitry Andric return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 25460b57cec5SDimitry Andric } 25470b57cec5SDimitry Andric 25480b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 25490b57cec5SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 25500b57cec5SDimitry Andric MachineIRBuilder &B) const { 25510b57cec5SDimitry Andric // TODO: Should move some of this into LegalizerHelper. 25520b57cec5SDimitry Andric 25530b57cec5SDimitry Andric // TODO: Promote dynamic indexing of s16 to s32 25545ffd83dbSDimitry Andric 255506c3fb27SDimitry Andric Register Dst = MI.getOperand(0).getReg(); 255606c3fb27SDimitry Andric Register Vec = MI.getOperand(1).getReg(); 255706c3fb27SDimitry Andric 255806c3fb27SDimitry Andric LLT VecTy = MRI.getType(Vec); 255906c3fb27SDimitry Andric LLT EltTy = VecTy.getElementType(); 256006c3fb27SDimitry Andric assert(EltTy == MRI.getType(Dst)); 256106c3fb27SDimitry Andric 256206c3fb27SDimitry Andric // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts 256306c3fb27SDimitry Andric // but we can't go directly to that logic becasue you can't bitcast a vector 256406c3fb27SDimitry Andric // of pointers to a vector of integers. Therefore, introduce an intermediate 256506c3fb27SDimitry Andric // vector of integers using ptrtoint (and inttoptr on the output) in order to 256606c3fb27SDimitry Andric // drive the legalization forward. 
256706c3fb27SDimitry Andric if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) { 256806c3fb27SDimitry Andric LLT IntTy = LLT::scalar(EltTy.getSizeInBits()); 256906c3fb27SDimitry Andric LLT IntVecTy = VecTy.changeElementType(IntTy); 257006c3fb27SDimitry Andric 257106c3fb27SDimitry Andric auto IntVec = B.buildPtrToInt(IntVecTy, Vec); 257206c3fb27SDimitry Andric auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2)); 257306c3fb27SDimitry Andric B.buildIntToPtr(Dst, IntElt); 257406c3fb27SDimitry Andric 257506c3fb27SDimitry Andric MI.eraseFromParent(); 257606c3fb27SDimitry Andric return true; 257706c3fb27SDimitry Andric } 257806c3fb27SDimitry Andric 25795ffd83dbSDimitry Andric // FIXME: Artifact combiner probably should have replaced the truncated 25805ffd83dbSDimitry Andric // constant before this, so we shouldn't need 2581349cc55cSDimitry Andric // getIConstantVRegValWithLookThrough. 2582bdd1243dSDimitry Andric std::optional<ValueAndVReg> MaybeIdxVal = 2583349cc55cSDimitry Andric getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); 2584e8d8bef9SDimitry Andric if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 
25850b57cec5SDimitry Andric return true; 2586bdd1243dSDimitry Andric const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue(); 25870b57cec5SDimitry Andric 258804eeddc0SDimitry Andric if (IdxVal < VecTy.getNumElements()) { 258904eeddc0SDimitry Andric auto Unmerge = B.buildUnmerge(EltTy, Vec); 259004eeddc0SDimitry Andric B.buildCopy(Dst, Unmerge.getReg(IdxVal)); 259104eeddc0SDimitry Andric } else { 25920b57cec5SDimitry Andric B.buildUndef(Dst); 259304eeddc0SDimitry Andric } 25940b57cec5SDimitry Andric 25950b57cec5SDimitry Andric MI.eraseFromParent(); 25960b57cec5SDimitry Andric return true; 25970b57cec5SDimitry Andric } 25980b57cec5SDimitry Andric 25990b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 26000b57cec5SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 26010b57cec5SDimitry Andric MachineIRBuilder &B) const { 26020b57cec5SDimitry Andric // TODO: Should move some of this into LegalizerHelper. 26030b57cec5SDimitry Andric 26040b57cec5SDimitry Andric // TODO: Promote dynamic indexing of s16 to s32 26055ffd83dbSDimitry Andric 260606c3fb27SDimitry Andric Register Dst = MI.getOperand(0).getReg(); 260706c3fb27SDimitry Andric Register Vec = MI.getOperand(1).getReg(); 260806c3fb27SDimitry Andric Register Ins = MI.getOperand(2).getReg(); 260906c3fb27SDimitry Andric 261006c3fb27SDimitry Andric LLT VecTy = MRI.getType(Vec); 261106c3fb27SDimitry Andric LLT EltTy = VecTy.getElementType(); 261206c3fb27SDimitry Andric assert(EltTy == MRI.getType(Ins)); 261306c3fb27SDimitry Andric 261406c3fb27SDimitry Andric // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts 261506c3fb27SDimitry Andric // but we can't go directly to that logic becasue you can't bitcast a vector 261606c3fb27SDimitry Andric // of pointers to a vector of integers. 
Therefore, make the pointer vector 261706c3fb27SDimitry Andric // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd 261806c3fb27SDimitry Andric // new value, and then inttoptr the result vector back. This will then allow 261906c3fb27SDimitry Andric // the rest of legalization to take over. 262006c3fb27SDimitry Andric if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) { 262106c3fb27SDimitry Andric LLT IntTy = LLT::scalar(EltTy.getSizeInBits()); 262206c3fb27SDimitry Andric LLT IntVecTy = VecTy.changeElementType(IntTy); 262306c3fb27SDimitry Andric 262406c3fb27SDimitry Andric auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec); 262506c3fb27SDimitry Andric auto IntIns = B.buildPtrToInt(IntTy, Ins); 262606c3fb27SDimitry Andric auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns, 262706c3fb27SDimitry Andric MI.getOperand(3)); 262806c3fb27SDimitry Andric B.buildIntToPtr(Dst, IntVecDest); 262906c3fb27SDimitry Andric MI.eraseFromParent(); 263006c3fb27SDimitry Andric return true; 263106c3fb27SDimitry Andric } 263206c3fb27SDimitry Andric 26335ffd83dbSDimitry Andric // FIXME: Artifact combiner probably should have replaced the truncated 26345ffd83dbSDimitry Andric // constant before this, so we shouldn't need 2635349cc55cSDimitry Andric // getIConstantVRegValWithLookThrough. 2636bdd1243dSDimitry Andric std::optional<ValueAndVReg> MaybeIdxVal = 2637349cc55cSDimitry Andric getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 2638e8d8bef9SDimitry Andric if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 
26390b57cec5SDimitry Andric return true; 26400b57cec5SDimitry Andric 2641bdd1243dSDimitry Andric const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue(); 26420b57cec5SDimitry Andric 264304eeddc0SDimitry Andric unsigned NumElts = VecTy.getNumElements(); 264404eeddc0SDimitry Andric if (IdxVal < NumElts) { 264504eeddc0SDimitry Andric SmallVector<Register, 8> SrcRegs; 264604eeddc0SDimitry Andric for (unsigned i = 0; i < NumElts; ++i) 264704eeddc0SDimitry Andric SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy)); 264804eeddc0SDimitry Andric B.buildUnmerge(SrcRegs, Vec); 264904eeddc0SDimitry Andric 265004eeddc0SDimitry Andric SrcRegs[IdxVal] = MI.getOperand(2).getReg(); 2651bdd1243dSDimitry Andric B.buildMergeLikeInstr(Dst, SrcRegs); 265204eeddc0SDimitry Andric } else { 26530b57cec5SDimitry Andric B.buildUndef(Dst); 265404eeddc0SDimitry Andric } 26550b57cec5SDimitry Andric 26560b57cec5SDimitry Andric MI.eraseFromParent(); 26570b57cec5SDimitry Andric return true; 26580b57cec5SDimitry Andric } 26590b57cec5SDimitry Andric 26608bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeSinCos( 26618bcb0991SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 26628bcb0991SDimitry Andric MachineIRBuilder &B) const { 26638bcb0991SDimitry Andric 26648bcb0991SDimitry Andric Register DstReg = MI.getOperand(0).getReg(); 26658bcb0991SDimitry Andric Register SrcReg = MI.getOperand(1).getReg(); 26668bcb0991SDimitry Andric LLT Ty = MRI.getType(DstReg); 26678bcb0991SDimitry Andric unsigned Flags = MI.getFlags(); 26688bcb0991SDimitry Andric 26698bcb0991SDimitry Andric Register TrigVal; 26705ffd83dbSDimitry Andric auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 26718bcb0991SDimitry Andric if (ST.hasTrigReducedRange()) { 26728bcb0991SDimitry Andric auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 26738bcb0991SDimitry Andric TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 26748bcb0991SDimitry Andric .addUse(MulVal.getReg(0)) 
26758bcb0991SDimitry Andric .setMIFlags(Flags).getReg(0); 26768bcb0991SDimitry Andric } else 26778bcb0991SDimitry Andric TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 26788bcb0991SDimitry Andric 26798bcb0991SDimitry Andric Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 26808bcb0991SDimitry Andric Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2681bdd1243dSDimitry Andric B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg), false) 26828bcb0991SDimitry Andric .addUse(TrigVal) 26838bcb0991SDimitry Andric .setMIFlags(Flags); 26848bcb0991SDimitry Andric MI.eraseFromParent(); 26858bcb0991SDimitry Andric return true; 26868bcb0991SDimitry Andric } 26878bcb0991SDimitry Andric 26885ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 26895ffd83dbSDimitry Andric MachineIRBuilder &B, 26905ffd83dbSDimitry Andric const GlobalValue *GV, 26915ffd83dbSDimitry Andric int64_t Offset, 26925ffd83dbSDimitry Andric unsigned GAFlags) const { 26935ffd83dbSDimitry Andric assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 26948bcb0991SDimitry Andric // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 26958bcb0991SDimitry Andric // to the following code sequence: 26968bcb0991SDimitry Andric // 26978bcb0991SDimitry Andric // For constant address space: 26988bcb0991SDimitry Andric // s_getpc_b64 s[0:1] 26998bcb0991SDimitry Andric // s_add_u32 s0, s0, $symbol 27008bcb0991SDimitry Andric // s_addc_u32 s1, s1, 0 27018bcb0991SDimitry Andric // 27028bcb0991SDimitry Andric // s_getpc_b64 returns the address of the s_add_u32 instruction and then 27038bcb0991SDimitry Andric // a fixup or relocation is emitted to replace $symbol with a literal 27048bcb0991SDimitry Andric // constant, which is a pc-relative offset from the encoding of the $symbol 27058bcb0991SDimitry Andric // operand to the global variable. 
27068bcb0991SDimitry Andric // 27078bcb0991SDimitry Andric // For global address space: 27088bcb0991SDimitry Andric // s_getpc_b64 s[0:1] 27098bcb0991SDimitry Andric // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 27108bcb0991SDimitry Andric // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 27118bcb0991SDimitry Andric // 27128bcb0991SDimitry Andric // s_getpc_b64 returns the address of the s_add_u32 instruction and then 27138bcb0991SDimitry Andric // fixups or relocations are emitted to replace $symbol@*@lo and 27148bcb0991SDimitry Andric // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 27158bcb0991SDimitry Andric // which is a 64-bit pc-relative offset from the encoding of the $symbol 27168bcb0991SDimitry Andric // operand to the global variable. 27178bcb0991SDimitry Andric // 27188bcb0991SDimitry Andric // What we want here is an offset from the value returned by s_getpc 27198bcb0991SDimitry Andric // (which is the address of the s_add_u32 instruction) to the global 27208bcb0991SDimitry Andric // variable, but since the encoding of $symbol starts 4 bytes after the start 27218bcb0991SDimitry Andric // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 27228bcb0991SDimitry Andric // small. This requires us to add 4 to the global variable offset in order to 2723e8d8bef9SDimitry Andric // compute the correct address. Similarly for the s_addc_u32 instruction, the 2724e8d8bef9SDimitry Andric // encoding of $symbol starts 12 bytes after the start of the s_add_u32 2725e8d8bef9SDimitry Andric // instruction. 27268bcb0991SDimitry Andric 27278bcb0991SDimitry Andric LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 27288bcb0991SDimitry Andric 27298bcb0991SDimitry Andric Register PCReg = PtrTy.getSizeInBits() != 32 ? 
DstReg : 27308bcb0991SDimitry Andric B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 27318bcb0991SDimitry Andric 27328bcb0991SDimitry Andric MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 27338bcb0991SDimitry Andric .addDef(PCReg); 27348bcb0991SDimitry Andric 27358bcb0991SDimitry Andric MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 27368bcb0991SDimitry Andric if (GAFlags == SIInstrInfo::MO_NONE) 27378bcb0991SDimitry Andric MIB.addImm(0); 27388bcb0991SDimitry Andric else 2739e8d8bef9SDimitry Andric MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); 27408bcb0991SDimitry Andric 274106c3fb27SDimitry Andric if (!B.getMRI()->getRegClassOrNull(PCReg)) 27428bcb0991SDimitry Andric B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 27438bcb0991SDimitry Andric 27448bcb0991SDimitry Andric if (PtrTy.getSizeInBits() == 32) 27458bcb0991SDimitry Andric B.buildExtract(DstReg, PCReg, 0); 27468bcb0991SDimitry Andric return true; 27478bcb0991SDimitry Andric } 27488bcb0991SDimitry Andric 27498bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeGlobalValue( 27508bcb0991SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 27518bcb0991SDimitry Andric MachineIRBuilder &B) const { 27528bcb0991SDimitry Andric Register DstReg = MI.getOperand(0).getReg(); 27538bcb0991SDimitry Andric LLT Ty = MRI.getType(DstReg); 27548bcb0991SDimitry Andric unsigned AS = Ty.getAddressSpace(); 27558bcb0991SDimitry Andric 27568bcb0991SDimitry Andric const GlobalValue *GV = MI.getOperand(1).getGlobal(); 27578bcb0991SDimitry Andric MachineFunction &MF = B.getMF(); 27588bcb0991SDimitry Andric SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 27598bcb0991SDimitry Andric 27608bcb0991SDimitry Andric if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2761fe6060f1SDimitry Andric if (!MFI->isModuleEntryFunction() && 2762fe6060f1SDimitry Andric !GV->getName().equals("llvm.amdgcn.module.lds")) { 27638bcb0991SDimitry Andric const Function 
&Fn = MF.getFunction(); 27648bcb0991SDimitry Andric DiagnosticInfoUnsupported BadLDSDecl( 27655ffd83dbSDimitry Andric Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 27665ffd83dbSDimitry Andric DS_Warning); 27678bcb0991SDimitry Andric Fn.getContext().diagnose(BadLDSDecl); 27685ffd83dbSDimitry Andric 27695ffd83dbSDimitry Andric // We currently don't have a way to correctly allocate LDS objects that 27705ffd83dbSDimitry Andric // aren't directly associated with a kernel. We do force inlining of 27715ffd83dbSDimitry Andric // functions that use local objects. However, if these dead functions are 27725ffd83dbSDimitry Andric // not eliminated, we don't want a compile time error. Just emit a warning 27735ffd83dbSDimitry Andric // and a trap, since there should be no callable path here. 27745ffd83dbSDimitry Andric B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 27755ffd83dbSDimitry Andric B.buildUndef(DstReg); 27765ffd83dbSDimitry Andric MI.eraseFromParent(); 27775ffd83dbSDimitry Andric return true; 27788bcb0991SDimitry Andric } 27798bcb0991SDimitry Andric 27808bcb0991SDimitry Andric // TODO: We could emit code to handle the initialization somewhere. 2781349cc55cSDimitry Andric // We ignore the initializer for now and legalize it to allow selection. 2782349cc55cSDimitry Andric // The initializer will anyway get errored out during assembly emission. 
27835ffd83dbSDimitry Andric const SITargetLowering *TLI = ST.getTargetLowering(); 27845ffd83dbSDimitry Andric if (!TLI->shouldUseLDSConstAddress(GV)) { 27855ffd83dbSDimitry Andric MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 27865ffd83dbSDimitry Andric return true; // Leave in place; 27875ffd83dbSDimitry Andric } 27885ffd83dbSDimitry Andric 2789e8d8bef9SDimitry Andric if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) { 2790e8d8bef9SDimitry Andric Type *Ty = GV->getValueType(); 2791e8d8bef9SDimitry Andric // HIP uses an unsized array `extern __shared__ T s[]` or similar 2792e8d8bef9SDimitry Andric // zero-sized type in other languages to declare the dynamic shared 2793e8d8bef9SDimitry Andric // memory which size is not known at the compile time. They will be 2794e8d8bef9SDimitry Andric // allocated by the runtime and placed directly after the static 2795e8d8bef9SDimitry Andric // allocated ones. They all share the same offset. 2796e8d8bef9SDimitry Andric if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) { 2797e8d8bef9SDimitry Andric // Adjust alignment for that dynamic shared memory array. 
279806c3fb27SDimitry Andric MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV)); 2799e8d8bef9SDimitry Andric LLT S32 = LLT::scalar(32); 2800e8d8bef9SDimitry Andric auto Sz = 2801e8d8bef9SDimitry Andric B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false); 2802e8d8bef9SDimitry Andric B.buildIntToPtr(DstReg, Sz); 2803e8d8bef9SDimitry Andric MI.eraseFromParent(); 2804e8d8bef9SDimitry Andric return true; 2805e8d8bef9SDimitry Andric } 2806e8d8bef9SDimitry Andric } 2807e8d8bef9SDimitry Andric 2808349cc55cSDimitry Andric B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), 2809349cc55cSDimitry Andric *cast<GlobalVariable>(GV))); 28108bcb0991SDimitry Andric MI.eraseFromParent(); 28118bcb0991SDimitry Andric return true; 28128bcb0991SDimitry Andric } 28138bcb0991SDimitry Andric 28148bcb0991SDimitry Andric const SITargetLowering *TLI = ST.getTargetLowering(); 28158bcb0991SDimitry Andric 28168bcb0991SDimitry Andric if (TLI->shouldEmitFixup(GV)) { 28178bcb0991SDimitry Andric buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 28188bcb0991SDimitry Andric MI.eraseFromParent(); 28198bcb0991SDimitry Andric return true; 28208bcb0991SDimitry Andric } 28218bcb0991SDimitry Andric 28228bcb0991SDimitry Andric if (TLI->shouldEmitPCReloc(GV)) { 28238bcb0991SDimitry Andric buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 28248bcb0991SDimitry Andric MI.eraseFromParent(); 28258bcb0991SDimitry Andric return true; 28268bcb0991SDimitry Andric } 28278bcb0991SDimitry Andric 28288bcb0991SDimitry Andric LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 28298bcb0991SDimitry Andric Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 28308bcb0991SDimitry Andric 2831fe6060f1SDimitry Andric LLT LoadTy = Ty.getSizeInBits() == 32 ? 
PtrTy : Ty;
  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
      MachinePointerInfo::getGOT(MF),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      LoadTy, Align(8));

  // No fixup/PC-rel relocation applies: materialize the GOT slot address
  // PC-relatively and load the global's address out of it.
  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
  } else
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  return true;
}

// Round a scalar up to the next power-of-2 bit width, or a vector up to the
// next power-of-2 element count (element type unchanged).
static LLT widenToNextPowerOf2(LLT Ty) {
  if (Ty.isVector())
    return Ty.changeElementCount(
        ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
  return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
}

/// Custom legalization for load-style instructions:
///  - rewrites 32-bit constant-address-space pointers into 64-bit
///    constant-space pointers via an addrspacecast,
///  - applies the buffer-resource (v4i32) result workaround,
///  - widens non-power-of-2 G_LOADs when shouldWidenLoad says the wider
///    access is safe for the given alignment/address space.
/// Returns true if the instruction was changed or replaced.
bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
                                       MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();
  GISelChangeObserver &Observer = Helper.Observer;

  Register PtrReg = MI.getOperand(1).getReg();
  LLT PtrTy = MRI.getType(PtrReg);
  unsigned AddrSpace = PtrTy.getAddressSpace();

  if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
    auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
    Observer.changingInstr(MI);
    MI.getOperand(1).setReg(Cast.getReg(0));
    Observer.changedInstr(MI);
    return true;
  }

  // The remaining transforms below only apply to plain G_LOAD.
  if (MI.getOpcode() != AMDGPU::G_LOAD)
    return false;

  Register ValReg = MI.getOperand(0).getReg();
  LLT ValTy = MRI.getType(ValReg);

  if (hasBufferRsrcWorkaround(ValTy)) {
    Observer.changingInstr(MI);
    castBufferRsrcFromV4I32(MI, B, MRI, 0);
    Observer.changedInstr(MI);
    return true;
  }

  MachineMemOperand *MMO = *MI.memoperands_begin();
  const unsigned ValSize = ValTy.getSizeInBits();
  const LLT MemTy = MMO->getMemoryType();
  const Align MemAlign = MMO->getAlign();
  const unsigned MemSize = MemTy.getSizeInBits();
  const uint64_t AlignInBits = 8 * MemAlign.value();

  // Widen non-power-of-2 loads to the alignment if needed
  if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
    const unsigned WideMemSize = PowerOf2Ceil(MemSize);

    // This was already the correct extending load result type, so just adjust
    // the memory type.
    if (WideMemSize == ValSize) {
      MachineFunction &MF = B.getMF();

      MachineMemOperand *WideMMO =
          MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
      Observer.changingInstr(MI);
      MI.setMemRefs(MF, {WideMMO});
      Observer.changedInstr(MI);
      return true;
    }

    // Don't bother handling edge case that should probably never be produced.
    if (ValSize > WideMemSize)
      return false;

    LLT WideTy = widenToNextPowerOf2(ValTy);

    Register WideLoad;
    if (!WideTy.isVector()) {
      // Scalar: load wide, truncate down to the requested width.
      WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
      B.buildTrunc(ValReg, WideLoad).getReg(0);
    } else {
      // Extract the subvector.

      if (isRegisterType(ValTy)) {
        // If this a case where G_EXTRACT is legal, use it.
        // (e.g. <3 x s32> -> <4 x s32>)
        WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
        B.buildExtract(ValReg, WideLoad, 0);
      } else {
        // For cases where the widened type isn't a nice register value, unmerge
        // from a widened register (e.g. <3 x s16> -> <4 x s16>)
        WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
        B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
      }
    }

    MI.eraseFromParent();
    return true;
  }

  return false;
}

/// Custom legalization for stores: only the buffer-resource (v4i32) data
/// operand workaround is applied here; everything else is left alone.
bool AMDGPULegalizerInfo::legalizeStore(LegalizerHelper &Helper,
                                        MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();
  GISelChangeObserver &Observer = Helper.Observer;

  Register DataReg = MI.getOperand(0).getReg();
  LLT DataTy = MRI.getType(DataReg);

  if (hasBufferRsrcWorkaround(DataTy)) {
    Observer.changingInstr(MI);
    castBufferRsrcArgToV4I32(MI, B, 0);
    Observer.changedInstr(MI);
    return true;
  }
  return false;
}

bool AMDGPULegalizerInfo::legalizeFMad(
MachineInstr &MI, MachineRegisterInfo &MRI,
    MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  assert(Ty.isScalar());

  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // G_FMAD is kept as-is (legal) only when denormals are flushed
  // (preserve-sign) for the type; otherwise expand via the generic lowering.
  // TODO: Always legal with future ftz flag.
  // FIXME: Do we need just output?
  if (Ty == LLT::scalar(32) &&
      MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign())
    return true;
  if (Ty == LLT::scalar(16) &&
      MFI->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign())
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
}

/// Lower G_ATOMIC_CMPXCHG on flat/global pointers to the target
/// G_AMDGPU_ATOMIC_CMPXCHG pseudo, which takes {new, cmp} packed into a
/// single 2-element vector operand.
bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register PtrReg = MI.getOperand(1).getReg();
  Register CmpVal = MI.getOperand(2).getReg();
  Register NewVal = MI.getOperand(3).getReg();

  assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
         "this should not have been custom lowered");

  LLT ValTy = MRI.getType(CmpVal);
  LLT VecTy = LLT::fixed_vector(2, ValTy);

  // Note the operand order: the new value comes first, then the compare value.
  Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);

  B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
    .addDef(DstReg)
    .addUse(PtrReg)
    .addUse(PackedVal)
    .setMemRefs(MI.memoperands());

  MI.eraseFromParent();
  return true;
}

/// Return true if it's known that \p Src can never be an f32 denormal value.
/// Currently only recognizes values produced by fpext from f16, which cannot
/// yield an f32 denormal.
static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI,
                                       Register Src) {
  Register ExtSrc;
  if (mi_match(Src, MRI, m_GFPExt(m_Reg(ExtSrc))))
    return MRI.getType(ExtSrc) == LLT::scalar(16);
  return false;
}

// Approximate lowering is allowed when the instruction carries the afn flag,
// or the module was built with unsafe/approx-func FP math.
static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
  if (Flags & MachineInstr::FmAfn)
    return true;
  const auto &Options = MF.getTarget().Options;
  return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
}

// True if an f32 denormal \p Src would need special handling: the value is
// not provably non-denormal and the function's input denormal mode does not
// flush (preserve-sign).
static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src,
                                   unsigned Flags) {
  return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src)
&& MF.getDenormalMode(APFloat::IEEEsingle()).Input !=
             DenormalMode::PreserveSign;
}

/// If f32 denormal handling is required for \p Src, pre-scale it:
/// inputs below the smallest normalized f32 are multiplied by 2^32.
/// Returns {scaled input, i1 "was scaled" predicate}; returns {} (two null
/// registers) when no denormal handling is needed, and callers use that as
/// the "no scaling" signal.
std::pair<Register, Register>
AMDGPULegalizerInfo::getScaledLogInput(MachineIRBuilder &B, Register Src,
                                       unsigned Flags) const {
  if (!needsDenormHandlingF32(B.getMF(), Src, Flags))
    return {};

  const LLT F32 = LLT::scalar(32);
  auto SmallestNormal = B.buildFConstant(
      F32, APFloat::getSmallestNormalized(APFloat::IEEEsingle()));
  auto IsLtSmallestNormal =
      B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal);

  auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
  auto One = B.buildFConstant(F32, 1.0);
  auto ScaleFactor =
      B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
  auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);

  return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
}

bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI,
                                        MachineIRBuilder &B) const {
  // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
  // If we have to handle denormals, scale up the input and adjust the result.

  // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
  // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Flags = MI.getFlags();

  if (Ty == LLT::scalar(16)) {
    const LLT F32 = LLT::scalar(32);
    // Nothing in half is a denormal when promoted to f32.
    auto Ext = B.buildFPExt(F32, Src, Flags);
    auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32}, false)
                    .addUse(Ext.getReg(0))
                    .setMIFlags(Flags);
    B.buildFPTrunc(Dst, Log2, Flags);
    MI.eraseFromParent();
    return true;
  }

  assert(Ty == LLT::scalar(32));

  // No scaling needed: use the hardware log directly.
  auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
  if (!ScaledInput) {
    B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)}, false)
        .addUse(Src)
        .setMIFlags(Flags);
    MI.eraseFromParent();
    return true;
  }

  auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}, false)
                  .addUse(ScaledInput)
                  .setMIFlags(Flags);

  // Undo the 2^32 pre-scale: log2(x * 2^32) - 32 == log2(x).
  auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
  auto Zero = B.buildFConstant(Ty, 0.0);
  auto ResultOffset =
      B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
  B.buildFSub(Dst, Log2, ResultOffset, Flags);

  MI.eraseFromParent();
  return true;
}

// Emit x*y + z as an unfused mul/add pair.
static Register getMad(MachineIRBuilder &B, LLT Ty, Register X, Register Y,
                       Register Z, unsigned Flags) {
  auto FMul = B.buildFMul(Ty, X, Y, Flags);
  return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);
}

/// Shared expansion for G_FLOG and G_FLOG10 in terms of the hardware log2.
bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
                                             MachineIRBuilder &B) const {
  const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
  assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);

  MachineRegisterInfo &MRI = *B.getMRI();
  Register Dst = MI.getOperand(0).getReg();
  Register X = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();
  const LLT Ty = MRI.getType(X);
  MachineFunction &MF = B.getMF();

  const LLT F32 = LLT::scalar(32);
  const LLT F16 = LLT::scalar(16);

  const AMDGPUTargetMachine &TM =
      static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  // f16, or any type with approximation allowed, takes the cheap path.
  if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn) ||
TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) {
    if (Ty == F16 && !ST.has16BitInsts()) {
      // No f16 instructions: compute in f32 and truncate the result.
      Register LogVal = MRI.createGenericVirtualRegister(F32);
      auto PromoteSrc = B.buildFPExt(F32, X);
      legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), IsLog10, Flags);
      B.buildFPTrunc(Dst, LogVal);
    } else {
      legalizeFlogUnsafe(B, Dst, X, IsLog10, Flags);
    }

    MI.eraseFromParent();
    return true;
  }

  // Precise path: pre-scale denormal inputs; compensated for after the
  // expansion below.
  auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags);
  if (ScaledInput)
    X = ScaledInput;

  auto Y = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}, false)
               .addUse(X)
               .setMIFlags(Flags);

  Register R;
  if (ST.hasFastFMAF32()) {
    // c+cc are ln(2)/ln(10) to more than 49 bits
    const float c_log10 = 0x1.344134p-2f;
    const float cc_log10 = 0x1.09f79ep-26f;

    // c + cc is ln(2) to more than 49 bits
    const float c_log = 0x1.62e42ep-1f;
    const float cc_log = 0x1.efa39ep-25f;

    auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
    auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);

    // Compensated (two-constant) multiply via FMA to recover rounding error.
    R = B.buildFMul(Ty, Y, C, Flags).getReg(0);
    auto NegR = B.buildFNeg(Ty, R, Flags);
    auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags);
    auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags);
    R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0);
  } else {
    // ch+ct is ln(2)/ln(10) to more than 36 bits
    const float ch_log10 = 0x1.344000p-2f;
    const float ct_log10 = 0x1.3509f6p-18f;

    // ch + ct is ln(2) to more than 36 bits
    const float ch_log = 0x1.62e000p-1f;
    const float ct_log = 0x1.0bfbe8p-15f;

    auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
    auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);

    // Split Y into a high part (mantissa bits masked) and the remainder so the
    // products can be accumulated without FMA.
    auto MaskConst = B.buildConstant(Ty, 0xfffff000);
    auto YH = B.buildAnd(Ty, Y, MaskConst);
    auto YT = B.buildFSub(Ty, Y, YH, Flags);
    auto YTCT = B.buildFMul(Ty, YT, CT, Flags);

    Register Mad0 =
        getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags);
    Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, Flags);
    R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags);
  }

  const bool IsFiniteOnly =
      (MI.getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) &&
      (MI.getFlag(MachineInstr::FmNoInfs) || TM.Options.NoInfsFPMath);

  if (!IsFiniteOnly) {
    // Expand isfinite(x) => fabs(x) < inf
    // Pass non-finite log2 results (inf/nan) through unchanged.
    auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
    auto Fabs = B.buildFAbs(Ty, Y);
    auto IsFinite =
        B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
    R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);
  }

  if (ScaledInput) {
    // Undo the 2^32 pre-scale: subtract 32*log10(2) or 32*log(2).
    auto Zero = B.buildFConstant(Ty, 0.0);
    auto ShiftK =
        B.buildFConstant(Ty, IsLog10 ?
0x1.344136p+3f : 0x1.62e430p+4f); 321306c3fb27SDimitry Andric auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags); 321406c3fb27SDimitry Andric B.buildFSub(Dst, R, Shift, Flags); 321506c3fb27SDimitry Andric } else { 321606c3fb27SDimitry Andric B.buildCopy(Dst, R); 321706c3fb27SDimitry Andric } 321806c3fb27SDimitry Andric 321906c3fb27SDimitry Andric MI.eraseFromParent(); 322006c3fb27SDimitry Andric return true; 322106c3fb27SDimitry Andric } 322206c3fb27SDimitry Andric 322306c3fb27SDimitry Andric bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, 3224*8a4dda33SDimitry Andric Register Src, bool IsLog10, 322506c3fb27SDimitry Andric unsigned Flags) const { 3226*8a4dda33SDimitry Andric const double Log2BaseInverted = 3227*8a4dda33SDimitry Andric IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2; 3228*8a4dda33SDimitry Andric 322906c3fb27SDimitry Andric LLT Ty = B.getMRI()->getType(Dst); 3230*8a4dda33SDimitry Andric 3231*8a4dda33SDimitry Andric if (Ty == LLT::scalar(32)) { 3232*8a4dda33SDimitry Andric auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags); 3233*8a4dda33SDimitry Andric if (ScaledInput) { 3234*8a4dda33SDimitry Andric auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}, false) 3235*8a4dda33SDimitry Andric .addUse(Src) 3236*8a4dda33SDimitry Andric .setMIFlags(Flags); 3237*8a4dda33SDimitry Andric auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted); 3238*8a4dda33SDimitry Andric auto Zero = B.buildFConstant(Ty, 0.0); 3239*8a4dda33SDimitry Andric auto ResultOffset = 3240*8a4dda33SDimitry Andric B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags); 3241*8a4dda33SDimitry Andric auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted); 3242*8a4dda33SDimitry Andric 3243*8a4dda33SDimitry Andric if (ST.hasFastFMAF32()) 3244*8a4dda33SDimitry Andric B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags); 3245*8a4dda33SDimitry Andric else { 3246*8a4dda33SDimitry Andric auto Mul = 
B.buildFMul(Ty, LogSrc, Log2Inv, Flags); 3247*8a4dda33SDimitry Andric B.buildFAdd(Dst, Mul, ResultOffset, Flags); 3248*8a4dda33SDimitry Andric } 3249*8a4dda33SDimitry Andric 3250*8a4dda33SDimitry Andric return true; 3251*8a4dda33SDimitry Andric } 3252*8a4dda33SDimitry Andric } 3253*8a4dda33SDimitry Andric 325406c3fb27SDimitry Andric auto Log2Operand = Ty == LLT::scalar(16) 325506c3fb27SDimitry Andric ? B.buildFLog2(Ty, Src, Flags) 325606c3fb27SDimitry Andric : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}, false) 325706c3fb27SDimitry Andric .addUse(Src) 325806c3fb27SDimitry Andric .setMIFlags(Flags); 325906c3fb27SDimitry Andric auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 326006c3fb27SDimitry Andric B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 326106c3fb27SDimitry Andric return true; 326206c3fb27SDimitry Andric } 326306c3fb27SDimitry Andric 326406c3fb27SDimitry Andric bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI, 326506c3fb27SDimitry Andric MachineIRBuilder &B) const { 326606c3fb27SDimitry Andric // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals. 326706c3fb27SDimitry Andric // If we have to handle denormals, scale up the input and adjust the result. 326806c3fb27SDimitry Andric 326906c3fb27SDimitry Andric Register Dst = MI.getOperand(0).getReg(); 327006c3fb27SDimitry Andric Register Src = MI.getOperand(1).getReg(); 327106c3fb27SDimitry Andric unsigned Flags = MI.getFlags(); 327206c3fb27SDimitry Andric LLT Ty = B.getMRI()->getType(Dst); 327306c3fb27SDimitry Andric const LLT F16 = LLT::scalar(16); 327406c3fb27SDimitry Andric const LLT F32 = LLT::scalar(32); 327506c3fb27SDimitry Andric 327606c3fb27SDimitry Andric if (Ty == F16) { 327706c3fb27SDimitry Andric // Nothing in half is a denormal when promoted to f32. 
auto Ext = B.buildFPExt(F32, Src, Flags);
    auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32}, false)
                    .addUse(Ext.getReg(0))
                    .setMIFlags(Flags);
    B.buildFPTrunc(Dst, Log2, Flags);
    MI.eraseFromParent();
    return true;
  }

  assert(Ty == F32);

  // No denormal handling needed: use the hardware exp2 directly.
  if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) {
    B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst}, false)
        .addUse(Src)
        .setMIFlags(Flags);
    MI.eraseFromParent();
    return true;
  }

  // bool needs_scaling = x < -0x1.f80000p+6f;
  // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);

  // -nextafter(128.0, -1)
  auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
  auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src,
                                  RangeCheckConst, Flags);

  // Shift very negative inputs up by 64 so exp2 stays in normal range, then
  // multiply the result by 2^-64 to compensate.
  auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
  auto Zero = B.buildFConstant(Ty, 0.0);
  auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
  auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);

  auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty}, false)
                  .addUse(AddInput.getReg(0))
                  .setMIFlags(Flags);

  auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
  auto One = B.buildFConstant(Ty, 1.0);
  auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
  B.buildFMul(Dst, Exp2, ResultScale, Flags);
  MI.eraseFromParent();
  return true;
}

/// Fast lowering of exp(x) as exp2(x * log2(e)). f32 uses the hardware exp2
/// intrinsic; other types fall back to generic G_FEXP2.
bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
                                             Register Src,
                                             unsigned Flags) const {
  LLT Ty = B.getMRI()->getType(Dst);
  auto K = B.buildFConstant(Ty, numbers::log2e);
  auto Mul = B.buildFMul(Ty, Src, K, Flags);

  if (Ty == LLT::scalar(32)) {
    B.buildIntrinsic(Intrinsic::amdgcn_exp2,
ArrayRef<Register>{Dst}, false)
        .addUse(Mul.getReg(0))
        .setMIFlags(Flags);
  } else {
    B.buildFExp2(Dst, Mul.getReg(0), Flags);
  }

  return true;
}

bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
                                       MachineIRBuilder &B) const {
  Register Dst = MI.getOperand(0).getReg();
  Register X = MI.getOperand(1).getReg();
  const unsigned Flags = MI.getFlags();
  MachineFunction &MF = B.getMF();
  MachineRegisterInfo &MRI = *B.getMRI();
  LLT Ty = MRI.getType(Dst);
  const LLT F16 = LLT::scalar(16);
  const LLT F32 = LLT::scalar(32);
  const bool IsExp10 = false; // TODO: For some reason exp10 is missing

  if (Ty == F16) {
    // v_exp_f16 (fmul x, log2e)
    if (allowApproxFunc(MF, Flags)) {
      // TODO: Does this really require fast?
      legalizeFExpUnsafe(B, Dst, X, Flags);
      MI.eraseFromParent();
      return true;
    }

    // exp(f16 x) ->
    //   fptrunc (v_exp_f32 (fmul (fpext x), log2e))

    // Nothing in half is a denormal when promoted to f32.
336506c3fb27SDimitry Andric auto Ext = B.buildFPExt(F32, X, Flags); 336606c3fb27SDimitry Andric Register Lowered = MRI.createGenericVirtualRegister(F32); 336706c3fb27SDimitry Andric legalizeFExpUnsafe(B, Lowered, Ext.getReg(0), Flags); 336806c3fb27SDimitry Andric B.buildFPTrunc(Dst, Lowered, Flags); 336906c3fb27SDimitry Andric MI.eraseFromParent(); 337006c3fb27SDimitry Andric return true; 337106c3fb27SDimitry Andric } 337206c3fb27SDimitry Andric 337306c3fb27SDimitry Andric assert(Ty == F32); 337406c3fb27SDimitry Andric 337506c3fb27SDimitry Andric // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying 337606c3fb27SDimitry Andric // library behavior. Also, is known-not-daz source sufficient? 337706c3fb27SDimitry Andric if (allowApproxFunc(MF, Flags) && !needsDenormHandlingF32(MF, X, Flags)) { 337806c3fb27SDimitry Andric legalizeFExpUnsafe(B, Dst, X, Flags); 337906c3fb27SDimitry Andric MI.eraseFromParent(); 338006c3fb27SDimitry Andric return true; 338106c3fb27SDimitry Andric } 338206c3fb27SDimitry Andric 338306c3fb27SDimitry Andric // Algorithm: 338406c3fb27SDimitry Andric // 338506c3fb27SDimitry Andric // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64) 338606c3fb27SDimitry Andric // 338706c3fb27SDimitry Andric // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer 338806c3fb27SDimitry Andric // n = 64*m + j, 0 <= j < 64 338906c3fb27SDimitry Andric // 339006c3fb27SDimitry Andric // e^x = 2^((64*m + j + f)/64) 339106c3fb27SDimitry Andric // = (2^m) * (2^(j/64)) * 2^(f/64) 339206c3fb27SDimitry Andric // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64)) 339306c3fb27SDimitry Andric // 339406c3fb27SDimitry Andric // f = x*(64/ln(2)) - n 339506c3fb27SDimitry Andric // r = f*(ln(2)/64) = x - n*(ln(2)/64) 339606c3fb27SDimitry Andric // 339706c3fb27SDimitry Andric // e^x = (2^m) * (2^(j/64)) * e^r 339806c3fb27SDimitry Andric // 339906c3fb27SDimitry Andric // (2^(j/64)) is precomputed 340006c3fb27SDimitry Andric // 340106c3fb27SDimitry Andric // e^r = 1 + r + (r^2)/2! 
+ (r^3)/3! + (r^4)/4! + (r^5)/5! 340206c3fb27SDimitry Andric // e^r = 1 + q 340306c3fb27SDimitry Andric // 340406c3fb27SDimitry Andric // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5! 340506c3fb27SDimitry Andric // 340606c3fb27SDimitry Andric // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) ) 340706c3fb27SDimitry Andric const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract; 340806c3fb27SDimitry Andric Register PH, PL; 340906c3fb27SDimitry Andric 341006c3fb27SDimitry Andric if (ST.hasFastFMAF32()) { 341106c3fb27SDimitry Andric const float c_exp = numbers::log2ef; 341206c3fb27SDimitry Andric const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits 341306c3fb27SDimitry Andric const float c_exp10 = 0x1.a934f0p+1f; 341406c3fb27SDimitry Andric const float cc_exp10 = 0x1.2f346ep-24f; 341506c3fb27SDimitry Andric 341606c3fb27SDimitry Andric auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp); 341706c3fb27SDimitry Andric PH = B.buildFMul(Ty, X, C, Flags).getReg(0); 341806c3fb27SDimitry Andric auto NegPH = B.buildFNeg(Ty, PH, Flags); 341906c3fb27SDimitry Andric auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags); 342006c3fb27SDimitry Andric 342106c3fb27SDimitry Andric auto CC = B.buildFConstant(Ty, IsExp10 ? 
cc_exp10 : cc_exp); 342206c3fb27SDimitry Andric PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0); 342306c3fb27SDimitry Andric } else { 342406c3fb27SDimitry Andric const float ch_exp = 0x1.714000p+0f; 342506c3fb27SDimitry Andric const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits 342606c3fb27SDimitry Andric 342706c3fb27SDimitry Andric const float ch_exp10 = 0x1.a92000p+1f; 342806c3fb27SDimitry Andric const float cl_exp10 = 0x1.4f0978p-11f; 342906c3fb27SDimitry Andric 343006c3fb27SDimitry Andric auto MaskConst = B.buildConstant(Ty, 0xfffff000); 343106c3fb27SDimitry Andric auto XH = B.buildAnd(Ty, X, MaskConst); 343206c3fb27SDimitry Andric auto XL = B.buildFSub(Ty, X, XH, Flags); 343306c3fb27SDimitry Andric 343406c3fb27SDimitry Andric auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp); 343506c3fb27SDimitry Andric PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0); 343606c3fb27SDimitry Andric 343706c3fb27SDimitry Andric auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp); 343806c3fb27SDimitry Andric auto XLCL = B.buildFMul(Ty, XL, CL, Flags); 343906c3fb27SDimitry Andric 344006c3fb27SDimitry Andric Register Mad0 = 344106c3fb27SDimitry Andric getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags); 344206c3fb27SDimitry Andric PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags); 344306c3fb27SDimitry Andric } 344406c3fb27SDimitry Andric 344506c3fb27SDimitry Andric auto E = B.buildFRint(Ty, PH, Flags); 344606c3fb27SDimitry Andric 344706c3fb27SDimitry Andric // It is unsafe to contract this fsub into the PH multiply. 
344806c3fb27SDimitry Andric auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract); 344906c3fb27SDimitry Andric auto A = B.buildFAdd(Ty, PHSubE, PL, Flags); 345006c3fb27SDimitry Andric auto IntE = B.buildFPTOSI(LLT::scalar(32), E); 345106c3fb27SDimitry Andric 345206c3fb27SDimitry Andric auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty}, false) 345306c3fb27SDimitry Andric .addUse(A.getReg(0)) 345406c3fb27SDimitry Andric .setMIFlags(Flags); 345506c3fb27SDimitry Andric auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags); 345606c3fb27SDimitry Andric 345706c3fb27SDimitry Andric auto UnderflowCheckConst = 345806c3fb27SDimitry Andric B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f); 345906c3fb27SDimitry Andric auto Zero = B.buildFConstant(Ty, 0.0); 346006c3fb27SDimitry Andric auto Underflow = 346106c3fb27SDimitry Andric B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst); 346206c3fb27SDimitry Andric 346306c3fb27SDimitry Andric R = B.buildSelect(Ty, Underflow, Zero, R); 346406c3fb27SDimitry Andric 346506c3fb27SDimitry Andric const auto &Options = MF.getTarget().Options; 346606c3fb27SDimitry Andric 346706c3fb27SDimitry Andric if (!(Flags & MachineInstr::FmNoInfs) && !Options.NoInfsFPMath) { 346806c3fb27SDimitry Andric auto OverflowCheckConst = 346906c3fb27SDimitry Andric B.buildFConstant(Ty, IsExp10 ? 
0x1.344136p+5f : 0x1.62e430p+6f); 347006c3fb27SDimitry Andric 347106c3fb27SDimitry Andric auto Overflow = 347206c3fb27SDimitry Andric B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst); 347306c3fb27SDimitry Andric auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle())); 347406c3fb27SDimitry Andric R = B.buildSelect(Ty, Overflow, Inf, R, Flags); 347506c3fb27SDimitry Andric } 347606c3fb27SDimitry Andric 347706c3fb27SDimitry Andric B.buildCopy(Dst, R); 34785ffd83dbSDimitry Andric MI.eraseFromParent(); 34795ffd83dbSDimitry Andric return true; 34805ffd83dbSDimitry Andric } 34815ffd83dbSDimitry Andric 34825ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 34835ffd83dbSDimitry Andric MachineIRBuilder &B) const { 34845ffd83dbSDimitry Andric Register Dst = MI.getOperand(0).getReg(); 34855ffd83dbSDimitry Andric Register Src0 = MI.getOperand(1).getReg(); 34865ffd83dbSDimitry Andric Register Src1 = MI.getOperand(2).getReg(); 34875ffd83dbSDimitry Andric unsigned Flags = MI.getFlags(); 34885ffd83dbSDimitry Andric LLT Ty = B.getMRI()->getType(Dst); 34895ffd83dbSDimitry Andric const LLT S16 = LLT::scalar(16); 34905ffd83dbSDimitry Andric const LLT S32 = LLT::scalar(32); 34915ffd83dbSDimitry Andric 34925ffd83dbSDimitry Andric if (Ty == S32) { 34935ffd83dbSDimitry Andric auto Log = B.buildFLog2(S32, Src0, Flags); 34945ffd83dbSDimitry Andric auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 34955ffd83dbSDimitry Andric .addUse(Log.getReg(0)) 34965ffd83dbSDimitry Andric .addUse(Src1) 34975ffd83dbSDimitry Andric .setMIFlags(Flags); 34985ffd83dbSDimitry Andric B.buildFExp2(Dst, Mul, Flags); 34995ffd83dbSDimitry Andric } else if (Ty == S16) { 35005ffd83dbSDimitry Andric // There's no f16 fmul_legacy, so we need to convert for it. 
35015ffd83dbSDimitry Andric auto Log = B.buildFLog2(S16, Src0, Flags); 35025ffd83dbSDimitry Andric auto Ext0 = B.buildFPExt(S32, Log, Flags); 35035ffd83dbSDimitry Andric auto Ext1 = B.buildFPExt(S32, Src1, Flags); 35045ffd83dbSDimitry Andric auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 35055ffd83dbSDimitry Andric .addUse(Ext0.getReg(0)) 35065ffd83dbSDimitry Andric .addUse(Ext1.getReg(0)) 35075ffd83dbSDimitry Andric .setMIFlags(Flags); 35085ffd83dbSDimitry Andric 35095ffd83dbSDimitry Andric B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 35105ffd83dbSDimitry Andric } else 35115ffd83dbSDimitry Andric return false; 35125ffd83dbSDimitry Andric 35135ffd83dbSDimitry Andric MI.eraseFromParent(); 35145ffd83dbSDimitry Andric return true; 35155ffd83dbSDimitry Andric } 35165ffd83dbSDimitry Andric 35175ffd83dbSDimitry Andric // Find a source register, ignoring any possible source modifiers. 35185ffd83dbSDimitry Andric static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 35195ffd83dbSDimitry Andric Register ModSrc = OrigSrc; 35205ffd83dbSDimitry Andric if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 35215ffd83dbSDimitry Andric ModSrc = SrcFNeg->getOperand(1).getReg(); 35225ffd83dbSDimitry Andric if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 35235ffd83dbSDimitry Andric ModSrc = SrcFAbs->getOperand(1).getReg(); 35245ffd83dbSDimitry Andric } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 35255ffd83dbSDimitry Andric ModSrc = SrcFAbs->getOperand(1).getReg(); 35265ffd83dbSDimitry Andric return ModSrc; 35275ffd83dbSDimitry Andric } 35285ffd83dbSDimitry Andric 35295ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 35305ffd83dbSDimitry Andric MachineRegisterInfo &MRI, 35315ffd83dbSDimitry Andric MachineIRBuilder &B) const { 35325ffd83dbSDimitry Andric 35335ffd83dbSDimitry Andric const LLT S1 = LLT::scalar(1); 
35345ffd83dbSDimitry Andric const LLT S64 = LLT::scalar(64); 35355ffd83dbSDimitry Andric Register Dst = MI.getOperand(0).getReg(); 35365ffd83dbSDimitry Andric Register OrigSrc = MI.getOperand(1).getReg(); 35375ffd83dbSDimitry Andric unsigned Flags = MI.getFlags(); 35385ffd83dbSDimitry Andric assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 35395ffd83dbSDimitry Andric "this should not have been custom lowered"); 35405ffd83dbSDimitry Andric 35415ffd83dbSDimitry Andric // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 35425ffd83dbSDimitry Andric // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 35435ffd83dbSDimitry Andric // efficient way to implement it is using V_FRACT_F64. The workaround for the 35445ffd83dbSDimitry Andric // V_FRACT bug is: 35455ffd83dbSDimitry Andric // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 35465ffd83dbSDimitry Andric // 35475ffd83dbSDimitry Andric // Convert floor(x) to (x - fract(x)) 35485ffd83dbSDimitry Andric 35495ffd83dbSDimitry Andric auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 35505ffd83dbSDimitry Andric .addUse(OrigSrc) 35515ffd83dbSDimitry Andric .setMIFlags(Flags); 35525ffd83dbSDimitry Andric 35535ffd83dbSDimitry Andric // Give source modifier matching some assistance before obscuring a foldable 35545ffd83dbSDimitry Andric // pattern. 35555ffd83dbSDimitry Andric 35565ffd83dbSDimitry Andric // TODO: We can avoid the neg on the fract? The input sign to fract 35575ffd83dbSDimitry Andric // shouldn't matter? 
35585ffd83dbSDimitry Andric Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 35595ffd83dbSDimitry Andric 356006c3fb27SDimitry Andric auto Const = 356106c3fb27SDimitry Andric B.buildFConstant(S64, llvm::bit_cast<double>(0x3fefffffffffffff)); 35625ffd83dbSDimitry Andric 35635ffd83dbSDimitry Andric Register Min = MRI.createGenericVirtualRegister(S64); 35645ffd83dbSDimitry Andric 35655ffd83dbSDimitry Andric // We don't need to concern ourselves with the snan handling difference, so 35665ffd83dbSDimitry Andric // use the one which will directly select. 35675ffd83dbSDimitry Andric const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 35685ffd83dbSDimitry Andric if (MFI->getMode().IEEE) 35695ffd83dbSDimitry Andric B.buildFMinNumIEEE(Min, Fract, Const, Flags); 35705ffd83dbSDimitry Andric else 35715ffd83dbSDimitry Andric B.buildFMinNum(Min, Fract, Const, Flags); 35725ffd83dbSDimitry Andric 35735ffd83dbSDimitry Andric Register CorrectedFract = Min; 35745ffd83dbSDimitry Andric if (!MI.getFlag(MachineInstr::FmNoNans)) { 35755ffd83dbSDimitry Andric auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 35765ffd83dbSDimitry Andric CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 35775ffd83dbSDimitry Andric } 35785ffd83dbSDimitry Andric 35795ffd83dbSDimitry Andric auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 35805ffd83dbSDimitry Andric B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 35815ffd83dbSDimitry Andric 35825ffd83dbSDimitry Andric MI.eraseFromParent(); 35835ffd83dbSDimitry Andric return true; 35845ffd83dbSDimitry Andric } 35855ffd83dbSDimitry Andric 35865ffd83dbSDimitry Andric // Turn an illegal packed v2s16 build vector into bit operations. 35875ffd83dbSDimitry Andric // TODO: This should probably be a bitcast action in LegalizerHelper. 
35885ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeBuildVector( 35895ffd83dbSDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 35905ffd83dbSDimitry Andric Register Dst = MI.getOperand(0).getReg(); 35915ffd83dbSDimitry Andric const LLT S32 = LLT::scalar(32); 3592bdd1243dSDimitry Andric const LLT S16 = LLT::scalar(16); 3593fe6060f1SDimitry Andric assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16)); 35945ffd83dbSDimitry Andric 35955ffd83dbSDimitry Andric Register Src0 = MI.getOperand(1).getReg(); 35965ffd83dbSDimitry Andric Register Src1 = MI.getOperand(2).getReg(); 35975ffd83dbSDimitry Andric 3598bdd1243dSDimitry Andric if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) { 3599bdd1243dSDimitry Andric assert(MRI.getType(Src0) == S32); 3600bdd1243dSDimitry Andric Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0); 3601bdd1243dSDimitry Andric Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0); 3602bdd1243dSDimitry Andric } 3603bdd1243dSDimitry Andric 3604bdd1243dSDimitry Andric auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1}); 36055ffd83dbSDimitry Andric B.buildBitcast(Dst, Merge); 36065ffd83dbSDimitry Andric 36075ffd83dbSDimitry Andric MI.eraseFromParent(); 36085ffd83dbSDimitry Andric return true; 36095ffd83dbSDimitry Andric } 36105ffd83dbSDimitry Andric 361181ad6265SDimitry Andric // Build a big integer multiply or multiply-add using MAD_64_32 instructions. 361281ad6265SDimitry Andric // 361381ad6265SDimitry Andric // Source and accumulation registers must all be 32-bits. 361481ad6265SDimitry Andric // 361581ad6265SDimitry Andric // TODO: When the multiply is uniform, we should produce a code sequence 361681ad6265SDimitry Andric // that is better suited to instruction selection on the SALU. Instead of 361781ad6265SDimitry Andric // the outer loop going over parts of the result, the outer loop should go 361881ad6265SDimitry Andric // over parts of one of the factors. 
This should result in instruction 361981ad6265SDimitry Andric // selection that makes full use of S_ADDC_U32 instructions. 362006c3fb27SDimitry Andric void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper, 362106c3fb27SDimitry Andric MutableArrayRef<Register> Accum, 362206c3fb27SDimitry Andric ArrayRef<Register> Src0, 362306c3fb27SDimitry Andric ArrayRef<Register> Src1, 362406c3fb27SDimitry Andric bool UsePartialMad64_32, 362506c3fb27SDimitry Andric bool SeparateOddAlignedProducts) const { 362681ad6265SDimitry Andric // Use (possibly empty) vectors of S1 registers to represent the set of 362781ad6265SDimitry Andric // carries from one pair of positions to the next. 362881ad6265SDimitry Andric using Carry = SmallVector<Register, 2>; 362981ad6265SDimitry Andric 363081ad6265SDimitry Andric MachineIRBuilder &B = Helper.MIRBuilder; 363106c3fb27SDimitry Andric GISelKnownBits &KB = *Helper.getKnownBits(); 363281ad6265SDimitry Andric 363381ad6265SDimitry Andric const LLT S1 = LLT::scalar(1); 363481ad6265SDimitry Andric const LLT S32 = LLT::scalar(32); 363581ad6265SDimitry Andric const LLT S64 = LLT::scalar(64); 363681ad6265SDimitry Andric 363781ad6265SDimitry Andric Register Zero32; 363881ad6265SDimitry Andric Register Zero64; 363981ad6265SDimitry Andric 364081ad6265SDimitry Andric auto getZero32 = [&]() -> Register { 364181ad6265SDimitry Andric if (!Zero32) 364281ad6265SDimitry Andric Zero32 = B.buildConstant(S32, 0).getReg(0); 364381ad6265SDimitry Andric return Zero32; 364481ad6265SDimitry Andric }; 364581ad6265SDimitry Andric auto getZero64 = [&]() -> Register { 364681ad6265SDimitry Andric if (!Zero64) 364781ad6265SDimitry Andric Zero64 = B.buildConstant(S64, 0).getReg(0); 364881ad6265SDimitry Andric return Zero64; 364981ad6265SDimitry Andric }; 365081ad6265SDimitry Andric 365106c3fb27SDimitry Andric SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros; 365206c3fb27SDimitry Andric for (unsigned i = 0; i < Src0.size(); ++i) { 365306c3fb27SDimitry Andric 
Src0KnownZeros.push_back(KB.getKnownBits(Src0[i]).isZero()); 365406c3fb27SDimitry Andric Src1KnownZeros.push_back(KB.getKnownBits(Src1[i]).isZero()); 365506c3fb27SDimitry Andric } 365606c3fb27SDimitry Andric 365781ad6265SDimitry Andric // Merge the given carries into the 32-bit LocalAccum, which is modified 365881ad6265SDimitry Andric // in-place. 365981ad6265SDimitry Andric // 366081ad6265SDimitry Andric // Returns the carry-out, which is a single S1 register or null. 366181ad6265SDimitry Andric auto mergeCarry = 366281ad6265SDimitry Andric [&](Register &LocalAccum, const Carry &CarryIn) -> Register { 366381ad6265SDimitry Andric if (CarryIn.empty()) 366481ad6265SDimitry Andric return Register(); 366581ad6265SDimitry Andric 366681ad6265SDimitry Andric bool HaveCarryOut = true; 366781ad6265SDimitry Andric Register CarryAccum; 366881ad6265SDimitry Andric if (CarryIn.size() == 1) { 366981ad6265SDimitry Andric if (!LocalAccum) { 367081ad6265SDimitry Andric LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0); 367181ad6265SDimitry Andric return Register(); 367281ad6265SDimitry Andric } 367381ad6265SDimitry Andric 367481ad6265SDimitry Andric CarryAccum = getZero32(); 367581ad6265SDimitry Andric } else { 367681ad6265SDimitry Andric CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0); 367781ad6265SDimitry Andric for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) { 367881ad6265SDimitry Andric CarryAccum = 367981ad6265SDimitry Andric B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i]) 368081ad6265SDimitry Andric .getReg(0); 368181ad6265SDimitry Andric } 368281ad6265SDimitry Andric 368381ad6265SDimitry Andric if (!LocalAccum) { 368481ad6265SDimitry Andric LocalAccum = getZero32(); 368581ad6265SDimitry Andric HaveCarryOut = false; 368681ad6265SDimitry Andric } 368781ad6265SDimitry Andric } 368881ad6265SDimitry Andric 368981ad6265SDimitry Andric auto Add = 369081ad6265SDimitry Andric B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back()); 369181ad6265SDimitry Andric 
LocalAccum = Add.getReg(0); 369281ad6265SDimitry Andric return HaveCarryOut ? Add.getReg(1) : Register(); 369381ad6265SDimitry Andric }; 369481ad6265SDimitry Andric 369581ad6265SDimitry Andric // Build a multiply-add chain to compute 369681ad6265SDimitry Andric // 369781ad6265SDimitry Andric // LocalAccum + (partial products at DstIndex) 369881ad6265SDimitry Andric // + (opportunistic subset of CarryIn) 369981ad6265SDimitry Andric // 370081ad6265SDimitry Andric // LocalAccum is an array of one or two 32-bit registers that are updated 370181ad6265SDimitry Andric // in-place. The incoming registers may be null. 370281ad6265SDimitry Andric // 370381ad6265SDimitry Andric // In some edge cases, carry-ins can be consumed "for free". In that case, 370481ad6265SDimitry Andric // the consumed carry bits are removed from CarryIn in-place. 370581ad6265SDimitry Andric auto buildMadChain = 370681ad6265SDimitry Andric [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn) 370781ad6265SDimitry Andric -> Carry { 370881ad6265SDimitry Andric assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) || 370981ad6265SDimitry Andric (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1)); 371081ad6265SDimitry Andric 371181ad6265SDimitry Andric Carry CarryOut; 371281ad6265SDimitry Andric unsigned j0 = 0; 371381ad6265SDimitry Andric 371481ad6265SDimitry Andric // Use plain 32-bit multiplication for the most significant part of the 371581ad6265SDimitry Andric // result by default. 
371681ad6265SDimitry Andric if (LocalAccum.size() == 1 && 371781ad6265SDimitry Andric (!UsePartialMad64_32 || !CarryIn.empty())) { 371881ad6265SDimitry Andric do { 371906c3fb27SDimitry Andric // Skip multiplication if one of the operands is 0 372081ad6265SDimitry Andric unsigned j1 = DstIndex - j0; 372106c3fb27SDimitry Andric if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) { 372206c3fb27SDimitry Andric ++j0; 372306c3fb27SDimitry Andric continue; 372406c3fb27SDimitry Andric } 372581ad6265SDimitry Andric auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]); 372606c3fb27SDimitry Andric if (!LocalAccum[0] || KB.getKnownBits(LocalAccum[0]).isZero()) { 372781ad6265SDimitry Andric LocalAccum[0] = Mul.getReg(0); 372881ad6265SDimitry Andric } else { 372981ad6265SDimitry Andric if (CarryIn.empty()) { 373081ad6265SDimitry Andric LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0); 373181ad6265SDimitry Andric } else { 373281ad6265SDimitry Andric LocalAccum[0] = 373381ad6265SDimitry Andric B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back()) 373481ad6265SDimitry Andric .getReg(0); 373581ad6265SDimitry Andric CarryIn.pop_back(); 373681ad6265SDimitry Andric } 373781ad6265SDimitry Andric } 373881ad6265SDimitry Andric ++j0; 373981ad6265SDimitry Andric } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty())); 374081ad6265SDimitry Andric } 374181ad6265SDimitry Andric 374281ad6265SDimitry Andric // Build full 64-bit multiplies. 
374381ad6265SDimitry Andric if (j0 <= DstIndex) { 374481ad6265SDimitry Andric bool HaveSmallAccum = false; 374581ad6265SDimitry Andric Register Tmp; 374681ad6265SDimitry Andric 374781ad6265SDimitry Andric if (LocalAccum[0]) { 374881ad6265SDimitry Andric if (LocalAccum.size() == 1) { 374981ad6265SDimitry Andric Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0); 375081ad6265SDimitry Andric HaveSmallAccum = true; 375181ad6265SDimitry Andric } else if (LocalAccum[1]) { 3752bdd1243dSDimitry Andric Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0); 375381ad6265SDimitry Andric HaveSmallAccum = false; 375481ad6265SDimitry Andric } else { 375581ad6265SDimitry Andric Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0); 375681ad6265SDimitry Andric HaveSmallAccum = true; 375781ad6265SDimitry Andric } 375881ad6265SDimitry Andric } else { 375981ad6265SDimitry Andric assert(LocalAccum.size() == 1 || !LocalAccum[1]); 376081ad6265SDimitry Andric Tmp = getZero64(); 376181ad6265SDimitry Andric HaveSmallAccum = true; 376281ad6265SDimitry Andric } 376381ad6265SDimitry Andric 376481ad6265SDimitry Andric do { 376581ad6265SDimitry Andric unsigned j1 = DstIndex - j0; 376606c3fb27SDimitry Andric if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) { 376706c3fb27SDimitry Andric ++j0; 376806c3fb27SDimitry Andric continue; 376906c3fb27SDimitry Andric } 377081ad6265SDimitry Andric auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1}, 377181ad6265SDimitry Andric {Src0[j0], Src1[j1], Tmp}); 377281ad6265SDimitry Andric Tmp = Mad.getReg(0); 377381ad6265SDimitry Andric if (!HaveSmallAccum) 377481ad6265SDimitry Andric CarryOut.push_back(Mad.getReg(1)); 377581ad6265SDimitry Andric HaveSmallAccum = false; 377606c3fb27SDimitry Andric 377781ad6265SDimitry Andric ++j0; 377881ad6265SDimitry Andric } while (j0 <= DstIndex); 377981ad6265SDimitry Andric 378081ad6265SDimitry Andric auto Unmerge = B.buildUnmerge(S32, Tmp); 378181ad6265SDimitry Andric LocalAccum[0] = Unmerge.getReg(0); 378281ad6265SDimitry 
Andric if (LocalAccum.size() > 1) 378381ad6265SDimitry Andric LocalAccum[1] = Unmerge.getReg(1); 378481ad6265SDimitry Andric } 378581ad6265SDimitry Andric 378681ad6265SDimitry Andric return CarryOut; 378781ad6265SDimitry Andric }; 378881ad6265SDimitry Andric 378981ad6265SDimitry Andric // Outer multiply loop, iterating over destination parts from least 379081ad6265SDimitry Andric // significant to most significant parts. 379181ad6265SDimitry Andric // 379281ad6265SDimitry Andric // The columns of the following diagram correspond to the destination parts 379381ad6265SDimitry Andric // affected by one iteration of the outer loop (ignoring boundary 379481ad6265SDimitry Andric // conditions). 379581ad6265SDimitry Andric // 379681ad6265SDimitry Andric // Dest index relative to 2 * i: 1 0 -1 379781ad6265SDimitry Andric // ------ 379881ad6265SDimitry Andric // Carries from previous iteration: e o 379981ad6265SDimitry Andric // Even-aligned partial product sum: E E . 380081ad6265SDimitry Andric // Odd-aligned partial product sum: O O 380181ad6265SDimitry Andric // 380281ad6265SDimitry Andric // 'o' is OddCarry, 'e' is EvenCarry. 380381ad6265SDimitry Andric // EE and OO are computed from partial products via buildMadChain and use 380481ad6265SDimitry Andric // accumulation where possible and appropriate. 380581ad6265SDimitry Andric // 380681ad6265SDimitry Andric Register SeparateOddCarry; 380781ad6265SDimitry Andric Carry EvenCarry; 380881ad6265SDimitry Andric Carry OddCarry; 380981ad6265SDimitry Andric 381081ad6265SDimitry Andric for (unsigned i = 0; i <= Accum.size() / 2; ++i) { 381181ad6265SDimitry Andric Carry OddCarryIn = std::move(OddCarry); 381281ad6265SDimitry Andric Carry EvenCarryIn = std::move(EvenCarry); 381381ad6265SDimitry Andric OddCarry.clear(); 381481ad6265SDimitry Andric EvenCarry.clear(); 381581ad6265SDimitry Andric 381681ad6265SDimitry Andric // Partial products at offset 2 * i. 
381781ad6265SDimitry Andric if (2 * i < Accum.size()) { 381881ad6265SDimitry Andric auto LocalAccum = Accum.drop_front(2 * i).take_front(2); 381981ad6265SDimitry Andric EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn); 382081ad6265SDimitry Andric } 382181ad6265SDimitry Andric 382281ad6265SDimitry Andric // Partial products at offset 2 * i - 1. 382381ad6265SDimitry Andric if (i > 0) { 382481ad6265SDimitry Andric if (!SeparateOddAlignedProducts) { 382581ad6265SDimitry Andric auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2); 382681ad6265SDimitry Andric OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn); 382781ad6265SDimitry Andric } else { 382881ad6265SDimitry Andric bool IsHighest = 2 * i >= Accum.size(); 382981ad6265SDimitry Andric Register SeparateOddOut[2]; 3830bdd1243dSDimitry Andric auto LocalAccum = MutableArrayRef(SeparateOddOut) 383181ad6265SDimitry Andric .take_front(IsHighest ? 1 : 2); 383281ad6265SDimitry Andric OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn); 383381ad6265SDimitry Andric 383481ad6265SDimitry Andric MachineInstr *Lo; 383581ad6265SDimitry Andric 383681ad6265SDimitry Andric if (i == 1) { 383781ad6265SDimitry Andric if (!IsHighest) 383881ad6265SDimitry Andric Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]); 383981ad6265SDimitry Andric else 384081ad6265SDimitry Andric Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]); 384181ad6265SDimitry Andric } else { 384281ad6265SDimitry Andric Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0], 384381ad6265SDimitry Andric SeparateOddCarry); 384481ad6265SDimitry Andric } 384581ad6265SDimitry Andric Accum[2 * i - 1] = Lo->getOperand(0).getReg(); 384681ad6265SDimitry Andric 384781ad6265SDimitry Andric if (!IsHighest) { 384881ad6265SDimitry Andric auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1], 384981ad6265SDimitry Andric Lo->getOperand(1).getReg()); 385081ad6265SDimitry Andric Accum[2 * i] = Hi.getReg(0); 
385181ad6265SDimitry Andric SeparateOddCarry = Hi.getReg(1); 385281ad6265SDimitry Andric } 385381ad6265SDimitry Andric } 385481ad6265SDimitry Andric } 385581ad6265SDimitry Andric 385681ad6265SDimitry Andric // Add in the carries from the previous iteration 385781ad6265SDimitry Andric if (i > 0) { 385881ad6265SDimitry Andric if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn)) 385981ad6265SDimitry Andric EvenCarryIn.push_back(CarryOut); 386081ad6265SDimitry Andric 386181ad6265SDimitry Andric if (2 * i < Accum.size()) { 386281ad6265SDimitry Andric if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn)) 386381ad6265SDimitry Andric OddCarry.push_back(CarryOut); 386481ad6265SDimitry Andric } 386581ad6265SDimitry Andric } 386681ad6265SDimitry Andric } 386781ad6265SDimitry Andric } 386881ad6265SDimitry Andric 386981ad6265SDimitry Andric // Custom narrowing of wide multiplies using wide multiply-add instructions. 387081ad6265SDimitry Andric // 387181ad6265SDimitry Andric // TODO: If the multiply is followed by an addition, we should attempt to 387281ad6265SDimitry Andric // integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities. 
387381ad6265SDimitry Andric bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper, 387481ad6265SDimitry Andric MachineInstr &MI) const { 387581ad6265SDimitry Andric assert(ST.hasMad64_32()); 387681ad6265SDimitry Andric assert(MI.getOpcode() == TargetOpcode::G_MUL); 387781ad6265SDimitry Andric 387881ad6265SDimitry Andric MachineIRBuilder &B = Helper.MIRBuilder; 387981ad6265SDimitry Andric MachineRegisterInfo &MRI = *B.getMRI(); 388081ad6265SDimitry Andric 388181ad6265SDimitry Andric Register DstReg = MI.getOperand(0).getReg(); 388281ad6265SDimitry Andric Register Src0 = MI.getOperand(1).getReg(); 388381ad6265SDimitry Andric Register Src1 = MI.getOperand(2).getReg(); 388481ad6265SDimitry Andric 388581ad6265SDimitry Andric LLT Ty = MRI.getType(DstReg); 388681ad6265SDimitry Andric assert(Ty.isScalar()); 388781ad6265SDimitry Andric 388881ad6265SDimitry Andric unsigned Size = Ty.getSizeInBits(); 388981ad6265SDimitry Andric unsigned NumParts = Size / 32; 389081ad6265SDimitry Andric assert((Size % 32) == 0); 389181ad6265SDimitry Andric assert(NumParts >= 2); 389281ad6265SDimitry Andric 389381ad6265SDimitry Andric // Whether to use MAD_64_32 for partial products whose high half is 389481ad6265SDimitry Andric // discarded. This avoids some ADD instructions but risks false dependency 389581ad6265SDimitry Andric // stalls on some subtargets in some cases. 389681ad6265SDimitry Andric const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10; 389781ad6265SDimitry Andric 389881ad6265SDimitry Andric // Whether to compute odd-aligned partial products separately. This is 389981ad6265SDimitry Andric // advisable on subtargets where the accumulator of MAD_64_32 must be placed 390081ad6265SDimitry Andric // in an even-aligned VGPR. 
390181ad6265SDimitry Andric const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops(); 390281ad6265SDimitry Andric 390381ad6265SDimitry Andric LLT S32 = LLT::scalar(32); 390481ad6265SDimitry Andric SmallVector<Register, 2> Src0Parts, Src1Parts; 390581ad6265SDimitry Andric for (unsigned i = 0; i < NumParts; ++i) { 390681ad6265SDimitry Andric Src0Parts.push_back(MRI.createGenericVirtualRegister(S32)); 390781ad6265SDimitry Andric Src1Parts.push_back(MRI.createGenericVirtualRegister(S32)); 390881ad6265SDimitry Andric } 390981ad6265SDimitry Andric B.buildUnmerge(Src0Parts, Src0); 391081ad6265SDimitry Andric B.buildUnmerge(Src1Parts, Src1); 391181ad6265SDimitry Andric 391281ad6265SDimitry Andric SmallVector<Register, 2> AccumRegs(NumParts); 391381ad6265SDimitry Andric buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32, 391481ad6265SDimitry Andric SeparateOddAlignedProducts); 391581ad6265SDimitry Andric 3916bdd1243dSDimitry Andric B.buildMergeLikeInstr(DstReg, AccumRegs); 391781ad6265SDimitry Andric MI.eraseFromParent(); 391881ad6265SDimitry Andric return true; 391981ad6265SDimitry Andric } 392081ad6265SDimitry Andric 3921349cc55cSDimitry Andric // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to 3922349cc55cSDimitry Andric // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input 3923349cc55cSDimitry Andric // case with a single min instruction instead of a compare+select. 
3924349cc55cSDimitry Andric bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI, 3925349cc55cSDimitry Andric MachineRegisterInfo &MRI, 3926349cc55cSDimitry Andric MachineIRBuilder &B) const { 3927349cc55cSDimitry Andric Register Dst = MI.getOperand(0).getReg(); 3928349cc55cSDimitry Andric Register Src = MI.getOperand(1).getReg(); 3929349cc55cSDimitry Andric LLT DstTy = MRI.getType(Dst); 3930349cc55cSDimitry Andric LLT SrcTy = MRI.getType(Src); 3931349cc55cSDimitry Andric 3932349cc55cSDimitry Andric unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ 3933349cc55cSDimitry Andric ? AMDGPU::G_AMDGPU_FFBH_U32 3934349cc55cSDimitry Andric : AMDGPU::G_AMDGPU_FFBL_B32; 3935349cc55cSDimitry Andric auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src}); 3936349cc55cSDimitry Andric B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits())); 3937349cc55cSDimitry Andric 3938349cc55cSDimitry Andric MI.eraseFromParent(); 3939349cc55cSDimitry Andric return true; 3940349cc55cSDimitry Andric } 3941349cc55cSDimitry Andric 3942e8d8bef9SDimitry Andric // Check that this is a G_XOR x, -1 3943e8d8bef9SDimitry Andric static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 3944e8d8bef9SDimitry Andric if (MI.getOpcode() != TargetOpcode::G_XOR) 3945e8d8bef9SDimitry Andric return false; 3946349cc55cSDimitry Andric auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 3947e8d8bef9SDimitry Andric return ConstVal && *ConstVal == -1; 3948e8d8bef9SDimitry Andric } 3949e8d8bef9SDimitry Andric 39500b57cec5SDimitry Andric // Return the use branch instruction, otherwise null if the usage is invalid. 
// Walks the single use of a control-flow intrinsic's condition def and
// validates the expected brcond(+br) shape. On success returns the G_BRCOND
// user; \p Br is set to the trailing G_BR (if any), \p UncondBrTarget to the
// fallthrough/branch destination, and \p Negated to true when an intervening
// G_XOR x, -1 was folded away.
static MachineInstr *
verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
                  MachineBasicBlock *&UncondBrTarget, bool &Negated) {
  Register CondDef = MI.getOperand(0).getReg();
  // The condition must feed exactly one (non-debug) user.
  if (!MRI.hasOneNonDBGUse(CondDef))
    return nullptr;

  MachineBasicBlock *Parent = MI.getParent();
  MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);

  // Look through a logical not (G_XOR x, -1) of the condition.
  if (isNot(MRI, *UseMI)) {
    Register NegatedCond = UseMI->getOperand(0).getReg();
    if (!MRI.hasOneNonDBGUse(NegatedCond))
      return nullptr;

    // We're deleting the def of this value, so we need to remove it.
    eraseInstr(*UseMI, MRI);

    UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
    Negated = true;
  }

  // The (possibly negated) condition must branch in the same block.
  if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
    return nullptr;

  // Make sure the cond br is followed by a G_BR, or is the last instruction.
  MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
  if (Next == Parent->end()) {
    MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
    if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
      return nullptr;
    UncondBrTarget = &*NextMBB;
  } else {
    if (Next->getOpcode() != AMDGPU::G_BR)
      return nullptr;
    Br = &*Next;
    UncondBrTarget = Br->getOperand(0).getMBB();
  }

  return UseMI;
}

// Copy the physical-register live-in described by \p Arg into \p DstReg,
// applying the argument's shift/mask when it is packed with other values.
bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg,
                                         const TargetRegisterClass *ArgRC,
                                         LLT ArgTy) const {
  MCRegister SrcReg = Arg->getRegister();
  assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
  assert(DstReg.isVirtual() && "Virtual register expected");

  Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
                                             *ArgRC, B.getDebugLoc(), ArgTy);
  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
    const LLT S32 = LLT::scalar(32);
    const unsigned Mask = Arg->getMask();
    // Shift amount = position of the field's lowest set mask bit.
    const unsigned Shift = llvm::countr_zero<unsigned>(Mask);

    Register AndMaskSrc = LiveIn;

    // TODO: Avoid clearing the high bits if we know workitem id y/z are always
    // 0.
    if (Shift != 0) {
      auto ShiftAmt = B.buildConstant(S32, Shift);
      AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
    }

    B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
  } else {
    B.buildCopy(DstReg, LiveIn);
  }

  return true;
}

// Resolve a preloaded-value kind to its argument descriptor and load it into
// \p DstReg. Emits null/undef for intrinsics whose argument was not preloaded.
bool AMDGPULegalizerInfo::loadInputValue(
    Register DstReg, MachineIRBuilder &B,
    AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  const ArgDescriptor *Arg;
  const TargetRegisterClass *ArgRC;
  LLT ArgTy;
  std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);

  if (!Arg) {
    if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
      // The intrinsic may appear when we have a 0 sized kernarg segment, in which
      // case the pointer argument may be missing and we use null.
      B.buildConstant(DstReg, 0);
      return true;
    }

    // It's undefined behavior if a function marked with the amdgpu-no-*
    // attributes uses the corresponding intrinsic.
    B.buildUndef(DstReg);
    return true;
  }

  if (!Arg->isRegister() || !Arg->getRegister().isValid())
    return false; // TODO: Handle these
  return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
}

// Replace a preloaded-argument intrinsic with the loaded input value and erase
// the original instruction on success.
bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
    AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
    return false;

  MI.eraseFromParent();
  return true;
}

// Replace \p MI's single def with the constant \p C and erase it.
static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
                                int64_t C) {
  B.buildConstant(MI.getOperand(0).getReg(), C);
  MI.eraseFromParent();
  return true;
}

// Lower a workitem.id.{x,y,z} intrinsic: fold to 0 when the dimension's max ID
// is 0, emit undef when the argument was elided by amdgpu-no-* attributes, and
// otherwise load the value (with an AssertZExt range hint for unpacked IDs).
bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
    unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
  if (MaxID == 0)
    return replaceWithConstant(B, MI, 0);

  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  const ArgDescriptor *Arg;
  const TargetRegisterClass *ArgRC;
  LLT ArgTy;
  std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);

  Register DstReg = MI.getOperand(0).getReg();
  if (!Arg) {
    // It's undefined behavior if a function marked with the amdgpu-no-*
    // attributes uses the corresponding intrinsic.
    B.buildUndef(DstReg);
    MI.eraseFromParent();
    return true;
  }

  if (Arg->isMasked()) {
    // Don't bother inserting AssertZext for packed IDs since we're emitting the
    // masking operations anyway.
    //
    // TODO: We could assert the top bit is 0 for the source copy.
    if (!loadInputValue(DstReg, B, ArgType))
      return false;
  } else {
    Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
    if (!loadInputValue(TmpReg, B, ArgType))
      return false;
    B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID));
  }

  MI.eraseFromParent();
  return true;
}

// Materialize a pointer to byte \p Offset within the kernarg segment.
Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
                                                     int64_t Offset) const {
  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);

  // TODO: If we passed in the base kernel offset we could have a better
  // alignment than 4, but we don't really need it.
  if (!loadInputValue(KernArgReg, B,
                      AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
    llvm_unreachable("failed to find kernarg segment ptr");

  auto COffset = B.buildConstant(LLT::scalar(64), Offset);
  // TODO: Should get nuw
  return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
}

/// Legalize a value that's loaded from kernel arguments. This is only used by
/// legacy intrinsics.
412981ad6265SDimitry Andric bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI, 413081ad6265SDimitry Andric MachineIRBuilder &B, 413181ad6265SDimitry Andric uint64_t Offset, 413281ad6265SDimitry Andric Align Alignment) const { 413381ad6265SDimitry Andric Register DstReg = MI.getOperand(0).getReg(); 413481ad6265SDimitry Andric 413581ad6265SDimitry Andric assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) && 413681ad6265SDimitry Andric "unexpected kernarg parameter type"); 413781ad6265SDimitry Andric 413881ad6265SDimitry Andric Register Ptr = getKernargParameterPtr(B, Offset); 413981ad6265SDimitry Andric MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 414081ad6265SDimitry Andric B.buildLoad(DstReg, Ptr, PtrInfo, Align(4), 414181ad6265SDimitry Andric MachineMemOperand::MODereferenceable | 414281ad6265SDimitry Andric MachineMemOperand::MOInvariant); 414381ad6265SDimitry Andric MI.eraseFromParent(); 414481ad6265SDimitry Andric return true; 414581ad6265SDimitry Andric } 414681ad6265SDimitry Andric 41478bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 41488bcb0991SDimitry Andric MachineRegisterInfo &MRI, 41498bcb0991SDimitry Andric MachineIRBuilder &B) const { 4150480093f4SDimitry Andric Register Dst = MI.getOperand(0).getReg(); 4151480093f4SDimitry Andric LLT DstTy = MRI.getType(Dst); 4152480093f4SDimitry Andric LLT S16 = LLT::scalar(16); 4153480093f4SDimitry Andric LLT S32 = LLT::scalar(32); 4154480093f4SDimitry Andric LLT S64 = LLT::scalar(64); 41558bcb0991SDimitry Andric 4156480093f4SDimitry Andric if (DstTy == S16) 4157480093f4SDimitry Andric return legalizeFDIV16(MI, MRI, B); 4158480093f4SDimitry Andric if (DstTy == S32) 4159480093f4SDimitry Andric return legalizeFDIV32(MI, MRI, B); 4160480093f4SDimitry Andric if (DstTy == S64) 4161480093f4SDimitry Andric return legalizeFDIV64(MI, MRI, B); 4162480093f4SDimitry Andric 41638bcb0991SDimitry Andric return false; 41648bcb0991SDimitry Andric } 
41658bcb0991SDimitry Andric 4166fe6060f1SDimitry Andric void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, 4167fe6060f1SDimitry Andric Register DstDivReg, 4168fe6060f1SDimitry Andric Register DstRemReg, 41695ffd83dbSDimitry Andric Register X, 4170fe6060f1SDimitry Andric Register Y) const { 41715ffd83dbSDimitry Andric const LLT S1 = LLT::scalar(1); 41725ffd83dbSDimitry Andric const LLT S32 = LLT::scalar(32); 41735ffd83dbSDimitry Andric 41745ffd83dbSDimitry Andric // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 41755ffd83dbSDimitry Andric // algorithm used here. 41765ffd83dbSDimitry Andric 41775ffd83dbSDimitry Andric // Initial estimate of inv(y). 41785ffd83dbSDimitry Andric auto FloatY = B.buildUITOFP(S32, Y); 41795ffd83dbSDimitry Andric auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 418006c3fb27SDimitry Andric auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe)); 41815ffd83dbSDimitry Andric auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 41825ffd83dbSDimitry Andric auto Z = B.buildFPTOUI(S32, ScaledY); 41835ffd83dbSDimitry Andric 41845ffd83dbSDimitry Andric // One round of UNR. 41855ffd83dbSDimitry Andric auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 41865ffd83dbSDimitry Andric auto NegYZ = B.buildMul(S32, NegY, Z); 41875ffd83dbSDimitry Andric Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 41885ffd83dbSDimitry Andric 41895ffd83dbSDimitry Andric // Quotient/remainder estimate. 41905ffd83dbSDimitry Andric auto Q = B.buildUMulH(S32, X, Z); 41915ffd83dbSDimitry Andric auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 41925ffd83dbSDimitry Andric 41935ffd83dbSDimitry Andric // First quotient/remainder refinement. 
41945ffd83dbSDimitry Andric auto One = B.buildConstant(S32, 1); 41955ffd83dbSDimitry Andric auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 4196fe6060f1SDimitry Andric if (DstDivReg) 41975ffd83dbSDimitry Andric Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 41985ffd83dbSDimitry Andric R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 41995ffd83dbSDimitry Andric 42005ffd83dbSDimitry Andric // Second quotient/remainder refinement. 42015ffd83dbSDimitry Andric Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 4202fe6060f1SDimitry Andric if (DstDivReg) 4203fe6060f1SDimitry Andric B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q); 42045ffd83dbSDimitry Andric 4205fe6060f1SDimitry Andric if (DstRemReg) 4206fe6060f1SDimitry Andric B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R); 42075ffd83dbSDimitry Andric } 42085ffd83dbSDimitry Andric 4209349cc55cSDimitry Andric // Build integer reciprocal sequence around V_RCP_IFLAG_F32 42105ffd83dbSDimitry Andric // 42115ffd83dbSDimitry Andric // Return lo, hi of result 42125ffd83dbSDimitry Andric // 42135ffd83dbSDimitry Andric // %cvt.lo = G_UITOFP Val.lo 42145ffd83dbSDimitry Andric // %cvt.hi = G_UITOFP Val.hi 42155ffd83dbSDimitry Andric // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 42165ffd83dbSDimitry Andric // %rcp = G_AMDGPU_RCP_IFLAG %mad 42175ffd83dbSDimitry Andric // %mul1 = G_FMUL %rcp, 0x5f7ffffc 42185ffd83dbSDimitry Andric // %mul2 = G_FMUL %mul1, 2**(-32) 42195ffd83dbSDimitry Andric // %trunc = G_INTRINSIC_TRUNC %mul2 42205ffd83dbSDimitry Andric // %mad2 = G_FMAD %trunc, -(2**32), %mul1 42215ffd83dbSDimitry Andric // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 42225ffd83dbSDimitry Andric static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 42235ffd83dbSDimitry Andric Register Val) { 42245ffd83dbSDimitry Andric const LLT S32 = LLT::scalar(32); 42255ffd83dbSDimitry Andric auto Unmerge = B.buildUnmerge(S32, Val); 42265ffd83dbSDimitry Andric 42275ffd83dbSDimitry Andric auto 
CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 42285ffd83dbSDimitry Andric auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 42295ffd83dbSDimitry Andric 423006c3fb27SDimitry Andric auto Mad = B.buildFMAD( 423106c3fb27SDimitry Andric S32, CvtHi, // 2**32 423206c3fb27SDimitry Andric B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo); 42335ffd83dbSDimitry Andric 42345ffd83dbSDimitry Andric auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 423506c3fb27SDimitry Andric auto Mul1 = B.buildFMul( 423606c3fb27SDimitry Andric S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc))); 42375ffd83dbSDimitry Andric 42385ffd83dbSDimitry Andric // 2**(-32) 423906c3fb27SDimitry Andric auto Mul2 = B.buildFMul( 424006c3fb27SDimitry Andric S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000))); 42415ffd83dbSDimitry Andric auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 42425ffd83dbSDimitry Andric 42435ffd83dbSDimitry Andric // -(2**32) 424406c3fb27SDimitry Andric auto Mad2 = B.buildFMAD( 424506c3fb27SDimitry Andric S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)), 424606c3fb27SDimitry Andric Mul1); 42475ffd83dbSDimitry Andric 42485ffd83dbSDimitry Andric auto ResultLo = B.buildFPTOUI(S32, Mad2); 42495ffd83dbSDimitry Andric auto ResultHi = B.buildFPTOUI(S32, Trunc); 42505ffd83dbSDimitry Andric 42515ffd83dbSDimitry Andric return {ResultLo.getReg(0), ResultHi.getReg(0)}; 42525ffd83dbSDimitry Andric } 42535ffd83dbSDimitry Andric 4254fe6060f1SDimitry Andric void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, 4255fe6060f1SDimitry Andric Register DstDivReg, 4256fe6060f1SDimitry Andric Register DstRemReg, 42575ffd83dbSDimitry Andric Register Numer, 4258fe6060f1SDimitry Andric Register Denom) const { 42595ffd83dbSDimitry Andric const LLT S32 = LLT::scalar(32); 42605ffd83dbSDimitry Andric const LLT S64 = LLT::scalar(64); 42615ffd83dbSDimitry Andric const LLT S1 = LLT::scalar(1); 
42625ffd83dbSDimitry Andric Register RcpLo, RcpHi; 42635ffd83dbSDimitry Andric 42645ffd83dbSDimitry Andric std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 42655ffd83dbSDimitry Andric 4266bdd1243dSDimitry Andric auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi}); 42675ffd83dbSDimitry Andric 42685ffd83dbSDimitry Andric auto Zero64 = B.buildConstant(S64, 0); 42695ffd83dbSDimitry Andric auto NegDenom = B.buildSub(S64, Zero64, Denom); 42705ffd83dbSDimitry Andric 42715ffd83dbSDimitry Andric auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 42725ffd83dbSDimitry Andric auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 42735ffd83dbSDimitry Andric 42745ffd83dbSDimitry Andric auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 42755ffd83dbSDimitry Andric Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 42765ffd83dbSDimitry Andric Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 42775ffd83dbSDimitry Andric 42785ffd83dbSDimitry Andric auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 42795ffd83dbSDimitry Andric auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 4280bdd1243dSDimitry Andric auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi}); 42815ffd83dbSDimitry Andric 42825ffd83dbSDimitry Andric auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 42835ffd83dbSDimitry Andric auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 42845ffd83dbSDimitry Andric auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 42855ffd83dbSDimitry Andric Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 42865ffd83dbSDimitry Andric Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 42875ffd83dbSDimitry Andric 42885ffd83dbSDimitry Andric auto Zero32 = B.buildConstant(S32, 0); 42895ffd83dbSDimitry Andric auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 4290349cc55cSDimitry Andric auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1)); 4291bdd1243dSDimitry Andric auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi}); 42925ffd83dbSDimitry Andric 
42935ffd83dbSDimitry Andric auto UnmergeNumer = B.buildUnmerge(S32, Numer); 42945ffd83dbSDimitry Andric Register NumerLo = UnmergeNumer.getReg(0); 42955ffd83dbSDimitry Andric Register NumerHi = UnmergeNumer.getReg(1); 42965ffd83dbSDimitry Andric 42975ffd83dbSDimitry Andric auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 42985ffd83dbSDimitry Andric auto Mul3 = B.buildMul(S64, Denom, MulHi3); 42995ffd83dbSDimitry Andric auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 43005ffd83dbSDimitry Andric Register Mul3_Lo = UnmergeMul3.getReg(0); 43015ffd83dbSDimitry Andric Register Mul3_Hi = UnmergeMul3.getReg(1); 43025ffd83dbSDimitry Andric auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 43035ffd83dbSDimitry Andric auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 43045ffd83dbSDimitry Andric auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 4305bdd1243dSDimitry Andric auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi}); 43065ffd83dbSDimitry Andric 43075ffd83dbSDimitry Andric auto UnmergeDenom = B.buildUnmerge(S32, Denom); 43085ffd83dbSDimitry Andric Register DenomLo = UnmergeDenom.getReg(0); 43095ffd83dbSDimitry Andric Register DenomHi = UnmergeDenom.getReg(1); 43105ffd83dbSDimitry Andric 43115ffd83dbSDimitry Andric auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 43125ffd83dbSDimitry Andric auto C1 = B.buildSExt(S32, CmpHi); 43135ffd83dbSDimitry Andric 43145ffd83dbSDimitry Andric auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 43155ffd83dbSDimitry Andric auto C2 = B.buildSExt(S32, CmpLo); 43165ffd83dbSDimitry Andric 43175ffd83dbSDimitry Andric auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 43185ffd83dbSDimitry Andric auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 43195ffd83dbSDimitry Andric 43205ffd83dbSDimitry Andric // TODO: Here and below portions of the code can be enclosed into if/endif. 
  // (Tail of the 64-bit unsigned div/rem expansion; the quotient estimate
  // MulHi3 and first correction Sub1 were computed above this chunk.)
  // Currently control flow is unconditional and we have 4 selects after
  // potential endif to substitute PHIs.

  // Second conditional refinement step: subtract the denominator once more
  // and bump the quotient, selected in later via C3/C6 masks.
  // if C3 != 0 ...
  auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
  auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
  auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
  auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});

  auto One64 = B.buildConstant(S64, 1);
  auto Add3 = B.buildAdd(S64, MulHi3, One64);

  // C4/C5/C6 form a 64-bit unsigned compare (Sub2 >= Denom) out of 32-bit
  // pieces: compare high halves, fall back to low halves on equality.
  auto C4 =
      B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
  auto C5 =
      B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
  auto C6 = B.buildSelect(
      S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);

  // Third refinement step, only meaningful when C6 says the remainder still
  // exceeds the denominator.
  // if (C6 != 0)
  auto Add4 = B.buildAdd(S64, Add3, One64);
  auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);

  auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
  auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
  auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});

  // endif C6
  // endif C3

  // Materialize the requested results (either register may be invalid when
  // only a quotient or only a remainder was asked for).
  if (DstDivReg) {
    auto Sel1 = B.buildSelect(
        S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
    B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
                  Sel1, MulHi3);
  }

  if (DstRemReg) {
    auto Sel2 = B.buildSelect(
        S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
    B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
                  Sel2, Sub1);
  }
}

/// Legalize G_UDIV / G_UREM / G_UDIVREM by expanding to the 32-bit or 64-bit
/// unsigned expansion helpers. Returns false (no change) for any other scalar
/// width.
bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
                                                  MachineRegisterInfo &MRI,
                                                  MachineIRBuilder &B) const {
  // Decide which results the opcode produces; the unused register stays
  // invalid and the impl helpers skip it.
  Register DstDivReg, DstRemReg;
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unexpected opcode!");
  case AMDGPU::G_UDIV: {
    DstDivReg = MI.getOperand(0).getReg();
    break;
  }
  case AMDGPU::G_UREM: {
    DstRemReg = MI.getOperand(0).getReg();
    break;
  }
  case AMDGPU::G_UDIVREM: {
    DstDivReg = MI.getOperand(0).getReg();
    DstRemReg = MI.getOperand(1).getReg();
    break;
  }
  }

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);
  // G_UDIVREM has two defs, so index the sources after all explicit defs
  // rather than hard-coding operand numbers.
  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
  Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
  Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());

  if (Ty == S32)
    legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
  else if (Ty == S64)
    legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
  else
    return false;

  MI.eraseFromParent();
  return true;
}

/// Legalize G_SDIV / G_SREM / G_SDIVREM by taking absolute values, reusing the
/// unsigned expansion, then fixing up the result signs.
bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
                                                MachineRegisterInfo &MRI,
                                                MachineIRBuilder &B) const {
  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  if (Ty != S32 && Ty != S64)
    return false;

  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
  Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
  Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();

  // Arithmetic-shift by (width - 1) yields an all-ones/all-zeros sign mask.
  auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
  auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
  auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
  // abs(x) computed branchlessly as (x + sign_mask) ^ sign_mask.
  LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);

  LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);

  // Temporaries receive the unsigned results before the sign fix-up; only the
  // registers the opcode actually defines are created/used.
  Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unexpected opcode!");
  case AMDGPU::G_SDIV: {
    DstDivReg = MI.getOperand(0).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);
    break;
  }
  case AMDGPU::G_SREM: {
    DstRemReg = MI.getOperand(0).getReg();
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);
    break;
  }
  case AMDGPU::G_SDIVREM: {
    DstDivReg = MI.getOperand(0).getReg();
    DstRemReg = MI.getOperand(1).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);
    break;
  }
  }

  if (Ty == S32)
    legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
  else
    legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);

  if (DstDivReg) {
    // Quotient sign is sign(LHS) ^ sign(RHS); negate-if-negative is
    // (x ^ sign) - sign.
    auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
    auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
    B.buildSub(DstDivReg, SignXor, Sign);
  }

  if (DstRemReg) {
    auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
    auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
    B.buildSub(DstRemReg, SignXor, Sign);
  }

  MI.eraseFromParent();
  return true;
}

/// Try to lower an fdiv to a multiply by reciprocal (v_rcp) when fast-math
/// flags or UnsafeFPMath permit the precision loss. Returns false when the
/// strict lowering must be used instead.
bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
  uint16_t Flags = MI.getFlags();
  LLT ResTy = MRI.getType(Res);

  const MachineFunction &MF = B.getMF();
  // 'afn' on the instruction or global UnsafeFPMath licenses the approximate
  // reciprocal path.
  bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn) ||
                            MF.getTarget().Options.UnsafeFPMath;

  if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
    // f16 rcp is precise enough to use unconditionally for +/-1.0 / x; other
    // types still need the unsafe-math opt-in.
    if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
      return false;

    // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
    // the CI documentation has a worst case error of 1 ulp.
    // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
    // use it as long as we aren't trying to use denormals.
    //
    // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.

    // 1 / x -> RCP(x)
    if (CLHS->isExactlyValue(1.0)) {
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
          .addUse(RHS)
          .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }

    // TODO: Match rsq

    // -1 / x -> RCP( FNEG(x) )
    if (CLHS->isExactlyValue(-1.0)) {
      auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
          .addUse(FNeg.getReg(0))
          .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }
  }

  // For f16 require arcp only.
  // For f32 require afn+arcp.
  if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
                              !MI.getFlag(MachineInstr::FmArcp)))
    return false;

  // x / y -> x * (1.0 / y)
  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
                 .addUse(RHS)
                 .setMIFlags(Flags);
  B.buildFMul(Res, LHS, RCP, Flags);

  MI.eraseFromParent();
  return true;
}

/// Fast lowering for f64 fdiv: one v_rcp estimate refined with two
/// Newton-Raphson FMA iterations plus a final residual correction. Only used
/// when 'afn' or UnsafeFPMath allows the precision loss.
bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
                                                   MachineRegisterInfo &MRI,
                                                   MachineIRBuilder &B) const {
  Register Res = MI.getOperand(0).getReg();
  Register X = MI.getOperand(1).getReg();
  Register Y = MI.getOperand(2).getReg();
  uint16_t Flags = MI.getFlags();
  LLT ResTy = MRI.getType(Res);

  const MachineFunction &MF = B.getMF();
  bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
                            MI.getFlag(MachineInstr::FmAfn);

  if (!AllowInaccurateRcp)
    return false;

  auto NegY = B.buildFNeg(ResTy, Y);
  auto One = B.buildFConstant(ResTy, 1.0);

  // Initial reciprocal estimate R ~= 1/Y.
  auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
               .addUse(Y)
               .setMIFlags(Flags);
  // Two Newton-Raphson iterations: R = R * (2 - Y*R), expressed with FMAs.
  auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp0, R, R);

  auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp1, R, R);

  // Ret = X * R, then correct with the residual Tmp2 = X - Y*Ret.
  auto Ret = B.buildFMul(ResTy, X, R);
  auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);

  B.buildFMA(Res, Tmp2, R, Ret);
  MI.eraseFromParent();
  return true;
}

/// Legalize f16 G_FDIV: compute in f32 via rcp, truncate, then let
/// v_div_fixup handle special cases (after first trying the fast path).
bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  if (legalizeFastUnsafeFDIV(MI, MRI, B))
    return true;

  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S16 = LLT::scalar(16);
  LLT S32 = LLT::scalar(32);

  // Widen both operands to f32 for the reciprocal multiply.
  auto LHSExt = B.buildFPExt(S32, LHS, Flags);
  auto RHSExt = B.buildFPExt(S32, RHS, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
                 .addUse(RHSExt.getReg(0))
                 .setMIFlags(Flags);

  auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
  auto RDst = B.buildFPTrunc(S16, QUOT, Flags);

  // div_fixup takes (quotient estimate, denominator, numerator).
  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
      .addUse(RDst.getReg(0))
      .addUse(RHS)
      .addUse(LHS)
      .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}

// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
// to enable denorm mode. When 'Enable' is false, disable denorm mode.
static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B,
                               const GCNSubtarget &ST,
                               SIModeRegisterDefaults Mode) {
  // Set SP denorm mode to this value.
  unsigned SPDenormMode =
      Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();

  if (ST.hasDenormModeInst()) {
    // Preserve default FP64FP16 denorm mode while updating FP32 mode.
    uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();

    // S_DENORM_MODE immediate packs FP32 in bits [1:0] and FP64/FP16 in [3:2].
    uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
    B.buildInstr(AMDGPU::S_DENORM_MODE)
        .addImm(NewDenormModeValue);

  } else {
    // Select FP32 bit field in mode register.
    // Older targets: write the 2-bit FP32 denorm field (offset 4, width 2)
    // of the MODE register via S_SETREG.
    unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
                                    (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
                                    (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);

    B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
        .addImm(SPDenormMode)
        .addImm(SPDenormModeBitField);
  }
}

/// Legalize f32 G_FDIV with the full-precision div_scale / div_fmas /
/// div_fixup sequence, temporarily forcing FP32 denormals on around the FMA
/// chain when the function's mode flushes them.
bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  if (legalizeFastUnsafeFDIV(MI, MRI, B))
    return true;

  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  SIModeRegisterDefaults Mode = MFI->getMode();

  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S32, 1.0f);

  // div_scale pre-scales numerator/denominator to avoid overflow/underflow in
  // the iteration; the final imm selects which operand the result tracks.
  auto DenominatorScaled =
      B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
          .addUse(LHS)
          .addUse(RHS)
          .addImm(0)
          .setMIFlags(Flags);
  auto NumeratorScaled =
      B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
          .addUse(LHS)
          .addUse(RHS)
          .addImm(1)
          .setMIFlags(Flags);

  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
                       .addUse(DenominatorScaled.getReg(0))
                       .setMIFlags(Flags);
  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);

  // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
  // aren't modeled as reading it.
  if (Mode.FP32Denormals != DenormalMode::getIEEE())
    toggleSPDenormMode(true, B, ST, Mode);

  // Newton-Raphson refinement of the reciprocal, then quotient estimate and
  // two residual corrections.
  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

  // FIXME: This mishandles dynamic denormal mode. We need to query the
  // current mode and restore the original.
  // Restore the function's default FP32 denorm mode if we changed it above.
  if (Mode.FP32Denormals != DenormalMode::getIEEE())
    toggleSPDenormMode(false, B, ST, Mode);

  // div_fmas merges the final correction using the scale predicate produced
  // by div_scale (operand 4).
  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
                  .addUse(Fma4.getReg(0))
                  .addUse(Fma1.getReg(0))
                  .addUse(Fma3.getReg(0))
                  .addUse(NumeratorScaled.getReg(1))
                  .setMIFlags(Flags);

  // div_fixup takes (quotient estimate, denominator, numerator).
  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
      .addUse(Fmas.getReg(0))
      .addUse(RHS)
      .addUse(LHS)
      .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}

/// Legalize f64 G_FDIV with the full-precision div_scale / div_fmas /
/// div_fixup sequence (after first trying the fast approximate path).
bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  if (legalizeFastUnsafeFDIV64(MI, MRI, B))
    return true;

  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S64 = LLT::scalar(64);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S64, 1.0);

  // Scaled denominator (imm 0 selects the denominator-tracking result).
  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
                       .addUse(LHS)
                       .addUse(RHS)
                       .addImm(0)
                       .setMIFlags(Flags);

  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);

  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
                 .addUse(DivScale0.getReg(0))
                 .setMIFlags(Flags);

  // Two refinement steps on the reciprocal estimate.
  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);

  // Scaled numerator (imm 1).
  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
                       .addUse(LHS)
                       .addUse(RHS)
                       .addImm(1)
                       .setMIFlags(Flags);

  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);

  Register Scale;
  if (!ST.hasUsableDivScaleConditionOutput()) {
    // Workaround a hardware bug on SI where the condition output from div_scale
    // is not usable.
    // Reconstruct the scale predicate in software: the chosen operand was
    // scaled iff its high word differs from the corresponding div_scale
    // result's high word; XOR of the two per-operand comparisons matches the
    // hardware predicate.
    LLT S32 = LLT::scalar(32);

    auto NumUnmerge = B.buildUnmerge(S32, LHS);
    auto DenUnmerge = B.buildUnmerge(S32, RHS);
    auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
    auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);

    auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
                              Scale1Unmerge.getReg(1));
    auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
                              Scale0Unmerge.getReg(1));
    Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
  } else {
    Scale = DivScale1.getReg(1);
  }

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
                  .addUse(Fma4.getReg(0))
                  .addUse(Fma3.getReg(0))
                  .addUse(Mul.getReg(0))
                  .addUse(Scale)
                  .setMIFlags(Flags);

  // div_fixup takes (quotient estimate, denominator, numerator).
  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res), false)
      .addUse(Fmas.getReg(0))
      .addUse(RHS)
      .addUse(LHS)
      .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}

/// Legalize G_FFREXP using v_frexp_mant / v_frexp_exp, patching the
/// non-finite-input behavior on subtargets with the fract bug.
bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  Register Res0 = MI.getOperand(0).getReg();
  Register Res1 = MI.getOperand(1).getReg();
  Register Val = MI.getOperand(2).getReg();
  uint16_t Flags = MI.getFlags();

  LLT Ty = MRI.getType(Res0);
  // The hardware exponent result is i16 for f16 inputs, i32 otherwise.
  LLT InstrExpTy = Ty == LLT::scalar(16) ? LLT::scalar(16) : LLT::scalar(32);

  auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty}, false)
                  .addUse(Val)
                  .setMIFlags(Flags);
  auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy}, false)
                 .addUse(Val)
                 .setMIFlags(Flags);

  if (ST.hasFractBug()) {
    // On affected parts, pass inf/nan inputs through unchanged (mant = input,
    // exp = 0) by selecting on |Val| < inf.
    auto Fabs = B.buildFAbs(Ty, Val);
    auto Inf = B.buildFConstant(Ty, APFloat::getInf(getFltSemanticForLLT(Ty)));
    auto IsFinite =
        B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
    auto Zero = B.buildConstant(InstrExpTy, 0);
    Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
    Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
  }

  B.buildCopy(Res0, Mant);
  B.buildSExtOrTrunc(Res1, Exp);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  // Intrinsic form: operands 0 = def, 1 = intrinsic ID, 2/3 = LHS/RHS.
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(2).getReg();
  Register RHS = MI.getOperand(3).getReg();
  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto Abs = B.buildFAbs(S32, RHS, Flags);
  const APFloat C0Val(1.0f);

  // Pre-scale a huge denominator (|RHS| > 2^96) by 2^-32 to keep rcp in
  // range, and undo the scale on the final product.
  auto C0 = B.buildFConstant(S32, 0x1p+96f);
  auto C1 = B.buildFConstant(S32, 0x1p-32f);
  auto C2 = B.buildFConstant(S32, 1.0f);

  auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);

  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
                 .addUse(Mul0.getReg(0))
                 .setMIFlags(Flags);

  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);

  B.buildFMul(Res, Sel, Mul1, Flags);

  MI.eraseFromParent();
  return true;
}

/// Lower f64 G_FSQRT via rsq plus Goldschmidt refinement (see the derivation
/// in the comment below), with input scaling to handle subnormals.
bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI,
                                        MachineRegisterInfo &MRI,
                                        MachineIRBuilder &B) const {
  // For double type, the SQRT and RSQ instructions don't have required
  // precision, we apply Goldschmidt's algorithm to improve the result:
  //
  //   y0 = rsq(x)
  //   g0 = x * y0
  //   h0 = 0.5 * y0
  //
  //   r0 = 0.5 - h0 * g0
  //   g1 = g0 * r0 + g0
  //   h1 = h0 * r0 + h0
  //
  //   r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
  //   g2 = g1 * r1 + g1     g2 = d0 * h1 + g1
  //   h2 = h1 * r1 + h1
  //
  //   r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
  //   g3 = g2 * r2 + g2     g3 = d1 * h1 + g2
  //
  //   sqrt(x) = g3

  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT F64 = LLT::scalar(64);

  Register Dst = MI.getOperand(0).getReg();
  assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");

  Register X = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();

  // Inputs below 2^-767 are scaled up so rsq stays accurate.
  auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);

  auto ZeroInt = B.buildConstant(S32, 0);
  auto Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant);

  // Scale up input if it is too small.
  // ldexp(x, 256) when scaling; sqrt halves the exponent, so the matching
  // scale-down below is only -128.
  auto ScaleUpFactor = B.buildConstant(S32, 256);
  auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
  auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags);

  auto SqrtY = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}, false)
                   .addReg(SqrtX.getReg(0));

  // Goldschmidt iteration as laid out in the function-header comment:
  // g = SqrtS*, h = SqrtH*, r = SqrtR0, d = SqrtD*.
  auto Half = B.buildFConstant(F64, 0.5);
  auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
  auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);

  auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
  auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);

  auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
  auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);

  auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
  auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);

  auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);

  auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
  auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);

  auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);

  // Scale down the result.
  auto ScaleDownFactor = B.buildConstant(S32, -128);
  auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
  SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags);

  // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
  // with finite only or nsz because rsq(+/-0) = +/-inf

  // TODO: Check for DAZ and expand to subnormals
  auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);

  // If x is +INF, +0, or -0, use its original value
  B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);

  MI.eraseFromParent();
  return true;
}

// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
// FIXME: Why do we handle this one but not other removed instructions?
//
// Reciprocal square root. The clamp prevents infinite results, clamping
// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
// +-max_float.
// Expand llvm.amdgcn.rsq.clamp by emitting a plain rsq and clamping the
// result into [-max_float, +max_float]. Uses the IEEE or non-IEEE min/max
// variants depending on the function's FP mode so the result selects
// directly.
//
// \returns true if the intrinsic was handled (or needs no expansion on
// pre-VI targets, which still have the real instruction); false if the
// result type is not f32/f64.
bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
                                                    MachineRegisterInfo &MRI,
                                                    MachineIRBuilder &B) const {
  // Targets before Volcanic Islands still have a native rsq_clamp; keep the
  // intrinsic as-is there.
  if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();

  LLT Ty = MRI.getType(Dst);

  const fltSemantics *FltSemantics;
  if (Ty == LLT::scalar(32))
    FltSemantics = &APFloat::IEEEsingle();
  else if (Ty == LLT::scalar(64))
    FltSemantics = &APFloat::IEEEdouble();
  else
    return false;

  auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
                 .addUse(Src)
                 .setMIFlags(Flags);

  // We don't need to concern ourselves with the snan handling difference, since
  // the rsq quieted (or not) so use the one which will directly select.
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  const bool UseIEEE = MFI->getMode().IEEE;

  // Clamp +inf down to +max_float...
  auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
  auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
                            B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);

  // ...and -inf up to -max_float.
  auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));

  if (UseIEEE)
    B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
  else
    B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
  MI.eraseFromParent();
  return true;
}

// Map a DS FP atomic intrinsic ID onto the generic machine opcode that
// carries it through the rest of instruction selection.
static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
  switch (IID) {
  case Intrinsic::amdgcn_ds_fadd:
    return AMDGPU::G_ATOMICRMW_FADD;
  case Intrinsic::amdgcn_ds_fmin:
    return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
  case Intrinsic::amdgcn_ds_fmax:
    return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
  default:
    llvm_unreachable("not a DS FP intrinsic");
  }
}

// Rewrite a llvm.amdgcn.ds.f{add,min,max} call in place into the
// corresponding generic atomic instruction, dropping the trailing operands
// (and the intrinsic ID) that only existed to describe the memory operand.
bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
                                                      MachineInstr &MI,
                                                      Intrinsic::ID IID) const {
  GISelChangeObserver &Observer = Helper.Observer;
  Observer.changingInstr(MI);

  MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));

  // The remaining operands were used to set fields in the MemOperand on
  // construction.
  for (int I = 6; I > 3; --I)
    MI.removeOperand(I);

  MI.removeOperand(1); // Remove the intrinsic ID.
  Observer.changedInstr(MI);
  return true;
}

// Materialize the implicit argument pointer into \p DstReg: the kernarg
// segment base plus the target-computed offset of the first implicit
// argument. Returns false if the kernarg segment pointer input could not be
// loaded.
bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  uint64_t Offset =
    ST.getTargetLowering()->getImplicitParameterOffset(
      B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
  LLT DstTy = MRI.getType(DstReg);
  // Offset operand is an integer the same width as the pointer.
  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());

  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
  if (!loadInputValue(KernargPtrReg, B,
                      AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
    return false;

  // FIXME: This should be nuw
  B.buildPtrAdd(DstReg, KernargPtrReg,
                B.buildConstant(IdxTy, Offset).getReg(0));
  return true;
}

/// To create a buffer resource from a 64-bit pointer, mask off the upper 32
/// bits of the pointer and replace them with the stride argument, then
/// merge_values everything together. In the common case of a raw buffer (the
/// stride component is 0), we can just AND off the upper half.
bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  Register Result = MI.getOperand(0).getReg();
  Register Pointer = MI.getOperand(2).getReg();
  Register Stride = MI.getOperand(3).getReg();
  Register NumRecords = MI.getOperand(4).getReg();
  Register Flags = MI.getOperand(5).getReg();

  LLT S32 = LLT::scalar(32);

  // Build the replacement after the intrinsic so its operands are available.
  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
  auto Unmerge = B.buildUnmerge(S32, Pointer);
  Register LowHalf = Unmerge.getReg(0);
  Register HighHalf = Unmerge.getReg(1);

  // Keep only the low 16 bits of the pointer's high word; the top 16 bits
  // are replaced by the (shifted) stride below.
  auto AndMask = B.buildConstant(S32, 0x0000ffff);
  auto Masked = B.buildAnd(S32, HighHalf, AndMask);

  MachineInstrBuilder NewHighHalf = Masked;
  std::optional<ValueAndVReg> StrideConst =
      getIConstantVRegValWithLookThrough(Stride, MRI);
  // A constant-zero stride (raw buffer) needs no OR at all.
  if (!StrideConst || !StrideConst->Value.isZero()) {
    MachineInstrBuilder ShiftedStride;
    if (StrideConst) {
      // Fold the shift for a known stride.
      uint32_t StrideVal = StrideConst->Value.getZExtValue();
      uint32_t ShiftedStrideVal = StrideVal << 16;
      ShiftedStride = B.buildConstant(S32, ShiftedStrideVal);
    } else {
      auto ExtStride = B.buildAnyExt(S32, Stride);
      auto ShiftConst = B.buildConstant(S32, 16);
      ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
    }
    NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
  }
  Register NewHighHalfReg = NewHighHalf.getReg(0);
  // Descriptor layout: { ptr_lo, ptr_hi|stride<<16, num_records, flags }.
  B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
  MI.eraseFromParent();
  return true;
}

// Lower llvm.amdgcn.implicitarg.ptr. In entry functions it is computed from
// the kernarg segment pointer; elsewhere it arrives as a preloaded argument.
bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (!MFI->isEntryFunction()) {
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
  }

  Register DstReg = MI.getOperand(0).getReg();
  if (!getImplicitArgPtr(DstReg, MRI, B))
    return false;

  MI.eraseFromParent();
  return true;
}

// Emit the LDS kernel id as a constant when the value is pinned down by
// function metadata.
bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  Function &F = B.getMF().getFunction();
  std::optional<uint32_t> KnownSize =
      AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
  if (KnownSize.has_value())
    B.buildConstant(DstReg, *KnownSize);
  // NOTE(review): this returns false even after building the constant, so
  // legalizeLDSKernelId treats the known-size case as a legalization failure
  // as well — confirm against upstream whether this is intended.
  return false;
}

// Lower llvm.amdgcn.lds.kernel.id: preloaded argument in non-entry
// functions, metadata-derived constant in entry functions.
bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B) const {

  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (!MFI->isEntryFunction()) {
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
  }

  Register DstReg = MI.getOperand(0).getReg();
  if (!getLDSKernelId(DstReg, MRI, B))
    return false;

  MI.eraseFromParent();
  return true;
}

// Lower is.shared/is.private style queries: a flat pointer is in the given
// segment iff the high 32 bits of the address equal that segment's aperture.
bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              unsigned AddrSpace) const {
  Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
  auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
  Register Hi32 = Unmerge.getReg(1);

  B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
  MI.eraseFromParent();
  return true;
}

// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
// offset (the offset that is included in bounds checking and swizzling, to be
// split between the instruction's voffset and immoffset fields) and soffset
// (the offset that is excluded from bounds checking and swizzling, to go in
// the instruction's soffset field). This function takes the first kind of
// offset and figures out how to split it between voffset and immoffset.
std::pair<Register, unsigned>
AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
                                        Register OrigOffset) const {
  const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset();
  Register BaseReg;
  unsigned ImmOffset;
  const LLT S32 = LLT::scalar(32);
  MachineRegisterInfo &MRI = *B.getMRI();

  // Peel a known-constant component off the offset if there is one.
  std::tie(BaseReg, ImmOffset) =
      AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);

  // If BaseReg is a pointer, convert it to int.
  if (MRI.getType(BaseReg).isPointer())
    BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);

  // If the immediate value is too big for the immoffset field, put only bits
  // that would normally fit in the immoffset field. The remaining value that
  // is copied/added for the voffset field is a large power of 2, and it
  // stands more chance of being CSEd with the copy/add for another similar
  // load/store.
  // However, do not do that rounding down if that is a negative
  // number, as it appears to be illegal to have a negative offset in the
  // vgpr, even if adding the immediate offset makes it positive.
  unsigned Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    Overflow += ImmOffset;
    ImmOffset = 0;
  }

  if (Overflow != 0) {
    if (!BaseReg) {
      BaseReg = B.buildConstant(S32, Overflow).getReg(0);
    } else {
      auto OverflowVal = B.buildConstant(S32, Overflow);
      BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
    }
  }

  // The voffset field is mandatory; materialize a zero if nothing remains.
  if (!BaseReg)
    BaseReg = B.buildConstant(S32, 0).getReg(0);

  return std::pair(BaseReg, ImmOffset);
}

/// Handle register layout difference for f16 images for some subtargets.
// Repack a <N x s16> store source into the register layout the subtarget
// expects: widened to s32 elements on unpacked-D16 targets, padded/bitcast
// on targets with the image-store D16 bug, or padded to a power-of-two
// element count otherwise. Returns the (possibly new) data register.
Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
                                             MachineRegisterInfo &MRI,
                                             Register Reg,
                                             bool ImageStore) const {
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  LLT StoreVT = MRI.getType(Reg);
  assert(StoreVT.isVector() && StoreVT.getElementType() == S16);

  if (ST.hasUnpackedD16VMem()) {
    // Unpacked layout: each s16 element occupies its own s32 register.
    auto Unmerge = B.buildUnmerge(S16, Reg);

    SmallVector<Register, 4> WideRegs;
    for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
      WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));

    int NumElts = StoreVT.getNumElements();

    return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
        .getReg(0);
  }

  if (ImageStore && ST.hasImageStoreD16Bug()) {
    // Workaround: widen the packed data with undef lanes, per element count.
    if (StoreVT.getNumElements() == 2) {
      SmallVector<Register, 4> PackedRegs;
      Reg = B.buildBitcast(S32, Reg).getReg(0);
      PackedRegs.push_back(Reg);
      PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
      return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
          .getReg(0);
    }

    if (StoreVT.getNumElements() == 3) {
      SmallVector<Register, 4> PackedRegs;
      auto Unmerge = B.buildUnmerge(S16, Reg);
      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
        PackedRegs.push_back(Unmerge.getReg(I));
      PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
      Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
      return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
    }

    if (StoreVT.getNumElements() == 4) {
      SmallVector<Register, 4> PackedRegs;
      Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
      auto Unmerge = B.buildUnmerge(S32, Reg);
      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
        PackedRegs.push_back(Unmerge.getReg(I));
      PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
      return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
          .getReg(0);
    }

    llvm_unreachable("invalid data type");
  }

  // <3 x s16> is not a legal tuple element; pad to <4 x s16>.
  if (StoreVT == LLT::fixed_vector(3, S16)) {
    Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
        .getReg(0);
  }
  return Reg;
}

// Coerce a buffer-store data operand into a register type the store
// pseudo-instructions accept, returning the (possibly new) data register.
Register AMDGPULegalizerInfo::fixStoreSourceType(
    MachineIRBuilder &B, Register VData, bool IsFormat) const {
  MachineRegisterInfo *MRI = B.getMRI();
  LLT Ty = MRI->getType(VData);

  const LLT S16 = LLT::scalar(16);

  // Fixup buffer resources themselves needing to be v4i128.
  if (hasBufferRsrcWorkaround(Ty))
    return castBufferRsrcToV4I32(VData, B);

  // Fixup illegal register types for i8 stores.
  if (Ty == LLT::scalar(8) || Ty == S16) {
    Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
    return AnyExt;
  }

  if (Ty.isVector()) {
    if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
      if (IsFormat)
        return handleD16VData(B, *MRI, VData);
    }
  }

  return VData;
}

// Lower raw/struct (t)buffer store intrinsics into the target's generic
// buffer-store pseudo-instructions, splitting the offset operand between
// the voffset register and the immediate offset field.
bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              bool IsTyped,
                                              bool IsFormat) const {
  Register VData = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(VData);
  LLT EltTy = Ty.getScalarType();
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const LLT S32 = LLT::scalar(32);

  VData = fixStoreSourceType(B, VData, IsFormat);
  castBufferRsrcArgToV4I32(MI, B, 2);
  Register RSrc = MI.getOperand(2).getReg();

  MachineMemOperand *MMO = *MI.memoperands_begin();
  const int MemSize = MMO->getSize();

  unsigned ImmOffset;

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(3).getReg();
    OpOffset = 1;
  } else {
    VIndex = B.buildConstant(S32, 0).getReg(0);
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();

  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);

  // Pick the pseudo: typed/format variants by D16-ness, plain stores by the
  // memory access size.
  unsigned Opc;
  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
  } else {
    switch (MemSize) {
    case 1:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
      break;
    case 2:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
      break;
    }
  }

  auto MIB = B.buildInstr(Opc)
    .addUse(VData)              // vdata
    .addUse(RSrc)               // rsrc
    .addUse(VIndex)             // vindex
    .addUse(VOffset)            // voffset
    .addUse(SOffset)            // soffset
    .addImm(ImmOffset);         // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}

// Shared emitter for the buffer-load pseudos; operand order mirrors
// legalizeBufferStore's store emission above.
static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
                            Register VIndex, Register VOffset, Register SOffset,
                            unsigned ImmOffset, unsigned Format,
                            unsigned AuxiliaryData, MachineMemOperand *MMO,
                            bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
  auto MIB = B.buildInstr(Opc)
    .addDef(LoadDstReg)         // vdata
    .addUse(RSrc)               // rsrc
    .addUse(VIndex)             // vindex
    .addUse(VOffset)            // voffset
    .addUse(SOffset)            // soffset
    .addImm(ImmOffset);         // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);
}

// Lower raw/struct (t)buffer load intrinsics into the generic buffer-load
// pseudos, handling the optional TFE status result, sub-dword results, and
// unpacked-D16 repacking.
bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
                                             MachineRegisterInfo &MRI,
                                             MachineIRBuilder &B,
                                             bool IsFormat,
                                             bool IsTyped) const {
  // FIXME: Verifier should enforce 1 MMO for these intrinsics.
  MachineMemOperand *MMO = *MI.memoperands_begin();
  const LLT MemTy = MMO->getMemoryType();
  const LLT S32 = LLT::scalar(32);

  Register Dst = MI.getOperand(0).getReg();

  // A second def means the TFE form: an extra status dword is returned.
  Register StatusDst;
  int OpOffset = 0;
  assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
  bool IsTFE = MI.getNumExplicitDefs() == 2;
  if (IsTFE) {
    StatusDst = MI.getOperand(1).getReg();
    ++OpOffset;
  }

  castBufferRsrcArgToV4I32(MI, B, 2 + OpOffset);
  Register RSrc = MI.getOperand(2 + OpOffset).getReg();

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
  Register VIndex;
  if (HasVIndex) {
    VIndex = MI.getOperand(3 + OpOffset).getReg();
    ++OpOffset;
  } else {
    VIndex = B.buildConstant(S32, 0).getReg(0);
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
  unsigned ImmOffset;

  LLT Ty = MRI.getType(Dst);
  // Make addrspace 8 pointers loads into 4xs32 loads here, so the rest of the
  // logic doesn't have to handle that case.
  if (hasBufferRsrcWorkaround(Ty)) {
    Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0);
    Dst = MI.getOperand(0).getReg();
  }
  LLT EltTy = Ty.getScalarType();
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const bool Unpacked = ST.hasUnpackedD16VMem();

  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);

  unsigned Opc;

  // TODO: Support TFE for typed and narrow loads.
  if (IsTyped) {
    if (IsTFE)
      return false;
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
  } else if (IsFormat) {
    if (IsD16) {
      if (IsTFE)
        return false;
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
    } else {
      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
    }
  } else {
    if (IsTFE)
      return false;
    switch (MemTy.getSizeInBits()) {
    case 8:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
      break;
    case 16:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
      break;
    }
  }

  if (IsTFE) {
    // Load value dwords plus one status dword, then split the result.
    unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
    unsigned NumLoadDWords = NumValueDWords + 1;
    LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32);
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
    buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    if (NumValueDWords == 1) {
      B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
    } else {
      SmallVector<Register, 5> LoadElts;
      for (unsigned I = 0; I != NumValueDWords; ++I)
        LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
      LoadElts.push_back(StatusDst);
      B.buildUnmerge(LoadElts, LoadDstReg);
      LoadElts.truncate(NumValueDWords);
      B.buildMergeLikeInstr(Dst, LoadElts);
    }
  } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
             (IsD16 && !Ty.isVector())) {
    // Sub-dword result: load into a full s32 and truncate.
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
    buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    B.buildTrunc(Dst, LoadDstReg);
  } else if (Unpacked && IsD16 && Ty.isVector()) {
    // Unpacked D16 vector: load s32 lanes and repack down to s16 elements.
    LLT UnpackedTy = Ty.changeElementSize(32);
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
    buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    // FIXME: G_TRUNC should work, but legalization currently fails
    auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
    SmallVector<Register, 4> Repack;
    for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
      Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
    B.buildMergeLikeInstr(Dst, Repack);
  } else {
    buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
                    AuxiliaryData, MMO, IsTyped, HasVIndex, B);
  }

  MI.eraseFromParent();
  return true;
}

/// Map a raw/struct buffer atomic intrinsic ID (both the legacy resource
/// forms and the *_ptr_* buffer-fat-pointer forms) to the corresponding
/// generic G_AMDGPU_BUFFER_ATOMIC_* pseudo used until instruction selection.
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
  default:
    llvm_unreachable("unhandled atomic opcode");
  }
}

/// Legalize a raw/struct buffer atomic intrinsic into the generic
/// G_AMDGPU_BUFFER_ATOMIC_* pseudo: gathers vdata / (optional) cmpswap
/// compare value / rsrc / (optional) vindex / voffset / soffset from the
/// intrinsic's operands, splits the voffset into a register part and an
/// immediate offset, and erases the original instruction.
///
/// \p MI is the G_INTRINSIC(_W_SIDE_EFFECTS) being replaced, \p B the builder
/// positioned for the replacement, and \p IID the buffer-atomic intrinsic ID.
/// \returns true (this legalization cannot fail once invoked).
bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
                                               MachineIRBuilder &B,
                                               Intrinsic::ID IID) const {
  const bool IsCmpSwap =
      IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
  const bool HasReturn = MI.getNumExplicitDefs() != 0;

  Register Dst;

  // OpOffset tracks how the remaining operand indices shift relative to the
  // canonical returning-raw-atomic layout as optional operands are
  // present/absent.
  int OpOffset = 0;
  if (HasReturn) {
    // A few FP atomics do not support return values.
    Dst = MI.getOperand(0).getReg();
  } else {
    OpOffset = -1;
  }

  // Since we don't have 128-bit atomics, we don't need to handle the case of
  // p8 arguments to the atomic itself
  Register VData = MI.getOperand(2 + OpOffset).getReg();
  Register CmpVal;

  if (IsCmpSwap) {
    CmpVal = MI.getOperand(3 + OpOffset).getReg();
    ++OpOffset;
  }

  // Rewrite a buffer-fat-pointer rsrc argument (the *_ptr_* intrinsic forms)
  // into the <4 x i32> resource descriptor the pseudo expects.
  castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset);
  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
  // Total operand count the instruction would have if a vindex operand were
  // present; comparing against it distinguishes struct from raw variants.
  const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  if (HasVIndex) {
    VIndex = MI.getOperand(4 + OpOffset).getReg();
    ++OpOffset;
  } else {
    // Raw variants have no vindex; feed the pseudo a constant zero.
    VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
  }

  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();

  MachineMemOperand *MMO = *MI.memoperands_begin();

  // Peel a constant part of the voffset into the instruction's immediate
  // offset field.
  unsigned ImmOffset;
  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);

  auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));

  if (HasReturn)
    MIB.addDef(Dst);

  MIB.addUse(VData); // vdata

  if (IsCmpSwap)
    MIB.addReg(CmpVal);

  MIB.addUse(RSrc)               // rsrc
      .addUse(VIndex)            // vindex
      .addUse(VOffset)           // voffset
      .addUse(SOffset)           // soffset
      .addImm(ImmOffset)         // offset(imm)
      .addImm(AuxiliaryData)     // cachepolicy, swizzled buffer(imm)
      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
      .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}

/// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
/// vector with s16 typed elements.
///
/// Walks the image intrinsic's vaddr operands [VAddrStart, VAddrEnd) and
/// emits one packed register per dword into \p PackedAddrs: pairs of 16-bit
/// values are combined with G_BUILD_VECTOR (padding odd elements with undef),
/// while operands that must stay 32-bit are bitcast to <2 x s16> unchanged.
/// \p IsA16 / \p IsG16 select whether coordinates / gradients respectively
/// are in 16-bit form.
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
                                      SmallVectorImpl<Register> &PackedAddrs,
                                      unsigned ArgOffset,
                                      const AMDGPU::ImageDimIntrinsicInfo *Intr,
                                      bool IsA16, bool IsG16) {
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  auto EndIdx = Intr->VAddrEnd;

  for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
    MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
    if (!SrcOp.isReg())
      continue; // _L to _LZ may have eliminated this.

    Register AddrReg = SrcOp.getReg();

    if ((I < Intr->GradientStart) ||
        (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
        (I >= Intr->CoordStart && !IsA16)) {
      if ((I < Intr->GradientStart) && IsA16 &&
          (B.getMRI()->getType(AddrReg) == S16)) {
        assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
        // Special handling of bias when A16 is on. Bias is of type half but
        // occupies full 32-bit.
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
               "Bias needs to be converted to 16 bit in A16 mode");
        // Handle any gradient or coordinate operands that should not be packed
        AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
        PackedAddrs.push_back(AddrReg);
      }
    } else {
      // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
      // derivatives dx/dh and dx/dv are packed with undef.
      if (((I + 1) >= EndIdx) ||
          ((Intr->NumGradients / 2) % 2 == 1 &&
           (I == static_cast<unsigned>(Intr->GradientStart +
                                       (Intr->NumGradients / 2) - 1) ||
            I == static_cast<unsigned>(Intr->GradientStart +
                                       Intr->NumGradients - 1))) ||
          // Check for _L to _LZ optimization
          !MI.getOperand(ArgOffset + I + 1).isReg()) {
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        // Pack this operand with its successor and skip the successor.
        PackedAddrs.push_back(
            B.buildBuildVector(
                 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
                .getReg(0));
        ++I;
      }
    }
  }
}

/// Convert from separate vaddr components to a single vector address register,
/// and replace the remaining operands with $noreg.
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
                                     int DimIdx, int NumVAddrs) {
  const LLT S32 = LLT::scalar(32);
  // S32 is only read by the assert below; the (void) cast silences the
  // unused-variable warning in builds where asserts are compiled out.
  (void)S32;
  SmallVector<Register, 8> AddrRegs;
  for (int I = 0; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg()) {
      AddrRegs.push_back(SrcOp.getReg());
      assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
    }
  }

  int NumAddrRegs = AddrRegs.size();
  if (NumAddrRegs != 1) {
    // Build one <N x s32> vector from the collected components and install it
    // as the first vaddr operand.
    auto VAddr =
        B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
    MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
  }

  // Clear out the now-redundant trailing vaddr operands.
  for (int I = 1; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg())
      MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
  }
}

/// Rewrite image intrinsics to use register layouts expected by the subtarget.
57935ffd83dbSDimitry Andric /// 57945ffd83dbSDimitry Andric /// Depending on the subtarget, load/store with 16-bit element data need to be 57955ffd83dbSDimitry Andric /// rewritten to use the low half of 32-bit registers, or directly use a packed 57965ffd83dbSDimitry Andric /// layout. 16-bit addresses should also sometimes be packed into 32-bit 57975ffd83dbSDimitry Andric /// registers. 57985ffd83dbSDimitry Andric /// 57995ffd83dbSDimitry Andric /// We don't want to directly select image instructions just yet, but also want 58005ffd83dbSDimitry Andric /// to exposes all register repacking to the legalizer/combiners. We also don't 580181ad6265SDimitry Andric /// want a selected instruction entering RegBankSelect. In order to avoid 58025ffd83dbSDimitry Andric /// defining a multitude of intermediate image instructions, directly hack on 5803349cc55cSDimitry Andric /// the intrinsic's arguments. In cases like a16 addresses, this requires 5804349cc55cSDimitry Andric /// padding now unnecessary arguments with $noreg. 58055ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 5806e8d8bef9SDimitry Andric MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, 5807e8d8bef9SDimitry Andric const AMDGPU::ImageDimIntrinsicInfo *Intr) const { 58085ffd83dbSDimitry Andric 5809bdd1243dSDimitry Andric const MachineFunction &MF = *MI.getMF(); 5810e8d8bef9SDimitry Andric const unsigned NumDefs = MI.getNumExplicitDefs(); 5811e8d8bef9SDimitry Andric const unsigned ArgOffset = NumDefs + 1; 58125ffd83dbSDimitry Andric bool IsTFE = NumDefs == 2; 58135ffd83dbSDimitry Andric // We are only processing the operands of d16 image operations on subtargets 58145ffd83dbSDimitry Andric // that use the unpacked register layout, or need to repack the TFE result. 58155ffd83dbSDimitry Andric 58165ffd83dbSDimitry Andric // TODO: Do we need to guard against already legalized intrinsics? 
58175ffd83dbSDimitry Andric const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 5818e8d8bef9SDimitry Andric AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 58195ffd83dbSDimitry Andric 58205ffd83dbSDimitry Andric MachineRegisterInfo *MRI = B.getMRI(); 58215ffd83dbSDimitry Andric const LLT S32 = LLT::scalar(32); 58225ffd83dbSDimitry Andric const LLT S16 = LLT::scalar(16); 5823fe6060f1SDimitry Andric const LLT V2S16 = LLT::fixed_vector(2, 16); 58245ffd83dbSDimitry Andric 58255ffd83dbSDimitry Andric unsigned DMask = 0; 582604eeddc0SDimitry Andric Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg(); 582704eeddc0SDimitry Andric LLT Ty = MRI->getType(VData); 58285ffd83dbSDimitry Andric 58295ffd83dbSDimitry Andric // Check for 16 bit addresses and pack if true. 5830e8d8bef9SDimitry Andric LLT GradTy = 5831e8d8bef9SDimitry Andric MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); 5832e8d8bef9SDimitry Andric LLT AddrTy = 5833e8d8bef9SDimitry Andric MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); 583406c3fb27SDimitry Andric const bool IsG16 = 583506c3fb27SDimitry Andric ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16; 58365ffd83dbSDimitry Andric const bool IsA16 = AddrTy == S16; 583704eeddc0SDimitry Andric const bool IsD16 = Ty.getScalarType() == S16; 58385ffd83dbSDimitry Andric 58395ffd83dbSDimitry Andric int DMaskLanes = 0; 58405ffd83dbSDimitry Andric if (!BaseOpcode->Atomic) { 5841e8d8bef9SDimitry Andric DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 58425ffd83dbSDimitry Andric if (BaseOpcode->Gather4) { 58435ffd83dbSDimitry Andric DMaskLanes = 4; 58445ffd83dbSDimitry Andric } else if (DMask != 0) { 5845bdd1243dSDimitry Andric DMaskLanes = llvm::popcount(DMask); 58465ffd83dbSDimitry Andric } else if (!IsTFE && !BaseOpcode->Store) { 58475ffd83dbSDimitry Andric // If dmask is 0, this is a no-op load. This can be eliminated. 
58485ffd83dbSDimitry Andric B.buildUndef(MI.getOperand(0)); 58495ffd83dbSDimitry Andric MI.eraseFromParent(); 58505ffd83dbSDimitry Andric return true; 58515ffd83dbSDimitry Andric } 58525ffd83dbSDimitry Andric } 58535ffd83dbSDimitry Andric 58545ffd83dbSDimitry Andric Observer.changingInstr(MI); 58555ffd83dbSDimitry Andric auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 58565ffd83dbSDimitry Andric 585704eeddc0SDimitry Andric const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16 585804eeddc0SDimitry Andric : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE; 585904eeddc0SDimitry Andric const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 586004eeddc0SDimitry Andric : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 586104eeddc0SDimitry Andric unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode; 58625ffd83dbSDimitry Andric 58635ffd83dbSDimitry Andric // Track that we legalized this 58645ffd83dbSDimitry Andric MI.setDesc(B.getTII().get(NewOpcode)); 58655ffd83dbSDimitry Andric 58665ffd83dbSDimitry Andric // Expecting to get an error flag since TFC is on - and dmask is 0 Force 58675ffd83dbSDimitry Andric // dmask to be at least 1 otherwise the instruction will fail 58685ffd83dbSDimitry Andric if (IsTFE && DMask == 0) { 58695ffd83dbSDimitry Andric DMask = 0x1; 58705ffd83dbSDimitry Andric DMaskLanes = 1; 5871e8d8bef9SDimitry Andric MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); 58725ffd83dbSDimitry Andric } 58735ffd83dbSDimitry Andric 58745ffd83dbSDimitry Andric if (BaseOpcode->Atomic) { 58755ffd83dbSDimitry Andric Register VData0 = MI.getOperand(2).getReg(); 58765ffd83dbSDimitry Andric LLT Ty = MRI->getType(VData0); 58775ffd83dbSDimitry Andric 58785ffd83dbSDimitry Andric // TODO: Allow atomic swap and bit ops for v2s16/v4s16 58795ffd83dbSDimitry Andric if (Ty.isVector()) 58805ffd83dbSDimitry Andric return false; 58815ffd83dbSDimitry Andric 58825ffd83dbSDimitry Andric if (BaseOpcode->AtomicX2) { 
58835ffd83dbSDimitry Andric Register VData1 = MI.getOperand(3).getReg(); 58845ffd83dbSDimitry Andric // The two values are packed in one register. 5885fe6060f1SDimitry Andric LLT PackedTy = LLT::fixed_vector(2, Ty); 58865ffd83dbSDimitry Andric auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 58875ffd83dbSDimitry Andric MI.getOperand(2).setReg(Concat.getReg(0)); 58885ffd83dbSDimitry Andric MI.getOperand(3).setReg(AMDGPU::NoRegister); 58895ffd83dbSDimitry Andric } 58905ffd83dbSDimitry Andric } 58915ffd83dbSDimitry Andric 5892e8d8bef9SDimitry Andric unsigned CorrectedNumVAddrs = Intr->NumVAddrs; 58935ffd83dbSDimitry Andric 58945ffd83dbSDimitry Andric // Rewrite the addressing register layout before doing anything else. 5895fe6060f1SDimitry Andric if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) { 5896fe6060f1SDimitry Andric // 16 bit gradients are supported, but are tied to the A16 control 5897fe6060f1SDimitry Andric // so both gradients and addresses must be 16 bit 58985ffd83dbSDimitry Andric return false; 5899fe6060f1SDimitry Andric } 59005ffd83dbSDimitry Andric 5901fe6060f1SDimitry Andric if (IsA16 && !ST.hasA16()) { 5902fe6060f1SDimitry Andric // A16 not supported 5903fe6060f1SDimitry Andric return false; 5904fe6060f1SDimitry Andric } 5905fe6060f1SDimitry Andric 590606c3fb27SDimitry Andric const unsigned NSAMaxSize = ST.getNSAMaxSize(); 590706c3fb27SDimitry Andric const unsigned HasPartialNSA = ST.hasPartialNSAEncoding(); 590806c3fb27SDimitry Andric 5909fe6060f1SDimitry Andric if (IsA16 || IsG16) { 5910e8d8bef9SDimitry Andric if (Intr->NumVAddrs > 1) { 59115ffd83dbSDimitry Andric SmallVector<Register, 4> PackedRegs; 59125ffd83dbSDimitry Andric 5913fe6060f1SDimitry Andric packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, 5914fe6060f1SDimitry Andric IsG16); 59155ffd83dbSDimitry Andric 59165ffd83dbSDimitry Andric // See also below in the non-a16 branch 5917bdd1243dSDimitry Andric const bool UseNSA = ST.hasNSAEncoding() && 
5918bdd1243dSDimitry Andric PackedRegs.size() >= ST.getNSAThreshold(MF) && 591906c3fb27SDimitry Andric (PackedRegs.size() <= NSAMaxSize || HasPartialNSA); 592006c3fb27SDimitry Andric const bool UsePartialNSA = 592106c3fb27SDimitry Andric UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize; 59225ffd83dbSDimitry Andric 592306c3fb27SDimitry Andric if (UsePartialNSA) { 592406c3fb27SDimitry Andric // Pack registers that would go over NSAMaxSize into last VAddr register 592506c3fb27SDimitry Andric LLT PackedAddrTy = 592606c3fb27SDimitry Andric LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16); 592706c3fb27SDimitry Andric auto Concat = B.buildConcatVectors( 592806c3fb27SDimitry Andric PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1)); 592906c3fb27SDimitry Andric PackedRegs[NSAMaxSize - 1] = Concat.getReg(0); 593006c3fb27SDimitry Andric PackedRegs.resize(NSAMaxSize); 593106c3fb27SDimitry Andric } else if (!UseNSA && PackedRegs.size() > 1) { 5932fe6060f1SDimitry Andric LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16); 59335ffd83dbSDimitry Andric auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 59345ffd83dbSDimitry Andric PackedRegs[0] = Concat.getReg(0); 59355ffd83dbSDimitry Andric PackedRegs.resize(1); 59365ffd83dbSDimitry Andric } 59375ffd83dbSDimitry Andric 5938e8d8bef9SDimitry Andric const unsigned NumPacked = PackedRegs.size(); 5939e8d8bef9SDimitry Andric for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 5940e8d8bef9SDimitry Andric MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 59415ffd83dbSDimitry Andric if (!SrcOp.isReg()) { 59425ffd83dbSDimitry Andric assert(SrcOp.isImm() && SrcOp.getImm() == 0); 59435ffd83dbSDimitry Andric continue; 59445ffd83dbSDimitry Andric } 59455ffd83dbSDimitry Andric 59465ffd83dbSDimitry Andric assert(SrcOp.getReg() != AMDGPU::NoRegister); 59475ffd83dbSDimitry Andric 5948e8d8bef9SDimitry Andric if (I - Intr->VAddrStart < NumPacked) 5949e8d8bef9SDimitry Andric 
SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); 59505ffd83dbSDimitry Andric else 59515ffd83dbSDimitry Andric SrcOp.setReg(AMDGPU::NoRegister); 59525ffd83dbSDimitry Andric } 59535ffd83dbSDimitry Andric } 59545ffd83dbSDimitry Andric } else { 59555ffd83dbSDimitry Andric // If the register allocator cannot place the address registers contiguously 59565ffd83dbSDimitry Andric // without introducing moves, then using the non-sequential address encoding 59575ffd83dbSDimitry Andric // is always preferable, since it saves VALU instructions and is usually a 59585ffd83dbSDimitry Andric // wash in terms of code size or even better. 59595ffd83dbSDimitry Andric // 59605ffd83dbSDimitry Andric // However, we currently have no way of hinting to the register allocator 59615ffd83dbSDimitry Andric // that MIMG addresses should be placed contiguously when it is possible to 59625ffd83dbSDimitry Andric // do so, so force non-NSA for the common 2-address case as a heuristic. 59635ffd83dbSDimitry Andric // 59645ffd83dbSDimitry Andric // SIShrinkInstructions will convert NSA encodings to non-NSA after register 59655ffd83dbSDimitry Andric // allocation when possible. 596681ad6265SDimitry Andric // 596706c3fb27SDimitry Andric // Partial NSA is allowed on GFX11 where the final register is a contiguous 596806c3fb27SDimitry Andric // set of the remaining addresses. 
5969bdd1243dSDimitry Andric const bool UseNSA = ST.hasNSAEncoding() && 5970bdd1243dSDimitry Andric CorrectedNumVAddrs >= ST.getNSAThreshold(MF) && 597106c3fb27SDimitry Andric (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA); 597206c3fb27SDimitry Andric const bool UsePartialNSA = 597306c3fb27SDimitry Andric UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize; 59745ffd83dbSDimitry Andric 597506c3fb27SDimitry Andric if (UsePartialNSA) { 597606c3fb27SDimitry Andric convertImageAddrToPacked(B, MI, 597706c3fb27SDimitry Andric ArgOffset + Intr->VAddrStart + NSAMaxSize - 1, 597806c3fb27SDimitry Andric Intr->NumVAddrs - NSAMaxSize + 1); 597906c3fb27SDimitry Andric } else if (!UseNSA && Intr->NumVAddrs > 1) { 5980e8d8bef9SDimitry Andric convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, 5981e8d8bef9SDimitry Andric Intr->NumVAddrs); 59825ffd83dbSDimitry Andric } 598306c3fb27SDimitry Andric } 59845ffd83dbSDimitry Andric 59855ffd83dbSDimitry Andric int Flags = 0; 59865ffd83dbSDimitry Andric if (IsA16) 59875ffd83dbSDimitry Andric Flags |= 1; 59885ffd83dbSDimitry Andric if (IsG16) 59895ffd83dbSDimitry Andric Flags |= 2; 59905ffd83dbSDimitry Andric MI.addOperand(MachineOperand::CreateImm(Flags)); 59915ffd83dbSDimitry Andric 59925ffd83dbSDimitry Andric if (BaseOpcode->Store) { // No TFE for stores? 
59935ffd83dbSDimitry Andric // TODO: Handle dmask trim 599404eeddc0SDimitry Andric if (!Ty.isVector() || !IsD16) 59955ffd83dbSDimitry Andric return true; 59965ffd83dbSDimitry Andric 5997e8d8bef9SDimitry Andric Register RepackedReg = handleD16VData(B, *MRI, VData, true); 59985ffd83dbSDimitry Andric if (RepackedReg != VData) { 59995ffd83dbSDimitry Andric MI.getOperand(1).setReg(RepackedReg); 60005ffd83dbSDimitry Andric } 60015ffd83dbSDimitry Andric 60025ffd83dbSDimitry Andric return true; 60035ffd83dbSDimitry Andric } 60045ffd83dbSDimitry Andric 60055ffd83dbSDimitry Andric Register DstReg = MI.getOperand(0).getReg(); 60065ffd83dbSDimitry Andric const LLT EltTy = Ty.getScalarType(); 60075ffd83dbSDimitry Andric const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; 60085ffd83dbSDimitry Andric 60095ffd83dbSDimitry Andric // Confirm that the return type is large enough for the dmask specified 60105ffd83dbSDimitry Andric if (NumElts < DMaskLanes) 60115ffd83dbSDimitry Andric return false; 60125ffd83dbSDimitry Andric 60135ffd83dbSDimitry Andric if (NumElts > 4 || DMaskLanes > 4) 60145ffd83dbSDimitry Andric return false; 60155ffd83dbSDimitry Andric 60165ffd83dbSDimitry Andric const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 6017fe6060f1SDimitry Andric const LLT AdjustedTy = 6018fe6060f1SDimitry Andric Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts)); 60195ffd83dbSDimitry Andric 60205ffd83dbSDimitry Andric // The raw dword aligned data component of the load. The only legal cases 60215ffd83dbSDimitry Andric // where this matters should be when using the packed D16 format, for 60225ffd83dbSDimitry Andric // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, 60235ffd83dbSDimitry Andric LLT RoundedTy; 60245ffd83dbSDimitry Andric 6025bdd1243dSDimitry Andric // S32 vector to cover all data, plus TFE result element. 
60265ffd83dbSDimitry Andric LLT TFETy; 60275ffd83dbSDimitry Andric 60285ffd83dbSDimitry Andric // Register type to use for each loaded component. Will be S32 or V2S16. 60295ffd83dbSDimitry Andric LLT RegTy; 60305ffd83dbSDimitry Andric 60315ffd83dbSDimitry Andric if (IsD16 && ST.hasUnpackedD16VMem()) { 6032fe6060f1SDimitry Andric RoundedTy = 6033fe6060f1SDimitry Andric LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32); 6034fe6060f1SDimitry Andric TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32); 60355ffd83dbSDimitry Andric RegTy = S32; 60365ffd83dbSDimitry Andric } else { 60375ffd83dbSDimitry Andric unsigned EltSize = EltTy.getSizeInBits(); 60385ffd83dbSDimitry Andric unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; 60395ffd83dbSDimitry Andric unsigned RoundedSize = 32 * RoundedElts; 6040fe6060f1SDimitry Andric RoundedTy = LLT::scalarOrVector( 6041fe6060f1SDimitry Andric ElementCount::getFixed(RoundedSize / EltSize), EltSize); 6042fe6060f1SDimitry Andric TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32); 60435ffd83dbSDimitry Andric RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; 60445ffd83dbSDimitry Andric } 60455ffd83dbSDimitry Andric 60465ffd83dbSDimitry Andric // The return type does not need adjustment. 60475ffd83dbSDimitry Andric // TODO: Should we change s16 case to s32 or <2 x s16>? 60485ffd83dbSDimitry Andric if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) 60495ffd83dbSDimitry Andric return true; 60505ffd83dbSDimitry Andric 60515ffd83dbSDimitry Andric Register Dst1Reg; 60525ffd83dbSDimitry Andric 60535ffd83dbSDimitry Andric // Insert after the instruction. 60545ffd83dbSDimitry Andric B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 60555ffd83dbSDimitry Andric 60565ffd83dbSDimitry Andric // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 60575ffd83dbSDimitry Andric // s16> instead of s32, we would only need 1 bitcast instead of multiple. 
60585ffd83dbSDimitry Andric const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; 60595ffd83dbSDimitry Andric const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; 60605ffd83dbSDimitry Andric 60615ffd83dbSDimitry Andric Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); 60625ffd83dbSDimitry Andric 60635ffd83dbSDimitry Andric MI.getOperand(0).setReg(NewResultReg); 60645ffd83dbSDimitry Andric 60655ffd83dbSDimitry Andric // In the IR, TFE is supposed to be used with a 2 element struct return 6066349cc55cSDimitry Andric // type. The instruction really returns these two values in one contiguous 60675ffd83dbSDimitry Andric // register, with one additional dword beyond the loaded data. Rewrite the 60685ffd83dbSDimitry Andric // return type to use a single register result. 60695ffd83dbSDimitry Andric 60705ffd83dbSDimitry Andric if (IsTFE) { 60715ffd83dbSDimitry Andric Dst1Reg = MI.getOperand(1).getReg(); 60725ffd83dbSDimitry Andric if (MRI->getType(Dst1Reg) != S32) 60735ffd83dbSDimitry Andric return false; 60745ffd83dbSDimitry Andric 60755ffd83dbSDimitry Andric // TODO: Make sure the TFE operand bit is set. 607681ad6265SDimitry Andric MI.removeOperand(1); 60775ffd83dbSDimitry Andric 60785ffd83dbSDimitry Andric // Handle the easy case that requires no repack instructions. 60795ffd83dbSDimitry Andric if (Ty == S32) { 60805ffd83dbSDimitry Andric B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 60815ffd83dbSDimitry Andric return true; 60825ffd83dbSDimitry Andric } 60835ffd83dbSDimitry Andric } 60845ffd83dbSDimitry Andric 60855ffd83dbSDimitry Andric // Now figure out how to copy the new result register back into the old 60865ffd83dbSDimitry Andric // result. 60875ffd83dbSDimitry Andric SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 60885ffd83dbSDimitry Andric 60895ffd83dbSDimitry Andric const int NumDataRegs = IsTFE ? 
ResultNumRegs - 1 : ResultNumRegs; 60905ffd83dbSDimitry Andric 60915ffd83dbSDimitry Andric if (ResultNumRegs == 1) { 60925ffd83dbSDimitry Andric assert(!IsTFE); 60935ffd83dbSDimitry Andric ResultRegs[0] = NewResultReg; 60945ffd83dbSDimitry Andric } else { 60955ffd83dbSDimitry Andric // We have to repack into a new vector of some kind. 60965ffd83dbSDimitry Andric for (int I = 0; I != NumDataRegs; ++I) 60975ffd83dbSDimitry Andric ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 60985ffd83dbSDimitry Andric B.buildUnmerge(ResultRegs, NewResultReg); 60995ffd83dbSDimitry Andric 61005ffd83dbSDimitry Andric // Drop the final TFE element to get the data part. The TFE result is 61015ffd83dbSDimitry Andric // directly written to the right place already. 61025ffd83dbSDimitry Andric if (IsTFE) 61035ffd83dbSDimitry Andric ResultRegs.resize(NumDataRegs); 61045ffd83dbSDimitry Andric } 61055ffd83dbSDimitry Andric 61065ffd83dbSDimitry Andric // For an s16 scalar result, we form an s32 result with a truncate regardless 61075ffd83dbSDimitry Andric // of packed vs. unpacked. 61085ffd83dbSDimitry Andric if (IsD16 && !Ty.isVector()) { 61095ffd83dbSDimitry Andric B.buildTrunc(DstReg, ResultRegs[0]); 61105ffd83dbSDimitry Andric return true; 61115ffd83dbSDimitry Andric } 61125ffd83dbSDimitry Andric 61135ffd83dbSDimitry Andric // Avoid a build/concat_vector of 1 entry. 61145ffd83dbSDimitry Andric if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 61155ffd83dbSDimitry Andric B.buildBitcast(DstReg, ResultRegs[0]); 61165ffd83dbSDimitry Andric return true; 61175ffd83dbSDimitry Andric } 61185ffd83dbSDimitry Andric 61195ffd83dbSDimitry Andric assert(Ty.isVector()); 61205ffd83dbSDimitry Andric 61215ffd83dbSDimitry Andric if (IsD16) { 61225ffd83dbSDimitry Andric // For packed D16 results with TFE enabled, all the data components are 61235ffd83dbSDimitry Andric // S32. Cast back to the expected type. 
61245ffd83dbSDimitry Andric // 61255ffd83dbSDimitry Andric // TODO: We don't really need to use load s32 elements. We would only need one 61265ffd83dbSDimitry Andric // cast for the TFE result if a multiple of v2s16 was used. 61275ffd83dbSDimitry Andric if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 61285ffd83dbSDimitry Andric for (Register &Reg : ResultRegs) 61295ffd83dbSDimitry Andric Reg = B.buildBitcast(V2S16, Reg).getReg(0); 61305ffd83dbSDimitry Andric } else if (ST.hasUnpackedD16VMem()) { 61315ffd83dbSDimitry Andric for (Register &Reg : ResultRegs) 61325ffd83dbSDimitry Andric Reg = B.buildTrunc(S16, Reg).getReg(0); 61335ffd83dbSDimitry Andric } 61345ffd83dbSDimitry Andric } 61355ffd83dbSDimitry Andric 61365ffd83dbSDimitry Andric auto padWithUndef = [&](LLT Ty, int NumElts) { 61375ffd83dbSDimitry Andric if (NumElts == 0) 61385ffd83dbSDimitry Andric return; 61395ffd83dbSDimitry Andric Register Undef = B.buildUndef(Ty).getReg(0); 61405ffd83dbSDimitry Andric for (int I = 0; I != NumElts; ++I) 61415ffd83dbSDimitry Andric ResultRegs.push_back(Undef); 61425ffd83dbSDimitry Andric }; 61435ffd83dbSDimitry Andric 61445ffd83dbSDimitry Andric // Pad out any elements eliminated due to the dmask. 61455ffd83dbSDimitry Andric LLT ResTy = MRI->getType(ResultRegs[0]); 61465ffd83dbSDimitry Andric if (!ResTy.isVector()) { 61475ffd83dbSDimitry Andric padWithUndef(ResTy, NumElts - ResultRegs.size()); 61485ffd83dbSDimitry Andric B.buildBuildVector(DstReg, ResultRegs); 61495ffd83dbSDimitry Andric return true; 61505ffd83dbSDimitry Andric } 61515ffd83dbSDimitry Andric 61525ffd83dbSDimitry Andric assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 61535ffd83dbSDimitry Andric const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 61545ffd83dbSDimitry Andric 61555ffd83dbSDimitry Andric // Deal with the one annoying legal case. 
6156fe6060f1SDimitry Andric const LLT V3S16 = LLT::fixed_vector(3, 16); 61575ffd83dbSDimitry Andric if (Ty == V3S16) { 61580eae32dcSDimitry Andric if (IsTFE) { 61590eae32dcSDimitry Andric if (ResultRegs.size() == 1) { 61600eae32dcSDimitry Andric NewResultReg = ResultRegs[0]; 61610eae32dcSDimitry Andric } else if (ResultRegs.size() == 2) { 61620eae32dcSDimitry Andric LLT V4S16 = LLT::fixed_vector(4, 16); 61630eae32dcSDimitry Andric NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0); 61640eae32dcSDimitry Andric } else { 61650eae32dcSDimitry Andric return false; 61660eae32dcSDimitry Andric } 61670eae32dcSDimitry Andric } 61680eae32dcSDimitry Andric 61690eae32dcSDimitry Andric if (MRI->getType(DstReg).getNumElements() < 61700eae32dcSDimitry Andric MRI->getType(NewResultReg).getNumElements()) { 61710eae32dcSDimitry Andric B.buildDeleteTrailingVectorElements(DstReg, NewResultReg); 61720eae32dcSDimitry Andric } else { 61730eae32dcSDimitry Andric B.buildPadVectorWithUndefElements(DstReg, NewResultReg); 61740eae32dcSDimitry Andric } 61755ffd83dbSDimitry Andric return true; 61765ffd83dbSDimitry Andric } 61775ffd83dbSDimitry Andric 61785ffd83dbSDimitry Andric padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 61795ffd83dbSDimitry Andric B.buildConcatVectors(DstReg, ResultRegs); 61805ffd83dbSDimitry Andric return true; 61815ffd83dbSDimitry Andric } 61825ffd83dbSDimitry Andric 61835ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeSBufferLoad( 6184e8d8bef9SDimitry Andric LegalizerHelper &Helper, MachineInstr &MI) const { 6185e8d8bef9SDimitry Andric MachineIRBuilder &B = Helper.MIRBuilder; 6186e8d8bef9SDimitry Andric GISelChangeObserver &Observer = Helper.Observer; 6187e8d8bef9SDimitry Andric 61885ffd83dbSDimitry Andric Register Dst = MI.getOperand(0).getReg(); 61895ffd83dbSDimitry Andric LLT Ty = B.getMRI()->getType(Dst); 61905ffd83dbSDimitry Andric unsigned Size = Ty.getSizeInBits(); 61915ffd83dbSDimitry Andric MachineFunction &MF = B.getMF(); 
61925ffd83dbSDimitry Andric 61935ffd83dbSDimitry Andric Observer.changingInstr(MI); 61945ffd83dbSDimitry Andric 619506c3fb27SDimitry Andric // Handle needing to s.buffer.load() a p8 value. 619606c3fb27SDimitry Andric if (hasBufferRsrcWorkaround(Ty)) { 619706c3fb27SDimitry Andric Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0); 619806c3fb27SDimitry Andric Dst = MI.getOperand(0).getReg(); 619906c3fb27SDimitry Andric B.setInsertPt(B.getMBB(), MI); 620006c3fb27SDimitry Andric } 6201fe6060f1SDimitry Andric if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) { 6202e8d8bef9SDimitry Andric Ty = getBitcastRegisterType(Ty); 6203e8d8bef9SDimitry Andric Helper.bitcastDst(MI, Ty, 0); 6204e8d8bef9SDimitry Andric Dst = MI.getOperand(0).getReg(); 6205e8d8bef9SDimitry Andric B.setInsertPt(B.getMBB(), MI); 6206e8d8bef9SDimitry Andric } 6207e8d8bef9SDimitry Andric 62085ffd83dbSDimitry Andric // FIXME: We don't really need this intermediate instruction. The intrinsic 62095ffd83dbSDimitry Andric // should be fixed to have a memory operand. Since it's readnone, we're not 62105ffd83dbSDimitry Andric // allowed to add one. 62115ffd83dbSDimitry Andric MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 621281ad6265SDimitry Andric MI.removeOperand(1); // Remove intrinsic ID 62135ffd83dbSDimitry Andric 62145ffd83dbSDimitry Andric // FIXME: When intrinsic definition is fixed, this should have an MMO already. 62155ffd83dbSDimitry Andric // TODO: Should this use datalayout alignment? 
62165ffd83dbSDimitry Andric const unsigned MemSize = (Size + 7) / 8; 62175ffd83dbSDimitry Andric const Align MemAlign(4); 62185ffd83dbSDimitry Andric MachineMemOperand *MMO = MF.getMachineMemOperand( 62195ffd83dbSDimitry Andric MachinePointerInfo(), 62205ffd83dbSDimitry Andric MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 62215ffd83dbSDimitry Andric MachineMemOperand::MOInvariant, 62225ffd83dbSDimitry Andric MemSize, MemAlign); 62235ffd83dbSDimitry Andric MI.addMemOperand(MF, MMO); 62245ffd83dbSDimitry Andric 62255ffd83dbSDimitry Andric // There are no 96-bit result scalar loads, but widening to 128-bit should 62265ffd83dbSDimitry Andric // always be legal. We may need to restore this to a 96-bit result if it turns 62275ffd83dbSDimitry Andric // out this needs to be converted to a vector load during RegBankSelect. 62285ffd83dbSDimitry Andric if (!isPowerOf2_32(Size)) { 62295ffd83dbSDimitry Andric if (Ty.isVector()) 62305ffd83dbSDimitry Andric Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 62315ffd83dbSDimitry Andric else 62325ffd83dbSDimitry Andric Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 62335ffd83dbSDimitry Andric } 62345ffd83dbSDimitry Andric 62355ffd83dbSDimitry Andric Observer.changedInstr(MI); 62365ffd83dbSDimitry Andric return true; 62375ffd83dbSDimitry Andric } 62385ffd83dbSDimitry Andric 6239e8d8bef9SDimitry Andric // TODO: Move to selection 62405ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 62410b57cec5SDimitry Andric MachineRegisterInfo &MRI, 62420b57cec5SDimitry Andric MachineIRBuilder &B) const { 6243fe6060f1SDimitry Andric if (!ST.isTrapHandlerEnabled() || 6244fe6060f1SDimitry Andric ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) 6245fe6060f1SDimitry Andric return legalizeTrapEndpgm(MI, MRI, B); 6246fe6060f1SDimitry Andric 624706c3fb27SDimitry Andric const Module *M = B.getMF().getFunction().getParent(); 624806c3fb27SDimitry Andric unsigned 
CodeObjectVersion = AMDGPU::getCodeObjectVersion(*M); 624906c3fb27SDimitry Andric if (CodeObjectVersion <= AMDGPU::AMDHSA_COV3) 6250fe6060f1SDimitry Andric return legalizeTrapHsaQueuePtr(MI, MRI, B); 6251fe6060f1SDimitry Andric 625206c3fb27SDimitry Andric return ST.supportsGetDoorbellID() ? 625306c3fb27SDimitry Andric legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B); 6254fe6060f1SDimitry Andric } 6255fe6060f1SDimitry Andric 6256fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeTrapEndpgm( 6257fe6060f1SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 625806c3fb27SDimitry Andric const DebugLoc &DL = MI.getDebugLoc(); 625906c3fb27SDimitry Andric MachineBasicBlock &BB = B.getMBB(); 626006c3fb27SDimitry Andric MachineFunction *MF = BB.getParent(); 626106c3fb27SDimitry Andric 626206c3fb27SDimitry Andric if (BB.succ_empty() && std::next(MI.getIterator()) == BB.end()) { 626306c3fb27SDimitry Andric BuildMI(BB, BB.end(), DL, B.getTII().get(AMDGPU::S_ENDPGM)) 626406c3fb27SDimitry Andric .addImm(0); 626506c3fb27SDimitry Andric MI.eraseFromParent(); 626606c3fb27SDimitry Andric return true; 626706c3fb27SDimitry Andric } 626806c3fb27SDimitry Andric 626906c3fb27SDimitry Andric // We need a block split to make the real endpgm a terminator. We also don't 627006c3fb27SDimitry Andric // want to break phis in successor blocks, so we can't just delete to the 627106c3fb27SDimitry Andric // end of the block. 
627206c3fb27SDimitry Andric BB.splitAt(MI, false /*UpdateLiveIns*/); 627306c3fb27SDimitry Andric MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); 627406c3fb27SDimitry Andric MF->push_back(TrapBB); 627506c3fb27SDimitry Andric BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM)) 627606c3fb27SDimitry Andric .addImm(0); 627706c3fb27SDimitry Andric BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ)) 627806c3fb27SDimitry Andric .addMBB(TrapBB); 627906c3fb27SDimitry Andric 628006c3fb27SDimitry Andric BB.addSuccessor(TrapBB); 6281fe6060f1SDimitry Andric MI.eraseFromParent(); 6282fe6060f1SDimitry Andric return true; 6283fe6060f1SDimitry Andric } 6284fe6060f1SDimitry Andric 6285fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( 6286fe6060f1SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 628781ad6265SDimitry Andric MachineFunction &MF = B.getMF(); 628881ad6265SDimitry Andric const LLT S64 = LLT::scalar(64); 628981ad6265SDimitry Andric 629081ad6265SDimitry Andric Register SGPR01(AMDGPU::SGPR0_SGPR1); 629181ad6265SDimitry Andric // For code object version 5, queue_ptr is passed through implicit kernarg. 
629206c3fb27SDimitry Andric if (AMDGPU::getCodeObjectVersion(*MF.getFunction().getParent()) >= 629306c3fb27SDimitry Andric AMDGPU::AMDHSA_COV5) { 629481ad6265SDimitry Andric AMDGPUTargetLowering::ImplicitParameter Param = 629581ad6265SDimitry Andric AMDGPUTargetLowering::QUEUE_PTR; 629681ad6265SDimitry Andric uint64_t Offset = 629781ad6265SDimitry Andric ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param); 629881ad6265SDimitry Andric 629981ad6265SDimitry Andric Register KernargPtrReg = MRI.createGenericVirtualRegister( 630081ad6265SDimitry Andric LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 630181ad6265SDimitry Andric 630281ad6265SDimitry Andric if (!loadInputValue(KernargPtrReg, B, 630381ad6265SDimitry Andric AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 630481ad6265SDimitry Andric return false; 630581ad6265SDimitry Andric 630681ad6265SDimitry Andric // TODO: can we be smarter about machine pointer info? 630781ad6265SDimitry Andric MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 630881ad6265SDimitry Andric MachineMemOperand *MMO = MF.getMachineMemOperand( 630981ad6265SDimitry Andric PtrInfo, 631081ad6265SDimitry Andric MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 631181ad6265SDimitry Andric MachineMemOperand::MOInvariant, 631281ad6265SDimitry Andric LLT::scalar(64), commonAlignment(Align(64), Offset)); 631381ad6265SDimitry Andric 631481ad6265SDimitry Andric // Pointer address 631581ad6265SDimitry Andric Register LoadAddr = MRI.createGenericVirtualRegister( 631681ad6265SDimitry Andric LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 631781ad6265SDimitry Andric B.buildPtrAdd(LoadAddr, KernargPtrReg, 631881ad6265SDimitry Andric B.buildConstant(LLT::scalar(64), Offset).getReg(0)); 631981ad6265SDimitry Andric // Load address 632081ad6265SDimitry Andric Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0); 632181ad6265SDimitry Andric B.buildCopy(SGPR01, Temp); 632281ad6265SDimitry Andric B.buildInstr(AMDGPU::S_TRAP) 
632381ad6265SDimitry Andric .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 632481ad6265SDimitry Andric .addReg(SGPR01, RegState::Implicit); 632581ad6265SDimitry Andric MI.eraseFromParent(); 632681ad6265SDimitry Andric return true; 632781ad6265SDimitry Andric } 632881ad6265SDimitry Andric 63295ffd83dbSDimitry Andric // Pass queue pointer to trap handler as input, and insert trap instruction 63305ffd83dbSDimitry Andric // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 6331e8d8bef9SDimitry Andric Register LiveIn = 6332e8d8bef9SDimitry Andric MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 6333e8d8bef9SDimitry Andric if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 63345ffd83dbSDimitry Andric return false; 6335e8d8bef9SDimitry Andric 63365ffd83dbSDimitry Andric B.buildCopy(SGPR01, LiveIn); 63375ffd83dbSDimitry Andric B.buildInstr(AMDGPU::S_TRAP) 6338fe6060f1SDimitry Andric .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 63395ffd83dbSDimitry Andric .addReg(SGPR01, RegState::Implicit); 6340fe6060f1SDimitry Andric 6341fe6060f1SDimitry Andric MI.eraseFromParent(); 6342fe6060f1SDimitry Andric return true; 63435ffd83dbSDimitry Andric } 63445ffd83dbSDimitry Andric 6345fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeTrapHsa( 6346fe6060f1SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 6347fe6060f1SDimitry Andric B.buildInstr(AMDGPU::S_TRAP) 6348fe6060f1SDimitry Andric .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); 63495ffd83dbSDimitry Andric MI.eraseFromParent(); 63505ffd83dbSDimitry Andric return true; 63515ffd83dbSDimitry Andric } 63525ffd83dbSDimitry Andric 63535ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 63545ffd83dbSDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 6355349cc55cSDimitry Andric // Is non-HSA path or 
trap-handler disabled? Then, report a warning 63565ffd83dbSDimitry Andric // accordingly 6357fe6060f1SDimitry Andric if (!ST.isTrapHandlerEnabled() || 6358fe6060f1SDimitry Andric ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { 63595ffd83dbSDimitry Andric DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 63605ffd83dbSDimitry Andric "debugtrap handler not supported", 63615ffd83dbSDimitry Andric MI.getDebugLoc(), DS_Warning); 63625ffd83dbSDimitry Andric LLVMContext &Ctx = B.getMF().getFunction().getContext(); 63635ffd83dbSDimitry Andric Ctx.diagnose(NoTrap); 63645ffd83dbSDimitry Andric } else { 63655ffd83dbSDimitry Andric // Insert debug-trap instruction 6366fe6060f1SDimitry Andric B.buildInstr(AMDGPU::S_TRAP) 6367fe6060f1SDimitry Andric .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); 63685ffd83dbSDimitry Andric } 63695ffd83dbSDimitry Andric 63705ffd83dbSDimitry Andric MI.eraseFromParent(); 63715ffd83dbSDimitry Andric return true; 63725ffd83dbSDimitry Andric } 63735ffd83dbSDimitry Andric 6374e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, 6375e8d8bef9SDimitry Andric MachineIRBuilder &B) const { 6376e8d8bef9SDimitry Andric MachineRegisterInfo &MRI = *B.getMRI(); 6377e8d8bef9SDimitry Andric const LLT S16 = LLT::scalar(16); 6378e8d8bef9SDimitry Andric const LLT S32 = LLT::scalar(32); 637981ad6265SDimitry Andric const LLT V2S16 = LLT::fixed_vector(2, 16); 638081ad6265SDimitry Andric const LLT V3S32 = LLT::fixed_vector(3, 32); 6381e8d8bef9SDimitry Andric 6382e8d8bef9SDimitry Andric Register DstReg = MI.getOperand(0).getReg(); 6383e8d8bef9SDimitry Andric Register NodePtr = MI.getOperand(2).getReg(); 6384e8d8bef9SDimitry Andric Register RayExtent = MI.getOperand(3).getReg(); 6385e8d8bef9SDimitry Andric Register RayOrigin = MI.getOperand(4).getReg(); 6386e8d8bef9SDimitry Andric Register RayDir = MI.getOperand(5).getReg(); 6387e8d8bef9SDimitry Andric Register RayInvDir = 
MI.getOperand(6).getReg(); 6388e8d8bef9SDimitry Andric Register TDescr = MI.getOperand(7).getReg(); 6389e8d8bef9SDimitry Andric 6390fe6060f1SDimitry Andric if (!ST.hasGFX10_AEncoding()) { 6391fe6060f1SDimitry Andric DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(), 6392fe6060f1SDimitry Andric "intrinsic not supported on subtarget", 6393fe6060f1SDimitry Andric MI.getDebugLoc()); 6394fe6060f1SDimitry Andric B.getMF().getFunction().getContext().diagnose(BadIntrin); 6395fe6060f1SDimitry Andric return false; 6396fe6060f1SDimitry Andric } 6397fe6060f1SDimitry Andric 639881ad6265SDimitry Andric const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST); 6399349cc55cSDimitry Andric const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; 6400349cc55cSDimitry Andric const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; 6401349cc55cSDimitry Andric const unsigned NumVDataDwords = 4; 6402349cc55cSDimitry Andric const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11); 640381ad6265SDimitry Andric const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords; 640481ad6265SDimitry Andric const bool UseNSA = ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize(); 6405349cc55cSDimitry Andric const unsigned BaseOpcodes[2][2] = { 6406349cc55cSDimitry Andric {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16}, 6407349cc55cSDimitry Andric {AMDGPU::IMAGE_BVH64_INTERSECT_RAY, 6408349cc55cSDimitry Andric AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}}; 6409349cc55cSDimitry Andric int Opcode; 6410349cc55cSDimitry Andric if (UseNSA) { 641181ad6265SDimitry Andric Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], 641281ad6265SDimitry Andric IsGFX11Plus ? 
AMDGPU::MIMGEncGfx11NSA 641381ad6265SDimitry Andric : AMDGPU::MIMGEncGfx10NSA, 6414349cc55cSDimitry Andric NumVDataDwords, NumVAddrDwords); 6415349cc55cSDimitry Andric } else { 641681ad6265SDimitry Andric Opcode = AMDGPU::getMIMGOpcode( 641781ad6265SDimitry Andric BaseOpcodes[Is64][IsA16], 641881ad6265SDimitry Andric IsGFX11Plus ? AMDGPU::MIMGEncGfx11Default : AMDGPU::MIMGEncGfx10Default, 6419bdd1243dSDimitry Andric NumVDataDwords, NumVAddrDwords); 6420349cc55cSDimitry Andric } 6421349cc55cSDimitry Andric assert(Opcode != -1); 6422e8d8bef9SDimitry Andric 6423e8d8bef9SDimitry Andric SmallVector<Register, 12> Ops; 642481ad6265SDimitry Andric if (UseNSA && IsGFX11Plus) { 642581ad6265SDimitry Andric auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) { 642681ad6265SDimitry Andric auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src); 6427bdd1243dSDimitry Andric auto Merged = B.buildMergeLikeInstr( 642881ad6265SDimitry Andric V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)}); 642981ad6265SDimitry Andric Ops.push_back(Merged.getReg(0)); 643081ad6265SDimitry Andric }; 643181ad6265SDimitry Andric 643281ad6265SDimitry Andric Ops.push_back(NodePtr); 643381ad6265SDimitry Andric Ops.push_back(RayExtent); 643481ad6265SDimitry Andric packLanes(RayOrigin); 643581ad6265SDimitry Andric 643681ad6265SDimitry Andric if (IsA16) { 643781ad6265SDimitry Andric auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir); 643881ad6265SDimitry Andric auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir); 6439bdd1243dSDimitry Andric auto MergedDir = B.buildMergeLikeInstr( 644081ad6265SDimitry Andric V3S32, 6441bdd1243dSDimitry Andric {B.buildBitcast( 6442bdd1243dSDimitry Andric S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0), 644381ad6265SDimitry Andric UnmergeRayDir.getReg(0)})) 644481ad6265SDimitry Andric .getReg(0), 6445bdd1243dSDimitry Andric B.buildBitcast( 6446bdd1243dSDimitry Andric S32, B.buildMergeLikeInstr(V2S16, 
{UnmergeRayInvDir.getReg(1), 644781ad6265SDimitry Andric UnmergeRayDir.getReg(1)})) 644881ad6265SDimitry Andric .getReg(0), 6449bdd1243dSDimitry Andric B.buildBitcast( 6450bdd1243dSDimitry Andric S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2), 645181ad6265SDimitry Andric UnmergeRayDir.getReg(2)})) 645281ad6265SDimitry Andric .getReg(0)}); 645381ad6265SDimitry Andric Ops.push_back(MergedDir.getReg(0)); 645481ad6265SDimitry Andric } else { 645581ad6265SDimitry Andric packLanes(RayDir); 645681ad6265SDimitry Andric packLanes(RayInvDir); 645781ad6265SDimitry Andric } 645881ad6265SDimitry Andric } else { 6459e8d8bef9SDimitry Andric if (Is64) { 6460e8d8bef9SDimitry Andric auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 6461e8d8bef9SDimitry Andric Ops.push_back(Unmerge.getReg(0)); 6462e8d8bef9SDimitry Andric Ops.push_back(Unmerge.getReg(1)); 6463e8d8bef9SDimitry Andric } else { 6464e8d8bef9SDimitry Andric Ops.push_back(NodePtr); 6465e8d8bef9SDimitry Andric } 6466e8d8bef9SDimitry Andric Ops.push_back(RayExtent); 6467e8d8bef9SDimitry Andric 6468e8d8bef9SDimitry Andric auto packLanes = [&Ops, &S32, &B](Register Src) { 64690eae32dcSDimitry Andric auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src); 6470e8d8bef9SDimitry Andric Ops.push_back(Unmerge.getReg(0)); 6471e8d8bef9SDimitry Andric Ops.push_back(Unmerge.getReg(1)); 6472e8d8bef9SDimitry Andric Ops.push_back(Unmerge.getReg(2)); 6473e8d8bef9SDimitry Andric }; 6474e8d8bef9SDimitry Andric 6475e8d8bef9SDimitry Andric packLanes(RayOrigin); 6476e8d8bef9SDimitry Andric if (IsA16) { 64770eae32dcSDimitry Andric auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir); 64780eae32dcSDimitry Andric auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir); 6479e8d8bef9SDimitry Andric Register R1 = MRI.createGenericVirtualRegister(S32); 6480e8d8bef9SDimitry Andric Register R2 = MRI.createGenericVirtualRegister(S32); 6481e8d8bef9SDimitry Andric Register R3 = MRI.createGenericVirtualRegister(S32); 
6482bdd1243dSDimitry Andric B.buildMergeLikeInstr(R1, 6483bdd1243dSDimitry Andric {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 6484bdd1243dSDimitry Andric B.buildMergeLikeInstr( 6485bdd1243dSDimitry Andric R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 6486bdd1243dSDimitry Andric B.buildMergeLikeInstr( 6487bdd1243dSDimitry Andric R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 6488e8d8bef9SDimitry Andric Ops.push_back(R1); 6489e8d8bef9SDimitry Andric Ops.push_back(R2); 6490e8d8bef9SDimitry Andric Ops.push_back(R3); 6491e8d8bef9SDimitry Andric } else { 6492e8d8bef9SDimitry Andric packLanes(RayDir); 6493e8d8bef9SDimitry Andric packLanes(RayInvDir); 6494e8d8bef9SDimitry Andric } 649581ad6265SDimitry Andric } 6496e8d8bef9SDimitry Andric 6497349cc55cSDimitry Andric if (!UseNSA) { 6498349cc55cSDimitry Andric // Build a single vector containing all the operands so far prepared. 6499349cc55cSDimitry Andric LLT OpTy = LLT::fixed_vector(Ops.size(), 32); 6500bdd1243dSDimitry Andric Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0); 6501349cc55cSDimitry Andric Ops.clear(); 6502349cc55cSDimitry Andric Ops.push_back(MergedOps); 6503349cc55cSDimitry Andric } 6504349cc55cSDimitry Andric 6505e8d8bef9SDimitry Andric auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) 6506e8d8bef9SDimitry Andric .addDef(DstReg) 6507e8d8bef9SDimitry Andric .addImm(Opcode); 6508e8d8bef9SDimitry Andric 6509e8d8bef9SDimitry Andric for (Register R : Ops) { 6510e8d8bef9SDimitry Andric MIB.addUse(R); 6511e8d8bef9SDimitry Andric } 6512e8d8bef9SDimitry Andric 6513e8d8bef9SDimitry Andric MIB.addUse(TDescr) 6514e8d8bef9SDimitry Andric .addImm(IsA16 ? 
1 : 0) 6515e8d8bef9SDimitry Andric .cloneMemRefs(MI); 6516e8d8bef9SDimitry Andric 6517e8d8bef9SDimitry Andric MI.eraseFromParent(); 6518e8d8bef9SDimitry Andric return true; 6519e8d8bef9SDimitry Andric } 6520e8d8bef9SDimitry Andric 652181ad6265SDimitry Andric bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI, 652281ad6265SDimitry Andric MachineIRBuilder &B) const { 652381ad6265SDimitry Andric unsigned Opc; 652481ad6265SDimitry Andric int RoundMode = MI.getOperand(2).getImm(); 652581ad6265SDimitry Andric 652681ad6265SDimitry Andric if (RoundMode == (int)RoundingMode::TowardPositive) 652781ad6265SDimitry Andric Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD; 652881ad6265SDimitry Andric else if (RoundMode == (int)RoundingMode::TowardNegative) 652981ad6265SDimitry Andric Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD; 653081ad6265SDimitry Andric else 653181ad6265SDimitry Andric return false; 653281ad6265SDimitry Andric 653381ad6265SDimitry Andric B.buildInstr(Opc) 653481ad6265SDimitry Andric .addDef(MI.getOperand(0).getReg()) 653581ad6265SDimitry Andric .addUse(MI.getOperand(1).getReg()); 653681ad6265SDimitry Andric 653704eeddc0SDimitry Andric MI.eraseFromParent(); 653881ad6265SDimitry Andric 653904eeddc0SDimitry Andric return true; 654004eeddc0SDimitry Andric } 654104eeddc0SDimitry Andric 65425ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 65435ffd83dbSDimitry Andric MachineInstr &MI) const { 65445ffd83dbSDimitry Andric MachineIRBuilder &B = Helper.MIRBuilder; 65455ffd83dbSDimitry Andric MachineRegisterInfo &MRI = *B.getMRI(); 65465ffd83dbSDimitry Andric 65470b57cec5SDimitry Andric // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 
6548480093f4SDimitry Andric auto IntrID = MI.getIntrinsicID(); 6549480093f4SDimitry Andric switch (IntrID) { 6550480093f4SDimitry Andric case Intrinsic::amdgcn_if: 6551480093f4SDimitry Andric case Intrinsic::amdgcn_else: { 6552480093f4SDimitry Andric MachineInstr *Br = nullptr; 65535ffd83dbSDimitry Andric MachineBasicBlock *UncondBrTarget = nullptr; 6554e8d8bef9SDimitry Andric bool Negated = false; 6555e8d8bef9SDimitry Andric if (MachineInstr *BrCond = 6556e8d8bef9SDimitry Andric verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 65570b57cec5SDimitry Andric const SIRegisterInfo *TRI 65580b57cec5SDimitry Andric = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 65590b57cec5SDimitry Andric 65600b57cec5SDimitry Andric Register Def = MI.getOperand(1).getReg(); 65610b57cec5SDimitry Andric Register Use = MI.getOperand(3).getReg(); 6562480093f4SDimitry Andric 65635ffd83dbSDimitry Andric MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 6564e8d8bef9SDimitry Andric 6565e8d8bef9SDimitry Andric if (Negated) 6566e8d8bef9SDimitry Andric std::swap(CondBrTarget, UncondBrTarget); 6567e8d8bef9SDimitry Andric 65685ffd83dbSDimitry Andric B.setInsertPt(B.getMBB(), BrCond->getIterator()); 6569480093f4SDimitry Andric if (IntrID == Intrinsic::amdgcn_if) { 65700b57cec5SDimitry Andric B.buildInstr(AMDGPU::SI_IF) 65710b57cec5SDimitry Andric .addDef(Def) 65720b57cec5SDimitry Andric .addUse(Use) 65735ffd83dbSDimitry Andric .addMBB(UncondBrTarget); 6574480093f4SDimitry Andric } else { 6575480093f4SDimitry Andric B.buildInstr(AMDGPU::SI_ELSE) 6576480093f4SDimitry Andric .addDef(Def) 6577480093f4SDimitry Andric .addUse(Use) 6578e8d8bef9SDimitry Andric .addMBB(UncondBrTarget); 6579480093f4SDimitry Andric } 6580480093f4SDimitry Andric 65815ffd83dbSDimitry Andric if (Br) { 65825ffd83dbSDimitry Andric Br->getOperand(0).setMBB(CondBrTarget); 65835ffd83dbSDimitry Andric } else { 65845ffd83dbSDimitry Andric // The IRTranslator skips inserting the G_BR for 
fallthrough cases, but 65855ffd83dbSDimitry Andric // since we're swapping branch targets it needs to be reinserted. 65865ffd83dbSDimitry Andric // FIXME: IRTranslator should probably not do this 65875ffd83dbSDimitry Andric B.buildBr(*CondBrTarget); 65885ffd83dbSDimitry Andric } 65890b57cec5SDimitry Andric 65900b57cec5SDimitry Andric MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 65910b57cec5SDimitry Andric MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 65920b57cec5SDimitry Andric MI.eraseFromParent(); 65930b57cec5SDimitry Andric BrCond->eraseFromParent(); 65940b57cec5SDimitry Andric return true; 65950b57cec5SDimitry Andric } 65960b57cec5SDimitry Andric 65970b57cec5SDimitry Andric return false; 65980b57cec5SDimitry Andric } 65990b57cec5SDimitry Andric case Intrinsic::amdgcn_loop: { 6600480093f4SDimitry Andric MachineInstr *Br = nullptr; 66015ffd83dbSDimitry Andric MachineBasicBlock *UncondBrTarget = nullptr; 6602e8d8bef9SDimitry Andric bool Negated = false; 6603e8d8bef9SDimitry Andric if (MachineInstr *BrCond = 6604e8d8bef9SDimitry Andric verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 66050b57cec5SDimitry Andric const SIRegisterInfo *TRI 66060b57cec5SDimitry Andric = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 66070b57cec5SDimitry Andric 66085ffd83dbSDimitry Andric MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 66090b57cec5SDimitry Andric Register Reg = MI.getOperand(2).getReg(); 66105ffd83dbSDimitry Andric 6611e8d8bef9SDimitry Andric if (Negated) 6612e8d8bef9SDimitry Andric std::swap(CondBrTarget, UncondBrTarget); 6613e8d8bef9SDimitry Andric 66145ffd83dbSDimitry Andric B.setInsertPt(B.getMBB(), BrCond->getIterator()); 66150b57cec5SDimitry Andric B.buildInstr(AMDGPU::SI_LOOP) 66160b57cec5SDimitry Andric .addUse(Reg) 66175ffd83dbSDimitry Andric .addMBB(UncondBrTarget); 66185ffd83dbSDimitry Andric 66195ffd83dbSDimitry Andric if (Br) 66205ffd83dbSDimitry Andric Br->getOperand(0).setMBB(CondBrTarget); 
66215ffd83dbSDimitry Andric else 66225ffd83dbSDimitry Andric B.buildBr(*CondBrTarget); 66235ffd83dbSDimitry Andric 66240b57cec5SDimitry Andric MI.eraseFromParent(); 66250b57cec5SDimitry Andric BrCond->eraseFromParent(); 66260b57cec5SDimitry Andric MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 66270b57cec5SDimitry Andric return true; 66280b57cec5SDimitry Andric } 66290b57cec5SDimitry Andric 66300b57cec5SDimitry Andric return false; 66310b57cec5SDimitry Andric } 663206c3fb27SDimitry Andric case Intrinsic::amdgcn_make_buffer_rsrc: 663306c3fb27SDimitry Andric return legalizePointerAsRsrcIntrin(MI, MRI, B); 66340b57cec5SDimitry Andric case Intrinsic::amdgcn_kernarg_segment_ptr: 66355ffd83dbSDimitry Andric if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 66365ffd83dbSDimitry Andric // This only makes sense to call in a kernel, so just lower to null. 66375ffd83dbSDimitry Andric B.buildConstant(MI.getOperand(0).getReg(), 0); 66385ffd83dbSDimitry Andric MI.eraseFromParent(); 66395ffd83dbSDimitry Andric return true; 66405ffd83dbSDimitry Andric } 66415ffd83dbSDimitry Andric 66420b57cec5SDimitry Andric return legalizePreloadedArgIntrin( 66430b57cec5SDimitry Andric MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 66440b57cec5SDimitry Andric case Intrinsic::amdgcn_implicitarg_ptr: 66450b57cec5SDimitry Andric return legalizeImplicitArgPtr(MI, MRI, B); 66460b57cec5SDimitry Andric case Intrinsic::amdgcn_workitem_id_x: 664781ad6265SDimitry Andric return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0, 66480b57cec5SDimitry Andric AMDGPUFunctionArgInfo::WORKITEM_ID_X); 66490b57cec5SDimitry Andric case Intrinsic::amdgcn_workitem_id_y: 665081ad6265SDimitry Andric return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1, 66510b57cec5SDimitry Andric AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 66520b57cec5SDimitry Andric case Intrinsic::amdgcn_workitem_id_z: 665381ad6265SDimitry Andric return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2, 66540b57cec5SDimitry Andric 
AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 66550b57cec5SDimitry Andric case Intrinsic::amdgcn_workgroup_id_x: 66560b57cec5SDimitry Andric return legalizePreloadedArgIntrin(MI, MRI, B, 66570b57cec5SDimitry Andric AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 66580b57cec5SDimitry Andric case Intrinsic::amdgcn_workgroup_id_y: 66590b57cec5SDimitry Andric return legalizePreloadedArgIntrin(MI, MRI, B, 66600b57cec5SDimitry Andric AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 66610b57cec5SDimitry Andric case Intrinsic::amdgcn_workgroup_id_z: 66620b57cec5SDimitry Andric return legalizePreloadedArgIntrin(MI, MRI, B, 66630b57cec5SDimitry Andric AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 6664fcaf7f86SDimitry Andric case Intrinsic::amdgcn_lds_kernel_id: 6665fcaf7f86SDimitry Andric return legalizePreloadedArgIntrin(MI, MRI, B, 6666fcaf7f86SDimitry Andric AMDGPUFunctionArgInfo::LDS_KERNEL_ID); 66670b57cec5SDimitry Andric case Intrinsic::amdgcn_dispatch_ptr: 66680b57cec5SDimitry Andric return legalizePreloadedArgIntrin(MI, MRI, B, 66690b57cec5SDimitry Andric AMDGPUFunctionArgInfo::DISPATCH_PTR); 66700b57cec5SDimitry Andric case Intrinsic::amdgcn_queue_ptr: 66710b57cec5SDimitry Andric return legalizePreloadedArgIntrin(MI, MRI, B, 66720b57cec5SDimitry Andric AMDGPUFunctionArgInfo::QUEUE_PTR); 66730b57cec5SDimitry Andric case Intrinsic::amdgcn_implicit_buffer_ptr: 66740b57cec5SDimitry Andric return legalizePreloadedArgIntrin( 66750b57cec5SDimitry Andric MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 66760b57cec5SDimitry Andric case Intrinsic::amdgcn_dispatch_id: 66770b57cec5SDimitry Andric return legalizePreloadedArgIntrin(MI, MRI, B, 66780b57cec5SDimitry Andric AMDGPUFunctionArgInfo::DISPATCH_ID); 667981ad6265SDimitry Andric case Intrinsic::r600_read_ngroups_x: 668081ad6265SDimitry Andric // TODO: Emit error for hsa 668181ad6265SDimitry Andric return legalizeKernargMemParameter(MI, B, 668281ad6265SDimitry Andric SI::KernelInputOffsets::NGROUPS_X); 668381ad6265SDimitry Andric case 
Intrinsic::r600_read_ngroups_y: 668481ad6265SDimitry Andric return legalizeKernargMemParameter(MI, B, 668581ad6265SDimitry Andric SI::KernelInputOffsets::NGROUPS_Y); 668681ad6265SDimitry Andric case Intrinsic::r600_read_ngroups_z: 668781ad6265SDimitry Andric return legalizeKernargMemParameter(MI, B, 668881ad6265SDimitry Andric SI::KernelInputOffsets::NGROUPS_Z); 668981ad6265SDimitry Andric case Intrinsic::r600_read_local_size_x: 669081ad6265SDimitry Andric // TODO: Could insert G_ASSERT_ZEXT from s16 669181ad6265SDimitry Andric return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X); 669281ad6265SDimitry Andric case Intrinsic::r600_read_local_size_y: 669381ad6265SDimitry Andric // TODO: Could insert G_ASSERT_ZEXT from s16 669481ad6265SDimitry Andric return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Y); 669581ad6265SDimitry Andric // TODO: Could insert G_ASSERT_ZEXT from s16 669681ad6265SDimitry Andric case Intrinsic::r600_read_local_size_z: 669781ad6265SDimitry Andric return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z); 669881ad6265SDimitry Andric case Intrinsic::r600_read_global_size_x: 669981ad6265SDimitry Andric return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X); 670081ad6265SDimitry Andric case Intrinsic::r600_read_global_size_y: 670181ad6265SDimitry Andric return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y); 670281ad6265SDimitry Andric case Intrinsic::r600_read_global_size_z: 670381ad6265SDimitry Andric return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z); 67048bcb0991SDimitry Andric case Intrinsic::amdgcn_fdiv_fast: 67058bcb0991SDimitry Andric return legalizeFDIVFastIntrin(MI, MRI, B); 67068bcb0991SDimitry Andric case Intrinsic::amdgcn_is_shared: 67078bcb0991SDimitry Andric return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 67088bcb0991SDimitry Andric case 
Intrinsic::amdgcn_is_private: 67098bcb0991SDimitry Andric return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 67108bcb0991SDimitry Andric case Intrinsic::amdgcn_wavefrontsize: { 67118bcb0991SDimitry Andric B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 67128bcb0991SDimitry Andric MI.eraseFromParent(); 67138bcb0991SDimitry Andric return true; 67148bcb0991SDimitry Andric } 67155ffd83dbSDimitry Andric case Intrinsic::amdgcn_s_buffer_load: 6716e8d8bef9SDimitry Andric return legalizeSBufferLoad(Helper, MI); 67178bcb0991SDimitry Andric case Intrinsic::amdgcn_raw_buffer_store: 671806c3fb27SDimitry Andric case Intrinsic::amdgcn_raw_ptr_buffer_store: 67195ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_store: 672006c3fb27SDimitry Andric case Intrinsic::amdgcn_struct_ptr_buffer_store: 67215ffd83dbSDimitry Andric return legalizeBufferStore(MI, MRI, B, false, false); 67228bcb0991SDimitry Andric case Intrinsic::amdgcn_raw_buffer_store_format: 672306c3fb27SDimitry Andric case Intrinsic::amdgcn_raw_ptr_buffer_store_format: 67245ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_store_format: 672506c3fb27SDimitry Andric case Intrinsic::amdgcn_struct_ptr_buffer_store_format: 67265ffd83dbSDimitry Andric return legalizeBufferStore(MI, MRI, B, false, true); 67275ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_tbuffer_store: 672806c3fb27SDimitry Andric case Intrinsic::amdgcn_raw_ptr_tbuffer_store: 67295ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_tbuffer_store: 673006c3fb27SDimitry Andric case Intrinsic::amdgcn_struct_ptr_tbuffer_store: 67315ffd83dbSDimitry Andric return legalizeBufferStore(MI, MRI, B, true, true); 67325ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_load: 673306c3fb27SDimitry Andric case Intrinsic::amdgcn_raw_ptr_buffer_load: 67345ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_load: 673506c3fb27SDimitry Andric case Intrinsic::amdgcn_struct_ptr_buffer_load: 67365ffd83dbSDimitry Andric return 
legalizeBufferLoad(MI, MRI, B, false, false); 67375ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_load_format: 673806c3fb27SDimitry Andric case Intrinsic::amdgcn_raw_ptr_buffer_load_format: 67395ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_load_format: 674006c3fb27SDimitry Andric case Intrinsic::amdgcn_struct_ptr_buffer_load_format: 67415ffd83dbSDimitry Andric return legalizeBufferLoad(MI, MRI, B, true, false); 67425ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_tbuffer_load: 674306c3fb27SDimitry Andric case Intrinsic::amdgcn_raw_ptr_tbuffer_load: 67445ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_tbuffer_load: 674506c3fb27SDimitry Andric case Intrinsic::amdgcn_struct_ptr_tbuffer_load: 67465ffd83dbSDimitry Andric return legalizeBufferLoad(MI, MRI, B, true, true); 67475ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_swap: 674806c3fb27SDimitry Andric case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap: 67495ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_swap: 675006c3fb27SDimitry Andric case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap: 67515ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_add: 675206c3fb27SDimitry Andric case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add: 67535ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_add: 675406c3fb27SDimitry Andric case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add: 67555ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_sub: 675606c3fb27SDimitry Andric case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub: 67575ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_sub: 675806c3fb27SDimitry Andric case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub: 67595ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_smin: 676006c3fb27SDimitry Andric case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin: 67615ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_smin: 676206c3fb27SDimitry 
Andric case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin: 67635ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_umin: 676406c3fb27SDimitry Andric case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin: 67655ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_umin: 676606c3fb27SDimitry Andric case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin: 67675ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_smax: 676806c3fb27SDimitry Andric case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax: 67695ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_smax: 677006c3fb27SDimitry Andric case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax: 67715ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_umax: 677206c3fb27SDimitry Andric case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax: 67735ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_umax: 677406c3fb27SDimitry Andric case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax: 67755ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_and: 677606c3fb27SDimitry Andric case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and: 67775ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_and: 677806c3fb27SDimitry Andric case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and: 67795ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_or: 678006c3fb27SDimitry Andric case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or: 67815ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_or: 678206c3fb27SDimitry Andric case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or: 67835ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_xor: 678406c3fb27SDimitry Andric case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor: 67855ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_xor: 678606c3fb27SDimitry Andric case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor: 67875ffd83dbSDimitry Andric case 
Intrinsic::amdgcn_raw_buffer_atomic_inc: 678806c3fb27SDimitry Andric case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc: 67895ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_inc: 679006c3fb27SDimitry Andric case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc: 67915ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_dec: 679206c3fb27SDimitry Andric case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec: 67935ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_dec: 679406c3fb27SDimitry Andric case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec: 67955ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 679606c3fb27SDimitry Andric case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: 67975ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 679806c3fb27SDimitry Andric case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: 6799fe6060f1SDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 680006c3fb27SDimitry Andric case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin: 6801fe6060f1SDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 680206c3fb27SDimitry Andric case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin: 6803fe6060f1SDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 680406c3fb27SDimitry Andric case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax: 6805fe6060f1SDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_fmax: 680606c3fb27SDimitry Andric case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax: 680704eeddc0SDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 680806c3fb27SDimitry Andric case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd: 6809bdd1243dSDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 681006c3fb27SDimitry Andric case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd: 681104eeddc0SDimitry Andric return legalizeBufferAtomic(MI, B, IntrID); 68125ffd83dbSDimitry Andric case Intrinsic::trap: 68135ffd83dbSDimitry 
Andric return legalizeTrapIntrinsic(MI, MRI, B); 68145ffd83dbSDimitry Andric case Intrinsic::debugtrap: 68155ffd83dbSDimitry Andric return legalizeDebugTrapIntrinsic(MI, MRI, B); 6816e8d8bef9SDimitry Andric case Intrinsic::amdgcn_rsq_clamp: 6817e8d8bef9SDimitry Andric return legalizeRsqClampIntrinsic(MI, MRI, B); 6818e8d8bef9SDimitry Andric case Intrinsic::amdgcn_ds_fadd: 6819e8d8bef9SDimitry Andric case Intrinsic::amdgcn_ds_fmin: 6820e8d8bef9SDimitry Andric case Intrinsic::amdgcn_ds_fmax: 6821e8d8bef9SDimitry Andric return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID); 6822e8d8bef9SDimitry Andric case Intrinsic::amdgcn_image_bvh_intersect_ray: 6823e8d8bef9SDimitry Andric return legalizeBVHIntrinsic(MI, B); 682406c3fb27SDimitry Andric case Intrinsic::amdgcn_fmed3: { 682506c3fb27SDimitry Andric GISelChangeObserver &Observer = Helper.Observer; 682606c3fb27SDimitry Andric 682706c3fb27SDimitry Andric // FIXME: This is to workaround the inability of tablegen match combiners to 682806c3fb27SDimitry Andric // match intrinsics in patterns. 682906c3fb27SDimitry Andric Observer.changingInstr(MI); 683006c3fb27SDimitry Andric MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3)); 683106c3fb27SDimitry Andric MI.removeOperand(1); 683206c3fb27SDimitry Andric Observer.changedInstr(MI); 683306c3fb27SDimitry Andric return true; 683406c3fb27SDimitry Andric } 68355ffd83dbSDimitry Andric default: { 68365ffd83dbSDimitry Andric if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 68375ffd83dbSDimitry Andric AMDGPU::getImageDimIntrinsicInfo(IntrID)) 68385ffd83dbSDimitry Andric return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr); 68390b57cec5SDimitry Andric return true; 68400b57cec5SDimitry Andric } 68415ffd83dbSDimitry Andric } 68420b57cec5SDimitry Andric 68430b57cec5SDimitry Andric return true; 68440b57cec5SDimitry Andric } 6845