//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;
// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
    "amdgpu-global-isel-new-legality",
    cl::desc("Use GlobalISel desired legality, rather than try to use "
             "rules compatible with selection patterns"),
    cl::init(false),
    cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements.
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
}

// Round the number of bits to the next power of two bits.
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

/// \returns true if this is an odd sized vector which should widen by adding
/// an additional element. This is mostly to handle <3 x s16> -> <4 x s16>.
/// This excludes s1 vectors, which should always be scalarized.
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (!Ty.isVector())
      return false;

    const LLT EltTy = Ty.getElementType();
    const unsigned EltSize = EltTy.getSizeInBits();
    return Ty.getNumElements() % 2 != 0 &&
           EltSize > 1 && EltSize < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}
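// Mutation that appends one element to a vector type, e.g. <3 x s16> ->
// <4 x s16>. Paired with isSmallOddVector above to round odd element counts
// up to an even count.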
static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::pair(TypeIdx,
                     LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
  };
}
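// Mutation that narrows a vector so each resulting piece fits in 64 bits.
// Worked example: <5 x s32> is 160 bits, needing Pieces = (160 + 63) / 64 = 3,
// so NewNumElts = (5 + 1) / 3 = 2 and the new type is <2 x s32>.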
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::pair(TypeIdx, LLT::scalarOrVector(
                                  ElementCount::getFixed(NewNumElts), EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next legal RegClass.
static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const unsigned NumElts = Ty.getNumElements();
    const unsigned EltSize = Ty.getElementType().getSizeInBits();
    const unsigned MaxNumElts = MaxRegisterSize / EltSize;

    assert(EltSize == 32 || EltSize == 64);
    assert(Ty.getSizeInBits() < MaxRegisterSize);

    unsigned NewNumElts;
    // Find the nearest legal RegClass that is larger than the current type.
    for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
      if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize))
        break;
    }

    return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltSize));
  };
}

static LLT getBufferRsrcScalarType(const LLT Ty) {
  if (!Ty.isVector())
    return LLT::scalar(128);
  const ElementCount NumElems = Ty.getElementCount();
  return LLT::vector(NumElems, LLT::scalar(128));
}

static LLT getBufferRsrcRegisterType(const LLT Ty) {
  if (!Ty.isVector())
    return LLT::fixed_vector(4, LLT::scalar(32));
  const unsigned NumElems = Ty.getElementCount().getFixedValue();
  return LLT::fixed_vector(NumElems * 4, LLT::scalar(32));
}
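// Pick the register-sized type a small value is bitcast to: a scalar for 32
// bits or less (e.g. <2 x s8> -> s16, <4 x s8> -> s32), otherwise a vector of
// 32-bit elements (e.g. <6 x s16>, 96 bits, -> <3 x s32>).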
static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  if (Size <= 32) {
    // <2 x s8> -> s16
    // <4 x s8> -> s32
    return LLT::scalar(Size);
  }

  return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return std::pair(TypeIdx, getBitcastRegisterType(Ty));
  };
}

static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    assert(Size % 32 == 0);
    return std::pair(
        TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}
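// For example, s96, <2 x s64>, and <4 x s16> all pass isRegisterType, while
// s48 and <3 x s16> (48 bits, not a multiple of 32) do not.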
// RegisterType that doesn't have a corresponding RegClass.
static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    LLT Ty = Query.Types[TypeIdx];
    return isRegisterType(Ty) &&
           !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits());
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

// If we have a truncating store or an extending load with a data size larger
// than 32-bits, we need to reduce to a 32-bit type.
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
  };
}
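// e.g. a G_SEXTLOAD of s16 memory into an s64 register matches the predicate
// above, so it is first reduced to a 32-bit operation and then extended.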
// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad, bool IsAtomic) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return ST.enableFlatScratch() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
  case AMDGPUAS::BUFFER_RESOURCE:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // FIXME: Flat addresses may contextually need to be split to 32-bit parts
    // if they may alias scratch depending on the subtarget. This needs to be
    // moved to custom handling to use addressMayBeAccessedAsPrivate
    return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
  }
}
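// Per the table above, global and constant loads may be up to 512 bits wide
// while stores are capped at 128 bits, and LDS accesses allow 128 bits only
// when DS128 is enabled.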
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
  uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // Do not handle extending vector loads.
  if (Ty.isVector() && MemSize != RegSize)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
                                    Query.MMODescrs[0].Ordering !=
                                        AtomicOrdering::NotAtomic))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}
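// As a consequence of the RegSize check above, an extending load from s8 or
// s16 memory into s32 is accepted, but an s8 -> s64 extload is rejected.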
// The newer buffer intrinsic forms take their resource arguments as
// pointers in address space 8, aka s128 values. However, in order to not break
// SelectionDAG, the underlying operations have to continue to take v4i32
// arguments. Therefore, we convert resource pointers - or vectors of them -
// to integer values here.
static bool hasBufferRsrcWorkaround(const LLT Ty) {
  if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
    return true;
  if (Ty.isVector()) {
    const LLT ElemTy = Ty.getElementType();
    return hasBufferRsrcWorkaround(ElemTy);
  }
  return false;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this for
// now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  // Address space 8 pointers get their own workaround.
  if (hasBufferRsrcWorkaround(Ty))
    return false;
  if (!Ty.isVector())
    return true;

  LLT EltTy = Ty.getElementType();
  if (EltTy.isPointer())
    return true;

  unsigned EltSize = EltTy.getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST,
                             const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
         !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
}

/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const LLT MemTy) {
  const unsigned MemSizeInBits = MemTy.getSizeInBits();
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;

  // Don't try to handle bitcasting vector ext loads for now.
  return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
         (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}
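// e.g. a 96-bit <6 x s16> load is bitcast to <3 x s32> under the workaround,
// while <4 x s32> is already in register form and is left alone.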
/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this is the case where the memory access
/// itself changes, not the size of the result register.
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
                            uint64_t AlignInBits, unsigned AddrSpace,
                            unsigned Opcode) {
  unsigned SizeInBits = MemoryTy.getSizeInBits();
  // We don't want to widen cases that are naturally legal.
  if (isPowerOf2_32(SizeInBits))
    return false;

  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
  // end up widening these for a scalar load during RegBankSelect, if we don't
  // have 96-bit scalar loads.
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
    return false;

  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
    return false;

  // A load is known dereferenceable up to the alignment, so it's legal to
  // widen to it.
  //
  // TODO: Could check dereferenceable for less aligned cases.
  unsigned RoundedSize = NextPowerOf2(SizeInBits);
  if (AlignInBits < RoundedSize)
    return false;

  // Do not widen if it would introduce a slow unaligned load.
  const SITargetLowering *TLI = ST.getTargetLowering();
  unsigned Fast = 0;
  return TLI->allowsMisalignedMemoryAccessesImpl(
             RoundedSize, AddrSpace, Align(AlignInBits / 8),
             MachineMemOperand::MOLoad, &Fast) &&
         Fast;
}

static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
                            unsigned Opcode) {
  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
    return false;

  return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
                         Query.MMODescrs[0].AlignInBits,
                         Query.Types[1].getAddressSpace(), Opcode);
}
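// e.g. a 24-bit (s24) load that is 32-bit aligned rounds up to a 32-bit load,
// since NextPowerOf2(24) == 32 and the alignment covers the widened access.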
/// Mutates IR (typically a load instruction) to use a <4 x s32> as the initial
/// type of the operand `idx` and then to transform it to a `p8` via bitcasts
/// and inttoptr. In addition, handle vectors of p8. Returns the new type.
static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
                                   MachineRegisterInfo &MRI, unsigned Idx) {
  MachineOperand &MO = MI.getOperand(Idx);

  const LLT PointerTy = MRI.getType(MO.getReg());

  // Paranoidly prevent us from doing this multiple times.
  if (!hasBufferRsrcWorkaround(PointerTy))
    return PointerTy;

  const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
  const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
  if (!PointerTy.isVector()) {
    // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
    const unsigned NumParts = PointerTy.getSizeInBits() / 32;
    const LLT S32 = LLT::scalar(32);

    Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
    std::array<Register, 4> VectorElems;
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    for (unsigned I = 0; I < NumParts; ++I)
      VectorElems[I] =
          B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
    B.buildMergeValues(MO, VectorElems);
    MO.setReg(VectorReg);
    return VectorTy;
  }
  Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
  auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
  B.buildIntToPtr(MO, Scalar);
  MO.setReg(BitcastReg);

  return VectorTy;
}
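// The helper above rewrites a *definition* of a buffer resource (the
// instruction now defines a <4 x s32> that is reassembled into p8 afterwards);
// the helper below performs the inverse rewrite on a *use*, breaking a p8
// down into 32-bit parts.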
/// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
/// the form in which the value must be in order to be passed to the low-level
/// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
/// needed in order to account for the fact that we can't define a register
/// class for s128 without breaking SelectionDAG.
static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) {
  MachineRegisterInfo &MRI = *B.getMRI();
  const LLT PointerTy = MRI.getType(Pointer);
  const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
  const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);

  if (!PointerTy.isVector()) {
    // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
    SmallVector<Register, 4> PointerParts;
    const unsigned NumParts = PointerTy.getSizeInBits() / 32;
    auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
    for (unsigned I = 0; I < NumParts; ++I)
      PointerParts.push_back(Unmerged.getReg(I));
    return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
  }
  Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
  return B.buildBitcast(VectorTy, Scalar).getReg(0);
}

static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B,
                                     unsigned Idx) {
  MachineOperand &MO = MI.getOperand(Idx);

  const LLT PointerTy = B.getMRI()->getType(MO.getReg());
  // Paranoidly prevent us from doing this multiple times.
  if (!hasBufferRsrcWorkaround(PointerTy))
    return;
  MO.setReg(castBufferRsrcToV4I32(MO.getReg(), B));
}
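// Everything below builds the legalization rule table. The predicates and
// mutations defined above are combined per opcode; cases only legal on one
// register bank are reported legal here and re-legalized in RegBankSelect.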
AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
    : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);

  const LLT V2S8 = LLT::fixed_vector(2, 8);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  const LLT V4S16 = LLT::fixed_vector(4, 16);

  const LLT V2S32 = LLT::fixed_vector(2, 32);
  const LLT V3S32 = LLT::fixed_vector(3, 32);
  const LLT V4S32 = LLT::fixed_vector(4, 32);
  const LLT V5S32 = LLT::fixed_vector(5, 32);
  const LLT V6S32 = LLT::fixed_vector(6, 32);
  const LLT V7S32 = LLT::fixed_vector(7, 32);
  const LLT V8S32 = LLT::fixed_vector(8, 32);
  const LLT V9S32 = LLT::fixed_vector(9, 32);
  const LLT V10S32 = LLT::fixed_vector(10, 32);
  const LLT V11S32 = LLT::fixed_vector(11, 32);
  const LLT V12S32 = LLT::fixed_vector(12, 32);
  const LLT V13S32 = LLT::fixed_vector(13, 32);
  const LLT V14S32 = LLT::fixed_vector(14, 32);
  const LLT V15S32 = LLT::fixed_vector(15, 32);
  const LLT V16S32 = LLT::fixed_vector(16, 32);
  const LLT V32S32 = LLT::fixed_vector(32, 32);

  const LLT V2S64 = LLT::fixed_vector(2, 64);
  const LLT V3S64 = LLT::fixed_vector(3, 64);
  const LLT V4S64 = LLT::fixed_vector(4, 64);
  const LLT V5S64 = LLT::fixed_vector(5, 64);
  const LLT V6S64 = LLT::fixed_vector(6, 64);
  const LLT V7S64 = LLT::fixed_vector(7, 64);
  const LLT V8S64 = LLT::fixed_vector(8, 64);
  const LLT V16S64 = LLT::fixed_vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
      {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
       V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
      {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
  const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
  const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
  const LLT BufferStridedPtr =
      GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  // s1 for VCC branches, s32 for SCC branches.
  getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .legalFor(AddrSpaces128)
    .legalIf(isPointer(0))
    .clampScalar(0, S16, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .scalarize(0);
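  // Integer add/sub/mul rules are split by subtarget capability below: the
  // gfx9 path gains packed v2s16 forms, and later targets add scalar 64-bit
  // add/sub and mul variants.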
  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
    // Full set of gfx9 features.
    if (ST.hasScalarAddSub64()) {
      getActionDefinitionsBuilder({G_ADD, G_SUB})
        .legalFor({S64, S32, S16, V2S16})
        .clampMaxNumElementsStrict(0, S16, 2)
        .scalarize(0)
        .minScalar(0, S16)
        .widenScalarToNextMultipleOf(0, 32)
        .maxScalar(0, S32);
    } else {
      getActionDefinitionsBuilder({G_ADD, G_SUB})
        .legalFor({S32, S16, V2S16})
        .clampMaxNumElementsStrict(0, S16, 2)
        .scalarize(0)
        .minScalar(0, S16)
        .widenScalarToNextMultipleOf(0, 32)
        .maxScalar(0, S32);
    }

    if (ST.hasScalarSMulU64()) {
      getActionDefinitionsBuilder(G_MUL)
        .legalFor({S64, S32, S16, V2S16})
        .clampMaxNumElementsStrict(0, S16, 2)
        .scalarize(0)
        .minScalar(0, S16)
        .widenScalarToNextMultipleOf(0, 32)
        .custom();
    } else {
      getActionDefinitionsBuilder(G_MUL)
        .legalFor({S32, S16, V2S16})
        .clampMaxNumElementsStrict(0, S16, 2)
        .scalarize(0)
        .minScalar(0, S16)
        .widenScalarToNextMultipleOf(0, 32)
        .custom();
    }
    assert(ST.hasMad64_32());

    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
      .legalFor({S32, S16, V2S16}) // Clamp modifier
      .minScalarOrElt(0, S16)
      .clampMaxNumElementsStrict(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32)
      .lower();
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB})
      .legalFor({S32, S16})
      .minScalar(0, S16)
      .widenScalarToNextMultipleOf(0, 32)
      .maxScalar(0, S32)
      .scalarize(0);

    getActionDefinitionsBuilder(G_MUL)
      .legalFor({S32, S16})
      .scalarize(0)
      .minScalar(0, S16)
      .widenScalarToNextMultipleOf(0, 32)
      .custom();
    assert(ST.hasMad64_32());
    // Technically the saturating operations require clamp bit support, but
    // this was introduced at the same time as 16-bit operations.
    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
      .legalFor({S32, S16}) // Clamp modifier
      .minScalar(0, S16)
      .scalarize(0)
      .widenScalarToNextPow2(0, 16)
      .lower();

    // We're just lowering this, but it helps get a better result to try to
    // coerce to the desired type first.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB})
      .legalFor({S32})
      .widenScalarToNextMultipleOf(0, 32)
      .clampScalar(0, S32, S32)
      .scalarize(0);

    auto &Mul = getActionDefinitionsBuilder(G_MUL)
      .legalFor({S32})
      .scalarize(0)
      .minScalar(0, S32)
      .widenScalarToNextMultipleOf(0, 32);

    if (ST.hasMad64_32())
      Mul.custom();
    else
      Mul.maxScalar(0, S32);

    if (ST.hasIntClamp()) {
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .legalFor({S32}) // Clamp modifier.
        .scalarize(0)
        .minScalarOrElt(0, S32)
        .lower();
    } else {
      // Clamp bit support was added in VI, along with 16-bit operations.
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .minScalar(0, S32)
        .scalarize(0)
        .lower();
    }
    // FIXME: DAG expansion gets better results. The widening uses the smaller
    // range values and goes for the min/max lowering directly.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(
      {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
                   .legalFor({S32})
                   .maxScalar(0, S32);

  if (ST.hasVOP3PInsts()) {
    Mulh
      .clampMaxNumElements(0, S8, 2)
      .lowerFor({V2S8});
  }

  Mulh
    .scalarize(0)
    .lower();

  // Report legal for any types we can handle anywhere. For the cases only
  // legal on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder(
      {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();

  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalIf(isPointer(0))
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);
  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
      .legalIf(isRegisterType(0))
      // s1 and s16 are special cases because they have legal operations on
      // them, but don't really occupy registers in the normal way.
      .legalFor({S1, S16})
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampScalarOrElt(0, S32, MaxScalar)
      .widenScalarToNextPow2(0, 32)
      .clampMaxNumElements(0, S32, 16);

  getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});

  // If the amount is divergent, we have to do a wave reduction to get the
  // maximum value, so this is expanded during RegBankSelect.
  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
    .legalFor({{PrivatePtr, S32}});

  getActionDefinitionsBuilder(G_STACKSAVE)
    .customFor({PrivatePtr});
  getActionDefinitionsBuilder(G_STACKRESTORE)
    .legalFor({PrivatePtr});

  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customIf(typeIsNot(0, PrivatePtr));

  getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});

  auto &FPOpActions = getActionDefinitionsBuilder(
    { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
      G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  if (ST.hasPackedFP32Ops()) {
    FPOpActions.legalFor({V2S32});
    FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
  }
  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElementsStrict(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElementsStrict(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S16})
      .customFor({S32, S64})
      .scalarize(0)
      .unsupported();
    getActionDefinitionsBuilder(G_FFLOOR)
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);

    getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
      .legalFor({{S32, S32}, {S64, S32}, {S16, S16}})
      .scalarize(0)
      .maxScalarIf(typeIs(0, S16), 1, S16)
      .clampScalar(1, S32, S32)
      .lower();

    getActionDefinitionsBuilder(G_FFREXP)
      .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .customFor({S32, S64, S16})
      .scalarize(0)
      .unsupported();

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }

    getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
      .legalFor({{S32, S32}, {S64, S32}})
      .scalarize(0)
      .clampScalar(0, S32, S64)
      .clampScalar(1, S32, S32)
      .lower();
    getActionDefinitionsBuilder(G_FFREXP)
      .customFor({{S32, S32}, {S64, S32}})
      .scalarize(0)
      .minScalar(0, S32)
      .clampScalar(1, S32, S32)
      .lower();
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
    .scalarize(0);

  auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
  if (ST.has16BitInsts()) {
    FSubActions
      // Use actual fsub instruction
      .legalFor({S32, S16})
      // Must use fadd + fneg
      .lowerFor({S64, V2S16});
  } else {
    FSubActions
      // Use actual fsub instruction
      .legalFor({S32})
      // Must use fadd + fneg
      .lowerFor({S64, S16, V2S16});
  }

  FSubActions
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});
  FMad.scalarize(0)
      .lower();

  auto &FRem = getActionDefinitionsBuilder(G_FREM);
  if (ST.has16BitInsts()) {
    FRem.customFor({S16, S32, S64});
  } else {
    FRem.minScalar(0, S32)
        .customFor({S32, S64});
  }
  FRem.scalarize(0);

  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to infinite loop
    // in the legalizer.
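    // Note that the elementTypeIsLegal(0) guard below only scalarizes vectors
    // whose element type is already legal; everything else falls through to
    // the alwaysLegal() catch-all instead of looping.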
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S32, S64}, {S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .minScalar(0, S32)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S32}, {S64, S64}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .widenScalarToNextPow2(0, 32)
       .scalarize(0)
       .lower();

  getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
    .customFor({S16, S32})
    .scalarize(0)
    .lower();

  // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(
        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder(
        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder(
        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
      .legalFor({S32})
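      // Pre-CI parts have no f64 trunc/ceil/round-even instructions, so the
      // s64 forms take the custom expansions (legalizeIntrinsicTrunc,
      // legalizeFceil, legalizeFroundeven) dispatched below.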
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_PTR_ADD)
    .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr})
    .legalIf(all(isPointer(0), sameSize(0, 1)))
    .scalarize(0)
    .scalarSameSizeAs(1, 0);

  getActionDefinitionsBuilder(G_PTRMASK)
    .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
    .scalarSameSizeAs(1, 0)
    .scalarize(0);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the
    // output, so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  auto &FCmpBuilder =
      getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct(
          {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);

  if (ST.hasSALUFloatInsts())
    FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});

  FCmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
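  // (legalizeFPow expands it along the lines of pow(x, y) = exp2(y * log2(x)),
  // which is why the types below are marked custom rather than lowered.)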
  auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  getActionDefinitionsBuilder(G_FPOWI)
    .clampScalar(0, MinScalarFPTy, S32)
    .lower();

  auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2});
  Log2Ops.customFor({S32});
  if (ST.has16BitInsts())
    Log2Ops.legalFor({S16});
  else
    Log2Ops.customFor({S16});
  Log2Ops.scalarize(0)
    .lower();

  auto &LogOps =
      getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
  LogOps.customFor({S32, S16});
  LogOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .widenScalarToNextPow2(1, 32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32);

  // If no 16-bit instruction is available, lower into different instructions.
  if (ST.has16BitInsts())
    getActionDefinitionsBuilder(G_IS_FPCLASS)
      .legalForCartesianProduct({S1}, FPTypes16)
      .widenScalarToNextPow2(1)
      .scalarize(0)
      .lower();
  else
    getActionDefinitionsBuilder(G_IS_FPCLASS)
      .legalForCartesianProduct({S1}, FPTypesBase)
      .lowerFor({S1, S16})
      .widenScalarToNextPow2(1)
      .scalarize(0)
      .lower();

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .custom();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
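  // For example, a G_CTLZ_ZERO_UNDEF on an s64 source yields its count
  // directly as an s32; clampScalar(0, S32, S32) keeps the result type there.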
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32, S64})
    .clampScalar(0, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElementsStrict(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16, V2S16})
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
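      // Roughly: sub-32-bit scalars hit the lowerIf above, while anything
      // wider is rounded up to a power of two and then narrowed back into s32
      // pieces that each match the legal S32 case.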
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::pair(
            1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) {
      return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
    });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::pair(
            0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
      return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
    });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
                                      Query.MMODescrs[0].Ordering !=
                                          AtomicOrdering::NotAtomic))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    return false;
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Explicitly list some common cases.
    // TODO: Does this help compile time at all?
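    // Each entry below is {result/value type, pointer type, memory type,
    // minimum alignment in bits}; the zero "alignment" produced above when
    // unaligned buffer access is enabled means no extra restriction.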
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
                                      {V2S32, GlobalPtr, V2S32, GlobalAlign32},
                                      {V4S32, GlobalPtr, V4S32, GlobalAlign32},
                                      {S64, GlobalPtr, S64, GlobalAlign32},
                                      {V2S64, GlobalPtr, V2S64, GlobalAlign32},
                                      {V2S16, GlobalPtr, V2S16, GlobalAlign32},
                                      {S32, GlobalPtr, S8, GlobalAlign8},
                                      {S32, GlobalPtr, S16, GlobalAlign16},

                                      {S32, LocalPtr, S32, 32},
                                      {S64, LocalPtr, S64, 32},
                                      {V2S32, LocalPtr, V2S32, 32},
                                      {S32, LocalPtr, S8, 8},
                                      {S32, LocalPtr, S16, 16},
                                      {V2S16, LocalPtr, S32, 32},

                                      {S32, PrivatePtr, S32, 32},
                                      {S32, PrivatePtr, S8, 8},
                                      {S32, PrivatePtr, S16, 16},
                                      {V2S16, PrivatePtr, S32, 32},

                                      {S32, ConstantPtr, S32, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32},
                                      {V4S32, ConstantPtr, V4S32, GlobalAlign32},
                                      {S64, ConstantPtr, S64, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
    Actions.legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query);
      });

    // The custom pointers (fat pointers, buffer resources) don't work with
    // load and store at this level. Fat pointers should have been lowered to
    // intrinsics before the translation to MIR.
    Actions.unsupportedIf(
        typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));

    // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
    // ptrtoint. This is needed to account for the fact that we can't have i128
    // as a register class for SelectionDAG reasons.
    Actions.customIf([=](const LegalityQuery &Query) -> bool {
      return hasBufferRsrcWorkaround(Query.Types[0]);
    });

    // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64-bits.
    //
    // TODO: Should generalize bitcast action into coerce, which will also cover
    // inserting addrspacecasts.
    Actions.customIf(typeIs(1, Constant32Ptr));

    // Turn any illegal element vectors into something easier to deal
    // with. These will ultimately produce 32-bit scalar shifts to extract the
    // parts anyway.
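    //
    // For example, a naturally aligned <4 x s8> load is bitcast to a single
    // s32 load here instead of being split into four extending byte loads.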
    //
    // For odd 16-bit element vectors, prefer to split those into pieces with
    // 16-bit vector parts.
    Actions.bitcastIf(
      [=](const LegalityQuery &Query) -> bool {
        return shouldBitcastLoadStoreType(ST, Query.Types[0],
                                          Query.MMODescrs[0].MemoryTy);
      }, bitcastToRegisterType(0));

    if (!IsStore) {
      // Widen suitably aligned loads by loading extra bytes. The standard
      // legalization actions can't properly express widening memory operands.
      Actions.customIf([=](const LegalityQuery &Query) -> bool {
        return shouldWidenLoad(ST, Query, G_LOAD);
      });
    }

    // FIXME: load/store narrowing should be moved to lower action
    Actions
        .narrowScalarIf(
            [=](const LegalityQuery &Query) -> bool {
              return !Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              const unsigned DstSize = DstTy.getSizeInBits();
              unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

              // Split extloads.
              if (DstSize > MemSize)
                return std::pair(0, LLT::scalar(MemSize));

              unsigned MaxSize = maxSizeForAddrSpace(
                  ST, PtrTy.getAddressSpace(), Op == G_LOAD,
                  Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
              if (MemSize > MaxSize)
                return std::pair(0, LLT::scalar(MaxSize));

              uint64_t Align = Query.MMODescrs[0].AlignInBits;
              return std::pair(0, LLT::scalar(Align));
            })
        .fewerElementsIf(
            [=](const LegalityQuery &Query) -> bool {
              return Query.Types[0].isVector() &&
                     needToSplitMemOp(Query, Op == G_LOAD);
            },
            [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
              const LLT DstTy = Query.Types[0];
              const LLT PtrTy = Query.Types[1];

              LLT EltTy = DstTy.getElementType();
              unsigned MaxSize = maxSizeForAddrSpace(
                  ST, PtrTy.getAddressSpace(), Op == G_LOAD,
                  Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);

              // FIXME: Handle widened to power of 2 results better. This ends
              // up scalarizing.
              // FIXME: 3 element stores scalarized on SI

              // Split if it's too large for the address space.
              unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
              if (MemSize > MaxSize) {
                unsigned NumElts = DstTy.getNumElements();
                unsigned EltSize = EltTy.getSizeInBits();

                if (MaxSize % EltSize == 0) {
                  return std::pair(
                      0, LLT::scalarOrVector(
                             ElementCount::getFixed(MaxSize / EltSize), EltTy));
                }

                unsigned NumPieces = MemSize / MaxSize;

                // FIXME: Refine when odd breakdowns handled
                // The scalars will need to be re-legalized.
                if (NumPieces == 1 || NumPieces >= NumElts ||
                    NumElts % NumPieces != 0)
                  return std::pair(0, EltTy);

                return std::pair(0,
                                 LLT::fixed_vector(NumElts / NumPieces, EltTy));
              }

              // FIXME: We could probably handle weird extending loads better.
              if (DstTy.getSizeInBits() > MemSize)
                return std::pair(0, EltTy);

              unsigned EltSize = EltTy.getSizeInBits();
              unsigned DstSize = DstTy.getSizeInBits();
              if (!isPowerOf2_32(DstSize)) {
                // We're probably decomposing an odd sized store. Try to split
                // to the widest type. TODO: Account for alignment. As-is it
                // should be OK, since the new parts will be further legalized.
                unsigned FloorSize = llvm::bit_floor(DstSize);
                return std::pair(
                    0, LLT::scalarOrVector(
                           ElementCount::getFixed(FloorSize / EltSize), EltTy));
              }

              // May need relegalization for the scalars.
              return std::pair(0, EltTy);
            })
        .minScalar(0, S32)
        .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32))
        .widenScalarToNextPow2(0)
        .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
        .lower();
  }

  // FIXME: Unaligned accesses not lowered.
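  // Each extending-load entry below reads MemTy bits and extends in the
  // register: e.g. {S32, GlobalPtr, S8, 8} is a byte load whose result is
  // sign- or zero-extended to s32 depending on the opcode.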
  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
                                                  {S32, GlobalPtr, S16, 2 * 8},
                                                  {S32, LocalPtr, S8, 8},
                                                  {S32, LocalPtr, S16, 16},
                                                  {S32, PrivatePtr, S8, 8},
                                                  {S32, PrivatePtr, S16, 16},
                                                  {S32, ConstantPtr, S8, 8},
                                                  {S32, ConstantPtr, S16, 2 * 8}})
                       .legalIf(
                         [=](const LegalityQuery &Query) -> bool {
                           return isLoadStoreLegal(ST, Query);
                         });

  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
  }

  // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
  // 64-bits.
  //
  // TODO: Should generalize bitcast action into coerce, which will also cover
  // inserting addrspacecasts.
  ExtLoads.customIf(typeIs(1, Constant32Ptr));

  ExtLoads.clampScalar(0, S32, S32)
          .widenScalarToNextPow2(0)
          .lower();

  auto &Atomics = getActionDefinitionsBuilder(
    {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
     G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
     G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
     G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
    .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
               {S64, GlobalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
  }

  auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
  if (ST.hasLDSFPAtomicAdd()) {
    Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
    if (ST.hasGFX90AInsts())
      Atomic.legalFor({{S64, LocalPtr}});
    if (ST.hasAtomicDsPkAdd16Insts())
      Atomic.legalFor({{V2S16, LocalPtr}});
  }
  if (ST.hasAtomicFaddInsts())
    Atomic.legalFor({{S32, GlobalPtr}});
  if (ST.hasFlatAtomicFaddF32Inst())
    Atomic.legalFor({{S32, FlatPtr}});

  if (ST.hasGFX90AInsts()) {
    // These are legal with some caveats, and should have undergone expansion
    // in the IR in most situations.
    // TODO: Move atomic expansion into legalizer
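    // (The caveats are memory-scope related; for instance, the hardware FP
    // atomics are not usable on fine-grained host-visible allocations, which
    // the IR-level expansion is expected to have filtered out.)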
    Atomic.legalFor({
        {S32, GlobalPtr},
        {S64, GlobalPtr},
        {S64, FlatPtr}
      });
  }

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling and
  // output demarshalling.
  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
    .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                {S32, FlatPtr}, {S64, FlatPtr}})
    .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
               {S32, RegionPtr}, {S64, RegionPtr}});
  // TODO: Pointer types, any 32-bit or 64-bit vector

  // Condition should be s32 for scalar, s1 for vector.
  getActionDefinitionsBuilder(G_SELECT)
      .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
                                 LocalPtr, FlatPtr, PrivatePtr,
                                 LLT::fixed_vector(2, LocalPtr),
                                 LLT::fixed_vector(2, PrivatePtr)},
                                {S1, S32})
      .clampScalar(0, S16, S64)
      .scalarize(1)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .fewerElementsIf(numElementsNotEven(0), scalarize(0))
      .clampMaxNumElements(0, S32, 2)
      .clampMaxNumElements(0, LocalPtr, 2)
      .clampMaxNumElements(0, PrivatePtr, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0)
      .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));

  // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
  // be more flexible with the shift amount type.
  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
                     .legalFor({{S32, S32}, {S64, S32}});
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
      Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
            .clampMaxNumElements(0, S16, 2);
    } else
      Shifts.legalFor({{S16, S16}});

    // TODO: Support 16-bit shift amounts for all types
    Shifts.widenScalarIf(
      [=](const LegalityQuery &Query) {
        // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
        // 32-bit amount.
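        // e.g. an (s16, s8) shift widens its amount to s16 here; oversized
        // amounts for s16 values are instead clamped back down by the
        // maxScalarIf that follows.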
        const LLT ValTy = Query.Types[0];
        const LLT AmountTy = Query.Types[1];
        return ValTy.getSizeInBits() <= 16 &&
               AmountTy.getSizeInBits() < 16;
      }, changeTo(1, S16));
    Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 16);
    Shifts.clampScalar(0, S16, S64);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    // Make sure we legalize the shift amount type first, as the general
    // expansion for the shifted type will produce much worse code if it hasn't
    // been truncated already.
    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 32);
    Shifts.clampScalar(0, S32, S64);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }
  Shifts.scalarize(0);

  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)
      .customIf([=](const LegalityQuery &Query) {
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          const unsigned EltSize = EltTy.getSizeInBits();
          const bool isLegalVecType =
              !!SIRegisterInfo::getSGPRClassForBitWidth(VecTy.getSizeInBits());
          // Address space 8 pointers are 128-bit wide values, but the logic
          // below will try to bitcast them to 2N x s64, which will fail.
          // Therefore, as an intermediate step, wrap extracts/insertions by
          // ptrtoint-ing the vector and scalar arguments (or inttoptr-ing the
          // extraction result) in order to produce a vector operation that can
          // be handled by the logic below.
          if (EltTy.isPointer() && EltSize > 64)
            return true;
          return (EltSize == 32 || EltSize == 64) &&
                 VecTy.getSizeInBits() % 32 == 0 &&
                 VecTy.getSizeInBits() <= MaxRegisterSize &&
                 IdxTy.getSizeInBits() == 32 &&
                 isLegalVecType;
        })
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
                     scalarOrEltNarrowerThan(VecTypeIdx, 32)),
                 bitcastToVectorElement32(VecTypeIdx))
      //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
      .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
                     scalarOrEltWiderThan(VecTypeIdx, 64)),
                 [=](const LegalityQuery &Query) {
                   // For > 64-bit element types, try to turn this into a
                   // 64-bit element vector since we may be able to do better
                   // indexing if this is scalar. If not, fall back to 32.
                   const LLT EltTy = Query.Types[EltTypeIdx];
                   const LLT VecTy = Query.Types[VecTypeIdx];
                   const unsigned DstEltSize = EltTy.getSizeInBits();
                   const unsigned VecSize = VecTy.getSizeInBits();

                   const unsigned TargetEltSize =
                       DstEltSize % 64 == 0 ? 64 : 32;
                   return std::pair(
                       VecTypeIdx,
                       LLT::fixed_vector(VecSize / TargetEltSize,
                                         TargetEltSize));
                 })
      .clampScalar(EltTypeIdx, S32, S64)
      .clampScalar(VecTypeIdx, S32, S64)
      .clampScalar(IdxTypeIdx, S32, S32)
      .clampMaxNumElements(VecTypeIdx, S32, 32)
      // TODO: Clamp elements for 64-bit vectors?
      .moreElementsIf(
        isIllegalRegisterType(VecTypeIdx),
        moreElementsToNextExistingRegClass(VecTypeIdx))
      // It should only be necessary with variable indexes.
      // As a last resort, lower to the stack
      .lower();
  }

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
    .unsupportedIf([=](const LegalityQuery &Query) {
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;
      });

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    // FIXME: Doesn't handle extract of illegal sizes.
    getActionDefinitionsBuilder(Op)
      .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
      .lowerIf([=](const LegalityQuery &Query) {
          // Sub-vector (or single element) insert and extract.
          // TODO: verify immediate offset here since lower only works with
          // whole elements.
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.isVector();
        })
      // FIXME: Multiples of 16 should not be legal.
      .legalIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          return (BigTy.getSizeInBits() % 32 == 0) &&
                 (LitTy.getSizeInBits() % 16 == 0);
        })
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return (BigTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
      .widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT LitTy = Query.Types[LitTyIdx];
          return (LitTy.getScalarSizeInBits() < 16);
        },
        LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .widenScalarToNextPow2(BigTyIdx, 32);
  }

  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
    .legalForCartesianProduct(AllS32Vectors, {S32})
    .legalForCartesianProduct(AllS64Vectors, {S64})
    .clampNumElements(0, V16S32, V32S32)
    .clampNumElements(0, V2S64, V16S64)
    .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16))
    .moreElementsIf(
      isIllegalRegisterType(0),
      moreElementsToNextExistingRegClass(0));

  if (ST.hasScalarPackInsts()) {
    BuildVector
      // FIXME: Should probably widen s1 vectors straight to s32
      .minScalarOrElt(0, S16)
      .minScalar(1, S16);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .legalFor({V2S16, S32})
      .lower();
  } else {
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
      .customFor({V2S16, S32})
      .lower();
  }

  BuildVector.legalIf(isRegisterType(0));

  // FIXME: Clamp maximum size
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .clampMaxNumElements(0, S32, 32)
    .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
    .clampMaxNumElements(0, S16, 64);

  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!llvm::has_single_bit<uint32_t>(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .legalIf(all(isRegisterType(0), isRegisterType(1)))
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
          const LLT BigTy = Query.Types[BigTyIdx];
          return BigTy.getSizeInBits() == 32;
        })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
      // worth considering the multiples of 64 since 2*192 and 2*384 are not
      // valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
        scalarize(0))
      .fewerElementsIf(
        [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
        scalarize(1))
      .clampScalar(BigTyIdx, S32, MaxScalar);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
        // TODO: Use 16-bit shifts if legal for 8-bit values?
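        // As it stands, a merge of s8 pieces has its little type widened to
        // s32 here, and the merge is then redone with 32-bit shifts.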
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[LitTyIdx];
          return Ty.getSizeInBits() < 32;
        },
        changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
      [=](const LegalityQuery &Query) {
        const LLT Ty = Query.Types[BigTyIdx];
        return Ty.getSizeInBits() % 16 != 0;
      },
      [=](const LegalityQuery &Query) {
        // Pick the next power of 2, or a multiple of 64 over 128.
        // Whichever is smaller.
        const LLT &Ty = Query.Types[BigTyIdx];
        unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
        if (NewSizeInBits >= 256) {
          unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
          if (RoundedTo < NewSizeInBits)
            NewSizeInBits = RoundedTo;
        }
        return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .clampMaxNumElementsStrict(0, S16, 2);
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder({G_ROTR, G_ROTL})
    .scalarize(0)
    .lower();

  // TODO: Only try to form v2s16 with legal packed instructions.
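  // Generic funnel-shift semantics for reference: fshr(x, y, z) shifts the
  // concatenation (x:y) right by z and keeps the low half, which is why the
  // plain {S32, S32} case can be legal (it matches the alignbit pattern).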
  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .lowerFor({{V2S16, V2S16}})
    .clampMaxNumElementsStrict(0, S16, 2)
    .scalarize(0)
    .lower();

  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_FSHL)
      .lowerFor({{V2S16, V2S16}})
      .clampMaxNumElementsStrict(0, S16, 2)
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder(G_FSHL)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder(G_FENCE)
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SMULO, G_UMULO})
    .scalarize(0)
    .minScalar(0, S32)
    .lower();

  getActionDefinitionsBuilder({G_SBFX, G_UBFX})
    .legalFor({{S32, S32}, {S64, S32}})
    .clampScalar(1, S32, S32)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder(
      {// TODO: Verify V_BFI_B32 is generated from expanded bit ops
       G_FCOPYSIGN,

       G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
       G_READ_REGISTER, G_WRITE_REGISTER,

       G_SADDO, G_SSUBO})
      .lower();

  if (ST.hasIEEEMinMax()) {
    getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
        .legalFor(FPTypesPK16)
        .clampMaxNumElements(0, S16, 2)
        .scalarize(0);
  } else {
    // TODO: Implement
    getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
  }

  getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
      .lower();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
                               G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
                               G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
      .unsupported();

  getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal();

  getLegacyLegalizerInfo().computeTables();
  verify(*ST.getInstrInfo());
20070b57cec5SDimitry Andric } 20080b57cec5SDimitry Andric 20091db9f3b2SDimitry Andric bool AMDGPULegalizerInfo::legalizeCustom( 20101db9f3b2SDimitry Andric LegalizerHelper &Helper, MachineInstr &MI, 20111db9f3b2SDimitry Andric LostDebugLocObserver &LocObserver) const { 20125ffd83dbSDimitry Andric MachineIRBuilder &B = Helper.MIRBuilder; 20135ffd83dbSDimitry Andric MachineRegisterInfo &MRI = *B.getMRI(); 20145ffd83dbSDimitry Andric 20150b57cec5SDimitry Andric switch (MI.getOpcode()) { 20160b57cec5SDimitry Andric case TargetOpcode::G_ADDRSPACE_CAST: 20178bcb0991SDimitry Andric return legalizeAddrSpaceCast(MI, MRI, B); 20185f757f3fSDimitry Andric case TargetOpcode::G_INTRINSIC_ROUNDEVEN: 20195f757f3fSDimitry Andric return legalizeFroundeven(MI, MRI, B); 20200b57cec5SDimitry Andric case TargetOpcode::G_FCEIL: 20218bcb0991SDimitry Andric return legalizeFceil(MI, MRI, B); 2022e8d8bef9SDimitry Andric case TargetOpcode::G_FREM: 2023e8d8bef9SDimitry Andric return legalizeFrem(MI, MRI, B); 20240b57cec5SDimitry Andric case TargetOpcode::G_INTRINSIC_TRUNC: 20258bcb0991SDimitry Andric return legalizeIntrinsicTrunc(MI, MRI, B); 20260b57cec5SDimitry Andric case TargetOpcode::G_SITOFP: 20278bcb0991SDimitry Andric return legalizeITOFP(MI, MRI, B, true); 20280b57cec5SDimitry Andric case TargetOpcode::G_UITOFP: 20298bcb0991SDimitry Andric return legalizeITOFP(MI, MRI, B, false); 20305ffd83dbSDimitry Andric case TargetOpcode::G_FPTOSI: 20315ffd83dbSDimitry Andric return legalizeFPTOI(MI, MRI, B, true); 20325ffd83dbSDimitry Andric case TargetOpcode::G_FPTOUI: 20335ffd83dbSDimitry Andric return legalizeFPTOI(MI, MRI, B, false); 20340b57cec5SDimitry Andric case TargetOpcode::G_FMINNUM: 20350b57cec5SDimitry Andric case TargetOpcode::G_FMAXNUM: 20360b57cec5SDimitry Andric case TargetOpcode::G_FMINNUM_IEEE: 20370b57cec5SDimitry Andric case TargetOpcode::G_FMAXNUM_IEEE: 20385ffd83dbSDimitry Andric return legalizeMinNumMaxNum(Helper, MI); 20390b57cec5SDimitry Andric case TargetOpcode::G_EXTRACT_VECTOR_ELT: 20408bcb0991SDimitry Andric return legalizeExtractVectorElt(MI, MRI, B); 20410b57cec5SDimitry Andric case TargetOpcode::G_INSERT_VECTOR_ELT: 20428bcb0991SDimitry Andric return legalizeInsertVectorElt(MI, MRI, B); 20438bcb0991SDimitry Andric case TargetOpcode::G_FSIN: 20448bcb0991SDimitry Andric case TargetOpcode::G_FCOS: 20458bcb0991SDimitry Andric return legalizeSinCos(MI, MRI, B); 20468bcb0991SDimitry Andric case TargetOpcode::G_GLOBAL_VALUE: 20478bcb0991SDimitry Andric return legalizeGlobalValue(MI, MRI, B); 20488bcb0991SDimitry Andric case TargetOpcode::G_LOAD: 2049fe6060f1SDimitry Andric case TargetOpcode::G_SEXTLOAD: 2050fe6060f1SDimitry Andric case TargetOpcode::G_ZEXTLOAD: 2051e8d8bef9SDimitry Andric return legalizeLoad(Helper, MI); 205206c3fb27SDimitry Andric case TargetOpcode::G_STORE: 205306c3fb27SDimitry Andric return legalizeStore(Helper, MI); 20548bcb0991SDimitry Andric case TargetOpcode::G_FMAD: 20558bcb0991SDimitry Andric return legalizeFMad(MI, MRI, B); 20568bcb0991SDimitry Andric case TargetOpcode::G_FDIV: 20578bcb0991SDimitry Andric return legalizeFDIV(MI, MRI, B); 205806c3fb27SDimitry Andric case TargetOpcode::G_FFREXP: 205906c3fb27SDimitry Andric return legalizeFFREXP(MI, MRI, B); 206006c3fb27SDimitry Andric case TargetOpcode::G_FSQRT: 206106c3fb27SDimitry Andric return legalizeFSQRT(MI, MRI, B); 20625ffd83dbSDimitry Andric case TargetOpcode::G_UDIV: 20635ffd83dbSDimitry Andric case TargetOpcode::G_UREM: 2064fe6060f1SDimitry Andric case TargetOpcode::G_UDIVREM: 2065fe6060f1SDimitry Andric 
return legalizeUnsignedDIV_REM(MI, MRI, B);
20665ffd83dbSDimitry Andric   case TargetOpcode::G_SDIV:
20675ffd83dbSDimitry Andric   case TargetOpcode::G_SREM:
2068fe6060f1SDimitry Andric   case TargetOpcode::G_SDIVREM:
2069fe6060f1SDimitry Andric     return legalizeSignedDIV_REM(MI, MRI, B);
2070480093f4SDimitry Andric   case TargetOpcode::G_ATOMIC_CMPXCHG:
2071480093f4SDimitry Andric     return legalizeAtomicCmpXChg(MI, MRI, B);
207206c3fb27SDimitry Andric   case TargetOpcode::G_FLOG2:
207306c3fb27SDimitry Andric     return legalizeFlog2(MI, B);
20745ffd83dbSDimitry Andric   case TargetOpcode::G_FLOG:
20755ffd83dbSDimitry Andric   case TargetOpcode::G_FLOG10:
207606c3fb27SDimitry Andric     return legalizeFlogCommon(MI, B);
207706c3fb27SDimitry Andric   case TargetOpcode::G_FEXP2:
207806c3fb27SDimitry Andric     return legalizeFExp2(MI, B);
20795ffd83dbSDimitry Andric   case TargetOpcode::G_FEXP:
20805f757f3fSDimitry Andric   case TargetOpcode::G_FEXP10:
20815ffd83dbSDimitry Andric     return legalizeFExp(MI, B);
20825ffd83dbSDimitry Andric   case TargetOpcode::G_FPOW:
20835ffd83dbSDimitry Andric     return legalizeFPow(MI, B);
20845ffd83dbSDimitry Andric   case TargetOpcode::G_FFLOOR:
20855ffd83dbSDimitry Andric     return legalizeFFloor(MI, MRI, B);
20865ffd83dbSDimitry Andric   case TargetOpcode::G_BUILD_VECTOR:
2087bdd1243dSDimitry Andric   case TargetOpcode::G_BUILD_VECTOR_TRUNC:
20885ffd83dbSDimitry Andric     return legalizeBuildVector(MI, MRI, B);
208981ad6265SDimitry Andric   case TargetOpcode::G_MUL:
209081ad6265SDimitry Andric     return legalizeMul(Helper, MI);
2091349cc55cSDimitry Andric   case TargetOpcode::G_CTLZ:
2092349cc55cSDimitry Andric   case TargetOpcode::G_CTTZ:
2093349cc55cSDimitry Andric     return legalizeCTLZ_CTTZ(MI, MRI, B);
209481ad6265SDimitry Andric   case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
209581ad6265SDimitry Andric     return legalizeFPTruncRound(MI, B);
20965f757f3fSDimitry Andric   case TargetOpcode::G_STACKSAVE:
20975f757f3fSDimitry Andric     return legalizeStackSave(MI, B);
20980b57cec5SDimitry Andric   default:
20990b57cec5SDimitry Andric     return false;
21000b57cec5SDimitry Andric   }
21010b57cec5SDimitry Andric
21020b57cec5SDimitry Andric   llvm_unreachable("expected switch to return");
21030b57cec5SDimitry Andric }
21040b57cec5SDimitry Andric
21050b57cec5SDimitry Andric Register AMDGPULegalizerInfo::getSegmentAperture(
21060b57cec5SDimitry Andric   unsigned AS,
21070b57cec5SDimitry Andric   MachineRegisterInfo &MRI,
21088bcb0991SDimitry Andric   MachineIRBuilder &B) const {
21098bcb0991SDimitry Andric   MachineFunction &MF = B.getMF();
21100b57cec5SDimitry Andric   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
21110b57cec5SDimitry Andric   const LLT S32 = LLT::scalar(32);
2112bdd1243dSDimitry Andric   const LLT S64 = LLT::scalar(64);
21130b57cec5SDimitry Andric
21148bcb0991SDimitry Andric   assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
21158bcb0991SDimitry Andric
21160b57cec5SDimitry Andric   if (ST.hasApertureRegs()) {
2117bdd1243dSDimitry Andric     // Note: this register is somewhat broken. When used as a 32-bit operand,
2118bdd1243dSDimitry Andric     // it only returns zeroes. The real value is in the upper 32 bits.
2119bdd1243dSDimitry Andric     // Thus, we must extract the high 32 bits.
2120bdd1243dSDimitry Andric     const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
2121bdd1243dSDimitry Andric                                        ?
AMDGPU::SRC_SHARED_BASE 2122bdd1243dSDimitry Andric : AMDGPU::SRC_PRIVATE_BASE; 2123bdd1243dSDimitry Andric // FIXME: It would be more natural to emit a COPY here, but then copy 2124bdd1243dSDimitry Andric // coalescing would kick in and it would think it's okay to use the "HI" 2125bdd1243dSDimitry Andric // subregister (instead of extracting the HI 32 bits) which is an artificial 2126bdd1243dSDimitry Andric // (unusable) register. 2127bdd1243dSDimitry Andric // Register TableGen definitions would need an overhaul to get rid of the 2128bdd1243dSDimitry Andric // artificial "HI" aperture registers and prevent this kind of issue from 2129bdd1243dSDimitry Andric // happening. 2130bdd1243dSDimitry Andric Register Dst = MRI.createGenericVirtualRegister(S64); 2131bdd1243dSDimitry Andric MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass); 2132bdd1243dSDimitry Andric B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)}); 2133bdd1243dSDimitry Andric return B.buildUnmerge(S32, Dst).getReg(1); 21340b57cec5SDimitry Andric } 21350b57cec5SDimitry Andric 213681ad6265SDimitry Andric // TODO: can we be smarter about machine pointer info? 213781ad6265SDimitry Andric MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 213881ad6265SDimitry Andric Register LoadAddr = MRI.createGenericVirtualRegister( 213981ad6265SDimitry Andric LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 214081ad6265SDimitry Andric // For code object version 5, private_base and shared_base are passed through 214181ad6265SDimitry Andric // implicit kernargs. 21427a6dacacSDimitry Andric if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >= 214306c3fb27SDimitry Andric AMDGPU::AMDHSA_COV5) { 214481ad6265SDimitry Andric AMDGPUTargetLowering::ImplicitParameter Param = 214581ad6265SDimitry Andric AS == AMDGPUAS::LOCAL_ADDRESS ? 
AMDGPUTargetLowering::SHARED_BASE 214681ad6265SDimitry Andric : AMDGPUTargetLowering::PRIVATE_BASE; 214781ad6265SDimitry Andric uint64_t Offset = 214881ad6265SDimitry Andric ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param); 214981ad6265SDimitry Andric 215081ad6265SDimitry Andric Register KernargPtrReg = MRI.createGenericVirtualRegister( 215181ad6265SDimitry Andric LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 215281ad6265SDimitry Andric 215381ad6265SDimitry Andric if (!loadInputValue(KernargPtrReg, B, 215481ad6265SDimitry Andric AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 215581ad6265SDimitry Andric return Register(); 215681ad6265SDimitry Andric 215781ad6265SDimitry Andric MachineMemOperand *MMO = MF.getMachineMemOperand( 215881ad6265SDimitry Andric PtrInfo, 215981ad6265SDimitry Andric MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 216081ad6265SDimitry Andric MachineMemOperand::MOInvariant, 216181ad6265SDimitry Andric LLT::scalar(32), commonAlignment(Align(64), Offset)); 216281ad6265SDimitry Andric 216381ad6265SDimitry Andric // Pointer address 216481ad6265SDimitry Andric B.buildPtrAdd(LoadAddr, KernargPtrReg, 216581ad6265SDimitry Andric B.buildConstant(LLT::scalar(64), Offset).getReg(0)); 216681ad6265SDimitry Andric // Load address 216781ad6265SDimitry Andric return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 216881ad6265SDimitry Andric } 216981ad6265SDimitry Andric 21700b57cec5SDimitry Andric Register QueuePtr = MRI.createGenericVirtualRegister( 21710b57cec5SDimitry Andric LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 21720b57cec5SDimitry Andric 2173e8d8bef9SDimitry Andric if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 21748bcb0991SDimitry Andric return Register(); 21750b57cec5SDimitry Andric 21760b57cec5SDimitry Andric // Offset into amd_queue_t for group_segment_aperture_base_hi / 21770b57cec5SDimitry Andric // private_segment_aperture_base_hi. 21780b57cec5SDimitry Andric uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 21790b57cec5SDimitry Andric 21800b57cec5SDimitry Andric MachineMemOperand *MMO = MF.getMachineMemOperand( 21810b57cec5SDimitry Andric PtrInfo, 21825ffd83dbSDimitry Andric MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 21830b57cec5SDimitry Andric MachineMemOperand::MOInvariant, 2184fe6060f1SDimitry Andric LLT::scalar(32), commonAlignment(Align(64), StructOffset)); 21850b57cec5SDimitry Andric 218681ad6265SDimitry Andric B.buildPtrAdd(LoadAddr, QueuePtr, 218781ad6265SDimitry Andric B.buildConstant(LLT::scalar(64), StructOffset).getReg(0)); 21885ffd83dbSDimitry Andric return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 21890b57cec5SDimitry Andric } 21900b57cec5SDimitry Andric 219104eeddc0SDimitry Andric /// Return true if the value is a known valid address, such that a null check is 219204eeddc0SDimitry Andric /// not necessary. 
219304eeddc0SDimitry Andric static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
219404eeddc0SDimitry Andric                            const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
219504eeddc0SDimitry Andric   MachineInstr *Def = MRI.getVRegDef(Val);
219604eeddc0SDimitry Andric   switch (Def->getOpcode()) {
219704eeddc0SDimitry Andric   case AMDGPU::G_FRAME_INDEX:
219804eeddc0SDimitry Andric   case AMDGPU::G_GLOBAL_VALUE:
219904eeddc0SDimitry Andric   case AMDGPU::G_BLOCK_ADDR:
220004eeddc0SDimitry Andric     return true;
220104eeddc0SDimitry Andric   case AMDGPU::G_CONSTANT: {
220204eeddc0SDimitry Andric     const ConstantInt *CI = Def->getOperand(1).getCImm();
220304eeddc0SDimitry Andric     return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
220404eeddc0SDimitry Andric   }
220504eeddc0SDimitry Andric   default:
220604eeddc0SDimitry Andric     return false;
220704eeddc0SDimitry Andric   }
221004eeddc0SDimitry Andric }
221104eeddc0SDimitry Andric
22120b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
22130b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
22148bcb0991SDimitry Andric   MachineIRBuilder &B) const {
22158bcb0991SDimitry Andric   MachineFunction &MF = B.getMF();
22160b57cec5SDimitry Andric
22178bcb0991SDimitry Andric   const LLT S32 = LLT::scalar(32);
22180b57cec5SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
22190b57cec5SDimitry Andric   Register Src = MI.getOperand(1).getReg();
22200b57cec5SDimitry Andric
22210b57cec5SDimitry Andric   LLT DstTy = MRI.getType(Dst);
22220b57cec5SDimitry Andric   LLT SrcTy = MRI.getType(Src);
22230b57cec5SDimitry Andric   unsigned DestAS = DstTy.getAddressSpace();
22240b57cec5SDimitry Andric   unsigned SrcAS = SrcTy.getAddressSpace();
22250b57cec5SDimitry Andric
22260b57cec5SDimitry Andric   // TODO: Avoid reloading from the queue ptr for each cast, or at least each
22270b57cec5SDimitry Andric   // vector element.
22280b57cec5SDimitry Andric   assert(!DstTy.isVector());
22290b57cec5SDimitry Andric
22300b57cec5SDimitry Andric   const AMDGPUTargetMachine &TM
22310b57cec5SDimitry Andric     = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
22320b57cec5SDimitry Andric
2233e8d8bef9SDimitry Andric   if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
22348bcb0991SDimitry Andric     MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
22358bcb0991SDimitry Andric     return true;
22368bcb0991SDimitry Andric   }
22378bcb0991SDimitry Andric
223881ad6265SDimitry Andric   if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
223981ad6265SDimitry Andric       (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
224081ad6265SDimitry Andric        DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
224104eeddc0SDimitry Andric     if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
224204eeddc0SDimitry Andric       // Extract low 32-bits of the pointer.
224304eeddc0SDimitry Andric       B.buildExtract(Dst, Src, 0);
224404eeddc0SDimitry Andric       MI.eraseFromParent();
224504eeddc0SDimitry Andric       return true;
224604eeddc0SDimitry Andric     }
224704eeddc0SDimitry Andric
22480b57cec5SDimitry Andric     unsigned NullVal = TM.getNullPointerValue(DestAS);
22490b57cec5SDimitry Andric
22508bcb0991SDimitry Andric     auto SegmentNull = B.buildConstant(DstTy, NullVal);
22518bcb0991SDimitry Andric     auto FlatNull = B.buildConstant(SrcTy, 0);
22520b57cec5SDimitry Andric
22530b57cec5SDimitry Andric     // Extract low 32-bits of the pointer.
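    // (The flat null pointer is 0, while getNullPointerValue() for the
    // local/private segments is all ones, so a plain truncation would mangle
    // null; the compare-and-select below maps flat null onto the segment null
    // value instead.)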
22545ffd83dbSDimitry Andric auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 22550b57cec5SDimitry Andric 22565ffd83dbSDimitry Andric auto CmpRes = 22575ffd83dbSDimitry Andric B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 22588bcb0991SDimitry Andric B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 22590b57cec5SDimitry Andric 22600b57cec5SDimitry Andric MI.eraseFromParent(); 22610b57cec5SDimitry Andric return true; 22620b57cec5SDimitry Andric } 22630b57cec5SDimitry Andric 226481ad6265SDimitry Andric if (DestAS == AMDGPUAS::FLAT_ADDRESS && 226581ad6265SDimitry Andric (SrcAS == AMDGPUAS::LOCAL_ADDRESS || 226681ad6265SDimitry Andric SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) { 22678bcb0991SDimitry Andric Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 22688bcb0991SDimitry Andric if (!ApertureReg.isValid()) 22698bcb0991SDimitry Andric return false; 22700b57cec5SDimitry Andric 22710b57cec5SDimitry Andric // Coerce the type of the low half of the result so we can use merge_values. 22725ffd83dbSDimitry Andric Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 22730b57cec5SDimitry Andric 22740b57cec5SDimitry Andric // TODO: Should we allow mismatched types but matching sizes in merges to 22750b57cec5SDimitry Andric // avoid the ptrtoint? 2276bdd1243dSDimitry Andric auto BuildPtr = B.buildMergeLikeInstr(DstTy, {SrcAsInt, ApertureReg}); 227704eeddc0SDimitry Andric 227804eeddc0SDimitry Andric if (isKnownNonNull(Src, MRI, TM, SrcAS)) { 227904eeddc0SDimitry Andric B.buildCopy(Dst, BuildPtr); 228004eeddc0SDimitry Andric MI.eraseFromParent(); 228104eeddc0SDimitry Andric return true; 228204eeddc0SDimitry Andric } 228304eeddc0SDimitry Andric 228404eeddc0SDimitry Andric auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 228504eeddc0SDimitry Andric auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 228604eeddc0SDimitry Andric 228781ad6265SDimitry Andric auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, 228881ad6265SDimitry Andric SegmentNull.getReg(0)); 228904eeddc0SDimitry Andric 22905ffd83dbSDimitry Andric B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 22910b57cec5SDimitry Andric 22920b57cec5SDimitry Andric MI.eraseFromParent(); 22930b57cec5SDimitry Andric return true; 22940b57cec5SDimitry Andric } 22950b57cec5SDimitry Andric 229681ad6265SDimitry Andric if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && 229781ad6265SDimitry Andric SrcTy.getSizeInBits() == 64) { 229881ad6265SDimitry Andric // Truncate. 
229981ad6265SDimitry Andric B.buildExtract(Dst, Src, 0); 230081ad6265SDimitry Andric MI.eraseFromParent(); 230181ad6265SDimitry Andric return true; 230281ad6265SDimitry Andric } 230381ad6265SDimitry Andric 230481ad6265SDimitry Andric if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && 230581ad6265SDimitry Andric DstTy.getSizeInBits() == 64) { 230681ad6265SDimitry Andric const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 230781ad6265SDimitry Andric uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 2308bdd1243dSDimitry Andric auto PtrLo = B.buildPtrToInt(S32, Src); 2309bdd1243dSDimitry Andric auto HighAddr = B.buildConstant(S32, AddrHiVal); 2310bdd1243dSDimitry Andric B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr}); 231181ad6265SDimitry Andric MI.eraseFromParent(); 231281ad6265SDimitry Andric return true; 231381ad6265SDimitry Andric } 231481ad6265SDimitry Andric 231581ad6265SDimitry Andric DiagnosticInfoUnsupported InvalidAddrSpaceCast( 231681ad6265SDimitry Andric MF.getFunction(), "invalid addrspacecast", B.getDebugLoc()); 231781ad6265SDimitry Andric 231881ad6265SDimitry Andric LLVMContext &Ctx = MF.getFunction().getContext(); 231981ad6265SDimitry Andric Ctx.diagnose(InvalidAddrSpaceCast); 232081ad6265SDimitry Andric B.buildUndef(Dst); 232181ad6265SDimitry Andric MI.eraseFromParent(); 232281ad6265SDimitry Andric return true; 232381ad6265SDimitry Andric } 232481ad6265SDimitry Andric 23255f757f3fSDimitry Andric bool AMDGPULegalizerInfo::legalizeFroundeven(MachineInstr &MI, 23265f757f3fSDimitry Andric MachineRegisterInfo &MRI, 23278bcb0991SDimitry Andric MachineIRBuilder &B) const { 23280b57cec5SDimitry Andric Register Src = MI.getOperand(1).getReg(); 23290b57cec5SDimitry Andric LLT Ty = MRI.getType(Src); 23300b57cec5SDimitry Andric assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 23310b57cec5SDimitry Andric 23320b57cec5SDimitry Andric APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 23330b57cec5SDimitry Andric APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 23340b57cec5SDimitry Andric 23358bcb0991SDimitry Andric auto C1 = B.buildFConstant(Ty, C1Val); 23368bcb0991SDimitry Andric auto CopySign = B.buildFCopysign(Ty, C1, Src); 23370b57cec5SDimitry Andric 23380b57cec5SDimitry Andric // TODO: Should this propagate fast-math-flags? 
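  // The add/sub pair below is the classic magic-number rounding trick: for
  // |x| < 2^52, x + 2^52 lies in [2^52, 2^53), where the spacing between
  // representable doubles is exactly 1.0, so the addition itself rounds x to
  // an integer in the default nearest-even mode, and subtracting 2^52 again
  // recovers roundeven(x). The copysign extends this to negative inputs, and
  // the compare against C2 leaves already-integral magnitudes untouched.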
23398bcb0991SDimitry Andric   auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
23408bcb0991SDimitry Andric   auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
23410b57cec5SDimitry Andric
23428bcb0991SDimitry Andric   auto C2 = B.buildFConstant(Ty, C2Val);
23438bcb0991SDimitry Andric   auto Fabs = B.buildFAbs(Ty, Src);
23440b57cec5SDimitry Andric
23458bcb0991SDimitry Andric   auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
23468bcb0991SDimitry Andric   B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2347e8d8bef9SDimitry Andric   MI.eraseFromParent();
23480b57cec5SDimitry Andric   return true;
23490b57cec5SDimitry Andric }
23500b57cec5SDimitry Andric
23510b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeFceil(
23520b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
23530b57cec5SDimitry Andric   MachineIRBuilder &B) const {
23540b57cec5SDimitry Andric
23550b57cec5SDimitry Andric   const LLT S1 = LLT::scalar(1);
23560b57cec5SDimitry Andric   const LLT S64 = LLT::scalar(64);
23570b57cec5SDimitry Andric
23580b57cec5SDimitry Andric   Register Src = MI.getOperand(1).getReg();
23590b57cec5SDimitry Andric   assert(MRI.getType(Src) == S64);
23600b57cec5SDimitry Andric
23610b57cec5SDimitry Andric   // result = trunc(src)
23620b57cec5SDimitry Andric   // if (src > 0.0 && src != result)
23630b57cec5SDimitry Andric   //   result += 1.0
23640b57cec5SDimitry Andric
23655ffd83dbSDimitry Andric   auto Trunc = B.buildIntrinsicTrunc(S64, Src);
23660b57cec5SDimitry Andric
23670b57cec5SDimitry Andric   const auto Zero = B.buildFConstant(S64, 0.0);
23680b57cec5SDimitry Andric   const auto One = B.buildFConstant(S64, 1.0);
23690b57cec5SDimitry Andric   auto Gt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
23700b57cec5SDimitry Andric   auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
23710b57cec5SDimitry Andric   auto And = B.buildAnd(S1, Gt0, NeTrunc);
23720b57cec5SDimitry Andric   auto Add = B.buildSelect(S64, And, One, Zero);
23730b57cec5SDimitry Andric
23740b57cec5SDimitry Andric   // TODO: Should this propagate fast-math-flags?
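  // Worked example: src = 2.3 -> trunc = 2.0; 2.3 > 0.0 and 2.3 != 2.0, so
  // 1.0 is added, giving 3.0. For src = -2.3 the compare against zero fails
  // and the result stays -2.0, which is already the ceiling.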
23750b57cec5SDimitry Andric B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 237604eeddc0SDimitry Andric MI.eraseFromParent(); 23770b57cec5SDimitry Andric return true; 23780b57cec5SDimitry Andric } 23790b57cec5SDimitry Andric 2380e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeFrem( 2381e8d8bef9SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 2382e8d8bef9SDimitry Andric MachineIRBuilder &B) const { 2383e8d8bef9SDimitry Andric Register DstReg = MI.getOperand(0).getReg(); 2384e8d8bef9SDimitry Andric Register Src0Reg = MI.getOperand(1).getReg(); 2385e8d8bef9SDimitry Andric Register Src1Reg = MI.getOperand(2).getReg(); 2386e8d8bef9SDimitry Andric auto Flags = MI.getFlags(); 2387e8d8bef9SDimitry Andric LLT Ty = MRI.getType(DstReg); 2388e8d8bef9SDimitry Andric 2389e8d8bef9SDimitry Andric auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 2390e8d8bef9SDimitry Andric auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 2391e8d8bef9SDimitry Andric auto Neg = B.buildFNeg(Ty, Trunc, Flags); 2392e8d8bef9SDimitry Andric B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 2393e8d8bef9SDimitry Andric MI.eraseFromParent(); 2394e8d8bef9SDimitry Andric return true; 2395e8d8bef9SDimitry Andric } 2396e8d8bef9SDimitry Andric 2397e8d8bef9SDimitry Andric static MachineInstrBuilder extractF64Exponent(Register Hi, 23980b57cec5SDimitry Andric MachineIRBuilder &B) { 23990b57cec5SDimitry Andric const unsigned FractBits = 52; 24000b57cec5SDimitry Andric const unsigned ExpBits = 11; 24010b57cec5SDimitry Andric LLT S32 = LLT::scalar(32); 24020b57cec5SDimitry Andric 24030b57cec5SDimitry Andric auto Const0 = B.buildConstant(S32, FractBits - 32); 24040b57cec5SDimitry Andric auto Const1 = B.buildConstant(S32, ExpBits); 24050b57cec5SDimitry Andric 24065f757f3fSDimitry Andric auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}) 2407e8d8bef9SDimitry Andric .addUse(Hi) 24080b57cec5SDimitry Andric .addUse(Const0.getReg(0)) 24090b57cec5SDimitry Andric .addUse(Const1.getReg(0)); 24100b57cec5SDimitry Andric 24110b57cec5SDimitry Andric return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 24120b57cec5SDimitry Andric } 24130b57cec5SDimitry Andric 24140b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 24150b57cec5SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 24160b57cec5SDimitry Andric MachineIRBuilder &B) const { 24170b57cec5SDimitry Andric const LLT S1 = LLT::scalar(1); 24180b57cec5SDimitry Andric const LLT S32 = LLT::scalar(32); 24190b57cec5SDimitry Andric const LLT S64 = LLT::scalar(64); 24200b57cec5SDimitry Andric 24210b57cec5SDimitry Andric Register Src = MI.getOperand(1).getReg(); 24220b57cec5SDimitry Andric assert(MRI.getType(Src) == S64); 24230b57cec5SDimitry Andric 24240b57cec5SDimitry Andric // TODO: Should this use extract since the low half is unused? 24250b57cec5SDimitry Andric auto Unmerge = B.buildUnmerge({S32, S32}, Src); 24260b57cec5SDimitry Andric Register Hi = Unmerge.getReg(1); 24270b57cec5SDimitry Andric 24280b57cec5SDimitry Andric // Extract the upper half, since this is where we will find the sign and 24290b57cec5SDimitry Andric // exponent. 24300b57cec5SDimitry Andric auto Exp = extractF64Exponent(Hi, B); 24310b57cec5SDimitry Andric 24320b57cec5SDimitry Andric const unsigned FractBits = 52; 24330b57cec5SDimitry Andric 24340b57cec5SDimitry Andric // Extract the sign bit. 
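  // (Handling the sign separately also preserves it for results that round to
  // zero, e.g. trunc(-0.5) must be -0.0.)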
24350b57cec5SDimitry Andric const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 24360b57cec5SDimitry Andric auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 24370b57cec5SDimitry Andric 24380b57cec5SDimitry Andric const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 24390b57cec5SDimitry Andric 24400b57cec5SDimitry Andric const auto Zero32 = B.buildConstant(S32, 0); 24410b57cec5SDimitry Andric 24420b57cec5SDimitry Andric // Extend back to 64-bits. 2443bdd1243dSDimitry Andric auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit}); 24440b57cec5SDimitry Andric 24450b57cec5SDimitry Andric auto Shr = B.buildAShr(S64, FractMask, Exp); 24460b57cec5SDimitry Andric auto Not = B.buildNot(S64, Shr); 24470b57cec5SDimitry Andric auto Tmp0 = B.buildAnd(S64, Src, Not); 24480b57cec5SDimitry Andric auto FiftyOne = B.buildConstant(S32, FractBits - 1); 24490b57cec5SDimitry Andric 24500b57cec5SDimitry Andric auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 24510b57cec5SDimitry Andric auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 24520b57cec5SDimitry Andric 24530b57cec5SDimitry Andric auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 24540b57cec5SDimitry Andric B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 2455e8d8bef9SDimitry Andric MI.eraseFromParent(); 24560b57cec5SDimitry Andric return true; 24570b57cec5SDimitry Andric } 24580b57cec5SDimitry Andric 24590b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeITOFP( 24600b57cec5SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 24610b57cec5SDimitry Andric MachineIRBuilder &B, bool Signed) const { 24620b57cec5SDimitry Andric 24630b57cec5SDimitry Andric Register Dst = MI.getOperand(0).getReg(); 24640b57cec5SDimitry Andric Register Src = MI.getOperand(1).getReg(); 24650b57cec5SDimitry Andric 24660b57cec5SDimitry Andric const LLT S64 = LLT::scalar(64); 24670b57cec5SDimitry Andric const LLT S32 = LLT::scalar(32); 24680b57cec5SDimitry Andric 2469349cc55cSDimitry Andric assert(MRI.getType(Src) == S64); 24700b57cec5SDimitry Andric 24710b57cec5SDimitry Andric auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2472349cc55cSDimitry Andric auto ThirtyTwo = B.buildConstant(S32, 32); 24730b57cec5SDimitry Andric 2474349cc55cSDimitry Andric if (MRI.getType(Dst) == S64) { 2475349cc55cSDimitry Andric auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1)) 2476349cc55cSDimitry Andric : B.buildUITOFP(S64, Unmerge.getReg(1)); 24770b57cec5SDimitry Andric 24780b57cec5SDimitry Andric auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 247906c3fb27SDimitry Andric auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo); 24800b57cec5SDimitry Andric 24810b57cec5SDimitry Andric // TODO: Should this propagate fast-math-flags? 
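  // i.e. result = fp(hi) * 2^32 + fp(lo), where only the high half carries
  // the sign for G_SITOFP; the low half is always converted as unsigned.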
24820b57cec5SDimitry Andric B.buildFAdd(Dst, LdExp, CvtLo); 24830b57cec5SDimitry Andric MI.eraseFromParent(); 24840b57cec5SDimitry Andric return true; 24850b57cec5SDimitry Andric } 24860b57cec5SDimitry Andric 2487349cc55cSDimitry Andric assert(MRI.getType(Dst) == S32); 2488349cc55cSDimitry Andric 2489349cc55cSDimitry Andric auto One = B.buildConstant(S32, 1); 2490349cc55cSDimitry Andric 2491349cc55cSDimitry Andric MachineInstrBuilder ShAmt; 2492349cc55cSDimitry Andric if (Signed) { 2493349cc55cSDimitry Andric auto ThirtyOne = B.buildConstant(S32, 31); 2494349cc55cSDimitry Andric auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1)); 2495349cc55cSDimitry Andric auto OppositeSign = B.buildAShr(S32, X, ThirtyOne); 2496349cc55cSDimitry Andric auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign); 24975f757f3fSDimitry Andric auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32}) 2498349cc55cSDimitry Andric .addUse(Unmerge.getReg(1)); 2499349cc55cSDimitry Andric auto LS2 = B.buildSub(S32, LS, One); 2500349cc55cSDimitry Andric ShAmt = B.buildUMin(S32, LS2, MaxShAmt); 2501349cc55cSDimitry Andric } else 2502349cc55cSDimitry Andric ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1)); 2503349cc55cSDimitry Andric auto Norm = B.buildShl(S64, Src, ShAmt); 2504349cc55cSDimitry Andric auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm); 2505349cc55cSDimitry Andric auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0)); 2506349cc55cSDimitry Andric auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust); 2507349cc55cSDimitry Andric auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2); 2508349cc55cSDimitry Andric auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt); 250906c3fb27SDimitry Andric B.buildFLdexp(Dst, FVal, Scale); 2510349cc55cSDimitry Andric MI.eraseFromParent(); 2511349cc55cSDimitry Andric return true; 2512349cc55cSDimitry Andric } 2513349cc55cSDimitry Andric 25145ffd83dbSDimitry Andric // TODO: Copied from DAG implementation. Verify logic and document how this 25155ffd83dbSDimitry Andric // actually works. 2516fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI, 2517fe6060f1SDimitry Andric MachineRegisterInfo &MRI, 2518fe6060f1SDimitry Andric MachineIRBuilder &B, 2519fe6060f1SDimitry Andric bool Signed) const { 25205ffd83dbSDimitry Andric 25215ffd83dbSDimitry Andric Register Dst = MI.getOperand(0).getReg(); 25225ffd83dbSDimitry Andric Register Src = MI.getOperand(1).getReg(); 25235ffd83dbSDimitry Andric 25245ffd83dbSDimitry Andric const LLT S64 = LLT::scalar(64); 25255ffd83dbSDimitry Andric const LLT S32 = LLT::scalar(32); 25265ffd83dbSDimitry Andric 2527fe6060f1SDimitry Andric const LLT SrcLT = MRI.getType(Src); 2528fe6060f1SDimitry Andric assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64); 25295ffd83dbSDimitry Andric 25305ffd83dbSDimitry Andric unsigned Flags = MI.getFlags(); 25315ffd83dbSDimitry Andric 2532fe6060f1SDimitry Andric // The basic idea of converting a floating point number into a pair of 32-bit 2533fe6060f1SDimitry Andric // integers is illustrated as follows: 2534fe6060f1SDimitry Andric // 2535fe6060f1SDimitry Andric // tf := trunc(val); 2536fe6060f1SDimitry Andric // hif := floor(tf * 2^-32); 2537fe6060f1SDimitry Andric // lof := tf - hif * 2^32; // lof is always positive due to floor. 
2538fe6060f1SDimitry Andric   //     hi := fptoi(hif);
2539fe6060f1SDimitry Andric   //     lo := fptoi(lof);
2540fe6060f1SDimitry Andric   //
2541fe6060f1SDimitry Andric   auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2542fe6060f1SDimitry Andric   MachineInstrBuilder Sign;
2543fe6060f1SDimitry Andric   if (Signed && SrcLT == S32) {
2544fe6060f1SDimitry Andric     // However, a 32-bit floating point number has only a 23-bit mantissa,
2545fe6060f1SDimitry Andric     // which is not enough to hold all the significant bits of `lof` if val is
2546fe6060f1SDimitry Andric     // negative. To avoid the loss of precision, we need to take the absolute
2547fe6060f1SDimitry Andric     // value after truncating and flip the result back based on the original
2548fe6060f1SDimitry Andric     // signedness.
2549fe6060f1SDimitry Andric     Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2550fe6060f1SDimitry Andric     Trunc = B.buildFAbs(S32, Trunc, Flags);
2551fe6060f1SDimitry Andric   }
2552fe6060f1SDimitry Andric   MachineInstrBuilder K0, K1;
2553fe6060f1SDimitry Andric   if (SrcLT == S64) {
255406c3fb27SDimitry Andric     K0 = B.buildFConstant(
255506c3fb27SDimitry Andric         S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
255606c3fb27SDimitry Andric     K1 = B.buildFConstant(
255706c3fb27SDimitry Andric         S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2558fe6060f1SDimitry Andric   } else {
255906c3fb27SDimitry Andric     K0 = B.buildFConstant(
256006c3fb27SDimitry Andric         S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
256106c3fb27SDimitry Andric     K1 = B.buildFConstant(
256206c3fb27SDimitry Andric         S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2563fe6060f1SDimitry Andric   }
25645ffd83dbSDimitry Andric
2565fe6060f1SDimitry Andric   auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2566fe6060f1SDimitry Andric   auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2567fe6060f1SDimitry Andric   auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
25685ffd83dbSDimitry Andric
2569fe6060f1SDimitry Andric   auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2570fe6060f1SDimitry Andric                                      : B.buildFPTOUI(S32, FloorMul);
25715ffd83dbSDimitry Andric   auto Lo = B.buildFPTOUI(S32, Fma);
25725ffd83dbSDimitry Andric
2573fe6060f1SDimitry Andric   if (Signed && SrcLT == S32) {
2574fe6060f1SDimitry Andric     // Flip the result based on the signedness, which is either all 0s or 1s.
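    // This uses the two's complement identity (v ^ s) - s, which yields v
    // when s == 0 and -v when s == -1 (all ones).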
2575bdd1243dSDimitry Andric     Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
2576fe6060f1SDimitry Andric     // r := xor({lo, hi}, sign) - sign;
2577bdd1243dSDimitry Andric     B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
2578bdd1243dSDimitry Andric                Sign);
2579fe6060f1SDimitry Andric   } else
2580bdd1243dSDimitry Andric     B.buildMergeLikeInstr(Dst, {Lo, Hi});
25815ffd83dbSDimitry Andric   MI.eraseFromParent();
25825ffd83dbSDimitry Andric
25835ffd83dbSDimitry Andric   return true;
25845ffd83dbSDimitry Andric }
25855ffd83dbSDimitry Andric
25865ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
25875ffd83dbSDimitry Andric                                                MachineInstr &MI) const {
25885ffd83dbSDimitry Andric   MachineFunction &MF = Helper.MIRBuilder.getMF();
25890b57cec5SDimitry Andric   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
25900b57cec5SDimitry Andric
25910b57cec5SDimitry Andric   const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
25920b57cec5SDimitry Andric                         MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;
25930b57cec5SDimitry Andric
25940b57cec5SDimitry Andric   // With ieee_mode disabled, the instructions have the correct behavior
25950b57cec5SDimitry Andric   // already for G_FMINNUM/G_FMAXNUM.
25960b57cec5SDimitry Andric   if (!MFI->getMode().IEEE)
25970b57cec5SDimitry Andric     return !IsIEEEOp;
25980b57cec5SDimitry Andric
25990b57cec5SDimitry Andric   if (IsIEEEOp)
26000b57cec5SDimitry Andric     return true;
26010b57cec5SDimitry Andric
26020b57cec5SDimitry Andric   return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
26030b57cec5SDimitry Andric }
26040b57cec5SDimitry Andric
26050b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
26060b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
26070b57cec5SDimitry Andric   MachineIRBuilder &B) const {
26080b57cec5SDimitry Andric   // TODO: Should move some of this into LegalizerHelper.
26090b57cec5SDimitry Andric
26100b57cec5SDimitry Andric   // TODO: Promote dynamic indexing of s16 to s32
26115ffd83dbSDimitry Andric
261206c3fb27SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
261306c3fb27SDimitry Andric   Register Vec = MI.getOperand(1).getReg();
261406c3fb27SDimitry Andric
261506c3fb27SDimitry Andric   LLT VecTy = MRI.getType(Vec);
261606c3fb27SDimitry Andric   LLT EltTy = VecTy.getElementType();
261706c3fb27SDimitry Andric   assert(EltTy == MRI.getType(Dst));
261806c3fb27SDimitry Andric
261906c3fb27SDimitry Andric   // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
262006c3fb27SDimitry Andric   // but we can't go directly to that logic because you can't bitcast a vector
262106c3fb27SDimitry Andric   // of pointers to a vector of integers. Therefore, introduce an intermediate
262206c3fb27SDimitry Andric   // vector of integers using ptrtoint (and inttoptr on the output) in order to
262306c3fb27SDimitry Andric   // drive the legalization forward.
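  // For example (using a hypothetical 128-bit pointer element type pN):
  // extracting from <2 x pN> becomes ptrtoint to <2 x s128>, a plain
  // G_EXTRACT_VECTOR_ELT of an s128, then inttoptr back to pN.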
262406c3fb27SDimitry Andric   if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
262506c3fb27SDimitry Andric     LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
262606c3fb27SDimitry Andric     LLT IntVecTy = VecTy.changeElementType(IntTy);
262706c3fb27SDimitry Andric
262806c3fb27SDimitry Andric     auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
262906c3fb27SDimitry Andric     auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
263006c3fb27SDimitry Andric     B.buildIntToPtr(Dst, IntElt);
263106c3fb27SDimitry Andric
263206c3fb27SDimitry Andric     MI.eraseFromParent();
263306c3fb27SDimitry Andric     return true;
263406c3fb27SDimitry Andric   }
263506c3fb27SDimitry Andric
26365ffd83dbSDimitry Andric   // FIXME: Artifact combiner probably should have replaced the truncated
26375ffd83dbSDimitry Andric   // constant before this, so we shouldn't need
2638349cc55cSDimitry Andric   // getIConstantVRegValWithLookThrough.
2639bdd1243dSDimitry Andric   std::optional<ValueAndVReg> MaybeIdxVal =
2640349cc55cSDimitry Andric       getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
2641e8d8bef9SDimitry Andric   if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
26420b57cec5SDimitry Andric     return true;
2643bdd1243dSDimitry Andric   const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
26440b57cec5SDimitry Andric
264504eeddc0SDimitry Andric   if (IdxVal < VecTy.getNumElements()) {
264604eeddc0SDimitry Andric     auto Unmerge = B.buildUnmerge(EltTy, Vec);
264704eeddc0SDimitry Andric     B.buildCopy(Dst, Unmerge.getReg(IdxVal));
264804eeddc0SDimitry Andric   } else {
26490b57cec5SDimitry Andric     B.buildUndef(Dst);
265004eeddc0SDimitry Andric   }
26510b57cec5SDimitry Andric
26520b57cec5SDimitry Andric   MI.eraseFromParent();
26530b57cec5SDimitry Andric   return true;
26540b57cec5SDimitry Andric }
26550b57cec5SDimitry Andric
26560b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
26570b57cec5SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
26580b57cec5SDimitry Andric   MachineIRBuilder &B) const {
26590b57cec5SDimitry Andric   // TODO: Should move some of this into LegalizerHelper.
26600b57cec5SDimitry Andric
26610b57cec5SDimitry Andric   // TODO: Promote dynamic indexing of s16 to s32
26625ffd83dbSDimitry Andric
266306c3fb27SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
266406c3fb27SDimitry Andric   Register Vec = MI.getOperand(1).getReg();
266506c3fb27SDimitry Andric   Register Ins = MI.getOperand(2).getReg();
266606c3fb27SDimitry Andric
266706c3fb27SDimitry Andric   LLT VecTy = MRI.getType(Vec);
266806c3fb27SDimitry Andric   LLT EltTy = VecTy.getElementType();
266906c3fb27SDimitry Andric   assert(EltTy == MRI.getType(Ins));
267006c3fb27SDimitry Andric
267106c3fb27SDimitry Andric   // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
267206c3fb27SDimitry Andric   // but we can't go directly to that logic because you can't bitcast a vector
267306c3fb27SDimitry Andric   // of pointers to a vector of integers. Therefore, make the pointer vector
267406c3fb27SDimitry Andric   // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
267506c3fb27SDimitry Andric   // new value, and then inttoptr the result vector back. This will then allow
267606c3fb27SDimitry Andric   // the rest of legalization to take over.
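  // (This mirrors the extract case above; the only difference is that the
  // inserted value itself also needs a ptrtoint before the operation.)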
267706c3fb27SDimitry Andric if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) { 267806c3fb27SDimitry Andric LLT IntTy = LLT::scalar(EltTy.getSizeInBits()); 267906c3fb27SDimitry Andric LLT IntVecTy = VecTy.changeElementType(IntTy); 268006c3fb27SDimitry Andric 268106c3fb27SDimitry Andric auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec); 268206c3fb27SDimitry Andric auto IntIns = B.buildPtrToInt(IntTy, Ins); 268306c3fb27SDimitry Andric auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns, 268406c3fb27SDimitry Andric MI.getOperand(3)); 268506c3fb27SDimitry Andric B.buildIntToPtr(Dst, IntVecDest); 268606c3fb27SDimitry Andric MI.eraseFromParent(); 268706c3fb27SDimitry Andric return true; 268806c3fb27SDimitry Andric } 268906c3fb27SDimitry Andric 26905ffd83dbSDimitry Andric // FIXME: Artifact combiner probably should have replaced the truncated 26915ffd83dbSDimitry Andric // constant before this, so we shouldn't need 2692349cc55cSDimitry Andric // getIConstantVRegValWithLookThrough. 2693bdd1243dSDimitry Andric std::optional<ValueAndVReg> MaybeIdxVal = 2694349cc55cSDimitry Andric getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 2695e8d8bef9SDimitry Andric if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 26960b57cec5SDimitry Andric return true; 26970b57cec5SDimitry Andric 2698bdd1243dSDimitry Andric const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue(); 26990b57cec5SDimitry Andric 270004eeddc0SDimitry Andric unsigned NumElts = VecTy.getNumElements(); 270104eeddc0SDimitry Andric if (IdxVal < NumElts) { 270204eeddc0SDimitry Andric SmallVector<Register, 8> SrcRegs; 270304eeddc0SDimitry Andric for (unsigned i = 0; i < NumElts; ++i) 270404eeddc0SDimitry Andric SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy)); 270504eeddc0SDimitry Andric B.buildUnmerge(SrcRegs, Vec); 270604eeddc0SDimitry Andric 270704eeddc0SDimitry Andric SrcRegs[IdxVal] = MI.getOperand(2).getReg(); 2708bdd1243dSDimitry Andric B.buildMergeLikeInstr(Dst, SrcRegs); 270904eeddc0SDimitry Andric } else { 27100b57cec5SDimitry Andric B.buildUndef(Dst); 271104eeddc0SDimitry Andric } 27120b57cec5SDimitry Andric 27130b57cec5SDimitry Andric MI.eraseFromParent(); 27140b57cec5SDimitry Andric return true; 27150b57cec5SDimitry Andric } 27160b57cec5SDimitry Andric 27178bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeSinCos( 27188bcb0991SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 27198bcb0991SDimitry Andric MachineIRBuilder &B) const { 27208bcb0991SDimitry Andric 27218bcb0991SDimitry Andric Register DstReg = MI.getOperand(0).getReg(); 27228bcb0991SDimitry Andric Register SrcReg = MI.getOperand(1).getReg(); 27238bcb0991SDimitry Andric LLT Ty = MRI.getType(DstReg); 27248bcb0991SDimitry Andric unsigned Flags = MI.getFlags(); 27258bcb0991SDimitry Andric 27268bcb0991SDimitry Andric Register TrigVal; 27275ffd83dbSDimitry Andric auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 27288bcb0991SDimitry Andric if (ST.hasTrigReducedRange()) { 27298bcb0991SDimitry Andric auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 27305f757f3fSDimitry Andric TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}) 27318bcb0991SDimitry Andric .addUse(MulVal.getReg(0)) 27325f757f3fSDimitry Andric .setMIFlags(Flags) 27335f757f3fSDimitry Andric .getReg(0); 27348bcb0991SDimitry Andric } else 27358bcb0991SDimitry Andric TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 27368bcb0991SDimitry Andric 27378bcb0991SDimitry Andric Intrinsic::ID 
TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 27388bcb0991SDimitry Andric Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 27395f757f3fSDimitry Andric B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg)) 27408bcb0991SDimitry Andric .addUse(TrigVal) 27418bcb0991SDimitry Andric .setMIFlags(Flags); 27428bcb0991SDimitry Andric MI.eraseFromParent(); 27438bcb0991SDimitry Andric return true; 27448bcb0991SDimitry Andric } 27458bcb0991SDimitry Andric 27465ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 27475ffd83dbSDimitry Andric MachineIRBuilder &B, 27485ffd83dbSDimitry Andric const GlobalValue *GV, 27495ffd83dbSDimitry Andric int64_t Offset, 27505ffd83dbSDimitry Andric unsigned GAFlags) const { 27515ffd83dbSDimitry Andric assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 27528bcb0991SDimitry Andric // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 27538bcb0991SDimitry Andric // to the following code sequence: 27548bcb0991SDimitry Andric // 27558bcb0991SDimitry Andric // For constant address space: 27568bcb0991SDimitry Andric // s_getpc_b64 s[0:1] 27578bcb0991SDimitry Andric // s_add_u32 s0, s0, $symbol 27588bcb0991SDimitry Andric // s_addc_u32 s1, s1, 0 27598bcb0991SDimitry Andric // 27608bcb0991SDimitry Andric // s_getpc_b64 returns the address of the s_add_u32 instruction and then 27618bcb0991SDimitry Andric // a fixup or relocation is emitted to replace $symbol with a literal 27628bcb0991SDimitry Andric // constant, which is a pc-relative offset from the encoding of the $symbol 27638bcb0991SDimitry Andric // operand to the global variable. 27648bcb0991SDimitry Andric // 27658bcb0991SDimitry Andric // For global address space: 27668bcb0991SDimitry Andric // s_getpc_b64 s[0:1] 27678bcb0991SDimitry Andric // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 27688bcb0991SDimitry Andric // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 27698bcb0991SDimitry Andric // 27708bcb0991SDimitry Andric // s_getpc_b64 returns the address of the s_add_u32 instruction and then 27718bcb0991SDimitry Andric // fixups or relocations are emitted to replace $symbol@*@lo and 27728bcb0991SDimitry Andric // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 27738bcb0991SDimitry Andric // which is a 64-bit pc-relative offset from the encoding of the $symbol 27748bcb0991SDimitry Andric // operand to the global variable. 27758bcb0991SDimitry Andric 27768bcb0991SDimitry Andric LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 27778bcb0991SDimitry Andric 27788bcb0991SDimitry Andric Register PCReg = PtrTy.getSizeInBits() != 32 ? 
DstReg :
27798bcb0991SDimitry Andric     B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
27808bcb0991SDimitry Andric
27818bcb0991SDimitry Andric   MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
27828bcb0991SDimitry Andric     .addDef(PCReg);
27838bcb0991SDimitry Andric
27845f757f3fSDimitry Andric   MIB.addGlobalAddress(GV, Offset, GAFlags);
27858bcb0991SDimitry Andric   if (GAFlags == SIInstrInfo::MO_NONE)
27868bcb0991SDimitry Andric     MIB.addImm(0);
27878bcb0991SDimitry Andric   else
27885f757f3fSDimitry Andric     MIB.addGlobalAddress(GV, Offset, GAFlags + 1);
27898bcb0991SDimitry Andric
279006c3fb27SDimitry Andric   if (!B.getMRI()->getRegClassOrNull(PCReg))
27918bcb0991SDimitry Andric     B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
27928bcb0991SDimitry Andric
27938bcb0991SDimitry Andric   if (PtrTy.getSizeInBits() == 32)
27948bcb0991SDimitry Andric     B.buildExtract(DstReg, PCReg, 0);
27958bcb0991SDimitry Andric   return true;
27968bcb0991SDimitry Andric }
27978bcb0991SDimitry Andric
27985f757f3fSDimitry Andric // Emit an ABS32_LO / ABS32_HI relocation stub.
27995f757f3fSDimitry Andric void AMDGPULegalizerInfo::buildAbsGlobalAddress(
28005f757f3fSDimitry Andric     Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
28015f757f3fSDimitry Andric     MachineRegisterInfo &MRI) const {
28025f757f3fSDimitry Andric   bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;
28035f757f3fSDimitry Andric
28045f757f3fSDimitry Andric   LLT S32 = LLT::scalar(32);
28055f757f3fSDimitry Andric
28065f757f3fSDimitry Andric   // Use the destination directly, if and only if we store the lower address
28075f757f3fSDimitry Andric   // part only and we don't have a register class being set.
28085f757f3fSDimitry Andric   Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
28095f757f3fSDimitry Andric                         ? DstReg
28105f757f3fSDimitry Andric                         : MRI.createGenericVirtualRegister(S32);
28115f757f3fSDimitry Andric
28125f757f3fSDimitry Andric   if (!MRI.getRegClassOrNull(AddrLo))
28135f757f3fSDimitry Andric     MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
28145f757f3fSDimitry Andric
28155f757f3fSDimitry Andric   // Write the lower half.
28165f757f3fSDimitry Andric   B.buildInstr(AMDGPU::S_MOV_B32)
28175f757f3fSDimitry Andric       .addDef(AddrLo)
28185f757f3fSDimitry Andric       .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
28195f757f3fSDimitry Andric
28205f757f3fSDimitry Andric   // If required, write the upper half as well.
28215f757f3fSDimitry Andric   if (RequiresHighHalf) {
28225f757f3fSDimitry Andric     assert(PtrTy.getSizeInBits() == 64 &&
28235f757f3fSDimitry Andric            "Must provide a 64-bit pointer type!");
28245f757f3fSDimitry Andric
28255f757f3fSDimitry Andric     Register AddrHi = MRI.createGenericVirtualRegister(S32);
28265f757f3fSDimitry Andric     MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
28275f757f3fSDimitry Andric
28285f757f3fSDimitry Andric     B.buildInstr(AMDGPU::S_MOV_B32)
28295f757f3fSDimitry Andric         .addDef(AddrHi)
28305f757f3fSDimitry Andric         .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_HI);
28315f757f3fSDimitry Andric
28325f757f3fSDimitry Andric     // Use the destination directly, if and only if we don't have a register
28335f757f3fSDimitry Andric     // class being set.
28345f757f3fSDimitry Andric     Register AddrDst = !MRI.getRegClassOrNull(DstReg)
28355f757f3fSDimitry Andric                            ? DstReg
28365f757f3fSDimitry Andric                            : MRI.createGenericVirtualRegister(LLT::scalar(64));
28375f757f3fSDimitry Andric
28385f757f3fSDimitry Andric     if (!MRI.getRegClassOrNull(AddrDst))
28395f757f3fSDimitry Andric       MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
28405f757f3fSDimitry Andric
28415f757f3fSDimitry Andric     B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
28425f757f3fSDimitry Andric
28435f757f3fSDimitry Andric     // If we created a new register for the destination, cast the result into
28445f757f3fSDimitry Andric     // the final output.
28455f757f3fSDimitry Andric     if (AddrDst != DstReg)
28465f757f3fSDimitry Andric       B.buildCast(DstReg, AddrDst);
28475f757f3fSDimitry Andric   } else if (AddrLo != DstReg) {
28485f757f3fSDimitry Andric     // If we created a new register for the destination, cast the result into
28495f757f3fSDimitry Andric     // the final output.
28505f757f3fSDimitry Andric     B.buildCast(DstReg, AddrLo);
28515f757f3fSDimitry Andric   }
28525f757f3fSDimitry Andric }
28535f757f3fSDimitry Andric
28548bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeGlobalValue(
28558bcb0991SDimitry Andric   MachineInstr &MI, MachineRegisterInfo &MRI,
28568bcb0991SDimitry Andric   MachineIRBuilder &B) const {
28578bcb0991SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
28588bcb0991SDimitry Andric   LLT Ty = MRI.getType(DstReg);
28598bcb0991SDimitry Andric   unsigned AS = Ty.getAddressSpace();
28608bcb0991SDimitry Andric
28618bcb0991SDimitry Andric   const GlobalValue *GV = MI.getOperand(1).getGlobal();
28628bcb0991SDimitry Andric   MachineFunction &MF = B.getMF();
28638bcb0991SDimitry Andric   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
28648bcb0991SDimitry Andric
28658bcb0991SDimitry Andric   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
2866fe6060f1SDimitry Andric     if (!MFI->isModuleEntryFunction() &&
2867fe6060f1SDimitry Andric         !GV->getName().equals("llvm.amdgcn.module.lds")) {
28688bcb0991SDimitry Andric       const Function &Fn = MF.getFunction();
28698bcb0991SDimitry Andric       DiagnosticInfoUnsupported BadLDSDecl(
28705ffd83dbSDimitry Andric         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
28715ffd83dbSDimitry Andric         DS_Warning);
28728bcb0991SDimitry Andric       Fn.getContext().diagnose(BadLDSDecl);
28735ffd83dbSDimitry Andric
28745ffd83dbSDimitry Andric       // We currently don't have a way to correctly allocate LDS objects that
28755ffd83dbSDimitry Andric       // aren't directly associated with a kernel. We do force inlining of
28765ffd83dbSDimitry Andric       // functions that use local objects. However, if these dead functions are
28775ffd83dbSDimitry Andric       // not eliminated, we don't want a compile time error. Just emit a warning
28785ffd83dbSDimitry Andric       // and a trap, since there should be no callable path here.
28795f757f3fSDimitry Andric       B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>());
28805ffd83dbSDimitry Andric       B.buildUndef(DstReg);
28815ffd83dbSDimitry Andric       MI.eraseFromParent();
28825ffd83dbSDimitry Andric       return true;
28838bcb0991SDimitry Andric     }
28848bcb0991SDimitry Andric
28858bcb0991SDimitry Andric     // TODO: We could emit code to handle the initialization somewhere.
2886349cc55cSDimitry Andric     // We ignore the initializer for now and legalize it to allow selection.
2887349cc55cSDimitry Andric     // The initializer will be rejected during assembly emission anyway.
28885ffd83dbSDimitry Andric     const SITargetLowering *TLI = ST.getTargetLowering();
28895ffd83dbSDimitry Andric     if (!TLI->shouldUseLDSConstAddress(GV)) {
28905ffd83dbSDimitry Andric       MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
28915ffd83dbSDimitry Andric       return true; // Leave in place.
28925ffd83dbSDimitry Andric     }
28935ffd83dbSDimitry Andric
2894e8d8bef9SDimitry Andric     if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2895e8d8bef9SDimitry Andric       Type *Ty = GV->getValueType();
2896e8d8bef9SDimitry Andric       // HIP uses an unsized array `extern __shared__ T s[]`, or a similar
2897e8d8bef9SDimitry Andric       // zero-sized type in other languages, to declare dynamic shared memory
2898e8d8bef9SDimitry Andric       // whose size is not known at compile time. Such arrays are allocated by
2899e8d8bef9SDimitry Andric       // the runtime and placed directly after the statically allocated ones,
2900e8d8bef9SDimitry Andric       // so they all share the same offset.
2901e8d8bef9SDimitry Andric       if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2902e8d8bef9SDimitry Andric         // Adjust alignment for that dynamic shared memory array.
290306c3fb27SDimitry Andric         MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV));
2904e8d8bef9SDimitry Andric         LLT S32 = LLT::scalar(32);
29055f757f3fSDimitry Andric         auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
2906e8d8bef9SDimitry Andric         B.buildIntToPtr(DstReg, Sz);
2907e8d8bef9SDimitry Andric         MI.eraseFromParent();
2908e8d8bef9SDimitry Andric         return true;
2909e8d8bef9SDimitry Andric       }
2910e8d8bef9SDimitry Andric     }
2911e8d8bef9SDimitry Andric
2912349cc55cSDimitry Andric     B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
2913349cc55cSDimitry Andric                                                    *cast<GlobalVariable>(GV)));
29148bcb0991SDimitry Andric     MI.eraseFromParent();
29158bcb0991SDimitry Andric     return true;
29168bcb0991SDimitry Andric   }
29178bcb0991SDimitry Andric
29185f757f3fSDimitry Andric   if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
29195f757f3fSDimitry Andric     buildAbsGlobalAddress(DstReg, Ty, B, GV, MRI);
29205f757f3fSDimitry Andric     MI.eraseFromParent();
29215f757f3fSDimitry Andric     return true;
29225f757f3fSDimitry Andric   }
29235f757f3fSDimitry Andric
29248bcb0991SDimitry Andric   const SITargetLowering *TLI = ST.getTargetLowering();
29258bcb0991SDimitry Andric
29268bcb0991SDimitry Andric   if (TLI->shouldEmitFixup(GV)) {
29278bcb0991SDimitry Andric     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
29288bcb0991SDimitry Andric     MI.eraseFromParent();
29298bcb0991SDimitry Andric     return true;
29308bcb0991SDimitry Andric   }
29318bcb0991SDimitry Andric
29328bcb0991SDimitry Andric   if (TLI->shouldEmitPCReloc(GV)) {
29338bcb0991SDimitry Andric     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
29348bcb0991SDimitry Andric     MI.eraseFromParent();
29358bcb0991SDimitry Andric     return true;
29368bcb0991SDimitry Andric   }
29378bcb0991SDimitry Andric
29388bcb0991SDimitry Andric   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
29398bcb0991SDimitry Andric   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
29408bcb0991SDimitry Andric
2941fe6060f1SDimitry Andric   LLT LoadTy = Ty.getSizeInBits() == 32 ?
PtrTy : Ty; 29428bcb0991SDimitry Andric MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 29438bcb0991SDimitry Andric MachinePointerInfo::getGOT(MF), 29448bcb0991SDimitry Andric MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 29458bcb0991SDimitry Andric MachineMemOperand::MOInvariant, 2946fe6060f1SDimitry Andric LoadTy, Align(8)); 29478bcb0991SDimitry Andric 29488bcb0991SDimitry Andric buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 29498bcb0991SDimitry Andric 29508bcb0991SDimitry Andric if (Ty.getSizeInBits() == 32) { 2951349cc55cSDimitry Andric // Truncate if this is a 32-bit constant address. 29528bcb0991SDimitry Andric auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 29538bcb0991SDimitry Andric B.buildExtract(DstReg, Load, 0); 29548bcb0991SDimitry Andric } else 29558bcb0991SDimitry Andric B.buildLoad(DstReg, GOTAddr, *GOTMMO); 29568bcb0991SDimitry Andric 29578bcb0991SDimitry Andric MI.eraseFromParent(); 29588bcb0991SDimitry Andric return true; 29598bcb0991SDimitry Andric } 29608bcb0991SDimitry Andric 2961e8d8bef9SDimitry Andric static LLT widenToNextPowerOf2(LLT Ty) { 2962e8d8bef9SDimitry Andric if (Ty.isVector()) 2963fe6060f1SDimitry Andric return Ty.changeElementCount( 2964fe6060f1SDimitry Andric ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements()))); 2965e8d8bef9SDimitry Andric return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2966e8d8bef9SDimitry Andric } 2967e8d8bef9SDimitry Andric 2968e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2969e8d8bef9SDimitry Andric MachineInstr &MI) const { 2970e8d8bef9SDimitry Andric MachineIRBuilder &B = Helper.MIRBuilder; 2971e8d8bef9SDimitry Andric MachineRegisterInfo &MRI = *B.getMRI(); 2972e8d8bef9SDimitry Andric GISelChangeObserver &Observer = Helper.Observer; 2973e8d8bef9SDimitry Andric 2974e8d8bef9SDimitry Andric Register PtrReg = MI.getOperand(1).getReg(); 2975e8d8bef9SDimitry Andric LLT PtrTy = MRI.getType(PtrReg); 2976e8d8bef9SDimitry Andric unsigned AddrSpace = PtrTy.getAddressSpace(); 2977e8d8bef9SDimitry Andric 2978e8d8bef9SDimitry Andric if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 29798bcb0991SDimitry Andric LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2980e8d8bef9SDimitry Andric auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 29818bcb0991SDimitry Andric Observer.changingInstr(MI); 29828bcb0991SDimitry Andric MI.getOperand(1).setReg(Cast.getReg(0)); 29838bcb0991SDimitry Andric Observer.changedInstr(MI); 29848bcb0991SDimitry Andric return true; 29858bcb0991SDimitry Andric } 29868bcb0991SDimitry Andric 2987fe6060f1SDimitry Andric if (MI.getOpcode() != AMDGPU::G_LOAD) 2988fe6060f1SDimitry Andric return false; 2989fe6060f1SDimitry Andric 2990e8d8bef9SDimitry Andric Register ValReg = MI.getOperand(0).getReg(); 2991e8d8bef9SDimitry Andric LLT ValTy = MRI.getType(ValReg); 2992e8d8bef9SDimitry Andric 299306c3fb27SDimitry Andric if (hasBufferRsrcWorkaround(ValTy)) { 299406c3fb27SDimitry Andric Observer.changingInstr(MI); 299506c3fb27SDimitry Andric castBufferRsrcFromV4I32(MI, B, MRI, 0); 299606c3fb27SDimitry Andric Observer.changedInstr(MI); 299706c3fb27SDimitry Andric return true; 299806c3fb27SDimitry Andric } 299906c3fb27SDimitry Andric 3000e8d8bef9SDimitry Andric MachineMemOperand *MMO = *MI.memoperands_begin(); 3001e8d8bef9SDimitry Andric const unsigned ValSize = ValTy.getSizeInBits(); 3002fe6060f1SDimitry Andric const LLT MemTy = MMO->getMemoryType(); 3003e8d8bef9SDimitry Andric const Align MemAlign = 
  const unsigned MemSize = MemTy.getSizeInBits();
  const uint64_t AlignInBits = 8 * MemAlign.value();

  // Widen non-power-of-2 loads to the alignment if needed.
  if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
    const unsigned WideMemSize = PowerOf2Ceil(MemSize);

    // This was already the correct extending load result type, so just adjust
    // the memory type.
    if (WideMemSize == ValSize) {
      MachineFunction &MF = B.getMF();

      MachineMemOperand *WideMMO =
          MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
      Observer.changingInstr(MI);
      MI.setMemRefs(MF, {WideMMO});
      Observer.changedInstr(MI);
      return true;
    }

    // Don't bother handling an edge case that should probably never be
    // produced.
    if (ValSize > WideMemSize)
      return false;

    LLT WideTy = widenToNextPowerOf2(ValTy);

    Register WideLoad;
    if (!WideTy.isVector()) {
      WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
      B.buildTrunc(ValReg, WideLoad).getReg(0);
    } else {
      // Extract the subvector.

      if (isRegisterType(ValTy)) {
        // If this is a case where G_EXTRACT is legal, use it.
        // (e.g. <3 x s32> -> <4 x s32>)
        WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
        B.buildExtract(ValReg, WideLoad, 0);
      } else {
        // For cases where the widened type isn't a nice register value,
        // unmerge from a widened register (e.g. <3 x s16> -> <4 x s16>)
        WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
        B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
      }
    }

    MI.eraseFromParent();
    return true;
  }

  return false;
}

bool AMDGPULegalizerInfo::legalizeStore(LegalizerHelper &Helper,
                                        MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();
  GISelChangeObserver &Observer = Helper.Observer;

  Register DataReg = MI.getOperand(0).getReg();
  LLT DataTy = MRI.getType(DataReg);

  if (hasBufferRsrcWorkaround(DataTy)) {
    Observer.changingInstr(MI);
    castBufferRsrcArgToV4I32(MI, B, 0);
    Observer.changedInstr(MI);
    return true;
  }
  return false;
}

bool AMDGPULegalizerInfo::legalizeFMad(
    MachineInstr &MI, MachineRegisterInfo &MRI,
    MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  assert(Ty.isScalar());

  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // TODO: Always legal with future ftz flag.
  // FIXME: Do we only need to check the output denormal mode?
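  // The hardware MAD/MAC instructions flush denormals, so G_FMAD is only kept
  // legal here when the mode flushes (preserve-sign) anyway; otherwise it is
  // lowered to separate fmul and fadd below.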
  if (Ty == LLT::float32() &&
      MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign())
    return true;
  if (Ty == LLT::float16() &&
      MFI->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign())
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
}

bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register PtrReg = MI.getOperand(1).getReg();
  Register CmpVal = MI.getOperand(2).getReg();
  Register NewVal = MI.getOperand(3).getReg();

  assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
         "this should not have been custom lowered");

  LLT ValTy = MRI.getType(CmpVal);
  LLT VecTy = LLT::fixed_vector(2, ValTy);

  Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);

  B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
    .addDef(DstReg)
    .addUse(PtrReg)
    .addUse(PackedVal)
    .setMemRefs(MI.memoperands());

  MI.eraseFromParent();
  return true;
}

/// Return true if it's known that \p Src can never be an f32 denormal value.
static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI,
                                       Register Src) {
  const MachineInstr *DefMI = MRI.getVRegDef(Src);
  switch (DefMI->getOpcode()) {
  case TargetOpcode::G_INTRINSIC: {
    switch (cast<GIntrinsic>(DefMI)->getIntrinsicID()) {
    case Intrinsic::amdgcn_frexp_mant:
      return true;
    default:
      break;
    }

    break;
  }
  case TargetOpcode::G_FFREXP: {
    if (DefMI->getOperand(0).getReg() == Src)
      return true;
    break;
  }
  case TargetOpcode::G_FPEXT: {
    return MRI.getType(DefMI->getOperand(1).getReg()) == LLT::scalar(16);
  }
  default:
    return false;
  }

  return false;
}

static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
  if (Flags & MachineInstr::FmAfn)
    return true;
  const auto &Options = MF.getTarget().Options;
  return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
}

static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src,
                                   unsigned Flags) {
  return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src) &&
         MF.getDenormalMode(APFloat::IEEEsingle()).Input !=
             DenormalMode::PreserveSign;
}

std::pair<Register, Register>
AMDGPULegalizerInfo::getScaledLogInput(MachineIRBuilder &B, Register Src,
                                       unsigned Flags) const {
  if (!needsDenormHandlingF32(B.getMF(), Src, Flags))
    return {};

  const LLT F32 = LLT::scalar(32);
  auto SmallestNormal = B.buildFConstant(
      F32, APFloat::getSmallestNormalized(APFloat::IEEEsingle()));
  auto IsLtSmallestNormal =
      B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal);

  auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
  auto One = B.buildFConstant(F32, 1.0);
  auto ScaleFactor =
      B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
  auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);

  return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
}

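// Worked example of the scaling in getScaledLogInput: for a denormal input
// x = 0x1.0p-130, v_log_f32 would flush x to zero and return -inf. Instead,
// compute log2(x * 0x1.0p+32) = log2(0x1.0p-98) = -98.0, then subtract 32.0
// from the result to recover log2(x) = -130.0.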
bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI,
                                        MachineIRBuilder &B) const {
  // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
  // If we have to handle denormals, scale up the input and adjust the result.

  // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
  // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Flags = MI.getFlags();

  if (Ty == LLT::scalar(16)) {
    const LLT F32 = LLT::scalar(32);
    // Nothing in half is a denormal when promoted to f32.
    auto Ext = B.buildFPExt(F32, Src, Flags);
    auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
                    .addUse(Ext.getReg(0))
                    .setMIFlags(Flags);
    B.buildFPTrunc(Dst, Log2, Flags);
    MI.eraseFromParent();
    return true;
  }

  assert(Ty == LLT::scalar(32));

  auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
  if (!ScaledInput) {
    B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
        .addUse(Src)
        .setMIFlags(Flags);
    MI.eraseFromParent();
    return true;
  }

  auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
                  .addUse(ScaledInput)
                  .setMIFlags(Flags);

  auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
  auto Zero = B.buildFConstant(Ty, 0.0);
  auto ResultOffset =
      B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
  B.buildFSub(Dst, Log2, ResultOffset, Flags);

  MI.eraseFromParent();
  return true;
}

static Register getMad(MachineIRBuilder &B, LLT Ty, Register X, Register Y,
                       Register Z, unsigned Flags) {
  auto FMul = B.buildFMul(Ty, X, Y, Flags);
  return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);
}

bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
                                             MachineIRBuilder &B) const {
  const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
  assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);

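  // The precise path below computes log(x) as log2(x) * ln(2) (respectively
  // log10(x) as log2(x) * log10(2)), with the constant split into head and
  // tail parts (c + cc, or ch + ct without fast FMA) so the product carries
  // extra precision beyond a plain f32 multiply.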
  MachineRegisterInfo &MRI = *B.getMRI();
  Register Dst = MI.getOperand(0).getReg();
  Register X = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();
  const LLT Ty = MRI.getType(X);
  MachineFunction &MF = B.getMF();

  const LLT F32 = LLT::scalar(32);
  const LLT F16 = LLT::scalar(16);

  const AMDGPUTargetMachine &TM =
      static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn) ||
      TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) {
    if (Ty == F16 && !ST.has16BitInsts()) {
      Register LogVal = MRI.createGenericVirtualRegister(F32);
      auto PromoteSrc = B.buildFPExt(F32, X);
      legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), IsLog10, Flags);
      B.buildFPTrunc(Dst, LogVal);
    } else {
      legalizeFlogUnsafe(B, Dst, X, IsLog10, Flags);
    }

    MI.eraseFromParent();
    return true;
  }

  auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags);
  if (ScaledInput)
    X = ScaledInput;

  auto Y =
      B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);

  Register R;
  if (ST.hasFastFMAF32()) {
    // c + cc is ln(2)/ln(10) to more than 49 bits
    const float c_log10 = 0x1.344134p-2f;
    const float cc_log10 = 0x1.09f79ep-26f;

    // c + cc is ln(2) to more than 49 bits
    const float c_log = 0x1.62e42ep-1f;
    const float cc_log = 0x1.efa39ep-25f;

    auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
    auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
    R = B.buildFMul(Ty, Y, C, Flags).getReg(0);
    auto NegR = B.buildFNeg(Ty, R, Flags);
    auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags);
    auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags);
    R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0);
  } else {
    // ch + ct is ln(2)/ln(10) to more than 36 bits
    const float ch_log10 = 0x1.344000p-2f;
    const float ct_log10 = 0x1.3509f6p-18f;

    // ch + ct is ln(2) to more than 36 bits
    const float ch_log = 0x1.62e000p-1f;
    const float ct_log = 0x1.0bfbe8p-15f;

    auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
    auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);

    auto MaskConst = B.buildConstant(Ty, 0xfffff000);
    auto YH = B.buildAnd(Ty, Y, MaskConst);
    auto YT = B.buildFSub(Ty, Y, YH, Flags);
    auto YTCT = B.buildFMul(Ty, YT, CT, Flags);

    Register Mad0 =
        getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags);
    Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, Flags);
    R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags);
  }

  const bool IsFiniteOnly =
      (MI.getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) &&
      (MI.getFlag(MachineInstr::FmNoInfs) || TM.Options.NoInfsFPMath);

  if (!IsFiniteOnly) {
    // Expand isfinite(x) => fabs(x) < inf
    auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
    auto Fabs = B.buildFAbs(Ty, Y);
    auto IsFinite =
        B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
    R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);
  }

  if (ScaledInput) {
    auto Zero = B.buildFConstant(Ty, 0.0);
    auto ShiftK =
        B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
    auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
    B.buildFSub(Dst, R, Shift, Flags);
  } else {
    B.buildCopy(Dst, R);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst,
                                             Register Src, bool IsLog10,
                                             unsigned Flags) const {
  const double Log2BaseInverted =
      IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;

  LLT Ty = B.getMRI()->getType(Dst);

  if (Ty == LLT::scalar(32)) {
    auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
    if (ScaledInput) {
      auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
                        .addUse(Src)
                        .setMIFlags(Flags);
      auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
      auto Zero = B.buildFConstant(Ty, 0.0);
      auto ResultOffset =
          B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
      auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);

      if (ST.hasFastFMAF32())
        B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
      else {
        auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
        B.buildFAdd(Dst, Mul, ResultOffset, Flags);
      }

      return true;
    }
  }

  auto Log2Operand = Ty == LLT::scalar(16)
                         ? B.buildFLog2(Ty, Src, Flags)
                         : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
                               .addUse(Src)
                               .setMIFlags(Flags);
  auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
  B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI,
                                        MachineIRBuilder &B) const {
  // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
  // If we have to handle denormals, scale up the input and adjust the result.
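  // e.g. exp2(-130.0) = 0x1.0p-130, a denormal that v_exp_f32 would flush to
  // zero; computing v_exp_f32(-130.0 + 64.0) = 0x1.0p-66 and multiplying by
  // 0x1.0p-64 recovers the exact 0x1.0p-130 result.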

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);
  const LLT F16 = LLT::scalar(16);
  const LLT F32 = LLT::scalar(32);

  if (Ty == F16) {
    // Nothing in half is a denormal when promoted to f32.
    auto Ext = B.buildFPExt(F32, Src, Flags);
    auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
                    .addUse(Ext.getReg(0))
                    .setMIFlags(Flags);
    B.buildFPTrunc(Dst, Exp2, Flags);
    MI.eraseFromParent();
    return true;
  }

  assert(Ty == F32);

  if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) {
    B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
        .addUse(Src)
        .setMIFlags(Flags);
    MI.eraseFromParent();
    return true;
  }

  // bool needs_scaling = x < -0x1.f80000p+6f;
  // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);

  // -nextafter(128.0, -1)
  auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
  auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src,
                                  RangeCheckConst, Flags);

  auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
  auto Zero = B.buildFConstant(Ty, 0.0);
  auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
  auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);

  auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                  .addUse(AddInput.getReg(0))
                  .setMIFlags(Flags);

  auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
  auto One = B.buildFConstant(Ty, 1.0);
  auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
  B.buildFMul(Dst, Exp2, ResultScale, Flags);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
                                             Register X, unsigned Flags) const {
  LLT Ty = B.getMRI()->getType(Dst);
  LLT F32 = LLT::scalar(32);

  if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
    auto Log2E = B.buildFConstant(Ty, numbers::log2e);
    auto Mul = B.buildFMul(Ty, X, Log2E, Flags);

    if (Ty == F32) {
      B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
          .addUse(Mul.getReg(0))
          .setMIFlags(Flags);
    } else {
      B.buildFExp2(Dst, Mul.getReg(0), Flags);
    }

    return true;
  }

  auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
  auto NeedsScaling =
      B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags);
  auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
  auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
  auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);

  auto Log2E = B.buildFConstant(Ty, numbers::log2e);
  auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);

  auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                  .addUse(ExpInput.getReg(0))
                  .setMIFlags(Flags);

  auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
  auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
  B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
  return true;
}

bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
                                       MachineIRBuilder &B) const {
  Register Dst = MI.getOperand(0).getReg();
  Register X = MI.getOperand(1).getReg();
  const unsigned Flags = MI.getFlags();
  MachineFunction &MF = B.getMF();
  MachineRegisterInfo &MRI = *B.getMRI();
  LLT Ty = MRI.getType(Dst);
  const LLT F16 = LLT::scalar(16);
  const LLT F32 = LLT::scalar(32);
  const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;

  if (Ty == F16) {
    // v_exp_f16 (fmul x, log2e)
    if (allowApproxFunc(MF, Flags)) {
      // TODO: Does this really require fast?
      legalizeFExpUnsafe(B, Dst, X, Flags);
      MI.eraseFromParent();
      return true;
    }

    // exp(f16 x) ->
    //   fptrunc (v_exp_f32 (fmul (fpext x), log2e))

    // Nothing in half is a denormal when promoted to f32.
    auto Ext = B.buildFPExt(F32, X, Flags);
    Register Lowered = MRI.createGenericVirtualRegister(F32);
    legalizeFExpUnsafe(B, Lowered, Ext.getReg(0), Flags);
    B.buildFPTrunc(Dst, Lowered, Flags);
    MI.eraseFromParent();
    return true;
  }

  assert(Ty == F32);

  // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
  // library behavior. Also, is known-not-daz source sufficient?
  if (allowApproxFunc(MF, Flags)) {
    legalizeFExpUnsafe(B, Dst, X, Flags);
    MI.eraseFromParent();
    return true;
  }

  // Algorithm:
  //
  // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
  //
  // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
  // n = 64*m + j,   0 <= j < 64
  //
  // e^x = 2^((64*m + j + f)/64)
  //     = (2^m) * (2^(j/64)) * 2^(f/64)
  //     = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
  //
  // f = x*(64/ln(2)) - n
  // r = f*(ln(2)/64) = x - n*(ln(2)/64)
  //
  // e^x = (2^m) * (2^(j/64)) * e^r
  //
  // (2^(j/64)) is precomputed
  //
  // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
  // e^r = 1 + q
  //
  // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
  //
  // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
  const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
  Register PH, PL;

  if (ST.hasFastFMAF32()) {
    const float c_exp = numbers::log2ef;
    const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
    const float c_exp10 = 0x1.a934f0p+1f;
    const float cc_exp10 = 0x1.2f346ep-24f;

    auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
    PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
    auto NegPH = B.buildFNeg(Ty, PH, Flags);
    auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);

    auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
    PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);
  } else {
    const float ch_exp = 0x1.714000p+0f;
    const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits

    const float ch_exp10 = 0x1.a92000p+1f;
    const float cl_exp10 = 0x1.4f0978p-11f;

    auto MaskConst = B.buildConstant(Ty, 0xfffff000);
    auto XH = B.buildAnd(Ty, X, MaskConst);
    auto XL = B.buildFSub(Ty, X, XH, Flags);

    auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
    PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);

    auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
    auto XLCL = B.buildFMul(Ty, XL, CL, Flags);

    Register Mad0 =
        getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
    PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
  }

  auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);

  // It is unsafe to contract this fsub into the PH multiply.
  auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
  auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);
  auto IntE = B.buildFPTOSI(LLT::scalar(32), E);

  auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                  .addUse(A.getReg(0))
                  .setMIFlags(Flags);
  auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);

  auto UnderflowCheckConst =
      B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
  auto Zero = B.buildFConstant(Ty, 0.0);
  auto Underflow =
      B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst);

  R = B.buildSelect(Ty, Underflow, Zero, R);

  const auto &Options = MF.getTarget().Options;

  if (!(Flags & MachineInstr::FmNoInfs) && !Options.NoInfsFPMath) {
    auto OverflowCheckConst =
        B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);

    auto Overflow =
        B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst);
    auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
    R = B.buildSelect(Ty, Overflow, Inf, R, Flags);
  }

  B.buildCopy(Dst, R);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
                                       MachineIRBuilder &B) const {
  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();
  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);
  const LLT F16 = LLT::float16();
  const LLT F32 = LLT::float32();

  if (Ty == F32) {
    auto Log = B.buildFLog2(F32, Src0, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
                   .addUse(Log.getReg(0))
                   .addUse(Src1)
                   .setMIFlags(Flags);
    B.buildFExp2(Dst, Mul, Flags);
  } else if (Ty == F16) {
    // There's no f16 fmul_legacy, so we need to convert for it.
    auto Log = B.buildFLog2(F16, Src0, Flags);
    auto Ext0 = B.buildFPExt(F32, Log, Flags);
    auto Ext1 = B.buildFPExt(F32, Src1, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
                   .addUse(Ext0.getReg(0))
                   .addUse(Ext1.getReg(0))
                   .setMIFlags(Flags);
    B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);
  } else
    return false;

  MI.eraseFromParent();
  return true;
}

// Find a source register, ignoring any possible source modifiers.
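// e.g. for fneg(fabs(x)) this returns x; stripping the modifiers here keeps
// the pattern visible for source-modifier folding in the users built below.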
static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
  Register ModSrc = OrigSrc;
  if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
    ModSrc = SrcFNeg->getOperand(1).getReg();
    if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
      ModSrc = SrcFAbs->getOperand(1).getReg();
  } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
    ModSrc = SrcFAbs->getOperand(1).getReg();
  return ModSrc;
}

bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {

  const LLT S1 = LLT::scalar(1);
  const LLT F64 = LLT::float64();
  Register Dst = MI.getOperand(0).getReg();
  Register OrigSrc = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();
  assert(ST.hasFractBug() && MRI.getType(Dst) == F64 &&
         "this should not have been custom lowered");

  // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
  // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
  // efficient way to implement it is using V_FRACT_F64. The workaround for the
  // V_FRACT bug is:
  //    fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
  //
  // Convert floor(x) to (x - fract(x))

  auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})
                   .addUse(OrigSrc)
                   .setMIFlags(Flags);

  // Give source modifier matching some assistance before obscuring a foldable
  // pattern.

  // TODO: We can avoid the neg on the fract? The input sign to fract
  // shouldn't matter?
  Register ModSrc = stripAnySourceMods(OrigSrc, MRI);

  auto Const =
      B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff));

  Register Min = MRI.createGenericVirtualRegister(F64);

  // We don't need to concern ourselves with the snan handling difference, so
  // use the one which will directly select.
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (MFI->getMode().IEEE)
    B.buildFMinNumIEEE(Min, Fract, Const, Flags);
  else
    B.buildFMinNum(Min, Fract, Const, Flags);

  Register CorrectedFract = Min;
  if (!MI.getFlag(MachineInstr::FmNoNans)) {
    auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
    CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);
  }

  auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
  B.buildFAdd(Dst, OrigSrc, NegFract, Flags);

  MI.eraseFromParent();
  return true;
}

// Turn an illegal packed v2s16 build vector into bit operations.
// TODO: This should probably be a bitcast action in LegalizerHelper.
bool AMDGPULegalizerInfo::legalizeBuildVector(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  Register Dst = MI.getOperand(0).getReg();
  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);
  assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));

  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();

  if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
    assert(MRI.getType(Src0) == S32);
    Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
    Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
  }

  auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
  B.buildBitcast(Dst, Merge);

  MI.eraseFromParent();
  return true;
}

// Build a big integer multiply or multiply-add using MAD_64_32 instructions.
//
// Source and accumulation registers must all be 32 bits wide.
//
// TODO: When the multiply is uniform, we should produce a code sequence
// that is better suited to instruction selection on the SALU. Instead of
// the outer loop going over parts of the result, the outer loop should go
// over parts of one of the factors. This should result in instruction
// selection that makes full use of S_ADDC_U32 instructions.
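//
// Illustrative 64 x 64 -> 64 case (NumParts == 2), writing lo/hi for the two
// halves of a 32 x 32 -> 64 partial product:
//   Accum[0] = lo(Src0[0]*Src1[0])
//   Accum[1] = hi(Src0[0]*Src1[0]) + lo(Src0[0]*Src1[1]) + lo(Src0[1]*Src1[0])
// The first two terms come from a single MAD_64_32; the remaining products
// only need their low halves, i.e. plain 32-bit multiplies.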
void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
                                        MutableArrayRef<Register> Accum,
                                        ArrayRef<Register> Src0,
                                        ArrayRef<Register> Src1,
                                        bool UsePartialMad64_32,
                                        bool SeparateOddAlignedProducts) const {
  // Use (possibly empty) vectors of S1 registers to represent the set of
  // carries from one pair of positions to the next.
  using Carry = SmallVector<Register, 2>;

  MachineIRBuilder &B = Helper.MIRBuilder;
  GISelKnownBits &KB = *Helper.getKnownBits();

  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Zero32;
  Register Zero64;

  auto getZero32 = [&]() -> Register {
    if (!Zero32)
      Zero32 = B.buildConstant(S32, 0).getReg(0);
    return Zero32;
  };
  auto getZero64 = [&]() -> Register {
    if (!Zero64)
      Zero64 = B.buildConstant(S64, 0).getReg(0);
    return Zero64;
  };

  SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
  for (unsigned i = 0; i < Src0.size(); ++i) {
    Src0KnownZeros.push_back(KB.getKnownBits(Src0[i]).isZero());
    Src1KnownZeros.push_back(KB.getKnownBits(Src1[i]).isZero());
  }

  // Merge the given carries into the 32-bit LocalAccum, which is modified
  // in-place.
  //
  // Returns the carry-out, which is a single S1 register or null.
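  //
  // Multiple pending carries are first summed into a carry accumulator (each
  // uadde folds one carry bit into the running total), and a final uadde adds
  // that total to LocalAccum, producing at most a single carry-out.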
  auto mergeCarry =
      [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
        if (CarryIn.empty())
          return Register();

        bool HaveCarryOut = true;
        Register CarryAccum;
        if (CarryIn.size() == 1) {
          if (!LocalAccum) {
            LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
            return Register();
          }

          CarryAccum = getZero32();
        } else {
          CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
          for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
            CarryAccum =
                B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
                    .getReg(0);
          }

          if (!LocalAccum) {
            LocalAccum = getZero32();
            HaveCarryOut = false;
          }
        }

        auto Add =
            B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
        LocalAccum = Add.getReg(0);
        return HaveCarryOut ? Add.getReg(1) : Register();
      };

  // Build a multiply-add chain to compute
  //
  //   LocalAccum + (partial products at DstIndex)
  //              + (opportunistic subset of CarryIn)
  //
  // LocalAccum is an array of one or two 32-bit registers that are updated
  // in-place. The incoming registers may be null.
  //
  // In some edge cases, carry-ins can be consumed "for free". In that case,
  // the consumed carry bits are removed from CarryIn in-place.
  auto buildMadChain =
      [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
          -> Carry {
        assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
               (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));

        Carry CarryOut;
        unsigned j0 = 0;

        // Use plain 32-bit multiplication for the most significant part of the
        // result by default.
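        // (A single-register LocalAccum is the topmost part of the result, so
        // the high halves of these products fall outside the result width.)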
        if (LocalAccum.size() == 1 &&
            (!UsePartialMad64_32 || !CarryIn.empty())) {
          do {
            // Skip multiplication if one of the operands is 0
            unsigned j1 = DstIndex - j0;
            if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
              ++j0;
              continue;
            }
            auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
            if (!LocalAccum[0] || KB.getKnownBits(LocalAccum[0]).isZero()) {
              LocalAccum[0] = Mul.getReg(0);
            } else {
              if (CarryIn.empty()) {
                LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
              } else {
                LocalAccum[0] =
                    B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
                        .getReg(0);
                CarryIn.pop_back();
              }
            }
            ++j0;
          } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
        }

        // Build full 64-bit multiplies.
        if (j0 <= DstIndex) {
          bool HaveSmallAccum = false;
          Register Tmp;

          if (LocalAccum[0]) {
            if (LocalAccum.size() == 1) {
              Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
              HaveSmallAccum = true;
            } else if (LocalAccum[1]) {
              Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
              HaveSmallAccum = false;
            } else {
              Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
              HaveSmallAccum = true;
            }
          } else {
            assert(LocalAccum.size() == 1 || !LocalAccum[1]);
            Tmp = getZero64();
            HaveSmallAccum = true;
          }

          do {
            unsigned j1 = DstIndex - j0;
            if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
              ++j0;
              continue;
            }
            auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
                                    {Src0[j0], Src1[j1], Tmp});
            Tmp = Mad.getReg(0);
            if (!HaveSmallAccum)
              CarryOut.push_back(Mad.getReg(1));
            HaveSmallAccum = false;

            ++j0;
          } while (j0 <= DstIndex);

          auto Unmerge = B.buildUnmerge(S32, Tmp);
          LocalAccum[0] = Unmerge.getReg(0);
          if (LocalAccum.size() > 1)
            LocalAccum[1] = Unmerge.getReg(1);
        }

        return CarryOut;
      };

  // Outer multiply loop, iterating over destination parts from least
  // significant to most significant parts.
  //
  // The columns of the following diagram correspond to the destination parts
  // affected by one iteration of the outer loop (ignoring boundary
  // conditions).
  //
  //   Dest index relative to 2 * i:     1 0 -1
  //                                     ------
  //   Carries from previous iteration:    e o
  //   Even-aligned partial product sum: E E .
  //   Odd-aligned partial product sum:    O O
  //
  // 'o' is OddCarry, 'e' is EvenCarry.
  // EE and OO are computed from partial products via buildMadChain and use
  // accumulation where possible and appropriate.
  //
  Register SeparateOddCarry;
  Carry EvenCarry;
  Carry OddCarry;

  for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
    Carry OddCarryIn = std::move(OddCarry);
    Carry EvenCarryIn = std::move(EvenCarry);
    OddCarry.clear();
    EvenCarry.clear();

    // Partial products at offset 2 * i.
    if (2 * i < Accum.size()) {
      auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
      EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
    }

    // Partial products at offset 2 * i - 1.
    if (i > 0) {
      if (!SeparateOddAlignedProducts) {
        auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
        OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
      } else {
        bool IsHighest = 2 * i >= Accum.size();
        Register SeparateOddOut[2];
        auto LocalAccum = MutableArrayRef(SeparateOddOut)
                              .take_front(IsHighest ? 1 : 2);
        OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);

        MachineInstr *Lo;

        if (i == 1) {
          if (!IsHighest)
            Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
          else
            Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
        } else {
          Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
                            SeparateOddCarry);
        }
        Accum[2 * i - 1] = Lo->getOperand(0).getReg();

        if (!IsHighest) {
          auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
                                 Lo->getOperand(1).getReg());
          Accum[2 * i] = Hi.getReg(0);
          SeparateOddCarry = Hi.getReg(1);
        }
      }
    }

    // Add in the carries from the previous iteration
    if (i > 0) {
      if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
        EvenCarryIn.push_back(CarryOut);

      if (2 * i < Accum.size()) {
        if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
          OddCarry.push_back(CarryOut);
      }
    }
  }
}

// Custom narrowing of wide multiplies using wide multiply-add instructions.
//
// TODO: If the multiply is followed by an addition, we should attempt to
// integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
                                      MachineInstr &MI) const {
  assert(ST.hasMad64_32());
  assert(MI.getOpcode() == TargetOpcode::G_MUL);

  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  Register DstReg = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();

  LLT Ty = MRI.getType(DstReg);
  assert(Ty.isScalar());

  unsigned Size = Ty.getSizeInBits();
  unsigned NumParts = Size / 32;
  assert((Size % 32) == 0);
  assert(NumParts >= 2);

  // Whether to use MAD_64_32 for partial products whose high half is
  // discarded. This avoids some ADD instructions but risks false dependency
This avoids some ADD instructions but risks false dependency 404681ad6265SDimitry Andric // stalls on some subtargets in some cases. 404781ad6265SDimitry Andric const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10; 404881ad6265SDimitry Andric 404981ad6265SDimitry Andric // Whether to compute odd-aligned partial products separately. This is 405081ad6265SDimitry Andric // advisable on subtargets where the accumulator of MAD_64_32 must be placed 405181ad6265SDimitry Andric // in an even-aligned VGPR. 405281ad6265SDimitry Andric const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops(); 405381ad6265SDimitry Andric 405481ad6265SDimitry Andric LLT S32 = LLT::scalar(32); 405581ad6265SDimitry Andric SmallVector<Register, 2> Src0Parts, Src1Parts; 405681ad6265SDimitry Andric for (unsigned i = 0; i < NumParts; ++i) { 405781ad6265SDimitry Andric Src0Parts.push_back(MRI.createGenericVirtualRegister(S32)); 405881ad6265SDimitry Andric Src1Parts.push_back(MRI.createGenericVirtualRegister(S32)); 405981ad6265SDimitry Andric } 406081ad6265SDimitry Andric B.buildUnmerge(Src0Parts, Src0); 406181ad6265SDimitry Andric B.buildUnmerge(Src1Parts, Src1); 406281ad6265SDimitry Andric 406381ad6265SDimitry Andric SmallVector<Register, 2> AccumRegs(NumParts); 406481ad6265SDimitry Andric buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32, 406581ad6265SDimitry Andric SeparateOddAlignedProducts); 406681ad6265SDimitry Andric 4067bdd1243dSDimitry Andric B.buildMergeLikeInstr(DstReg, AccumRegs); 406881ad6265SDimitry Andric MI.eraseFromParent(); 406981ad6265SDimitry Andric return true; 407081ad6265SDimitry Andric } 407181ad6265SDimitry Andric 4072349cc55cSDimitry Andric // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to 4073349cc55cSDimitry Andric // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input 4074349cc55cSDimitry Andric // case with a single min instruction instead of a compare+select. 4075349cc55cSDimitry Andric bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI, 4076349cc55cSDimitry Andric MachineRegisterInfo &MRI, 4077349cc55cSDimitry Andric MachineIRBuilder &B) const { 4078349cc55cSDimitry Andric Register Dst = MI.getOperand(0).getReg(); 4079349cc55cSDimitry Andric Register Src = MI.getOperand(1).getReg(); 4080349cc55cSDimitry Andric LLT DstTy = MRI.getType(Dst); 4081349cc55cSDimitry Andric LLT SrcTy = MRI.getType(Src); 4082349cc55cSDimitry Andric 4083349cc55cSDimitry Andric unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ 4084349cc55cSDimitry Andric ? 
AMDGPU::G_AMDGPU_FFBH_U32 4085349cc55cSDimitry Andric : AMDGPU::G_AMDGPU_FFBL_B32; 4086349cc55cSDimitry Andric auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src}); 4087349cc55cSDimitry Andric B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits())); 4088349cc55cSDimitry Andric 4089349cc55cSDimitry Andric MI.eraseFromParent(); 4090349cc55cSDimitry Andric return true; 4091349cc55cSDimitry Andric } 4092349cc55cSDimitry Andric 4093e8d8bef9SDimitry Andric // Check that this is a G_XOR x, -1 4094e8d8bef9SDimitry Andric static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 4095e8d8bef9SDimitry Andric if (MI.getOpcode() != TargetOpcode::G_XOR) 4096e8d8bef9SDimitry Andric return false; 4097349cc55cSDimitry Andric auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 4098e8d8bef9SDimitry Andric return ConstVal && *ConstVal == -1; 4099e8d8bef9SDimitry Andric } 4100e8d8bef9SDimitry Andric 41010b57cec5SDimitry Andric // Return the use branch instruction, otherwise null if the usage is invalid. 4102e8d8bef9SDimitry Andric static MachineInstr * 4103e8d8bef9SDimitry Andric verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 4104e8d8bef9SDimitry Andric MachineBasicBlock *&UncondBrTarget, bool &Negated) { 41050b57cec5SDimitry Andric Register CondDef = MI.getOperand(0).getReg(); 41060b57cec5SDimitry Andric if (!MRI.hasOneNonDBGUse(CondDef)) 41070b57cec5SDimitry Andric return nullptr; 41080b57cec5SDimitry Andric 41095ffd83dbSDimitry Andric MachineBasicBlock *Parent = MI.getParent(); 4110e8d8bef9SDimitry Andric MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 4111e8d8bef9SDimitry Andric 4112e8d8bef9SDimitry Andric if (isNot(MRI, *UseMI)) { 4113e8d8bef9SDimitry Andric Register NegatedCond = UseMI->getOperand(0).getReg(); 4114e8d8bef9SDimitry Andric if (!MRI.hasOneNonDBGUse(NegatedCond)) 4115e8d8bef9SDimitry Andric return nullptr; 4116e8d8bef9SDimitry Andric 4117e8d8bef9SDimitry Andric // We're deleting the def of this value, so we need to remove it. 4118349cc55cSDimitry Andric eraseInstr(*UseMI, MRI); 4119e8d8bef9SDimitry Andric 4120e8d8bef9SDimitry Andric UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 4121e8d8bef9SDimitry Andric Negated = true; 4122e8d8bef9SDimitry Andric } 4123e8d8bef9SDimitry Andric 4124e8d8bef9SDimitry Andric if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 4125480093f4SDimitry Andric return nullptr; 4126480093f4SDimitry Andric 41275ffd83dbSDimitry Andric // Make sure the cond br is followed by a G_BR, or is the last instruction. 4128e8d8bef9SDimitry Andric MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 41295ffd83dbSDimitry Andric if (Next == Parent->end()) { 41305ffd83dbSDimitry Andric MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 41315ffd83dbSDimitry Andric if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 
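      // There is no machine basic block to fall through to, so no
      // unconditional branch target can be recovered.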
41325ffd83dbSDimitry Andric return nullptr; 41335ffd83dbSDimitry Andric UncondBrTarget = &*NextMBB; 41345ffd83dbSDimitry Andric } else { 4135480093f4SDimitry Andric if (Next->getOpcode() != AMDGPU::G_BR) 4136480093f4SDimitry Andric return nullptr; 4137480093f4SDimitry Andric Br = &*Next; 41385ffd83dbSDimitry Andric UncondBrTarget = Br->getOperand(0).getMBB(); 4139480093f4SDimitry Andric } 4140480093f4SDimitry Andric 4141e8d8bef9SDimitry Andric return UseMI; 41420b57cec5SDimitry Andric } 41430b57cec5SDimitry Andric 41440b57cec5SDimitry Andric bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 4145e8d8bef9SDimitry Andric const ArgDescriptor *Arg, 4146e8d8bef9SDimitry Andric const TargetRegisterClass *ArgRC, 4147e8d8bef9SDimitry Andric LLT ArgTy) const { 4148e8d8bef9SDimitry Andric MCRegister SrcReg = Arg->getRegister(); 4149e8d8bef9SDimitry Andric assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 41505ffd83dbSDimitry Andric assert(DstReg.isVirtual() && "Virtual register expected"); 41510b57cec5SDimitry Andric 415204eeddc0SDimitry Andric Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, 415304eeddc0SDimitry Andric *ArgRC, B.getDebugLoc(), ArgTy); 41540b57cec5SDimitry Andric if (Arg->isMasked()) { 41550b57cec5SDimitry Andric // TODO: Should we try to emit this once in the entry block? 41560b57cec5SDimitry Andric const LLT S32 = LLT::scalar(32); 41570b57cec5SDimitry Andric const unsigned Mask = Arg->getMask(); 415806c3fb27SDimitry Andric const unsigned Shift = llvm::countr_zero<unsigned>(Mask); 41590b57cec5SDimitry Andric 41608bcb0991SDimitry Andric Register AndMaskSrc = LiveIn; 41618bcb0991SDimitry Andric 416204eeddc0SDimitry Andric // TODO: Avoid clearing the high bits if we know workitem id y/z are always 416304eeddc0SDimitry Andric // 0. 41648bcb0991SDimitry Andric if (Shift != 0) { 41650b57cec5SDimitry Andric auto ShiftAmt = B.buildConstant(S32, Shift); 41668bcb0991SDimitry Andric AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 41678bcb0991SDimitry Andric } 41688bcb0991SDimitry Andric 41698bcb0991SDimitry Andric B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 41705ffd83dbSDimitry Andric } else { 41710b57cec5SDimitry Andric B.buildCopy(DstReg, LiveIn); 41720b57cec5SDimitry Andric } 41730b57cec5SDimitry Andric 41740b57cec5SDimitry Andric return true; 41750b57cec5SDimitry Andric } 41760b57cec5SDimitry Andric 4177e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::loadInputValue( 4178e8d8bef9SDimitry Andric Register DstReg, MachineIRBuilder &B, 4179e8d8bef9SDimitry Andric AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 4180e8d8bef9SDimitry Andric const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 4181*b3edf446SDimitry Andric const ArgDescriptor *Arg = nullptr; 4182e8d8bef9SDimitry Andric const TargetRegisterClass *ArgRC; 4183e8d8bef9SDimitry Andric LLT ArgTy; 4184*b3edf446SDimitry Andric 4185*b3edf446SDimitry Andric CallingConv::ID CC = B.getMF().getFunction().getCallingConv(); 4186*b3edf446SDimitry Andric const ArgDescriptor WorkGroupIDX = 4187*b3edf446SDimitry Andric ArgDescriptor::createRegister(AMDGPU::TTMP9); 4188*b3edf446SDimitry Andric // If GridZ is not programmed in an entry function then the hardware will set 4189*b3edf446SDimitry Andric // it to all zeros, so there is no need to mask the GridY value in the low 4190*b3edf446SDimitry Andric // order bits. 
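  // A sketch of the packing assumed below: workgroup ID X occupies all of
  // TTMP9, while TTMP7 holds Y in its low 16 bits and Z in its high 16 bits.
  // The second createRegister argument is the mask of bits holding the value,
  // so Z is extracted as roughly (ttmp7 & 0xffff0000) >> 16 by the masked
  // load in loadInputValue.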
4191*b3edf446SDimitry Andric const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister( 4192*b3edf446SDimitry Andric AMDGPU::TTMP7, 4193*b3edf446SDimitry Andric AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu); 4194*b3edf446SDimitry Andric const ArgDescriptor WorkGroupIDZ = 4195*b3edf446SDimitry Andric ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u); 4196*b3edf446SDimitry Andric if (ST.hasArchitectedSGPRs() && AMDGPU::isCompute(CC)) { 4197*b3edf446SDimitry Andric switch (ArgType) { 4198*b3edf446SDimitry Andric case AMDGPUFunctionArgInfo::WORKGROUP_ID_X: 4199*b3edf446SDimitry Andric Arg = &WorkGroupIDX; 4200*b3edf446SDimitry Andric ArgRC = &AMDGPU::SReg_32RegClass; 4201*b3edf446SDimitry Andric ArgTy = LLT::scalar(32); 4202*b3edf446SDimitry Andric break; 4203*b3edf446SDimitry Andric case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y: 4204*b3edf446SDimitry Andric Arg = &WorkGroupIDY; 4205*b3edf446SDimitry Andric ArgRC = &AMDGPU::SReg_32RegClass; 4206*b3edf446SDimitry Andric ArgTy = LLT::scalar(32); 4207*b3edf446SDimitry Andric break; 4208*b3edf446SDimitry Andric case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z: 4209*b3edf446SDimitry Andric Arg = &WorkGroupIDZ; 4210*b3edf446SDimitry Andric ArgRC = &AMDGPU::SReg_32RegClass; 4211*b3edf446SDimitry Andric ArgTy = LLT::scalar(32); 4212*b3edf446SDimitry Andric break; 4213*b3edf446SDimitry Andric default: 4214*b3edf446SDimitry Andric break; 4215*b3edf446SDimitry Andric } 4216*b3edf446SDimitry Andric } 4217*b3edf446SDimitry Andric 4218*b3edf446SDimitry Andric if (!Arg) 4219e8d8bef9SDimitry Andric std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 4220e8d8bef9SDimitry Andric 4221349cc55cSDimitry Andric if (!Arg) { 4222349cc55cSDimitry Andric if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) { 4223349cc55cSDimitry Andric // The intrinsic may appear when we have a 0 sized kernarg segment, in which 4224349cc55cSDimitry Andric // case the pointer argument may be missing and we use null. 4225349cc55cSDimitry Andric B.buildConstant(DstReg, 0); 4226349cc55cSDimitry Andric return true; 4227349cc55cSDimitry Andric } 4228349cc55cSDimitry Andric 4229349cc55cSDimitry Andric // It's undefined behavior if a function marked with the amdgpu-no-* 4230349cc55cSDimitry Andric // attributes uses the corresponding intrinsic. 
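    // buildUndef emits an explicit G_IMPLICIT_DEF, which keeps legalization
    // moving while making the undefined result visible in the MIR.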
4231349cc55cSDimitry Andric B.buildUndef(DstReg); 4232349cc55cSDimitry Andric return true; 4233349cc55cSDimitry Andric } 4234349cc55cSDimitry Andric 4235e8d8bef9SDimitry Andric if (!Arg->isRegister() || !Arg->getRegister().isValid()) 4236e8d8bef9SDimitry Andric return false; // TODO: Handle these 4237e8d8bef9SDimitry Andric return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 4238e8d8bef9SDimitry Andric } 4239e8d8bef9SDimitry Andric 42400b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 42415ffd83dbSDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 42420b57cec5SDimitry Andric AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 4243e8d8bef9SDimitry Andric if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 42445ffd83dbSDimitry Andric return false; 42455ffd83dbSDimitry Andric 42460b57cec5SDimitry Andric MI.eraseFromParent(); 42470b57cec5SDimitry Andric return true; 42480b57cec5SDimitry Andric } 42490b57cec5SDimitry Andric 425081ad6265SDimitry Andric static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, 425181ad6265SDimitry Andric int64_t C) { 425281ad6265SDimitry Andric B.buildConstant(MI.getOperand(0).getReg(), C); 425381ad6265SDimitry Andric MI.eraseFromParent(); 425481ad6265SDimitry Andric return true; 425581ad6265SDimitry Andric } 425681ad6265SDimitry Andric 425781ad6265SDimitry Andric bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic( 425881ad6265SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 425981ad6265SDimitry Andric unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 426081ad6265SDimitry Andric unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim); 426181ad6265SDimitry Andric if (MaxID == 0) 426281ad6265SDimitry Andric return replaceWithConstant(B, MI, 0); 426381ad6265SDimitry Andric 426481ad6265SDimitry Andric const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 426581ad6265SDimitry Andric const ArgDescriptor *Arg; 426681ad6265SDimitry Andric const TargetRegisterClass *ArgRC; 426781ad6265SDimitry Andric LLT ArgTy; 426881ad6265SDimitry Andric std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 426981ad6265SDimitry Andric 427081ad6265SDimitry Andric Register DstReg = MI.getOperand(0).getReg(); 427181ad6265SDimitry Andric if (!Arg) { 427281ad6265SDimitry Andric // It's undefined behavior if a function marked with the amdgpu-no-* 427381ad6265SDimitry Andric // attributes uses the corresponding intrinsic. 427481ad6265SDimitry Andric B.buildUndef(DstReg); 427581ad6265SDimitry Andric MI.eraseFromParent(); 427681ad6265SDimitry Andric return true; 427781ad6265SDimitry Andric } 427881ad6265SDimitry Andric 427981ad6265SDimitry Andric if (Arg->isMasked()) { 428081ad6265SDimitry Andric // Don't bother inserting AssertZext for packed IDs since we're emitting the 428181ad6265SDimitry Andric // masking operations anyway. 428281ad6265SDimitry Andric // 428381ad6265SDimitry Andric // TODO: We could assert the top bit is 0 for the source copy. 
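    // The packed layout assumed here is roughly X in bits [9:0], Y in [19:10]
    // and Z in [29:20] of a single input register, so the masked load below
    // expands to a shift plus an AND, e.g. for Y:
    //   %id.y = G_AND (G_LSHR %packed, 10), 0x3ff
    // (a sketch; the exact shift and mask come from the ArgDescriptor).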
428481ad6265SDimitry Andric if (!loadInputValue(DstReg, B, ArgType)) 428581ad6265SDimitry Andric return false; 428681ad6265SDimitry Andric } else { 428781ad6265SDimitry Andric Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32)); 428881ad6265SDimitry Andric if (!loadInputValue(TmpReg, B, ArgType)) 428981ad6265SDimitry Andric return false; 4290bdd1243dSDimitry Andric B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID)); 429181ad6265SDimitry Andric } 429281ad6265SDimitry Andric 429381ad6265SDimitry Andric MI.eraseFromParent(); 429481ad6265SDimitry Andric return true; 429581ad6265SDimitry Andric } 429681ad6265SDimitry Andric 429781ad6265SDimitry Andric Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B, 429881ad6265SDimitry Andric int64_t Offset) const { 429981ad6265SDimitry Andric LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 430081ad6265SDimitry Andric Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy); 430181ad6265SDimitry Andric 430281ad6265SDimitry Andric // TODO: If we passed in the base kernel offset we could have a better 430381ad6265SDimitry Andric // alignment than 4, but we don't really need it. 430481ad6265SDimitry Andric if (!loadInputValue(KernArgReg, B, 430581ad6265SDimitry Andric AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 430681ad6265SDimitry Andric llvm_unreachable("failed to find kernarg segment ptr"); 430781ad6265SDimitry Andric 430881ad6265SDimitry Andric auto COffset = B.buildConstant(LLT::scalar(64), Offset); 430981ad6265SDimitry Andric // TODO: Should get nuw 431081ad6265SDimitry Andric return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0); 431181ad6265SDimitry Andric } 431281ad6265SDimitry Andric 431381ad6265SDimitry Andric /// Legalize a value that's loaded from kernel arguments. This is only used by 431481ad6265SDimitry Andric /// legacy intrinsics. 
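/// The emitted sequence is roughly:
///   %ptr:p4 = G_PTR_ADD %kernarg.segment.ptr, Offset
///   %val:s32 = G_LOAD %ptr :: (dereferenceable invariant load (s32))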
431581ad6265SDimitry Andric bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI, 431681ad6265SDimitry Andric MachineIRBuilder &B, 431781ad6265SDimitry Andric uint64_t Offset, 431881ad6265SDimitry Andric Align Alignment) const { 431981ad6265SDimitry Andric Register DstReg = MI.getOperand(0).getReg(); 432081ad6265SDimitry Andric 432181ad6265SDimitry Andric assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) && 432281ad6265SDimitry Andric "unexpected kernarg parameter type"); 432381ad6265SDimitry Andric 432481ad6265SDimitry Andric Register Ptr = getKernargParameterPtr(B, Offset); 432581ad6265SDimitry Andric MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 432681ad6265SDimitry Andric B.buildLoad(DstReg, Ptr, PtrInfo, Align(4), 432781ad6265SDimitry Andric MachineMemOperand::MODereferenceable | 432881ad6265SDimitry Andric MachineMemOperand::MOInvariant); 432981ad6265SDimitry Andric MI.eraseFromParent(); 433081ad6265SDimitry Andric return true; 433181ad6265SDimitry Andric } 433281ad6265SDimitry Andric 43338bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 43348bcb0991SDimitry Andric MachineRegisterInfo &MRI, 43358bcb0991SDimitry Andric MachineIRBuilder &B) const { 4336480093f4SDimitry Andric Register Dst = MI.getOperand(0).getReg(); 4337480093f4SDimitry Andric LLT DstTy = MRI.getType(Dst); 4338480093f4SDimitry Andric LLT S16 = LLT::scalar(16); 4339480093f4SDimitry Andric LLT S32 = LLT::scalar(32); 4340480093f4SDimitry Andric LLT S64 = LLT::scalar(64); 43418bcb0991SDimitry Andric 4342480093f4SDimitry Andric if (DstTy == S16) 4343480093f4SDimitry Andric return legalizeFDIV16(MI, MRI, B); 4344480093f4SDimitry Andric if (DstTy == S32) 4345480093f4SDimitry Andric return legalizeFDIV32(MI, MRI, B); 4346480093f4SDimitry Andric if (DstTy == S64) 4347480093f4SDimitry Andric return legalizeFDIV64(MI, MRI, B); 4348480093f4SDimitry Andric 43498bcb0991SDimitry Andric return false; 43508bcb0991SDimitry Andric } 43518bcb0991SDimitry Andric 4352fe6060f1SDimitry Andric void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, 4353fe6060f1SDimitry Andric Register DstDivReg, 4354fe6060f1SDimitry Andric Register DstRemReg, 43555ffd83dbSDimitry Andric Register X, 4356fe6060f1SDimitry Andric Register Y) const { 43575ffd83dbSDimitry Andric const LLT S1 = LLT::scalar(1); 43585ffd83dbSDimitry Andric const LLT S32 = LLT::scalar(32); 43595ffd83dbSDimitry Andric 43605ffd83dbSDimitry Andric // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 43615ffd83dbSDimitry Andric // algorithm used here. 43625ffd83dbSDimitry Andric 43635ffd83dbSDimitry Andric // Initial estimate of inv(y). 43645ffd83dbSDimitry Andric auto FloatY = B.buildUITOFP(S32, Y); 43655ffd83dbSDimitry Andric auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 436606c3fb27SDimitry Andric auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe)); 43675ffd83dbSDimitry Andric auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 43685ffd83dbSDimitry Andric auto Z = B.buildFPTOUI(S32, ScaledY); 43695ffd83dbSDimitry Andric 43705ffd83dbSDimitry Andric // One round of UNR. 43715ffd83dbSDimitry Andric auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 43725ffd83dbSDimitry Andric auto NegYZ = B.buildMul(S32, NegY, Z); 43735ffd83dbSDimitry Andric Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 43745ffd83dbSDimitry Andric 43755ffd83dbSDimitry Andric // Quotient/remainder estimate. 
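  // A sketch of the fixup reasoning (see expandDivRem32 for the precise
  // argument): Z slightly underestimates 2^32 / Y after the UNR round, so
  //   Q = umulh(X, Z)
  // can fall short of the true quotient by at most 2. Each refinement below
  // compares R against Y and conditionally adds 1 to Q and subtracts Y from
  // R, so exactly two rounds suffice.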
43765ffd83dbSDimitry Andric auto Q = B.buildUMulH(S32, X, Z); 43775ffd83dbSDimitry Andric auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 43785ffd83dbSDimitry Andric 43795ffd83dbSDimitry Andric // First quotient/remainder refinement. 43805ffd83dbSDimitry Andric auto One = B.buildConstant(S32, 1); 43815ffd83dbSDimitry Andric auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 4382fe6060f1SDimitry Andric if (DstDivReg) 43835ffd83dbSDimitry Andric Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 43845ffd83dbSDimitry Andric R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 43855ffd83dbSDimitry Andric 43865ffd83dbSDimitry Andric // Second quotient/remainder refinement. 43875ffd83dbSDimitry Andric Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 4388fe6060f1SDimitry Andric if (DstDivReg) 4389fe6060f1SDimitry Andric B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q); 43905ffd83dbSDimitry Andric 4391fe6060f1SDimitry Andric if (DstRemReg) 4392fe6060f1SDimitry Andric B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R); 43935ffd83dbSDimitry Andric } 43945ffd83dbSDimitry Andric 4395349cc55cSDimitry Andric // Build integer reciprocal sequence around V_RCP_IFLAG_F32 43965ffd83dbSDimitry Andric // 43975ffd83dbSDimitry Andric // Return lo, hi of result 43985ffd83dbSDimitry Andric // 43995ffd83dbSDimitry Andric // %cvt.lo = G_UITOFP Val.lo 44005ffd83dbSDimitry Andric // %cvt.hi = G_UITOFP Val.hi 44015ffd83dbSDimitry Andric // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 44025ffd83dbSDimitry Andric // %rcp = G_AMDGPU_RCP_IFLAG %mad 44035ffd83dbSDimitry Andric // %mul1 = G_FMUL %rcp, 0x5f7ffffc 44045ffd83dbSDimitry Andric // %mul2 = G_FMUL %mul1, 2**(-32) 44055ffd83dbSDimitry Andric // %trunc = G_INTRINSIC_TRUNC %mul2 44065ffd83dbSDimitry Andric // %mad2 = G_FMAD %trunc, -(2**32), %mul1 44075ffd83dbSDimitry Andric // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 44085ffd83dbSDimitry Andric static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 44095ffd83dbSDimitry Andric Register Val) { 44105ffd83dbSDimitry Andric const LLT S32 = LLT::scalar(32); 44115ffd83dbSDimitry Andric auto Unmerge = B.buildUnmerge(S32, Val); 44125ffd83dbSDimitry Andric 44135ffd83dbSDimitry Andric auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 44145ffd83dbSDimitry Andric auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 44155ffd83dbSDimitry Andric 441606c3fb27SDimitry Andric auto Mad = B.buildFMAD( 441706c3fb27SDimitry Andric S32, CvtHi, // 2**32 441806c3fb27SDimitry Andric B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo); 44195ffd83dbSDimitry Andric 44205ffd83dbSDimitry Andric auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 442106c3fb27SDimitry Andric auto Mul1 = B.buildFMul( 442206c3fb27SDimitry Andric S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc))); 44235ffd83dbSDimitry Andric 44245ffd83dbSDimitry Andric // 2**(-32) 442506c3fb27SDimitry Andric auto Mul2 = B.buildFMul( 442606c3fb27SDimitry Andric S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000))); 44275ffd83dbSDimitry Andric auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 44285ffd83dbSDimitry Andric 44295ffd83dbSDimitry Andric // -(2**32) 443006c3fb27SDimitry Andric auto Mad2 = B.buildFMAD( 443106c3fb27SDimitry Andric S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)), 443206c3fb27SDimitry Andric Mul1); 44335ffd83dbSDimitry Andric 44345ffd83dbSDimitry Andric auto ResultLo = B.buildFPTOUI(S32, Mad2); 44355ffd83dbSDimitry Andric auto ResultHi = 
B.buildFPTOUI(S32, Trunc); 44365ffd83dbSDimitry Andric 44375ffd83dbSDimitry Andric return {ResultLo.getReg(0), ResultHi.getReg(0)}; 44385ffd83dbSDimitry Andric } 44395ffd83dbSDimitry Andric 4440fe6060f1SDimitry Andric void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, 4441fe6060f1SDimitry Andric Register DstDivReg, 4442fe6060f1SDimitry Andric Register DstRemReg, 44435ffd83dbSDimitry Andric Register Numer, 4444fe6060f1SDimitry Andric Register Denom) const { 44455ffd83dbSDimitry Andric const LLT S32 = LLT::scalar(32); 44465ffd83dbSDimitry Andric const LLT S64 = LLT::scalar(64); 44475ffd83dbSDimitry Andric const LLT S1 = LLT::scalar(1); 44485ffd83dbSDimitry Andric Register RcpLo, RcpHi; 44495ffd83dbSDimitry Andric 44505ffd83dbSDimitry Andric std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 44515ffd83dbSDimitry Andric 4452bdd1243dSDimitry Andric auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi}); 44535ffd83dbSDimitry Andric 44545ffd83dbSDimitry Andric auto Zero64 = B.buildConstant(S64, 0); 44555ffd83dbSDimitry Andric auto NegDenom = B.buildSub(S64, Zero64, Denom); 44565ffd83dbSDimitry Andric 44575ffd83dbSDimitry Andric auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 44585ffd83dbSDimitry Andric auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 44595ffd83dbSDimitry Andric 44605ffd83dbSDimitry Andric auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 44615ffd83dbSDimitry Andric Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 44625ffd83dbSDimitry Andric Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 44635ffd83dbSDimitry Andric 44645ffd83dbSDimitry Andric auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 44655ffd83dbSDimitry Andric auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 4466bdd1243dSDimitry Andric auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi}); 44675ffd83dbSDimitry Andric 44685ffd83dbSDimitry Andric auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 44695ffd83dbSDimitry Andric auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 44705ffd83dbSDimitry Andric auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 44715ffd83dbSDimitry Andric Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 44725ffd83dbSDimitry Andric Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 44735ffd83dbSDimitry Andric 44745ffd83dbSDimitry Andric auto Zero32 = B.buildConstant(S32, 0); 44755ffd83dbSDimitry Andric auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 4476349cc55cSDimitry Andric auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1)); 4477bdd1243dSDimitry Andric auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi}); 44785ffd83dbSDimitry Andric 44795ffd83dbSDimitry Andric auto UnmergeNumer = B.buildUnmerge(S32, Numer); 44805ffd83dbSDimitry Andric Register NumerLo = UnmergeNumer.getReg(0); 44815ffd83dbSDimitry Andric Register NumerHi = UnmergeNumer.getReg(1); 44825ffd83dbSDimitry Andric 44835ffd83dbSDimitry Andric auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 44845ffd83dbSDimitry Andric auto Mul3 = B.buildMul(S64, Denom, MulHi3); 44855ffd83dbSDimitry Andric auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 44865ffd83dbSDimitry Andric Register Mul3_Lo = UnmergeMul3.getReg(0); 44875ffd83dbSDimitry Andric Register Mul3_Hi = UnmergeMul3.getReg(1); 44885ffd83dbSDimitry Andric auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 44895ffd83dbSDimitry Andric auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 44905ffd83dbSDimitry Andric auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 4491bdd1243dSDimitry Andric auto 
Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi}); 44925ffd83dbSDimitry Andric 44935ffd83dbSDimitry Andric auto UnmergeDenom = B.buildUnmerge(S32, Denom); 44945ffd83dbSDimitry Andric Register DenomLo = UnmergeDenom.getReg(0); 44955ffd83dbSDimitry Andric Register DenomHi = UnmergeDenom.getReg(1); 44965ffd83dbSDimitry Andric 44975ffd83dbSDimitry Andric auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 44985ffd83dbSDimitry Andric auto C1 = B.buildSExt(S32, CmpHi); 44995ffd83dbSDimitry Andric 45005ffd83dbSDimitry Andric auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 45015ffd83dbSDimitry Andric auto C2 = B.buildSExt(S32, CmpLo); 45025ffd83dbSDimitry Andric 45035ffd83dbSDimitry Andric auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 45045ffd83dbSDimitry Andric auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 45055ffd83dbSDimitry Andric 45065ffd83dbSDimitry Andric // TODO: Here and below portions of the code can be enclosed into if/endif. 45075ffd83dbSDimitry Andric // Currently control flow is unconditional and we have 4 selects after 45085ffd83dbSDimitry Andric // potential endif to substitute PHIs. 45095ffd83dbSDimitry Andric 45105ffd83dbSDimitry Andric // if C3 != 0 ... 45115ffd83dbSDimitry Andric auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 45125ffd83dbSDimitry Andric auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 45135ffd83dbSDimitry Andric auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 4514bdd1243dSDimitry Andric auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi}); 45155ffd83dbSDimitry Andric 45165ffd83dbSDimitry Andric auto One64 = B.buildConstant(S64, 1); 45175ffd83dbSDimitry Andric auto Add3 = B.buildAdd(S64, MulHi3, One64); 45185ffd83dbSDimitry Andric 45195ffd83dbSDimitry Andric auto C4 = 45205ffd83dbSDimitry Andric B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 45215ffd83dbSDimitry Andric auto C5 = 45225ffd83dbSDimitry Andric B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 45235ffd83dbSDimitry Andric auto C6 = B.buildSelect( 45245ffd83dbSDimitry Andric S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 45255ffd83dbSDimitry Andric 45265ffd83dbSDimitry Andric // if (C6 != 0) 45275ffd83dbSDimitry Andric auto Add4 = B.buildAdd(S64, Add3, One64); 45285ffd83dbSDimitry Andric auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 45295ffd83dbSDimitry Andric 45305ffd83dbSDimitry Andric auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 45315ffd83dbSDimitry Andric auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 4532bdd1243dSDimitry Andric auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi}); 45335ffd83dbSDimitry Andric 45345ffd83dbSDimitry Andric // endif C6 45355ffd83dbSDimitry Andric // endif C3 45365ffd83dbSDimitry Andric 4537fe6060f1SDimitry Andric if (DstDivReg) { 45385ffd83dbSDimitry Andric auto Sel1 = B.buildSelect( 45395ffd83dbSDimitry Andric S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 4540fe6060f1SDimitry Andric B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 4541fe6060f1SDimitry Andric Sel1, MulHi3); 4542fe6060f1SDimitry Andric } 4543fe6060f1SDimitry Andric 4544fe6060f1SDimitry Andric if (DstRemReg) { 45455ffd83dbSDimitry Andric auto Sel2 = B.buildSelect( 45465ffd83dbSDimitry Andric S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 4547fe6060f1SDimitry Andric 
B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), 4548fe6060f1SDimitry Andric Sel2, Sub1); 45495ffd83dbSDimitry Andric } 45505ffd83dbSDimitry Andric } 45515ffd83dbSDimitry Andric 4552fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI, 45535ffd83dbSDimitry Andric MachineRegisterInfo &MRI, 45545ffd83dbSDimitry Andric MachineIRBuilder &B) const { 4555fe6060f1SDimitry Andric Register DstDivReg, DstRemReg; 4556fe6060f1SDimitry Andric switch (MI.getOpcode()) { 4557fe6060f1SDimitry Andric default: 4558fe6060f1SDimitry Andric llvm_unreachable("Unexpected opcode!"); 4559fe6060f1SDimitry Andric case AMDGPU::G_UDIV: { 4560fe6060f1SDimitry Andric DstDivReg = MI.getOperand(0).getReg(); 4561fe6060f1SDimitry Andric break; 4562fe6060f1SDimitry Andric } 4563fe6060f1SDimitry Andric case AMDGPU::G_UREM: { 4564fe6060f1SDimitry Andric DstRemReg = MI.getOperand(0).getReg(); 4565fe6060f1SDimitry Andric break; 4566fe6060f1SDimitry Andric } 4567fe6060f1SDimitry Andric case AMDGPU::G_UDIVREM: { 4568fe6060f1SDimitry Andric DstDivReg = MI.getOperand(0).getReg(); 4569fe6060f1SDimitry Andric DstRemReg = MI.getOperand(1).getReg(); 4570fe6060f1SDimitry Andric break; 4571fe6060f1SDimitry Andric } 4572fe6060f1SDimitry Andric } 4573fe6060f1SDimitry Andric 45745ffd83dbSDimitry Andric const LLT S64 = LLT::scalar(64); 45755ffd83dbSDimitry Andric const LLT S32 = LLT::scalar(32); 4576fe6060f1SDimitry Andric const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 4577fe6060f1SDimitry Andric Register Num = MI.getOperand(FirstSrcOpIdx).getReg(); 4578fe6060f1SDimitry Andric Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 4579fe6060f1SDimitry Andric LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 45805ffd83dbSDimitry Andric 45815ffd83dbSDimitry Andric if (Ty == S32) 4582fe6060f1SDimitry Andric legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den); 45835ffd83dbSDimitry Andric else if (Ty == S64) 4584fe6060f1SDimitry Andric legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den); 45855ffd83dbSDimitry Andric else 45865ffd83dbSDimitry Andric return false; 45875ffd83dbSDimitry Andric 45885ffd83dbSDimitry Andric MI.eraseFromParent(); 45895ffd83dbSDimitry Andric return true; 45905ffd83dbSDimitry Andric } 45915ffd83dbSDimitry Andric 4592fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI, 45935ffd83dbSDimitry Andric MachineRegisterInfo &MRI, 45945ffd83dbSDimitry Andric MachineIRBuilder &B) const { 45955ffd83dbSDimitry Andric const LLT S64 = LLT::scalar(64); 45965ffd83dbSDimitry Andric const LLT S32 = LLT::scalar(32); 45975ffd83dbSDimitry Andric 4598fe6060f1SDimitry Andric LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 45995ffd83dbSDimitry Andric if (Ty != S32 && Ty != S64) 46005ffd83dbSDimitry Andric return false; 46015ffd83dbSDimitry Andric 4602fe6060f1SDimitry Andric const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs(); 4603fe6060f1SDimitry Andric Register LHS = MI.getOperand(FirstSrcOpIdx).getReg(); 4604fe6060f1SDimitry Andric Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg(); 46055ffd83dbSDimitry Andric 46065ffd83dbSDimitry Andric auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 46075ffd83dbSDimitry Andric auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 46085ffd83dbSDimitry Andric auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 46095ffd83dbSDimitry Andric 46105ffd83dbSDimitry Andric LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 46115ffd83dbSDimitry Andric RHS = B.buildAdd(Ty, 
RHS, RHSign).getReg(0); 46125ffd83dbSDimitry Andric 46135ffd83dbSDimitry Andric LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 46145ffd83dbSDimitry Andric RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 46155ffd83dbSDimitry Andric 4616fe6060f1SDimitry Andric Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg; 4617fe6060f1SDimitry Andric switch (MI.getOpcode()) { 4618fe6060f1SDimitry Andric default: 4619fe6060f1SDimitry Andric llvm_unreachable("Unexpected opcode!"); 4620fe6060f1SDimitry Andric case AMDGPU::G_SDIV: { 4621fe6060f1SDimitry Andric DstDivReg = MI.getOperand(0).getReg(); 4622fe6060f1SDimitry Andric TmpDivReg = MRI.createGenericVirtualRegister(Ty); 4623fe6060f1SDimitry Andric break; 4624fe6060f1SDimitry Andric } 4625fe6060f1SDimitry Andric case AMDGPU::G_SREM: { 4626fe6060f1SDimitry Andric DstRemReg = MI.getOperand(0).getReg(); 4627fe6060f1SDimitry Andric TmpRemReg = MRI.createGenericVirtualRegister(Ty); 4628fe6060f1SDimitry Andric break; 4629fe6060f1SDimitry Andric } 4630fe6060f1SDimitry Andric case AMDGPU::G_SDIVREM: { 4631fe6060f1SDimitry Andric DstDivReg = MI.getOperand(0).getReg(); 4632fe6060f1SDimitry Andric DstRemReg = MI.getOperand(1).getReg(); 4633fe6060f1SDimitry Andric TmpDivReg = MRI.createGenericVirtualRegister(Ty); 4634fe6060f1SDimitry Andric TmpRemReg = MRI.createGenericVirtualRegister(Ty); 4635fe6060f1SDimitry Andric break; 4636fe6060f1SDimitry Andric } 4637fe6060f1SDimitry Andric } 4638fe6060f1SDimitry Andric 46395ffd83dbSDimitry Andric if (Ty == S32) 4640fe6060f1SDimitry Andric legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 46415ffd83dbSDimitry Andric else 4642fe6060f1SDimitry Andric legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS); 46435ffd83dbSDimitry Andric 4644fe6060f1SDimitry Andric if (DstDivReg) { 4645fe6060f1SDimitry Andric auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 4646fe6060f1SDimitry Andric auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0); 4647fe6060f1SDimitry Andric B.buildSub(DstDivReg, SignXor, Sign); 4648fe6060f1SDimitry Andric } 46495ffd83dbSDimitry Andric 4650fe6060f1SDimitry Andric if (DstRemReg) { 4651fe6060f1SDimitry Andric auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 4652fe6060f1SDimitry Andric auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0); 4653fe6060f1SDimitry Andric B.buildSub(DstRemReg, SignXor, Sign); 4654fe6060f1SDimitry Andric } 46555ffd83dbSDimitry Andric 46565ffd83dbSDimitry Andric MI.eraseFromParent(); 46575ffd83dbSDimitry Andric return true; 46585ffd83dbSDimitry Andric } 46595ffd83dbSDimitry Andric 46608bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 46618bcb0991SDimitry Andric MachineRegisterInfo &MRI, 46628bcb0991SDimitry Andric MachineIRBuilder &B) const { 46638bcb0991SDimitry Andric Register Res = MI.getOperand(0).getReg(); 46648bcb0991SDimitry Andric Register LHS = MI.getOperand(1).getReg(); 46658bcb0991SDimitry Andric Register RHS = MI.getOperand(2).getReg(); 46668bcb0991SDimitry Andric uint16_t Flags = MI.getFlags(); 46678bcb0991SDimitry Andric LLT ResTy = MRI.getType(Res); 46688bcb0991SDimitry Andric 46698bcb0991SDimitry Andric const MachineFunction &MF = B.getMF(); 467006c3fb27SDimitry Andric bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn) || 467106c3fb27SDimitry Andric MF.getTarget().Options.UnsafeFPMath; 46728bcb0991SDimitry Andric 46738bcb0991SDimitry Andric if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 467406c3fb27SDimitry Andric if (!AllowInaccurateRcp && ResTy != LLT::scalar(16)) 
467506c3fb27SDimitry Andric       return false;
467606c3fb27SDimitry Andric 
467706c3fb27SDimitry Andric     // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
467806c3fb27SDimitry Andric     // the CI documentation they have a worst case error of 1 ulp.
467906c3fb27SDimitry Andric     // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
468006c3fb27SDimitry Andric     // use it as long as we aren't trying to use denormals.
468106c3fb27SDimitry Andric     //
468206c3fb27SDimitry Andric     // v_rcp_f16 and v_rsq_f16 DO support denormals and have a 0.51 ulp worst case error.
468306c3fb27SDimitry Andric 
46848bcb0991SDimitry Andric     // 1 / x -> RCP(x)
46858bcb0991SDimitry Andric     if (CLHS->isExactlyValue(1.0)) {
46865f757f3fSDimitry Andric       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
46878bcb0991SDimitry Andric           .addUse(RHS)
46888bcb0991SDimitry Andric           .setMIFlags(Flags);
46898bcb0991SDimitry Andric 
46908bcb0991SDimitry Andric       MI.eraseFromParent();
46918bcb0991SDimitry Andric       return true;
46928bcb0991SDimitry Andric     }
46938bcb0991SDimitry Andric 
46948bcb0991SDimitry Andric     // -1 / x -> RCP( FNEG(x) )
46958bcb0991SDimitry Andric     if (CLHS->isExactlyValue(-1.0)) {
46968bcb0991SDimitry Andric       auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
46975f757f3fSDimitry Andric       B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
46988bcb0991SDimitry Andric           .addUse(FNeg.getReg(0))
46998bcb0991SDimitry Andric           .setMIFlags(Flags);
47008bcb0991SDimitry Andric 
47018bcb0991SDimitry Andric       MI.eraseFromParent();
47028bcb0991SDimitry Andric       return true;
47038bcb0991SDimitry Andric     }
47048bcb0991SDimitry Andric   }
47058bcb0991SDimitry Andric 
47065f757f3fSDimitry Andric   // For f16 require afn or arcp.
47075f757f3fSDimitry Andric   // For f32 require afn.
470806c3fb27SDimitry Andric   if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
470906c3fb27SDimitry Andric                               !MI.getFlag(MachineInstr::FmArcp)))
471006c3fb27SDimitry Andric     return false;
471106c3fb27SDimitry Andric 
47128bcb0991SDimitry Andric   // x / y -> x * (1.0 / y)
47135f757f3fSDimitry Andric   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
47148bcb0991SDimitry Andric                  .addUse(RHS)
47158bcb0991SDimitry Andric                  .setMIFlags(Flags);
47168bcb0991SDimitry Andric   B.buildFMul(Res, LHS, RCP, Flags);
47178bcb0991SDimitry Andric 
47188bcb0991SDimitry Andric   MI.eraseFromParent();
47198bcb0991SDimitry Andric   return true;
47208bcb0991SDimitry Andric }
47218bcb0991SDimitry Andric 
4722e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
4723e8d8bef9SDimitry Andric                                                    MachineRegisterInfo &MRI,
4724e8d8bef9SDimitry Andric                                                    MachineIRBuilder &B) const {
4725e8d8bef9SDimitry Andric   Register Res = MI.getOperand(0).getReg();
4726e8d8bef9SDimitry Andric   Register X = MI.getOperand(1).getReg();
4727e8d8bef9SDimitry Andric   Register Y = MI.getOperand(2).getReg();
4728e8d8bef9SDimitry Andric   uint16_t Flags = MI.getFlags();
4729e8d8bef9SDimitry Andric   LLT ResTy = MRI.getType(Res);
4730e8d8bef9SDimitry Andric 
4731e8d8bef9SDimitry Andric   const MachineFunction &MF = B.getMF();
4732e8d8bef9SDimitry Andric   bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
4733e8d8bef9SDimitry Andric                             MI.getFlag(MachineInstr::FmAfn);
4734e8d8bef9SDimitry Andric 
4735e8d8bef9SDimitry Andric   if (!AllowInaccurateRcp)
47368bcb0991SDimitry Andric     return false;
4737e8d8bef9SDimitry Andric 
4738e8d8bef9SDimitry Andric   auto NegY = B.buildFNeg(ResTy, Y);
4739e8d8bef9SDimitry Andric   auto One = B.buildFConstant(ResTy, 1.0);
4740e8d8bef9SDimitry Andric 
47415f757f3fSDimitry Andric   auto R =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}) 4742e8d8bef9SDimitry Andric .addUse(Y) 4743e8d8bef9SDimitry Andric .setMIFlags(Flags); 4744e8d8bef9SDimitry Andric 4745e8d8bef9SDimitry Andric auto Tmp0 = B.buildFMA(ResTy, NegY, R, One); 4746e8d8bef9SDimitry Andric R = B.buildFMA(ResTy, Tmp0, R, R); 4747e8d8bef9SDimitry Andric 4748e8d8bef9SDimitry Andric auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); 4749e8d8bef9SDimitry Andric R = B.buildFMA(ResTy, Tmp1, R, R); 4750e8d8bef9SDimitry Andric 4751e8d8bef9SDimitry Andric auto Ret = B.buildFMul(ResTy, X, R); 4752e8d8bef9SDimitry Andric auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); 4753e8d8bef9SDimitry Andric 4754e8d8bef9SDimitry Andric B.buildFMA(Res, Tmp2, R, Ret); 4755e8d8bef9SDimitry Andric MI.eraseFromParent(); 4756e8d8bef9SDimitry Andric return true; 47578bcb0991SDimitry Andric } 47588bcb0991SDimitry Andric 4759480093f4SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 4760480093f4SDimitry Andric MachineRegisterInfo &MRI, 4761480093f4SDimitry Andric MachineIRBuilder &B) const { 4762e8d8bef9SDimitry Andric if (legalizeFastUnsafeFDIV(MI, MRI, B)) 4763e8d8bef9SDimitry Andric return true; 4764e8d8bef9SDimitry Andric 4765480093f4SDimitry Andric Register Res = MI.getOperand(0).getReg(); 4766480093f4SDimitry Andric Register LHS = MI.getOperand(1).getReg(); 4767480093f4SDimitry Andric Register RHS = MI.getOperand(2).getReg(); 4768480093f4SDimitry Andric 4769480093f4SDimitry Andric uint16_t Flags = MI.getFlags(); 4770480093f4SDimitry Andric 4771480093f4SDimitry Andric LLT S16 = LLT::scalar(16); 4772480093f4SDimitry Andric LLT S32 = LLT::scalar(32); 4773480093f4SDimitry Andric 4774480093f4SDimitry Andric auto LHSExt = B.buildFPExt(S32, LHS, Flags); 4775480093f4SDimitry Andric auto RHSExt = B.buildFPExt(S32, RHS, Flags); 4776480093f4SDimitry Andric 47775f757f3fSDimitry Andric auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}) 4778480093f4SDimitry Andric .addUse(RHSExt.getReg(0)) 4779480093f4SDimitry Andric .setMIFlags(Flags); 4780480093f4SDimitry Andric 4781480093f4SDimitry Andric auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 4782480093f4SDimitry Andric auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 4783480093f4SDimitry Andric 47845f757f3fSDimitry Andric B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res) 4785480093f4SDimitry Andric .addUse(RDst.getReg(0)) 4786480093f4SDimitry Andric .addUse(RHS) 4787480093f4SDimitry Andric .addUse(LHS) 4788480093f4SDimitry Andric .setMIFlags(Flags); 4789480093f4SDimitry Andric 4790480093f4SDimitry Andric MI.eraseFromParent(); 4791480093f4SDimitry Andric return true; 4792480093f4SDimitry Andric } 4793480093f4SDimitry Andric 47945f757f3fSDimitry Andric static const unsigned SPDenormModeBitField = 47955f757f3fSDimitry Andric AMDGPU::Hwreg::ID_MODE | (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 47965f757f3fSDimitry Andric (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 47975f757f3fSDimitry Andric 4798480093f4SDimitry Andric // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 4799480093f4SDimitry Andric // to enable denorm mode. When 'Enable' is false, disable denorm mode. 480006c3fb27SDimitry Andric static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B, 4801480093f4SDimitry Andric const GCNSubtarget &ST, 480206c3fb27SDimitry Andric SIModeRegisterDefaults Mode) { 4803480093f4SDimitry Andric // Set SP denorm mode to this value. 4804480093f4SDimitry Andric unsigned SPDenormMode = 48055ffd83dbSDimitry Andric Enable ? 
FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 4806480093f4SDimitry Andric 4807480093f4SDimitry Andric if (ST.hasDenormModeInst()) { 4808480093f4SDimitry Andric // Preserve default FP64FP16 denorm mode while updating FP32 mode. 48095ffd83dbSDimitry Andric uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 4810480093f4SDimitry Andric 48115ffd83dbSDimitry Andric uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 4812480093f4SDimitry Andric B.buildInstr(AMDGPU::S_DENORM_MODE) 4813480093f4SDimitry Andric .addImm(NewDenormModeValue); 4814480093f4SDimitry Andric 4815480093f4SDimitry Andric } else { 4816480093f4SDimitry Andric B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 4817480093f4SDimitry Andric .addImm(SPDenormMode) 4818480093f4SDimitry Andric .addImm(SPDenormModeBitField); 4819480093f4SDimitry Andric } 4820480093f4SDimitry Andric } 4821480093f4SDimitry Andric 4822480093f4SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 4823480093f4SDimitry Andric MachineRegisterInfo &MRI, 4824480093f4SDimitry Andric MachineIRBuilder &B) const { 4825e8d8bef9SDimitry Andric if (legalizeFastUnsafeFDIV(MI, MRI, B)) 4826e8d8bef9SDimitry Andric return true; 4827e8d8bef9SDimitry Andric 4828480093f4SDimitry Andric Register Res = MI.getOperand(0).getReg(); 4829480093f4SDimitry Andric Register LHS = MI.getOperand(1).getReg(); 4830480093f4SDimitry Andric Register RHS = MI.getOperand(2).getReg(); 4831480093f4SDimitry Andric const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 483206c3fb27SDimitry Andric SIModeRegisterDefaults Mode = MFI->getMode(); 4833480093f4SDimitry Andric 4834480093f4SDimitry Andric uint16_t Flags = MI.getFlags(); 4835480093f4SDimitry Andric 4836480093f4SDimitry Andric LLT S32 = LLT::scalar(32); 4837480093f4SDimitry Andric LLT S1 = LLT::scalar(1); 4838480093f4SDimitry Andric 4839480093f4SDimitry Andric auto One = B.buildFConstant(S32, 1.0f); 4840480093f4SDimitry Andric 4841480093f4SDimitry Andric auto DenominatorScaled = 48425f757f3fSDimitry Andric B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}) 4843480093f4SDimitry Andric .addUse(LHS) 48445ffd83dbSDimitry Andric .addUse(RHS) 48455ffd83dbSDimitry Andric .addImm(0) 4846480093f4SDimitry Andric .setMIFlags(Flags); 4847480093f4SDimitry Andric auto NumeratorScaled = 48485f757f3fSDimitry Andric B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}) 4849480093f4SDimitry Andric .addUse(LHS) 4850480093f4SDimitry Andric .addUse(RHS) 48515ffd83dbSDimitry Andric .addImm(1) 4852480093f4SDimitry Andric .setMIFlags(Flags); 4853480093f4SDimitry Andric 48545f757f3fSDimitry Andric auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}) 4855480093f4SDimitry Andric .addUse(DenominatorScaled.getReg(0)) 4856480093f4SDimitry Andric .setMIFlags(Flags); 4857480093f4SDimitry Andric auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 4858480093f4SDimitry Andric 48595f757f3fSDimitry Andric const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE(); 48605f757f3fSDimitry Andric const bool HasDynamicDenormals = 48615f757f3fSDimitry Andric (Mode.FP32Denormals.Input == DenormalMode::Dynamic) || 48625f757f3fSDimitry Andric (Mode.FP32Denormals.Output == DenormalMode::Dynamic); 48635f757f3fSDimitry Andric 48645f757f3fSDimitry Andric Register SavedSPDenormMode; 48655f757f3fSDimitry Andric if (!PreservesDenormals) { 48665f757f3fSDimitry Andric if (HasDynamicDenormals) { 48675f757f3fSDimitry Andric SavedSPDenormMode = 
MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 48685f757f3fSDimitry Andric B.buildInstr(AMDGPU::S_GETREG_B32) 48695f757f3fSDimitry Andric .addDef(SavedSPDenormMode) 48705f757f3fSDimitry Andric .addImm(SPDenormModeBitField); 48715f757f3fSDimitry Andric } 4872480093f4SDimitry Andric toggleSPDenormMode(true, B, ST, Mode); 48735f757f3fSDimitry Andric } 4874480093f4SDimitry Andric 4875480093f4SDimitry Andric auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 4876480093f4SDimitry Andric auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 4877480093f4SDimitry Andric auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 4878480093f4SDimitry Andric auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 4879480093f4SDimitry Andric auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 4880480093f4SDimitry Andric auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 4881480093f4SDimitry Andric 48825f757f3fSDimitry Andric if (!PreservesDenormals) { 48835f757f3fSDimitry Andric if (HasDynamicDenormals) { 48845f757f3fSDimitry Andric assert(SavedSPDenormMode); 48855f757f3fSDimitry Andric B.buildInstr(AMDGPU::S_SETREG_B32) 48865f757f3fSDimitry Andric .addReg(SavedSPDenormMode) 48875f757f3fSDimitry Andric .addImm(SPDenormModeBitField); 48885f757f3fSDimitry Andric } else 4889480093f4SDimitry Andric toggleSPDenormMode(false, B, ST, Mode); 48905f757f3fSDimitry Andric } 4891480093f4SDimitry Andric 48925f757f3fSDimitry Andric auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}) 4893480093f4SDimitry Andric .addUse(Fma4.getReg(0)) 4894480093f4SDimitry Andric .addUse(Fma1.getReg(0)) 4895480093f4SDimitry Andric .addUse(Fma3.getReg(0)) 4896480093f4SDimitry Andric .addUse(NumeratorScaled.getReg(1)) 4897480093f4SDimitry Andric .setMIFlags(Flags); 4898480093f4SDimitry Andric 48995f757f3fSDimitry Andric B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res) 4900480093f4SDimitry Andric .addUse(Fmas.getReg(0)) 4901480093f4SDimitry Andric .addUse(RHS) 4902480093f4SDimitry Andric .addUse(LHS) 4903480093f4SDimitry Andric .setMIFlags(Flags); 4904480093f4SDimitry Andric 4905480093f4SDimitry Andric MI.eraseFromParent(); 4906480093f4SDimitry Andric return true; 4907480093f4SDimitry Andric } 4908480093f4SDimitry Andric 4909480093f4SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 4910480093f4SDimitry Andric MachineRegisterInfo &MRI, 4911480093f4SDimitry Andric MachineIRBuilder &B) const { 4912e8d8bef9SDimitry Andric if (legalizeFastUnsafeFDIV64(MI, MRI, B)) 4913e8d8bef9SDimitry Andric return true; 4914e8d8bef9SDimitry Andric 4915480093f4SDimitry Andric Register Res = MI.getOperand(0).getReg(); 4916480093f4SDimitry Andric Register LHS = MI.getOperand(1).getReg(); 4917480093f4SDimitry Andric Register RHS = MI.getOperand(2).getReg(); 4918480093f4SDimitry Andric 4919480093f4SDimitry Andric uint16_t Flags = MI.getFlags(); 4920480093f4SDimitry Andric 4921480093f4SDimitry Andric LLT S64 = LLT::scalar(64); 4922480093f4SDimitry Andric LLT S1 = LLT::scalar(1); 4923480093f4SDimitry Andric 4924480093f4SDimitry Andric auto One = B.buildFConstant(S64, 1.0); 4925480093f4SDimitry Andric 49265f757f3fSDimitry Andric auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}) 4927480093f4SDimitry Andric .addUse(LHS) 4928480093f4SDimitry Andric .addUse(RHS) 49295ffd83dbSDimitry Andric .addImm(0) 4930480093f4SDimitry Andric .setMIFlags(Flags); 4931480093f4SDimitry Andric 4932480093f4SDimitry Andric auto NegDivScale0 = B.buildFNeg(S64, 
DivScale0.getReg(0), Flags);
4933480093f4SDimitry Andric 
49345f757f3fSDimitry Andric   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
4935480093f4SDimitry Andric                  .addUse(DivScale0.getReg(0))
4936480093f4SDimitry Andric                  .setMIFlags(Flags);
4937480093f4SDimitry Andric 
4938480093f4SDimitry Andric   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
4939480093f4SDimitry Andric   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
4940480093f4SDimitry Andric   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
4941480093f4SDimitry Andric 
49425f757f3fSDimitry Andric   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
4943480093f4SDimitry Andric                        .addUse(LHS)
4944480093f4SDimitry Andric                        .addUse(RHS)
49455ffd83dbSDimitry Andric                        .addImm(1)
4946480093f4SDimitry Andric                        .setMIFlags(Flags);
4947480093f4SDimitry Andric 
4948480093f4SDimitry Andric   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
49495ffd83dbSDimitry Andric   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
4950480093f4SDimitry Andric   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
4951480093f4SDimitry Andric 
4952480093f4SDimitry Andric   Register Scale;
4953480093f4SDimitry Andric   if (!ST.hasUsableDivScaleConditionOutput()) {
4954480093f4SDimitry Andric     // Work around a hardware bug on SI where the condition output from div_scale
4955480093f4SDimitry Andric     // is not usable.
4956480093f4SDimitry Andric 
4957480093f4SDimitry Andric     LLT S32 = LLT::scalar(32);
4958480093f4SDimitry Andric 
4959480093f4SDimitry Andric     auto NumUnmerge = B.buildUnmerge(S32, LHS);
4960480093f4SDimitry Andric     auto DenUnmerge = B.buildUnmerge(S32, RHS);
4961480093f4SDimitry Andric     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
4962480093f4SDimitry Andric     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
4963480093f4SDimitry Andric 
4964480093f4SDimitry Andric     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
4965480093f4SDimitry Andric                               Scale1Unmerge.getReg(1));
4966480093f4SDimitry Andric     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
4967480093f4SDimitry Andric                               Scale0Unmerge.getReg(1));
49685ffd83dbSDimitry Andric     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
4969480093f4SDimitry Andric   } else {
4970480093f4SDimitry Andric     Scale = DivScale1.getReg(1);
4971480093f4SDimitry Andric   }
4972480093f4SDimitry Andric 
49735f757f3fSDimitry Andric   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
4974480093f4SDimitry Andric                   .addUse(Fma4.getReg(0))
4975480093f4SDimitry Andric                   .addUse(Fma3.getReg(0))
4976480093f4SDimitry Andric                   .addUse(Mul.getReg(0))
4977480093f4SDimitry Andric                   .addUse(Scale)
4978480093f4SDimitry Andric                   .setMIFlags(Flags);
4979480093f4SDimitry Andric 
49805f757f3fSDimitry Andric   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
4981480093f4SDimitry Andric       .addUse(Fmas.getReg(0))
4982480093f4SDimitry Andric       .addUse(RHS)
4983480093f4SDimitry Andric       .addUse(LHS)
4984480093f4SDimitry Andric       .setMIFlags(Flags);
4985480093f4SDimitry Andric 
4986480093f4SDimitry Andric   MI.eraseFromParent();
4987480093f4SDimitry Andric   return true;
4988480093f4SDimitry Andric }
4989480093f4SDimitry Andric 
499006c3fb27SDimitry Andric bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI,
499106c3fb27SDimitry Andric                                          MachineRegisterInfo &MRI,
499206c3fb27SDimitry Andric                                          MachineIRBuilder &B) const {
499306c3fb27SDimitry Andric   Register Res0 = MI.getOperand(0).getReg();
499406c3fb27SDimitry Andric   Register Res1 =
MI.getOperand(1).getReg(); 499506c3fb27SDimitry Andric Register Val = MI.getOperand(2).getReg(); 499606c3fb27SDimitry Andric uint16_t Flags = MI.getFlags(); 499706c3fb27SDimitry Andric 499806c3fb27SDimitry Andric LLT Ty = MRI.getType(Res0); 499906c3fb27SDimitry Andric LLT InstrExpTy = Ty == LLT::scalar(16) ? LLT::scalar(16) : LLT::scalar(32); 500006c3fb27SDimitry Andric 50015f757f3fSDimitry Andric auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty}) 500206c3fb27SDimitry Andric .addUse(Val) 500306c3fb27SDimitry Andric .setMIFlags(Flags); 50045f757f3fSDimitry Andric auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy}) 500506c3fb27SDimitry Andric .addUse(Val) 500606c3fb27SDimitry Andric .setMIFlags(Flags); 500706c3fb27SDimitry Andric 500806c3fb27SDimitry Andric if (ST.hasFractBug()) { 500906c3fb27SDimitry Andric auto Fabs = B.buildFAbs(Ty, Val); 501006c3fb27SDimitry Andric auto Inf = B.buildFConstant(Ty, APFloat::getInf(getFltSemanticForLLT(Ty))); 501106c3fb27SDimitry Andric auto IsFinite = 501206c3fb27SDimitry Andric B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags); 501306c3fb27SDimitry Andric auto Zero = B.buildConstant(InstrExpTy, 0); 501406c3fb27SDimitry Andric Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero); 501506c3fb27SDimitry Andric Mant = B.buildSelect(Ty, IsFinite, Mant, Val); 501606c3fb27SDimitry Andric } 501706c3fb27SDimitry Andric 501806c3fb27SDimitry Andric B.buildCopy(Res0, Mant); 501906c3fb27SDimitry Andric B.buildSExtOrTrunc(Res1, Exp); 502006c3fb27SDimitry Andric 502106c3fb27SDimitry Andric MI.eraseFromParent(); 502206c3fb27SDimitry Andric return true; 502306c3fb27SDimitry Andric } 502406c3fb27SDimitry Andric 50258bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 50268bcb0991SDimitry Andric MachineRegisterInfo &MRI, 50278bcb0991SDimitry Andric MachineIRBuilder &B) const { 50288bcb0991SDimitry Andric Register Res = MI.getOperand(0).getReg(); 50298bcb0991SDimitry Andric Register LHS = MI.getOperand(2).getReg(); 50308bcb0991SDimitry Andric Register RHS = MI.getOperand(3).getReg(); 50318bcb0991SDimitry Andric uint16_t Flags = MI.getFlags(); 50328bcb0991SDimitry Andric 50338bcb0991SDimitry Andric LLT S32 = LLT::scalar(32); 50348bcb0991SDimitry Andric LLT S1 = LLT::scalar(1); 50358bcb0991SDimitry Andric 50368bcb0991SDimitry Andric auto Abs = B.buildFAbs(S32, RHS, Flags); 50378bcb0991SDimitry Andric const APFloat C0Val(1.0f); 50388bcb0991SDimitry Andric 503906c3fb27SDimitry Andric auto C0 = B.buildFConstant(S32, 0x1p+96f); 504006c3fb27SDimitry Andric auto C1 = B.buildFConstant(S32, 0x1p-32f); 504106c3fb27SDimitry Andric auto C2 = B.buildFConstant(S32, 1.0f); 50428bcb0991SDimitry Andric 50438bcb0991SDimitry Andric auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 50448bcb0991SDimitry Andric auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 50458bcb0991SDimitry Andric 50468bcb0991SDimitry Andric auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 50478bcb0991SDimitry Andric 50485f757f3fSDimitry Andric auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}) 50498bcb0991SDimitry Andric .addUse(Mul0.getReg(0)) 50508bcb0991SDimitry Andric .setMIFlags(Flags); 50518bcb0991SDimitry Andric 50528bcb0991SDimitry Andric auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 50538bcb0991SDimitry Andric 50548bcb0991SDimitry Andric B.buildFMul(Res, Sel, Mul1, Flags); 50558bcb0991SDimitry Andric 50568bcb0991SDimitry Andric MI.eraseFromParent(); 50578bcb0991SDimitry Andric return true; 
50588bcb0991SDimitry Andric }
50598bcb0991SDimitry Andric 
50605f757f3fSDimitry Andric bool AMDGPULegalizerInfo::legalizeFSQRTF16(MachineInstr &MI,
50615f757f3fSDimitry Andric                                            MachineRegisterInfo &MRI,
50625f757f3fSDimitry Andric                                            MachineIRBuilder &B) const {
50635f757f3fSDimitry Andric   // Bypass the correct expansion that a standard promotion through G_FSQRT
50645f757f3fSDimitry Andric   // would get. The f32 op is accurate enough for the f16 case.
50655f757f3fSDimitry Andric   unsigned Flags = MI.getFlags();
50665f757f3fSDimitry Andric   assert(!ST.has16BitInsts());
50675f757f3fSDimitry Andric   const LLT F32 = LLT::scalar(32);
50685f757f3fSDimitry Andric   auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
50695f757f3fSDimitry Andric   auto Sqrt = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
50705f757f3fSDimitry Andric                   .addUse(Ext.getReg(0))
50715f757f3fSDimitry Andric                   .setMIFlags(Flags);
50725f757f3fSDimitry Andric   B.buildFPTrunc(MI.getOperand(0), Sqrt, Flags);
50735f757f3fSDimitry Andric   MI.eraseFromParent();
50745f757f3fSDimitry Andric   return true;
50755f757f3fSDimitry Andric }
50765f757f3fSDimitry Andric 
50775f757f3fSDimitry Andric bool AMDGPULegalizerInfo::legalizeFSQRTF32(MachineInstr &MI,
50785f757f3fSDimitry Andric                                            MachineRegisterInfo &MRI,
50795f757f3fSDimitry Andric                                            MachineIRBuilder &B) const {
50805f757f3fSDimitry Andric   MachineFunction &MF = B.getMF();
50815f757f3fSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
50825f757f3fSDimitry Andric   Register X = MI.getOperand(1).getReg();
50835f757f3fSDimitry Andric   const unsigned Flags = MI.getFlags();
50845f757f3fSDimitry Andric   const LLT S1 = LLT::scalar(1);
50855f757f3fSDimitry Andric   const LLT F32 = LLT::scalar(32);
50865f757f3fSDimitry Andric   const LLT I32 = LLT::scalar(32);
50875f757f3fSDimitry Andric 
50885f757f3fSDimitry Andric   if (allowApproxFunc(MF, Flags)) {
50895f757f3fSDimitry Andric     B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({Dst}))
50905f757f3fSDimitry Andric         .addUse(X)
50915f757f3fSDimitry Andric         .setMIFlags(Flags);
50925f757f3fSDimitry Andric     MI.eraseFromParent();
50935f757f3fSDimitry Andric     return true;
50945f757f3fSDimitry Andric   }
50955f757f3fSDimitry Andric 
50965f757f3fSDimitry Andric   auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);
50975f757f3fSDimitry Andric   auto NeedScale = B.buildFCmp(CmpInst::FCMP_OGT, S1, ScaleThreshold, X, Flags);
50985f757f3fSDimitry Andric   auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
50995f757f3fSDimitry Andric   auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
51005f757f3fSDimitry Andric   auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);
51015f757f3fSDimitry Andric 
51025f757f3fSDimitry Andric   Register SqrtS = MRI.createGenericVirtualRegister(F32);
51035f757f3fSDimitry Andric   if (needsDenormHandlingF32(MF, X, Flags)) {
51045f757f3fSDimitry Andric     B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({SqrtS}))
51055f757f3fSDimitry Andric         .addUse(SqrtX.getReg(0))
51065f757f3fSDimitry Andric         .setMIFlags(Flags);
51075f757f3fSDimitry Andric 
51085f757f3fSDimitry Andric     auto NegOne = B.buildConstant(I32, -1);
51095f757f3fSDimitry Andric     auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);
51105f757f3fSDimitry Andric 
51115f757f3fSDimitry Andric     auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
51125f757f3fSDimitry Andric     auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
51135f757f3fSDimitry Andric 
51145f757f3fSDimitry Andric     auto PosOne = B.buildConstant(I32, 1);
51155f757f3fSDimitry Andric     auto SqrtSNextUp
= B.buildAdd(I32, SqrtS, PosOne); 51165f757f3fSDimitry Andric 51175f757f3fSDimitry Andric auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags); 51185f757f3fSDimitry Andric auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags); 51195f757f3fSDimitry Andric 51205f757f3fSDimitry Andric auto Zero = B.buildFConstant(F32, 0.0f); 51215f757f3fSDimitry Andric auto SqrtVPLE0 = B.buildFCmp(CmpInst::FCMP_OLE, S1, SqrtVP, Zero, Flags); 51225f757f3fSDimitry Andric 51235f757f3fSDimitry Andric SqrtS = 51245f757f3fSDimitry Andric B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0); 51255f757f3fSDimitry Andric 51265f757f3fSDimitry Andric auto SqrtVPVSGT0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, SqrtVS, Zero, Flags); 51275f757f3fSDimitry Andric SqrtS = 51285f757f3fSDimitry Andric B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0); 51295f757f3fSDimitry Andric } else { 51305f757f3fSDimitry Andric auto SqrtR = 51315f757f3fSDimitry Andric B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0)); 51325f757f3fSDimitry Andric B.buildFMul(SqrtS, SqrtX, SqrtR, Flags); 51335f757f3fSDimitry Andric 51345f757f3fSDimitry Andric auto Half = B.buildFConstant(F32, 0.5f); 51355f757f3fSDimitry Andric auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags); 51365f757f3fSDimitry Andric auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags); 51375f757f3fSDimitry Andric auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags); 51385f757f3fSDimitry Andric SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags); 51395f757f3fSDimitry Andric SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0); 51405f757f3fSDimitry Andric auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags); 51415f757f3fSDimitry Andric auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags); 51425f757f3fSDimitry Andric SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0); 51435f757f3fSDimitry Andric } 51445f757f3fSDimitry Andric 51455f757f3fSDimitry Andric auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f); 51465f757f3fSDimitry Andric 51475f757f3fSDimitry Andric auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags); 51485f757f3fSDimitry Andric 51495f757f3fSDimitry Andric SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0); 51505f757f3fSDimitry Andric 51515f757f3fSDimitry Andric auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf); 51525f757f3fSDimitry Andric B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags); 51535f757f3fSDimitry Andric 51545f757f3fSDimitry Andric MI.eraseFromParent(); 51555f757f3fSDimitry Andric return true; 51565f757f3fSDimitry Andric } 51575f757f3fSDimitry Andric 51585f757f3fSDimitry Andric bool AMDGPULegalizerInfo::legalizeFSQRTF64(MachineInstr &MI, 515906c3fb27SDimitry Andric MachineRegisterInfo &MRI, 516006c3fb27SDimitry Andric MachineIRBuilder &B) const { 516106c3fb27SDimitry Andric // For double type, the SQRT and RSQ instructions don't have required 516206c3fb27SDimitry Andric // precision, we apply Goldschmidt's algorithm to improve the result: 516306c3fb27SDimitry Andric // 516406c3fb27SDimitry Andric // y0 = rsq(x) 516506c3fb27SDimitry Andric // g0 = x * y0 516606c3fb27SDimitry Andric // h0 = 0.5 * y0 516706c3fb27SDimitry Andric // 516806c3fb27SDimitry Andric // r0 = 0.5 - h0 * g0 516906c3fb27SDimitry Andric // g1 = g0 * r0 + g0 517006c3fb27SDimitry Andric // h1 = h0 * r0 + h0 517106c3fb27SDimitry Andric // 517206c3fb27SDimitry Andric // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1 517306c3fb27SDimitry Andric // g2 = 
g1 * r1 + g1 g2 = d0 * h1 + g1 517406c3fb27SDimitry Andric // h2 = h1 * r1 + h1 517506c3fb27SDimitry Andric // 517606c3fb27SDimitry Andric // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2 517706c3fb27SDimitry Andric // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2 517806c3fb27SDimitry Andric // 517906c3fb27SDimitry Andric // sqrt(x) = g3 518006c3fb27SDimitry Andric 518106c3fb27SDimitry Andric const LLT S1 = LLT::scalar(1); 518206c3fb27SDimitry Andric const LLT S32 = LLT::scalar(32); 518306c3fb27SDimitry Andric const LLT F64 = LLT::scalar(64); 518406c3fb27SDimitry Andric 518506c3fb27SDimitry Andric Register Dst = MI.getOperand(0).getReg(); 518606c3fb27SDimitry Andric assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt"); 518706c3fb27SDimitry Andric 518806c3fb27SDimitry Andric Register X = MI.getOperand(1).getReg(); 518906c3fb27SDimitry Andric unsigned Flags = MI.getFlags(); 519006c3fb27SDimitry Andric 519106c3fb27SDimitry Andric auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767); 519206c3fb27SDimitry Andric 519306c3fb27SDimitry Andric auto ZeroInt = B.buildConstant(S32, 0); 519406c3fb27SDimitry Andric auto Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant); 519506c3fb27SDimitry Andric 519606c3fb27SDimitry Andric // Scale up input if it is too small. 519706c3fb27SDimitry Andric auto ScaleUpFactor = B.buildConstant(S32, 256); 519806c3fb27SDimitry Andric auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt); 519906c3fb27SDimitry Andric auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags); 520006c3fb27SDimitry Andric 52015f757f3fSDimitry Andric auto SqrtY = 52025f757f3fSDimitry Andric B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX.getReg(0)); 520306c3fb27SDimitry Andric 520406c3fb27SDimitry Andric auto Half = B.buildFConstant(F64, 0.5); 520506c3fb27SDimitry Andric auto SqrtH0 = B.buildFMul(F64, SqrtY, Half); 520606c3fb27SDimitry Andric auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY); 520706c3fb27SDimitry Andric 520806c3fb27SDimitry Andric auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0); 520906c3fb27SDimitry Andric auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half); 521006c3fb27SDimitry Andric 521106c3fb27SDimitry Andric auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0); 521206c3fb27SDimitry Andric auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0); 521306c3fb27SDimitry Andric 521406c3fb27SDimitry Andric auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1); 521506c3fb27SDimitry Andric auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX); 521606c3fb27SDimitry Andric 521706c3fb27SDimitry Andric auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1); 521806c3fb27SDimitry Andric 521906c3fb27SDimitry Andric auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2); 522006c3fb27SDimitry Andric auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX); 522106c3fb27SDimitry Andric 522206c3fb27SDimitry Andric auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2); 522306c3fb27SDimitry Andric 522406c3fb27SDimitry Andric // Scale down the result. 522506c3fb27SDimitry Andric auto ScaleDownFactor = B.buildConstant(S32, -128); 522606c3fb27SDimitry Andric auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt); 522706c3fb27SDimitry Andric SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags); 522806c3fb27SDimitry Andric 522906c3fb27SDimitry Andric // TODO: Switch to fcmp oeq 0 for finite only. 
Can't fully remove this check
523006c3fb27SDimitry Andric   // with finite only or nsz because rsq(+/-0) = +/-inf
523106c3fb27SDimitry Andric 
523206c3fb27SDimitry Andric   // TODO: Check for DAZ and expand to subnormals
523306c3fb27SDimitry Andric   auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
523406c3fb27SDimitry Andric 
523506c3fb27SDimitry Andric   // If x is +INF, +0, or -0, use its original value
523606c3fb27SDimitry Andric   B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);
523706c3fb27SDimitry Andric 
523806c3fb27SDimitry Andric   MI.eraseFromParent();
523906c3fb27SDimitry Andric   return true;
524006c3fb27SDimitry Andric }
524106c3fb27SDimitry Andric 
52425f757f3fSDimitry Andric bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI,
52435f757f3fSDimitry Andric                                         MachineRegisterInfo &MRI,
52445f757f3fSDimitry Andric                                         MachineIRBuilder &B) const {
52455f757f3fSDimitry Andric   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
52465f757f3fSDimitry Andric   if (Ty == LLT::scalar(32))
52475f757f3fSDimitry Andric     return legalizeFSQRTF32(MI, MRI, B);
52485f757f3fSDimitry Andric   if (Ty == LLT::scalar(64))
52495f757f3fSDimitry Andric     return legalizeFSQRTF64(MI, MRI, B);
52505f757f3fSDimitry Andric   if (Ty == LLT::scalar(16))
52515f757f3fSDimitry Andric     return legalizeFSQRTF16(MI, MRI, B);
52525f757f3fSDimitry Andric   return false;
52535f757f3fSDimitry Andric }
52545f757f3fSDimitry Andric 
5255e8d8bef9SDimitry Andric // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
5256e8d8bef9SDimitry Andric // FIXME: Why do we handle this one but not other removed instructions?
5257e8d8bef9SDimitry Andric //
5258e8d8bef9SDimitry Andric // Reciprocal square root.  The clamp prevents infinite results, clamping
5259e8d8bef9SDimitry Andric // infinities to max_float.  D.f = 1.0 / sqrt(S0.f), result clamped to
5260e8d8bef9SDimitry Andric // +-max_float.
5261e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
5262e8d8bef9SDimitry Andric                                                     MachineRegisterInfo &MRI,
5263e8d8bef9SDimitry Andric                                                     MachineIRBuilder &B) const {
5264e8d8bef9SDimitry Andric   if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
5265e8d8bef9SDimitry Andric     return true;
5266e8d8bef9SDimitry Andric 
5267e8d8bef9SDimitry Andric   Register Dst = MI.getOperand(0).getReg();
5268e8d8bef9SDimitry Andric   Register Src = MI.getOperand(2).getReg();
5269e8d8bef9SDimitry Andric   auto Flags = MI.getFlags();
5270e8d8bef9SDimitry Andric 
5271e8d8bef9SDimitry Andric   LLT Ty = MRI.getType(Dst);
5272e8d8bef9SDimitry Andric 
5273e8d8bef9SDimitry Andric   const fltSemantics *FltSemantics;
5274e8d8bef9SDimitry Andric   if (Ty == LLT::scalar(32))
5275e8d8bef9SDimitry Andric     FltSemantics = &APFloat::IEEEsingle();
5276e8d8bef9SDimitry Andric   else if (Ty == LLT::scalar(64))
5277e8d8bef9SDimitry Andric     FltSemantics = &APFloat::IEEEdouble();
5278e8d8bef9SDimitry Andric   else
5279e8d8bef9SDimitry Andric     return false;
5280e8d8bef9SDimitry Andric 
52815f757f3fSDimitry Andric   auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
5282e8d8bef9SDimitry Andric                  .addUse(Src)
5283e8d8bef9SDimitry Andric                  .setMIFlags(Flags);
5284e8d8bef9SDimitry Andric 
5285e8d8bef9SDimitry Andric   // We don't need to concern ourselves with the snan handling difference, since
5286e8d8bef9SDimitry Andric   // the rsq result is quieted (or not) either way; use the one which will directly select.
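// In formula form, the code below builds approximately:
//
//   rsq_clamp(x) = fmax(fmin(rsq(x), +max_float), -max_float)
//
// choosing the IEEE or non-IEEE min/max flavor from the function's FP mode
// so the generic nodes select directly.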
5287e8d8bef9SDimitry Andric const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 5288e8d8bef9SDimitry Andric const bool UseIEEE = MFI->getMode().IEEE; 5289e8d8bef9SDimitry Andric 5290e8d8bef9SDimitry Andric auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 5291e8d8bef9SDimitry Andric auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 5292e8d8bef9SDimitry Andric B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 5293e8d8bef9SDimitry Andric 5294e8d8bef9SDimitry Andric auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 5295e8d8bef9SDimitry Andric 5296e8d8bef9SDimitry Andric if (UseIEEE) 5297e8d8bef9SDimitry Andric B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 5298e8d8bef9SDimitry Andric else 5299e8d8bef9SDimitry Andric B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 5300e8d8bef9SDimitry Andric MI.eraseFromParent(); 5301e8d8bef9SDimitry Andric return true; 5302e8d8bef9SDimitry Andric } 5303e8d8bef9SDimitry Andric 5304e8d8bef9SDimitry Andric static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 5305e8d8bef9SDimitry Andric switch (IID) { 5306e8d8bef9SDimitry Andric case Intrinsic::amdgcn_ds_fadd: 5307e8d8bef9SDimitry Andric return AMDGPU::G_ATOMICRMW_FADD; 5308e8d8bef9SDimitry Andric case Intrinsic::amdgcn_ds_fmin: 5309e8d8bef9SDimitry Andric return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 5310e8d8bef9SDimitry Andric case Intrinsic::amdgcn_ds_fmax: 5311e8d8bef9SDimitry Andric return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 5312e8d8bef9SDimitry Andric default: 5313e8d8bef9SDimitry Andric llvm_unreachable("not a DS FP intrinsic"); 5314e8d8bef9SDimitry Andric } 5315e8d8bef9SDimitry Andric } 5316e8d8bef9SDimitry Andric 5317e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 5318e8d8bef9SDimitry Andric MachineInstr &MI, 5319e8d8bef9SDimitry Andric Intrinsic::ID IID) const { 5320e8d8bef9SDimitry Andric GISelChangeObserver &Observer = Helper.Observer; 5321e8d8bef9SDimitry Andric Observer.changingInstr(MI); 5322e8d8bef9SDimitry Andric 5323e8d8bef9SDimitry Andric MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 5324e8d8bef9SDimitry Andric 5325e8d8bef9SDimitry Andric // The remaining operands were used to set fields in the MemOperand on 5326e8d8bef9SDimitry Andric // construction. 5327e8d8bef9SDimitry Andric for (int I = 6; I > 3; --I) 532881ad6265SDimitry Andric MI.removeOperand(I); 5329e8d8bef9SDimitry Andric 533081ad6265SDimitry Andric MI.removeOperand(1); // Remove the intrinsic ID. 
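// Operands are erased from the highest index down so the remaining indices
// stay stable while erasing. Assuming the usual intrinsic form, what remains
// is the generic shape
//   %dst = G_ATOMICRMW_* %ptr, %data
// with the ordering/scope/volatile information already captured by the MMO.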
5331e8d8bef9SDimitry Andric Observer.changedInstr(MI); 5332e8d8bef9SDimitry Andric return true; 5333e8d8bef9SDimitry Andric } 5334e8d8bef9SDimitry Andric 5335e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 5336e8d8bef9SDimitry Andric MachineRegisterInfo &MRI, 5337e8d8bef9SDimitry Andric MachineIRBuilder &B) const { 5338e8d8bef9SDimitry Andric uint64_t Offset = 5339e8d8bef9SDimitry Andric ST.getTargetLowering()->getImplicitParameterOffset( 5340e8d8bef9SDimitry Andric B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 5341e8d8bef9SDimitry Andric LLT DstTy = MRI.getType(DstReg); 5342e8d8bef9SDimitry Andric LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 5343e8d8bef9SDimitry Andric 5344e8d8bef9SDimitry Andric Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 5345e8d8bef9SDimitry Andric if (!loadInputValue(KernargPtrReg, B, 5346e8d8bef9SDimitry Andric AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 5347e8d8bef9SDimitry Andric return false; 5348e8d8bef9SDimitry Andric 5349e8d8bef9SDimitry Andric // FIXME: This should be nuw 5350e8d8bef9SDimitry Andric B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 5351e8d8bef9SDimitry Andric return true; 5352e8d8bef9SDimitry Andric } 5353e8d8bef9SDimitry Andric 535406c3fb27SDimitry Andric /// To create a buffer resource from a 64-bit pointer, mask off the upper 32 535506c3fb27SDimitry Andric /// bits of the pointer and replace them with the stride argument, then 535606c3fb27SDimitry Andric /// merge_values everything together. In the common case of a raw buffer (the 535706c3fb27SDimitry Andric /// stride component is 0), we can just AND off the upper half. 535806c3fb27SDimitry Andric bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin( 535906c3fb27SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 536006c3fb27SDimitry Andric Register Result = MI.getOperand(0).getReg(); 536106c3fb27SDimitry Andric Register Pointer = MI.getOperand(2).getReg(); 536206c3fb27SDimitry Andric Register Stride = MI.getOperand(3).getReg(); 536306c3fb27SDimitry Andric Register NumRecords = MI.getOperand(4).getReg(); 536406c3fb27SDimitry Andric Register Flags = MI.getOperand(5).getReg(); 536506c3fb27SDimitry Andric 536606c3fb27SDimitry Andric LLT S32 = LLT::scalar(32); 536706c3fb27SDimitry Andric 536806c3fb27SDimitry Andric B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 536906c3fb27SDimitry Andric auto Unmerge = B.buildUnmerge(S32, Pointer); 537006c3fb27SDimitry Andric Register LowHalf = Unmerge.getReg(0); 537106c3fb27SDimitry Andric Register HighHalf = Unmerge.getReg(1); 537206c3fb27SDimitry Andric 537306c3fb27SDimitry Andric auto AndMask = B.buildConstant(S32, 0x0000ffff); 537406c3fb27SDimitry Andric auto Masked = B.buildAnd(S32, HighHalf, AndMask); 537506c3fb27SDimitry Andric 537606c3fb27SDimitry Andric MachineInstrBuilder NewHighHalf = Masked; 537706c3fb27SDimitry Andric std::optional<ValueAndVReg> StrideConst = 537806c3fb27SDimitry Andric getIConstantVRegValWithLookThrough(Stride, MRI); 537906c3fb27SDimitry Andric if (!StrideConst || !StrideConst->Value.isZero()) { 538006c3fb27SDimitry Andric MachineInstrBuilder ShiftedStride; 538106c3fb27SDimitry Andric if (StrideConst) { 538206c3fb27SDimitry Andric uint32_t StrideVal = StrideConst->Value.getZExtValue(); 538306c3fb27SDimitry Andric uint32_t ShiftedStrideVal = StrideVal << 16; 538406c3fb27SDimitry Andric ShiftedStride = B.buildConstant(S32, ShiftedStrideVal); 538506c3fb27SDimitry Andric } else { 538606c3fb27SDimitry 
Andric       auto ExtStride = B.buildAnyExt(S32, Stride);
538706c3fb27SDimitry Andric       auto ShiftConst = B.buildConstant(S32, 16);
538806c3fb27SDimitry Andric       ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
538906c3fb27SDimitry Andric     }
539006c3fb27SDimitry Andric     NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
539106c3fb27SDimitry Andric   }
539206c3fb27SDimitry Andric   Register NewHighHalfReg = NewHighHalf.getReg(0);
539306c3fb27SDimitry Andric   B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
539406c3fb27SDimitry Andric   MI.eraseFromParent();
539506c3fb27SDimitry Andric   return true;
539606c3fb27SDimitry Andric }
539706c3fb27SDimitry Andric 
53980b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
53990b57cec5SDimitry Andric                                                  MachineRegisterInfo &MRI,
54000b57cec5SDimitry Andric                                                  MachineIRBuilder &B) const {
54010b57cec5SDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
54020b57cec5SDimitry Andric   if (!MFI->isEntryFunction()) {
54030b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
54040b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
54050b57cec5SDimitry Andric   }
54060b57cec5SDimitry Andric 
54070b57cec5SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
5408e8d8bef9SDimitry Andric   if (!getImplicitArgPtr(DstReg, MRI, B))
54090b57cec5SDimitry Andric     return false;
54100b57cec5SDimitry Andric 
54110b57cec5SDimitry Andric   MI.eraseFromParent();
54120b57cec5SDimitry Andric   return true;
54130b57cec5SDimitry Andric }
54140b57cec5SDimitry Andric 
5415fcaf7f86SDimitry Andric bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg,
5416fcaf7f86SDimitry Andric                                          MachineRegisterInfo &MRI,
5417fcaf7f86SDimitry Andric                                          MachineIRBuilder &B) const {
5418fcaf7f86SDimitry Andric   Function &F = B.getMF().getFunction();
5419bdd1243dSDimitry Andric   std::optional<uint32_t> KnownSize =
5420fcaf7f86SDimitry Andric       AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
5421fcaf7f86SDimitry Andric   if (KnownSize.has_value())
5422bdd1243dSDimitry Andric     B.buildConstant(DstReg, *KnownSize);
5423fcaf7f86SDimitry Andric   return KnownSize.has_value();
5424fcaf7f86SDimitry Andric }
5425fcaf7f86SDimitry Andric 
5426fcaf7f86SDimitry Andric bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI,
5427fcaf7f86SDimitry Andric                                               MachineRegisterInfo &MRI,
5428fcaf7f86SDimitry Andric                                               MachineIRBuilder &B) const {
5429fcaf7f86SDimitry Andric 
5430fcaf7f86SDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5431fcaf7f86SDimitry Andric   if (!MFI->isEntryFunction()) {
5432fcaf7f86SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
5433fcaf7f86SDimitry Andric                                       AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
5434fcaf7f86SDimitry Andric   }
5435fcaf7f86SDimitry Andric 
5436fcaf7f86SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
5437fcaf7f86SDimitry Andric   if (!getLDSKernelId(DstReg, MRI, B))
5438fcaf7f86SDimitry Andric     return false;
5439fcaf7f86SDimitry Andric 
5440fcaf7f86SDimitry Andric   MI.eraseFromParent();
5441fcaf7f86SDimitry Andric   return true;
5442fcaf7f86SDimitry Andric }
5443fcaf7f86SDimitry Andric 
54448bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
54458bcb0991SDimitry Andric                                               MachineRegisterInfo &MRI,
54468bcb0991SDimitry Andric                                               MachineIRBuilder &B,
54478bcb0991SDimitry Andric                                               unsigned AddrSpace) const {
54488bcb0991SDimitry Andric   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
5449e8d8bef9SDimitry Andric   auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
5450e8d8bef9SDimitry Andric   Register Hi32 = Unmerge.getReg(1);
5451e8d8bef9SDimitry Andric 
54528bcb0991SDimitry Andric   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
54538bcb0991SDimitry Andric   MI.eraseFromParent();
54548bcb0991SDimitry Andric   return true;
54558bcb0991SDimitry Andric }
54568bcb0991SDimitry Andric 
54575ffd83dbSDimitry Andric // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
54585ffd83dbSDimitry Andric // offset (the offset that is included in bounds checking and swizzling, to be
54595ffd83dbSDimitry Andric // split between the instruction's voffset and immoffset fields) and soffset
54605ffd83dbSDimitry Andric // (the offset that is excluded from bounds checking and swizzling, to go in
54615ffd83dbSDimitry Andric // the instruction's soffset field). This function takes the first kind of
54625ffd83dbSDimitry Andric // offset and figures out how to split it between voffset and immoffset.
5463fe6060f1SDimitry Andric std::pair<Register, unsigned>
54645ffd83dbSDimitry Andric AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
54655ffd83dbSDimitry Andric                                         Register OrigOffset) const {
54665f757f3fSDimitry Andric   const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
54675ffd83dbSDimitry Andric   Register BaseReg;
5468fe6060f1SDimitry Andric   unsigned ImmOffset;
54695ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
5470fe6060f1SDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
54715ffd83dbSDimitry Andric 
5472fe6060f1SDimitry Andric   std::tie(BaseReg, ImmOffset) =
5473fe6060f1SDimitry Andric       AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);
54745ffd83dbSDimitry Andric 
5475fe6060f1SDimitry Andric   // If BaseReg is a pointer, convert it to int.
5476fe6060f1SDimitry Andric   if (MRI.getType(BaseReg).isPointer())
5477fe6060f1SDimitry Andric     BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
54785ffd83dbSDimitry Andric 
547906c3fb27SDimitry Andric   // If the immediate value is too big for the immoffset field, put only bits
548006c3fb27SDimitry Andric   // that would normally fit in the immoffset field. The remaining value that
548106c3fb27SDimitry Andric   // is copied/added for the voffset field is a large power of 2, and it
548206c3fb27SDimitry Andric   // stands more chance of being CSEd with the copy/add for another similar
548306c3fb27SDimitry Andric   // load/store.
548406c3fb27SDimitry Andric   // However, do not do that rounding down if the remaining voffset part
548506c3fb27SDimitry Andric   // would be negative, as it appears to be illegal to have a negative offset
548606c3fb27SDimitry Andric   // in the vgpr, even if adding the immediate offset makes it positive.
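// Worked example, assuming MaxImm == 4095 (the 12-bit MUBUF immediate on
// most subtargets): an incoming constant offset of 5000 splits into
// Overflow = 5000 & ~4095 = 4096 (a power of 2, added to the voffset
// register) and ImmOffset = 904. An incoming offset of -8 would make the
// overflow negative, so the whole -8 is moved to the voffset register and
// ImmOffset becomes 0.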
54875ffd83dbSDimitry Andric unsigned Overflow = ImmOffset & ~MaxImm; 54885ffd83dbSDimitry Andric ImmOffset -= Overflow; 54895ffd83dbSDimitry Andric if ((int32_t)Overflow < 0) { 54905ffd83dbSDimitry Andric Overflow += ImmOffset; 54915ffd83dbSDimitry Andric ImmOffset = 0; 54925ffd83dbSDimitry Andric } 54935ffd83dbSDimitry Andric 54945ffd83dbSDimitry Andric if (Overflow != 0) { 54955ffd83dbSDimitry Andric if (!BaseReg) { 54965ffd83dbSDimitry Andric BaseReg = B.buildConstant(S32, Overflow).getReg(0); 54975ffd83dbSDimitry Andric } else { 54985ffd83dbSDimitry Andric auto OverflowVal = B.buildConstant(S32, Overflow); 54995ffd83dbSDimitry Andric BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 55005ffd83dbSDimitry Andric } 55015ffd83dbSDimitry Andric } 55025ffd83dbSDimitry Andric 55035ffd83dbSDimitry Andric if (!BaseReg) 55045ffd83dbSDimitry Andric BaseReg = B.buildConstant(S32, 0).getReg(0); 55055ffd83dbSDimitry Andric 5506bdd1243dSDimitry Andric return std::pair(BaseReg, ImmOffset); 5507fe6060f1SDimitry Andric } 5508fe6060f1SDimitry Andric 55098bcb0991SDimitry Andric /// Handle register layout difference for f16 images for some subtargets. 55108bcb0991SDimitry Andric Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 55118bcb0991SDimitry Andric MachineRegisterInfo &MRI, 5512e8d8bef9SDimitry Andric Register Reg, 5513e8d8bef9SDimitry Andric bool ImageStore) const { 55148bcb0991SDimitry Andric const LLT S16 = LLT::scalar(16); 55158bcb0991SDimitry Andric const LLT S32 = LLT::scalar(32); 55168bcb0991SDimitry Andric LLT StoreVT = MRI.getType(Reg); 55178bcb0991SDimitry Andric assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 55188bcb0991SDimitry Andric 5519e8d8bef9SDimitry Andric if (ST.hasUnpackedD16VMem()) { 55208bcb0991SDimitry Andric auto Unmerge = B.buildUnmerge(S16, Reg); 55218bcb0991SDimitry Andric 55228bcb0991SDimitry Andric SmallVector<Register, 4> WideRegs; 55238bcb0991SDimitry Andric for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 55248bcb0991SDimitry Andric WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 55258bcb0991SDimitry Andric 55268bcb0991SDimitry Andric int NumElts = StoreVT.getNumElements(); 55278bcb0991SDimitry Andric 5528fe6060f1SDimitry Andric return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs) 5529fe6060f1SDimitry Andric .getReg(0); 55308bcb0991SDimitry Andric } 55318bcb0991SDimitry Andric 5532e8d8bef9SDimitry Andric if (ImageStore && ST.hasImageStoreD16Bug()) { 5533e8d8bef9SDimitry Andric if (StoreVT.getNumElements() == 2) { 5534e8d8bef9SDimitry Andric SmallVector<Register, 4> PackedRegs; 5535e8d8bef9SDimitry Andric Reg = B.buildBitcast(S32, Reg).getReg(0); 5536e8d8bef9SDimitry Andric PackedRegs.push_back(Reg); 5537e8d8bef9SDimitry Andric PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 5538fe6060f1SDimitry Andric return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs) 5539fe6060f1SDimitry Andric .getReg(0); 5540e8d8bef9SDimitry Andric } 5541e8d8bef9SDimitry Andric 5542e8d8bef9SDimitry Andric if (StoreVT.getNumElements() == 3) { 5543e8d8bef9SDimitry Andric SmallVector<Register, 4> PackedRegs; 5544e8d8bef9SDimitry Andric auto Unmerge = B.buildUnmerge(S16, Reg); 5545e8d8bef9SDimitry Andric for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 5546e8d8bef9SDimitry Andric PackedRegs.push_back(Unmerge.getReg(I)); 5547e8d8bef9SDimitry Andric PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 5548fe6060f1SDimitry Andric Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), 
PackedRegs).getReg(0);
5549fe6060f1SDimitry Andric       return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
5550e8d8bef9SDimitry Andric     }
5551e8d8bef9SDimitry Andric 
5552e8d8bef9SDimitry Andric     if (StoreVT.getNumElements() == 4) {
5553e8d8bef9SDimitry Andric       SmallVector<Register, 4> PackedRegs;
5554fe6060f1SDimitry Andric       Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
5555e8d8bef9SDimitry Andric       auto Unmerge = B.buildUnmerge(S32, Reg);
5556e8d8bef9SDimitry Andric       for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
5557e8d8bef9SDimitry Andric         PackedRegs.push_back(Unmerge.getReg(I));
5558e8d8bef9SDimitry Andric       PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
5559fe6060f1SDimitry Andric       return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
5560fe6060f1SDimitry Andric           .getReg(0);
5561e8d8bef9SDimitry Andric     }
5562e8d8bef9SDimitry Andric 
5563e8d8bef9SDimitry Andric     llvm_unreachable("invalid data type");
5564e8d8bef9SDimitry Andric   }
5565e8d8bef9SDimitry Andric 
55660eae32dcSDimitry Andric   if (StoreVT == LLT::fixed_vector(3, S16)) {
55670eae32dcSDimitry Andric     Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
55680eae32dcSDimitry Andric               .getReg(0);
55690eae32dcSDimitry Andric   }
5570e8d8bef9SDimitry Andric   return Reg;
5571e8d8bef9SDimitry Andric }
5572e8d8bef9SDimitry Andric 
55735ffd83dbSDimitry Andric Register AMDGPULegalizerInfo::fixStoreSourceType(
55745ffd83dbSDimitry Andric     MachineIRBuilder &B, Register VData, bool IsFormat) const {
55755ffd83dbSDimitry Andric   MachineRegisterInfo *MRI = B.getMRI();
55765ffd83dbSDimitry Andric   LLT Ty = MRI->getType(VData);
55778bcb0991SDimitry Andric 
55788bcb0991SDimitry Andric   const LLT S16 = LLT::scalar(16);
55798bcb0991SDimitry Andric 
558006c3fb27SDimitry Andric   // Fixup buffer resources themselves needing to be v4i32.
558106c3fb27SDimitry Andric   if (hasBufferRsrcWorkaround(Ty))
558206c3fb27SDimitry Andric     return castBufferRsrcToV4I32(VData, B);
558306c3fb27SDimitry Andric 
55848bcb0991SDimitry Andric   // Fixup illegal register types for i8 stores.
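// (An anyext to 32 bits is sufficient here: the byte and short buffer-store
// instructions read only the low bits of the 32-bit source VGPR, so the
// high bits may hold garbage.)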
55858bcb0991SDimitry Andric if (Ty == LLT::scalar(8) || Ty == S16) { 55868bcb0991SDimitry Andric Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 55875ffd83dbSDimitry Andric return AnyExt; 55888bcb0991SDimitry Andric } 55898bcb0991SDimitry Andric 55908bcb0991SDimitry Andric if (Ty.isVector()) { 55918bcb0991SDimitry Andric if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 55928bcb0991SDimitry Andric if (IsFormat) 55935ffd83dbSDimitry Andric return handleD16VData(B, *MRI, VData); 55945ffd83dbSDimitry Andric } 55955ffd83dbSDimitry Andric } 55965ffd83dbSDimitry Andric 55975ffd83dbSDimitry Andric return VData; 55985ffd83dbSDimitry Andric } 55995ffd83dbSDimitry Andric 56005ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 56015ffd83dbSDimitry Andric MachineRegisterInfo &MRI, 56025ffd83dbSDimitry Andric MachineIRBuilder &B, 56035ffd83dbSDimitry Andric bool IsTyped, 56045ffd83dbSDimitry Andric bool IsFormat) const { 56055ffd83dbSDimitry Andric Register VData = MI.getOperand(1).getReg(); 56065ffd83dbSDimitry Andric LLT Ty = MRI.getType(VData); 56075ffd83dbSDimitry Andric LLT EltTy = Ty.getScalarType(); 56085ffd83dbSDimitry Andric const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 56095ffd83dbSDimitry Andric const LLT S32 = LLT::scalar(32); 56105ffd83dbSDimitry Andric 56115ffd83dbSDimitry Andric VData = fixStoreSourceType(B, VData, IsFormat); 561206c3fb27SDimitry Andric castBufferRsrcArgToV4I32(MI, B, 2); 56135ffd83dbSDimitry Andric Register RSrc = MI.getOperand(2).getReg(); 56145ffd83dbSDimitry Andric 56155ffd83dbSDimitry Andric MachineMemOperand *MMO = *MI.memoperands_begin(); 56165ffd83dbSDimitry Andric const int MemSize = MMO->getSize(); 56175ffd83dbSDimitry Andric 56185ffd83dbSDimitry Andric unsigned ImmOffset; 56195ffd83dbSDimitry Andric 56205ffd83dbSDimitry Andric // The typed intrinsics add an immediate after the registers. 56215ffd83dbSDimitry Andric const unsigned NumVIndexOps = IsTyped ? 8 : 7; 56225ffd83dbSDimitry Andric 56235ffd83dbSDimitry Andric // The struct intrinsic variants add one additional operand over raw. 56245ffd83dbSDimitry Andric const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 56255ffd83dbSDimitry Andric Register VIndex; 56265ffd83dbSDimitry Andric int OpOffset = 0; 56275ffd83dbSDimitry Andric if (HasVIndex) { 56285ffd83dbSDimitry Andric VIndex = MI.getOperand(3).getReg(); 56295ffd83dbSDimitry Andric OpOffset = 1; 5630fe6060f1SDimitry Andric } else { 5631fe6060f1SDimitry Andric VIndex = B.buildConstant(S32, 0).getReg(0); 56325ffd83dbSDimitry Andric } 56335ffd83dbSDimitry Andric 56345ffd83dbSDimitry Andric Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 56355ffd83dbSDimitry Andric Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 56365ffd83dbSDimitry Andric 56375ffd83dbSDimitry Andric unsigned Format = 0; 56385ffd83dbSDimitry Andric if (IsTyped) { 56395ffd83dbSDimitry Andric Format = MI.getOperand(5 + OpOffset).getImm(); 56405ffd83dbSDimitry Andric ++OpOffset; 56415ffd83dbSDimitry Andric } 56425ffd83dbSDimitry Andric 56435ffd83dbSDimitry Andric unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 56445ffd83dbSDimitry Andric 5645fe6060f1SDimitry Andric std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 56465ffd83dbSDimitry Andric 56475ffd83dbSDimitry Andric unsigned Opc; 56485ffd83dbSDimitry Andric if (IsTyped) { 56495ffd83dbSDimitry Andric Opc = IsD16 ? 
AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : 56505ffd83dbSDimitry Andric AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; 56515ffd83dbSDimitry Andric } else if (IsFormat) { 56525ffd83dbSDimitry Andric Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : 56535ffd83dbSDimitry Andric AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; 56545ffd83dbSDimitry Andric } else { 56555ffd83dbSDimitry Andric switch (MemSize) { 56565ffd83dbSDimitry Andric case 1: 56575ffd83dbSDimitry Andric Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; 56585ffd83dbSDimitry Andric break; 56595ffd83dbSDimitry Andric case 2: 56605ffd83dbSDimitry Andric Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; 56615ffd83dbSDimitry Andric break; 56625ffd83dbSDimitry Andric default: 56635ffd83dbSDimitry Andric Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; 56645ffd83dbSDimitry Andric break; 56655ffd83dbSDimitry Andric } 56665ffd83dbSDimitry Andric } 56675ffd83dbSDimitry Andric 56685ffd83dbSDimitry Andric auto MIB = B.buildInstr(Opc) 56695ffd83dbSDimitry Andric .addUse(VData) // vdata 56705ffd83dbSDimitry Andric .addUse(RSrc) // rsrc 56715ffd83dbSDimitry Andric .addUse(VIndex) // vindex 56725ffd83dbSDimitry Andric .addUse(VOffset) // voffset 56735ffd83dbSDimitry Andric .addUse(SOffset) // soffset 56745ffd83dbSDimitry Andric .addImm(ImmOffset); // offset(imm) 56755ffd83dbSDimitry Andric 56765ffd83dbSDimitry Andric if (IsTyped) 56775ffd83dbSDimitry Andric MIB.addImm(Format); 56785ffd83dbSDimitry Andric 56795ffd83dbSDimitry Andric MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 56805ffd83dbSDimitry Andric .addImm(HasVIndex ? -1 : 0) // idxen(imm) 56815ffd83dbSDimitry Andric .addMemOperand(MMO); 56825ffd83dbSDimitry Andric 56835ffd83dbSDimitry Andric MI.eraseFromParent(); 56848bcb0991SDimitry Andric return true; 56858bcb0991SDimitry Andric } 56868bcb0991SDimitry Andric 5687bdd1243dSDimitry Andric static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc, 5688bdd1243dSDimitry Andric Register VIndex, Register VOffset, Register SOffset, 5689bdd1243dSDimitry Andric unsigned ImmOffset, unsigned Format, 5690bdd1243dSDimitry Andric unsigned AuxiliaryData, MachineMemOperand *MMO, 5691bdd1243dSDimitry Andric bool IsTyped, bool HasVIndex, MachineIRBuilder &B) { 5692bdd1243dSDimitry Andric auto MIB = B.buildInstr(Opc) 5693bdd1243dSDimitry Andric .addDef(LoadDstReg) // vdata 5694bdd1243dSDimitry Andric .addUse(RSrc) // rsrc 5695bdd1243dSDimitry Andric .addUse(VIndex) // vindex 5696bdd1243dSDimitry Andric .addUse(VOffset) // voffset 5697bdd1243dSDimitry Andric .addUse(SOffset) // soffset 5698bdd1243dSDimitry Andric .addImm(ImmOffset); // offset(imm) 5699bdd1243dSDimitry Andric 5700bdd1243dSDimitry Andric if (IsTyped) 5701bdd1243dSDimitry Andric MIB.addImm(Format); 5702bdd1243dSDimitry Andric 5703bdd1243dSDimitry Andric MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 5704bdd1243dSDimitry Andric .addImm(HasVIndex ? -1 : 0) // idxen(imm) 5705bdd1243dSDimitry Andric .addMemOperand(MMO); 5706bdd1243dSDimitry Andric } 5707bdd1243dSDimitry Andric 57085ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, 57095ffd83dbSDimitry Andric MachineRegisterInfo &MRI, 57105ffd83dbSDimitry Andric MachineIRBuilder &B, 57115ffd83dbSDimitry Andric bool IsFormat, 57125ffd83dbSDimitry Andric bool IsTyped) const { 57135ffd83dbSDimitry Andric // FIXME: Verifier should enforce 1 MMO for these intrinsics. 
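// Sketch of the operand layout decoded below: dst(def), [status(def) if
// TFE], intrinsic ID, rsrc, [vindex if struct form], voffset, soffset,
// [format if typed], aux. A raw, untyped load therefore ends up as roughly:
//   %dst = G_AMDGPU_BUFFER_LOAD %rsrc, %zero, %voffset, %soffset,
//          imm_offset, aux, 0 /*idxen*/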
57145ffd83dbSDimitry Andric MachineMemOperand *MMO = *MI.memoperands_begin(); 5715fe6060f1SDimitry Andric const LLT MemTy = MMO->getMemoryType(); 57165ffd83dbSDimitry Andric const LLT S32 = LLT::scalar(32); 57175ffd83dbSDimitry Andric 57185ffd83dbSDimitry Andric Register Dst = MI.getOperand(0).getReg(); 5719bdd1243dSDimitry Andric 5720bdd1243dSDimitry Andric Register StatusDst; 5721bdd1243dSDimitry Andric int OpOffset = 0; 5722bdd1243dSDimitry Andric assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2); 5723bdd1243dSDimitry Andric bool IsTFE = MI.getNumExplicitDefs() == 2; 5724bdd1243dSDimitry Andric if (IsTFE) { 5725bdd1243dSDimitry Andric StatusDst = MI.getOperand(1).getReg(); 5726bdd1243dSDimitry Andric ++OpOffset; 5727bdd1243dSDimitry Andric } 5728bdd1243dSDimitry Andric 572906c3fb27SDimitry Andric castBufferRsrcArgToV4I32(MI, B, 2 + OpOffset); 5730bdd1243dSDimitry Andric Register RSrc = MI.getOperand(2 + OpOffset).getReg(); 57315ffd83dbSDimitry Andric 57325ffd83dbSDimitry Andric // The typed intrinsics add an immediate after the registers. 57335ffd83dbSDimitry Andric const unsigned NumVIndexOps = IsTyped ? 8 : 7; 57345ffd83dbSDimitry Andric 57355ffd83dbSDimitry Andric // The struct intrinsic variants add one additional operand over raw. 5736bdd1243dSDimitry Andric const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset; 57375ffd83dbSDimitry Andric Register VIndex; 57385ffd83dbSDimitry Andric if (HasVIndex) { 5739bdd1243dSDimitry Andric VIndex = MI.getOperand(3 + OpOffset).getReg(); 5740bdd1243dSDimitry Andric ++OpOffset; 5741fe6060f1SDimitry Andric } else { 5742fe6060f1SDimitry Andric VIndex = B.buildConstant(S32, 0).getReg(0); 57438bcb0991SDimitry Andric } 57448bcb0991SDimitry Andric 57455ffd83dbSDimitry Andric Register VOffset = MI.getOperand(3 + OpOffset).getReg(); 57465ffd83dbSDimitry Andric Register SOffset = MI.getOperand(4 + OpOffset).getReg(); 57475ffd83dbSDimitry Andric 57485ffd83dbSDimitry Andric unsigned Format = 0; 57495ffd83dbSDimitry Andric if (IsTyped) { 57505ffd83dbSDimitry Andric Format = MI.getOperand(5 + OpOffset).getImm(); 57515ffd83dbSDimitry Andric ++OpOffset; 57528bcb0991SDimitry Andric } 57538bcb0991SDimitry Andric 57545ffd83dbSDimitry Andric unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); 57555ffd83dbSDimitry Andric unsigned ImmOffset; 57565ffd83dbSDimitry Andric 57575ffd83dbSDimitry Andric LLT Ty = MRI.getType(Dst); 575806c3fb27SDimitry Andric // Make addrspace 8 pointers loads into 4xs32 loads here, so the rest of the 575906c3fb27SDimitry Andric // logic doesn't have to handle that case. 576006c3fb27SDimitry Andric if (hasBufferRsrcWorkaround(Ty)) { 576106c3fb27SDimitry Andric Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0); 576206c3fb27SDimitry Andric Dst = MI.getOperand(0).getReg(); 576306c3fb27SDimitry Andric } 57645ffd83dbSDimitry Andric LLT EltTy = Ty.getScalarType(); 57655ffd83dbSDimitry Andric const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 57665ffd83dbSDimitry Andric const bool Unpacked = ST.hasUnpackedD16VMem(); 57675ffd83dbSDimitry Andric 5768fe6060f1SDimitry Andric std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 57695ffd83dbSDimitry Andric 57705ffd83dbSDimitry Andric unsigned Opc; 57715ffd83dbSDimitry Andric 5772bdd1243dSDimitry Andric // TODO: Support TFE for typed and narrow loads. 57735ffd83dbSDimitry Andric if (IsTyped) { 5774bdd1243dSDimitry Andric if (IsTFE) 5775bdd1243dSDimitry Andric return false; 57765ffd83dbSDimitry Andric Opc = IsD16 ? 
AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : 57775ffd83dbSDimitry Andric AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; 57785ffd83dbSDimitry Andric } else if (IsFormat) { 5779bdd1243dSDimitry Andric if (IsD16) { 5780bdd1243dSDimitry Andric if (IsTFE) 5781bdd1243dSDimitry Andric return false; 5782bdd1243dSDimitry Andric Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16; 57835ffd83dbSDimitry Andric } else { 5784bdd1243dSDimitry Andric Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE 5785bdd1243dSDimitry Andric : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; 5786bdd1243dSDimitry Andric } 5787bdd1243dSDimitry Andric } else { 5788bdd1243dSDimitry Andric if (IsTFE) 5789bdd1243dSDimitry Andric return false; 5790fe6060f1SDimitry Andric switch (MemTy.getSizeInBits()) { 5791fe6060f1SDimitry Andric case 8: 57925ffd83dbSDimitry Andric Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; 57935ffd83dbSDimitry Andric break; 5794fe6060f1SDimitry Andric case 16: 57955ffd83dbSDimitry Andric Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 57965ffd83dbSDimitry Andric break; 57975ffd83dbSDimitry Andric default: 57985ffd83dbSDimitry Andric Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; 57995ffd83dbSDimitry Andric break; 58005ffd83dbSDimitry Andric } 58015ffd83dbSDimitry Andric } 58025ffd83dbSDimitry Andric 5803bdd1243dSDimitry Andric if (IsTFE) { 5804bdd1243dSDimitry Andric unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32); 5805bdd1243dSDimitry Andric unsigned NumLoadDWords = NumValueDWords + 1; 5806bdd1243dSDimitry Andric LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32); 5807bdd1243dSDimitry Andric Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy); 5808bdd1243dSDimitry Andric buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset, 5809bdd1243dSDimitry Andric Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B); 5810bdd1243dSDimitry Andric if (NumValueDWords == 1) { 5811bdd1243dSDimitry Andric B.buildUnmerge({Dst, StatusDst}, LoadDstReg); 5812bdd1243dSDimitry Andric } else { 5813bdd1243dSDimitry Andric SmallVector<Register, 5> LoadElts; 5814bdd1243dSDimitry Andric for (unsigned I = 0; I != NumValueDWords; ++I) 5815bdd1243dSDimitry Andric LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32)); 5816bdd1243dSDimitry Andric LoadElts.push_back(StatusDst); 5817bdd1243dSDimitry Andric B.buildUnmerge(LoadElts, LoadDstReg); 5818bdd1243dSDimitry Andric LoadElts.truncate(NumValueDWords); 5819bdd1243dSDimitry Andric B.buildMergeLikeInstr(Dst, LoadElts); 5820bdd1243dSDimitry Andric } 5821bdd1243dSDimitry Andric } else if ((!IsD16 && MemTy.getSizeInBits() < 32) || 5822bdd1243dSDimitry Andric (IsD16 && !Ty.isVector())) { 5823bdd1243dSDimitry Andric Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); 5824bdd1243dSDimitry Andric buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset, 5825bdd1243dSDimitry Andric Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B); 58265ffd83dbSDimitry Andric B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 58275ffd83dbSDimitry Andric B.buildTrunc(Dst, LoadDstReg); 5828bdd1243dSDimitry Andric } else if (Unpacked && IsD16 && Ty.isVector()) { 5829bdd1243dSDimitry Andric LLT UnpackedTy = Ty.changeElementSize(32); 5830bdd1243dSDimitry Andric Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); 5831bdd1243dSDimitry Andric buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset, 5832bdd1243dSDimitry Andric Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B); 5833bdd1243dSDimitry Andric B.setInsertPt(B.getMBB(), 
++B.getInsertPt()); 58345ffd83dbSDimitry Andric // FIXME: G_TRUNC should work, but legalization currently fails 58355ffd83dbSDimitry Andric auto Unmerge = B.buildUnmerge(S32, LoadDstReg); 58365ffd83dbSDimitry Andric SmallVector<Register, 4> Repack; 58375ffd83dbSDimitry Andric for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) 58385ffd83dbSDimitry Andric Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); 5839bdd1243dSDimitry Andric B.buildMergeLikeInstr(Dst, Repack); 5840bdd1243dSDimitry Andric } else { 5841bdd1243dSDimitry Andric buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format, 5842bdd1243dSDimitry Andric AuxiliaryData, MMO, IsTyped, HasVIndex, B); 58435ffd83dbSDimitry Andric } 58445ffd83dbSDimitry Andric 58455ffd83dbSDimitry Andric MI.eraseFromParent(); 58465ffd83dbSDimitry Andric return true; 58475ffd83dbSDimitry Andric } 58485ffd83dbSDimitry Andric 58495ffd83dbSDimitry Andric static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { 58505ffd83dbSDimitry Andric switch (IntrID) { 58515ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_swap: 585206c3fb27SDimitry Andric case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap: 58535ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_swap: 585406c3fb27SDimitry Andric case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap: 58555ffd83dbSDimitry Andric return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; 58565ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_add: 585706c3fb27SDimitry Andric case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add: 58585ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_add: 585906c3fb27SDimitry Andric case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add: 58605ffd83dbSDimitry Andric return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; 58615ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_sub: 586206c3fb27SDimitry Andric case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub: 58635ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_sub: 586406c3fb27SDimitry Andric case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub: 58655ffd83dbSDimitry Andric return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; 58665ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_smin: 586706c3fb27SDimitry Andric case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin: 58685ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_smin: 586906c3fb27SDimitry Andric case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin: 58705ffd83dbSDimitry Andric return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; 58715ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_umin: 587206c3fb27SDimitry Andric case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin: 58735ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_umin: 587406c3fb27SDimitry Andric case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin: 58755ffd83dbSDimitry Andric return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; 58765ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_smax: 587706c3fb27SDimitry Andric case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax: 58785ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_smax: 587906c3fb27SDimitry Andric case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax: 58805ffd83dbSDimitry Andric return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; 58815ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_umax: 588206c3fb27SDimitry Andric case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax: 58835ffd83dbSDimitry Andric 
case Intrinsic::amdgcn_struct_buffer_atomic_umax: 588406c3fb27SDimitry Andric case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax: 58855ffd83dbSDimitry Andric return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; 58865ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_and: 588706c3fb27SDimitry Andric case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and: 58885ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_and: 588906c3fb27SDimitry Andric case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and: 58905ffd83dbSDimitry Andric return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; 58915ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_or: 589206c3fb27SDimitry Andric case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or: 58935ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_or: 589406c3fb27SDimitry Andric case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or: 58955ffd83dbSDimitry Andric return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; 58965ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_xor: 589706c3fb27SDimitry Andric case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor: 58985ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_xor: 589906c3fb27SDimitry Andric case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor: 59005ffd83dbSDimitry Andric return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; 59015ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_inc: 590206c3fb27SDimitry Andric case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc: 59035ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_inc: 590406c3fb27SDimitry Andric case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc: 59055ffd83dbSDimitry Andric return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; 59065ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_dec: 590706c3fb27SDimitry Andric case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec: 59085ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_dec: 590906c3fb27SDimitry Andric case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec: 59105ffd83dbSDimitry Andric return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; 59115ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 591206c3fb27SDimitry Andric case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: 59135ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 591406c3fb27SDimitry Andric case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: 59155ffd83dbSDimitry Andric return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; 5916e8d8bef9SDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 591706c3fb27SDimitry Andric case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd: 5918e8d8bef9SDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 591906c3fb27SDimitry Andric case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd: 5920e8d8bef9SDimitry Andric return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; 59217a6dacacSDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16: 59227a6dacacSDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16: 59237a6dacacSDimitry Andric return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD_BF16; 5924fe6060f1SDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 592506c3fb27SDimitry Andric case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin: 5926fe6060f1SDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 592706c3fb27SDimitry Andric case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin: 5928fe6060f1SDimitry Andric return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN; 5929fe6060f1SDimitry Andric case 
Intrinsic::amdgcn_raw_buffer_atomic_fmax:
593006c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
5931fe6060f1SDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
593206c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
5933fe6060f1SDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
59347a6dacacSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
59357a6dacacSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
59367a6dacacSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
59375ffd83dbSDimitry Andric   default:
59385ffd83dbSDimitry Andric     llvm_unreachable("unhandled atomic opcode");
59395ffd83dbSDimitry Andric   }
59405ffd83dbSDimitry Andric }
59415ffd83dbSDimitry Andric 
59425ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
59435ffd83dbSDimitry Andric                                                MachineIRBuilder &B,
59445ffd83dbSDimitry Andric                                                Intrinsic::ID IID) const {
594506c3fb27SDimitry Andric   const bool IsCmpSwap =
594606c3fb27SDimitry Andric       IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
594706c3fb27SDimitry Andric       IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
594806c3fb27SDimitry Andric       IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
594906c3fb27SDimitry Andric       IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
59505ffd83dbSDimitry Andric 
59515f757f3fSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
595206c3fb27SDimitry Andric   // Since we don't have 128-bit atomics, we don't need to handle the case of
595306c3fb27SDimitry Andric   // p8 arguments to the atomic itself
59545f757f3fSDimitry Andric   Register VData = MI.getOperand(2).getReg();
59555f757f3fSDimitry Andric 
5956e8d8bef9SDimitry Andric   Register CmpVal;
59575f757f3fSDimitry Andric   int OpOffset = 0;
59585ffd83dbSDimitry Andric 
59595ffd83dbSDimitry Andric   if (IsCmpSwap) {
59605f757f3fSDimitry Andric     CmpVal = MI.getOperand(3).getReg();
59615ffd83dbSDimitry Andric     ++OpOffset;
59625ffd83dbSDimitry Andric   }
59635ffd83dbSDimitry Andric 
596406c3fb27SDimitry Andric   castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset);
59655ffd83dbSDimitry Andric   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
59665f757f3fSDimitry Andric   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
59675ffd83dbSDimitry Andric 
59685ffd83dbSDimitry Andric   // The struct intrinsic variants add one additional operand over raw.
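// For example, a struct cmpswap carries dst(def), intrinsic ID, vdata, cmp,
// rsrc, vindex, voffset, soffset, aux -- 9 operands (NumVIndexOps); the raw
// forms drop the vindex, giving 8 (cmpswap) or 7 operands.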
59695ffd83dbSDimitry Andric const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; 59705ffd83dbSDimitry Andric Register VIndex; 59715ffd83dbSDimitry Andric if (HasVIndex) { 59725ffd83dbSDimitry Andric VIndex = MI.getOperand(4 + OpOffset).getReg(); 59735ffd83dbSDimitry Andric ++OpOffset; 5974fe6060f1SDimitry Andric } else { 5975fe6060f1SDimitry Andric VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); 59765ffd83dbSDimitry Andric } 59775ffd83dbSDimitry Andric 59785ffd83dbSDimitry Andric Register VOffset = MI.getOperand(4 + OpOffset).getReg(); 59795ffd83dbSDimitry Andric Register SOffset = MI.getOperand(5 + OpOffset).getReg(); 59805ffd83dbSDimitry Andric unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); 59815ffd83dbSDimitry Andric 59825ffd83dbSDimitry Andric MachineMemOperand *MMO = *MI.memoperands_begin(); 59835ffd83dbSDimitry Andric 59845ffd83dbSDimitry Andric unsigned ImmOffset; 5985fe6060f1SDimitry Andric std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset); 59865ffd83dbSDimitry Andric 59875f757f3fSDimitry Andric auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)) 59885f757f3fSDimitry Andric .addDef(Dst) 59895f757f3fSDimitry Andric .addUse(VData); // vdata 59905ffd83dbSDimitry Andric 59915ffd83dbSDimitry Andric if (IsCmpSwap) 59925ffd83dbSDimitry Andric MIB.addReg(CmpVal); 59935ffd83dbSDimitry Andric 59945ffd83dbSDimitry Andric MIB.addUse(RSrc) // rsrc 59955ffd83dbSDimitry Andric .addUse(VIndex) // vindex 59965ffd83dbSDimitry Andric .addUse(VOffset) // voffset 59975ffd83dbSDimitry Andric .addUse(SOffset) // soffset 59985ffd83dbSDimitry Andric .addImm(ImmOffset) // offset(imm) 59995ffd83dbSDimitry Andric .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) 60005ffd83dbSDimitry Andric .addImm(HasVIndex ? -1 : 0) // idxen(imm) 60015ffd83dbSDimitry Andric .addMemOperand(MMO); 60025ffd83dbSDimitry Andric 60035ffd83dbSDimitry Andric MI.eraseFromParent(); 60045ffd83dbSDimitry Andric return true; 60055ffd83dbSDimitry Andric } 60065ffd83dbSDimitry Andric 6007fe6060f1SDimitry Andric /// Turn a set of s16 typed registers in \p AddrRegs into a dword sized 60085ffd83dbSDimitry Andric /// vector with s16 typed elements. 6009fe6060f1SDimitry Andric static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, 6010fe6060f1SDimitry Andric SmallVectorImpl<Register> &PackedAddrs, 6011fe6060f1SDimitry Andric unsigned ArgOffset, 6012fe6060f1SDimitry Andric const AMDGPU::ImageDimIntrinsicInfo *Intr, 6013fe6060f1SDimitry Andric bool IsA16, bool IsG16) { 60145ffd83dbSDimitry Andric const LLT S16 = LLT::scalar(16); 6015fe6060f1SDimitry Andric const LLT V2S16 = LLT::fixed_vector(2, 16); 6016fe6060f1SDimitry Andric auto EndIdx = Intr->VAddrEnd; 60175ffd83dbSDimitry Andric 6018e8d8bef9SDimitry Andric for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { 6019e8d8bef9SDimitry Andric MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 60205ffd83dbSDimitry Andric if (!SrcOp.isReg()) 60215ffd83dbSDimitry Andric continue; // _L to _LZ may have eliminated this. 
60225ffd83dbSDimitry Andric 60235ffd83dbSDimitry Andric Register AddrReg = SrcOp.getReg(); 60245ffd83dbSDimitry Andric 6025fe6060f1SDimitry Andric if ((I < Intr->GradientStart) || 6026fe6060f1SDimitry Andric (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) || 6027fe6060f1SDimitry Andric (I >= Intr->CoordStart && !IsA16)) { 60280eae32dcSDimitry Andric if ((I < Intr->GradientStart) && IsA16 && 60290eae32dcSDimitry Andric (B.getMRI()->getType(AddrReg) == S16)) { 603004eeddc0SDimitry Andric assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument"); 60310eae32dcSDimitry Andric // Special handling of bias when A16 is on. Bias is of type half but 60320eae32dcSDimitry Andric // occupies full 32-bit. 60330eae32dcSDimitry Andric PackedAddrs.push_back( 60340eae32dcSDimitry Andric B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 60350eae32dcSDimitry Andric .getReg(0)); 60360eae32dcSDimitry Andric } else { 603704eeddc0SDimitry Andric assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) && 603804eeddc0SDimitry Andric "Bias needs to be converted to 16 bit in A16 mode"); 603904eeddc0SDimitry Andric // Handle any gradient or coordinate operands that should not be packed 60405ffd83dbSDimitry Andric AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); 60415ffd83dbSDimitry Andric PackedAddrs.push_back(AddrReg); 60420eae32dcSDimitry Andric } 60435ffd83dbSDimitry Andric } else { 60445ffd83dbSDimitry Andric // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, 60455ffd83dbSDimitry Andric // derivatives dx/dh and dx/dv are packed with undef. 60465ffd83dbSDimitry Andric if (((I + 1) >= EndIdx) || 6047e8d8bef9SDimitry Andric ((Intr->NumGradients / 2) % 2 == 1 && 6048e8d8bef9SDimitry Andric (I == static_cast<unsigned>(Intr->GradientStart + 6049e8d8bef9SDimitry Andric (Intr->NumGradients / 2) - 1) || 6050e8d8bef9SDimitry Andric I == static_cast<unsigned>(Intr->GradientStart + 6051e8d8bef9SDimitry Andric Intr->NumGradients - 1))) || 60525ffd83dbSDimitry Andric // Check for _L to _LZ optimization 6053e8d8bef9SDimitry Andric !MI.getOperand(ArgOffset + I + 1).isReg()) { 60545ffd83dbSDimitry Andric PackedAddrs.push_back( 60555ffd83dbSDimitry Andric B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) 60565ffd83dbSDimitry Andric .getReg(0)); 60575ffd83dbSDimitry Andric } else { 60585ffd83dbSDimitry Andric PackedAddrs.push_back( 6059e8d8bef9SDimitry Andric B.buildBuildVector( 6060e8d8bef9SDimitry Andric V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()}) 60615ffd83dbSDimitry Andric .getReg(0)); 60625ffd83dbSDimitry Andric ++I; 60635ffd83dbSDimitry Andric } 60645ffd83dbSDimitry Andric } 60655ffd83dbSDimitry Andric } 60665ffd83dbSDimitry Andric } 60675ffd83dbSDimitry Andric 60685ffd83dbSDimitry Andric /// Convert from separate vaddr components to a single vector address register, 60695ffd83dbSDimitry Andric /// and replace the remaining operands with $noreg. 
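/// A hedged sketch of the effect (illustrative MIR, not taken verbatim from
/// this pass): three s32 vaddr operands %x, %y and %z become a single
///   %vaddr:_(<3 x s32>) = G_BUILD_VECTOR %x:_(s32), %y:_(s32), %z:_(s32)
/// in the first vaddr slot, and the remaining slots are set to $noreg.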
60705ffd83dbSDimitry Andric static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
60715ffd83dbSDimitry Andric int DimIdx, int NumVAddrs) {
60725ffd83dbSDimitry Andric const LLT S32 = LLT::scalar(32);
6073bdd1243dSDimitry Andric (void)S32;
60745ffd83dbSDimitry Andric SmallVector<Register, 8> AddrRegs;
60755ffd83dbSDimitry Andric for (int I = 0; I != NumVAddrs; ++I) {
60765ffd83dbSDimitry Andric MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
60775ffd83dbSDimitry Andric if (SrcOp.isReg()) {
60785ffd83dbSDimitry Andric AddrRegs.push_back(SrcOp.getReg());
60795ffd83dbSDimitry Andric assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
60805ffd83dbSDimitry Andric }
60815ffd83dbSDimitry Andric }
60825ffd83dbSDimitry Andric
60835ffd83dbSDimitry Andric int NumAddrRegs = AddrRegs.size();
60845ffd83dbSDimitry Andric if (NumAddrRegs != 1) {
6085fe6060f1SDimitry Andric auto VAddr =
6086fe6060f1SDimitry Andric B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
60875ffd83dbSDimitry Andric MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
60885ffd83dbSDimitry Andric }
60895ffd83dbSDimitry Andric
60905ffd83dbSDimitry Andric for (int I = 1; I != NumVAddrs; ++I) {
60915ffd83dbSDimitry Andric MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
60925ffd83dbSDimitry Andric if (SrcOp.isReg())
60935ffd83dbSDimitry Andric MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
60945ffd83dbSDimitry Andric }
60955ffd83dbSDimitry Andric }
60965ffd83dbSDimitry Andric
60975ffd83dbSDimitry Andric /// Rewrite image intrinsics to use register layouts expected by the subtarget.
60985ffd83dbSDimitry Andric ///
60995ffd83dbSDimitry Andric /// Depending on the subtarget, loads/stores with 16-bit element data need to be
61005ffd83dbSDimitry Andric /// rewritten to use the low half of 32-bit registers, or directly use a packed
61015ffd83dbSDimitry Andric /// layout. 16-bit addresses should also sometimes be packed into 32-bit
61025ffd83dbSDimitry Andric /// registers.
61035ffd83dbSDimitry Andric ///
61045ffd83dbSDimitry Andric /// We don't want to directly select image instructions just yet, but also want
61055ffd83dbSDimitry Andric /// to expose all register repacking to the legalizer/combiners. We also don't
610681ad6265SDimitry Andric /// want a selected instruction entering RegBankSelect. In order to avoid
61075ffd83dbSDimitry Andric /// defining a multitude of intermediate image instructions, directly hack on
6108349cc55cSDimitry Andric /// the intrinsic's arguments. In cases like a16 addresses, this requires
6109349cc55cSDimitry Andric /// padding the now-unnecessary arguments with $noreg.
61105ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
6111e8d8bef9SDimitry Andric MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
6112e8d8bef9SDimitry Andric const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
61135ffd83dbSDimitry Andric
6114bdd1243dSDimitry Andric const MachineFunction &MF = *MI.getMF();
6115e8d8bef9SDimitry Andric const unsigned NumDefs = MI.getNumExplicitDefs();
6116e8d8bef9SDimitry Andric const unsigned ArgOffset = NumDefs + 1;
61175ffd83dbSDimitry Andric bool IsTFE = NumDefs == 2;
61185ffd83dbSDimitry Andric // We are only processing the operands of d16 image operations on subtargets
61195ffd83dbSDimitry Andric // that use the unpacked register layout, or need to repack the TFE result.
61205ffd83dbSDimitry Andric
61215ffd83dbSDimitry Andric // TODO: Do we need to guard against already legalized intrinsics?
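// (Illustrative, hedged example: with TFE the IR-level image load returns a
// two-element struct such as { <4 x float>, i32 }, so the G_INTRINSIC here
// carries two defs; the extra i32 status dword is what the TFE repacking
// below has to account for.)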
61225ffd83dbSDimitry Andric const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 6123e8d8bef9SDimitry Andric AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 61245ffd83dbSDimitry Andric 61255ffd83dbSDimitry Andric MachineRegisterInfo *MRI = B.getMRI(); 61265ffd83dbSDimitry Andric const LLT S32 = LLT::scalar(32); 61275ffd83dbSDimitry Andric const LLT S16 = LLT::scalar(16); 6128fe6060f1SDimitry Andric const LLT V2S16 = LLT::fixed_vector(2, 16); 61295ffd83dbSDimitry Andric 61305ffd83dbSDimitry Andric unsigned DMask = 0; 613104eeddc0SDimitry Andric Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg(); 613204eeddc0SDimitry Andric LLT Ty = MRI->getType(VData); 61335ffd83dbSDimitry Andric 61347a6dacacSDimitry Andric const bool IsAtomicPacked16Bit = 61357a6dacacSDimitry Andric (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 || 61367a6dacacSDimitry Andric BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16); 61377a6dacacSDimitry Andric 61385ffd83dbSDimitry Andric // Check for 16 bit addresses and pack if true. 6139e8d8bef9SDimitry Andric LLT GradTy = 6140e8d8bef9SDimitry Andric MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); 6141e8d8bef9SDimitry Andric LLT AddrTy = 6142e8d8bef9SDimitry Andric MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); 614306c3fb27SDimitry Andric const bool IsG16 = 614406c3fb27SDimitry Andric ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16; 61455ffd83dbSDimitry Andric const bool IsA16 = AddrTy == S16; 61467a6dacacSDimitry Andric const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16; 61475ffd83dbSDimitry Andric 61485ffd83dbSDimitry Andric int DMaskLanes = 0; 61495ffd83dbSDimitry Andric if (!BaseOpcode->Atomic) { 6150e8d8bef9SDimitry Andric DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 61515ffd83dbSDimitry Andric if (BaseOpcode->Gather4) { 61525ffd83dbSDimitry Andric DMaskLanes = 4; 61535ffd83dbSDimitry Andric } else if (DMask != 0) { 6154bdd1243dSDimitry Andric DMaskLanes = llvm::popcount(DMask); 61555ffd83dbSDimitry Andric } else if (!IsTFE && !BaseOpcode->Store) { 61565ffd83dbSDimitry Andric // If dmask is 0, this is a no-op load. This can be eliminated. 61575ffd83dbSDimitry Andric B.buildUndef(MI.getOperand(0)); 61585ffd83dbSDimitry Andric MI.eraseFromParent(); 61595ffd83dbSDimitry Andric return true; 61605ffd83dbSDimitry Andric } 61615ffd83dbSDimitry Andric } 61625ffd83dbSDimitry Andric 61635ffd83dbSDimitry Andric Observer.changingInstr(MI); 61645ffd83dbSDimitry Andric auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 61655ffd83dbSDimitry Andric 616604eeddc0SDimitry Andric const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16 616704eeddc0SDimitry Andric : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE; 616804eeddc0SDimitry Andric const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 616904eeddc0SDimitry Andric : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 617004eeddc0SDimitry Andric unsigned NewOpcode = NumDefs == 0 ? 
StoreOpcode : LoadOpcode;
61715ffd83dbSDimitry Andric
61725ffd83dbSDimitry Andric // Track that we legalized this
61735ffd83dbSDimitry Andric MI.setDesc(B.getTII().get(NewOpcode));
61745ffd83dbSDimitry Andric
61755ffd83dbSDimitry Andric // Expecting to get an error flag since TFC is on - and dmask is 0. Force
61765ffd83dbSDimitry Andric // dmask to be at least 1, otherwise the instruction will fail.
61775ffd83dbSDimitry Andric if (IsTFE && DMask == 0) {
61785ffd83dbSDimitry Andric DMask = 0x1;
61795ffd83dbSDimitry Andric DMaskLanes = 1;
6180e8d8bef9SDimitry Andric MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
61815ffd83dbSDimitry Andric }
61825ffd83dbSDimitry Andric
61835ffd83dbSDimitry Andric if (BaseOpcode->Atomic) {
61845ffd83dbSDimitry Andric Register VData0 = MI.getOperand(2).getReg();
61855ffd83dbSDimitry Andric LLT Ty = MRI->getType(VData0);
61865ffd83dbSDimitry Andric
61875ffd83dbSDimitry Andric // TODO: Allow atomic swap and bit ops for v2s16/v4s16
61887a6dacacSDimitry Andric if (Ty.isVector() && !IsAtomicPacked16Bit)
61895ffd83dbSDimitry Andric return false;
61905ffd83dbSDimitry Andric
61915ffd83dbSDimitry Andric if (BaseOpcode->AtomicX2) {
61925ffd83dbSDimitry Andric Register VData1 = MI.getOperand(3).getReg();
61935ffd83dbSDimitry Andric // The two values are packed in one register.
6194fe6060f1SDimitry Andric LLT PackedTy = LLT::fixed_vector(2, Ty);
61955ffd83dbSDimitry Andric auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
61965ffd83dbSDimitry Andric MI.getOperand(2).setReg(Concat.getReg(0));
61975ffd83dbSDimitry Andric MI.getOperand(3).setReg(AMDGPU::NoRegister);
61985ffd83dbSDimitry Andric }
61995ffd83dbSDimitry Andric }
62005ffd83dbSDimitry Andric
6201e8d8bef9SDimitry Andric unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
62025ffd83dbSDimitry Andric
62035ffd83dbSDimitry Andric // Rewrite the addressing register layout before doing anything else.
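// (A hedged sketch of the A16 packing done below, assuming two s16
// coordinates %u and %v: the pair is packed into one dword,
//   %uv:_(<2 x s16>) = G_BUILD_VECTOR %u:_(s16), %v:_(s16)
// so a single 32-bit VGPR carries both coordinates; see
// packImage16bitOpsToDwords.)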
6204fe6060f1SDimitry Andric if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) { 6205fe6060f1SDimitry Andric // 16 bit gradients are supported, but are tied to the A16 control 6206fe6060f1SDimitry Andric // so both gradients and addresses must be 16 bit 62075ffd83dbSDimitry Andric return false; 6208fe6060f1SDimitry Andric } 62095ffd83dbSDimitry Andric 6210fe6060f1SDimitry Andric if (IsA16 && !ST.hasA16()) { 6211fe6060f1SDimitry Andric // A16 not supported 6212fe6060f1SDimitry Andric return false; 6213fe6060f1SDimitry Andric } 6214fe6060f1SDimitry Andric 62155f757f3fSDimitry Andric const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->Sampler); 621606c3fb27SDimitry Andric const unsigned HasPartialNSA = ST.hasPartialNSAEncoding(); 621706c3fb27SDimitry Andric 6218fe6060f1SDimitry Andric if (IsA16 || IsG16) { 62195f757f3fSDimitry Andric // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the 62205f757f3fSDimitry Andric // instructions expect VGPR_32 62215ffd83dbSDimitry Andric SmallVector<Register, 4> PackedRegs; 62225ffd83dbSDimitry Andric 62235f757f3fSDimitry Andric packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, IsG16); 62245ffd83dbSDimitry Andric 62255ffd83dbSDimitry Andric // See also below in the non-a16 branch 6226bdd1243dSDimitry Andric const bool UseNSA = ST.hasNSAEncoding() && 6227bdd1243dSDimitry Andric PackedRegs.size() >= ST.getNSAThreshold(MF) && 622806c3fb27SDimitry Andric (PackedRegs.size() <= NSAMaxSize || HasPartialNSA); 622906c3fb27SDimitry Andric const bool UsePartialNSA = 623006c3fb27SDimitry Andric UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize; 62315ffd83dbSDimitry Andric 623206c3fb27SDimitry Andric if (UsePartialNSA) { 623306c3fb27SDimitry Andric // Pack registers that would go over NSAMaxSize into last VAddr register 623406c3fb27SDimitry Andric LLT PackedAddrTy = 623506c3fb27SDimitry Andric LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16); 623606c3fb27SDimitry Andric auto Concat = B.buildConcatVectors( 623706c3fb27SDimitry Andric PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1)); 623806c3fb27SDimitry Andric PackedRegs[NSAMaxSize - 1] = Concat.getReg(0); 623906c3fb27SDimitry Andric PackedRegs.resize(NSAMaxSize); 624006c3fb27SDimitry Andric } else if (!UseNSA && PackedRegs.size() > 1) { 6241fe6060f1SDimitry Andric LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16); 62425ffd83dbSDimitry Andric auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 62435ffd83dbSDimitry Andric PackedRegs[0] = Concat.getReg(0); 62445ffd83dbSDimitry Andric PackedRegs.resize(1); 62455ffd83dbSDimitry Andric } 62465ffd83dbSDimitry Andric 6247e8d8bef9SDimitry Andric const unsigned NumPacked = PackedRegs.size(); 6248e8d8bef9SDimitry Andric for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 6249e8d8bef9SDimitry Andric MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 62505ffd83dbSDimitry Andric if (!SrcOp.isReg()) { 62515ffd83dbSDimitry Andric assert(SrcOp.isImm() && SrcOp.getImm() == 0); 62525ffd83dbSDimitry Andric continue; 62535ffd83dbSDimitry Andric } 62545ffd83dbSDimitry Andric 62555ffd83dbSDimitry Andric assert(SrcOp.getReg() != AMDGPU::NoRegister); 62565ffd83dbSDimitry Andric 6257e8d8bef9SDimitry Andric if (I - Intr->VAddrStart < NumPacked) 6258e8d8bef9SDimitry Andric SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); 62595ffd83dbSDimitry Andric else 62605ffd83dbSDimitry Andric SrcOp.setReg(AMDGPU::NoRegister); 62615ffd83dbSDimitry Andric } 62625ffd83dbSDimitry Andric } 
else { 62635ffd83dbSDimitry Andric // If the register allocator cannot place the address registers contiguously 62645ffd83dbSDimitry Andric // without introducing moves, then using the non-sequential address encoding 62655ffd83dbSDimitry Andric // is always preferable, since it saves VALU instructions and is usually a 62665ffd83dbSDimitry Andric // wash in terms of code size or even better. 62675ffd83dbSDimitry Andric // 62685ffd83dbSDimitry Andric // However, we currently have no way of hinting to the register allocator 62695ffd83dbSDimitry Andric // that MIMG addresses should be placed contiguously when it is possible to 62705ffd83dbSDimitry Andric // do so, so force non-NSA for the common 2-address case as a heuristic. 62715ffd83dbSDimitry Andric // 62725ffd83dbSDimitry Andric // SIShrinkInstructions will convert NSA encodings to non-NSA after register 62735ffd83dbSDimitry Andric // allocation when possible. 627481ad6265SDimitry Andric // 62755f757f3fSDimitry Andric // Partial NSA is allowed on GFX11+ where the final register is a contiguous 627606c3fb27SDimitry Andric // set of the remaining addresses. 6277bdd1243dSDimitry Andric const bool UseNSA = ST.hasNSAEncoding() && 6278bdd1243dSDimitry Andric CorrectedNumVAddrs >= ST.getNSAThreshold(MF) && 627906c3fb27SDimitry Andric (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA); 628006c3fb27SDimitry Andric const bool UsePartialNSA = 628106c3fb27SDimitry Andric UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize; 62825ffd83dbSDimitry Andric 628306c3fb27SDimitry Andric if (UsePartialNSA) { 628406c3fb27SDimitry Andric convertImageAddrToPacked(B, MI, 628506c3fb27SDimitry Andric ArgOffset + Intr->VAddrStart + NSAMaxSize - 1, 628606c3fb27SDimitry Andric Intr->NumVAddrs - NSAMaxSize + 1); 628706c3fb27SDimitry Andric } else if (!UseNSA && Intr->NumVAddrs > 1) { 6288e8d8bef9SDimitry Andric convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, 6289e8d8bef9SDimitry Andric Intr->NumVAddrs); 62905ffd83dbSDimitry Andric } 629106c3fb27SDimitry Andric } 62925ffd83dbSDimitry Andric 62935ffd83dbSDimitry Andric int Flags = 0; 62945ffd83dbSDimitry Andric if (IsA16) 62955ffd83dbSDimitry Andric Flags |= 1; 62965ffd83dbSDimitry Andric if (IsG16) 62975ffd83dbSDimitry Andric Flags |= 2; 62985ffd83dbSDimitry Andric MI.addOperand(MachineOperand::CreateImm(Flags)); 62995ffd83dbSDimitry Andric 63005ffd83dbSDimitry Andric if (BaseOpcode->Store) { // No TFE for stores? 63015ffd83dbSDimitry Andric // TODO: Handle dmask trim 630204eeddc0SDimitry Andric if (!Ty.isVector() || !IsD16) 63035ffd83dbSDimitry Andric return true; 63045ffd83dbSDimitry Andric 6305e8d8bef9SDimitry Andric Register RepackedReg = handleD16VData(B, *MRI, VData, true); 63065ffd83dbSDimitry Andric if (RepackedReg != VData) { 63075ffd83dbSDimitry Andric MI.getOperand(1).setReg(RepackedReg); 63085ffd83dbSDimitry Andric } 63095ffd83dbSDimitry Andric 63105ffd83dbSDimitry Andric return true; 63115ffd83dbSDimitry Andric } 63125ffd83dbSDimitry Andric 63135ffd83dbSDimitry Andric Register DstReg = MI.getOperand(0).getReg(); 63145ffd83dbSDimitry Andric const LLT EltTy = Ty.getScalarType(); 63155ffd83dbSDimitry Andric const int NumElts = Ty.isVector() ? 
Ty.getNumElements() : 1;
63165ffd83dbSDimitry Andric
63175ffd83dbSDimitry Andric // Confirm that the return type is large enough for the dmask specified
63185ffd83dbSDimitry Andric if (NumElts < DMaskLanes)
63195ffd83dbSDimitry Andric return false;
63205ffd83dbSDimitry Andric
63215ffd83dbSDimitry Andric if (NumElts > 4 || DMaskLanes > 4)
63225ffd83dbSDimitry Andric return false;
63235ffd83dbSDimitry Andric
63247a6dacacSDimitry Andric // Image atomic instructions use DMask to specify how many bits the
63257a6dacacSDimitry Andric // input/output data will have: 32 bits (s32, v2s16) or 64 bits (s64, v4s16).
63267a6dacacSDimitry Andric // DMaskLanes for an image atomic defaults to '0'.
63277a6dacacSDimitry Andric // We must be sure that atomic variants (especially packed) will not be
63287a6dacacSDimitry Andric // truncated from v2s16 or v4s16 to s16 type.
63297a6dacacSDimitry Andric //
63307a6dacacSDimitry Andric // ChangeElementCount will be needed for image load where Ty is always scalar.
63315ffd83dbSDimitry Andric const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
6332fe6060f1SDimitry Andric const LLT AdjustedTy =
63337a6dacacSDimitry Andric DMaskLanes == 0
63347a6dacacSDimitry Andric ? Ty
63357a6dacacSDimitry Andric : Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
63365ffd83dbSDimitry Andric
63375ffd83dbSDimitry Andric // The raw dword aligned data component of the load. The only legal cases
63385ffd83dbSDimitry Andric // where this matters should be when using the packed D16 format, for
63395ffd83dbSDimitry Andric // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
63405ffd83dbSDimitry Andric LLT RoundedTy;
63415ffd83dbSDimitry Andric
6342bdd1243dSDimitry Andric // S32 vector to cover all data, plus TFE result element.
63435ffd83dbSDimitry Andric LLT TFETy;
63445ffd83dbSDimitry Andric
63455ffd83dbSDimitry Andric // Register type to use for each loaded component. Will be S32 or V2S16.
63465ffd83dbSDimitry Andric LLT RegTy;
63475ffd83dbSDimitry Andric
63485ffd83dbSDimitry Andric if (IsD16 && ST.hasUnpackedD16VMem()) {
6349fe6060f1SDimitry Andric RoundedTy =
6350fe6060f1SDimitry Andric LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
6351fe6060f1SDimitry Andric TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
63525ffd83dbSDimitry Andric RegTy = S32;
63535ffd83dbSDimitry Andric } else {
63545ffd83dbSDimitry Andric unsigned EltSize = EltTy.getSizeInBits();
63555ffd83dbSDimitry Andric unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
63565ffd83dbSDimitry Andric unsigned RoundedSize = 32 * RoundedElts;
6357fe6060f1SDimitry Andric RoundedTy = LLT::scalarOrVector(
6358fe6060f1SDimitry Andric ElementCount::getFixed(RoundedSize / EltSize), EltSize);
6359fe6060f1SDimitry Andric TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
63605ffd83dbSDimitry Andric RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
63615ffd83dbSDimitry Andric }
63625ffd83dbSDimitry Andric
63635ffd83dbSDimitry Andric // The return type does not need adjustment.
63645ffd83dbSDimitry Andric // TODO: Should we change s16 case to s32 or <2 x s16>?
63655ffd83dbSDimitry Andric if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
63665ffd83dbSDimitry Andric return true;
63675ffd83dbSDimitry Andric
63685ffd83dbSDimitry Andric Register Dst1Reg;
63695ffd83dbSDimitry Andric
63705ffd83dbSDimitry Andric // Insert after the instruction.
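// (The repack instructions built below read NewResultReg, which the rewritten
// MI defines, so the insert point must follow MI rather than precede it.)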
63715ffd83dbSDimitry Andric B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 63725ffd83dbSDimitry Andric 63735ffd83dbSDimitry Andric // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 63745ffd83dbSDimitry Andric // s16> instead of s32, we would only need 1 bitcast instead of multiple. 63755ffd83dbSDimitry Andric const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; 63765ffd83dbSDimitry Andric const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; 63775ffd83dbSDimitry Andric 63785ffd83dbSDimitry Andric Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); 63795ffd83dbSDimitry Andric 63805ffd83dbSDimitry Andric MI.getOperand(0).setReg(NewResultReg); 63815ffd83dbSDimitry Andric 63825ffd83dbSDimitry Andric // In the IR, TFE is supposed to be used with a 2 element struct return 6383349cc55cSDimitry Andric // type. The instruction really returns these two values in one contiguous 63845ffd83dbSDimitry Andric // register, with one additional dword beyond the loaded data. Rewrite the 63855ffd83dbSDimitry Andric // return type to use a single register result. 63865ffd83dbSDimitry Andric 63875ffd83dbSDimitry Andric if (IsTFE) { 63885ffd83dbSDimitry Andric Dst1Reg = MI.getOperand(1).getReg(); 63895ffd83dbSDimitry Andric if (MRI->getType(Dst1Reg) != S32) 63905ffd83dbSDimitry Andric return false; 63915ffd83dbSDimitry Andric 63925ffd83dbSDimitry Andric // TODO: Make sure the TFE operand bit is set. 639381ad6265SDimitry Andric MI.removeOperand(1); 63945ffd83dbSDimitry Andric 63955ffd83dbSDimitry Andric // Handle the easy case that requires no repack instructions. 63965ffd83dbSDimitry Andric if (Ty == S32) { 63975ffd83dbSDimitry Andric B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 63985ffd83dbSDimitry Andric return true; 63995ffd83dbSDimitry Andric } 64005ffd83dbSDimitry Andric } 64015ffd83dbSDimitry Andric 64025ffd83dbSDimitry Andric // Now figure out how to copy the new result register back into the old 64035ffd83dbSDimitry Andric // result. 64045ffd83dbSDimitry Andric SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 64055ffd83dbSDimitry Andric 64065ffd83dbSDimitry Andric const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; 64075ffd83dbSDimitry Andric 64085ffd83dbSDimitry Andric if (ResultNumRegs == 1) { 64095ffd83dbSDimitry Andric assert(!IsTFE); 64105ffd83dbSDimitry Andric ResultRegs[0] = NewResultReg; 64115ffd83dbSDimitry Andric } else { 64125ffd83dbSDimitry Andric // We have to repack into a new vector of some kind. 64135ffd83dbSDimitry Andric for (int I = 0; I != NumDataRegs; ++I) 64145ffd83dbSDimitry Andric ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 64155ffd83dbSDimitry Andric B.buildUnmerge(ResultRegs, NewResultReg); 64165ffd83dbSDimitry Andric 64175ffd83dbSDimitry Andric // Drop the final TFE element to get the data part. The TFE result is 64185ffd83dbSDimitry Andric // directly written to the right place already. 64195ffd83dbSDimitry Andric if (IsTFE) 64205ffd83dbSDimitry Andric ResultRegs.resize(NumDataRegs); 64215ffd83dbSDimitry Andric } 64225ffd83dbSDimitry Andric 64235ffd83dbSDimitry Andric // For an s16 scalar result, we form an s32 result with a truncate regardless 64245ffd83dbSDimitry Andric // of packed vs. unpacked. 
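// (Sketch: %dst:_(s16) = G_TRUNC %r0:_(s32) works in both layouts, whether
// the half lives in the low bits of an unpacked s32 or as element 0 of a
// packed <2 x s16> dword.)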
64255ffd83dbSDimitry Andric if (IsD16 && !Ty.isVector()) {
64265ffd83dbSDimitry Andric B.buildTrunc(DstReg, ResultRegs[0]);
64275ffd83dbSDimitry Andric return true;
64285ffd83dbSDimitry Andric }
64295ffd83dbSDimitry Andric
64305ffd83dbSDimitry Andric // Avoid a build/concat_vector of 1 entry.
64315ffd83dbSDimitry Andric if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
64325ffd83dbSDimitry Andric B.buildBitcast(DstReg, ResultRegs[0]);
64335ffd83dbSDimitry Andric return true;
64345ffd83dbSDimitry Andric }
64355ffd83dbSDimitry Andric
64365ffd83dbSDimitry Andric assert(Ty.isVector());
64375ffd83dbSDimitry Andric
64385ffd83dbSDimitry Andric if (IsD16) {
64395ffd83dbSDimitry Andric // For packed D16 results with TFE enabled, all the data components are
64405ffd83dbSDimitry Andric // S32. Cast back to the expected type.
64415ffd83dbSDimitry Andric //
64425ffd83dbSDimitry Andric // TODO: We don't really need to load s32 elements. We would only need one
64435ffd83dbSDimitry Andric // cast for the TFE result if a multiple of v2s16 was used.
64445ffd83dbSDimitry Andric if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
64455ffd83dbSDimitry Andric for (Register &Reg : ResultRegs)
64465ffd83dbSDimitry Andric Reg = B.buildBitcast(V2S16, Reg).getReg(0);
64475ffd83dbSDimitry Andric } else if (ST.hasUnpackedD16VMem()) {
64485ffd83dbSDimitry Andric for (Register &Reg : ResultRegs)
64495ffd83dbSDimitry Andric Reg = B.buildTrunc(S16, Reg).getReg(0);
64505ffd83dbSDimitry Andric }
64515ffd83dbSDimitry Andric }
64525ffd83dbSDimitry Andric
64535ffd83dbSDimitry Andric auto padWithUndef = [&](LLT Ty, int NumElts) {
64545ffd83dbSDimitry Andric if (NumElts == 0)
64555ffd83dbSDimitry Andric return;
64565ffd83dbSDimitry Andric Register Undef = B.buildUndef(Ty).getReg(0);
64575ffd83dbSDimitry Andric for (int I = 0; I != NumElts; ++I)
64585ffd83dbSDimitry Andric ResultRegs.push_back(Undef);
64595ffd83dbSDimitry Andric };
64605ffd83dbSDimitry Andric
64615ffd83dbSDimitry Andric // Pad out any elements eliminated due to the dmask.
64625ffd83dbSDimitry Andric LLT ResTy = MRI->getType(ResultRegs[0]);
64635ffd83dbSDimitry Andric if (!ResTy.isVector()) {
64645ffd83dbSDimitry Andric padWithUndef(ResTy, NumElts - ResultRegs.size());
64655ffd83dbSDimitry Andric B.buildBuildVector(DstReg, ResultRegs);
64665ffd83dbSDimitry Andric return true;
64675ffd83dbSDimitry Andric }
64685ffd83dbSDimitry Andric
64695ffd83dbSDimitry Andric assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
64705ffd83dbSDimitry Andric const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
64715ffd83dbSDimitry Andric
64725ffd83dbSDimitry Andric // Deal with the one annoying legal case.
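// (A hedged sketch of the v3s16 handling below: with TFE, two v2s16 data
// pieces are first concatenated into a v4s16, which is then trimmed with
// buildDeleteTrailingVectorElements or undef-padded to match DstReg.)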
6473fe6060f1SDimitry Andric const LLT V3S16 = LLT::fixed_vector(3, 16); 64745ffd83dbSDimitry Andric if (Ty == V3S16) { 64750eae32dcSDimitry Andric if (IsTFE) { 64760eae32dcSDimitry Andric if (ResultRegs.size() == 1) { 64770eae32dcSDimitry Andric NewResultReg = ResultRegs[0]; 64780eae32dcSDimitry Andric } else if (ResultRegs.size() == 2) { 64790eae32dcSDimitry Andric LLT V4S16 = LLT::fixed_vector(4, 16); 64800eae32dcSDimitry Andric NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0); 64810eae32dcSDimitry Andric } else { 64820eae32dcSDimitry Andric return false; 64830eae32dcSDimitry Andric } 64840eae32dcSDimitry Andric } 64850eae32dcSDimitry Andric 64860eae32dcSDimitry Andric if (MRI->getType(DstReg).getNumElements() < 64870eae32dcSDimitry Andric MRI->getType(NewResultReg).getNumElements()) { 64880eae32dcSDimitry Andric B.buildDeleteTrailingVectorElements(DstReg, NewResultReg); 64890eae32dcSDimitry Andric } else { 64900eae32dcSDimitry Andric B.buildPadVectorWithUndefElements(DstReg, NewResultReg); 64910eae32dcSDimitry Andric } 64925ffd83dbSDimitry Andric return true; 64935ffd83dbSDimitry Andric } 64945ffd83dbSDimitry Andric 64955ffd83dbSDimitry Andric padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 64965ffd83dbSDimitry Andric B.buildConcatVectors(DstReg, ResultRegs); 64975ffd83dbSDimitry Andric return true; 64985ffd83dbSDimitry Andric } 64995ffd83dbSDimitry Andric 65007a6dacacSDimitry Andric bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper, 65017a6dacacSDimitry Andric MachineInstr &MI) const { 6502e8d8bef9SDimitry Andric MachineIRBuilder &B = Helper.MIRBuilder; 6503e8d8bef9SDimitry Andric GISelChangeObserver &Observer = Helper.Observer; 6504e8d8bef9SDimitry Andric 65057a6dacacSDimitry Andric Register OrigDst = MI.getOperand(0).getReg(); 65067a6dacacSDimitry Andric Register Dst; 65077a6dacacSDimitry Andric LLT Ty = B.getMRI()->getType(OrigDst); 65085ffd83dbSDimitry Andric unsigned Size = Ty.getSizeInBits(); 65095ffd83dbSDimitry Andric MachineFunction &MF = B.getMF(); 65107a6dacacSDimitry Andric unsigned Opc = 0; 65117a6dacacSDimitry Andric if (Size < 32 && ST.hasScalarSubwordLoads()) { 65127a6dacacSDimitry Andric assert(Size == 8 || Size == 16); 65137a6dacacSDimitry Andric Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE 65147a6dacacSDimitry Andric : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT; 65157a6dacacSDimitry Andric // The 8-bit and 16-bit scalar buffer load instructions have 32-bit 65167a6dacacSDimitry Andric // destination register. 65177a6dacacSDimitry Andric Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32)); 65187a6dacacSDimitry Andric } else { 65197a6dacacSDimitry Andric Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD; 65207a6dacacSDimitry Andric Dst = OrigDst; 65217a6dacacSDimitry Andric } 65225ffd83dbSDimitry Andric 65235ffd83dbSDimitry Andric Observer.changingInstr(MI); 65245ffd83dbSDimitry Andric 652506c3fb27SDimitry Andric // Handle needing to s.buffer.load() a p8 value. 
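// (Sketch: a p8 buffer-resource result has no machine register mapping of its
// own here, so the load is rewritten to produce the underlying <4 x s32>
// words, and castBufferRsrcFromV4I32 reinterprets those words as the rsrc
// value.)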
652606c3fb27SDimitry Andric if (hasBufferRsrcWorkaround(Ty)) { 652706c3fb27SDimitry Andric Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0); 652806c3fb27SDimitry Andric B.setInsertPt(B.getMBB(), MI); 652906c3fb27SDimitry Andric } 6530fe6060f1SDimitry Andric if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) { 6531e8d8bef9SDimitry Andric Ty = getBitcastRegisterType(Ty); 6532e8d8bef9SDimitry Andric Helper.bitcastDst(MI, Ty, 0); 6533e8d8bef9SDimitry Andric B.setInsertPt(B.getMBB(), MI); 6534e8d8bef9SDimitry Andric } 6535e8d8bef9SDimitry Andric 65365ffd83dbSDimitry Andric // FIXME: We don't really need this intermediate instruction. The intrinsic 65375ffd83dbSDimitry Andric // should be fixed to have a memory operand. Since it's readnone, we're not 65385ffd83dbSDimitry Andric // allowed to add one. 65397a6dacacSDimitry Andric MI.setDesc(B.getTII().get(Opc)); 654081ad6265SDimitry Andric MI.removeOperand(1); // Remove intrinsic ID 65415ffd83dbSDimitry Andric 65425ffd83dbSDimitry Andric // FIXME: When intrinsic definition is fixed, this should have an MMO already. 65435ffd83dbSDimitry Andric // TODO: Should this use datalayout alignment? 65445ffd83dbSDimitry Andric const unsigned MemSize = (Size + 7) / 8; 65457a6dacacSDimitry Andric const Align MemAlign(std::min(MemSize, 4u)); 65465ffd83dbSDimitry Andric MachineMemOperand *MMO = MF.getMachineMemOperand( 65475ffd83dbSDimitry Andric MachinePointerInfo(), 65485ffd83dbSDimitry Andric MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 65495ffd83dbSDimitry Andric MachineMemOperand::MOInvariant, 65505ffd83dbSDimitry Andric MemSize, MemAlign); 65515ffd83dbSDimitry Andric MI.addMemOperand(MF, MMO); 65527a6dacacSDimitry Andric if (Dst != OrigDst) { 65537a6dacacSDimitry Andric MI.getOperand(0).setReg(Dst); 65547a6dacacSDimitry Andric B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 65557a6dacacSDimitry Andric B.buildTrunc(OrigDst, Dst); 65567a6dacacSDimitry Andric } 65575ffd83dbSDimitry Andric 65585f757f3fSDimitry Andric // If we don't have 96-bit result scalar loads, widening to 128-bit should 65595ffd83dbSDimitry Andric // always be legal. We may need to restore this to a 96-bit result if it turns 65605ffd83dbSDimitry Andric // out this needs to be converted to a vector load during RegBankSelect. 65615f757f3fSDimitry Andric if (!isPowerOf2_32(Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) { 65625ffd83dbSDimitry Andric if (Ty.isVector()) 65635ffd83dbSDimitry Andric Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 65645ffd83dbSDimitry Andric else 65655ffd83dbSDimitry Andric Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 65665ffd83dbSDimitry Andric } 65675ffd83dbSDimitry Andric 65685ffd83dbSDimitry Andric Observer.changedInstr(MI); 65695ffd83dbSDimitry Andric return true; 65705ffd83dbSDimitry Andric } 65715ffd83dbSDimitry Andric 6572e8d8bef9SDimitry Andric // TODO: Move to selection 65735ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 65740b57cec5SDimitry Andric MachineRegisterInfo &MRI, 65750b57cec5SDimitry Andric MachineIRBuilder &B) const { 6576fe6060f1SDimitry Andric if (!ST.isTrapHandlerEnabled() || 6577fe6060f1SDimitry Andric ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) 6578fe6060f1SDimitry Andric return legalizeTrapEndpgm(MI, MRI, B); 6579fe6060f1SDimitry Andric 658006c3fb27SDimitry Andric return ST.supportsGetDoorbellID() ? 
658106c3fb27SDimitry Andric legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B); 6582fe6060f1SDimitry Andric } 6583fe6060f1SDimitry Andric 6584fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeTrapEndpgm( 6585fe6060f1SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 658606c3fb27SDimitry Andric const DebugLoc &DL = MI.getDebugLoc(); 658706c3fb27SDimitry Andric MachineBasicBlock &BB = B.getMBB(); 658806c3fb27SDimitry Andric MachineFunction *MF = BB.getParent(); 658906c3fb27SDimitry Andric 659006c3fb27SDimitry Andric if (BB.succ_empty() && std::next(MI.getIterator()) == BB.end()) { 659106c3fb27SDimitry Andric BuildMI(BB, BB.end(), DL, B.getTII().get(AMDGPU::S_ENDPGM)) 659206c3fb27SDimitry Andric .addImm(0); 659306c3fb27SDimitry Andric MI.eraseFromParent(); 659406c3fb27SDimitry Andric return true; 659506c3fb27SDimitry Andric } 659606c3fb27SDimitry Andric 659706c3fb27SDimitry Andric // We need a block split to make the real endpgm a terminator. We also don't 659806c3fb27SDimitry Andric // want to break phis in successor blocks, so we can't just delete to the 659906c3fb27SDimitry Andric // end of the block. 660006c3fb27SDimitry Andric BB.splitAt(MI, false /*UpdateLiveIns*/); 660106c3fb27SDimitry Andric MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); 660206c3fb27SDimitry Andric MF->push_back(TrapBB); 660306c3fb27SDimitry Andric BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM)) 660406c3fb27SDimitry Andric .addImm(0); 660506c3fb27SDimitry Andric BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ)) 660606c3fb27SDimitry Andric .addMBB(TrapBB); 660706c3fb27SDimitry Andric 660806c3fb27SDimitry Andric BB.addSuccessor(TrapBB); 6609fe6060f1SDimitry Andric MI.eraseFromParent(); 6610fe6060f1SDimitry Andric return true; 6611fe6060f1SDimitry Andric } 6612fe6060f1SDimitry Andric 6613fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( 6614fe6060f1SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 661581ad6265SDimitry Andric MachineFunction &MF = B.getMF(); 661681ad6265SDimitry Andric const LLT S64 = LLT::scalar(64); 661781ad6265SDimitry Andric 661881ad6265SDimitry Andric Register SGPR01(AMDGPU::SGPR0_SGPR1); 661981ad6265SDimitry Andric // For code object version 5, queue_ptr is passed through implicit kernarg. 66207a6dacacSDimitry Andric if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >= 662106c3fb27SDimitry Andric AMDGPU::AMDHSA_COV5) { 662281ad6265SDimitry Andric AMDGPUTargetLowering::ImplicitParameter Param = 662381ad6265SDimitry Andric AMDGPUTargetLowering::QUEUE_PTR; 662481ad6265SDimitry Andric uint64_t Offset = 662581ad6265SDimitry Andric ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param); 662681ad6265SDimitry Andric 662781ad6265SDimitry Andric Register KernargPtrReg = MRI.createGenericVirtualRegister( 662881ad6265SDimitry Andric LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 662981ad6265SDimitry Andric 663081ad6265SDimitry Andric if (!loadInputValue(KernargPtrReg, B, 663181ad6265SDimitry Andric AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 663281ad6265SDimitry Andric return false; 663381ad6265SDimitry Andric 663481ad6265SDimitry Andric // TODO: can we be smarter about machine pointer info? 
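// (A hedged sketch of what is built below:
//   %addr:_(p4) = G_PTR_ADD %kernarg_segment_ptr, Offset
//   %queue_ptr:_(s64) = G_LOAD %addr ; dereferenceable, invariant
// after which the value is copied into SGPR0_SGPR1 for the trap handler.)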
663581ad6265SDimitry Andric MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 663681ad6265SDimitry Andric MachineMemOperand *MMO = MF.getMachineMemOperand( 663781ad6265SDimitry Andric PtrInfo, 663881ad6265SDimitry Andric MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 663981ad6265SDimitry Andric MachineMemOperand::MOInvariant, 664081ad6265SDimitry Andric LLT::scalar(64), commonAlignment(Align(64), Offset)); 664181ad6265SDimitry Andric 664281ad6265SDimitry Andric // Pointer address 664381ad6265SDimitry Andric Register LoadAddr = MRI.createGenericVirtualRegister( 664481ad6265SDimitry Andric LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 664581ad6265SDimitry Andric B.buildPtrAdd(LoadAddr, KernargPtrReg, 664681ad6265SDimitry Andric B.buildConstant(LLT::scalar(64), Offset).getReg(0)); 664781ad6265SDimitry Andric // Load address 664881ad6265SDimitry Andric Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0); 664981ad6265SDimitry Andric B.buildCopy(SGPR01, Temp); 665081ad6265SDimitry Andric B.buildInstr(AMDGPU::S_TRAP) 665181ad6265SDimitry Andric .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 665281ad6265SDimitry Andric .addReg(SGPR01, RegState::Implicit); 665381ad6265SDimitry Andric MI.eraseFromParent(); 665481ad6265SDimitry Andric return true; 665581ad6265SDimitry Andric } 665681ad6265SDimitry Andric 66575ffd83dbSDimitry Andric // Pass queue pointer to trap handler as input, and insert trap instruction 66585ffd83dbSDimitry Andric // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 6659e8d8bef9SDimitry Andric Register LiveIn = 6660e8d8bef9SDimitry Andric MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 6661e8d8bef9SDimitry Andric if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 66625ffd83dbSDimitry Andric return false; 6663e8d8bef9SDimitry Andric 66645ffd83dbSDimitry Andric B.buildCopy(SGPR01, LiveIn); 66655ffd83dbSDimitry Andric B.buildInstr(AMDGPU::S_TRAP) 6666fe6060f1SDimitry Andric .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 66675ffd83dbSDimitry Andric .addReg(SGPR01, RegState::Implicit); 6668fe6060f1SDimitry Andric 6669fe6060f1SDimitry Andric MI.eraseFromParent(); 6670fe6060f1SDimitry Andric return true; 66715ffd83dbSDimitry Andric } 66725ffd83dbSDimitry Andric 6673fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeTrapHsa( 6674fe6060f1SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 6675fe6060f1SDimitry Andric B.buildInstr(AMDGPU::S_TRAP) 6676fe6060f1SDimitry Andric .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); 66775ffd83dbSDimitry Andric MI.eraseFromParent(); 66785ffd83dbSDimitry Andric return true; 66795ffd83dbSDimitry Andric } 66805ffd83dbSDimitry Andric 66815ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 66825ffd83dbSDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 6683349cc55cSDimitry Andric // Is non-HSA path or trap-handler disabled? 
Then, report a warning 66845ffd83dbSDimitry Andric // accordingly 6685fe6060f1SDimitry Andric if (!ST.isTrapHandlerEnabled() || 6686fe6060f1SDimitry Andric ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { 66875ffd83dbSDimitry Andric DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 66885ffd83dbSDimitry Andric "debugtrap handler not supported", 66895ffd83dbSDimitry Andric MI.getDebugLoc(), DS_Warning); 66905ffd83dbSDimitry Andric LLVMContext &Ctx = B.getMF().getFunction().getContext(); 66915ffd83dbSDimitry Andric Ctx.diagnose(NoTrap); 66925ffd83dbSDimitry Andric } else { 66935ffd83dbSDimitry Andric // Insert debug-trap instruction 6694fe6060f1SDimitry Andric B.buildInstr(AMDGPU::S_TRAP) 6695fe6060f1SDimitry Andric .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); 66965ffd83dbSDimitry Andric } 66975ffd83dbSDimitry Andric 66985ffd83dbSDimitry Andric MI.eraseFromParent(); 66995ffd83dbSDimitry Andric return true; 67005ffd83dbSDimitry Andric } 67015ffd83dbSDimitry Andric 6702e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, 6703e8d8bef9SDimitry Andric MachineIRBuilder &B) const { 6704e8d8bef9SDimitry Andric MachineRegisterInfo &MRI = *B.getMRI(); 6705e8d8bef9SDimitry Andric const LLT S16 = LLT::scalar(16); 6706e8d8bef9SDimitry Andric const LLT S32 = LLT::scalar(32); 670781ad6265SDimitry Andric const LLT V2S16 = LLT::fixed_vector(2, 16); 670881ad6265SDimitry Andric const LLT V3S32 = LLT::fixed_vector(3, 32); 6709e8d8bef9SDimitry Andric 6710e8d8bef9SDimitry Andric Register DstReg = MI.getOperand(0).getReg(); 6711e8d8bef9SDimitry Andric Register NodePtr = MI.getOperand(2).getReg(); 6712e8d8bef9SDimitry Andric Register RayExtent = MI.getOperand(3).getReg(); 6713e8d8bef9SDimitry Andric Register RayOrigin = MI.getOperand(4).getReg(); 6714e8d8bef9SDimitry Andric Register RayDir = MI.getOperand(5).getReg(); 6715e8d8bef9SDimitry Andric Register RayInvDir = MI.getOperand(6).getReg(); 6716e8d8bef9SDimitry Andric Register TDescr = MI.getOperand(7).getReg(); 6717e8d8bef9SDimitry Andric 6718fe6060f1SDimitry Andric if (!ST.hasGFX10_AEncoding()) { 6719fe6060f1SDimitry Andric DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(), 6720fe6060f1SDimitry Andric "intrinsic not supported on subtarget", 6721fe6060f1SDimitry Andric MI.getDebugLoc()); 6722fe6060f1SDimitry Andric B.getMF().getFunction().getContext().diagnose(BadIntrin); 6723fe6060f1SDimitry Andric return false; 6724fe6060f1SDimitry Andric } 6725fe6060f1SDimitry Andric 67265f757f3fSDimitry Andric const bool IsGFX11 = AMDGPU::isGFX11(ST); 672781ad6265SDimitry Andric const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST); 67285f757f3fSDimitry Andric const bool IsGFX12Plus = AMDGPU::isGFX12Plus(ST); 6729349cc55cSDimitry Andric const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; 6730349cc55cSDimitry Andric const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; 6731349cc55cSDimitry Andric const unsigned NumVDataDwords = 4; 6732349cc55cSDimitry Andric const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11); 673381ad6265SDimitry Andric const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 
4 : 5) : NumVAddrDwords; 67345f757f3fSDimitry Andric const bool UseNSA = 67355f757f3fSDimitry Andric IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize()); 67365f757f3fSDimitry Andric 6737349cc55cSDimitry Andric const unsigned BaseOpcodes[2][2] = { 6738349cc55cSDimitry Andric {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16}, 6739349cc55cSDimitry Andric {AMDGPU::IMAGE_BVH64_INTERSECT_RAY, 6740349cc55cSDimitry Andric AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}}; 6741349cc55cSDimitry Andric int Opcode; 6742349cc55cSDimitry Andric if (UseNSA) { 674381ad6265SDimitry Andric Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], 67445f757f3fSDimitry Andric IsGFX12Plus ? AMDGPU::MIMGEncGfx12 67455f757f3fSDimitry Andric : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA 674681ad6265SDimitry Andric : AMDGPU::MIMGEncGfx10NSA, 6747349cc55cSDimitry Andric NumVDataDwords, NumVAddrDwords); 6748349cc55cSDimitry Andric } else { 67495f757f3fSDimitry Andric assert(!IsGFX12Plus); 67505f757f3fSDimitry Andric Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], 67515f757f3fSDimitry Andric IsGFX11 ? AMDGPU::MIMGEncGfx11Default 67525f757f3fSDimitry Andric : AMDGPU::MIMGEncGfx10Default, 6753bdd1243dSDimitry Andric NumVDataDwords, NumVAddrDwords); 6754349cc55cSDimitry Andric } 6755349cc55cSDimitry Andric assert(Opcode != -1); 6756e8d8bef9SDimitry Andric 6757e8d8bef9SDimitry Andric SmallVector<Register, 12> Ops; 675881ad6265SDimitry Andric if (UseNSA && IsGFX11Plus) { 675981ad6265SDimitry Andric auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) { 676081ad6265SDimitry Andric auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src); 6761bdd1243dSDimitry Andric auto Merged = B.buildMergeLikeInstr( 676281ad6265SDimitry Andric V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)}); 676381ad6265SDimitry Andric Ops.push_back(Merged.getReg(0)); 676481ad6265SDimitry Andric }; 676581ad6265SDimitry Andric 676681ad6265SDimitry Andric Ops.push_back(NodePtr); 676781ad6265SDimitry Andric Ops.push_back(RayExtent); 676881ad6265SDimitry Andric packLanes(RayOrigin); 676981ad6265SDimitry Andric 677081ad6265SDimitry Andric if (IsA16) { 677181ad6265SDimitry Andric auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir); 677281ad6265SDimitry Andric auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir); 6773bdd1243dSDimitry Andric auto MergedDir = B.buildMergeLikeInstr( 677481ad6265SDimitry Andric V3S32, 6775bdd1243dSDimitry Andric {B.buildBitcast( 6776bdd1243dSDimitry Andric S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0), 677781ad6265SDimitry Andric UnmergeRayDir.getReg(0)})) 677881ad6265SDimitry Andric .getReg(0), 6779bdd1243dSDimitry Andric B.buildBitcast( 6780bdd1243dSDimitry Andric S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1), 678181ad6265SDimitry Andric UnmergeRayDir.getReg(1)})) 678281ad6265SDimitry Andric .getReg(0), 6783bdd1243dSDimitry Andric B.buildBitcast( 6784bdd1243dSDimitry Andric S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2), 678581ad6265SDimitry Andric UnmergeRayDir.getReg(2)})) 678681ad6265SDimitry Andric .getReg(0)}); 678781ad6265SDimitry Andric Ops.push_back(MergedDir.getReg(0)); 678881ad6265SDimitry Andric } else { 678981ad6265SDimitry Andric packLanes(RayDir); 679081ad6265SDimitry Andric packLanes(RayInvDir); 679181ad6265SDimitry Andric } 679281ad6265SDimitry Andric } else { 6793e8d8bef9SDimitry Andric if (Is64) { 6794e8d8bef9SDimitry Andric auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 6795e8d8bef9SDimitry 
Andric Ops.push_back(Unmerge.getReg(0)); 6796e8d8bef9SDimitry Andric Ops.push_back(Unmerge.getReg(1)); 6797e8d8bef9SDimitry Andric } else { 6798e8d8bef9SDimitry Andric Ops.push_back(NodePtr); 6799e8d8bef9SDimitry Andric } 6800e8d8bef9SDimitry Andric Ops.push_back(RayExtent); 6801e8d8bef9SDimitry Andric 6802e8d8bef9SDimitry Andric auto packLanes = [&Ops, &S32, &B](Register Src) { 68030eae32dcSDimitry Andric auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src); 6804e8d8bef9SDimitry Andric Ops.push_back(Unmerge.getReg(0)); 6805e8d8bef9SDimitry Andric Ops.push_back(Unmerge.getReg(1)); 6806e8d8bef9SDimitry Andric Ops.push_back(Unmerge.getReg(2)); 6807e8d8bef9SDimitry Andric }; 6808e8d8bef9SDimitry Andric 6809e8d8bef9SDimitry Andric packLanes(RayOrigin); 6810e8d8bef9SDimitry Andric if (IsA16) { 68110eae32dcSDimitry Andric auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir); 68120eae32dcSDimitry Andric auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir); 6813e8d8bef9SDimitry Andric Register R1 = MRI.createGenericVirtualRegister(S32); 6814e8d8bef9SDimitry Andric Register R2 = MRI.createGenericVirtualRegister(S32); 6815e8d8bef9SDimitry Andric Register R3 = MRI.createGenericVirtualRegister(S32); 6816bdd1243dSDimitry Andric B.buildMergeLikeInstr(R1, 6817bdd1243dSDimitry Andric {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 6818bdd1243dSDimitry Andric B.buildMergeLikeInstr( 6819bdd1243dSDimitry Andric R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 6820bdd1243dSDimitry Andric B.buildMergeLikeInstr( 6821bdd1243dSDimitry Andric R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 6822e8d8bef9SDimitry Andric Ops.push_back(R1); 6823e8d8bef9SDimitry Andric Ops.push_back(R2); 6824e8d8bef9SDimitry Andric Ops.push_back(R3); 6825e8d8bef9SDimitry Andric } else { 6826e8d8bef9SDimitry Andric packLanes(RayDir); 6827e8d8bef9SDimitry Andric packLanes(RayInvDir); 6828e8d8bef9SDimitry Andric } 682981ad6265SDimitry Andric } 6830e8d8bef9SDimitry Andric 6831349cc55cSDimitry Andric if (!UseNSA) { 6832349cc55cSDimitry Andric // Build a single vector containing all the operands so far prepared. 6833349cc55cSDimitry Andric LLT OpTy = LLT::fixed_vector(Ops.size(), 32); 6834bdd1243dSDimitry Andric Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0); 6835349cc55cSDimitry Andric Ops.clear(); 6836349cc55cSDimitry Andric Ops.push_back(MergedOps); 6837349cc55cSDimitry Andric } 6838349cc55cSDimitry Andric 6839e8d8bef9SDimitry Andric auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) 6840e8d8bef9SDimitry Andric .addDef(DstReg) 6841e8d8bef9SDimitry Andric .addImm(Opcode); 6842e8d8bef9SDimitry Andric 6843e8d8bef9SDimitry Andric for (Register R : Ops) { 6844e8d8bef9SDimitry Andric MIB.addUse(R); 6845e8d8bef9SDimitry Andric } 6846e8d8bef9SDimitry Andric 6847e8d8bef9SDimitry Andric MIB.addUse(TDescr) 6848e8d8bef9SDimitry Andric .addImm(IsA16 ? 
1 : 0) 6849e8d8bef9SDimitry Andric .cloneMemRefs(MI); 6850e8d8bef9SDimitry Andric 6851e8d8bef9SDimitry Andric MI.eraseFromParent(); 6852e8d8bef9SDimitry Andric return true; 6853e8d8bef9SDimitry Andric } 6854e8d8bef9SDimitry Andric 685581ad6265SDimitry Andric bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI, 685681ad6265SDimitry Andric MachineIRBuilder &B) const { 685781ad6265SDimitry Andric unsigned Opc; 685881ad6265SDimitry Andric int RoundMode = MI.getOperand(2).getImm(); 685981ad6265SDimitry Andric 686081ad6265SDimitry Andric if (RoundMode == (int)RoundingMode::TowardPositive) 686181ad6265SDimitry Andric Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD; 686281ad6265SDimitry Andric else if (RoundMode == (int)RoundingMode::TowardNegative) 686381ad6265SDimitry Andric Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD; 686481ad6265SDimitry Andric else 686581ad6265SDimitry Andric return false; 686681ad6265SDimitry Andric 686781ad6265SDimitry Andric B.buildInstr(Opc) 686881ad6265SDimitry Andric .addDef(MI.getOperand(0).getReg()) 686981ad6265SDimitry Andric .addUse(MI.getOperand(1).getReg()); 687081ad6265SDimitry Andric 687104eeddc0SDimitry Andric MI.eraseFromParent(); 687281ad6265SDimitry Andric 687304eeddc0SDimitry Andric return true; 687404eeddc0SDimitry Andric } 687504eeddc0SDimitry Andric 68765f757f3fSDimitry Andric bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI, 68775f757f3fSDimitry Andric MachineIRBuilder &B) const { 68785f757f3fSDimitry Andric const SITargetLowering *TLI = ST.getTargetLowering(); 68795f757f3fSDimitry Andric Register StackPtr = TLI->getStackPointerRegisterToSaveRestore(); 68805f757f3fSDimitry Andric Register DstReg = MI.getOperand(0).getReg(); 68815f757f3fSDimitry Andric B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr}); 68825f757f3fSDimitry Andric MI.eraseFromParent(); 68835f757f3fSDimitry Andric return true; 68845f757f3fSDimitry Andric } 68855f757f3fSDimitry Andric 6886*b3edf446SDimitry Andric bool AMDGPULegalizerInfo::legalizeWaveID(MachineInstr &MI, 6887*b3edf446SDimitry Andric MachineIRBuilder &B) const { 6888*b3edf446SDimitry Andric // With architected SGPRs, waveIDinGroup is in TTMP8[29:25]. 6889*b3edf446SDimitry Andric if (!ST.hasArchitectedSGPRs()) 6890*b3edf446SDimitry Andric return false; 6891*b3edf446SDimitry Andric LLT S32 = LLT::scalar(32); 6892*b3edf446SDimitry Andric Register DstReg = MI.getOperand(0).getReg(); 6893*b3edf446SDimitry Andric auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8)); 6894*b3edf446SDimitry Andric auto LSB = B.buildConstant(S32, 25); 6895*b3edf446SDimitry Andric auto Width = B.buildConstant(S32, 5); 6896*b3edf446SDimitry Andric B.buildUbfx(DstReg, TTMP8, LSB, Width); 6897*b3edf446SDimitry Andric MI.eraseFromParent(); 6898*b3edf446SDimitry Andric return true; 6899*b3edf446SDimitry Andric } 6900*b3edf446SDimitry Andric 69015ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 69025ffd83dbSDimitry Andric MachineInstr &MI) const { 69035ffd83dbSDimitry Andric MachineIRBuilder &B = Helper.MIRBuilder; 69045ffd83dbSDimitry Andric MachineRegisterInfo &MRI = *B.getMRI(); 69055ffd83dbSDimitry Andric 69060b57cec5SDimitry Andric // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 
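// (A hedged sketch for amdgcn.if: the G_INTRINSIC_W_SIDE_EFFECTS + G_BRCOND
// pair produced by the IRTranslator is rewritten into the SI_IF exec-mask
// pseudo branching to the uncond target, rather than selecting anything
// here.)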
69075f757f3fSDimitry Andric auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID(); 6908480093f4SDimitry Andric switch (IntrID) { 6909480093f4SDimitry Andric case Intrinsic::amdgcn_if: 6910480093f4SDimitry Andric case Intrinsic::amdgcn_else: { 6911480093f4SDimitry Andric MachineInstr *Br = nullptr; 69125ffd83dbSDimitry Andric MachineBasicBlock *UncondBrTarget = nullptr; 6913e8d8bef9SDimitry Andric bool Negated = false; 6914e8d8bef9SDimitry Andric if (MachineInstr *BrCond = 6915e8d8bef9SDimitry Andric verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 69160b57cec5SDimitry Andric const SIRegisterInfo *TRI 69170b57cec5SDimitry Andric = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 69180b57cec5SDimitry Andric 69190b57cec5SDimitry Andric Register Def = MI.getOperand(1).getReg(); 69200b57cec5SDimitry Andric Register Use = MI.getOperand(3).getReg(); 6921480093f4SDimitry Andric 69225ffd83dbSDimitry Andric MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 6923e8d8bef9SDimitry Andric 6924e8d8bef9SDimitry Andric if (Negated) 6925e8d8bef9SDimitry Andric std::swap(CondBrTarget, UncondBrTarget); 6926e8d8bef9SDimitry Andric 69275ffd83dbSDimitry Andric B.setInsertPt(B.getMBB(), BrCond->getIterator()); 6928480093f4SDimitry Andric if (IntrID == Intrinsic::amdgcn_if) { 69290b57cec5SDimitry Andric B.buildInstr(AMDGPU::SI_IF) 69300b57cec5SDimitry Andric .addDef(Def) 69310b57cec5SDimitry Andric .addUse(Use) 69325ffd83dbSDimitry Andric .addMBB(UncondBrTarget); 6933480093f4SDimitry Andric } else { 6934480093f4SDimitry Andric B.buildInstr(AMDGPU::SI_ELSE) 6935480093f4SDimitry Andric .addDef(Def) 6936480093f4SDimitry Andric .addUse(Use) 6937e8d8bef9SDimitry Andric .addMBB(UncondBrTarget); 6938480093f4SDimitry Andric } 6939480093f4SDimitry Andric 69405ffd83dbSDimitry Andric if (Br) { 69415ffd83dbSDimitry Andric Br->getOperand(0).setMBB(CondBrTarget); 69425ffd83dbSDimitry Andric } else { 69435ffd83dbSDimitry Andric // The IRTranslator skips inserting the G_BR for fallthrough cases, but 69445ffd83dbSDimitry Andric // since we're swapping branch targets it needs to be reinserted. 
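// (i.e. the B.buildBr(*CondBrTarget) below materializes the G_BR explicitly,
// so the swapped target is still reached on what used to be the fallthrough
// path.)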
69455ffd83dbSDimitry Andric // FIXME: IRTranslator should probably not do this 69465ffd83dbSDimitry Andric B.buildBr(*CondBrTarget); 69475ffd83dbSDimitry Andric } 69480b57cec5SDimitry Andric 69490b57cec5SDimitry Andric MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 69500b57cec5SDimitry Andric MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 69510b57cec5SDimitry Andric MI.eraseFromParent(); 69520b57cec5SDimitry Andric BrCond->eraseFromParent(); 69530b57cec5SDimitry Andric return true; 69540b57cec5SDimitry Andric } 69550b57cec5SDimitry Andric 69560b57cec5SDimitry Andric return false; 69570b57cec5SDimitry Andric } 69580b57cec5SDimitry Andric case Intrinsic::amdgcn_loop: { 6959480093f4SDimitry Andric MachineInstr *Br = nullptr; 69605ffd83dbSDimitry Andric MachineBasicBlock *UncondBrTarget = nullptr; 6961e8d8bef9SDimitry Andric bool Negated = false; 6962e8d8bef9SDimitry Andric if (MachineInstr *BrCond = 6963e8d8bef9SDimitry Andric verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 69640b57cec5SDimitry Andric const SIRegisterInfo *TRI 69650b57cec5SDimitry Andric = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 69660b57cec5SDimitry Andric 69675ffd83dbSDimitry Andric MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 69680b57cec5SDimitry Andric Register Reg = MI.getOperand(2).getReg(); 69695ffd83dbSDimitry Andric 6970e8d8bef9SDimitry Andric if (Negated) 6971e8d8bef9SDimitry Andric std::swap(CondBrTarget, UncondBrTarget); 6972e8d8bef9SDimitry Andric 69735ffd83dbSDimitry Andric B.setInsertPt(B.getMBB(), BrCond->getIterator()); 69740b57cec5SDimitry Andric B.buildInstr(AMDGPU::SI_LOOP) 69750b57cec5SDimitry Andric .addUse(Reg) 69765ffd83dbSDimitry Andric .addMBB(UncondBrTarget); 69775ffd83dbSDimitry Andric 69785ffd83dbSDimitry Andric if (Br) 69795ffd83dbSDimitry Andric Br->getOperand(0).setMBB(CondBrTarget); 69805ffd83dbSDimitry Andric else 69815ffd83dbSDimitry Andric B.buildBr(*CondBrTarget); 69825ffd83dbSDimitry Andric 69830b57cec5SDimitry Andric MI.eraseFromParent(); 69840b57cec5SDimitry Andric BrCond->eraseFromParent(); 69850b57cec5SDimitry Andric MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 69860b57cec5SDimitry Andric return true; 69870b57cec5SDimitry Andric } 69880b57cec5SDimitry Andric 69890b57cec5SDimitry Andric return false; 69900b57cec5SDimitry Andric } 699106c3fb27SDimitry Andric case Intrinsic::amdgcn_make_buffer_rsrc: 699206c3fb27SDimitry Andric return legalizePointerAsRsrcIntrin(MI, MRI, B); 69930b57cec5SDimitry Andric case Intrinsic::amdgcn_kernarg_segment_ptr: 69945ffd83dbSDimitry Andric if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 69955ffd83dbSDimitry Andric // This only makes sense to call in a kernel, so just lower to null. 
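// (Non-kernel callers have no kernarg segment, so the pointer result is
// simply materialized as the constant-zero/null value below.)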
699106c3fb27SDimitry Andric   case Intrinsic::amdgcn_make_buffer_rsrc:
699206c3fb27SDimitry Andric     return legalizePointerAsRsrcIntrin(MI, MRI, B);
69930b57cec5SDimitry Andric   case Intrinsic::amdgcn_kernarg_segment_ptr:
69945ffd83dbSDimitry Andric     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
69955ffd83dbSDimitry Andric       // This only makes sense to call in a kernel, so just lower to null.
69965ffd83dbSDimitry Andric       B.buildConstant(MI.getOperand(0).getReg(), 0);
69975ffd83dbSDimitry Andric       MI.eraseFromParent();
69985ffd83dbSDimitry Andric       return true;
69995ffd83dbSDimitry Andric     }
70005ffd83dbSDimitry Andric 
70010b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(
70020b57cec5SDimitry Andric         MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
70030b57cec5SDimitry Andric   case Intrinsic::amdgcn_implicitarg_ptr:
70040b57cec5SDimitry Andric     return legalizeImplicitArgPtr(MI, MRI, B);
70050b57cec5SDimitry Andric   case Intrinsic::amdgcn_workitem_id_x:
700681ad6265SDimitry Andric     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
70070b57cec5SDimitry Andric                                        AMDGPUFunctionArgInfo::WORKITEM_ID_X);
70080b57cec5SDimitry Andric   case Intrinsic::amdgcn_workitem_id_y:
700981ad6265SDimitry Andric     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
70100b57cec5SDimitry Andric                                        AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
70110b57cec5SDimitry Andric   case Intrinsic::amdgcn_workitem_id_z:
701281ad6265SDimitry Andric     return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
70130b57cec5SDimitry Andric                                        AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
70140b57cec5SDimitry Andric   case Intrinsic::amdgcn_workgroup_id_x:
70150b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
70160b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
70170b57cec5SDimitry Andric   case Intrinsic::amdgcn_workgroup_id_y:
70180b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
70190b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
70200b57cec5SDimitry Andric   case Intrinsic::amdgcn_workgroup_id_z:
70210b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
70220b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
7023*b3edf446SDimitry Andric   case Intrinsic::amdgcn_wave_id:
7024*b3edf446SDimitry Andric     return legalizeWaveID(MI, B);
7025fcaf7f86SDimitry Andric   case Intrinsic::amdgcn_lds_kernel_id:
7026fcaf7f86SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
7027fcaf7f86SDimitry Andric                                       AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
70280b57cec5SDimitry Andric   case Intrinsic::amdgcn_dispatch_ptr:
70290b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
70300b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
70310b57cec5SDimitry Andric   case Intrinsic::amdgcn_queue_ptr:
70320b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
70330b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
70340b57cec5SDimitry Andric   case Intrinsic::amdgcn_implicit_buffer_ptr:
70350b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(
70360b57cec5SDimitry Andric         MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
70370b57cec5SDimitry Andric   case Intrinsic::amdgcn_dispatch_id:
70380b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
70390b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
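  // The cases above read values the calling convention preloads into
  // registers, as described by AMDGPUFunctionArgInfo. The r600_* cases below
  // instead load their values from fixed offsets in the kernarg segment
  // (SI::KernelInputOffsets); roughly, each expands to a G_PTR_ADD off the
  // kernarg segment pointer followed by a G_LOAD.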
704081ad6265SDimitry Andric   case Intrinsic::r600_read_ngroups_x:
704181ad6265SDimitry Andric     // TODO: Emit error for hsa
704281ad6265SDimitry Andric     return legalizeKernargMemParameter(MI, B,
704381ad6265SDimitry Andric                                        SI::KernelInputOffsets::NGROUPS_X);
704481ad6265SDimitry Andric   case Intrinsic::r600_read_ngroups_y:
704581ad6265SDimitry Andric     return legalizeKernargMemParameter(MI, B,
704681ad6265SDimitry Andric                                        SI::KernelInputOffsets::NGROUPS_Y);
704781ad6265SDimitry Andric   case Intrinsic::r600_read_ngroups_z:
704881ad6265SDimitry Andric     return legalizeKernargMemParameter(MI, B,
704981ad6265SDimitry Andric                                        SI::KernelInputOffsets::NGROUPS_Z);
705081ad6265SDimitry Andric   case Intrinsic::r600_read_local_size_x:
705181ad6265SDimitry Andric     // TODO: Could insert G_ASSERT_ZEXT from s16
705281ad6265SDimitry Andric     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X);
705381ad6265SDimitry Andric   case Intrinsic::r600_read_local_size_y:
705481ad6265SDimitry Andric     // TODO: Could insert G_ASSERT_ZEXT from s16
705581ad6265SDimitry Andric     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Y);
705681ad6265SDimitry Andric   case Intrinsic::r600_read_local_size_z:
705781ad6265SDimitry Andric     // TODO: Could insert G_ASSERT_ZEXT from s16
705881ad6265SDimitry Andric     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z);
705981ad6265SDimitry Andric   case Intrinsic::r600_read_global_size_x:
706081ad6265SDimitry Andric     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X);
706181ad6265SDimitry Andric   case Intrinsic::r600_read_global_size_y:
706281ad6265SDimitry Andric     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y);
706381ad6265SDimitry Andric   case Intrinsic::r600_read_global_size_z:
706481ad6265SDimitry Andric     return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z);
70658bcb0991SDimitry Andric   case Intrinsic::amdgcn_fdiv_fast:
70668bcb0991SDimitry Andric     return legalizeFDIVFastIntrin(MI, MRI, B);
70678bcb0991SDimitry Andric   case Intrinsic::amdgcn_is_shared:
70688bcb0991SDimitry Andric     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
70698bcb0991SDimitry Andric   case Intrinsic::amdgcn_is_private:
70708bcb0991SDimitry Andric     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
70718bcb0991SDimitry Andric   case Intrinsic::amdgcn_wavefrontsize: {
70728bcb0991SDimitry Andric     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
70738bcb0991SDimitry Andric     MI.eraseFromParent();
70748bcb0991SDimitry Andric     return true;
70758bcb0991SDimitry Andric   }
70765ffd83dbSDimitry Andric   case Intrinsic::amdgcn_s_buffer_load:
7077e8d8bef9SDimitry Andric     return legalizeSBufferLoad(Helper, MI);
70788bcb0991SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_store:
707906c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_store:
70805ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_store:
708106c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_store:
70825ffd83dbSDimitry Andric     return legalizeBufferStore(MI, MRI, B, false, false);
70838bcb0991SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_store_format:
708406c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
70855ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_store_format:
708606c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
70875ffd83dbSDimitry Andric     return legalizeBufferStore(MI, MRI, B, false, true);
70885ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_tbuffer_store:
708906c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
70905ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_tbuffer_store:
709106c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
70925ffd83dbSDimitry Andric     return legalizeBufferStore(MI, MRI, B, true, true);
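  // At these legalizeBufferStore call sites the two trailing booleans select
  // the store flavor: plain buffer.store is (false, false),
  // buffer.store.format is (false, true), and tbuffer.store is (true, true).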
70935ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_load:
709406c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_load:
70955ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_load:
709606c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_load:
70975ffd83dbSDimitry Andric     return legalizeBufferLoad(MI, MRI, B, false, false);
70985ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_load_format:
709906c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
71005ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_load_format:
710106c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
71025ffd83dbSDimitry Andric     return legalizeBufferLoad(MI, MRI, B, true, false);
71035ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_tbuffer_load:
710406c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
71055ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_tbuffer_load:
710606c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
71075ffd83dbSDimitry Andric     return legalizeBufferLoad(MI, MRI, B, true, true);
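  // Likewise for the loads above: plain buffer.load is (false, false),
  // buffer.load.format is (true, false), and tbuffer.load is (true, true).
  // Note the format flag sits in a different position than at the store call
  // sites; the flags follow the parameter order of the respective
  // legalizeBufferLoad and legalizeBufferStore helpers.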
71085ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
710906c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
71105ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
711106c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
71125ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_add:
711306c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
71145ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_add:
711506c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
71165ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
711706c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
71185ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
711906c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
71205ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
712106c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
71225ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
712306c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
71245ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
712506c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
71265ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
712706c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
71285ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
712906c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
71305ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
713106c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
71325ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
713306c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
71345ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
713506c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
71365ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_and:
713706c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
71385ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_and:
713906c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
71405ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_or:
714106c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
71425ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_or:
714306c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
71445ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
714506c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
71465ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
714706c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
71485ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
714906c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
71505ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
715106c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
71525ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
715306c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
71545ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
715506c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
71565ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
715706c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
71585ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
715906c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
7160fe6060f1SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
716106c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
7162fe6060f1SDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
716306c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
7164fe6060f1SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
716506c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
7166fe6060f1SDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
716706c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
716804eeddc0SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
716906c3fb27SDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
7170bdd1243dSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
717106c3fb27SDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
71727a6dacacSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16:
71737a6dacacSDimitry Andric   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd_v2bf16:
71747a6dacacSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16:
71757a6dacacSDimitry Andric   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd_v2bf16:
717604eeddc0SDimitry Andric     return legalizeBufferAtomic(MI, B, IntrID);
71775ffd83dbSDimitry Andric   case Intrinsic::trap:
71785ffd83dbSDimitry Andric     return legalizeTrapIntrinsic(MI, MRI, B);
71795ffd83dbSDimitry Andric   case Intrinsic::debugtrap:
71805ffd83dbSDimitry Andric     return legalizeDebugTrapIntrinsic(MI, MRI, B);
7181e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_rsq_clamp:
7182e8d8bef9SDimitry Andric     return legalizeRsqClampIntrinsic(MI, MRI, B);
7183e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_ds_fadd:
7184e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_ds_fmin:
7185e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_ds_fmax:
7186e8d8bef9SDimitry Andric     return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
7187e8d8bef9SDimitry Andric   case Intrinsic::amdgcn_image_bvh_intersect_ray:
7188e8d8bef9SDimitry Andric     return legalizeBVHIntrinsic(MI, B);
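  // The swmmac (sparse WMMA) cases below widen a sub-32-bit sparsity-index
  // operand to s32 with G_ANYEXT so selection only has to handle one index
  // type: operand 5 for the float variants, operand 7 for the iu4/iu8
  // variants. The amdgcn_fmed3 case further down retags the intrinsic as
  // G_AMDGPU_FMED3 so generic combines can match it without intrinsic
  // patterns.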
7189*b3edf446SDimitry Andric   case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
7190*b3edf446SDimitry Andric   case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
7191*b3edf446SDimitry Andric   case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
7192*b3edf446SDimitry Andric   case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
7193*b3edf446SDimitry Andric   case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
7194*b3edf446SDimitry Andric   case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
7195*b3edf446SDimitry Andric   case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
7196*b3edf446SDimitry Andric   case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
7197*b3edf446SDimitry Andric     Register Index = MI.getOperand(5).getReg();
7198*b3edf446SDimitry Andric     LLT S32 = LLT::scalar(32);
7199*b3edf446SDimitry Andric     if (MRI.getType(Index) != S32)
7200*b3edf446SDimitry Andric       MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
7201*b3edf446SDimitry Andric     return true;
7202*b3edf446SDimitry Andric   }
7203*b3edf446SDimitry Andric   case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
7204*b3edf446SDimitry Andric   case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
7205*b3edf446SDimitry Andric   case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
7206*b3edf446SDimitry Andric     Register Index = MI.getOperand(7).getReg();
7207*b3edf446SDimitry Andric     LLT S32 = LLT::scalar(32);
7208*b3edf446SDimitry Andric     if (MRI.getType(Index) != S32)
7209*b3edf446SDimitry Andric       MI.getOperand(7).setReg(B.buildAnyExt(S32, Index).getReg(0));
7210*b3edf446SDimitry Andric     return true;
7211*b3edf446SDimitry Andric   }
721206c3fb27SDimitry Andric   case Intrinsic::amdgcn_fmed3: {
721306c3fb27SDimitry Andric     GISelChangeObserver &Observer = Helper.Observer;
721406c3fb27SDimitry Andric 
721506c3fb27SDimitry Andric     // FIXME: This is to work around the inability of tablegen match
721606c3fb27SDimitry Andric     // combiners to match intrinsics in patterns.
721706c3fb27SDimitry Andric     Observer.changingInstr(MI);
721806c3fb27SDimitry Andric     MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
721906c3fb27SDimitry Andric     MI.removeOperand(1);
722006c3fb27SDimitry Andric     Observer.changedInstr(MI);
722106c3fb27SDimitry Andric     return true;
722206c3fb27SDimitry Andric   }
72235ffd83dbSDimitry Andric   default: {
72245ffd83dbSDimitry Andric     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
72255ffd83dbSDimitry Andric             AMDGPU::getImageDimIntrinsicInfo(IntrID))
72265ffd83dbSDimitry Andric       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
72270b57cec5SDimitry Andric     return true;
72280b57cec5SDimitry Andric   }
72295ffd83dbSDimitry Andric   }
72300b57cec5SDimitry Andric 
72310b57cec5SDimitry Andric   return true;
72320b57cec5SDimitry Andric }
7233