10b57cec5SDimitry Andric //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==// 20b57cec5SDimitry Andric // 30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information. 50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 60b57cec5SDimitry Andric // 70b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 80b57cec5SDimitry Andric /// \file 90b57cec5SDimitry Andric /// This file implements the targeting of the Machinelegalizer class for 100b57cec5SDimitry Andric /// AMDGPU. 110b57cec5SDimitry Andric /// \todo This should be generated by TableGen. 120b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 130b57cec5SDimitry Andric 145ffd83dbSDimitry Andric #include "AMDGPULegalizerInfo.h" 158bcb0991SDimitry Andric 160b57cec5SDimitry Andric #include "AMDGPU.h" 175ffd83dbSDimitry Andric #include "AMDGPUGlobalISelUtils.h" 18e8d8bef9SDimitry Andric #include "AMDGPUInstrInfo.h" 190b57cec5SDimitry Andric #include "AMDGPUTargetMachine.h" 200b57cec5SDimitry Andric #include "SIMachineFunctionInfo.h" 21fe6060f1SDimitry Andric #include "Utils/AMDGPUBaseInfo.h" 225ffd83dbSDimitry Andric #include "llvm/ADT/ScopeExit.h" 23fe6060f1SDimitry Andric #include "llvm/BinaryFormat/ELF.h" 240b57cec5SDimitry Andric #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" 255ffd83dbSDimitry Andric #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" 26e8d8bef9SDimitry Andric #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" 278bcb0991SDimitry Andric #include "llvm/IR/DiagnosticInfo.h" 28e8d8bef9SDimitry Andric #include "llvm/IR/IntrinsicsAMDGPU.h" 2981ad6265SDimitry Andric #include "llvm/IR/IntrinsicsR600.h" 300b57cec5SDimitry Andric 310b57cec5SDimitry Andric #define DEBUG_TYPE "amdgpu-legalinfo" 
320b57cec5SDimitry Andric 330b57cec5SDimitry Andric using namespace llvm; 340b57cec5SDimitry Andric using namespace LegalizeActions; 350b57cec5SDimitry Andric using namespace LegalizeMutations; 360b57cec5SDimitry Andric using namespace LegalityPredicates; 375ffd83dbSDimitry Andric using namespace MIPatternMatch; 380b57cec5SDimitry Andric 395ffd83dbSDimitry Andric // Hack until load/store selection patterns support any tuple of legal types. 405ffd83dbSDimitry Andric static cl::opt<bool> EnableNewLegality( 415ffd83dbSDimitry Andric "amdgpu-global-isel-new-legality", 425ffd83dbSDimitry Andric cl::desc("Use GlobalISel desired legality, rather than try to use" 435ffd83dbSDimitry Andric "rules compatible with selection patterns"), 445ffd83dbSDimitry Andric cl::init(false), 455ffd83dbSDimitry Andric cl::ReallyHidden); 460b57cec5SDimitry Andric 475ffd83dbSDimitry Andric static constexpr unsigned MaxRegisterSize = 1024; 485ffd83dbSDimitry Andric 495ffd83dbSDimitry Andric // Round the number of elements to the next power of two elements 505ffd83dbSDimitry Andric static LLT getPow2VectorType(LLT Ty) { 515ffd83dbSDimitry Andric unsigned NElts = Ty.getNumElements(); 525ffd83dbSDimitry Andric unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts); 53fe6060f1SDimitry Andric return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts)); 540b57cec5SDimitry Andric } 550b57cec5SDimitry Andric 565ffd83dbSDimitry Andric // Round the number of bits to the next power of two bits 575ffd83dbSDimitry Andric static LLT getPow2ScalarType(LLT Ty) { 585ffd83dbSDimitry Andric unsigned Bits = Ty.getSizeInBits(); 595ffd83dbSDimitry Andric unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits); 605ffd83dbSDimitry Andric return LLT::scalar(Pow2Bits); 618bcb0991SDimitry Andric } 628bcb0991SDimitry Andric 63349cc55cSDimitry Andric /// \returns true if this is an odd sized vector which should widen by adding an 64e8d8bef9SDimitry Andric /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. 
This 65e8d8bef9SDimitry Andric /// excludes s1 vectors, which should always be scalarized. 660b57cec5SDimitry Andric static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { 670b57cec5SDimitry Andric return [=](const LegalityQuery &Query) { 680b57cec5SDimitry Andric const LLT Ty = Query.Types[TypeIdx]; 69e8d8bef9SDimitry Andric if (!Ty.isVector()) 70e8d8bef9SDimitry Andric return false; 71e8d8bef9SDimitry Andric 72e8d8bef9SDimitry Andric const LLT EltTy = Ty.getElementType(); 73e8d8bef9SDimitry Andric const unsigned EltSize = EltTy.getSizeInBits(); 74e8d8bef9SDimitry Andric return Ty.getNumElements() % 2 != 0 && 75e8d8bef9SDimitry Andric EltSize > 1 && EltSize < 32 && 768bcb0991SDimitry Andric Ty.getSizeInBits() % 32 != 0; 778bcb0991SDimitry Andric }; 788bcb0991SDimitry Andric } 798bcb0991SDimitry Andric 80e8d8bef9SDimitry Andric static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) { 81e8d8bef9SDimitry Andric return [=](const LegalityQuery &Query) { 82e8d8bef9SDimitry Andric const LLT Ty = Query.Types[TypeIdx]; 83e8d8bef9SDimitry Andric return Ty.getSizeInBits() % 32 == 0; 84e8d8bef9SDimitry Andric }; 85e8d8bef9SDimitry Andric } 86e8d8bef9SDimitry Andric 878bcb0991SDimitry Andric static LegalityPredicate isWideVec16(unsigned TypeIdx) { 888bcb0991SDimitry Andric return [=](const LegalityQuery &Query) { 898bcb0991SDimitry Andric const LLT Ty = Query.Types[TypeIdx]; 908bcb0991SDimitry Andric const LLT EltTy = Ty.getScalarType(); 918bcb0991SDimitry Andric return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; 920b57cec5SDimitry Andric }; 930b57cec5SDimitry Andric } 940b57cec5SDimitry Andric 950b57cec5SDimitry Andric static LegalizeMutation oneMoreElement(unsigned TypeIdx) { 960b57cec5SDimitry Andric return [=](const LegalityQuery &Query) { 970b57cec5SDimitry Andric const LLT Ty = Query.Types[TypeIdx]; 980b57cec5SDimitry Andric const LLT EltTy = Ty.getElementType(); 99fe6060f1SDimitry Andric return std::make_pair(TypeIdx, 100fe6060f1SDimitry Andric 
LLT::fixed_vector(Ty.getNumElements() + 1, EltTy)); 1010b57cec5SDimitry Andric }; 1020b57cec5SDimitry Andric } 1030b57cec5SDimitry Andric 1040b57cec5SDimitry Andric static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { 1050b57cec5SDimitry Andric return [=](const LegalityQuery &Query) { 1060b57cec5SDimitry Andric const LLT Ty = Query.Types[TypeIdx]; 1070b57cec5SDimitry Andric const LLT EltTy = Ty.getElementType(); 1080b57cec5SDimitry Andric unsigned Size = Ty.getSizeInBits(); 1090b57cec5SDimitry Andric unsigned Pieces = (Size + 63) / 64; 1100b57cec5SDimitry Andric unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; 111fe6060f1SDimitry Andric return std::make_pair( 112fe6060f1SDimitry Andric TypeIdx, 113fe6060f1SDimitry Andric LLT::scalarOrVector(ElementCount::getFixed(NewNumElts), EltTy)); 1140b57cec5SDimitry Andric }; 1150b57cec5SDimitry Andric } 1160b57cec5SDimitry Andric 1178bcb0991SDimitry Andric // Increase the number of vector elements to reach the next multiple of 32-bit 1188bcb0991SDimitry Andric // type. 
1198bcb0991SDimitry Andric static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 1208bcb0991SDimitry Andric return [=](const LegalityQuery &Query) { 1218bcb0991SDimitry Andric const LLT Ty = Query.Types[TypeIdx]; 1228bcb0991SDimitry Andric 1238bcb0991SDimitry Andric const LLT EltTy = Ty.getElementType(); 1248bcb0991SDimitry Andric const int Size = Ty.getSizeInBits(); 1258bcb0991SDimitry Andric const int EltSize = EltTy.getSizeInBits(); 1268bcb0991SDimitry Andric const int NextMul32 = (Size + 31) / 32; 1278bcb0991SDimitry Andric 1288bcb0991SDimitry Andric assert(EltSize < 32); 1298bcb0991SDimitry Andric 1308bcb0991SDimitry Andric const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 131fe6060f1SDimitry Andric return std::make_pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy)); 1328bcb0991SDimitry Andric }; 1338bcb0991SDimitry Andric } 1348bcb0991SDimitry Andric 135e8d8bef9SDimitry Andric static LLT getBitcastRegisterType(const LLT Ty) { 136e8d8bef9SDimitry Andric const unsigned Size = Ty.getSizeInBits(); 1375ffd83dbSDimitry Andric 1385ffd83dbSDimitry Andric if (Size <= 32) { 1395ffd83dbSDimitry Andric // <2 x s8> -> s16 1405ffd83dbSDimitry Andric // <4 x s8> -> s32 141e8d8bef9SDimitry Andric return LLT::scalar(Size); 142e8d8bef9SDimitry Andric } 1435ffd83dbSDimitry Andric 144fe6060f1SDimitry Andric return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32); 145e8d8bef9SDimitry Andric } 146e8d8bef9SDimitry Andric 147e8d8bef9SDimitry Andric static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) { 148e8d8bef9SDimitry Andric return [=](const LegalityQuery &Query) { 149e8d8bef9SDimitry Andric const LLT Ty = Query.Types[TypeIdx]; 150e8d8bef9SDimitry Andric return std::make_pair(TypeIdx, getBitcastRegisterType(Ty)); 151e8d8bef9SDimitry Andric }; 152e8d8bef9SDimitry Andric } 153e8d8bef9SDimitry Andric 154e8d8bef9SDimitry Andric static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) { 155e8d8bef9SDimitry Andric return 
[=](const LegalityQuery &Query) { 156e8d8bef9SDimitry Andric const LLT Ty = Query.Types[TypeIdx]; 157e8d8bef9SDimitry Andric unsigned Size = Ty.getSizeInBits(); 158e8d8bef9SDimitry Andric assert(Size % 32 == 0); 159fe6060f1SDimitry Andric return std::make_pair( 160fe6060f1SDimitry Andric TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32)); 1615ffd83dbSDimitry Andric }; 1625ffd83dbSDimitry Andric } 1635ffd83dbSDimitry Andric 1648bcb0991SDimitry Andric static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 1658bcb0991SDimitry Andric return [=](const LegalityQuery &Query) { 1668bcb0991SDimitry Andric const LLT QueryTy = Query.Types[TypeIdx]; 1678bcb0991SDimitry Andric return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 1688bcb0991SDimitry Andric }; 1698bcb0991SDimitry Andric } 1708bcb0991SDimitry Andric 1710b57cec5SDimitry Andric static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 1720b57cec5SDimitry Andric return [=](const LegalityQuery &Query) { 1730b57cec5SDimitry Andric const LLT QueryTy = Query.Types[TypeIdx]; 1740b57cec5SDimitry Andric return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; 1750b57cec5SDimitry Andric }; 1760b57cec5SDimitry Andric } 1770b57cec5SDimitry Andric 1780b57cec5SDimitry Andric static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { 1790b57cec5SDimitry Andric return [=](const LegalityQuery &Query) { 1800b57cec5SDimitry Andric const LLT QueryTy = Query.Types[TypeIdx]; 1810b57cec5SDimitry Andric return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; 1820b57cec5SDimitry Andric }; 1830b57cec5SDimitry Andric } 1840b57cec5SDimitry Andric 1855ffd83dbSDimitry Andric static bool isRegisterSize(unsigned Size) { 1865ffd83dbSDimitry Andric return Size % 32 == 0 && Size <= MaxRegisterSize; 1875ffd83dbSDimitry Andric } 1885ffd83dbSDimitry Andric 1895ffd83dbSDimitry Andric static bool isRegisterVectorElementType(LLT EltTy) { 1905ffd83dbSDimitry Andric const 
int EltSize = EltTy.getSizeInBits(); 1915ffd83dbSDimitry Andric return EltSize == 16 || EltSize % 32 == 0; 1925ffd83dbSDimitry Andric } 1935ffd83dbSDimitry Andric 1945ffd83dbSDimitry Andric static bool isRegisterVectorType(LLT Ty) { 1950b57cec5SDimitry Andric const int EltSize = Ty.getElementType().getSizeInBits(); 1960b57cec5SDimitry Andric return EltSize == 32 || EltSize == 64 || 1970b57cec5SDimitry Andric (EltSize == 16 && Ty.getNumElements() % 2 == 0) || 1980b57cec5SDimitry Andric EltSize == 128 || EltSize == 256; 1990b57cec5SDimitry Andric } 2000b57cec5SDimitry Andric 2015ffd83dbSDimitry Andric static bool isRegisterType(LLT Ty) { 2025ffd83dbSDimitry Andric if (!isRegisterSize(Ty.getSizeInBits())) 2035ffd83dbSDimitry Andric return false; 2045ffd83dbSDimitry Andric 2055ffd83dbSDimitry Andric if (Ty.isVector()) 2065ffd83dbSDimitry Andric return isRegisterVectorType(Ty); 2075ffd83dbSDimitry Andric 2085ffd83dbSDimitry Andric return true; 2095ffd83dbSDimitry Andric } 2105ffd83dbSDimitry Andric 2115ffd83dbSDimitry Andric // Any combination of 32 or 64-bit elements up the maximum register size, and 2125ffd83dbSDimitry Andric // multiples of v2s16. 
2135ffd83dbSDimitry Andric static LegalityPredicate isRegisterType(unsigned TypeIdx) { 2145ffd83dbSDimitry Andric return [=](const LegalityQuery &Query) { 2155ffd83dbSDimitry Andric return isRegisterType(Query.Types[TypeIdx]); 2168bcb0991SDimitry Andric }; 2178bcb0991SDimitry Andric } 2188bcb0991SDimitry Andric 2195ffd83dbSDimitry Andric static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { 2208bcb0991SDimitry Andric return [=](const LegalityQuery &Query) { 2215ffd83dbSDimitry Andric const LLT QueryTy = Query.Types[TypeIdx]; 2225ffd83dbSDimitry Andric if (!QueryTy.isVector()) 2235ffd83dbSDimitry Andric return false; 2245ffd83dbSDimitry Andric const LLT EltTy = QueryTy.getElementType(); 2255ffd83dbSDimitry Andric return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32; 2268bcb0991SDimitry Andric }; 2278bcb0991SDimitry Andric } 2288bcb0991SDimitry Andric 229fe6060f1SDimitry Andric // If we have a truncating store or an extending load with a data size larger 230fe6060f1SDimitry Andric // than 32-bits, we need to reduce to a 32-bit type. 231fe6060f1SDimitry Andric static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) { 2328bcb0991SDimitry Andric return [=](const LegalityQuery &Query) { 2338bcb0991SDimitry Andric const LLT Ty = Query.Types[TypeIdx]; 2348bcb0991SDimitry Andric return !Ty.isVector() && Ty.getSizeInBits() > 32 && 235fe6060f1SDimitry Andric Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits(); 2360b57cec5SDimitry Andric }; 2370b57cec5SDimitry Andric } 2380b57cec5SDimitry Andric 2395ffd83dbSDimitry Andric // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 2405ffd83dbSDimitry Andric // handle some operations by just promoting the register during 2415ffd83dbSDimitry Andric // selection. There are also d16 loads on GFX9+ which preserve the high bits. 
2425ffd83dbSDimitry Andric static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, 2435ffd83dbSDimitry Andric bool IsLoad) { 2445ffd83dbSDimitry Andric switch (AS) { 2455ffd83dbSDimitry Andric case AMDGPUAS::PRIVATE_ADDRESS: 2465ffd83dbSDimitry Andric // FIXME: Private element size. 247e8d8bef9SDimitry Andric return ST.enableFlatScratch() ? 128 : 32; 2485ffd83dbSDimitry Andric case AMDGPUAS::LOCAL_ADDRESS: 2495ffd83dbSDimitry Andric return ST.useDS128() ? 128 : 64; 2505ffd83dbSDimitry Andric case AMDGPUAS::GLOBAL_ADDRESS: 2515ffd83dbSDimitry Andric case AMDGPUAS::CONSTANT_ADDRESS: 2525ffd83dbSDimitry Andric case AMDGPUAS::CONSTANT_ADDRESS_32BIT: 2535ffd83dbSDimitry Andric // Treat constant and global as identical. SMRD loads are sometimes usable for 2545ffd83dbSDimitry Andric // global loads (ideally constant address space should be eliminated) 2555ffd83dbSDimitry Andric // depending on the context. Legality cannot be context dependent, but 2565ffd83dbSDimitry Andric // RegBankSelect can split the load as necessary depending on the pointer 2575ffd83dbSDimitry Andric // register bank/uniformity and if the memory is invariant or not written in a 2585ffd83dbSDimitry Andric // kernel. 2595ffd83dbSDimitry Andric return IsLoad ? 512 : 128; 2605ffd83dbSDimitry Andric default: 2615ffd83dbSDimitry Andric // Flat addresses may contextually need to be split to 32-bit parts if they 2625ffd83dbSDimitry Andric // may alias scratch depending on the subtarget. 
2635ffd83dbSDimitry Andric return 128; 2645ffd83dbSDimitry Andric } 2655ffd83dbSDimitry Andric } 2665ffd83dbSDimitry Andric 2675ffd83dbSDimitry Andric static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, 268fe6060f1SDimitry Andric const LegalityQuery &Query) { 2695ffd83dbSDimitry Andric const LLT Ty = Query.Types[0]; 2705ffd83dbSDimitry Andric 2715ffd83dbSDimitry Andric // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD 272fe6060f1SDimitry Andric const bool IsLoad = Query.Opcode != AMDGPU::G_STORE; 2735ffd83dbSDimitry Andric 2745ffd83dbSDimitry Andric unsigned RegSize = Ty.getSizeInBits(); 27504eeddc0SDimitry Andric uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 27604eeddc0SDimitry Andric uint64_t AlignBits = Query.MMODescrs[0].AlignInBits; 2775ffd83dbSDimitry Andric unsigned AS = Query.Types[1].getAddressSpace(); 2785ffd83dbSDimitry Andric 2795ffd83dbSDimitry Andric // All of these need to be custom lowered to cast the pointer operand. 2805ffd83dbSDimitry Andric if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) 2815ffd83dbSDimitry Andric return false; 2825ffd83dbSDimitry Andric 283fe6060f1SDimitry Andric // Do not handle extending vector loads. 284fe6060f1SDimitry Andric if (Ty.isVector() && MemSize != RegSize) 285fe6060f1SDimitry Andric return false; 286fe6060f1SDimitry Andric 2875ffd83dbSDimitry Andric // TODO: We should be able to widen loads if the alignment is high enough, but 2885ffd83dbSDimitry Andric // we also need to modify the memory access size. 2895ffd83dbSDimitry Andric #if 0 2905ffd83dbSDimitry Andric // Accept widening loads based on alignment. 2915ffd83dbSDimitry Andric if (IsLoad && MemSize < Size) 2925ffd83dbSDimitry Andric MemSize = std::max(MemSize, Align); 2935ffd83dbSDimitry Andric #endif 2945ffd83dbSDimitry Andric 2955ffd83dbSDimitry Andric // Only 1-byte and 2-byte to 32-bit extloads are valid. 
2965ffd83dbSDimitry Andric if (MemSize != RegSize && RegSize != 32) 2975ffd83dbSDimitry Andric return false; 2985ffd83dbSDimitry Andric 2995ffd83dbSDimitry Andric if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 3005ffd83dbSDimitry Andric return false; 3015ffd83dbSDimitry Andric 3025ffd83dbSDimitry Andric switch (MemSize) { 3035ffd83dbSDimitry Andric case 8: 3045ffd83dbSDimitry Andric case 16: 3055ffd83dbSDimitry Andric case 32: 3065ffd83dbSDimitry Andric case 64: 3075ffd83dbSDimitry Andric case 128: 3085ffd83dbSDimitry Andric break; 3095ffd83dbSDimitry Andric case 96: 3105ffd83dbSDimitry Andric if (!ST.hasDwordx3LoadStores()) 3115ffd83dbSDimitry Andric return false; 3125ffd83dbSDimitry Andric break; 3135ffd83dbSDimitry Andric case 256: 3145ffd83dbSDimitry Andric case 512: 3155ffd83dbSDimitry Andric // These may contextually need to be broken down. 3165ffd83dbSDimitry Andric break; 3175ffd83dbSDimitry Andric default: 3185ffd83dbSDimitry Andric return false; 3195ffd83dbSDimitry Andric } 3205ffd83dbSDimitry Andric 3215ffd83dbSDimitry Andric assert(RegSize >= MemSize); 3225ffd83dbSDimitry Andric 323e8d8bef9SDimitry Andric if (AlignBits < MemSize) { 3245ffd83dbSDimitry Andric const SITargetLowering *TLI = ST.getTargetLowering(); 325e8d8bef9SDimitry Andric if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, 326e8d8bef9SDimitry Andric Align(AlignBits / 8))) 3275ffd83dbSDimitry Andric return false; 3285ffd83dbSDimitry Andric } 3295ffd83dbSDimitry Andric 3305ffd83dbSDimitry Andric return true; 3315ffd83dbSDimitry Andric } 3325ffd83dbSDimitry Andric 3335ffd83dbSDimitry Andric // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so 3345ffd83dbSDimitry Andric // workaround this. Eventually it should ignore the type for loads and only care 3355ffd83dbSDimitry Andric // about the size. Return true in cases where we will workaround this for now by 3365ffd83dbSDimitry Andric // bitcasting. 
3375ffd83dbSDimitry Andric static bool loadStoreBitcastWorkaround(const LLT Ty) { 3385ffd83dbSDimitry Andric if (EnableNewLegality) 3395ffd83dbSDimitry Andric return false; 3405ffd83dbSDimitry Andric 3415ffd83dbSDimitry Andric const unsigned Size = Ty.getSizeInBits(); 3425ffd83dbSDimitry Andric if (Size <= 64) 3435ffd83dbSDimitry Andric return false; 3445ffd83dbSDimitry Andric if (!Ty.isVector()) 3455ffd83dbSDimitry Andric return true; 346e8d8bef9SDimitry Andric 347e8d8bef9SDimitry Andric LLT EltTy = Ty.getElementType(); 348e8d8bef9SDimitry Andric if (EltTy.isPointer()) 349e8d8bef9SDimitry Andric return true; 350e8d8bef9SDimitry Andric 351e8d8bef9SDimitry Andric unsigned EltSize = EltTy.getSizeInBits(); 3525ffd83dbSDimitry Andric return EltSize != 32 && EltSize != 64; 3535ffd83dbSDimitry Andric } 3545ffd83dbSDimitry Andric 355fe6060f1SDimitry Andric static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) { 3565ffd83dbSDimitry Andric const LLT Ty = Query.Types[0]; 357fe6060f1SDimitry Andric return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) && 3585ffd83dbSDimitry Andric !loadStoreBitcastWorkaround(Ty); 3595ffd83dbSDimitry Andric } 3605ffd83dbSDimitry Andric 361e8d8bef9SDimitry Andric /// Return true if a load or store of the type should be lowered with a bitcast 362e8d8bef9SDimitry Andric /// to a different type. 
363e8d8bef9SDimitry Andric static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, 364fe6060f1SDimitry Andric const LLT MemTy) { 365fe6060f1SDimitry Andric const unsigned MemSizeInBits = MemTy.getSizeInBits(); 366e8d8bef9SDimitry Andric const unsigned Size = Ty.getSizeInBits(); 367e8d8bef9SDimitry Andric if (Size != MemSizeInBits) 368e8d8bef9SDimitry Andric return Size <= 32 && Ty.isVector(); 369e8d8bef9SDimitry Andric 370e8d8bef9SDimitry Andric if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty)) 371e8d8bef9SDimitry Andric return true; 372fe6060f1SDimitry Andric 373fe6060f1SDimitry Andric // Don't try to handle bitcasting vector ext loads for now. 374fe6060f1SDimitry Andric return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) && 375fe6060f1SDimitry Andric (Size <= 32 || isRegisterSize(Size)) && 376e8d8bef9SDimitry Andric !isRegisterVectorElementType(Ty.getElementType()); 377e8d8bef9SDimitry Andric } 378e8d8bef9SDimitry Andric 379e8d8bef9SDimitry Andric /// Return true if we should legalize a load by widening an odd sized memory 380e8d8bef9SDimitry Andric /// access up to the alignment. Note this case when the memory access itself 381e8d8bef9SDimitry Andric /// changes, not the size of the result register. 382fe6060f1SDimitry Andric static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy, 38304eeddc0SDimitry Andric uint64_t AlignInBits, unsigned AddrSpace, 384e8d8bef9SDimitry Andric unsigned Opcode) { 385fe6060f1SDimitry Andric unsigned SizeInBits = MemoryTy.getSizeInBits(); 386e8d8bef9SDimitry Andric // We don't want to widen cases that are naturally legal. 387e8d8bef9SDimitry Andric if (isPowerOf2_32(SizeInBits)) 388e8d8bef9SDimitry Andric return false; 389e8d8bef9SDimitry Andric 390e8d8bef9SDimitry Andric // If we have 96-bit memory operations, we shouldn't touch them. 
Note we may 391e8d8bef9SDimitry Andric // end up widening these for a scalar load during RegBankSelect, since there 392e8d8bef9SDimitry Andric // aren't 96-bit scalar loads. 393e8d8bef9SDimitry Andric if (SizeInBits == 96 && ST.hasDwordx3LoadStores()) 394e8d8bef9SDimitry Andric return false; 395e8d8bef9SDimitry Andric 396e8d8bef9SDimitry Andric if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode)) 397e8d8bef9SDimitry Andric return false; 398e8d8bef9SDimitry Andric 399e8d8bef9SDimitry Andric // A load is known dereferenceable up to the alignment, so it's legal to widen 400e8d8bef9SDimitry Andric // to it. 401e8d8bef9SDimitry Andric // 402e8d8bef9SDimitry Andric // TODO: Could check dereferenceable for less aligned cases. 403e8d8bef9SDimitry Andric unsigned RoundedSize = NextPowerOf2(SizeInBits); 404e8d8bef9SDimitry Andric if (AlignInBits < RoundedSize) 405e8d8bef9SDimitry Andric return false; 406e8d8bef9SDimitry Andric 407e8d8bef9SDimitry Andric // Do not widen if it would introduce a slow unaligned load. 
408e8d8bef9SDimitry Andric const SITargetLowering *TLI = ST.getTargetLowering(); 409e8d8bef9SDimitry Andric bool Fast = false; 410e8d8bef9SDimitry Andric return TLI->allowsMisalignedMemoryAccessesImpl( 411e8d8bef9SDimitry Andric RoundedSize, AddrSpace, Align(AlignInBits / 8), 412e8d8bef9SDimitry Andric MachineMemOperand::MOLoad, &Fast) && 413e8d8bef9SDimitry Andric Fast; 414e8d8bef9SDimitry Andric } 415e8d8bef9SDimitry Andric 416e8d8bef9SDimitry Andric static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query, 417e8d8bef9SDimitry Andric unsigned Opcode) { 418e8d8bef9SDimitry Andric if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic) 419e8d8bef9SDimitry Andric return false; 420e8d8bef9SDimitry Andric 421fe6060f1SDimitry Andric return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy, 422e8d8bef9SDimitry Andric Query.MMODescrs[0].AlignInBits, 423e8d8bef9SDimitry Andric Query.Types[1].getAddressSpace(), Opcode); 424e8d8bef9SDimitry Andric } 425e8d8bef9SDimitry Andric 4260b57cec5SDimitry Andric AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 4270b57cec5SDimitry Andric const GCNTargetMachine &TM) 4280b57cec5SDimitry Andric : ST(ST_) { 4290b57cec5SDimitry Andric using namespace TargetOpcode; 4300b57cec5SDimitry Andric 4310b57cec5SDimitry Andric auto GetAddrSpacePtr = [&TM](unsigned AS) { 4320b57cec5SDimitry Andric return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 4330b57cec5SDimitry Andric }; 4340b57cec5SDimitry Andric 4350b57cec5SDimitry Andric const LLT S1 = LLT::scalar(1); 436e8d8bef9SDimitry Andric const LLT S8 = LLT::scalar(8); 4370b57cec5SDimitry Andric const LLT S16 = LLT::scalar(16); 4380b57cec5SDimitry Andric const LLT S32 = LLT::scalar(32); 4390b57cec5SDimitry Andric const LLT S64 = LLT::scalar(64); 4400b57cec5SDimitry Andric const LLT S128 = LLT::scalar(128); 4410b57cec5SDimitry Andric const LLT S256 = LLT::scalar(256); 4425ffd83dbSDimitry Andric const LLT S512 = LLT::scalar(512); 4435ffd83dbSDimitry Andric 
const LLT MaxScalar = LLT::scalar(MaxRegisterSize); 4440b57cec5SDimitry Andric 445fe6060f1SDimitry Andric const LLT V2S8 = LLT::fixed_vector(2, 8); 446fe6060f1SDimitry Andric const LLT V2S16 = LLT::fixed_vector(2, 16); 447fe6060f1SDimitry Andric const LLT V4S16 = LLT::fixed_vector(4, 16); 4480b57cec5SDimitry Andric 449fe6060f1SDimitry Andric const LLT V2S32 = LLT::fixed_vector(2, 32); 450fe6060f1SDimitry Andric const LLT V3S32 = LLT::fixed_vector(3, 32); 451fe6060f1SDimitry Andric const LLT V4S32 = LLT::fixed_vector(4, 32); 452fe6060f1SDimitry Andric const LLT V5S32 = LLT::fixed_vector(5, 32); 453fe6060f1SDimitry Andric const LLT V6S32 = LLT::fixed_vector(6, 32); 454fe6060f1SDimitry Andric const LLT V7S32 = LLT::fixed_vector(7, 32); 455fe6060f1SDimitry Andric const LLT V8S32 = LLT::fixed_vector(8, 32); 456fe6060f1SDimitry Andric const LLT V9S32 = LLT::fixed_vector(9, 32); 457fe6060f1SDimitry Andric const LLT V10S32 = LLT::fixed_vector(10, 32); 458fe6060f1SDimitry Andric const LLT V11S32 = LLT::fixed_vector(11, 32); 459fe6060f1SDimitry Andric const LLT V12S32 = LLT::fixed_vector(12, 32); 460fe6060f1SDimitry Andric const LLT V13S32 = LLT::fixed_vector(13, 32); 461fe6060f1SDimitry Andric const LLT V14S32 = LLT::fixed_vector(14, 32); 462fe6060f1SDimitry Andric const LLT V15S32 = LLT::fixed_vector(15, 32); 463fe6060f1SDimitry Andric const LLT V16S32 = LLT::fixed_vector(16, 32); 464fe6060f1SDimitry Andric const LLT V32S32 = LLT::fixed_vector(32, 32); 4650b57cec5SDimitry Andric 466fe6060f1SDimitry Andric const LLT V2S64 = LLT::fixed_vector(2, 64); 467fe6060f1SDimitry Andric const LLT V3S64 = LLT::fixed_vector(3, 64); 468fe6060f1SDimitry Andric const LLT V4S64 = LLT::fixed_vector(4, 64); 469fe6060f1SDimitry Andric const LLT V5S64 = LLT::fixed_vector(5, 64); 470fe6060f1SDimitry Andric const LLT V6S64 = LLT::fixed_vector(6, 64); 471fe6060f1SDimitry Andric const LLT V7S64 = LLT::fixed_vector(7, 64); 472fe6060f1SDimitry Andric const LLT V8S64 = LLT::fixed_vector(8, 64); 
473fe6060f1SDimitry Andric const LLT V16S64 = LLT::fixed_vector(16, 64); 4740b57cec5SDimitry Andric 4750b57cec5SDimitry Andric std::initializer_list<LLT> AllS32Vectors = 4760b57cec5SDimitry Andric {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 4778bcb0991SDimitry Andric V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 4780b57cec5SDimitry Andric std::initializer_list<LLT> AllS64Vectors = 4798bcb0991SDimitry Andric {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 4800b57cec5SDimitry Andric 4810b57cec5SDimitry Andric const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 4820b57cec5SDimitry Andric const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 4838bcb0991SDimitry Andric const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 4840b57cec5SDimitry Andric const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 4858bcb0991SDimitry Andric const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 4860b57cec5SDimitry Andric const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 4870b57cec5SDimitry Andric const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 4880b57cec5SDimitry Andric 4890b57cec5SDimitry Andric const LLT CodePtr = FlatPtr; 4900b57cec5SDimitry Andric 4910b57cec5SDimitry Andric const std::initializer_list<LLT> AddrSpaces64 = { 4920b57cec5SDimitry Andric GlobalPtr, ConstantPtr, FlatPtr 4930b57cec5SDimitry Andric }; 4940b57cec5SDimitry Andric 4950b57cec5SDimitry Andric const std::initializer_list<LLT> AddrSpaces32 = { 4968bcb0991SDimitry Andric LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 4970b57cec5SDimitry Andric }; 4980b57cec5SDimitry Andric 4990b57cec5SDimitry Andric const std::initializer_list<LLT> FPTypesBase = { 5000b57cec5SDimitry Andric S32, S64 5010b57cec5SDimitry Andric }; 5020b57cec5SDimitry Andric 5030b57cec5SDimitry Andric const std::initializer_list<LLT> FPTypes16 = { 5040b57cec5SDimitry Andric S32, S64, S16 5050b57cec5SDimitry 
Andric }; 5060b57cec5SDimitry Andric 5070b57cec5SDimitry Andric const std::initializer_list<LLT> FPTypesPK16 = { 5080b57cec5SDimitry Andric S32, S64, S16, V2S16 5090b57cec5SDimitry Andric }; 5100b57cec5SDimitry Andric 5115ffd83dbSDimitry Andric const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32; 5125ffd83dbSDimitry Andric 513fe6060f1SDimitry Andric // s1 for VCC branches, s32 for SCC branches. 514fe6060f1SDimitry Andric getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32}); 5150b57cec5SDimitry Andric 5160b57cec5SDimitry Andric // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 5170b57cec5SDimitry Andric // elements for v3s16 5180b57cec5SDimitry Andric getActionDefinitionsBuilder(G_PHI) 519e8d8bef9SDimitry Andric .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256}) 5200b57cec5SDimitry Andric .legalFor(AllS32Vectors) 5210b57cec5SDimitry Andric .legalFor(AllS64Vectors) 5220b57cec5SDimitry Andric .legalFor(AddrSpaces64) 5230b57cec5SDimitry Andric .legalFor(AddrSpaces32) 524e8d8bef9SDimitry Andric .legalIf(isPointer(0)) 525e8d8bef9SDimitry Andric .clampScalar(0, S16, S256) 5260b57cec5SDimitry Andric .widenScalarToNextPow2(0, 32) 5270b57cec5SDimitry Andric .clampMaxNumElements(0, S32, 16) 5280b57cec5SDimitry Andric .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 529e8d8bef9SDimitry Andric .scalarize(0); 5300b57cec5SDimitry Andric 531e8d8bef9SDimitry Andric if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { 532e8d8bef9SDimitry Andric // Full set of gfx9 features. 
53381ad6265SDimitry Andric getActionDefinitionsBuilder({G_ADD, G_SUB}) 5345ffd83dbSDimitry Andric .legalFor({S32, S16, V2S16}) 5350eae32dcSDimitry Andric .clampMaxNumElementsStrict(0, S16, 2) 53681ad6265SDimitry Andric .scalarize(0) 53781ad6265SDimitry Andric .minScalar(0, S16) 538349cc55cSDimitry Andric .widenScalarToNextMultipleOf(0, 32) 53981ad6265SDimitry Andric .maxScalar(0, S32); 54081ad6265SDimitry Andric 54181ad6265SDimitry Andric getActionDefinitionsBuilder(G_MUL) 54281ad6265SDimitry Andric .legalFor({S32, S16, V2S16}) 54381ad6265SDimitry Andric .clampMaxNumElementsStrict(0, S16, 2) 54481ad6265SDimitry Andric .scalarize(0) 54581ad6265SDimitry Andric .minScalar(0, S16) 54681ad6265SDimitry Andric .widenScalarToNextMultipleOf(0, 32) 54781ad6265SDimitry Andric .custom(); 54881ad6265SDimitry Andric assert(ST.hasMad64_32()); 549e8d8bef9SDimitry Andric 550e8d8bef9SDimitry Andric getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) 551e8d8bef9SDimitry Andric .legalFor({S32, S16, V2S16}) // Clamp modifier 552e8d8bef9SDimitry Andric .minScalarOrElt(0, S16) 5530eae32dcSDimitry Andric .clampMaxNumElementsStrict(0, S16, 2) 554e8d8bef9SDimitry Andric .scalarize(0) 555e8d8bef9SDimitry Andric .widenScalarToNextPow2(0, 32) 556e8d8bef9SDimitry Andric .lower(); 5575ffd83dbSDimitry Andric } else if (ST.has16BitInsts()) { 55881ad6265SDimitry Andric getActionDefinitionsBuilder({G_ADD, G_SUB}) 5590b57cec5SDimitry Andric .legalFor({S32, S16}) 560349cc55cSDimitry Andric .minScalar(0, S16) 561349cc55cSDimitry Andric .widenScalarToNextMultipleOf(0, 32) 562349cc55cSDimitry Andric .maxScalar(0, S32) 563349cc55cSDimitry Andric .scalarize(0); 564e8d8bef9SDimitry Andric 56581ad6265SDimitry Andric getActionDefinitionsBuilder(G_MUL) 56681ad6265SDimitry Andric .legalFor({S32, S16}) 56781ad6265SDimitry Andric .scalarize(0) 56881ad6265SDimitry Andric .minScalar(0, S16) 56981ad6265SDimitry Andric .widenScalarToNextMultipleOf(0, 32) 57081ad6265SDimitry Andric .custom(); 
57181ad6265SDimitry Andric assert(ST.hasMad64_32()); 57281ad6265SDimitry Andric 573e8d8bef9SDimitry Andric // Technically the saturating operations require clamp bit support, but this 574e8d8bef9SDimitry Andric // was introduced at the same time as 16-bit operations. 575e8d8bef9SDimitry Andric getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 576e8d8bef9SDimitry Andric .legalFor({S32, S16}) // Clamp modifier 577e8d8bef9SDimitry Andric .minScalar(0, S16) 578e8d8bef9SDimitry Andric .scalarize(0) 579e8d8bef9SDimitry Andric .widenScalarToNextPow2(0, 16) 580e8d8bef9SDimitry Andric .lower(); 581e8d8bef9SDimitry Andric 582e8d8bef9SDimitry Andric // We're just lowering this, but it helps get a better result to try to 583e8d8bef9SDimitry Andric // coerce to the desired type first. 584e8d8bef9SDimitry Andric getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 585e8d8bef9SDimitry Andric .minScalar(0, S16) 586e8d8bef9SDimitry Andric .scalarize(0) 587e8d8bef9SDimitry Andric .lower(); 5880b57cec5SDimitry Andric } else { 58981ad6265SDimitry Andric getActionDefinitionsBuilder({G_ADD, G_SUB}) 5900b57cec5SDimitry Andric .legalFor({S32}) 591349cc55cSDimitry Andric .widenScalarToNextMultipleOf(0, 32) 5920b57cec5SDimitry Andric .clampScalar(0, S32, S32) 5930b57cec5SDimitry Andric .scalarize(0); 594e8d8bef9SDimitry Andric 59581ad6265SDimitry Andric auto &Mul = getActionDefinitionsBuilder(G_MUL) 59681ad6265SDimitry Andric .legalFor({S32}) 59781ad6265SDimitry Andric .scalarize(0) 59881ad6265SDimitry Andric .minScalar(0, S32) 59981ad6265SDimitry Andric .widenScalarToNextMultipleOf(0, 32); 60081ad6265SDimitry Andric 60181ad6265SDimitry Andric if (ST.hasMad64_32()) 60281ad6265SDimitry Andric Mul.custom(); 60381ad6265SDimitry Andric else 60481ad6265SDimitry Andric Mul.maxScalar(0, S32); 60581ad6265SDimitry Andric 606e8d8bef9SDimitry Andric if (ST.hasIntClamp()) { 607e8d8bef9SDimitry Andric getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 608e8d8bef9SDimitry Andric .legalFor({S32}) // 
Clamp modifier. 609e8d8bef9SDimitry Andric .scalarize(0) 610e8d8bef9SDimitry Andric .minScalarOrElt(0, S32) 611e8d8bef9SDimitry Andric .lower(); 612e8d8bef9SDimitry Andric } else { 613e8d8bef9SDimitry Andric // Clamp bit support was added in VI, along with 16-bit operations. 614e8d8bef9SDimitry Andric getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 615e8d8bef9SDimitry Andric .minScalar(0, S32) 616e8d8bef9SDimitry Andric .scalarize(0) 617e8d8bef9SDimitry Andric .lower(); 6180b57cec5SDimitry Andric } 6190b57cec5SDimitry Andric 620e8d8bef9SDimitry Andric // FIXME: DAG expansion gets better results. The widening uses the smaller 621e8d8bef9SDimitry Andric // range values and goes for the min/max lowering directly. 622e8d8bef9SDimitry Andric getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 623e8d8bef9SDimitry Andric .minScalar(0, S32) 624e8d8bef9SDimitry Andric .scalarize(0) 625e8d8bef9SDimitry Andric .lower(); 626e8d8bef9SDimitry Andric } 627e8d8bef9SDimitry Andric 628fe6060f1SDimitry Andric getActionDefinitionsBuilder( 629fe6060f1SDimitry Andric {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM}) 6305ffd83dbSDimitry Andric .customFor({S32, S64}) 631480093f4SDimitry Andric .clampScalar(0, S32, S64) 632480093f4SDimitry Andric .widenScalarToNextPow2(0, 32) 633480093f4SDimitry Andric .scalarize(0); 634480093f4SDimitry Andric 635e8d8bef9SDimitry Andric auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 6360b57cec5SDimitry Andric .legalFor({S32}) 637349cc55cSDimitry Andric .maxScalar(0, S32); 638e8d8bef9SDimitry Andric 639e8d8bef9SDimitry Andric if (ST.hasVOP3PInsts()) { 640e8d8bef9SDimitry Andric Mulh 641e8d8bef9SDimitry Andric .clampMaxNumElements(0, S8, 2) 642e8d8bef9SDimitry Andric .lowerFor({V2S8}); 643e8d8bef9SDimitry Andric } 644e8d8bef9SDimitry Andric 645e8d8bef9SDimitry Andric Mulh 646e8d8bef9SDimitry Andric .scalarize(0) 647e8d8bef9SDimitry Andric .lower(); 6480b57cec5SDimitry Andric 6490b57cec5SDimitry Andric // Report legal for any types we 
can handle anywhere. For the cases only legal 6500b57cec5SDimitry Andric // on the SALU, RegBankSelect will be able to re-legalize. 6510b57cec5SDimitry Andric getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 6520b57cec5SDimitry Andric .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 6530b57cec5SDimitry Andric .clampScalar(0, S32, S64) 6540b57cec5SDimitry Andric .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 6558bcb0991SDimitry Andric .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 6560b57cec5SDimitry Andric .widenScalarToNextPow2(0) 6570b57cec5SDimitry Andric .scalarize(0); 6580b57cec5SDimitry Andric 6598bcb0991SDimitry Andric getActionDefinitionsBuilder({G_UADDO, G_USUBO, 6600b57cec5SDimitry Andric G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 661480093f4SDimitry Andric .legalFor({{S32, S1}, {S32, S32}}) 6625ffd83dbSDimitry Andric .minScalar(0, S32) 66381ad6265SDimitry Andric .scalarize(0) 6648bcb0991SDimitry Andric .lower(); 6650b57cec5SDimitry Andric 6660b57cec5SDimitry Andric getActionDefinitionsBuilder(G_BITCAST) 6670b57cec5SDimitry Andric // Don't worry about the size constraint. 
6688bcb0991SDimitry Andric .legalIf(all(isRegisterType(0), isRegisterType(1))) 6695ffd83dbSDimitry Andric .lower(); 6700b57cec5SDimitry Andric 6710b57cec5SDimitry Andric 6720b57cec5SDimitry Andric getActionDefinitionsBuilder(G_CONSTANT) 6738bcb0991SDimitry Andric .legalFor({S1, S32, S64, S16, GlobalPtr, 6740b57cec5SDimitry Andric LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 675e8d8bef9SDimitry Andric .legalIf(isPointer(0)) 6760b57cec5SDimitry Andric .clampScalar(0, S32, S64) 677e8d8bef9SDimitry Andric .widenScalarToNextPow2(0); 6780b57cec5SDimitry Andric 6795ffd83dbSDimitry Andric getActionDefinitionsBuilder(G_FCONSTANT) 6805ffd83dbSDimitry Andric .legalFor({S32, S64, S16}) 6815ffd83dbSDimitry Andric .clampScalar(0, S16, S64); 6828bcb0991SDimitry Andric 6835ffd83dbSDimitry Andric getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 6845ffd83dbSDimitry Andric .legalIf(isRegisterType(0)) 6855ffd83dbSDimitry Andric // s1 and s16 are special cases because they have legal operations on 6865ffd83dbSDimitry Andric // them, but don't really occupy registers in the normal way. 6875ffd83dbSDimitry Andric .legalFor({S1, S16}) 6885ffd83dbSDimitry Andric .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 6895ffd83dbSDimitry Andric .clampScalarOrElt(0, S32, MaxScalar) 6905ffd83dbSDimitry Andric .widenScalarToNextPow2(0, 32) 6915ffd83dbSDimitry Andric .clampMaxNumElements(0, S32, 16); 6925ffd83dbSDimitry Andric 693fe6060f1SDimitry Andric getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr}); 6945ffd83dbSDimitry Andric 6955ffd83dbSDimitry Andric // If the amount is divergent, we have to do a wave reduction to get the 6965ffd83dbSDimitry Andric // maximum value, so this is expanded during RegBankSelect. 
6975ffd83dbSDimitry Andric getActionDefinitionsBuilder(G_DYN_STACKALLOC) 6985ffd83dbSDimitry Andric .legalFor({{PrivatePtr, S32}}); 6995ffd83dbSDimitry Andric 7005ffd83dbSDimitry Andric getActionDefinitionsBuilder(G_GLOBAL_VALUE) 701e8d8bef9SDimitry Andric .customIf(typeIsNot(0, PrivatePtr)); 702e8d8bef9SDimitry Andric 703fe6060f1SDimitry Andric getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr}); 7040b57cec5SDimitry Andric 7050b57cec5SDimitry Andric auto &FPOpActions = getActionDefinitionsBuilder( 7068bcb0991SDimitry Andric { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 7070b57cec5SDimitry Andric .legalFor({S32, S64}); 7088bcb0991SDimitry Andric auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 7098bcb0991SDimitry Andric .customFor({S32, S64}); 7108bcb0991SDimitry Andric auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 7118bcb0991SDimitry Andric .customFor({S32, S64}); 7120b57cec5SDimitry Andric 7130b57cec5SDimitry Andric if (ST.has16BitInsts()) { 7140b57cec5SDimitry Andric if (ST.hasVOP3PInsts()) 7150b57cec5SDimitry Andric FPOpActions.legalFor({S16, V2S16}); 7160b57cec5SDimitry Andric else 7170b57cec5SDimitry Andric FPOpActions.legalFor({S16}); 7188bcb0991SDimitry Andric 7198bcb0991SDimitry Andric TrigActions.customFor({S16}); 7208bcb0991SDimitry Andric FDIVActions.customFor({S16}); 7210b57cec5SDimitry Andric } 7220b57cec5SDimitry Andric 7230b57cec5SDimitry Andric auto &MinNumMaxNum = getActionDefinitionsBuilder({ 7240b57cec5SDimitry Andric G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 7250b57cec5SDimitry Andric 7260b57cec5SDimitry Andric if (ST.hasVOP3PInsts()) { 7270b57cec5SDimitry Andric MinNumMaxNum.customFor(FPTypesPK16) 728480093f4SDimitry Andric .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 7290b57cec5SDimitry Andric .clampMaxNumElements(0, S16, 2) 7300b57cec5SDimitry Andric .clampScalar(0, S16, S64) 7310b57cec5SDimitry Andric .scalarize(0); 7320b57cec5SDimitry Andric } else if (ST.has16BitInsts()) { 
7330b57cec5SDimitry Andric MinNumMaxNum.customFor(FPTypes16) 7340b57cec5SDimitry Andric .clampScalar(0, S16, S64) 7350b57cec5SDimitry Andric .scalarize(0); 7360b57cec5SDimitry Andric } else { 7370b57cec5SDimitry Andric MinNumMaxNum.customFor(FPTypesBase) 7380b57cec5SDimitry Andric .clampScalar(0, S32, S64) 7390b57cec5SDimitry Andric .scalarize(0); 7400b57cec5SDimitry Andric } 7410b57cec5SDimitry Andric 7420b57cec5SDimitry Andric if (ST.hasVOP3PInsts()) 7430eae32dcSDimitry Andric FPOpActions.clampMaxNumElementsStrict(0, S16, 2); 7448bcb0991SDimitry Andric 7450b57cec5SDimitry Andric FPOpActions 7460b57cec5SDimitry Andric .scalarize(0) 7470b57cec5SDimitry Andric .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 7480b57cec5SDimitry Andric 7498bcb0991SDimitry Andric TrigActions 7508bcb0991SDimitry Andric .scalarize(0) 7518bcb0991SDimitry Andric .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 7528bcb0991SDimitry Andric 7538bcb0991SDimitry Andric FDIVActions 7548bcb0991SDimitry Andric .scalarize(0) 7558bcb0991SDimitry Andric .clampScalar(0, ST.has16BitInsts() ? 
S16 : S32, S64); 7568bcb0991SDimitry Andric 7578bcb0991SDimitry Andric getActionDefinitionsBuilder({G_FNEG, G_FABS}) 7588bcb0991SDimitry Andric .legalFor(FPTypesPK16) 7590eae32dcSDimitry Andric .clampMaxNumElementsStrict(0, S16, 2) 7608bcb0991SDimitry Andric .scalarize(0) 7618bcb0991SDimitry Andric .clampScalar(0, S16, S64); 7628bcb0991SDimitry Andric 7630b57cec5SDimitry Andric if (ST.has16BitInsts()) { 7648bcb0991SDimitry Andric getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 7650b57cec5SDimitry Andric .legalFor({S32, S64, S16}) 7660b57cec5SDimitry Andric .scalarize(0) 7670b57cec5SDimitry Andric .clampScalar(0, S16, S64); 7680b57cec5SDimitry Andric } else { 7695ffd83dbSDimitry Andric getActionDefinitionsBuilder(G_FSQRT) 7705ffd83dbSDimitry Andric .legalFor({S32, S64}) 7715ffd83dbSDimitry Andric .scalarize(0) 7725ffd83dbSDimitry Andric .clampScalar(0, S32, S64); 7735ffd83dbSDimitry Andric 7745ffd83dbSDimitry Andric if (ST.hasFractBug()) { 7755ffd83dbSDimitry Andric getActionDefinitionsBuilder(G_FFLOOR) 7765ffd83dbSDimitry Andric .customFor({S64}) 7775ffd83dbSDimitry Andric .legalFor({S32, S64}) 7785ffd83dbSDimitry Andric .scalarize(0) 7795ffd83dbSDimitry Andric .clampScalar(0, S32, S64); 7805ffd83dbSDimitry Andric } else { 7815ffd83dbSDimitry Andric getActionDefinitionsBuilder(G_FFLOOR) 7820b57cec5SDimitry Andric .legalFor({S32, S64}) 7830b57cec5SDimitry Andric .scalarize(0) 7840b57cec5SDimitry Andric .clampScalar(0, S32, S64); 7850b57cec5SDimitry Andric } 7865ffd83dbSDimitry Andric } 7870b57cec5SDimitry Andric 7880b57cec5SDimitry Andric getActionDefinitionsBuilder(G_FPTRUNC) 7890b57cec5SDimitry Andric .legalFor({{S32, S64}, {S16, S32}}) 7905ffd83dbSDimitry Andric .scalarize(0) 7915ffd83dbSDimitry Andric .lower(); 7920b57cec5SDimitry Andric 7930b57cec5SDimitry Andric getActionDefinitionsBuilder(G_FPEXT) 7940b57cec5SDimitry Andric .legalFor({{S64, S32}, {S32, S16}}) 795e8d8bef9SDimitry Andric .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) 7960b57cec5SDimitry 
Andric .scalarize(0); 7970b57cec5SDimitry Andric 79881ad6265SDimitry Andric auto &FSubActions = getActionDefinitionsBuilder(G_FSUB); 79981ad6265SDimitry Andric if (ST.has16BitInsts()) { 80081ad6265SDimitry Andric FSubActions 80181ad6265SDimitry Andric // Use actual fsub instruction 80281ad6265SDimitry Andric .legalFor({S32, S16}) 80381ad6265SDimitry Andric // Must use fadd + fneg 80481ad6265SDimitry Andric .lowerFor({S64, V2S16}); 80581ad6265SDimitry Andric } else { 80681ad6265SDimitry Andric FSubActions 8070b57cec5SDimitry Andric // Use actual fsub instruction 8080b57cec5SDimitry Andric .legalFor({S32}) 8090b57cec5SDimitry Andric // Must use fadd + fneg 81081ad6265SDimitry Andric .lowerFor({S64, S16, V2S16}); 81181ad6265SDimitry Andric } 81281ad6265SDimitry Andric 81381ad6265SDimitry Andric FSubActions 8140b57cec5SDimitry Andric .scalarize(0) 8150b57cec5SDimitry Andric .clampScalar(0, S32, S64); 8160b57cec5SDimitry Andric 8178bcb0991SDimitry Andric // Whether this is legal depends on the floating point mode for the function. 
8188bcb0991SDimitry Andric auto &FMad = getActionDefinitionsBuilder(G_FMAD); 8195ffd83dbSDimitry Andric if (ST.hasMadF16() && ST.hasMadMacF32Insts()) 8208bcb0991SDimitry Andric FMad.customFor({S32, S16}); 8215ffd83dbSDimitry Andric else if (ST.hasMadMacF32Insts()) 8228bcb0991SDimitry Andric FMad.customFor({S32}); 8235ffd83dbSDimitry Andric else if (ST.hasMadF16()) 8245ffd83dbSDimitry Andric FMad.customFor({S16}); 8258bcb0991SDimitry Andric FMad.scalarize(0) 8268bcb0991SDimitry Andric .lower(); 8278bcb0991SDimitry Andric 828e8d8bef9SDimitry Andric auto &FRem = getActionDefinitionsBuilder(G_FREM); 829e8d8bef9SDimitry Andric if (ST.has16BitInsts()) { 830e8d8bef9SDimitry Andric FRem.customFor({S16, S32, S64}); 831e8d8bef9SDimitry Andric } else { 832e8d8bef9SDimitry Andric FRem.minScalar(0, S32) 833e8d8bef9SDimitry Andric .customFor({S32, S64}); 834e8d8bef9SDimitry Andric } 835e8d8bef9SDimitry Andric FRem.scalarize(0); 836e8d8bef9SDimitry Andric 8375ffd83dbSDimitry Andric // TODO: Do we need to clamp maximum bitwidth? 8385ffd83dbSDimitry Andric getActionDefinitionsBuilder(G_TRUNC) 8395ffd83dbSDimitry Andric .legalIf(isScalar(0)) 8405ffd83dbSDimitry Andric .legalFor({{V2S16, V2S32}}) 8415ffd83dbSDimitry Andric .clampMaxNumElements(0, S16, 2) 8425ffd83dbSDimitry Andric // Avoid scalarizing in cases that should be truly illegal. In unresolvable 8435ffd83dbSDimitry Andric // situations (like an invalid implicit use), we don't want to infinite loop 8445ffd83dbSDimitry Andric // in the legalizer. 
8455ffd83dbSDimitry Andric .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 8465ffd83dbSDimitry Andric .alwaysLegal(); 8475ffd83dbSDimitry Andric 8480b57cec5SDimitry Andric getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 8490b57cec5SDimitry Andric .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 8505ffd83dbSDimitry Andric {S32, S1}, {S64, S1}, {S16, S1}}) 851480093f4SDimitry Andric .scalarize(0) 8525ffd83dbSDimitry Andric .clampScalar(0, S32, S64) 8535ffd83dbSDimitry Andric .widenScalarToNextPow2(1, 32); 8540b57cec5SDimitry Andric 8558bcb0991SDimitry Andric // TODO: Split s1->s64 during regbankselect for VALU. 8568bcb0991SDimitry Andric auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 857480093f4SDimitry Andric .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 858480093f4SDimitry Andric .lowerIf(typeIs(1, S1)) 859349cc55cSDimitry Andric .customFor({{S32, S64}, {S64, S64}}); 8608bcb0991SDimitry Andric if (ST.has16BitInsts()) 8618bcb0991SDimitry Andric IToFP.legalFor({{S16, S16}}); 8628bcb0991SDimitry Andric IToFP.clampScalar(1, S32, S64) 863e8d8bef9SDimitry Andric .minScalar(0, S32) 8645ffd83dbSDimitry Andric .scalarize(0) 8655ffd83dbSDimitry Andric .widenScalarToNextPow2(1); 8660b57cec5SDimitry Andric 8678bcb0991SDimitry Andric auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 8685ffd83dbSDimitry Andric .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 869fe6060f1SDimitry Andric .customFor({{S64, S32}, {S64, S64}}) 870e8d8bef9SDimitry Andric .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); 8718bcb0991SDimitry Andric if (ST.has16BitInsts()) 8728bcb0991SDimitry Andric FPToI.legalFor({{S16, S16}}); 8738bcb0991SDimitry Andric else 8748bcb0991SDimitry Andric FPToI.minScalar(1, S32); 8758bcb0991SDimitry Andric 8768bcb0991SDimitry Andric FPToI.minScalar(0, S32) 877fe6060f1SDimitry Andric .widenScalarToNextPow2(0, 32) 8785ffd83dbSDimitry Andric .scalarize(0) 8795ffd83dbSDimitry Andric .lower(); 8800b57cec5SDimitry 
Andric 88181ad6265SDimitry Andric getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND) 88281ad6265SDimitry Andric .customFor({S16, S32}) 88381ad6265SDimitry Andric .scalarize(0) 88481ad6265SDimitry Andric .lower(); 88581ad6265SDimitry Andric 886e8d8bef9SDimitry Andric // Lower roundeven into G_FRINT 887e8d8bef9SDimitry Andric getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN}) 888480093f4SDimitry Andric .scalarize(0) 889480093f4SDimitry Andric .lower(); 8900b57cec5SDimitry Andric 891480093f4SDimitry Andric if (ST.has16BitInsts()) { 892480093f4SDimitry Andric getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 893480093f4SDimitry Andric .legalFor({S16, S32, S64}) 894480093f4SDimitry Andric .clampScalar(0, S16, S64) 895480093f4SDimitry Andric .scalarize(0); 896480093f4SDimitry Andric } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 8970b57cec5SDimitry Andric getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 8980b57cec5SDimitry Andric .legalFor({S32, S64}) 8990b57cec5SDimitry Andric .clampScalar(0, S32, S64) 9000b57cec5SDimitry Andric .scalarize(0); 9010b57cec5SDimitry Andric } else { 9020b57cec5SDimitry Andric getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 9030b57cec5SDimitry Andric .legalFor({S32}) 9040b57cec5SDimitry Andric .customFor({S64}) 9050b57cec5SDimitry Andric .clampScalar(0, S32, S64) 9060b57cec5SDimitry Andric .scalarize(0); 9070b57cec5SDimitry Andric } 9080b57cec5SDimitry Andric 909480093f4SDimitry Andric getActionDefinitionsBuilder(G_PTR_ADD) 910e8d8bef9SDimitry Andric .legalIf(all(isPointer(0), sameSize(0, 1))) 911e8d8bef9SDimitry Andric .scalarize(0) 912e8d8bef9SDimitry Andric .scalarSameSizeAs(1, 0); 9130b57cec5SDimitry Andric 9145ffd83dbSDimitry Andric getActionDefinitionsBuilder(G_PTRMASK) 915e8d8bef9SDimitry Andric .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) 916e8d8bef9SDimitry Andric .scalarSameSizeAs(1, 0) 9175ffd83dbSDimitry Andric 
.scalarize(0); 9180b57cec5SDimitry Andric 9190b57cec5SDimitry Andric auto &CmpBuilder = 9200b57cec5SDimitry Andric getActionDefinitionsBuilder(G_ICMP) 921480093f4SDimitry Andric // The compare output type differs based on the register bank of the output, 922480093f4SDimitry Andric // so make both s1 and s32 legal. 923480093f4SDimitry Andric // 924480093f4SDimitry Andric // Scalar compares producing output in scc will be promoted to s32, as that 925480093f4SDimitry Andric // is the allocatable register type that will be needed for the copy from 926480093f4SDimitry Andric // scc. This will be promoted during RegBankSelect, and we assume something 927480093f4SDimitry Andric // before that won't try to use s32 result types. 928480093f4SDimitry Andric // 929480093f4SDimitry Andric // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 930480093f4SDimitry Andric // bank. 9310b57cec5SDimitry Andric .legalForCartesianProduct( 9320b57cec5SDimitry Andric {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 933480093f4SDimitry Andric .legalForCartesianProduct( 934480093f4SDimitry Andric {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 9350b57cec5SDimitry Andric if (ST.has16BitInsts()) { 9360b57cec5SDimitry Andric CmpBuilder.legalFor({{S1, S16}}); 9370b57cec5SDimitry Andric } 9380b57cec5SDimitry Andric 9390b57cec5SDimitry Andric CmpBuilder 9400b57cec5SDimitry Andric .widenScalarToNextPow2(1) 9410b57cec5SDimitry Andric .clampScalar(1, S32, S64) 9420b57cec5SDimitry Andric .scalarize(0) 943480093f4SDimitry Andric .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 9440b57cec5SDimitry Andric 9450b57cec5SDimitry Andric getActionDefinitionsBuilder(G_FCMP) 9460b57cec5SDimitry Andric .legalForCartesianProduct({S1}, ST.has16BitInsts() ? 
FPTypes16 : FPTypesBase) 9470b57cec5SDimitry Andric .widenScalarToNextPow2(1) 9480b57cec5SDimitry Andric .clampScalar(1, S32, S64) 9490b57cec5SDimitry Andric .scalarize(0); 9500b57cec5SDimitry Andric 9515ffd83dbSDimitry Andric // FIXME: fpow has a selection pattern that should move to custom lowering. 9525ffd83dbSDimitry Andric auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 9535ffd83dbSDimitry Andric if (ST.has16BitInsts()) 9545ffd83dbSDimitry Andric Exp2Ops.legalFor({S32, S16}); 9555ffd83dbSDimitry Andric else 9565ffd83dbSDimitry Andric Exp2Ops.legalFor({S32}); 9575ffd83dbSDimitry Andric Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 9585ffd83dbSDimitry Andric Exp2Ops.scalarize(0); 9595ffd83dbSDimitry Andric 9605ffd83dbSDimitry Andric auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 9615ffd83dbSDimitry Andric if (ST.has16BitInsts()) 9625ffd83dbSDimitry Andric ExpOps.customFor({{S32}, {S16}}); 9635ffd83dbSDimitry Andric else 9645ffd83dbSDimitry Andric ExpOps.customFor({S32}); 9655ffd83dbSDimitry Andric ExpOps.clampScalar(0, MinScalarFPTy, S32) 9660b57cec5SDimitry Andric .scalarize(0); 9670b57cec5SDimitry Andric 968e8d8bef9SDimitry Andric getActionDefinitionsBuilder(G_FPOWI) 969e8d8bef9SDimitry Andric .clampScalar(0, MinScalarFPTy, S32) 970e8d8bef9SDimitry Andric .lower(); 971e8d8bef9SDimitry Andric 9720b57cec5SDimitry Andric // The 64-bit versions produce 32-bit results, but only on the SALU. 
9735ffd83dbSDimitry Andric getActionDefinitionsBuilder(G_CTPOP) 9740b57cec5SDimitry Andric .legalFor({{S32, S32}, {S32, S64}}) 9750b57cec5SDimitry Andric .clampScalar(0, S32, S32) 97604eeddc0SDimitry Andric .widenScalarToNextPow2(1, 32) 9770b57cec5SDimitry Andric .clampScalar(1, S32, S64) 9780b57cec5SDimitry Andric .scalarize(0) 97904eeddc0SDimitry Andric .widenScalarToNextPow2(0, 32); 98004eeddc0SDimitry Andric 9810b57cec5SDimitry Andric 9825ffd83dbSDimitry Andric // The hardware instructions return a different result on 0 than the generic 9835ffd83dbSDimitry Andric // instructions expect. The hardware produces -1, but these produce the 9845ffd83dbSDimitry Andric // bitwidth. 9855ffd83dbSDimitry Andric getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 9865ffd83dbSDimitry Andric .scalarize(0) 9875ffd83dbSDimitry Andric .clampScalar(0, S32, S32) 9885ffd83dbSDimitry Andric .clampScalar(1, S32, S64) 9895ffd83dbSDimitry Andric .widenScalarToNextPow2(0, 32) 9905ffd83dbSDimitry Andric .widenScalarToNextPow2(1, 32) 991349cc55cSDimitry Andric .custom(); 9925ffd83dbSDimitry Andric 9935ffd83dbSDimitry Andric // The 64-bit versions produce 32-bit results, but only on the SALU. 9945ffd83dbSDimitry Andric getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 9955ffd83dbSDimitry Andric .legalFor({{S32, S32}, {S32, S64}}) 9965ffd83dbSDimitry Andric .clampScalar(0, S32, S32) 9975ffd83dbSDimitry Andric .clampScalar(1, S32, S64) 9985ffd83dbSDimitry Andric .scalarize(0) 9995ffd83dbSDimitry Andric .widenScalarToNextPow2(0, 32) 10005ffd83dbSDimitry Andric .widenScalarToNextPow2(1, 32); 10015ffd83dbSDimitry Andric 1002fe6060f1SDimitry Andric // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1003fe6060f1SDimitry Andric // RegBankSelect. 
10045ffd83dbSDimitry Andric getActionDefinitionsBuilder(G_BITREVERSE) 1005fe6060f1SDimitry Andric .legalFor({S32, S64}) 1006fe6060f1SDimitry Andric .clampScalar(0, S32, S64) 1007fe6060f1SDimitry Andric .scalarize(0) 1008fe6060f1SDimitry Andric .widenScalarToNextPow2(0); 10090b57cec5SDimitry Andric 10100b57cec5SDimitry Andric if (ST.has16BitInsts()) { 10115ffd83dbSDimitry Andric getActionDefinitionsBuilder(G_BSWAP) 10125ffd83dbSDimitry Andric .legalFor({S16, S32, V2S16}) 10130eae32dcSDimitry Andric .clampMaxNumElementsStrict(0, S16, 2) 10145ffd83dbSDimitry Andric // FIXME: Fixing non-power-of-2 before clamp is workaround for 10155ffd83dbSDimitry Andric // narrowScalar limitation. 10165ffd83dbSDimitry Andric .widenScalarToNextPow2(0) 10175ffd83dbSDimitry Andric .clampScalar(0, S16, S32) 10185ffd83dbSDimitry Andric .scalarize(0); 10195ffd83dbSDimitry Andric 10200b57cec5SDimitry Andric if (ST.hasVOP3PInsts()) { 1021fe6060f1SDimitry Andric getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 10220b57cec5SDimitry Andric .legalFor({S32, S16, V2S16}) 10230b57cec5SDimitry Andric .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 10240b57cec5SDimitry Andric .clampMaxNumElements(0, S16, 2) 10255ffd83dbSDimitry Andric .minScalar(0, S16) 10260b57cec5SDimitry Andric .widenScalarToNextPow2(0) 10275ffd83dbSDimitry Andric .scalarize(0) 10285ffd83dbSDimitry Andric .lower(); 10290b57cec5SDimitry Andric } else { 1030fe6060f1SDimitry Andric getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 10310b57cec5SDimitry Andric .legalFor({S32, S16}) 10320b57cec5SDimitry Andric .widenScalarToNextPow2(0) 10335ffd83dbSDimitry Andric .minScalar(0, S16) 10345ffd83dbSDimitry Andric .scalarize(0) 10355ffd83dbSDimitry Andric .lower(); 10360b57cec5SDimitry Andric } 10370b57cec5SDimitry Andric } else { 10385ffd83dbSDimitry Andric // TODO: Should have same legality without v_perm_b32 10395ffd83dbSDimitry Andric getActionDefinitionsBuilder(G_BSWAP) 10405ffd83dbSDimitry 
Andric .legalFor({S32}) 10415ffd83dbSDimitry Andric .lowerIf(scalarNarrowerThan(0, 32)) 10425ffd83dbSDimitry Andric // FIXME: Fixing non-power-of-2 before clamp is workaround for 10435ffd83dbSDimitry Andric // narrowScalar limitation. 10445ffd83dbSDimitry Andric .widenScalarToNextPow2(0) 10455ffd83dbSDimitry Andric .maxScalar(0, S32) 10465ffd83dbSDimitry Andric .scalarize(0) 10475ffd83dbSDimitry Andric .lower(); 10485ffd83dbSDimitry Andric 1049fe6060f1SDimitry Andric getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 10500b57cec5SDimitry Andric .legalFor({S32}) 10515ffd83dbSDimitry Andric .minScalar(0, S32) 10520b57cec5SDimitry Andric .widenScalarToNextPow2(0) 10535ffd83dbSDimitry Andric .scalarize(0) 10545ffd83dbSDimitry Andric .lower(); 10550b57cec5SDimitry Andric } 10560b57cec5SDimitry Andric 10570b57cec5SDimitry Andric getActionDefinitionsBuilder(G_INTTOPTR) 10580b57cec5SDimitry Andric // List the common cases 10590b57cec5SDimitry Andric .legalForCartesianProduct(AddrSpaces64, {S64}) 10600b57cec5SDimitry Andric .legalForCartesianProduct(AddrSpaces32, {S32}) 10610b57cec5SDimitry Andric .scalarize(0) 10620b57cec5SDimitry Andric // Accept any address space as long as the size matches 10630b57cec5SDimitry Andric .legalIf(sameSize(0, 1)) 10640b57cec5SDimitry Andric .widenScalarIf(smallerThan(1, 0), 10650b57cec5SDimitry Andric [](const LegalityQuery &Query) { 10660b57cec5SDimitry Andric return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 10670b57cec5SDimitry Andric }) 10685ffd83dbSDimitry Andric .narrowScalarIf(largerThan(1, 0), 10690b57cec5SDimitry Andric [](const LegalityQuery &Query) { 10700b57cec5SDimitry Andric return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 10710b57cec5SDimitry Andric }); 10720b57cec5SDimitry Andric 10730b57cec5SDimitry Andric getActionDefinitionsBuilder(G_PTRTOINT) 10740b57cec5SDimitry Andric // List the common cases 10750b57cec5SDimitry Andric .legalForCartesianProduct(AddrSpaces64, 
{S64}) 10760b57cec5SDimitry Andric .legalForCartesianProduct(AddrSpaces32, {S32}) 10770b57cec5SDimitry Andric .scalarize(0) 10780b57cec5SDimitry Andric // Accept any address space as long as the size matches 10790b57cec5SDimitry Andric .legalIf(sameSize(0, 1)) 10800b57cec5SDimitry Andric .widenScalarIf(smallerThan(0, 1), 10810b57cec5SDimitry Andric [](const LegalityQuery &Query) { 10820b57cec5SDimitry Andric return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 10830b57cec5SDimitry Andric }) 10840b57cec5SDimitry Andric .narrowScalarIf( 10855ffd83dbSDimitry Andric largerThan(0, 1), 10860b57cec5SDimitry Andric [](const LegalityQuery &Query) { 10870b57cec5SDimitry Andric return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 10880b57cec5SDimitry Andric }); 10890b57cec5SDimitry Andric 10900b57cec5SDimitry Andric getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 10910b57cec5SDimitry Andric .scalarize(0) 10920b57cec5SDimitry Andric .custom(); 10930b57cec5SDimitry Andric 10945ffd83dbSDimitry Andric const auto needToSplitMemOp = [=](const LegalityQuery &Query, 10955ffd83dbSDimitry Andric bool IsLoad) -> bool { 10968bcb0991SDimitry Andric const LLT DstTy = Query.Types[0]; 10978bcb0991SDimitry Andric 10988bcb0991SDimitry Andric // Split vector extloads. 
1099fe6060f1SDimitry Andric unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1100480093f4SDimitry Andric 11018bcb0991SDimitry Andric if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 11028bcb0991SDimitry Andric return true; 11038bcb0991SDimitry Andric 11048bcb0991SDimitry Andric const LLT PtrTy = Query.Types[1]; 11058bcb0991SDimitry Andric unsigned AS = PtrTy.getAddressSpace(); 11065ffd83dbSDimitry Andric if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 11078bcb0991SDimitry Andric return true; 11088bcb0991SDimitry Andric 11098bcb0991SDimitry Andric // Catch weird sized loads that don't evenly divide into the access sizes 11108bcb0991SDimitry Andric // TODO: May be able to widen depending on alignment etc. 11115ffd83dbSDimitry Andric unsigned NumRegs = (MemSize + 31) / 32; 11125ffd83dbSDimitry Andric if (NumRegs == 3) { 11135ffd83dbSDimitry Andric if (!ST.hasDwordx3LoadStores()) 11148bcb0991SDimitry Andric return true; 11155ffd83dbSDimitry Andric } else { 11165ffd83dbSDimitry Andric // If the alignment allows, these should have been widened. 11175ffd83dbSDimitry Andric if (!isPowerOf2_32(NumRegs)) 11185ffd83dbSDimitry Andric return true; 11195ffd83dbSDimitry Andric } 11208bcb0991SDimitry Andric 11218bcb0991SDimitry Andric return false; 11228bcb0991SDimitry Andric }; 11238bcb0991SDimitry Andric 1124e8d8bef9SDimitry Andric unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32; 1125e8d8bef9SDimitry Andric unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16; 1126e8d8bef9SDimitry Andric unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8; 11278bcb0991SDimitry Andric 11288bcb0991SDimitry Andric // TODO: Refine based on subtargets which support unaligned access or 128-bit 11298bcb0991SDimitry Andric // LDS 11308bcb0991SDimitry Andric // TODO: Unsupported flat for SI. 
11318bcb0991SDimitry Andric 11328bcb0991SDimitry Andric for (unsigned Op : {G_LOAD, G_STORE}) { 11338bcb0991SDimitry Andric const bool IsStore = Op == G_STORE; 11348bcb0991SDimitry Andric 11358bcb0991SDimitry Andric auto &Actions = getActionDefinitionsBuilder(Op); 11365ffd83dbSDimitry Andric // Explicitly list some common cases. 11375ffd83dbSDimitry Andric // TODO: Does this help compile time at all? 1138fe6060f1SDimitry Andric Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32}, 1139fe6060f1SDimitry Andric {V2S32, GlobalPtr, V2S32, GlobalAlign32}, 1140fe6060f1SDimitry Andric {V4S32, GlobalPtr, V4S32, GlobalAlign32}, 1141fe6060f1SDimitry Andric {S64, GlobalPtr, S64, GlobalAlign32}, 1142fe6060f1SDimitry Andric {V2S64, GlobalPtr, V2S64, GlobalAlign32}, 1143fe6060f1SDimitry Andric {V2S16, GlobalPtr, V2S16, GlobalAlign32}, 1144fe6060f1SDimitry Andric {S32, GlobalPtr, S8, GlobalAlign8}, 1145fe6060f1SDimitry Andric {S32, GlobalPtr, S16, GlobalAlign16}, 11468bcb0991SDimitry Andric 1147fe6060f1SDimitry Andric {S32, LocalPtr, S32, 32}, 1148fe6060f1SDimitry Andric {S64, LocalPtr, S64, 32}, 1149fe6060f1SDimitry Andric {V2S32, LocalPtr, V2S32, 32}, 1150fe6060f1SDimitry Andric {S32, LocalPtr, S8, 8}, 1151fe6060f1SDimitry Andric {S32, LocalPtr, S16, 16}, 1152fe6060f1SDimitry Andric {V2S16, LocalPtr, S32, 32}, 11538bcb0991SDimitry Andric 1154fe6060f1SDimitry Andric {S32, PrivatePtr, S32, 32}, 1155fe6060f1SDimitry Andric {S32, PrivatePtr, S8, 8}, 1156fe6060f1SDimitry Andric {S32, PrivatePtr, S16, 16}, 1157fe6060f1SDimitry Andric {V2S16, PrivatePtr, S32, 32}, 11588bcb0991SDimitry Andric 1159fe6060f1SDimitry Andric {S32, ConstantPtr, S32, GlobalAlign32}, 1160fe6060f1SDimitry Andric {V2S32, ConstantPtr, V2S32, GlobalAlign32}, 1161fe6060f1SDimitry Andric {V4S32, ConstantPtr, V4S32, GlobalAlign32}, 1162fe6060f1SDimitry Andric {S64, ConstantPtr, S64, GlobalAlign32}, 1163fe6060f1SDimitry Andric {V2S32, ConstantPtr, V2S32, GlobalAlign32}}); 11645ffd83dbSDimitry Andric 
Actions.legalIf( 11655ffd83dbSDimitry Andric [=](const LegalityQuery &Query) -> bool { 1166fe6060f1SDimitry Andric return isLoadStoreLegal(ST, Query); 11675ffd83dbSDimitry Andric }); 11685ffd83dbSDimitry Andric 11695ffd83dbSDimitry Andric // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 11705ffd83dbSDimitry Andric // 64-bits. 11715ffd83dbSDimitry Andric // 11725ffd83dbSDimitry Andric // TODO: Should generalize bitcast action into coerce, which will also cover 11735ffd83dbSDimitry Andric // inserting addrspacecasts. 11745ffd83dbSDimitry Andric Actions.customIf(typeIs(1, Constant32Ptr)); 11755ffd83dbSDimitry Andric 11765ffd83dbSDimitry Andric // Turn any illegal element vectors into something easier to deal 11775ffd83dbSDimitry Andric // with. These will ultimately produce 32-bit scalar shifts to extract the 11785ffd83dbSDimitry Andric // parts anyway. 11795ffd83dbSDimitry Andric // 11805ffd83dbSDimitry Andric // For odd 16-bit element vectors, prefer to split those into pieces with 11815ffd83dbSDimitry Andric // 16-bit vector parts. 11825ffd83dbSDimitry Andric Actions.bitcastIf( 11835ffd83dbSDimitry Andric [=](const LegalityQuery &Query) -> bool { 1184e8d8bef9SDimitry Andric return shouldBitcastLoadStoreType(ST, Query.Types[0], 1185fe6060f1SDimitry Andric Query.MMODescrs[0].MemoryTy); 11865ffd83dbSDimitry Andric }, bitcastToRegisterType(0)); 11875ffd83dbSDimitry Andric 1188e8d8bef9SDimitry Andric if (!IsStore) { 1189e8d8bef9SDimitry Andric // Widen suitably aligned loads by loading extra bytes. The standard 1190e8d8bef9SDimitry Andric // legalization actions can't properly express widening memory operands. 
1191e8d8bef9SDimitry Andric Actions.customIf([=](const LegalityQuery &Query) -> bool { 1192e8d8bef9SDimitry Andric return shouldWidenLoad(ST, Query, G_LOAD); 1193e8d8bef9SDimitry Andric }); 1194e8d8bef9SDimitry Andric } 1195e8d8bef9SDimitry Andric 1196e8d8bef9SDimitry Andric // FIXME: load/store narrowing should be moved to lower action 11978bcb0991SDimitry Andric Actions 11988bcb0991SDimitry Andric .narrowScalarIf( 11998bcb0991SDimitry Andric [=](const LegalityQuery &Query) -> bool { 12005ffd83dbSDimitry Andric return !Query.Types[0].isVector() && 12015ffd83dbSDimitry Andric needToSplitMemOp(Query, Op == G_LOAD); 12028bcb0991SDimitry Andric }, 12038bcb0991SDimitry Andric [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 12048bcb0991SDimitry Andric const LLT DstTy = Query.Types[0]; 12058bcb0991SDimitry Andric const LLT PtrTy = Query.Types[1]; 12068bcb0991SDimitry Andric 12078bcb0991SDimitry Andric const unsigned DstSize = DstTy.getSizeInBits(); 1208fe6060f1SDimitry Andric unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 12098bcb0991SDimitry Andric 12108bcb0991SDimitry Andric // Split extloads. 
12118bcb0991SDimitry Andric if (DstSize > MemSize) 12128bcb0991SDimitry Andric return std::make_pair(0, LLT::scalar(MemSize)); 12138bcb0991SDimitry Andric 12145ffd83dbSDimitry Andric unsigned MaxSize = maxSizeForAddrSpace(ST, 12155ffd83dbSDimitry Andric PtrTy.getAddressSpace(), 12165ffd83dbSDimitry Andric Op == G_LOAD); 12178bcb0991SDimitry Andric if (MemSize > MaxSize) 12188bcb0991SDimitry Andric return std::make_pair(0, LLT::scalar(MaxSize)); 12198bcb0991SDimitry Andric 122004eeddc0SDimitry Andric uint64_t Align = Query.MMODescrs[0].AlignInBits; 12218bcb0991SDimitry Andric return std::make_pair(0, LLT::scalar(Align)); 12228bcb0991SDimitry Andric }) 12238bcb0991SDimitry Andric .fewerElementsIf( 12248bcb0991SDimitry Andric [=](const LegalityQuery &Query) -> bool { 12255ffd83dbSDimitry Andric return Query.Types[0].isVector() && 12265ffd83dbSDimitry Andric needToSplitMemOp(Query, Op == G_LOAD); 12278bcb0991SDimitry Andric }, 12288bcb0991SDimitry Andric [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 12298bcb0991SDimitry Andric const LLT DstTy = Query.Types[0]; 12308bcb0991SDimitry Andric const LLT PtrTy = Query.Types[1]; 12318bcb0991SDimitry Andric 12328bcb0991SDimitry Andric LLT EltTy = DstTy.getElementType(); 12335ffd83dbSDimitry Andric unsigned MaxSize = maxSizeForAddrSpace(ST, 12345ffd83dbSDimitry Andric PtrTy.getAddressSpace(), 12355ffd83dbSDimitry Andric Op == G_LOAD); 12365ffd83dbSDimitry Andric 12375ffd83dbSDimitry Andric // FIXME: Handle widened to power of 2 results better. This ends 12385ffd83dbSDimitry Andric // up scalarizing. 12395ffd83dbSDimitry Andric // FIXME: 3 element stores scalarized on SI 12408bcb0991SDimitry Andric 12418bcb0991SDimitry Andric // Split if it's too large for the address space. 
1242fe6060f1SDimitry Andric unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1243fe6060f1SDimitry Andric if (MemSize > MaxSize) { 12448bcb0991SDimitry Andric unsigned NumElts = DstTy.getNumElements(); 12455ffd83dbSDimitry Andric unsigned EltSize = EltTy.getSizeInBits(); 12465ffd83dbSDimitry Andric 12475ffd83dbSDimitry Andric if (MaxSize % EltSize == 0) { 12485ffd83dbSDimitry Andric return std::make_pair( 1249fe6060f1SDimitry Andric 0, LLT::scalarOrVector( 1250fe6060f1SDimitry Andric ElementCount::getFixed(MaxSize / EltSize), EltTy)); 12515ffd83dbSDimitry Andric } 12525ffd83dbSDimitry Andric 1253fe6060f1SDimitry Andric unsigned NumPieces = MemSize / MaxSize; 12548bcb0991SDimitry Andric 12558bcb0991SDimitry Andric // FIXME: Refine when odd breakdowns handled 12568bcb0991SDimitry Andric // The scalars will need to be re-legalized. 12578bcb0991SDimitry Andric if (NumPieces == 1 || NumPieces >= NumElts || 12588bcb0991SDimitry Andric NumElts % NumPieces != 0) 12598bcb0991SDimitry Andric return std::make_pair(0, EltTy); 12608bcb0991SDimitry Andric 1261fe6060f1SDimitry Andric return std::make_pair( 1262fe6060f1SDimitry Andric 0, LLT::fixed_vector(NumElts / NumPieces, EltTy)); 12638bcb0991SDimitry Andric } 12648bcb0991SDimitry Andric 12655ffd83dbSDimitry Andric // FIXME: We could probably handle weird extending loads better. 12665ffd83dbSDimitry Andric if (DstTy.getSizeInBits() > MemSize) 12675ffd83dbSDimitry Andric return std::make_pair(0, EltTy); 12685ffd83dbSDimitry Andric 12695ffd83dbSDimitry Andric unsigned EltSize = EltTy.getSizeInBits(); 12705ffd83dbSDimitry Andric unsigned DstSize = DstTy.getSizeInBits(); 12715ffd83dbSDimitry Andric if (!isPowerOf2_32(DstSize)) { 12725ffd83dbSDimitry Andric // We're probably decomposing an odd sized store. Try to split 12735ffd83dbSDimitry Andric // to the widest type. TODO: Account for alignment. As-is it 12745ffd83dbSDimitry Andric // should be OK, since the new parts will be further legalized. 
12755ffd83dbSDimitry Andric unsigned FloorSize = PowerOf2Floor(DstSize); 12765ffd83dbSDimitry Andric return std::make_pair( 1277fe6060f1SDimitry Andric 0, LLT::scalarOrVector( 1278fe6060f1SDimitry Andric ElementCount::getFixed(FloorSize / EltSize), EltTy)); 12795ffd83dbSDimitry Andric } 12805ffd83dbSDimitry Andric 12818bcb0991SDimitry Andric // May need relegalization for the scalars. 12828bcb0991SDimitry Andric return std::make_pair(0, EltTy); 12838bcb0991SDimitry Andric }) 1284fe6060f1SDimitry Andric .minScalar(0, S32) 1285fe6060f1SDimitry Andric .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32)) 12868bcb0991SDimitry Andric .widenScalarToNextPow2(0) 1287e8d8bef9SDimitry Andric .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) 1288e8d8bef9SDimitry Andric .lower(); 12898bcb0991SDimitry Andric } 12900b57cec5SDimitry Andric 1291fe6060f1SDimitry Andric // FIXME: Unaligned accesses not lowered. 12920b57cec5SDimitry Andric auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1293fe6060f1SDimitry Andric .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8}, 1294fe6060f1SDimitry Andric {S32, GlobalPtr, S16, 2 * 8}, 1295fe6060f1SDimitry Andric {S32, LocalPtr, S8, 8}, 1296fe6060f1SDimitry Andric {S32, LocalPtr, S16, 16}, 1297fe6060f1SDimitry Andric {S32, PrivatePtr, S8, 8}, 1298fe6060f1SDimitry Andric {S32, PrivatePtr, S16, 16}, 1299fe6060f1SDimitry Andric {S32, ConstantPtr, S8, 8}, 1300fe6060f1SDimitry Andric {S32, ConstantPtr, S16, 2 * 8}}) 1301fe6060f1SDimitry Andric .legalIf( 1302fe6060f1SDimitry Andric [=](const LegalityQuery &Query) -> bool { 1303fe6060f1SDimitry Andric return isLoadStoreLegal(ST, Query); 1304fe6060f1SDimitry Andric }); 1305fe6060f1SDimitry Andric 13060b57cec5SDimitry Andric if (ST.hasFlatAddressSpace()) { 13078bcb0991SDimitry Andric ExtLoads.legalForTypesWithMemDesc( 1308fe6060f1SDimitry Andric {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}}); 13090b57cec5SDimitry Andric } 13100b57cec5SDimitry Andric 
1311fe6060f1SDimitry Andric // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1312fe6060f1SDimitry Andric // 64-bits. 1313fe6060f1SDimitry Andric // 1314fe6060f1SDimitry Andric // TODO: Should generalize bitcast action into coerce, which will also cover 1315fe6060f1SDimitry Andric // inserting addrspacecasts. 1316fe6060f1SDimitry Andric ExtLoads.customIf(typeIs(1, Constant32Ptr)); 1317fe6060f1SDimitry Andric 13180b57cec5SDimitry Andric ExtLoads.clampScalar(0, S32, S32) 13190b57cec5SDimitry Andric .widenScalarToNextPow2(0) 13200b57cec5SDimitry Andric .lower(); 13210b57cec5SDimitry Andric 13220b57cec5SDimitry Andric auto &Atomics = getActionDefinitionsBuilder( 13230b57cec5SDimitry Andric {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 13240b57cec5SDimitry Andric G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 13250b57cec5SDimitry Andric G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1326480093f4SDimitry Andric G_ATOMICRMW_UMIN}) 13270b57cec5SDimitry Andric .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1328e8d8bef9SDimitry Andric {S64, GlobalPtr}, {S64, LocalPtr}, 1329e8d8bef9SDimitry Andric {S32, RegionPtr}, {S64, RegionPtr}}); 13300b57cec5SDimitry Andric if (ST.hasFlatAddressSpace()) { 13310b57cec5SDimitry Andric Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 13320b57cec5SDimitry Andric } 13330b57cec5SDimitry Andric 1334fe6060f1SDimitry Andric auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD); 1335349cc55cSDimitry Andric if (ST.hasLDSFPAtomicAdd()) { 1336fe6060f1SDimitry Andric Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); 1337fe6060f1SDimitry Andric if (ST.hasGFX90AInsts()) 1338fe6060f1SDimitry Andric Atomic.legalFor({{S64, LocalPtr}}); 133981ad6265SDimitry Andric if (ST.hasGFX940Insts()) 134081ad6265SDimitry Andric Atomic.legalFor({{V2S16, LocalPtr}}); 13415ffd83dbSDimitry Andric } 1342fe6060f1SDimitry Andric if (ST.hasAtomicFaddInsts()) 1343fe6060f1SDimitry Andric Atomic.legalFor({{S32, GlobalPtr}}); 
13448bcb0991SDimitry Andric 134504eeddc0SDimitry Andric if (ST.hasGFX90AInsts()) { 134604eeddc0SDimitry Andric // These are legal with some caveats, and should have undergone expansion in 134704eeddc0SDimitry Andric // the IR in most situations 134804eeddc0SDimitry Andric // TODO: Move atomic expansion into legalizer 134904eeddc0SDimitry Andric // TODO: Also supports <2 x f16> 135004eeddc0SDimitry Andric Atomic.legalFor({ 135104eeddc0SDimitry Andric {S32, GlobalPtr}, 135204eeddc0SDimitry Andric {S64, GlobalPtr}, 135304eeddc0SDimitry Andric {S64, FlatPtr} 135404eeddc0SDimitry Andric }); 135504eeddc0SDimitry Andric } 135604eeddc0SDimitry Andric 1357480093f4SDimitry Andric // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1358480093f4SDimitry Andric // demarshalling 1359480093f4SDimitry Andric getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1360480093f4SDimitry Andric .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1361480093f4SDimitry Andric {S32, FlatPtr}, {S64, FlatPtr}}) 1362480093f4SDimitry Andric .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1363480093f4SDimitry Andric {S32, RegionPtr}, {S64, RegionPtr}}); 13640b57cec5SDimitry Andric // TODO: Pointer types, any 32-bit or 64-bit vector 1365480093f4SDimitry Andric 1366480093f4SDimitry Andric // Condition should be s32 for scalar, s1 for vector. 
13670b57cec5SDimitry Andric getActionDefinitionsBuilder(G_SELECT) 1368fe6060f1SDimitry Andric .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr, 1369fe6060f1SDimitry Andric LocalPtr, FlatPtr, PrivatePtr, 1370fe6060f1SDimitry Andric LLT::fixed_vector(2, LocalPtr), 1371fe6060f1SDimitry Andric LLT::fixed_vector(2, PrivatePtr)}, 1372fe6060f1SDimitry Andric {S1, S32}) 13730b57cec5SDimitry Andric .clampScalar(0, S16, S64) 13745ffd83dbSDimitry Andric .scalarize(1) 13750b57cec5SDimitry Andric .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 13760b57cec5SDimitry Andric .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 13770b57cec5SDimitry Andric .clampMaxNumElements(0, S32, 2) 13780b57cec5SDimitry Andric .clampMaxNumElements(0, LocalPtr, 2) 13790b57cec5SDimitry Andric .clampMaxNumElements(0, PrivatePtr, 2) 13800b57cec5SDimitry Andric .scalarize(0) 13810b57cec5SDimitry Andric .widenScalarToNextPow2(0) 1382480093f4SDimitry Andric .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 13830b57cec5SDimitry Andric 13840b57cec5SDimitry Andric // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 13850b57cec5SDimitry Andric // be more flexible with the shift amount type. 
13860b57cec5SDimitry Andric auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 13870b57cec5SDimitry Andric .legalFor({{S32, S32}, {S64, S32}}); 13880b57cec5SDimitry Andric if (ST.has16BitInsts()) { 13890b57cec5SDimitry Andric if (ST.hasVOP3PInsts()) { 13905ffd83dbSDimitry Andric Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 13910b57cec5SDimitry Andric .clampMaxNumElements(0, S16, 2); 13920b57cec5SDimitry Andric } else 13935ffd83dbSDimitry Andric Shifts.legalFor({{S16, S16}}); 13940b57cec5SDimitry Andric 13955ffd83dbSDimitry Andric // TODO: Support 16-bit shift amounts for all types 13965ffd83dbSDimitry Andric Shifts.widenScalarIf( 13975ffd83dbSDimitry Andric [=](const LegalityQuery &Query) { 13985ffd83dbSDimitry Andric // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 13995ffd83dbSDimitry Andric // 32-bit amount. 14005ffd83dbSDimitry Andric const LLT ValTy = Query.Types[0]; 14015ffd83dbSDimitry Andric const LLT AmountTy = Query.Types[1]; 14025ffd83dbSDimitry Andric return ValTy.getSizeInBits() <= 16 && 14035ffd83dbSDimitry Andric AmountTy.getSizeInBits() < 16; 14045ffd83dbSDimitry Andric }, changeTo(1, S16)); 14055ffd83dbSDimitry Andric Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1406480093f4SDimitry Andric Shifts.clampScalar(1, S32, S32); 14070b57cec5SDimitry Andric Shifts.widenScalarToNextPow2(0, 16); 140804eeddc0SDimitry Andric Shifts.clampScalar(0, S16, S64); 1409e8d8bef9SDimitry Andric 1410e8d8bef9SDimitry Andric getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1411e8d8bef9SDimitry Andric .minScalar(0, S16) 1412e8d8bef9SDimitry Andric .scalarize(0) 1413e8d8bef9SDimitry Andric .lower(); 14140b57cec5SDimitry Andric } else { 14150b57cec5SDimitry Andric // Make sure we legalize the shift amount type first, as the general 14160b57cec5SDimitry Andric // expansion for the shifted type will produce much worse code if it hasn't 14170b57cec5SDimitry Andric // been truncated already. 
14180b57cec5SDimitry Andric Shifts.clampScalar(1, S32, S32); 14190b57cec5SDimitry Andric Shifts.widenScalarToNextPow2(0, 32); 142004eeddc0SDimitry Andric Shifts.clampScalar(0, S32, S64); 1421e8d8bef9SDimitry Andric 1422e8d8bef9SDimitry Andric getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1423e8d8bef9SDimitry Andric .minScalar(0, S32) 1424e8d8bef9SDimitry Andric .scalarize(0) 1425e8d8bef9SDimitry Andric .lower(); 14260b57cec5SDimitry Andric } 14270b57cec5SDimitry Andric Shifts.scalarize(0); 14280b57cec5SDimitry Andric 14290b57cec5SDimitry Andric for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 14300b57cec5SDimitry Andric unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 14310b57cec5SDimitry Andric unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 14320b57cec5SDimitry Andric unsigned IdxTypeIdx = 2; 14330b57cec5SDimitry Andric 14340b57cec5SDimitry Andric getActionDefinitionsBuilder(Op) 14350b57cec5SDimitry Andric .customIf([=](const LegalityQuery &Query) { 14360b57cec5SDimitry Andric const LLT EltTy = Query.Types[EltTypeIdx]; 14370b57cec5SDimitry Andric const LLT VecTy = Query.Types[VecTypeIdx]; 14380b57cec5SDimitry Andric const LLT IdxTy = Query.Types[IdxTypeIdx]; 1439e8d8bef9SDimitry Andric const unsigned EltSize = EltTy.getSizeInBits(); 1440e8d8bef9SDimitry Andric return (EltSize == 32 || EltSize == 64) && 14410b57cec5SDimitry Andric VecTy.getSizeInBits() % 32 == 0 && 14425ffd83dbSDimitry Andric VecTy.getSizeInBits() <= MaxRegisterSize && 14430b57cec5SDimitry Andric IdxTy.getSizeInBits() == 32; 14440b57cec5SDimitry Andric }) 1445e8d8bef9SDimitry Andric .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)), 1446e8d8bef9SDimitry Andric bitcastToVectorElement32(VecTypeIdx)) 1447e8d8bef9SDimitry Andric //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1)) 1448e8d8bef9SDimitry Andric .bitcastIf( 1449e8d8bef9SDimitry Andric all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 
64)), 1450e8d8bef9SDimitry Andric [=](const LegalityQuery &Query) { 1451e8d8bef9SDimitry Andric // For > 64-bit element types, try to turn this into a 64-bit 1452e8d8bef9SDimitry Andric // element vector since we may be able to do better indexing 1453e8d8bef9SDimitry Andric // if this is scalar. If not, fall back to 32. 1454e8d8bef9SDimitry Andric const LLT EltTy = Query.Types[EltTypeIdx]; 1455e8d8bef9SDimitry Andric const LLT VecTy = Query.Types[VecTypeIdx]; 1456e8d8bef9SDimitry Andric const unsigned DstEltSize = EltTy.getSizeInBits(); 1457e8d8bef9SDimitry Andric const unsigned VecSize = VecTy.getSizeInBits(); 1458e8d8bef9SDimitry Andric 1459e8d8bef9SDimitry Andric const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32; 1460e8d8bef9SDimitry Andric return std::make_pair( 1461fe6060f1SDimitry Andric VecTypeIdx, 1462fe6060f1SDimitry Andric LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize)); 1463e8d8bef9SDimitry Andric }) 14640b57cec5SDimitry Andric .clampScalar(EltTypeIdx, S32, S64) 14650b57cec5SDimitry Andric .clampScalar(VecTypeIdx, S32, S64) 1466e8d8bef9SDimitry Andric .clampScalar(IdxTypeIdx, S32, S32) 1467e8d8bef9SDimitry Andric .clampMaxNumElements(VecTypeIdx, S32, 32) 1468e8d8bef9SDimitry Andric // TODO: Clamp elements for 64-bit vectors? 1469e8d8bef9SDimitry Andric // It should only be necessary with variable indexes. 
1470e8d8bef9SDimitry Andric // As a last resort, lower to the stack 1471e8d8bef9SDimitry Andric .lower(); 14720b57cec5SDimitry Andric } 14730b57cec5SDimitry Andric 14740b57cec5SDimitry Andric getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 14750b57cec5SDimitry Andric .unsupportedIf([=](const LegalityQuery &Query) { 14760b57cec5SDimitry Andric const LLT &EltTy = Query.Types[1].getElementType(); 14770b57cec5SDimitry Andric return Query.Types[0] != EltTy; 14780b57cec5SDimitry Andric }); 14790b57cec5SDimitry Andric 14800b57cec5SDimitry Andric for (unsigned Op : {G_EXTRACT, G_INSERT}) { 14810b57cec5SDimitry Andric unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 14820b57cec5SDimitry Andric unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 14830b57cec5SDimitry Andric 14840b57cec5SDimitry Andric // FIXME: Doesn't handle extract of illegal sizes. 14850b57cec5SDimitry Andric getActionDefinitionsBuilder(Op) 14868bcb0991SDimitry Andric .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 14870eae32dcSDimitry Andric .lowerIf([=](const LegalityQuery &Query) { 14880eae32dcSDimitry Andric // Sub-vector(or single element) insert and extract. 14890eae32dcSDimitry Andric // TODO: verify immediate offset here since lower only works with 14900eae32dcSDimitry Andric // whole elements. 14910eae32dcSDimitry Andric const LLT BigTy = Query.Types[BigTyIdx]; 14920eae32dcSDimitry Andric return BigTy.isVector(); 14930eae32dcSDimitry Andric }) 14948bcb0991SDimitry Andric // FIXME: Multiples of 16 should not be legal. 
14950b57cec5SDimitry Andric .legalIf([=](const LegalityQuery &Query) { 14960b57cec5SDimitry Andric const LLT BigTy = Query.Types[BigTyIdx]; 14970b57cec5SDimitry Andric const LLT LitTy = Query.Types[LitTyIdx]; 14980b57cec5SDimitry Andric return (BigTy.getSizeInBits() % 32 == 0) && 14990b57cec5SDimitry Andric (LitTy.getSizeInBits() % 16 == 0); 15000b57cec5SDimitry Andric }) 15010b57cec5SDimitry Andric .widenScalarIf( 15020b57cec5SDimitry Andric [=](const LegalityQuery &Query) { 15030b57cec5SDimitry Andric const LLT BigTy = Query.Types[BigTyIdx]; 15040b57cec5SDimitry Andric return (BigTy.getScalarSizeInBits() < 16); 15050b57cec5SDimitry Andric }, 15060b57cec5SDimitry Andric LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) 15070b57cec5SDimitry Andric .widenScalarIf( 15080b57cec5SDimitry Andric [=](const LegalityQuery &Query) { 15090b57cec5SDimitry Andric const LLT LitTy = Query.Types[LitTyIdx]; 15100b57cec5SDimitry Andric return (LitTy.getScalarSizeInBits() < 16); 15110b57cec5SDimitry Andric }, 15120b57cec5SDimitry Andric LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) 15130b57cec5SDimitry Andric .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 15140b57cec5SDimitry Andric .widenScalarToNextPow2(BigTyIdx, 32); 15150b57cec5SDimitry Andric 15160b57cec5SDimitry Andric } 15170b57cec5SDimitry Andric 15188bcb0991SDimitry Andric auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) 15190b57cec5SDimitry Andric .legalForCartesianProduct(AllS32Vectors, {S32}) 15200b57cec5SDimitry Andric .legalForCartesianProduct(AllS64Vectors, {S64}) 15218bcb0991SDimitry Andric .clampNumElements(0, V16S32, V32S32) 15228bcb0991SDimitry Andric .clampNumElements(0, V2S64, V16S64) 15238bcb0991SDimitry Andric .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); 15248bcb0991SDimitry Andric 15258bcb0991SDimitry Andric if (ST.hasScalarPackInsts()) { 15265ffd83dbSDimitry Andric BuildVector 15275ffd83dbSDimitry Andric // FIXME: Should probably widen 
s1 vectors straight to s32 15285ffd83dbSDimitry Andric .minScalarOrElt(0, S16) 15295ffd83dbSDimitry Andric // Widen source elements and produce a G_BUILD_VECTOR_TRUNC 15305ffd83dbSDimitry Andric .minScalar(1, S32); 15315ffd83dbSDimitry Andric 15328bcb0991SDimitry Andric getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 15338bcb0991SDimitry Andric .legalFor({V2S16, S32}) 15348bcb0991SDimitry Andric .lower(); 15355ffd83dbSDimitry Andric BuildVector.minScalarOrElt(0, S32); 15368bcb0991SDimitry Andric } else { 15375ffd83dbSDimitry Andric BuildVector.customFor({V2S16, S16}); 15385ffd83dbSDimitry Andric BuildVector.minScalarOrElt(0, S32); 15395ffd83dbSDimitry Andric 15408bcb0991SDimitry Andric getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 15415ffd83dbSDimitry Andric .customFor({V2S16, S32}) 15428bcb0991SDimitry Andric .lower(); 15438bcb0991SDimitry Andric } 15448bcb0991SDimitry Andric 15455ffd83dbSDimitry Andric BuildVector.legalIf(isRegisterType(0)); 15465ffd83dbSDimitry Andric 15475ffd83dbSDimitry Andric // FIXME: Clamp maximum size 15480b57cec5SDimitry Andric getActionDefinitionsBuilder(G_CONCAT_VECTORS) 1549e8d8bef9SDimitry Andric .legalIf(all(isRegisterType(0), isRegisterType(1))) 1550e8d8bef9SDimitry Andric .clampMaxNumElements(0, S32, 32) 1551e8d8bef9SDimitry Andric .clampMaxNumElements(1, S16, 2) // TODO: Make 4? 1552e8d8bef9SDimitry Andric .clampMaxNumElements(0, S16, 64); 15530b57cec5SDimitry Andric 155481ad6265SDimitry Andric // TODO: Don't fully scalarize v2s16 pieces? Or combine out those 15555ffd83dbSDimitry Andric // pre-legalize. 
15565ffd83dbSDimitry Andric if (ST.hasVOP3PInsts()) { 15575ffd83dbSDimitry Andric getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) 15585ffd83dbSDimitry Andric .customFor({V2S16, V2S16}) 15595ffd83dbSDimitry Andric .lower(); 15605ffd83dbSDimitry Andric } else 15618bcb0991SDimitry Andric getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 15628bcb0991SDimitry Andric 15630b57cec5SDimitry Andric // Merge/Unmerge 15640b57cec5SDimitry Andric for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 15650b57cec5SDimitry Andric unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 15660b57cec5SDimitry Andric unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 15670b57cec5SDimitry Andric 15680b57cec5SDimitry Andric auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 15695ffd83dbSDimitry Andric const LLT Ty = Query.Types[TypeIdx]; 15700b57cec5SDimitry Andric if (Ty.isVector()) { 15710b57cec5SDimitry Andric const LLT &EltTy = Ty.getElementType(); 15725ffd83dbSDimitry Andric if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) 15730b57cec5SDimitry Andric return true; 15740b57cec5SDimitry Andric if (!isPowerOf2_32(EltTy.getSizeInBits())) 15750b57cec5SDimitry Andric return true; 15760b57cec5SDimitry Andric } 15770b57cec5SDimitry Andric return false; 15780b57cec5SDimitry Andric }; 15790b57cec5SDimitry Andric 15808bcb0991SDimitry Andric auto &Builder = getActionDefinitionsBuilder(Op) 1581e8d8bef9SDimitry Andric .legalIf(all(isRegisterType(0), isRegisterType(1))) 15825ffd83dbSDimitry Andric .lowerFor({{S16, V2S16}}) 15835ffd83dbSDimitry Andric .lowerIf([=](const LegalityQuery &Query) { 15845ffd83dbSDimitry Andric const LLT BigTy = Query.Types[BigTyIdx]; 15855ffd83dbSDimitry Andric return BigTy.getSizeInBits() == 32; 15865ffd83dbSDimitry Andric }) 15875ffd83dbSDimitry Andric // Try to widen to s16 first for small types. 
15885ffd83dbSDimitry Andric // TODO: Only do this on targets with legal s16 shifts 15895ffd83dbSDimitry Andric .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) 15900b57cec5SDimitry Andric .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 15918bcb0991SDimitry Andric .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 15928bcb0991SDimitry Andric .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 15938bcb0991SDimitry Andric elementTypeIs(1, S16)), 15948bcb0991SDimitry Andric changeTo(1, V2S16)) 15955ffd83dbSDimitry Andric // Clamp the little scalar to s8-s256 and make it a power of 2. It's not 15965ffd83dbSDimitry Andric // worth considering the multiples of 64 since 2*192 and 2*384 are not 15975ffd83dbSDimitry Andric // valid. 15985ffd83dbSDimitry Andric .clampScalar(LitTyIdx, S32, S512) 15995ffd83dbSDimitry Andric .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 16000b57cec5SDimitry Andric // Break up vectors with weird elements into scalars 16010b57cec5SDimitry Andric .fewerElementsIf( 16025ffd83dbSDimitry Andric [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, 16030b57cec5SDimitry Andric scalarize(0)) 16040b57cec5SDimitry Andric .fewerElementsIf( 16055ffd83dbSDimitry Andric [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, 16060b57cec5SDimitry Andric scalarize(1)) 16075ffd83dbSDimitry Andric .clampScalar(BigTyIdx, S32, MaxScalar); 16088bcb0991SDimitry Andric 16098bcb0991SDimitry Andric if (Op == G_MERGE_VALUES) { 16108bcb0991SDimitry Andric Builder.widenScalarIf( 16118bcb0991SDimitry Andric // TODO: Use 16-bit shifts if legal for 8-bit values? 
16120b57cec5SDimitry Andric [=](const LegalityQuery &Query) { 16138bcb0991SDimitry Andric const LLT Ty = Query.Types[LitTyIdx]; 16148bcb0991SDimitry Andric return Ty.getSizeInBits() < 32; 16158bcb0991SDimitry Andric }, 16168bcb0991SDimitry Andric changeTo(LitTyIdx, S32)); 16178bcb0991SDimitry Andric } 16188bcb0991SDimitry Andric 16198bcb0991SDimitry Andric Builder.widenScalarIf( 16208bcb0991SDimitry Andric [=](const LegalityQuery &Query) { 16218bcb0991SDimitry Andric const LLT Ty = Query.Types[BigTyIdx]; 16220b57cec5SDimitry Andric return !isPowerOf2_32(Ty.getSizeInBits()) && 16230b57cec5SDimitry Andric Ty.getSizeInBits() % 16 != 0; 16240b57cec5SDimitry Andric }, 16250b57cec5SDimitry Andric [=](const LegalityQuery &Query) { 16260b57cec5SDimitry Andric // Pick the next power of 2, or a multiple of 64 over 128. 16270b57cec5SDimitry Andric // Whichever is smaller. 16280b57cec5SDimitry Andric const LLT &Ty = Query.Types[BigTyIdx]; 16290b57cec5SDimitry Andric unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 16300b57cec5SDimitry Andric if (NewSizeInBits >= 256) { 16310b57cec5SDimitry Andric unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 16320b57cec5SDimitry Andric if (RoundedTo < NewSizeInBits) 16330b57cec5SDimitry Andric NewSizeInBits = RoundedTo; 16340b57cec5SDimitry Andric } 16350b57cec5SDimitry Andric return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 16360b57cec5SDimitry Andric }) 16370b57cec5SDimitry Andric // Any vectors left are the wrong size. Scalarize them. 16380b57cec5SDimitry Andric .scalarize(0) 16390b57cec5SDimitry Andric .scalarize(1); 16400b57cec5SDimitry Andric } 16410b57cec5SDimitry Andric 16425ffd83dbSDimitry Andric // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 16435ffd83dbSDimitry Andric // RegBankSelect. 
// (Continuation of the AMDGPULegalizerInfo constructor: remaining legalization
// rule definitions, then the dispatch entry point for custom legalization.)

  // G_SEXT_INREG: legal at 32/64 bits; narrower/vector forms handled below
  // depending on subtarget 16-bit support.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .clampMaxNumElementsStrict(0, S16, 2);
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  // Rotates have no native instruction here; scalarize then expand.
  getActionDefinitionsBuilder({G_ROTR, G_ROTL})
    .scalarize(0)
    .lower();

  // TODO: Only try to form v2s16 with legal packed instructions.
  getActionDefinitionsBuilder(G_FSHR)
    .legalFor({{S32, S32}})
    .lowerFor({{V2S16, V2S16}})
    .clampMaxNumElementsStrict(0, S16, 2)
    .scalarize(0)
    .lower();

  // G_FSHL is always expanded; with VOP3P we first clamp <N x s16> vectors so
  // the expansion can use packed shifts.
  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_FSHL)
      .lowerFor({{V2S16, V2S16}})
      .clampMaxNumElementsStrict(0, S16, 2)
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder(G_FSHL)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  getActionDefinitionsBuilder(G_FENCE)
    .alwaysLegal();

  // Multiply-with-overflow is expanded after widening to at least 32 bits.
  getActionDefinitionsBuilder({G_SMULO, G_UMULO})
    .scalarize(0)
    .minScalar(0, S32)
    .lower();

  // Bitfield extract: result may be 32 or 64 bits, offset/width operand is s32.
  getActionDefinitionsBuilder({G_SBFX, G_UBFX})
    .legalFor({{S32, S32}, {S64, S32}})
    .clampScalar(1, S32, S32)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .scalarize(0);

  // Operations with no native support that are always expanded.
  getActionDefinitionsBuilder({
    // TODO: Verify V_BFI_B32 is generated from expanded bit ops
    G_FCOPYSIGN,

    G_ATOMIC_CMPXCHG_WITH_SUCCESS,
    G_ATOMICRMW_NAND,
    G_ATOMICRMW_FSUB,
    G_READ_REGISTER,
    G_WRITE_REGISTER,

    G_SADDO, G_SSUBO,

    // TODO: Implement
    G_FMINIMUM, G_FMAXIMUM}).lower();

  getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
    .lower();

  // Constructs that are never supported on this target.
  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
                               G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
                               G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  getLegacyLegalizerInfo().computeTables();
  verify(*ST.getInstrInfo());
}

/// Dispatch an instruction marked Custom by the rules above to its dedicated
/// legalization routine. Returns false for opcodes with no custom handler
/// (which signals legalization failure to the LegalizerHelper).
bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
                                         MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_FREM:
    return
// (Continuation of AMDGPULegalizerInfo::legalizeCustom — remaining switch
// cases, then the start of getSegmentAperture.)
      legalizeFrem(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(Helper, MI);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_ZEXTLOAD:
    return legalizeLoad(Helper, MI);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_UDIVREM:
    return legalizeUnsignedDIV_REM(MI, MRI, B);
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_SDIVREM:
    return legalizeSignedDIV_REM(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  // log/log10 are expanded via log2 scaled by ln(2) / ln(2)/ln(10).
  case TargetOpcode::G_FLOG:
    return legalizeFlog(MI, B, numbers::ln2f);
  case TargetOpcode::G_FLOG10:
    return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
  case TargetOpcode::G_FEXP:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FPOW:
    return legalizeFPow(MI, B);
  case TargetOpcode::G_FFLOOR:
    return legalizeFFloor(MI, MRI, B);
  case TargetOpcode::G_BUILD_VECTOR:
    return legalizeBuildVector(MI, MRI, B);
  case TargetOpcode::G_MUL:
    return legalizeMul(Helper, MI);
  case TargetOpcode::G_CTLZ:
  case TargetOpcode::G_CTTZ:
    return legalizeCTLZ_CTTZ(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
    return legalizeFPTruncRound(MI, B);
  default:
    // No custom handler for this opcode: report legalization failure.
    return false;
  }

  llvm_unreachable("expected switch to return");
}

/// Produce an s32 value holding the high half (aperture) of the flat address
/// range for the given LDS/private address space. Depending on the subtarget
/// and code object version, this reads aperture hardware registers, loads from
/// the implicit kernargs, or loads from the queue pointer.
Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // Read the aperture directly from the dedicated hardware register via
    // s_getreg.
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
      AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
      AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
18360b57cec5SDimitry Andric AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 18370b57cec5SDimitry Andric AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 18380b57cec5SDimitry Andric unsigned Encoding = 18390b57cec5SDimitry Andric AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 18400b57cec5SDimitry Andric Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 18410b57cec5SDimitry Andric WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 18420b57cec5SDimitry Andric 18430b57cec5SDimitry Andric Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 18440b57cec5SDimitry Andric 18458bcb0991SDimitry Andric B.buildInstr(AMDGPU::S_GETREG_B32) 18460b57cec5SDimitry Andric .addDef(GetReg) 18470b57cec5SDimitry Andric .addImm(Encoding); 18480b57cec5SDimitry Andric MRI.setType(GetReg, S32); 18490b57cec5SDimitry Andric 18508bcb0991SDimitry Andric auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 18515ffd83dbSDimitry Andric return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 18520b57cec5SDimitry Andric } 18530b57cec5SDimitry Andric 185481ad6265SDimitry Andric // TODO: can we be smarter about machine pointer info? 185581ad6265SDimitry Andric MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 185681ad6265SDimitry Andric Register LoadAddr = MRI.createGenericVirtualRegister( 185781ad6265SDimitry Andric LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 185881ad6265SDimitry Andric // For code object version 5, private_base and shared_base are passed through 185981ad6265SDimitry Andric // implicit kernargs. 186081ad6265SDimitry Andric if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) { 186181ad6265SDimitry Andric AMDGPUTargetLowering::ImplicitParameter Param = 186281ad6265SDimitry Andric AS == AMDGPUAS::LOCAL_ADDRESS ? 
AMDGPUTargetLowering::SHARED_BASE 186381ad6265SDimitry Andric : AMDGPUTargetLowering::PRIVATE_BASE; 186481ad6265SDimitry Andric uint64_t Offset = 186581ad6265SDimitry Andric ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param); 186681ad6265SDimitry Andric 186781ad6265SDimitry Andric Register KernargPtrReg = MRI.createGenericVirtualRegister( 186881ad6265SDimitry Andric LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 186981ad6265SDimitry Andric 187081ad6265SDimitry Andric if (!loadInputValue(KernargPtrReg, B, 187181ad6265SDimitry Andric AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 187281ad6265SDimitry Andric return Register(); 187381ad6265SDimitry Andric 187481ad6265SDimitry Andric MachineMemOperand *MMO = MF.getMachineMemOperand( 187581ad6265SDimitry Andric PtrInfo, 187681ad6265SDimitry Andric MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 187781ad6265SDimitry Andric MachineMemOperand::MOInvariant, 187881ad6265SDimitry Andric LLT::scalar(32), commonAlignment(Align(64), Offset)); 187981ad6265SDimitry Andric 188081ad6265SDimitry Andric // Pointer address 188181ad6265SDimitry Andric B.buildPtrAdd(LoadAddr, KernargPtrReg, 188281ad6265SDimitry Andric B.buildConstant(LLT::scalar(64), Offset).getReg(0)); 188381ad6265SDimitry Andric // Load address 188481ad6265SDimitry Andric return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 188581ad6265SDimitry Andric } 188681ad6265SDimitry Andric 18870b57cec5SDimitry Andric Register QueuePtr = MRI.createGenericVirtualRegister( 18880b57cec5SDimitry Andric LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 18890b57cec5SDimitry Andric 1890e8d8bef9SDimitry Andric if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 18918bcb0991SDimitry Andric return Register(); 18920b57cec5SDimitry Andric 18930b57cec5SDimitry Andric // Offset into amd_queue_t for group_segment_aperture_base_hi / 18940b57cec5SDimitry Andric // private_segment_aperture_base_hi. 
18950b57cec5SDimitry Andric uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 18960b57cec5SDimitry Andric 18970b57cec5SDimitry Andric MachineMemOperand *MMO = MF.getMachineMemOperand( 18980b57cec5SDimitry Andric PtrInfo, 18995ffd83dbSDimitry Andric MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 19000b57cec5SDimitry Andric MachineMemOperand::MOInvariant, 1901fe6060f1SDimitry Andric LLT::scalar(32), commonAlignment(Align(64), StructOffset)); 19020b57cec5SDimitry Andric 190381ad6265SDimitry Andric B.buildPtrAdd(LoadAddr, QueuePtr, 190481ad6265SDimitry Andric B.buildConstant(LLT::scalar(64), StructOffset).getReg(0)); 19055ffd83dbSDimitry Andric return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 19060b57cec5SDimitry Andric } 19070b57cec5SDimitry Andric 190804eeddc0SDimitry Andric /// Return true if the value is a known valid address, such that a null check is 190904eeddc0SDimitry Andric /// not necessary. 191004eeddc0SDimitry Andric static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, 191104eeddc0SDimitry Andric const AMDGPUTargetMachine &TM, unsigned AddrSpace) { 191204eeddc0SDimitry Andric MachineInstr *Def = MRI.getVRegDef(Val); 191304eeddc0SDimitry Andric switch (Def->getOpcode()) { 191404eeddc0SDimitry Andric case AMDGPU::G_FRAME_INDEX: 191504eeddc0SDimitry Andric case AMDGPU::G_GLOBAL_VALUE: 191604eeddc0SDimitry Andric case AMDGPU::G_BLOCK_ADDR: 191704eeddc0SDimitry Andric return true; 191804eeddc0SDimitry Andric case AMDGPU::G_CONSTANT: { 191904eeddc0SDimitry Andric const ConstantInt *CI = Def->getOperand(1).getCImm(); 192004eeddc0SDimitry Andric return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace); 192104eeddc0SDimitry Andric } 192204eeddc0SDimitry Andric default: 192304eeddc0SDimitry Andric return false; 192404eeddc0SDimitry Andric } 192504eeddc0SDimitry Andric 192604eeddc0SDimitry Andric return false; 192704eeddc0SDimitry Andric } 192804eeddc0SDimitry Andric 19290b57cec5SDimitry Andric bool 
// Custom legalization for G_ADDRSPACE_CAST. Handles flat<->local/private casts
// (with null checks unless provably non-null), 32-bit constant address space
// conversions, and diagnoses any unsupported cast. (The leading `bool` return
// type is on the preceding line.)
AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  // Same bit pattern in both spaces: just reinterpret.
  if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  // flat -> local/private: truncate to the low 32 bits, preserving null.
  if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
      (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
       DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
    if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
      // Extract low 32-bits of the pointer.
      B.buildExtract(Dst, Src, 0);
      MI.eraseFromParent();
      return true;
    }

    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    // Extract low 32-bits of the pointer.
    auto PtrLo32 = B.buildExtract(DstTy, Src, 0);

    // Map flat null to the destination space's null value, otherwise use the
    // truncated pointer.
    auto CmpRes =
        B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  // local/private -> flat: combine the 32-bit offset with the aperture high
  // half, preserving null.
  if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
      (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
       SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
    if (!ST.hasFlatAddressSpace())
      return false;

    Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
    if (!ApertureReg.isValid())
      return false;

    // Coerce the type of the low half of the result so we can use merge_values.
    Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);

    // TODO: Should we allow mismatched types but matching sizes in merges to
    // avoid the ptrtoint?
    auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});

    if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
      // No null check needed: use the merged pointer directly.
      B.buildCopy(Dst, BuildPtr);
      MI.eraseFromParent();
      return true;
    }

    auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
    auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

    auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
                              SegmentNull.getReg(0));

    B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);

    MI.eraseFromParent();
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
      SrcTy.getSizeInBits() == 64) {
    // Truncate.
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
      DstTy.getSizeInBits() == 64) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another. Merge operands are required to be the same type, but creating
    // an extra ptrtoint would be kind of pointless.
    auto HighAddr = B.buildConstant(
        LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    B.buildMerge(Dst, {Src, HighAddr});
    MI.eraseFromParent();
    return true;
  }

  // Any other combination is invalid: diagnose and replace the result with
  // undef so compilation can continue.
  DiagnosticInfoUnsupported InvalidAddrSpaceCast(
      MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());

  LLVMContext &Ctx = MF.getFunction().getContext();
  Ctx.diagnose(InvalidAddrSpaceCast);
  B.buildUndef(Dst);
  MI.eraseFromParent();
  return true;
}

/// Custom legalization for s64 G_FRINT: round-to-nearest-even by adding and
/// subtracting 2^52 (with the source's sign), falling back to the source for
/// magnitudes too large to have a fractional part.
bool AMDGPULegalizerInfo::legalizeFrint(
    MachineInstr &MI, MachineRegisterInfo &MRI,
    MachineIRBuilder &B) const {
  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  // 2^52: adding this forces rounding of the fraction in the FP adder.
  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  // Largest magnitude that still has a fractional part.
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
20638bcb0991SDimitry Andric auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 20648bcb0991SDimitry Andric auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 20650b57cec5SDimitry Andric 20668bcb0991SDimitry Andric auto C2 = B.buildFConstant(Ty, C2Val); 20678bcb0991SDimitry Andric auto Fabs = B.buildFAbs(Ty, Src); 20680b57cec5SDimitry Andric 20698bcb0991SDimitry Andric auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 20708bcb0991SDimitry Andric B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 2071e8d8bef9SDimitry Andric MI.eraseFromParent(); 20720b57cec5SDimitry Andric return true; 20730b57cec5SDimitry Andric } 20740b57cec5SDimitry Andric 20750b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeFceil( 20760b57cec5SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 20770b57cec5SDimitry Andric MachineIRBuilder &B) const { 20780b57cec5SDimitry Andric 20790b57cec5SDimitry Andric const LLT S1 = LLT::scalar(1); 20800b57cec5SDimitry Andric const LLT S64 = LLT::scalar(64); 20810b57cec5SDimitry Andric 20820b57cec5SDimitry Andric Register Src = MI.getOperand(1).getReg(); 20830b57cec5SDimitry Andric assert(MRI.getType(Src) == S64); 20840b57cec5SDimitry Andric 20850b57cec5SDimitry Andric // result = trunc(src) 20860b57cec5SDimitry Andric // if (src > 0.0 && src != result) 20870b57cec5SDimitry Andric // result += 1.0 20880b57cec5SDimitry Andric 20895ffd83dbSDimitry Andric auto Trunc = B.buildIntrinsicTrunc(S64, Src); 20900b57cec5SDimitry Andric 20910b57cec5SDimitry Andric const auto Zero = B.buildFConstant(S64, 0.0); 20920b57cec5SDimitry Andric const auto One = B.buildFConstant(S64, 1.0); 20930b57cec5SDimitry Andric auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 20940b57cec5SDimitry Andric auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 20950b57cec5SDimitry Andric auto And = B.buildAnd(S1, Lt0, NeTrunc); 20960b57cec5SDimitry Andric auto Add = B.buildSelect(S64, And, One, Zero); 20970b57cec5SDimitry Andric 
20980b57cec5SDimitry Andric // TODO: Should this propagate fast-math-flags? 20990b57cec5SDimitry Andric B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 210004eeddc0SDimitry Andric MI.eraseFromParent(); 21010b57cec5SDimitry Andric return true; 21020b57cec5SDimitry Andric } 21030b57cec5SDimitry Andric 2104e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeFrem( 2105e8d8bef9SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 2106e8d8bef9SDimitry Andric MachineIRBuilder &B) const { 2107e8d8bef9SDimitry Andric Register DstReg = MI.getOperand(0).getReg(); 2108e8d8bef9SDimitry Andric Register Src0Reg = MI.getOperand(1).getReg(); 2109e8d8bef9SDimitry Andric Register Src1Reg = MI.getOperand(2).getReg(); 2110e8d8bef9SDimitry Andric auto Flags = MI.getFlags(); 2111e8d8bef9SDimitry Andric LLT Ty = MRI.getType(DstReg); 2112e8d8bef9SDimitry Andric 2113e8d8bef9SDimitry Andric auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 2114e8d8bef9SDimitry Andric auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 2115e8d8bef9SDimitry Andric auto Neg = B.buildFNeg(Ty, Trunc, Flags); 2116e8d8bef9SDimitry Andric B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 2117e8d8bef9SDimitry Andric MI.eraseFromParent(); 2118e8d8bef9SDimitry Andric return true; 2119e8d8bef9SDimitry Andric } 2120e8d8bef9SDimitry Andric 2121e8d8bef9SDimitry Andric static MachineInstrBuilder extractF64Exponent(Register Hi, 21220b57cec5SDimitry Andric MachineIRBuilder &B) { 21230b57cec5SDimitry Andric const unsigned FractBits = 52; 21240b57cec5SDimitry Andric const unsigned ExpBits = 11; 21250b57cec5SDimitry Andric LLT S32 = LLT::scalar(32); 21260b57cec5SDimitry Andric 21270b57cec5SDimitry Andric auto Const0 = B.buildConstant(S32, FractBits - 32); 21280b57cec5SDimitry Andric auto Const1 = B.buildConstant(S32, ExpBits); 21290b57cec5SDimitry Andric 21300b57cec5SDimitry Andric auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 2131e8d8bef9SDimitry Andric .addUse(Hi) 21320b57cec5SDimitry 
// (Continuation of extractF64Exponent, then legalizeIntrinsicTrunc and the
// start of legalizeITOFP.)
    .addUse(Const0.getReg(0))
    .addUse(Const1.getReg(0));

  // Remove the f64 exponent bias.
  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}

/// Custom legalization for s64 G_INTRINSIC_TRUNC: clear the fraction bits
/// below the exponent by masking the bit pattern directly, handling the
/// all-fraction (|x| < 1) and no-fraction (exponent > 51) cases separately.
bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // TODO: Should this use extract since the low half is unused?
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  Register Hi = Unmerge.getReg(1);

  // Extract the upper half, since this is where we will find the sign and
  // exponent.
  auto Exp = extractF64Exponent(Hi, B);

  const unsigned FractBits = 52;

  // Extract the sign bit.
  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  // Extend back to 64-bits.
  auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});

  // Mask off the fraction bits that fall below the exponent's integer part.
  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  // exp < 0: |x| < 1, result is signed zero. exp > 51: no fractional bits,
  // result is the source itself.
  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
}

/// Custom legalization for s64 G_SITOFP/G_UITOFP. s64 -> f64 converts the two
/// halves and combines them with ldexp; s64 -> f32 normalizes the input first
/// so only one rounding step occurs.
bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  auto ThirtyTwo = B.buildConstant(S32, 32);

  if (MRI.getType(Dst) == S64) {
    // dst = hi * 2^32 + lo, with the high half converted signed/unsigned as
    // requested and the low half always unsigned.
    auto CvtHi = Signed ?
// (Continuation of AMDGPULegalizerInfo::legalizeITOFP, then the start of
// legalizeFPTOI.)
                      B.buildSITOFP(S64, Unmerge.getReg(1))
                        : B.buildUITOFP(S64, Unmerge.getReg(1));

    auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
    auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
      .addUse(CvtHi.getReg(0))
      .addUse(ThirtyTwo.getReg(0));

    // TODO: Should this propagate fast-math-flags?
    B.buildFAdd(Dst, LdExp, CvtLo);
    MI.eraseFromParent();
    return true;
  }

  assert(MRI.getType(Dst) == S32);

  auto One = B.buildConstant(S32, 1);

  // s64 -> f32: shift the value left so its leading significant bit is at the
  // top, convert the high 32 bits, then scale back down with ldexp. This
  // avoids double rounding.
  MachineInstrBuilder ShAmt;
  if (Signed) {
    // For signed input, the shift is limited by the leading redundant sign
    // bits, found via amdgcn.sffbh (flbit_i32).
    auto ThirtyOne = B.buildConstant(S32, 31);
    auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
    auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
    auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
    auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
                               /*HasSideEffects=*/false)
      .addUse(Unmerge.getReg(1));
    auto LS2 = B.buildSub(S32, LS, One);
    ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
  } else
    ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
  auto Norm = B.buildShl(S64, Src, ShAmt);
  auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
  // Fold any nonzero low bits into the sticky bit so rounding is correct.
  auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
  auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
  auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
  auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
  B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
                   /*HasSideEffects=*/false)
    .addUse(FVal.getReg(0))
    .addUse(Scale.getReg(0));
  MI.eraseFromParent();
  return true;
}

// TODO: Copied from DAG implementation. Verify logic and document how this
// actually works.
/// Custom legalization for G_FPTOSI/G_FPTOUI with an s64 result: split the
/// truncated value into high and low 32-bit pieces via scaling by 2^-32.
bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
                                        MachineRegisterInfo &MRI,
                                        MachineIRBuilder &B,
                                        bool Signed) const {

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  const LLT SrcLT = MRI.getType(Src);
  assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);

  unsigned Flags = MI.getFlags();

  // The basic idea of converting a floating point number into a pair of 32-bit
  // integers is illustrated as follows:
  //
  //   tf := trunc(val);
  //   hif := floor(tf * 2^-32);
* 2^-32); 2267fe6060f1SDimitry Andric // lof := tf - hif * 2^32; // lof is always positive due to floor. 2268fe6060f1SDimitry Andric // hi := fptoi(hif); 2269fe6060f1SDimitry Andric // lo := fptoi(lof); 2270fe6060f1SDimitry Andric // 2271fe6060f1SDimitry Andric auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags); 2272fe6060f1SDimitry Andric MachineInstrBuilder Sign; 2273fe6060f1SDimitry Andric if (Signed && SrcLT == S32) { 2274fe6060f1SDimitry Andric // However, a 32-bit floating point number has only 23 bits mantissa and 2275fe6060f1SDimitry Andric // it's not enough to hold all the significant bits of `lof` if val is 2276fe6060f1SDimitry Andric // negative. To avoid the loss of precision, We need to take the absolute 2277fe6060f1SDimitry Andric // value after truncating and flip the result back based on the original 2278fe6060f1SDimitry Andric // signedness. 2279fe6060f1SDimitry Andric Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31)); 2280fe6060f1SDimitry Andric Trunc = B.buildFAbs(S32, Trunc, Flags); 2281fe6060f1SDimitry Andric } 2282fe6060f1SDimitry Andric MachineInstrBuilder K0, K1; 2283fe6060f1SDimitry Andric if (SrcLT == S64) { 2284fe6060f1SDimitry Andric K0 = B.buildFConstant(S64, 2285fe6060f1SDimitry Andric BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000))); 2286fe6060f1SDimitry Andric K1 = B.buildFConstant(S64, 2287fe6060f1SDimitry Andric BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000))); 2288fe6060f1SDimitry Andric } else { 2289fe6060f1SDimitry Andric K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000))); 2290fe6060f1SDimitry Andric K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000))); 2291fe6060f1SDimitry Andric } 22925ffd83dbSDimitry Andric 2293fe6060f1SDimitry Andric auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags); 2294fe6060f1SDimitry Andric auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags); 2295fe6060f1SDimitry Andric auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags); 22965ffd83dbSDimitry 
Andric 2297fe6060f1SDimitry Andric auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul) 2298fe6060f1SDimitry Andric : B.buildFPTOUI(S32, FloorMul); 22995ffd83dbSDimitry Andric auto Lo = B.buildFPTOUI(S32, Fma); 23005ffd83dbSDimitry Andric 2301fe6060f1SDimitry Andric if (Signed && SrcLT == S32) { 2302fe6060f1SDimitry Andric // Flip the result based on the signedness, which is either all 0s or 1s. 2303fe6060f1SDimitry Andric Sign = B.buildMerge(S64, {Sign, Sign}); 2304fe6060f1SDimitry Andric // r := xor({lo, hi}, sign) - sign; 2305fe6060f1SDimitry Andric B.buildSub(Dst, B.buildXor(S64, B.buildMerge(S64, {Lo, Hi}), Sign), Sign); 2306fe6060f1SDimitry Andric } else 23075ffd83dbSDimitry Andric B.buildMerge(Dst, {Lo, Hi}); 23085ffd83dbSDimitry Andric MI.eraseFromParent(); 23095ffd83dbSDimitry Andric 23105ffd83dbSDimitry Andric return true; 23115ffd83dbSDimitry Andric } 23125ffd83dbSDimitry Andric 23135ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 23145ffd83dbSDimitry Andric MachineInstr &MI) const { 23155ffd83dbSDimitry Andric MachineFunction &MF = Helper.MIRBuilder.getMF(); 23160b57cec5SDimitry Andric const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 23170b57cec5SDimitry Andric 23180b57cec5SDimitry Andric const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 23190b57cec5SDimitry Andric MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 23200b57cec5SDimitry Andric 23210b57cec5SDimitry Andric // With ieee_mode disabled, the instructions have the correct behavior 23220b57cec5SDimitry Andric // already for G_FMINNUM/G_FMAXNUM 23230b57cec5SDimitry Andric if (!MFI->getMode().IEEE) 23240b57cec5SDimitry Andric return !IsIEEEOp; 23250b57cec5SDimitry Andric 23260b57cec5SDimitry Andric if (IsIEEEOp) 23270b57cec5SDimitry Andric return true; 23280b57cec5SDimitry Andric 23290b57cec5SDimitry Andric return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 23300b57cec5SDimitry Andric } 
23310b57cec5SDimitry Andric 23320b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 23330b57cec5SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 23340b57cec5SDimitry Andric MachineIRBuilder &B) const { 23350b57cec5SDimitry Andric // TODO: Should move some of this into LegalizerHelper. 23360b57cec5SDimitry Andric 23370b57cec5SDimitry Andric // TODO: Promote dynamic indexing of s16 to s32 23385ffd83dbSDimitry Andric 23395ffd83dbSDimitry Andric // FIXME: Artifact combiner probably should have replaced the truncated 23405ffd83dbSDimitry Andric // constant before this, so we shouldn't need 2341349cc55cSDimitry Andric // getIConstantVRegValWithLookThrough. 2342e8d8bef9SDimitry Andric Optional<ValueAndVReg> MaybeIdxVal = 2343349cc55cSDimitry Andric getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); 2344e8d8bef9SDimitry Andric if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 23450b57cec5SDimitry Andric return true; 2346e8d8bef9SDimitry Andric const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); 23470b57cec5SDimitry Andric 23480b57cec5SDimitry Andric Register Dst = MI.getOperand(0).getReg(); 23490b57cec5SDimitry Andric Register Vec = MI.getOperand(1).getReg(); 23500b57cec5SDimitry Andric 23510b57cec5SDimitry Andric LLT VecTy = MRI.getType(Vec); 23520b57cec5SDimitry Andric LLT EltTy = VecTy.getElementType(); 23530b57cec5SDimitry Andric assert(EltTy == MRI.getType(Dst)); 23540b57cec5SDimitry Andric 235504eeddc0SDimitry Andric if (IdxVal < VecTy.getNumElements()) { 235604eeddc0SDimitry Andric auto Unmerge = B.buildUnmerge(EltTy, Vec); 235704eeddc0SDimitry Andric B.buildCopy(Dst, Unmerge.getReg(IdxVal)); 235804eeddc0SDimitry Andric } else { 23590b57cec5SDimitry Andric B.buildUndef(Dst); 236004eeddc0SDimitry Andric } 23610b57cec5SDimitry Andric 23620b57cec5SDimitry Andric MI.eraseFromParent(); 23630b57cec5SDimitry Andric return true; 23640b57cec5SDimitry Andric } 23650b57cec5SDimitry Andric 
// Lower G_INSERT_VECTOR_ELT with a constant in-bounds index by unmerging the
// vector, substituting the inserted element, and re-merging; out-of-bounds
// constant indices produce undef, and dynamic indices are left in place for
// register-indexed selection.
bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32

  // FIXME: Artifact combiner probably should have replaced the truncated
  // constant before this, so we shouldn't need
  // getIConstantVRegValWithLookThrough.
  Optional<ValueAndVReg> MaybeIdxVal =
      getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
  if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
    return true;

  int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();
  Register Ins = MI.getOperand(2).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Ins));
  (void)Ins;

  unsigned NumElts = VecTy.getNumElements();
  if (IdxVal < NumElts) {
    SmallVector<Register, 8> SrcRegs;
    for (unsigned i = 0; i < NumElts; ++i)
      SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
    B.buildUnmerge(SrcRegs, Vec);

    SrcRegs[IdxVal] = MI.getOperand(2).getReg();
    B.buildMerge(Dst, SrcRegs);
  } else {
    B.buildUndef(Dst);
  }

  MI.eraseFromParent();
  return true;
}

// G_SHUFFLE_VECTOR is legal only for <2 x s16> shuffles whose mask maps to a
// VOP3P permute; everything else is lowered via LegalizerHelper.
bool AMDGPULegalizerInfo::legalizeShuffleVector(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  const LLT V2S16 = LLT::fixed_vector(2, 16);

  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src0);

  if (SrcTy == V2S16 && DstTy == V2S16 &&
      AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
  return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
}

// Lower G_FSIN/G_FCOS to the amdgcn sin/cos intrinsics, pre-scaling the input
// by 1/(2*pi); on subtargets with reduced trig range the scaled value is also
// passed through amdgcn.fract first.
bool AMDGPULegalizerInfo::legalizeSinCos(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned Flags = MI.getFlags();

  Register TrigVal;
  auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
  if (ST.hasTrigReducedRange()) {
    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
      .addUse(MulVal.getReg(0))
      .setMIFlags(Flags).getReg(0);
  } else
    TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);

  Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
    Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
  B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
      .addUse(TrigVal)
      .setMIFlags(Flags);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
                                                  MachineIRBuilder &B,
                                                  const GlobalValue *GV,
                                                  int64_t Offset,
                                                  unsigned GAFlags) const {
  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
  // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
  // to the following code sequence:
  //
  // For constant address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol
  //   s_addc_u32 s1, s1, 0
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   a fixup or relocation is emitted to replace $symbol with a literal
  //   constant, which is a pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // For global address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   fixups or relocations are emitted to replace $symbol@*@lo and
  //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
  //   which is a 64-bit pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // What we want here is an offset from the value returned by s_getpc
  // (which is the address of the s_add_u32 instruction) to the global
  // variable, but since the encoding of $symbol starts 4 bytes after the start
  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
  // small. This requires us to add 4 to the global variable offset in order to
  // compute the correct address. Similarly for the s_addc_u32 instruction, the
  // encoding of $symbol starts 12 bytes after the start of the s_add_u32
  // instruction.

  LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);

  Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
    B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

  MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
    .addDef(PCReg);

  MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
  if (GAFlags == SIInstrInfo::MO_NONE)
    MIB.addImm(0);
  else
    MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);

  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

  if (PtrTy.getSizeInBits() == 32)
    B.buildExtract(DstReg, PCReg, 0);
  return true;
}

// Legalize G_GLOBAL_VALUE: LDS/region globals become absolute offsets
// allocated by the frame info (with special handling for dynamic LDS and
// non-kernel uses); other address spaces use pc-relative addressing, either
// directly or through a GOT load.
bool AMDGPULegalizerInfo::legalizeGlobalValue(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned AS = Ty.getAddressSpace();

  const GlobalValue *GV = MI.getOperand(1).getGlobal();
  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isModuleEntryFunction() &&
        !GV->getName().equals("llvm.amdgcn.module.lds")) {
      const Function &Fn = MF.getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
        Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
        DS_Warning);
      Fn.getContext().diagnose(BadLDSDecl);

      // We currently don't have a way to correctly allocate LDS objects that
      // aren't directly associated with a kernel. We do force inlining of
      // functions that use local objects. However, if these dead functions are
      // not eliminated, we don't want a compile time error. Just emit a warning
      // and a trap, since there should be no callable path here.
      B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
      B.buildUndef(DstReg);
      MI.eraseFromParent();
      return true;
    }

    // TODO: We could emit code to handle the initialization somewhere.
    // We ignore the initializer for now and legalize it to allow selection.
    // The initializer will anyway get errored out during assembly emission.
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->shouldUseLDSConstAddress(GV)) {
      MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
      return true; // Leave in place;
    }

    if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
      Type *Ty = GV->getValueType();
      // HIP uses an unsized array `extern __shared__ T s[]` or similar
      // zero-sized type in other languages to declare the dynamic shared
      // memory which size is not known at the compile time. They will be
      // allocated by the runtime and placed directly after the static
      // allocated ones. They all share the same offset.
      if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
        // Adjust alignment for that dynamic shared memory array.
        MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
        LLT S32 = LLT::scalar(32);
        auto Sz =
            B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
        B.buildIntToPtr(DstReg, Sz);
        MI.eraseFromParent();
        return true;
      }
    }

    B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
                                                   *cast<GlobalVariable>(GV)));
    MI.eraseFromParent();
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }

  if (TLI->shouldEmitPCReloc(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
    MI.eraseFromParent();
    return true;
  }

  // Otherwise load the 64-bit address (or a 32-bit pointer extracted from it)
  // out of the GOT.
  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
  MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
      MachinePointerInfo::getGOT(MF),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      LoadTy, Align(8));

  buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);

  if (Ty.getSizeInBits() == 32) {
    // Truncate if this is a 32-bit constant address.
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
  } else
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  return true;
}

// Widen a scalar to the next power-of-2 bit width, or a vector to the next
// power-of-2 element count.
static LLT widenToNextPowerOf2(LLT Ty) {
  if (Ty.isVector())
    return Ty.changeElementCount(
        ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
  return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
}

// Custom-legalize loads: rewrite 32-bit constant-address-space pointers to the
// 64-bit constant address space, and widen non-power-of-2 G_LOADs to a
// power-of-2 memory size when the alignment allows, truncating/extracting the
// original value back out of the wide load.
bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
                                       MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();
  GISelChangeObserver &Observer = Helper.Observer;

  Register PtrReg = MI.getOperand(1).getReg();
  LLT PtrTy = MRI.getType(PtrReg);
  unsigned AddrSpace = PtrTy.getAddressSpace();

  if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
    auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
    Observer.changingInstr(MI);
    MI.getOperand(1).setReg(Cast.getReg(0));
    Observer.changedInstr(MI);
    return true;
  }

  if (MI.getOpcode() != AMDGPU::G_LOAD)
    return false;

  Register ValReg = MI.getOperand(0).getReg();
  LLT ValTy = MRI.getType(ValReg);

  MachineMemOperand *MMO = *MI.memoperands_begin();
  const unsigned ValSize = ValTy.getSizeInBits();
  const LLT MemTy = MMO->getMemoryType();
  const Align MemAlign = MMO->getAlign();
  const unsigned MemSize = MemTy.getSizeInBits();
  const uint64_t AlignInBits = 8 * MemAlign.value();

  // Widen non-power-of-2 loads to the alignment if needed
  if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
    const unsigned WideMemSize = PowerOf2Ceil(MemSize);

    // This was already the correct extending load result type, so just adjust
    // the memory type.
    if (WideMemSize == ValSize) {
      MachineFunction &MF = B.getMF();

      MachineMemOperand *WideMMO =
          MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
      Observer.changingInstr(MI);
      MI.setMemRefs(MF, {WideMMO});
      Observer.changedInstr(MI);
      return true;
    }

    // Don't bother handling edge case that should probably never be produced.
    if (ValSize > WideMemSize)
      return false;

    LLT WideTy = widenToNextPowerOf2(ValTy);

    Register WideLoad;
    if (!WideTy.isVector()) {
      WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
      B.buildTrunc(ValReg, WideLoad).getReg(0);
    } else {
      // Extract the subvector.

      if (isRegisterType(ValTy)) {
        // If this a case where G_EXTRACT is legal, use it.
        // (e.g. <3 x s32> -> <4 x s32>)
        WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
        B.buildExtract(ValReg, WideLoad, 0);
      } else {
        // For cases where the widened type isn't a nice register value, unmerge
        // from a widened register (e.g. <3 x s16> -> <4 x s16>)
        WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
        B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
      }
    }

    MI.eraseFromParent();
    return true;
  }

  return false;
}

// G_FMAD is legal when denormals are flushed for the type; otherwise lower it
// via LegalizerHelper.
bool AMDGPULegalizerInfo::legalizeFMad(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  assert(Ty.isScalar());

  MachineFunction &MF = B.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // TODO: Always legal with future ftz flag.
  // FIXME: Do we need just output?
  if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
    return true;
  if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
  return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
}

// Lower G_ATOMIC_CMPXCHG on flat/global pointers to the target pseudo, which
// takes the new/compare values packed into a two-element vector.
bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  Register PtrReg = MI.getOperand(1).getReg();
  Register CmpVal = MI.getOperand(2).getReg();
  Register NewVal = MI.getOperand(3).getReg();

  assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
         "this should not have been custom lowered");

  LLT ValTy = MRI.getType(CmpVal);
  LLT VecTy = LLT::fixed_vector(2, ValTy);

  Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);

  B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
    .addDef(DstReg)
    .addUse(PtrReg)
    .addUse(PackedVal)
    .setMemRefs(MI.memoperands());

  MI.eraseFromParent();
  return true;
}

// Lower G_FLOG/G_FLOG10 as log_b(x) = log2(x) * (1 / log2(b)).
bool AMDGPULegalizerInfo::legalizeFlog(
  MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Flags = MI.getFlags();

  auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
  auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);

  B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
  MI.eraseFromParent();
  return true;
}

// Lower G_FEXP as exp(x) = exp2(x * log2(e)).
bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
                                       MachineIRBuilder &B) const {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);

  auto K = B.buildFConstant(Ty, numbers::log2e);
  auto Mul = B.buildFMul(Ty, Src, K, Flags);
  B.buildFExp2(Dst, Mul, Flags);
  MI.eraseFromParent();
  return true;
}

// Lower G_FPOW as pow(x, y) = exp2(y * log2(x)), using the fmul_legacy
// intrinsic for the multiply; the f16 form converts through f32 since there is
// no f16 fmul_legacy.
bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
                                       MachineIRBuilder &B) const {
  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();
  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);

  if (Ty == S32) {
    auto Log = B.buildFLog2(S32, Src0, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
      .addUse(Log.getReg(0))
      .addUse(Src1)
      .setMIFlags(Flags);
    B.buildFExp2(Dst, Mul, Flags);
  } else if (Ty == S16) {
    // There's no f16 fmul_legacy, so we need to convert for it.
    auto Log = B.buildFLog2(S16, Src0, Flags);
    auto Ext0 = B.buildFPExt(S32, Log, Flags);
    auto Ext1 = B.buildFPExt(S32, Src1, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
      .addUse(Ext0.getReg(0))
      .addUse(Ext1.getReg(0))
      .setMIFlags(Flags);

    B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
  } else
    return false;

  MI.eraseFromParent();
  return true;
}

// Find a source register, ignoring any possible source modifiers.
28195ffd83dbSDimitry Andric static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 28205ffd83dbSDimitry Andric Register ModSrc = OrigSrc; 28215ffd83dbSDimitry Andric if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 28225ffd83dbSDimitry Andric ModSrc = SrcFNeg->getOperand(1).getReg(); 28235ffd83dbSDimitry Andric if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 28245ffd83dbSDimitry Andric ModSrc = SrcFAbs->getOperand(1).getReg(); 28255ffd83dbSDimitry Andric } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 28265ffd83dbSDimitry Andric ModSrc = SrcFAbs->getOperand(1).getReg(); 28275ffd83dbSDimitry Andric return ModSrc; 28285ffd83dbSDimitry Andric } 28295ffd83dbSDimitry Andric 28305ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 28315ffd83dbSDimitry Andric MachineRegisterInfo &MRI, 28325ffd83dbSDimitry Andric MachineIRBuilder &B) const { 28335ffd83dbSDimitry Andric 28345ffd83dbSDimitry Andric const LLT S1 = LLT::scalar(1); 28355ffd83dbSDimitry Andric const LLT S64 = LLT::scalar(64); 28365ffd83dbSDimitry Andric Register Dst = MI.getOperand(0).getReg(); 28375ffd83dbSDimitry Andric Register OrigSrc = MI.getOperand(1).getReg(); 28385ffd83dbSDimitry Andric unsigned Flags = MI.getFlags(); 28395ffd83dbSDimitry Andric assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 28405ffd83dbSDimitry Andric "this should not have been custom lowered"); 28415ffd83dbSDimitry Andric 28425ffd83dbSDimitry Andric // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 28435ffd83dbSDimitry Andric // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 28445ffd83dbSDimitry Andric // efficient way to implement it is using V_FRACT_F64. The workaround for the 28455ffd83dbSDimitry Andric // V_FRACT bug is: 28465ffd83dbSDimitry Andric // fract(x) = isnan(x) ? 
x : min(V_FRACT(x), 0.99999999999999999) 28475ffd83dbSDimitry Andric // 28485ffd83dbSDimitry Andric // Convert floor(x) to (x - fract(x)) 28495ffd83dbSDimitry Andric 28505ffd83dbSDimitry Andric auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 28515ffd83dbSDimitry Andric .addUse(OrigSrc) 28525ffd83dbSDimitry Andric .setMIFlags(Flags); 28535ffd83dbSDimitry Andric 28545ffd83dbSDimitry Andric // Give source modifier matching some assistance before obscuring a foldable 28555ffd83dbSDimitry Andric // pattern. 28565ffd83dbSDimitry Andric 28575ffd83dbSDimitry Andric // TODO: We can avoid the neg on the fract? The input sign to fract 28585ffd83dbSDimitry Andric // shouldn't matter? 28595ffd83dbSDimitry Andric Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 28605ffd83dbSDimitry Andric 28615ffd83dbSDimitry Andric auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 28625ffd83dbSDimitry Andric 28635ffd83dbSDimitry Andric Register Min = MRI.createGenericVirtualRegister(S64); 28645ffd83dbSDimitry Andric 28655ffd83dbSDimitry Andric // We don't need to concern ourselves with the snan handling difference, so 28665ffd83dbSDimitry Andric // use the one which will directly select. 
28675ffd83dbSDimitry Andric const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 28685ffd83dbSDimitry Andric if (MFI->getMode().IEEE) 28695ffd83dbSDimitry Andric B.buildFMinNumIEEE(Min, Fract, Const, Flags); 28705ffd83dbSDimitry Andric else 28715ffd83dbSDimitry Andric B.buildFMinNum(Min, Fract, Const, Flags); 28725ffd83dbSDimitry Andric 28735ffd83dbSDimitry Andric Register CorrectedFract = Min; 28745ffd83dbSDimitry Andric if (!MI.getFlag(MachineInstr::FmNoNans)) { 28755ffd83dbSDimitry Andric auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 28765ffd83dbSDimitry Andric CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 28775ffd83dbSDimitry Andric } 28785ffd83dbSDimitry Andric 28795ffd83dbSDimitry Andric auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 28805ffd83dbSDimitry Andric B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 28815ffd83dbSDimitry Andric 28825ffd83dbSDimitry Andric MI.eraseFromParent(); 28835ffd83dbSDimitry Andric return true; 28845ffd83dbSDimitry Andric } 28855ffd83dbSDimitry Andric 28865ffd83dbSDimitry Andric // Turn an illegal packed v2s16 build vector into bit operations. 28875ffd83dbSDimitry Andric // TODO: This should probably be a bitcast action in LegalizerHelper. 
28885ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeBuildVector( 28895ffd83dbSDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 28905ffd83dbSDimitry Andric Register Dst = MI.getOperand(0).getReg(); 28915ffd83dbSDimitry Andric const LLT S32 = LLT::scalar(32); 2892fe6060f1SDimitry Andric assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16)); 28935ffd83dbSDimitry Andric 28945ffd83dbSDimitry Andric Register Src0 = MI.getOperand(1).getReg(); 28955ffd83dbSDimitry Andric Register Src1 = MI.getOperand(2).getReg(); 28965ffd83dbSDimitry Andric assert(MRI.getType(Src0) == LLT::scalar(16)); 28975ffd83dbSDimitry Andric 28985ffd83dbSDimitry Andric auto Merge = B.buildMerge(S32, {Src0, Src1}); 28995ffd83dbSDimitry Andric B.buildBitcast(Dst, Merge); 29005ffd83dbSDimitry Andric 29015ffd83dbSDimitry Andric MI.eraseFromParent(); 29025ffd83dbSDimitry Andric return true; 29035ffd83dbSDimitry Andric } 29045ffd83dbSDimitry Andric 290581ad6265SDimitry Andric // Build a big integer multiply or multiply-add using MAD_64_32 instructions. 290681ad6265SDimitry Andric // 290781ad6265SDimitry Andric // Source and accumulation registers must all be 32-bits. 290881ad6265SDimitry Andric // 290981ad6265SDimitry Andric // TODO: When the multiply is uniform, we should produce a code sequence 291081ad6265SDimitry Andric // that is better suited to instruction selection on the SALU. Instead of 291181ad6265SDimitry Andric // the outer loop going over parts of the result, the outer loop should go 291281ad6265SDimitry Andric // over parts of one of the factors. This should result in instruction 291381ad6265SDimitry Andric // selection that makes full use of S_ADDC_U32 instructions. 
void AMDGPULegalizerInfo::buildMultiply(
    LegalizerHelper &Helper, MutableArrayRef<Register> Accum,
    ArrayRef<Register> Src0, ArrayRef<Register> Src1,
    bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const {
  // Use (possibly empty) vectors of S1 registers to represent the set of
  // carries from one pair of positions to the next.
  using Carry = SmallVector<Register, 2>;

  MachineIRBuilder &B = Helper.MIRBuilder;

  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  // Lazily materialized zero constants, created at most once each.
  Register Zero32;
  Register Zero64;

  auto getZero32 = [&]() -> Register {
    if (!Zero32)
      Zero32 = B.buildConstant(S32, 0).getReg(0);
    return Zero32;
  };
  auto getZero64 = [&]() -> Register {
    if (!Zero64)
      Zero64 = B.buildConstant(S64, 0).getReg(0);
    return Zero64;
  };

  // Merge the given carries into the 32-bit LocalAccum, which is modified
  // in-place.
  //
  // Returns the carry-out, which is a single S1 register or null.
  auto mergeCarry =
      [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
        if (CarryIn.empty())
          return Register();

        bool HaveCarryOut = true;
        Register CarryAccum;
        if (CarryIn.size() == 1) {
          // A single carry into an empty accumulator is just a zext; no
          // addition and no carry-out are needed.
          if (!LocalAccum) {
            LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
            return Register();
          }

          CarryAccum = getZero32();
        } else {
          // Sum all but the last carry into CarryAccum; these adds cannot
          // overflow 32 bits, so their carry-outs are ignored.
          CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
          for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
            CarryAccum =
                B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
                    .getReg(0);
          }

          if (!LocalAccum) {
            LocalAccum = getZero32();
            HaveCarryOut = false;
          }
        }

        // Final add folds in LocalAccum and the last carry; its carry-out is
        // meaningful only when a real accumulator was present.
        auto Add =
            B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
        LocalAccum = Add.getReg(0);
        return HaveCarryOut ? Add.getReg(1) : Register();
      };

  // Build a multiply-add chain to compute
  //
  //   LocalAccum + (partial products at DstIndex)
  //              + (opportunistic subset of CarryIn)
  //
  // LocalAccum is an array of one or two 32-bit registers that are updated
  // in-place. The incoming registers may be null.
  //
  // In some edge cases, carry-ins can be consumed "for free". In that case,
  // the consumed carry bits are removed from CarryIn in-place.
  auto buildMadChain =
      [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
          -> Carry {
        // A one-register accumulator is only valid for the most significant
        // destination part; otherwise two registers are required.
        assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
               (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));

        Carry CarryOut;
        unsigned j0 = 0;

        // Use plain 32-bit multiplication for the most significant part of the
        // result by default.
        if (LocalAccum.size() == 1 &&
            (!UsePartialMad64_32 || !CarryIn.empty())) {
          do {
            // Pair (j0, j1) indexes the partial product Src0[j0]*Src1[j1]
            // contributing to destination part DstIndex.
            unsigned j1 = DstIndex - j0;
            auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
            if (!LocalAccum[0]) {
              LocalAccum[0] = Mul.getReg(0);
            } else {
              if (CarryIn.empty()) {
                LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
              } else {
                // Consume one pending carry "for free" via the add-with-carry.
                LocalAccum[0] =
                    B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
                        .getReg(0);
                CarryIn.pop_back();
              }
            }
            ++j0;
          } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
        }

        // Build full 64-bit multiplies.
        if (j0 <= DstIndex) {
          bool HaveSmallAccum = false;
          Register Tmp;

          // Seed the 64-bit accumulator from whatever 32-bit state exists.
          // HaveSmallAccum records that the high half is known-zero (or
          // ignorable), so the first MAD's carry-out can be dropped.
          if (LocalAccum[0]) {
            if (LocalAccum.size() == 1) {
              Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
              HaveSmallAccum = true;
            } else if (LocalAccum[1]) {
              Tmp = B.buildMerge(S64, LocalAccum).getReg(0);
              HaveSmallAccum = false;
            } else {
              Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
              HaveSmallAccum = true;
            }
          } else {
            assert(LocalAccum.size() == 1 || !LocalAccum[1]);
            Tmp = getZero64();
            HaveSmallAccum = true;
          }

          do {
            unsigned j1 = DstIndex - j0;
            auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
                                    {Src0[j0], Src1[j1], Tmp});
            Tmp = Mad.getReg(0);
            if (!HaveSmallAccum)
              CarryOut.push_back(Mad.getReg(1));
            HaveSmallAccum = false;
            ++j0;
          } while (j0 <= DstIndex);

          // Split the 64-bit accumulator back into 32-bit destination parts.
          auto Unmerge = B.buildUnmerge(S32, Tmp);
          LocalAccum[0] = Unmerge.getReg(0);
          if (LocalAccum.size() > 1)
            LocalAccum[1] = Unmerge.getReg(1);
        }

        return CarryOut;
      };

  // Outer multiply loop, iterating over destination parts from least
  // significant to most significant parts.
  //
  // The columns of the following diagram correspond to the destination parts
  // affected by one iteration of the outer loop (ignoring boundary
  // conditions).
  //
  //   Dest index relative to 2 * i:      1 0 -1
  //                                      ------
  //   Carries from previous iteration:     e o
  //   Even-aligned partial product sum:  E E .
  //   Odd-aligned partial product sum:     O O
  //
  // 'o' is OddCarry, 'e' is EvenCarry.
  // EE and OO are computed from partial products via buildMadChain and use
  // accumulation where possible and appropriate.
  //
  Register SeparateOddCarry;
  Carry EvenCarry;
  Carry OddCarry;

  for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
    Carry OddCarryIn = std::move(OddCarry);
    Carry EvenCarryIn = std::move(EvenCarry);
    OddCarry.clear();
    EvenCarry.clear();

    // Partial products at offset 2 * i.
    if (2 * i < Accum.size()) {
      auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
      EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
    }

    // Partial products at offset 2 * i - 1.
    if (i > 0) {
      if (!SeparateOddAlignedProducts) {
        auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
        OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
      } else {
        // Compute the odd-aligned sum into a separate temporary and add it
        // into Accum afterwards.
        bool IsHighest = 2 * i >= Accum.size();
        Register SeparateOddOut[2];
        auto LocalAccum = makeMutableArrayRef(SeparateOddOut)
                              .take_front(IsHighest ? 1 : 2);
        OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);

        MachineInstr *Lo;

        if (i == 1) {
          // First odd column: no incoming SeparateOddCarry yet.
          if (!IsHighest)
            Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
          else
            Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
        } else {
          Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
                            SeparateOddCarry);
        }
        Accum[2 * i - 1] = Lo->getOperand(0).getReg();

        if (!IsHighest) {
          auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
                                 Lo->getOperand(1).getReg());
          Accum[2 * i] = Hi.getReg(0);
          SeparateOddCarry = Hi.getReg(1);
        }
      }
    }

    // Add in the carries from the previous iteration
    if (i > 0) {
      if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
        EvenCarryIn.push_back(CarryOut);

      if (2 * i < Accum.size()) {
        if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
          OddCarry.push_back(CarryOut);
      }
    }
  }
}

// Custom narrowing of wide multiplies using wide multiply-add instructions.
//
// TODO: If the multiply is followed by an addition, we should attempt to
// integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
314881ad6265SDimitry Andric bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper, 314981ad6265SDimitry Andric MachineInstr &MI) const { 315081ad6265SDimitry Andric assert(ST.hasMad64_32()); 315181ad6265SDimitry Andric assert(MI.getOpcode() == TargetOpcode::G_MUL); 315281ad6265SDimitry Andric 315381ad6265SDimitry Andric MachineIRBuilder &B = Helper.MIRBuilder; 315481ad6265SDimitry Andric MachineRegisterInfo &MRI = *B.getMRI(); 315581ad6265SDimitry Andric 315681ad6265SDimitry Andric Register DstReg = MI.getOperand(0).getReg(); 315781ad6265SDimitry Andric Register Src0 = MI.getOperand(1).getReg(); 315881ad6265SDimitry Andric Register Src1 = MI.getOperand(2).getReg(); 315981ad6265SDimitry Andric 316081ad6265SDimitry Andric LLT Ty = MRI.getType(DstReg); 316181ad6265SDimitry Andric assert(Ty.isScalar()); 316281ad6265SDimitry Andric 316381ad6265SDimitry Andric unsigned Size = Ty.getSizeInBits(); 316481ad6265SDimitry Andric unsigned NumParts = Size / 32; 316581ad6265SDimitry Andric assert((Size % 32) == 0); 316681ad6265SDimitry Andric assert(NumParts >= 2); 316781ad6265SDimitry Andric 316881ad6265SDimitry Andric // Whether to use MAD_64_32 for partial products whose high half is 316981ad6265SDimitry Andric // discarded. This avoids some ADD instructions but risks false dependency 317081ad6265SDimitry Andric // stalls on some subtargets in some cases. 317181ad6265SDimitry Andric const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10; 317281ad6265SDimitry Andric 317381ad6265SDimitry Andric // Whether to compute odd-aligned partial products separately. This is 317481ad6265SDimitry Andric // advisable on subtargets where the accumulator of MAD_64_32 must be placed 317581ad6265SDimitry Andric // in an even-aligned VGPR. 
317681ad6265SDimitry Andric const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops(); 317781ad6265SDimitry Andric 317881ad6265SDimitry Andric LLT S32 = LLT::scalar(32); 317981ad6265SDimitry Andric SmallVector<Register, 2> Src0Parts, Src1Parts; 318081ad6265SDimitry Andric for (unsigned i = 0; i < NumParts; ++i) { 318181ad6265SDimitry Andric Src0Parts.push_back(MRI.createGenericVirtualRegister(S32)); 318281ad6265SDimitry Andric Src1Parts.push_back(MRI.createGenericVirtualRegister(S32)); 318381ad6265SDimitry Andric } 318481ad6265SDimitry Andric B.buildUnmerge(Src0Parts, Src0); 318581ad6265SDimitry Andric B.buildUnmerge(Src1Parts, Src1); 318681ad6265SDimitry Andric 318781ad6265SDimitry Andric SmallVector<Register, 2> AccumRegs(NumParts); 318881ad6265SDimitry Andric buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32, 318981ad6265SDimitry Andric SeparateOddAlignedProducts); 319081ad6265SDimitry Andric 319181ad6265SDimitry Andric B.buildMerge(DstReg, AccumRegs); 319281ad6265SDimitry Andric MI.eraseFromParent(); 319381ad6265SDimitry Andric return true; 319481ad6265SDimitry Andric 319581ad6265SDimitry Andric } 319681ad6265SDimitry Andric 3197349cc55cSDimitry Andric // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to 3198349cc55cSDimitry Andric // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input 3199349cc55cSDimitry Andric // case with a single min instruction instead of a compare+select. 
3200349cc55cSDimitry Andric bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI, 3201349cc55cSDimitry Andric MachineRegisterInfo &MRI, 3202349cc55cSDimitry Andric MachineIRBuilder &B) const { 3203349cc55cSDimitry Andric Register Dst = MI.getOperand(0).getReg(); 3204349cc55cSDimitry Andric Register Src = MI.getOperand(1).getReg(); 3205349cc55cSDimitry Andric LLT DstTy = MRI.getType(Dst); 3206349cc55cSDimitry Andric LLT SrcTy = MRI.getType(Src); 3207349cc55cSDimitry Andric 3208349cc55cSDimitry Andric unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ 3209349cc55cSDimitry Andric ? AMDGPU::G_AMDGPU_FFBH_U32 3210349cc55cSDimitry Andric : AMDGPU::G_AMDGPU_FFBL_B32; 3211349cc55cSDimitry Andric auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src}); 3212349cc55cSDimitry Andric B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits())); 3213349cc55cSDimitry Andric 3214349cc55cSDimitry Andric MI.eraseFromParent(); 3215349cc55cSDimitry Andric return true; 3216349cc55cSDimitry Andric } 3217349cc55cSDimitry Andric 3218e8d8bef9SDimitry Andric // Check that this is a G_XOR x, -1 3219e8d8bef9SDimitry Andric static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 3220e8d8bef9SDimitry Andric if (MI.getOpcode() != TargetOpcode::G_XOR) 3221e8d8bef9SDimitry Andric return false; 3222349cc55cSDimitry Andric auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 3223e8d8bef9SDimitry Andric return ConstVal && *ConstVal == -1; 3224e8d8bef9SDimitry Andric } 3225e8d8bef9SDimitry Andric 32260b57cec5SDimitry Andric // Return the use branch instruction, otherwise null if the usage is invalid. 
// Verify that the single use of a control-flow intrinsic's condition is a
// conditional branch in the same block (possibly through one G_XOR x, -1,
// which is deleted and reported via \p Negated).
//
// On success, \p UncondBrTarget is the fallthrough/unconditional successor,
// \p Br is the trailing G_BR if one exists, and the G_BRCOND is returned.
// Returns null for any unsupported use pattern.
static MachineInstr *
verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
                  MachineBasicBlock *&UncondBrTarget, bool &Negated) {
  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))
    return nullptr;

  MachineBasicBlock *Parent = MI.getParent();
  MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);

  if (isNot(MRI, *UseMI)) {
    // Look through a single negation of the condition.
    Register NegatedCond = UseMI->getOperand(0).getReg();
    if (!MRI.hasOneNonDBGUse(NegatedCond))
      return nullptr;

    // We're deleting the def of this value, so we need to remove it.
    eraseInstr(*UseMI, MRI);

    UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
    Negated = true;
  }

  if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
    return nullptr;

  // Make sure the cond br is followed by a G_BR, or is the last instruction.
  MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
  if (Next == Parent->end()) {
    // No trailing G_BR: the unconditional target is the layout successor.
    MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
    if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
      return nullptr;
    UncondBrTarget = &*NextMBB;
  } else {
    if (Next->getOpcode() != AMDGPU::G_BR)
      return nullptr;
    Br = &*Next;
    UncondBrTarget = Br->getOperand(0).getMBB();
  }

  return UseMI;
}

// Copy a preloaded argument from its incoming physical register \p Arg into
// \p DstReg, applying the argument's shift/mask when it is packed with other
// values in the same register.
bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg,
                                         const TargetRegisterClass *ArgRC,
                                         LLT ArgTy) const {
  MCRegister SrcReg = Arg->getRegister();
  assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
  assert(DstReg.isVirtual() && "Virtual register expected");

  Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
                                             *ArgRC, B.getDebugLoc(), ArgTy);
  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
    const LLT S32 = LLT::scalar(32);
    const unsigned Mask = Arg->getMask();
    const unsigned Shift = countTrailingZeros<unsigned>(Mask);

    Register AndMaskSrc = LiveIn;

    // TODO: Avoid clearing the high bits if we know workitem id y/z are always
    // 0.
    if (Shift != 0) {
      auto ShiftAmt = B.buildConstant(S32, Shift);
      AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
    }

    B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
  } else {
    B.buildCopy(DstReg, LiveIn);
  }

  return true;
}

// Load the preloaded value identified by \p ArgType into \p DstReg, handling
// missing-argument cases with a constant or undef instead of failing.
bool AMDGPULegalizerInfo::loadInputValue(
  Register DstReg, MachineIRBuilder &B,
  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  const ArgDescriptor *Arg;
  const TargetRegisterClass *ArgRC;
  LLT ArgTy;
  std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);

  if (!Arg) {
    if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
      // The intrinsic may appear when we have a 0 sized kernarg segment, in which
      // case the pointer argument may be missing and we use null.
      B.buildConstant(DstReg, 0);
      return true;
    }

    // It's undefined behavior if a function marked with the amdgpu-no-*
    // attributes uses the corresponding intrinsic.
    B.buildUndef(DstReg);
    return true;
  }

  if (!Arg->isRegister() || !Arg->getRegister().isValid())
    return false; // TODO: Handle these
  return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
}

// Replace an intrinsic that reads a preloaded argument with the loaded value.
bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
  AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
    return false;

  MI.eraseFromParent();
  return true;
}

// Replace \p MI's result with the constant \p C and erase it.
static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
                                int64_t C) {
  B.buildConstant(MI.getOperand(0).getReg(), C);
  MI.eraseFromParent();
  return true;
}

// Legalize a workitem-id intrinsic for dimension \p Dim, folding to a
// constant 0 when the dimension's maximum ID is 0 and asserting the known
// zero high bits otherwise.
bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
  MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
  unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
  if (MaxID == 0)
    return replaceWithConstant(B, MI, 0);

  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  const ArgDescriptor *Arg;
  const TargetRegisterClass *ArgRC;
  LLT ArgTy;
  std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);

  Register DstReg = MI.getOperand(0).getReg();
  if (!Arg) {
    // It's undefined behavior if a function marked with the amdgpu-no-*
    // attributes uses the corresponding intrinsic.
    B.buildUndef(DstReg);
    MI.eraseFromParent();
    return true;
  }

  if (Arg->isMasked()) {
    // Don't bother inserting AssertZext for packed IDs since we're emitting the
    // masking operations anyway.
    //
    // TODO: We could assert the top bit is 0 for the source copy.
    if (!loadInputValue(DstReg, B, ArgType))
      return false;
  } else {
    Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
    if (!loadInputValue(TmpReg, B, ArgType))
      return false;
    // Only the low bits needed to represent MaxID can be set.
    B.buildAssertZExt(DstReg, TmpReg, 32 - countLeadingZeros(MaxID));
  }

  MI.eraseFromParent();
  return true;
}

// Compute a pointer into the kernarg segment at \p Offset bytes from its
// preloaded base pointer.
Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
                                                     int64_t Offset) const {
  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);

  // TODO: If we passed in the base kernel offset we could have a better
  // alignment than 4, but we don't really need it.
  if (!loadInputValue(KernArgReg, B,
                      AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
    llvm_unreachable("failed to find kernarg segment ptr");

  auto COffset = B.buildConstant(LLT::scalar(64), Offset);
  // TODO: Should get nuw
  return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
}

/// Legalize a value that's loaded from kernel arguments. This is only used by
/// legacy intrinsics.
340581ad6265SDimitry Andric bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI, 340681ad6265SDimitry Andric MachineIRBuilder &B, 340781ad6265SDimitry Andric uint64_t Offset, 340881ad6265SDimitry Andric Align Alignment) const { 340981ad6265SDimitry Andric Register DstReg = MI.getOperand(0).getReg(); 341081ad6265SDimitry Andric 341181ad6265SDimitry Andric assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) && 341281ad6265SDimitry Andric "unexpected kernarg parameter type"); 341381ad6265SDimitry Andric 341481ad6265SDimitry Andric Register Ptr = getKernargParameterPtr(B, Offset); 341581ad6265SDimitry Andric MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 341681ad6265SDimitry Andric B.buildLoad(DstReg, Ptr, PtrInfo, Align(4), 341781ad6265SDimitry Andric MachineMemOperand::MODereferenceable | 341881ad6265SDimitry Andric MachineMemOperand::MOInvariant); 341981ad6265SDimitry Andric MI.eraseFromParent(); 342081ad6265SDimitry Andric return true; 342181ad6265SDimitry Andric } 342281ad6265SDimitry Andric 34238bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 34248bcb0991SDimitry Andric MachineRegisterInfo &MRI, 34258bcb0991SDimitry Andric MachineIRBuilder &B) const { 3426480093f4SDimitry Andric Register Dst = MI.getOperand(0).getReg(); 3427480093f4SDimitry Andric LLT DstTy = MRI.getType(Dst); 3428480093f4SDimitry Andric LLT S16 = LLT::scalar(16); 3429480093f4SDimitry Andric LLT S32 = LLT::scalar(32); 3430480093f4SDimitry Andric LLT S64 = LLT::scalar(64); 34318bcb0991SDimitry Andric 3432480093f4SDimitry Andric if (DstTy == S16) 3433480093f4SDimitry Andric return legalizeFDIV16(MI, MRI, B); 3434480093f4SDimitry Andric if (DstTy == S32) 3435480093f4SDimitry Andric return legalizeFDIV32(MI, MRI, B); 3436480093f4SDimitry Andric if (DstTy == S64) 3437480093f4SDimitry Andric return legalizeFDIV64(MI, MRI, B); 3438480093f4SDimitry Andric 34398bcb0991SDimitry Andric return false; 34408bcb0991SDimitry Andric } 

/// Expand a 32-bit unsigned division and/or remainder.
///
/// Either \p DstDivReg or \p DstRemReg may be an invalid register, in which
/// case that result is simply not produced (only the refinement selects for
/// the requested results are emitted).
void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
                                                        Register DstDivReg,
                                                        Register DstRemReg,
                                                        Register X,
                                                        Register Y) const {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);

  // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
  // algorithm used here.

  // Initial estimate of inv(y).
  auto FloatY = B.buildUITOFP(S32, Y);
  auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
  auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
  auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
  auto Z = B.buildFPTOUI(S32, ScaledY);

  // One round of UNR.
  auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
  auto NegYZ = B.buildMul(S32, NegY, Z);
  Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));

  // Quotient/remainder estimate.
  auto Q = B.buildUMulH(S32, X, Z);
  auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));

  // First quotient/remainder refinement.
  auto One = B.buildConstant(S32, 1);
  auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
  if (DstDivReg)
    Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
  // The refined remainder is needed unconditionally: the second refinement's
  // compare below reads it even when only the quotient is requested.
  R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);

  // Second quotient/remainder refinement.
  Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
  if (DstDivReg)
    B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);

  if (DstRemReg)
    B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
}

// Build integer reciprocal sequence around V_RCP_IFLAG_F32
//
// Return lo, hi of result
//
// %cvt.lo = G_UITOFP Val.lo
// %cvt.hi = G_UITOFP Val.hi
// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
// %rcp = G_AMDGPU_RCP_IFLAG %mad
// %mul1 = G_FMUL %rcp, 0x5f7ffffc
// %mul2 = G_FMUL %mul1, 2**(-32)
// %trunc = G_INTRINSIC_TRUNC %mul2
// %mad2 = G_FMAD %trunc, -(2**32), %mul1
// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
                                                       Register Val) {
  const LLT S32 = LLT::scalar(32);
  auto Unmerge = B.buildUnmerge(S32, Val);

  auto
CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));

  auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
                         B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);

  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
  auto Mul1 =
      B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));

  // 2**(-32)
  auto Mul2 =
      B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
  auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);

  // -(2**32)
  auto Mad2 = B.buildFMAD(S32, Trunc,
                          B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);

  auto ResultLo = B.buildFPTOUI(S32, Mad2);
  auto ResultHi = B.buildFPTOUI(S32, Trunc);

  return {ResultLo.getReg(0), ResultHi.getReg(0)};
}

/// Expand a 64-bit unsigned division and/or remainder.
///
/// Starts from the reciprocal estimate produced by emitReciprocalU64, refines
/// it with two rounds of multiprecision (32-bit limb, carry-chained)
/// Newton-style correction, then computes a quotient estimate and at most two
/// conditional corrections. As with the 32-bit variant, \p DstDivReg and/or
/// \p DstRemReg may be invalid, skipping that final result.
void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
                                                        Register DstDivReg,
                                                        Register DstRemReg,
                                                        Register Numer,
                                                        Register Denom) const {
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S1 = LLT::scalar(1);
  Register RcpLo, RcpHi;

  std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);

  auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});

  auto Zero64 = B.buildConstant(S64, 0);
  auto NegDenom = B.buildSub(S64, Zero64, Denom);

  // First correction round: Rcp += Rcp * umulh(Rcp, -Denom * Rcp).
  auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
  auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);

  auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
  Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
  Register MulHi1_Hi = UnmergeMulHi1.getReg(1);

  auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
  auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
  auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});

  // Second correction round, same shape as the first.
  auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
  auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
  auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
  Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
  Register MulHi2_Hi = UnmergeMulHi2.getReg(1);

  auto Zero32 = B.buildConstant(S32, 0);
  auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
  auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
  auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});

  auto UnmergeNumer = B.buildUnmerge(S32, Numer);
  Register NumerLo = UnmergeNumer.getReg(0);
  Register NumerHi = UnmergeNumer.getReg(1);

  // Quotient estimate and the corresponding remainder Numer - Denom * Q.
  auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
  auto Mul3 = B.buildMul(S64, Denom, MulHi3);
  auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
  Register Mul3_Lo = UnmergeMul3.getReg(0);
  Register Mul3_Hi = UnmergeMul3.getReg(1);
  auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
  auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
  auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
  auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});

  auto UnmergeDenom = B.buildUnmerge(S32, Denom);
  Register DenomLo = UnmergeDenom.getReg(0);
  Register DenomHi = UnmergeDenom.getReg(1);

  // C3 != 0 <=> remainder estimate >= Denom (compare hi limbs, fall back to
  // lo limbs on equality).
  auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
  auto C1 = B.buildSExt(S32, CmpHi);

  auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
  auto C2 = B.buildSExt(S32, CmpLo);

  auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
  auto C3 = B.buildSelect(S32, CmpEq, C2, C1);

  // TODO: Here and below portions of the code can be enclosed into if/endif.
  // Currently control flow is unconditional and we have 4 selects after
  // potential endif to substitute PHIs.

  // if C3 != 0 ...
  auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
  auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
  auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
  auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});

  auto One64 = B.buildConstant(S64, 1);
  auto Add3 = B.buildAdd(S64, MulHi3, One64);

  auto C4 =
      B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
  auto C5 =
      B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
  auto C6 = B.buildSelect(
      S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);

  // if (C6 != 0)
  auto Add4 = B.buildAdd(S64, Add3, One64);
  auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);

  auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
  auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
  auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});

  // endif C6
  // endif C3

  if (DstDivReg) {
    auto Sel1 = B.buildSelect(
        S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
    B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
                  Sel1, MulHi3);
  }

  if (DstRemReg) {
    auto Sel2 = B.buildSelect(
        S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
    B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
                  Sel2, Sub1);
  }
}

/// Legalize G_UDIV, G_UREM and G_UDIVREM (32- and 64-bit scalars only).
/// The opcode determines which destination registers are populated.
bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
                                                  MachineRegisterInfo &MRI,
                                                  MachineIRBuilder &B) const {
  Register DstDivReg, DstRemReg;
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unexpected opcode!");
  case AMDGPU::G_UDIV: {
    DstDivReg = MI.getOperand(0).getReg();
    break;
  }
  case AMDGPU::G_UREM: {
    DstRemReg = MI.getOperand(0).getReg();
    break;
  }
  case AMDGPU::G_UDIVREM: {
    DstDivReg = MI.getOperand(0).getReg();
    DstRemReg = MI.getOperand(1).getReg();
    break;
  }
  }

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);
  // G_UDIVREM has two defs, so locate the sources after the last def.
  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
  Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
  Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
  LLT Ty =
MRI.getType(MI.getOperand(0).getReg());

  if (Ty == S32)
    legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
  else if (Ty == S64)
    legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
  else
    return false;

  MI.eraseFromParent();
  return true;
}

/// Legalize G_SDIV, G_SREM and G_SDIVREM (32- and 64-bit scalars only) by
/// reducing to the unsigned expansion on absolute values, then fixing up the
/// signs of the results.
bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
                                                MachineRegisterInfo &MRI,
                                                MachineIRBuilder &B) const {
  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  if (Ty != S32 && Ty != S64)
    return false;

  // Sources follow the defs (G_SDIVREM has two defs).
  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
  Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
  Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();

  // sign = x >> (bits - 1), then |x| = (x + sign) ^ sign.
  auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
  auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
  auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);

  LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);

  LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);

  Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unexpected opcode!");
  case AMDGPU::G_SDIV: {
    DstDivReg = MI.getOperand(0).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);
    break;
  }
  case AMDGPU::G_SREM: {
    DstRemReg = MI.getOperand(0).getReg();
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);
    break;
  }
  case AMDGPU::G_SDIVREM: {
    DstDivReg = MI.getOperand(0).getReg();
    DstRemReg = MI.getOperand(1).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);
    break;
  }
  }

  if (Ty == S32)
    legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
  else
    legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);

  // Reapply signs: result = (tmp ^ sign) - sign negates when sign == -1.
  if (DstDivReg) {
    auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
    auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
    B.buildSub(DstDivReg, SignXor, Sign);
  }

  if (DstRemReg) {
    auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
    auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
    B.buildSub(DstRemReg, SignXor, Sign);
  }

  MI.eraseFromParent();
  return true;
}

/// Try to lower an fdiv to a multiply by the reciprocal, which is only
/// permitted under unsafe-fp-math or the afn fast-math flag. Returns false
/// (leaving MI in place) when the relaxation is not allowed.
bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
  uint16_t Flags = MI.getFlags();
  LLT ResTy = MRI.getType(Res);

  const MachineFunction &MF = B.getMF();
  bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
                            MI.getFlag(MachineInstr::FmAfn);

  if (!AllowInaccurateRcp)
    return false;

  if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
    // 1 / x -> RCP(x)
    if (CLHS->isExactlyValue(1.0)) {
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
          .addUse(RHS)
          .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }

    // -1 / x -> RCP( FNEG(x) )
    if
(CLHS->isExactlyValue(-1.0)) {
      auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
          .addUse(FNeg.getReg(0))
          .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }
  }

  // x / y -> x * (1.0 / y)
  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
                 .addUse(RHS)
                 .setMIFlags(Flags);
  B.buildFMul(Res, LHS, RCP, Flags);

  MI.eraseFromParent();
  return true;
}

/// 64-bit variant of the fast/unsafe fdiv lowering: start from amdgcn_rcp,
/// refine with two Newton-Raphson steps (r = r + r * (1 - y*r), expressed as
/// FMAs), then one residual-correction FMA on the final product. Only allowed
/// under unsafe-fp-math or the afn flag.
bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
                                                   MachineRegisterInfo &MRI,
                                                   MachineIRBuilder &B) const {
  Register Res = MI.getOperand(0).getReg();
  Register X = MI.getOperand(1).getReg();
  Register Y = MI.getOperand(2).getReg();
  uint16_t Flags = MI.getFlags();
  LLT ResTy = MRI.getType(Res);

  const MachineFunction &MF = B.getMF();
  bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
                            MI.getFlag(MachineInstr::FmAfn);

  if (!AllowInaccurateRcp)
    return false;

  auto NegY = B.buildFNeg(ResTy, Y);
  auto One = B.buildFConstant(ResTy, 1.0);

  auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
               .addUse(Y)
               .setMIFlags(Flags);

  auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp0, R, R);

  auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp1, R, R);

  auto Ret = B.buildFMul(ResTy, X, R);
  auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);

  B.buildFMA(Res, Tmp2, R, Ret);
  MI.eraseFromParent();
  return true;
}

/// Legalize a 16-bit fdiv: after trying the fast/unsafe path, extend both
/// operands to f32, multiply by the f32 reciprocal, truncate back to f16 and
/// run amdgcn_div_fixup on the result.
bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  if (legalizeFastUnsafeFDIV(MI, MRI, B))
    return true;

  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S16 = LLT::scalar(16);
  LLT S32 = LLT::scalar(32);

  auto LHSExt = B.buildFPExt(S32, LHS, Flags);
  auto RHSExt = B.buildFPExt(S32, RHS, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
                 .addUse(RHSExt.getReg(0))
                 .setMIFlags(Flags);

  auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
  auto RDst = B.buildFPTrunc(S16, QUOT, Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
      .addUse(RDst.getReg(0))
      .addUse(RHS)
      .addUse(LHS)
      .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}

// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
// to enable denorm mode. When 'Enable' is false, disable denorm mode.
static void toggleSPDenormMode(bool Enable,
                               MachineIRBuilder &B,
                               const GCNSubtarget &ST,
                               AMDGPU::SIModeRegisterDefaults Mode) {
  // Set SP denorm mode to this value.
  unsigned SPDenormMode =
      Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();

  if (ST.hasDenormModeInst()) {
    // Preserve default FP64FP16 denorm mode while updating FP32 mode.
uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();

    uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
    B.buildInstr(AMDGPU::S_DENORM_MODE)
        .addImm(NewDenormModeValue);

  } else {
    // Select FP32 bit field in mode register.
    unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
                                    (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
                                    (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);

    B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
        .addImm(SPDenormMode)
        .addImm(SPDenormModeBitField);
  }
}

/// Legalize a 32-bit fdiv via the div_scale / div_fmas / div_fixup sequence.
/// If the function does not have FP32 denormals enabled, denorm mode is
/// temporarily switched on around the FMA chain (required by the sequence;
/// see the FIXME about the mode switch not being modeled).
bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  if (legalizeFastUnsafeFDIV(MI, MRI, B))
    return true;

  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();

  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S32, 1.0f);

  // div_scale's final immediate selects which operand the scale applies to
  // (0 = denominator, 1 = numerator); the S1 result feeds div_fmas below.
  auto DenominatorScaled =
      B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
          .addUse(LHS)
          .addUse(RHS)
          .addImm(0)
          .setMIFlags(Flags);
  auto NumeratorScaled =
      B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
          .addUse(LHS)
          .addUse(RHS)
          .addImm(1)
          .setMIFlags(Flags);

  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
                       .addUse(DenominatorScaled.getReg(0))
                       .setMIFlags(Flags);
  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);

  // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
  // aren't modeled as reading it.
  if (!Mode.allFP32Denormals())
    toggleSPDenormMode(true, B, ST, Mode);

  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

  if (!Mode.allFP32Denormals())
    toggleSPDenormMode(false, B, ST, Mode);

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
                  .addUse(Fma4.getReg(0))
                  .addUse(Fma1.getReg(0))
                  .addUse(Fma3.getReg(0))
                  .addUse(NumeratorScaled.getReg(1))
                  .setMIFlags(Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
      .addUse(Fmas.getReg(0))
      .addUse(RHS)
      .addUse(LHS)
      .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  if (legalizeFastUnsafeFDIV64(MI, MRI, B))
3972e8d8bef9SDimitry Andric return true; 3973e8d8bef9SDimitry Andric 3974480093f4SDimitry Andric Register Res = MI.getOperand(0).getReg(); 3975480093f4SDimitry Andric Register LHS = MI.getOperand(1).getReg(); 3976480093f4SDimitry Andric Register RHS = MI.getOperand(2).getReg(); 3977480093f4SDimitry Andric 3978480093f4SDimitry Andric uint16_t Flags = MI.getFlags(); 3979480093f4SDimitry Andric 3980480093f4SDimitry Andric LLT S64 = LLT::scalar(64); 3981480093f4SDimitry Andric LLT S1 = LLT::scalar(1); 3982480093f4SDimitry Andric 3983480093f4SDimitry Andric auto One = B.buildFConstant(S64, 1.0); 3984480093f4SDimitry Andric 3985480093f4SDimitry Andric auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3986480093f4SDimitry Andric .addUse(LHS) 3987480093f4SDimitry Andric .addUse(RHS) 39885ffd83dbSDimitry Andric .addImm(0) 3989480093f4SDimitry Andric .setMIFlags(Flags); 3990480093f4SDimitry Andric 3991480093f4SDimitry Andric auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 3992480093f4SDimitry Andric 3993480093f4SDimitry Andric auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 3994480093f4SDimitry Andric .addUse(DivScale0.getReg(0)) 3995480093f4SDimitry Andric .setMIFlags(Flags); 3996480093f4SDimitry Andric 3997480093f4SDimitry Andric auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 3998480093f4SDimitry Andric auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 3999480093f4SDimitry Andric auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 4000480093f4SDimitry Andric 4001480093f4SDimitry Andric auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 4002480093f4SDimitry Andric .addUse(LHS) 4003480093f4SDimitry Andric .addUse(RHS) 40045ffd83dbSDimitry Andric .addImm(1) 4005480093f4SDimitry Andric .setMIFlags(Flags); 4006480093f4SDimitry Andric 4007480093f4SDimitry Andric auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 40085ffd83dbSDimitry Andric auto Mul = 
B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); 4009480093f4SDimitry Andric auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 4010480093f4SDimitry Andric 4011480093f4SDimitry Andric Register Scale; 4012480093f4SDimitry Andric if (!ST.hasUsableDivScaleConditionOutput()) { 4013480093f4SDimitry Andric // Workaround a hardware bug on SI where the condition output from div_scale 4014480093f4SDimitry Andric // is not usable. 4015480093f4SDimitry Andric 4016480093f4SDimitry Andric LLT S32 = LLT::scalar(32); 4017480093f4SDimitry Andric 4018480093f4SDimitry Andric auto NumUnmerge = B.buildUnmerge(S32, LHS); 4019480093f4SDimitry Andric auto DenUnmerge = B.buildUnmerge(S32, RHS); 4020480093f4SDimitry Andric auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 4021480093f4SDimitry Andric auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 4022480093f4SDimitry Andric 4023480093f4SDimitry Andric auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 4024480093f4SDimitry Andric Scale1Unmerge.getReg(1)); 4025480093f4SDimitry Andric auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 4026480093f4SDimitry Andric Scale0Unmerge.getReg(1)); 40275ffd83dbSDimitry Andric Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); 4028480093f4SDimitry Andric } else { 4029480093f4SDimitry Andric Scale = DivScale1.getReg(1); 4030480093f4SDimitry Andric } 4031480093f4SDimitry Andric 4032480093f4SDimitry Andric auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 4033480093f4SDimitry Andric .addUse(Fma4.getReg(0)) 4034480093f4SDimitry Andric .addUse(Fma3.getReg(0)) 4035480093f4SDimitry Andric .addUse(Mul.getReg(0)) 4036480093f4SDimitry Andric .addUse(Scale) 4037480093f4SDimitry Andric .setMIFlags(Flags); 4038480093f4SDimitry Andric 4039480093f4SDimitry Andric B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 4040480093f4SDimitry Andric .addUse(Fmas.getReg(0)) 4041480093f4SDimitry Andric .addUse(RHS) 
4042480093f4SDimitry Andric .addUse(LHS) 4043480093f4SDimitry Andric .setMIFlags(Flags); 4044480093f4SDimitry Andric 4045480093f4SDimitry Andric MI.eraseFromParent(); 4046480093f4SDimitry Andric return true; 4047480093f4SDimitry Andric } 4048480093f4SDimitry Andric 40498bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 40508bcb0991SDimitry Andric MachineRegisterInfo &MRI, 40518bcb0991SDimitry Andric MachineIRBuilder &B) const { 40528bcb0991SDimitry Andric Register Res = MI.getOperand(0).getReg(); 40538bcb0991SDimitry Andric Register LHS = MI.getOperand(2).getReg(); 40548bcb0991SDimitry Andric Register RHS = MI.getOperand(3).getReg(); 40558bcb0991SDimitry Andric uint16_t Flags = MI.getFlags(); 40568bcb0991SDimitry Andric 40578bcb0991SDimitry Andric LLT S32 = LLT::scalar(32); 40588bcb0991SDimitry Andric LLT S1 = LLT::scalar(1); 40598bcb0991SDimitry Andric 40608bcb0991SDimitry Andric auto Abs = B.buildFAbs(S32, RHS, Flags); 40618bcb0991SDimitry Andric const APFloat C0Val(1.0f); 40628bcb0991SDimitry Andric 40638bcb0991SDimitry Andric auto C0 = B.buildConstant(S32, 0x6f800000); 40648bcb0991SDimitry Andric auto C1 = B.buildConstant(S32, 0x2f800000); 40658bcb0991SDimitry Andric auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 40668bcb0991SDimitry Andric 40678bcb0991SDimitry Andric auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 40688bcb0991SDimitry Andric auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 40698bcb0991SDimitry Andric 40708bcb0991SDimitry Andric auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 40718bcb0991SDimitry Andric 40728bcb0991SDimitry Andric auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 40738bcb0991SDimitry Andric .addUse(Mul0.getReg(0)) 40748bcb0991SDimitry Andric .setMIFlags(Flags); 40758bcb0991SDimitry Andric 40768bcb0991SDimitry Andric auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 40778bcb0991SDimitry Andric 40788bcb0991SDimitry Andric B.buildFMul(Res, Sel, Mul1, 
Flags); 40798bcb0991SDimitry Andric 40808bcb0991SDimitry Andric MI.eraseFromParent(); 40818bcb0991SDimitry Andric return true; 40828bcb0991SDimitry Andric } 40838bcb0991SDimitry Andric 4084e8d8bef9SDimitry Andric // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 4085e8d8bef9SDimitry Andric // FIXME: Why do we handle this one but not other removed instructions? 4086e8d8bef9SDimitry Andric // 4087e8d8bef9SDimitry Andric // Reciprocal square root. The clamp prevents infinite results, clamping 4088e8d8bef9SDimitry Andric // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 4089e8d8bef9SDimitry Andric // +-max_float. 4090e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 4091e8d8bef9SDimitry Andric MachineRegisterInfo &MRI, 4092e8d8bef9SDimitry Andric MachineIRBuilder &B) const { 4093e8d8bef9SDimitry Andric if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 4094e8d8bef9SDimitry Andric return true; 4095e8d8bef9SDimitry Andric 4096e8d8bef9SDimitry Andric Register Dst = MI.getOperand(0).getReg(); 4097e8d8bef9SDimitry Andric Register Src = MI.getOperand(2).getReg(); 4098e8d8bef9SDimitry Andric auto Flags = MI.getFlags(); 4099e8d8bef9SDimitry Andric 4100e8d8bef9SDimitry Andric LLT Ty = MRI.getType(Dst); 4101e8d8bef9SDimitry Andric 4102e8d8bef9SDimitry Andric const fltSemantics *FltSemantics; 4103e8d8bef9SDimitry Andric if (Ty == LLT::scalar(32)) 4104e8d8bef9SDimitry Andric FltSemantics = &APFloat::IEEEsingle(); 4105e8d8bef9SDimitry Andric else if (Ty == LLT::scalar(64)) 4106e8d8bef9SDimitry Andric FltSemantics = &APFloat::IEEEdouble(); 4107e8d8bef9SDimitry Andric else 4108e8d8bef9SDimitry Andric return false; 4109e8d8bef9SDimitry Andric 4110e8d8bef9SDimitry Andric auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 4111e8d8bef9SDimitry Andric .addUse(Src) 4112e8d8bef9SDimitry Andric .setMIFlags(Flags); 4113e8d8bef9SDimitry Andric 4114e8d8bef9SDimitry Andric // We 
don't need to concern ourselves with the snan handling difference, since 4115e8d8bef9SDimitry Andric // the rsq quieted (or not) so use the one which will directly select. 4116e8d8bef9SDimitry Andric const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 4117e8d8bef9SDimitry Andric const bool UseIEEE = MFI->getMode().IEEE; 4118e8d8bef9SDimitry Andric 4119e8d8bef9SDimitry Andric auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 4120e8d8bef9SDimitry Andric auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 4121e8d8bef9SDimitry Andric B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 4122e8d8bef9SDimitry Andric 4123e8d8bef9SDimitry Andric auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 4124e8d8bef9SDimitry Andric 4125e8d8bef9SDimitry Andric if (UseIEEE) 4126e8d8bef9SDimitry Andric B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 4127e8d8bef9SDimitry Andric else 4128e8d8bef9SDimitry Andric B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 4129e8d8bef9SDimitry Andric MI.eraseFromParent(); 4130e8d8bef9SDimitry Andric return true; 4131e8d8bef9SDimitry Andric } 4132e8d8bef9SDimitry Andric 4133e8d8bef9SDimitry Andric static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 4134e8d8bef9SDimitry Andric switch (IID) { 4135e8d8bef9SDimitry Andric case Intrinsic::amdgcn_ds_fadd: 4136e8d8bef9SDimitry Andric return AMDGPU::G_ATOMICRMW_FADD; 4137e8d8bef9SDimitry Andric case Intrinsic::amdgcn_ds_fmin: 4138e8d8bef9SDimitry Andric return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 4139e8d8bef9SDimitry Andric case Intrinsic::amdgcn_ds_fmax: 4140e8d8bef9SDimitry Andric return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 4141e8d8bef9SDimitry Andric default: 4142e8d8bef9SDimitry Andric llvm_unreachable("not a DS FP intrinsic"); 4143e8d8bef9SDimitry Andric } 4144e8d8bef9SDimitry Andric } 4145e8d8bef9SDimitry Andric 4146e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, 
4147e8d8bef9SDimitry Andric MachineInstr &MI, 4148e8d8bef9SDimitry Andric Intrinsic::ID IID) const { 4149e8d8bef9SDimitry Andric GISelChangeObserver &Observer = Helper.Observer; 4150e8d8bef9SDimitry Andric Observer.changingInstr(MI); 4151e8d8bef9SDimitry Andric 4152e8d8bef9SDimitry Andric MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); 4153e8d8bef9SDimitry Andric 4154e8d8bef9SDimitry Andric // The remaining operands were used to set fields in the MemOperand on 4155e8d8bef9SDimitry Andric // construction. 4156e8d8bef9SDimitry Andric for (int I = 6; I > 3; --I) 415781ad6265SDimitry Andric MI.removeOperand(I); 4158e8d8bef9SDimitry Andric 415981ad6265SDimitry Andric MI.removeOperand(1); // Remove the intrinsic ID. 4160e8d8bef9SDimitry Andric Observer.changedInstr(MI); 4161e8d8bef9SDimitry Andric return true; 4162e8d8bef9SDimitry Andric } 4163e8d8bef9SDimitry Andric 4164e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, 4165e8d8bef9SDimitry Andric MachineRegisterInfo &MRI, 4166e8d8bef9SDimitry Andric MachineIRBuilder &B) const { 4167e8d8bef9SDimitry Andric uint64_t Offset = 4168e8d8bef9SDimitry Andric ST.getTargetLowering()->getImplicitParameterOffset( 4169e8d8bef9SDimitry Andric B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 4170e8d8bef9SDimitry Andric LLT DstTy = MRI.getType(DstReg); 4171e8d8bef9SDimitry Andric LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 4172e8d8bef9SDimitry Andric 4173e8d8bef9SDimitry Andric Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 4174e8d8bef9SDimitry Andric if (!loadInputValue(KernargPtrReg, B, 4175e8d8bef9SDimitry Andric AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 4176e8d8bef9SDimitry Andric return false; 4177e8d8bef9SDimitry Andric 4178e8d8bef9SDimitry Andric // FIXME: This should be nuw 4179e8d8bef9SDimitry Andric B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 4180e8d8bef9SDimitry Andric return true; 4181e8d8bef9SDimitry Andric } 
4182e8d8bef9SDimitry Andric 41830b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 41840b57cec5SDimitry Andric MachineRegisterInfo &MRI, 41850b57cec5SDimitry Andric MachineIRBuilder &B) const { 41860b57cec5SDimitry Andric const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 41870b57cec5SDimitry Andric if (!MFI->isEntryFunction()) { 41880b57cec5SDimitry Andric return legalizePreloadedArgIntrin(MI, MRI, B, 41890b57cec5SDimitry Andric AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 41900b57cec5SDimitry Andric } 41910b57cec5SDimitry Andric 41920b57cec5SDimitry Andric Register DstReg = MI.getOperand(0).getReg(); 4193e8d8bef9SDimitry Andric if (!getImplicitArgPtr(DstReg, MRI, B)) 41940b57cec5SDimitry Andric return false; 41950b57cec5SDimitry Andric 41960b57cec5SDimitry Andric MI.eraseFromParent(); 41970b57cec5SDimitry Andric return true; 41980b57cec5SDimitry Andric } 41990b57cec5SDimitry Andric 4200*fcaf7f86SDimitry Andric bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg, 4201*fcaf7f86SDimitry Andric MachineRegisterInfo &MRI, 4202*fcaf7f86SDimitry Andric MachineIRBuilder &B) const { 4203*fcaf7f86SDimitry Andric Function &F = B.getMF().getFunction(); 4204*fcaf7f86SDimitry Andric Optional<uint32_t> KnownSize = 4205*fcaf7f86SDimitry Andric AMDGPUMachineFunction::getLDSKernelIdMetadata(F); 4206*fcaf7f86SDimitry Andric if (KnownSize.has_value()) 4207*fcaf7f86SDimitry Andric B.buildConstant(DstReg, KnownSize.value()); 4208*fcaf7f86SDimitry Andric return false; 4209*fcaf7f86SDimitry Andric } 4210*fcaf7f86SDimitry Andric 4211*fcaf7f86SDimitry Andric bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI, 4212*fcaf7f86SDimitry Andric MachineRegisterInfo &MRI, 4213*fcaf7f86SDimitry Andric MachineIRBuilder &B) const { 4214*fcaf7f86SDimitry Andric 4215*fcaf7f86SDimitry Andric const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 4216*fcaf7f86SDimitry Andric if 
(!MFI->isEntryFunction()) { 4217*fcaf7f86SDimitry Andric return legalizePreloadedArgIntrin(MI, MRI, B, 4218*fcaf7f86SDimitry Andric AMDGPUFunctionArgInfo::LDS_KERNEL_ID); 4219*fcaf7f86SDimitry Andric } 4220*fcaf7f86SDimitry Andric 4221*fcaf7f86SDimitry Andric Register DstReg = MI.getOperand(0).getReg(); 4222*fcaf7f86SDimitry Andric if (!getLDSKernelId(DstReg, MRI, B)) 4223*fcaf7f86SDimitry Andric return false; 4224*fcaf7f86SDimitry Andric 4225*fcaf7f86SDimitry Andric MI.eraseFromParent(); 4226*fcaf7f86SDimitry Andric return true; 4227*fcaf7f86SDimitry Andric } 4228*fcaf7f86SDimitry Andric 42298bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 42308bcb0991SDimitry Andric MachineRegisterInfo &MRI, 42318bcb0991SDimitry Andric MachineIRBuilder &B, 42328bcb0991SDimitry Andric unsigned AddrSpace) const { 42338bcb0991SDimitry Andric Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 4234e8d8bef9SDimitry Andric auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); 4235e8d8bef9SDimitry Andric Register Hi32 = Unmerge.getReg(1); 4236e8d8bef9SDimitry Andric 42378bcb0991SDimitry Andric B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 42388bcb0991SDimitry Andric MI.eraseFromParent(); 42398bcb0991SDimitry Andric return true; 42408bcb0991SDimitry Andric } 42418bcb0991SDimitry Andric 42425ffd83dbSDimitry Andric // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: 42435ffd83dbSDimitry Andric // offset (the offset that is included in bounds checking and swizzling, to be 42445ffd83dbSDimitry Andric // split between the instruction's voffset and immoffset fields) and soffset 42455ffd83dbSDimitry Andric // (the offset that is excluded from bounds checking and swizzling, to go in 42465ffd83dbSDimitry Andric // the instruction's soffset field). 
This function takes the first kind of 42475ffd83dbSDimitry Andric // offset and figures out how to split it between voffset and immoffset. 4248fe6060f1SDimitry Andric std::pair<Register, unsigned> 42495ffd83dbSDimitry Andric AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, 42505ffd83dbSDimitry Andric Register OrigOffset) const { 42515ffd83dbSDimitry Andric const unsigned MaxImm = 4095; 42525ffd83dbSDimitry Andric Register BaseReg; 4253fe6060f1SDimitry Andric unsigned ImmOffset; 42545ffd83dbSDimitry Andric const LLT S32 = LLT::scalar(32); 4255fe6060f1SDimitry Andric MachineRegisterInfo &MRI = *B.getMRI(); 42565ffd83dbSDimitry Andric 4257fe6060f1SDimitry Andric std::tie(BaseReg, ImmOffset) = 4258fe6060f1SDimitry Andric AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset); 42595ffd83dbSDimitry Andric 4260fe6060f1SDimitry Andric // If BaseReg is a pointer, convert it to int. 4261fe6060f1SDimitry Andric if (MRI.getType(BaseReg).isPointer()) 4262fe6060f1SDimitry Andric BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0); 42635ffd83dbSDimitry Andric 42645ffd83dbSDimitry Andric // If the immediate value is too big for the immoffset field, put the value 42655ffd83dbSDimitry Andric // and -4096 into the immoffset field so that the value that is copied/added 42665ffd83dbSDimitry Andric // for the voffset field is a multiple of 4096, and it stands more chance 42675ffd83dbSDimitry Andric // of being CSEd with the copy/add for another similar load/store. 42685ffd83dbSDimitry Andric // However, do not do that rounding down to a multiple of 4096 if that is a 42695ffd83dbSDimitry Andric // negative number, as it appears to be illegal to have a negative offset 42705ffd83dbSDimitry Andric // in the vgpr, even if adding the immediate offset makes it positive. 
42715ffd83dbSDimitry Andric unsigned Overflow = ImmOffset & ~MaxImm; 42725ffd83dbSDimitry Andric ImmOffset -= Overflow; 42735ffd83dbSDimitry Andric if ((int32_t)Overflow < 0) { 42745ffd83dbSDimitry Andric Overflow += ImmOffset; 42755ffd83dbSDimitry Andric ImmOffset = 0; 42765ffd83dbSDimitry Andric } 42775ffd83dbSDimitry Andric 42785ffd83dbSDimitry Andric if (Overflow != 0) { 42795ffd83dbSDimitry Andric if (!BaseReg) { 42805ffd83dbSDimitry Andric BaseReg = B.buildConstant(S32, Overflow).getReg(0); 42815ffd83dbSDimitry Andric } else { 42825ffd83dbSDimitry Andric auto OverflowVal = B.buildConstant(S32, Overflow); 42835ffd83dbSDimitry Andric BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); 42845ffd83dbSDimitry Andric } 42855ffd83dbSDimitry Andric } 42865ffd83dbSDimitry Andric 42875ffd83dbSDimitry Andric if (!BaseReg) 42885ffd83dbSDimitry Andric BaseReg = B.buildConstant(S32, 0).getReg(0); 42895ffd83dbSDimitry Andric 4290fe6060f1SDimitry Andric return std::make_pair(BaseReg, ImmOffset); 4291fe6060f1SDimitry Andric } 4292fe6060f1SDimitry Andric 4293fe6060f1SDimitry Andric /// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic. 
4294fe6060f1SDimitry Andric void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO, 4295fe6060f1SDimitry Andric Register VOffset, Register SOffset, 4296fe6060f1SDimitry Andric unsigned ImmOffset, Register VIndex, 4297fe6060f1SDimitry Andric MachineRegisterInfo &MRI) const { 4298fe6060f1SDimitry Andric Optional<ValueAndVReg> MaybeVOffsetVal = 4299349cc55cSDimitry Andric getIConstantVRegValWithLookThrough(VOffset, MRI); 4300fe6060f1SDimitry Andric Optional<ValueAndVReg> MaybeSOffsetVal = 4301349cc55cSDimitry Andric getIConstantVRegValWithLookThrough(SOffset, MRI); 4302fe6060f1SDimitry Andric Optional<ValueAndVReg> MaybeVIndexVal = 4303349cc55cSDimitry Andric getIConstantVRegValWithLookThrough(VIndex, MRI); 4304fe6060f1SDimitry Andric // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant, 4305fe6060f1SDimitry Andric // update the MMO with that offset. The stride is unknown so we can only do 4306fe6060f1SDimitry Andric // this if VIndex is constant 0. 4307fe6060f1SDimitry Andric if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal && 4308fe6060f1SDimitry Andric MaybeVIndexVal->Value == 0) { 4309fe6060f1SDimitry Andric uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() + 4310fe6060f1SDimitry Andric MaybeSOffsetVal->Value.getZExtValue() + ImmOffset; 4311fe6060f1SDimitry Andric MMO->setOffset(TotalOffset); 4312fe6060f1SDimitry Andric } else { 4313fe6060f1SDimitry Andric // We don't have a constant combined offset to use in the MMO. Give up. 4314fe6060f1SDimitry Andric MMO->setValue((Value *)nullptr); 4315fe6060f1SDimitry Andric } 43165ffd83dbSDimitry Andric } 43175ffd83dbSDimitry Andric 43188bcb0991SDimitry Andric /// Handle register layout difference for f16 images for some subtargets. 
43198bcb0991SDimitry Andric Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 43208bcb0991SDimitry Andric MachineRegisterInfo &MRI, 4321e8d8bef9SDimitry Andric Register Reg, 4322e8d8bef9SDimitry Andric bool ImageStore) const { 43238bcb0991SDimitry Andric const LLT S16 = LLT::scalar(16); 43248bcb0991SDimitry Andric const LLT S32 = LLT::scalar(32); 43258bcb0991SDimitry Andric LLT StoreVT = MRI.getType(Reg); 43268bcb0991SDimitry Andric assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 43278bcb0991SDimitry Andric 4328e8d8bef9SDimitry Andric if (ST.hasUnpackedD16VMem()) { 43298bcb0991SDimitry Andric auto Unmerge = B.buildUnmerge(S16, Reg); 43308bcb0991SDimitry Andric 43318bcb0991SDimitry Andric SmallVector<Register, 4> WideRegs; 43328bcb0991SDimitry Andric for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 43338bcb0991SDimitry Andric WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 43348bcb0991SDimitry Andric 43358bcb0991SDimitry Andric int NumElts = StoreVT.getNumElements(); 43368bcb0991SDimitry Andric 4337fe6060f1SDimitry Andric return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs) 4338fe6060f1SDimitry Andric .getReg(0); 43398bcb0991SDimitry Andric } 43408bcb0991SDimitry Andric 4341e8d8bef9SDimitry Andric if (ImageStore && ST.hasImageStoreD16Bug()) { 4342e8d8bef9SDimitry Andric if (StoreVT.getNumElements() == 2) { 4343e8d8bef9SDimitry Andric SmallVector<Register, 4> PackedRegs; 4344e8d8bef9SDimitry Andric Reg = B.buildBitcast(S32, Reg).getReg(0); 4345e8d8bef9SDimitry Andric PackedRegs.push_back(Reg); 4346e8d8bef9SDimitry Andric PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); 4347fe6060f1SDimitry Andric return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs) 4348fe6060f1SDimitry Andric .getReg(0); 4349e8d8bef9SDimitry Andric } 4350e8d8bef9SDimitry Andric 4351e8d8bef9SDimitry Andric if (StoreVT.getNumElements() == 3) { 4352e8d8bef9SDimitry Andric SmallVector<Register, 4> PackedRegs; 
4353e8d8bef9SDimitry Andric auto Unmerge = B.buildUnmerge(S16, Reg); 4354e8d8bef9SDimitry Andric for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 4355e8d8bef9SDimitry Andric PackedRegs.push_back(Unmerge.getReg(I)); 4356e8d8bef9SDimitry Andric PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); 4357fe6060f1SDimitry Andric Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0); 4358fe6060f1SDimitry Andric return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0); 4359e8d8bef9SDimitry Andric } 4360e8d8bef9SDimitry Andric 4361e8d8bef9SDimitry Andric if (StoreVT.getNumElements() == 4) { 4362e8d8bef9SDimitry Andric SmallVector<Register, 4> PackedRegs; 4363fe6060f1SDimitry Andric Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0); 4364e8d8bef9SDimitry Andric auto Unmerge = B.buildUnmerge(S32, Reg); 4365e8d8bef9SDimitry Andric for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 4366e8d8bef9SDimitry Andric PackedRegs.push_back(Unmerge.getReg(I)); 4367e8d8bef9SDimitry Andric PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); 4368fe6060f1SDimitry Andric return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs) 4369fe6060f1SDimitry Andric .getReg(0); 4370e8d8bef9SDimitry Andric } 4371e8d8bef9SDimitry Andric 4372e8d8bef9SDimitry Andric llvm_unreachable("invalid data type"); 4373e8d8bef9SDimitry Andric } 4374e8d8bef9SDimitry Andric 43750eae32dcSDimitry Andric if (StoreVT == LLT::fixed_vector(3, S16)) { 43760eae32dcSDimitry Andric Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg) 43770eae32dcSDimitry Andric .getReg(0); 43780eae32dcSDimitry Andric } 4379e8d8bef9SDimitry Andric return Reg; 4380e8d8bef9SDimitry Andric } 4381e8d8bef9SDimitry Andric 43825ffd83dbSDimitry Andric Register AMDGPULegalizerInfo::fixStoreSourceType( 43835ffd83dbSDimitry Andric MachineIRBuilder &B, Register VData, bool IsFormat) const { 43845ffd83dbSDimitry Andric MachineRegisterInfo *MRI = B.getMRI(); 
43855ffd83dbSDimitry Andric LLT Ty = MRI->getType(VData); 43868bcb0991SDimitry Andric 43878bcb0991SDimitry Andric const LLT S16 = LLT::scalar(16); 43888bcb0991SDimitry Andric 43898bcb0991SDimitry Andric // Fixup illegal register types for i8 stores. 43908bcb0991SDimitry Andric if (Ty == LLT::scalar(8) || Ty == S16) { 43918bcb0991SDimitry Andric Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 43925ffd83dbSDimitry Andric return AnyExt; 43938bcb0991SDimitry Andric } 43948bcb0991SDimitry Andric 43958bcb0991SDimitry Andric if (Ty.isVector()) { 43968bcb0991SDimitry Andric if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 43978bcb0991SDimitry Andric if (IsFormat) 43985ffd83dbSDimitry Andric return handleD16VData(B, *MRI, VData); 43995ffd83dbSDimitry Andric } 44005ffd83dbSDimitry Andric } 44015ffd83dbSDimitry Andric 44025ffd83dbSDimitry Andric return VData; 44035ffd83dbSDimitry Andric } 44045ffd83dbSDimitry Andric 44055ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, 44065ffd83dbSDimitry Andric MachineRegisterInfo &MRI, 44075ffd83dbSDimitry Andric MachineIRBuilder &B, 44085ffd83dbSDimitry Andric bool IsTyped, 44095ffd83dbSDimitry Andric bool IsFormat) const { 44105ffd83dbSDimitry Andric Register VData = MI.getOperand(1).getReg(); 44115ffd83dbSDimitry Andric LLT Ty = MRI.getType(VData); 44125ffd83dbSDimitry Andric LLT EltTy = Ty.getScalarType(); 44135ffd83dbSDimitry Andric const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); 44145ffd83dbSDimitry Andric const LLT S32 = LLT::scalar(32); 44155ffd83dbSDimitry Andric 44165ffd83dbSDimitry Andric VData = fixStoreSourceType(B, VData, IsFormat); 44175ffd83dbSDimitry Andric Register RSrc = MI.getOperand(2).getReg(); 44185ffd83dbSDimitry Andric 44195ffd83dbSDimitry Andric MachineMemOperand *MMO = *MI.memoperands_begin(); 44205ffd83dbSDimitry Andric const int MemSize = MMO->getSize(); 44215ffd83dbSDimitry Andric 44225ffd83dbSDimitry Andric unsigned ImmOffset; 
  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(3).getReg();
    OpOffset = 1;
  } else {
    // Raw variants have no vindex operand; materialize a zero index.
    VIndex = B.buildConstant(S32, 0).getReg(0);
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();

  // Split VOffset into a register component and an immediate offset field.
  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
  updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);

  unsigned Opc;
  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
  } else {
    // Plain (non-format) stores select the opcode from the memory size in
    // bytes.
    switch (MemSize) {
    case 1:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
      break;
    case 2:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
      break;
    }
  }

  auto MIB = B.buildInstr(Opc)
    .addUse(VData)              // vdata
    .addUse(RSrc)               // rsrc
    .addUse(VIndex)             // vindex
    .addUse(VOffset)            // voffset
    .addUse(SOffset)            // soffset
    .addImm(ImmOffset);         // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}

/// Legalize a raw/struct (t)buffer load intrinsic by rewriting it into the
/// matching G_AMDGPU_BUFFER_LOAD* / G_AMDGPU_TBUFFER_LOAD_FORMAT* pseudo,
/// splitting the voffset into register + immediate parts, and widening or
/// repacking the result register when the subtarget requires it (extending
/// sub-dword loads, unpacked D16 vectors).
bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
                                             MachineRegisterInfo &MRI,
                                             MachineIRBuilder &B,
                                             bool IsFormat,
                                             bool IsTyped) const {
  // FIXME: Verifier should enforce 1 MMO for these intrinsics.
  MachineMemOperand *MMO = *MI.memoperands_begin();
  const LLT MemTy = MMO->getMemoryType();
  const LLT S32 = LLT::scalar(32);

  Register Dst = MI.getOperand(0).getReg();
  Register RSrc = MI.getOperand(2).getReg();

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(3).getReg();
    OpOffset = 1;
  } else {
    // Raw variants have no vindex operand; materialize a zero index.
    VIndex = B.buildConstant(S32, 0).getReg(0);
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
  unsigned ImmOffset;

  LLT Ty = MRI.getType(Dst);
  LLT EltTy = Ty.getScalarType();
  // D16 only applies to the format variants with 16-bit elements.
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const bool Unpacked = ST.hasUnpackedD16VMem();

  // Split VOffset into a register component and an immediate offset field.
  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
  updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);

  unsigned Opc;

  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
  } else {
    // Plain loads select the opcode from the memory size; sub-dword loads use
    // the zero-extending forms.
    switch (MemTy.getSizeInBits()) {
    case 8:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
      break;
    case 16:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
      break;
    }
  }

  Register LoadDstReg;

  // Sub-dword scalar loads produce a 32-bit result that must be truncated
  // back to the original type after the load.
  bool IsExtLoad =
      (!IsD16 && MemTy.getSizeInBits() < 32) || (IsD16 && !Ty.isVector());
  LLT UnpackedTy = Ty.changeElementSize(32);

  if (IsExtLoad)
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
  else if (Unpacked && IsD16 && Ty.isVector())
    // Unpacked D16 hardware returns each 16-bit element in its own dword.
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
  else
    LoadDstReg = Dst;

  auto MIB = B.buildInstr(Opc)
    .addDef(LoadDstReg)         // vdata
    .addUse(RSrc)               // rsrc
    .addUse(VIndex)             // vindex
    .addUse(VOffset)            // voffset
    .addUse(SOffset)            // soffset
    .addImm(ImmOffset);         // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  if (LoadDstReg != Dst) {
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());

    // The result register was widened above; narrow it back to the original
    // destination type.
    if (IsExtLoad)
      B.buildTrunc(Dst, LoadDstReg);
    else {
      // Repack to original 16-bit vector result
      // FIXME: G_TRUNC should work, but legalization currently fails
      auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
      SmallVector<Register, 4> Repack;
      for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
        Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
      B.buildMerge(Dst, Repack);
    }
  }

  MI.eraseFromParent();
  return true;
}

/// Replace an atomic inc/dec intrinsic with the corresponding
/// G_AMDGPU_ATOMIC_INC/DEC pseudo, carrying over the memory operands.
bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
                                               MachineIRBuilder &B,
                                               bool IsInc) const {
  unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
                         AMDGPU::G_AMDGPU_ATOMIC_DEC;
  B.buildInstr(Opc)
    .addDef(MI.getOperand(0).getReg())
    .addUse(MI.getOperand(2).getReg())
    .addUse(MI.getOperand(3).getReg())
    .cloneMemRefs(MI);
  MI.eraseFromParent();
  return true;
}

/// \returns the G_AMDGPU_BUFFER_ATOMIC_* pseudo opcode corresponding to the
/// raw/struct buffer atomic intrinsic \p IntrID.
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
  default:
    llvm_unreachable("unhandled atomic opcode");
  }
}

/// Legalize a raw/struct buffer atomic intrinsic into the matching
/// G_AMDGPU_BUFFER_ATOMIC_* pseudo. Handles the operand-layout differences
/// between cmpswap and the other atomics, and between the value-returning and
/// no-return forms.
bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
                                               MachineIRBuilder &B,
                                               Intrinsic::ID IID) const {
  const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
                         IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
  const bool HasReturn = MI.getNumExplicitDefs() != 0;

  Register Dst;

  int OpOffset = 0;
  if (HasReturn) {
    // A few FP atomics do not support return values.
    Dst = MI.getOperand(0).getReg();
  } else {
    // No def: all source operands shift down by one.
    OpOffset = -1;
  }

  Register VData = MI.getOperand(2 + OpOffset).getReg();
  Register CmpVal;

  if (IsCmpSwap) {
    CmpVal = MI.getOperand(3 + OpOffset).getReg();
    ++OpOffset;
  }

  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
  // Operand count of the struct (vindex) variant, used to detect it below.
  const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  if (HasVIndex) {
    VIndex = MI.getOperand(4 + OpOffset).getReg();
    ++OpOffset;
  } else {
    // Raw variants have no vindex operand; materialize a zero index.
    VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
  }

  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();

  MachineMemOperand *MMO = *MI.memoperands_begin();

  // Split VOffset into a register component and an immediate offset field.
  unsigned ImmOffset;
  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
  updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());

  auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));

  if (HasReturn)
    MIB.addDef(Dst);

  MIB.addUse(VData); // vdata

  if (IsCmpSwap)
    MIB.addReg(CmpVal);

  MIB.addUse(RSrc)               // rsrc
     .addUse(VIndex)             // vindex
     .addUse(VOffset)            // voffset
     .addUse(SOffset)            // soffset
     .addImm(ImmOffset)          // offset(imm)
     .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}

/// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
/// vector with s16 typed elements.
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
                                      SmallVectorImpl<Register> &PackedAddrs,
                                      unsigned ArgOffset,
                                      const AMDGPU::ImageDimIntrinsicInfo *Intr,
                                      bool IsA16, bool IsG16) {
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  auto EndIdx = Intr->VAddrEnd;

  for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
    MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
    if (!SrcOp.isReg())
      continue; // _L to _LZ may have eliminated this.

    Register AddrReg = SrcOp.getReg();

    // Operands in the ranges that stay 32-bit (non-gradient args, gradients
    // without G16, coordinates without A16) are bitcast rather than packed.
    if ((I < Intr->GradientStart) ||
        (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
        (I >= Intr->CoordStart && !IsA16)) {
      if ((I < Intr->GradientStart) && IsA16 &&
          (B.getMRI()->getType(AddrReg) == S16)) {
        assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
        // Special handling of bias when A16 is on. Bias is of type half but
        // occupies full 32-bit.
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
               "Bias needs to be converted to 16 bit in A16 mode");
        // Handle any gradient or coordinate operands that should not be packed
        AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
        PackedAddrs.push_back(AddrReg);
      }
    } else {
      // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
      // derivatives dx/dh and dx/dv are packed with undef.
      if (((I + 1) >= EndIdx) ||
          ((Intr->NumGradients / 2) % 2 == 1 &&
           (I == static_cast<unsigned>(Intr->GradientStart +
                                       (Intr->NumGradients / 2) - 1) ||
            I == static_cast<unsigned>(Intr->GradientStart +
                                       Intr->NumGradients - 1))) ||
          // Check for _L to _LZ optimization
          !MI.getOperand(ArgOffset + I + 1).isReg()) {
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        // Pack this operand together with the next one, and skip the next.
        PackedAddrs.push_back(
            B.buildBuildVector(
                 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
                .getReg(0));
        ++I;
      }
    }
  }
}

/// Convert from separate vaddr components to a single vector address register,
/// and replace the remaining operands with $noreg.
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
                                     int DimIdx, int NumVAddrs) {
  const LLT S32 = LLT::scalar(32);

  // Collect the registers of all present (non-eliminated) address components.
  SmallVector<Register, 8> AddrRegs;
  for (int I = 0; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg()) {
      AddrRegs.push_back(SrcOp.getReg());
      assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
    }
  }

  int NumAddrRegs = AddrRegs.size();
  if (NumAddrRegs != 1) {
    // Above 8 elements round up to next power of 2 (i.e. 16).
    if (NumAddrRegs > 8 && !isPowerOf2_32(NumAddrRegs)) {
      const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
      auto Undef = B.buildUndef(S32);
      AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
      NumAddrRegs = RoundedNumRegs;
    }

    // Merge all components into one vector register in the first vaddr slot.
    auto VAddr =
        B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
    MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
  }

  // Clear out the now-redundant individual component operands.
  for (int I = 1; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg())
      MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
  }
}
/// Rewrite image intrinsics to use register layouts expected by the subtarget.
///
/// Depending on the subtarget, load/store with 16-bit element data need to be
/// rewritten to use the low half of 32-bit registers, or directly use a packed
/// layout. 16-bit addresses should also sometimes be packed into 32-bit
/// registers.
///
/// We don't want to directly select image instructions just yet, but also want
/// to expose all register repacking to the legalizer/combiners. We also don't
/// want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding now unnecessary arguments with $noreg.
bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
    MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
    const AMDGPU::ImageDimIntrinsicInfo *Intr) const {

  const unsigned NumDefs = MI.getNumExplicitDefs();
  const unsigned ArgOffset = NumDefs + 1;
  // Two defs means the second one is the TFE (texture-fail-enable) status.
  bool IsTFE = NumDefs == 2;
  // We are only processing the operands of d16 image operations on subtargets
  // that use the unpacked register layout, or need to repack the TFE result.

  // TODO: Do we need to guard against already legalized intrinsics?
  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
      AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);

  MachineRegisterInfo *MRI = B.getMRI();
  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::fixed_vector(2, 16);

  unsigned DMask = 0;
  Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
  LLT Ty = MRI->getType(VData);

  // Check for 16 bit addresses and pack if true.
  LLT GradTy =
      MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
  LLT AddrTy =
      MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
  const bool IsG16 = GradTy == S16;
  const bool IsA16 = AddrTy == S16;
  const bool IsD16 = Ty.getScalarType() == S16;

  int DMaskLanes = 0;
  if (!BaseOpcode->Atomic) {
    DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
    if (BaseOpcode->Gather4) {
      DMaskLanes = 4;
    } else if (DMask != 0) {
      DMaskLanes = countPopulation(DMask);
    } else if (!IsTFE && !BaseOpcode->Store) {
      // If dmask is 0, this is a no-op load. This can be eliminated.
      B.buildUndef(MI.getOperand(0));
      MI.eraseFromParent();
      return true;
    }
  }

  Observer.changingInstr(MI);
  auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });

  const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
  const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
                                    : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
  unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;

  // Track that we legalized this
  MI.setDesc(B.getTII().get(NewOpcode));

  // Expecting to get an error flag since TFC is on - and dmask is 0 Force
  // dmask to be at least 1 otherwise the instruction will fail
  if (IsTFE && DMask == 0) {
    DMask = 0x1;
    DMaskLanes = 1;
    MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
  }

  if (BaseOpcode->Atomic) {
    Register VData0 = MI.getOperand(2).getReg();
    LLT Ty = MRI->getType(VData0);

    // TODO: Allow atomic swap and bit ops for v2s16/v4s16
    if (Ty.isVector())
      return false;

    if (BaseOpcode->AtomicX2) {
      Register VData1 = MI.getOperand(3).getReg();
      // The two values are packed in one register.
      LLT PackedTy = LLT::fixed_vector(2, Ty);
      auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
      MI.getOperand(2).setReg(Concat.getReg(0));
      MI.getOperand(3).setReg(AMDGPU::NoRegister);
    }
  }

  unsigned CorrectedNumVAddrs = Intr->NumVAddrs;

  // Rewrite the addressing register layout before doing anything else.
  if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
    // 16 bit gradients are supported, but are tied to the A16 control
    // so both gradients and addresses must be 16 bit
    return false;
  }

  if (IsA16 && !ST.hasA16()) {
    // A16 not supported
    return false;
  }

  if (IsA16 || IsG16) {
    if (Intr->NumVAddrs > 1) {
      SmallVector<Register, 4> PackedRegs;

      packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
                                IsG16);

      // See also below in the non-a16 branch
      const bool UseNSA = ST.hasNSAEncoding() && PackedRegs.size() >= 3 &&
                          PackedRegs.size() <= ST.getNSAMaxSize();

      if (!UseNSA && PackedRegs.size() > 1) {
        // Non-NSA: collapse all packed dwords into one wide vector register.
        LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
        auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
        PackedRegs[0] = Concat.getReg(0);
        PackedRegs.resize(1);
      }

      // Rewrite the vaddr operands in place; surplus slots become $noreg.
      const unsigned NumPacked = PackedRegs.size();
      for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
        MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
        if (!SrcOp.isReg()) {
          assert(SrcOp.isImm() && SrcOp.getImm() == 0);
          continue;
        }

        assert(SrcOp.getReg() != AMDGPU::NoRegister);

        if (I - Intr->VAddrStart < NumPacked)
          SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
        else
          SrcOp.setReg(AMDGPU::NoRegister);
      }
    }
  } else {
    // If the register allocator cannot place the address registers contiguously
    // without introducing moves, then using the non-sequential address encoding
    // is always preferable, since it saves VALU instructions and is usually a
    // wash in terms of code size or even better.
    //
    // However, we currently have no way of hinting to the register allocator
    // that MIMG addresses should be placed contiguously when it is possible to
    // do so, so force non-NSA for the common 2-address case as a heuristic.
    //
    // SIShrinkInstructions will convert NSA encodings to non-NSA after register
    // allocation when possible.
    //
    // TODO: we can actually allow partial NSA where the final register is a
    // contiguous set of the remaining addresses.
    // This could help where there are more addresses than supported.
    const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
                        CorrectedNumVAddrs <= ST.getNSAMaxSize();

    if (!UseNSA && Intr->NumVAddrs > 1)
      convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
                               Intr->NumVAddrs);
  }

  // Record the A16/G16 choices as a trailing flags immediate.
  int Flags = 0;
  if (IsA16)
    Flags |= 1;
  if (IsG16)
    Flags |= 2;
  MI.addOperand(MachineOperand::CreateImm(Flags));

  if (BaseOpcode->Store) { // No TFE for stores?
    // TODO: Handle dmask trim
    if (!Ty.isVector() || !IsD16)
      return true;

    Register RepackedReg = handleD16VData(B, *MRI, VData, true);
    if (RepackedReg != VData) {
      MI.getOperand(1).setReg(RepackedReg);
    }

    return true;
  }

  Register DstReg = MI.getOperand(0).getReg();
  const LLT EltTy = Ty.getScalarType();
  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;

  // Confirm that the return type is large enough for the dmask specified
  if (NumElts < DMaskLanes)
    return false;

  if (NumElts > 4 || DMaskLanes > 4)
    return false;

  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
  const LLT AdjustedTy =
      Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));

  // The raw dword aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
  LLT RoundedTy;

  // S32 vector to cover all data, plus TFE result element.
  LLT TFETy;

  // Register type to use for each loaded component. Will be S32 or V2S16.
  LLT RegTy;

  if (IsD16 && ST.hasUnpackedD16VMem()) {
    RoundedTy =
        LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
    TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
    RegTy = S32;
  } else {
    unsigned EltSize = EltTy.getSizeInBits();
    unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
    unsigned RoundedSize = 32 * RoundedElts;
    RoundedTy = LLT::scalarOrVector(
        ElementCount::getFixed(RoundedSize / EltSize), EltSize);
    TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
    RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
  }

  // The return type does not need adjustment.
  // TODO: Should we change s16 case to s32 or <2 x s16>?
  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
    return true;

  Register Dst1Reg;

  // Insert after the instruction.
  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;

  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);

  MI.getOperand(0).setReg(NewResultReg);

  // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.

  if (IsTFE) {
    Dst1Reg = MI.getOperand(1).getReg();
    if (MRI->getType(Dst1Reg) != S32)
      return false;

    // TODO: Make sure the TFE operand bit is set.
    MI.removeOperand(1);

    // Handle the easy case that requires no repack instructions.
    if (Ty == S32) {
      B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
      return true;
    }
  }

  // Now figure out how to copy the new result register back into the old
  // result.
  SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);

  const int NumDataRegs = IsTFE ?
ResultNumRegs - 1 : ResultNumRegs; 51235ffd83dbSDimitry Andric 51245ffd83dbSDimitry Andric if (ResultNumRegs == 1) { 51255ffd83dbSDimitry Andric assert(!IsTFE); 51265ffd83dbSDimitry Andric ResultRegs[0] = NewResultReg; 51275ffd83dbSDimitry Andric } else { 51285ffd83dbSDimitry Andric // We have to repack into a new vector of some kind. 51295ffd83dbSDimitry Andric for (int I = 0; I != NumDataRegs; ++I) 51305ffd83dbSDimitry Andric ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 51315ffd83dbSDimitry Andric B.buildUnmerge(ResultRegs, NewResultReg); 51325ffd83dbSDimitry Andric 51335ffd83dbSDimitry Andric // Drop the final TFE element to get the data part. The TFE result is 51345ffd83dbSDimitry Andric // directly written to the right place already. 51355ffd83dbSDimitry Andric if (IsTFE) 51365ffd83dbSDimitry Andric ResultRegs.resize(NumDataRegs); 51375ffd83dbSDimitry Andric } 51385ffd83dbSDimitry Andric 51395ffd83dbSDimitry Andric // For an s16 scalar result, we form an s32 result with a truncate regardless 51405ffd83dbSDimitry Andric // of packed vs. unpacked. 51415ffd83dbSDimitry Andric if (IsD16 && !Ty.isVector()) { 51425ffd83dbSDimitry Andric B.buildTrunc(DstReg, ResultRegs[0]); 51435ffd83dbSDimitry Andric return true; 51445ffd83dbSDimitry Andric } 51455ffd83dbSDimitry Andric 51465ffd83dbSDimitry Andric // Avoid a build/concat_vector of 1 entry. 51475ffd83dbSDimitry Andric if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 51485ffd83dbSDimitry Andric B.buildBitcast(DstReg, ResultRegs[0]); 51495ffd83dbSDimitry Andric return true; 51505ffd83dbSDimitry Andric } 51515ffd83dbSDimitry Andric 51525ffd83dbSDimitry Andric assert(Ty.isVector()); 51535ffd83dbSDimitry Andric 51545ffd83dbSDimitry Andric if (IsD16) { 51555ffd83dbSDimitry Andric // For packed D16 results with TFE enabled, all the data components are 51565ffd83dbSDimitry Andric // S32. Cast back to the expected type. 
51575ffd83dbSDimitry Andric // 51585ffd83dbSDimitry Andric // TODO: We don't really need to use load s32 elements. We would only need one 51595ffd83dbSDimitry Andric // cast for the TFE result if a multiple of v2s16 was used. 51605ffd83dbSDimitry Andric if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 51615ffd83dbSDimitry Andric for (Register &Reg : ResultRegs) 51625ffd83dbSDimitry Andric Reg = B.buildBitcast(V2S16, Reg).getReg(0); 51635ffd83dbSDimitry Andric } else if (ST.hasUnpackedD16VMem()) { 51645ffd83dbSDimitry Andric for (Register &Reg : ResultRegs) 51655ffd83dbSDimitry Andric Reg = B.buildTrunc(S16, Reg).getReg(0); 51665ffd83dbSDimitry Andric } 51675ffd83dbSDimitry Andric } 51685ffd83dbSDimitry Andric 51695ffd83dbSDimitry Andric auto padWithUndef = [&](LLT Ty, int NumElts) { 51705ffd83dbSDimitry Andric if (NumElts == 0) 51715ffd83dbSDimitry Andric return; 51725ffd83dbSDimitry Andric Register Undef = B.buildUndef(Ty).getReg(0); 51735ffd83dbSDimitry Andric for (int I = 0; I != NumElts; ++I) 51745ffd83dbSDimitry Andric ResultRegs.push_back(Undef); 51755ffd83dbSDimitry Andric }; 51765ffd83dbSDimitry Andric 51775ffd83dbSDimitry Andric // Pad out any elements eliminated due to the dmask. 51785ffd83dbSDimitry Andric LLT ResTy = MRI->getType(ResultRegs[0]); 51795ffd83dbSDimitry Andric if (!ResTy.isVector()) { 51805ffd83dbSDimitry Andric padWithUndef(ResTy, NumElts - ResultRegs.size()); 51815ffd83dbSDimitry Andric B.buildBuildVector(DstReg, ResultRegs); 51825ffd83dbSDimitry Andric return true; 51835ffd83dbSDimitry Andric } 51845ffd83dbSDimitry Andric 51855ffd83dbSDimitry Andric assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 51865ffd83dbSDimitry Andric const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 51875ffd83dbSDimitry Andric 51885ffd83dbSDimitry Andric // Deal with the one annoying legal case. 
5189fe6060f1SDimitry Andric const LLT V3S16 = LLT::fixed_vector(3, 16); 51905ffd83dbSDimitry Andric if (Ty == V3S16) { 51910eae32dcSDimitry Andric if (IsTFE) { 51920eae32dcSDimitry Andric if (ResultRegs.size() == 1) { 51930eae32dcSDimitry Andric NewResultReg = ResultRegs[0]; 51940eae32dcSDimitry Andric } else if (ResultRegs.size() == 2) { 51950eae32dcSDimitry Andric LLT V4S16 = LLT::fixed_vector(4, 16); 51960eae32dcSDimitry Andric NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0); 51970eae32dcSDimitry Andric } else { 51980eae32dcSDimitry Andric return false; 51990eae32dcSDimitry Andric } 52000eae32dcSDimitry Andric } 52010eae32dcSDimitry Andric 52020eae32dcSDimitry Andric if (MRI->getType(DstReg).getNumElements() < 52030eae32dcSDimitry Andric MRI->getType(NewResultReg).getNumElements()) { 52040eae32dcSDimitry Andric B.buildDeleteTrailingVectorElements(DstReg, NewResultReg); 52050eae32dcSDimitry Andric } else { 52060eae32dcSDimitry Andric B.buildPadVectorWithUndefElements(DstReg, NewResultReg); 52070eae32dcSDimitry Andric } 52085ffd83dbSDimitry Andric return true; 52095ffd83dbSDimitry Andric } 52105ffd83dbSDimitry Andric 52115ffd83dbSDimitry Andric padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 52125ffd83dbSDimitry Andric B.buildConcatVectors(DstReg, ResultRegs); 52135ffd83dbSDimitry Andric return true; 52145ffd83dbSDimitry Andric } 52155ffd83dbSDimitry Andric 52165ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeSBufferLoad( 5217e8d8bef9SDimitry Andric LegalizerHelper &Helper, MachineInstr &MI) const { 5218e8d8bef9SDimitry Andric MachineIRBuilder &B = Helper.MIRBuilder; 5219e8d8bef9SDimitry Andric GISelChangeObserver &Observer = Helper.Observer; 5220e8d8bef9SDimitry Andric 52215ffd83dbSDimitry Andric Register Dst = MI.getOperand(0).getReg(); 52225ffd83dbSDimitry Andric LLT Ty = B.getMRI()->getType(Dst); 52235ffd83dbSDimitry Andric unsigned Size = Ty.getSizeInBits(); 52245ffd83dbSDimitry Andric MachineFunction &MF = B.getMF(); 
52255ffd83dbSDimitry Andric 52265ffd83dbSDimitry Andric Observer.changingInstr(MI); 52275ffd83dbSDimitry Andric 5228fe6060f1SDimitry Andric if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) { 5229e8d8bef9SDimitry Andric Ty = getBitcastRegisterType(Ty); 5230e8d8bef9SDimitry Andric Helper.bitcastDst(MI, Ty, 0); 5231e8d8bef9SDimitry Andric Dst = MI.getOperand(0).getReg(); 5232e8d8bef9SDimitry Andric B.setInsertPt(B.getMBB(), MI); 5233e8d8bef9SDimitry Andric } 5234e8d8bef9SDimitry Andric 52355ffd83dbSDimitry Andric // FIXME: We don't really need this intermediate instruction. The intrinsic 52365ffd83dbSDimitry Andric // should be fixed to have a memory operand. Since it's readnone, we're not 52375ffd83dbSDimitry Andric // allowed to add one. 52385ffd83dbSDimitry Andric MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 523981ad6265SDimitry Andric MI.removeOperand(1); // Remove intrinsic ID 52405ffd83dbSDimitry Andric 52415ffd83dbSDimitry Andric // FIXME: When intrinsic definition is fixed, this should have an MMO already. 52425ffd83dbSDimitry Andric // TODO: Should this use datalayout alignment? 52435ffd83dbSDimitry Andric const unsigned MemSize = (Size + 7) / 8; 52445ffd83dbSDimitry Andric const Align MemAlign(4); 52455ffd83dbSDimitry Andric MachineMemOperand *MMO = MF.getMachineMemOperand( 52465ffd83dbSDimitry Andric MachinePointerInfo(), 52475ffd83dbSDimitry Andric MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 52485ffd83dbSDimitry Andric MachineMemOperand::MOInvariant, 52495ffd83dbSDimitry Andric MemSize, MemAlign); 52505ffd83dbSDimitry Andric MI.addMemOperand(MF, MMO); 52515ffd83dbSDimitry Andric 52525ffd83dbSDimitry Andric // There are no 96-bit result scalar loads, but widening to 128-bit should 52535ffd83dbSDimitry Andric // always be legal. We may need to restore this to a 96-bit result if it turns 52545ffd83dbSDimitry Andric // out this needs to be converted to a vector load during RegBankSelect. 
52555ffd83dbSDimitry Andric if (!isPowerOf2_32(Size)) { 52565ffd83dbSDimitry Andric if (Ty.isVector()) 52575ffd83dbSDimitry Andric Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 52585ffd83dbSDimitry Andric else 52595ffd83dbSDimitry Andric Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 52605ffd83dbSDimitry Andric } 52615ffd83dbSDimitry Andric 52625ffd83dbSDimitry Andric Observer.changedInstr(MI); 52635ffd83dbSDimitry Andric return true; 52645ffd83dbSDimitry Andric } 52655ffd83dbSDimitry Andric 5266e8d8bef9SDimitry Andric // TODO: Move to selection 52675ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 52680b57cec5SDimitry Andric MachineRegisterInfo &MRI, 52690b57cec5SDimitry Andric MachineIRBuilder &B) const { 5270fe6060f1SDimitry Andric if (!ST.isTrapHandlerEnabled() || 5271fe6060f1SDimitry Andric ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) 5272fe6060f1SDimitry Andric return legalizeTrapEndpgm(MI, MRI, B); 5273fe6060f1SDimitry Andric 5274fe6060f1SDimitry Andric if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) { 5275fe6060f1SDimitry Andric switch (*HsaAbiVer) { 5276fe6060f1SDimitry Andric case ELF::ELFABIVERSION_AMDGPU_HSA_V2: 5277fe6060f1SDimitry Andric case ELF::ELFABIVERSION_AMDGPU_HSA_V3: 5278fe6060f1SDimitry Andric return legalizeTrapHsaQueuePtr(MI, MRI, B); 5279fe6060f1SDimitry Andric case ELF::ELFABIVERSION_AMDGPU_HSA_V4: 52801fd87a68SDimitry Andric case ELF::ELFABIVERSION_AMDGPU_HSA_V5: 5281fe6060f1SDimitry Andric return ST.supportsGetDoorbellID() ? 
5282fe6060f1SDimitry Andric legalizeTrapHsa(MI, MRI, B) : 5283fe6060f1SDimitry Andric legalizeTrapHsaQueuePtr(MI, MRI, B); 5284fe6060f1SDimitry Andric } 5285fe6060f1SDimitry Andric } 5286fe6060f1SDimitry Andric 5287fe6060f1SDimitry Andric llvm_unreachable("Unknown trap handler"); 5288fe6060f1SDimitry Andric } 5289fe6060f1SDimitry Andric 5290fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeTrapEndpgm( 5291fe6060f1SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 52925ffd83dbSDimitry Andric B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 5293fe6060f1SDimitry Andric MI.eraseFromParent(); 5294fe6060f1SDimitry Andric return true; 5295fe6060f1SDimitry Andric } 5296fe6060f1SDimitry Andric 5297fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( 5298fe6060f1SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 529981ad6265SDimitry Andric MachineFunction &MF = B.getMF(); 530081ad6265SDimitry Andric const LLT S64 = LLT::scalar(64); 530181ad6265SDimitry Andric 530281ad6265SDimitry Andric Register SGPR01(AMDGPU::SGPR0_SGPR1); 530381ad6265SDimitry Andric // For code object version 5, queue_ptr is passed through implicit kernarg. 
530481ad6265SDimitry Andric if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) { 530581ad6265SDimitry Andric AMDGPUTargetLowering::ImplicitParameter Param = 530681ad6265SDimitry Andric AMDGPUTargetLowering::QUEUE_PTR; 530781ad6265SDimitry Andric uint64_t Offset = 530881ad6265SDimitry Andric ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param); 530981ad6265SDimitry Andric 531081ad6265SDimitry Andric Register KernargPtrReg = MRI.createGenericVirtualRegister( 531181ad6265SDimitry Andric LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 531281ad6265SDimitry Andric 531381ad6265SDimitry Andric if (!loadInputValue(KernargPtrReg, B, 531481ad6265SDimitry Andric AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) 531581ad6265SDimitry Andric return false; 531681ad6265SDimitry Andric 531781ad6265SDimitry Andric // TODO: can we be smarter about machine pointer info? 531881ad6265SDimitry Andric MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 531981ad6265SDimitry Andric MachineMemOperand *MMO = MF.getMachineMemOperand( 532081ad6265SDimitry Andric PtrInfo, 532181ad6265SDimitry Andric MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 532281ad6265SDimitry Andric MachineMemOperand::MOInvariant, 532381ad6265SDimitry Andric LLT::scalar(64), commonAlignment(Align(64), Offset)); 532481ad6265SDimitry Andric 532581ad6265SDimitry Andric // Pointer address 532681ad6265SDimitry Andric Register LoadAddr = MRI.createGenericVirtualRegister( 532781ad6265SDimitry Andric LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 532881ad6265SDimitry Andric B.buildPtrAdd(LoadAddr, KernargPtrReg, 532981ad6265SDimitry Andric B.buildConstant(LLT::scalar(64), Offset).getReg(0)); 533081ad6265SDimitry Andric // Load address 533181ad6265SDimitry Andric Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0); 533281ad6265SDimitry Andric B.buildCopy(SGPR01, Temp); 533381ad6265SDimitry Andric B.buildInstr(AMDGPU::S_TRAP) 533481ad6265SDimitry Andric 
.addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 533581ad6265SDimitry Andric .addReg(SGPR01, RegState::Implicit); 533681ad6265SDimitry Andric MI.eraseFromParent(); 533781ad6265SDimitry Andric return true; 533881ad6265SDimitry Andric } 533981ad6265SDimitry Andric 53405ffd83dbSDimitry Andric // Pass queue pointer to trap handler as input, and insert trap instruction 53415ffd83dbSDimitry Andric // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 5342e8d8bef9SDimitry Andric Register LiveIn = 5343e8d8bef9SDimitry Andric MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 5344e8d8bef9SDimitry Andric if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 53455ffd83dbSDimitry Andric return false; 5346e8d8bef9SDimitry Andric 53475ffd83dbSDimitry Andric B.buildCopy(SGPR01, LiveIn); 53485ffd83dbSDimitry Andric B.buildInstr(AMDGPU::S_TRAP) 5349fe6060f1SDimitry Andric .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)) 53505ffd83dbSDimitry Andric .addReg(SGPR01, RegState::Implicit); 5351fe6060f1SDimitry Andric 5352fe6060f1SDimitry Andric MI.eraseFromParent(); 5353fe6060f1SDimitry Andric return true; 53545ffd83dbSDimitry Andric } 53555ffd83dbSDimitry Andric 5356fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeTrapHsa( 5357fe6060f1SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 5358fe6060f1SDimitry Andric B.buildInstr(AMDGPU::S_TRAP) 5359fe6060f1SDimitry Andric .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap)); 53605ffd83dbSDimitry Andric MI.eraseFromParent(); 53615ffd83dbSDimitry Andric return true; 53625ffd83dbSDimitry Andric } 53635ffd83dbSDimitry Andric 53645ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 53655ffd83dbSDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 5366349cc55cSDimitry Andric // Is non-HSA path or trap-handler disabled? 
Then, report a warning 53675ffd83dbSDimitry Andric // accordingly 5368fe6060f1SDimitry Andric if (!ST.isTrapHandlerEnabled() || 5369fe6060f1SDimitry Andric ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { 53705ffd83dbSDimitry Andric DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 53715ffd83dbSDimitry Andric "debugtrap handler not supported", 53725ffd83dbSDimitry Andric MI.getDebugLoc(), DS_Warning); 53735ffd83dbSDimitry Andric LLVMContext &Ctx = B.getMF().getFunction().getContext(); 53745ffd83dbSDimitry Andric Ctx.diagnose(NoTrap); 53755ffd83dbSDimitry Andric } else { 53765ffd83dbSDimitry Andric // Insert debug-trap instruction 5377fe6060f1SDimitry Andric B.buildInstr(AMDGPU::S_TRAP) 5378fe6060f1SDimitry Andric .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap)); 53795ffd83dbSDimitry Andric } 53805ffd83dbSDimitry Andric 53815ffd83dbSDimitry Andric MI.eraseFromParent(); 53825ffd83dbSDimitry Andric return true; 53835ffd83dbSDimitry Andric } 53845ffd83dbSDimitry Andric 5385e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, 5386e8d8bef9SDimitry Andric MachineIRBuilder &B) const { 5387e8d8bef9SDimitry Andric MachineRegisterInfo &MRI = *B.getMRI(); 5388e8d8bef9SDimitry Andric const LLT S16 = LLT::scalar(16); 5389e8d8bef9SDimitry Andric const LLT S32 = LLT::scalar(32); 539081ad6265SDimitry Andric const LLT V2S16 = LLT::fixed_vector(2, 16); 539181ad6265SDimitry Andric const LLT V3S32 = LLT::fixed_vector(3, 32); 5392e8d8bef9SDimitry Andric 5393e8d8bef9SDimitry Andric Register DstReg = MI.getOperand(0).getReg(); 5394e8d8bef9SDimitry Andric Register NodePtr = MI.getOperand(2).getReg(); 5395e8d8bef9SDimitry Andric Register RayExtent = MI.getOperand(3).getReg(); 5396e8d8bef9SDimitry Andric Register RayOrigin = MI.getOperand(4).getReg(); 5397e8d8bef9SDimitry Andric Register RayDir = MI.getOperand(5).getReg(); 5398e8d8bef9SDimitry Andric Register RayInvDir = MI.getOperand(6).getReg(); 
5399e8d8bef9SDimitry Andric Register TDescr = MI.getOperand(7).getReg(); 5400e8d8bef9SDimitry Andric 5401fe6060f1SDimitry Andric if (!ST.hasGFX10_AEncoding()) { 5402fe6060f1SDimitry Andric DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(), 5403fe6060f1SDimitry Andric "intrinsic not supported on subtarget", 5404fe6060f1SDimitry Andric MI.getDebugLoc()); 5405fe6060f1SDimitry Andric B.getMF().getFunction().getContext().diagnose(BadIntrin); 5406fe6060f1SDimitry Andric return false; 5407fe6060f1SDimitry Andric } 5408fe6060f1SDimitry Andric 540981ad6265SDimitry Andric const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST); 5410349cc55cSDimitry Andric const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; 5411349cc55cSDimitry Andric const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; 5412349cc55cSDimitry Andric const unsigned NumVDataDwords = 4; 5413349cc55cSDimitry Andric const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11); 541481ad6265SDimitry Andric const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords; 541581ad6265SDimitry Andric const bool UseNSA = ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize(); 5416349cc55cSDimitry Andric const unsigned BaseOpcodes[2][2] = { 5417349cc55cSDimitry Andric {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16}, 5418349cc55cSDimitry Andric {AMDGPU::IMAGE_BVH64_INTERSECT_RAY, 5419349cc55cSDimitry Andric AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}}; 5420349cc55cSDimitry Andric int Opcode; 5421349cc55cSDimitry Andric if (UseNSA) { 542281ad6265SDimitry Andric Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], 542381ad6265SDimitry Andric IsGFX11Plus ? 
AMDGPU::MIMGEncGfx11NSA 542481ad6265SDimitry Andric : AMDGPU::MIMGEncGfx10NSA, 5425349cc55cSDimitry Andric NumVDataDwords, NumVAddrDwords); 5426349cc55cSDimitry Andric } else { 542781ad6265SDimitry Andric Opcode = AMDGPU::getMIMGOpcode( 542881ad6265SDimitry Andric BaseOpcodes[Is64][IsA16], 542981ad6265SDimitry Andric IsGFX11Plus ? AMDGPU::MIMGEncGfx11Default : AMDGPU::MIMGEncGfx10Default, 543081ad6265SDimitry Andric NumVDataDwords, PowerOf2Ceil(NumVAddrDwords)); 5431349cc55cSDimitry Andric } 5432349cc55cSDimitry Andric assert(Opcode != -1); 5433e8d8bef9SDimitry Andric 5434e8d8bef9SDimitry Andric SmallVector<Register, 12> Ops; 543581ad6265SDimitry Andric if (UseNSA && IsGFX11Plus) { 543681ad6265SDimitry Andric auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) { 543781ad6265SDimitry Andric auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src); 543881ad6265SDimitry Andric auto Merged = B.buildMerge( 543981ad6265SDimitry Andric V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)}); 544081ad6265SDimitry Andric Ops.push_back(Merged.getReg(0)); 544181ad6265SDimitry Andric }; 544281ad6265SDimitry Andric 544381ad6265SDimitry Andric Ops.push_back(NodePtr); 544481ad6265SDimitry Andric Ops.push_back(RayExtent); 544581ad6265SDimitry Andric packLanes(RayOrigin); 544681ad6265SDimitry Andric 544781ad6265SDimitry Andric if (IsA16) { 544881ad6265SDimitry Andric auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir); 544981ad6265SDimitry Andric auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir); 545081ad6265SDimitry Andric auto MergedDir = B.buildMerge( 545181ad6265SDimitry Andric V3S32, 545281ad6265SDimitry Andric {B.buildBitcast(S32, B.buildMerge(V2S16, {UnmergeRayInvDir.getReg(0), 545381ad6265SDimitry Andric UnmergeRayDir.getReg(0)})) 545481ad6265SDimitry Andric .getReg(0), 545581ad6265SDimitry Andric B.buildBitcast(S32, B.buildMerge(V2S16, {UnmergeRayInvDir.getReg(1), 545681ad6265SDimitry Andric UnmergeRayDir.getReg(1)})) 545781ad6265SDimitry 
Andric .getReg(0), 545881ad6265SDimitry Andric B.buildBitcast(S32, B.buildMerge(V2S16, {UnmergeRayInvDir.getReg(2), 545981ad6265SDimitry Andric UnmergeRayDir.getReg(2)})) 546081ad6265SDimitry Andric .getReg(0)}); 546181ad6265SDimitry Andric Ops.push_back(MergedDir.getReg(0)); 546281ad6265SDimitry Andric } else { 546381ad6265SDimitry Andric packLanes(RayDir); 546481ad6265SDimitry Andric packLanes(RayInvDir); 546581ad6265SDimitry Andric } 546681ad6265SDimitry Andric } else { 5467e8d8bef9SDimitry Andric if (Is64) { 5468e8d8bef9SDimitry Andric auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 5469e8d8bef9SDimitry Andric Ops.push_back(Unmerge.getReg(0)); 5470e8d8bef9SDimitry Andric Ops.push_back(Unmerge.getReg(1)); 5471e8d8bef9SDimitry Andric } else { 5472e8d8bef9SDimitry Andric Ops.push_back(NodePtr); 5473e8d8bef9SDimitry Andric } 5474e8d8bef9SDimitry Andric Ops.push_back(RayExtent); 5475e8d8bef9SDimitry Andric 5476e8d8bef9SDimitry Andric auto packLanes = [&Ops, &S32, &B](Register Src) { 54770eae32dcSDimitry Andric auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src); 5478e8d8bef9SDimitry Andric Ops.push_back(Unmerge.getReg(0)); 5479e8d8bef9SDimitry Andric Ops.push_back(Unmerge.getReg(1)); 5480e8d8bef9SDimitry Andric Ops.push_back(Unmerge.getReg(2)); 5481e8d8bef9SDimitry Andric }; 5482e8d8bef9SDimitry Andric 5483e8d8bef9SDimitry Andric packLanes(RayOrigin); 5484e8d8bef9SDimitry Andric if (IsA16) { 54850eae32dcSDimitry Andric auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir); 54860eae32dcSDimitry Andric auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir); 5487e8d8bef9SDimitry Andric Register R1 = MRI.createGenericVirtualRegister(S32); 5488e8d8bef9SDimitry Andric Register R2 = MRI.createGenericVirtualRegister(S32); 5489e8d8bef9SDimitry Andric Register R3 = MRI.createGenericVirtualRegister(S32); 5490e8d8bef9SDimitry Andric B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 5491e8d8bef9SDimitry Andric B.buildMerge(R2, 
{UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 549281ad6265SDimitry Andric B.buildMerge(R3, 549381ad6265SDimitry Andric {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 5494e8d8bef9SDimitry Andric Ops.push_back(R1); 5495e8d8bef9SDimitry Andric Ops.push_back(R2); 5496e8d8bef9SDimitry Andric Ops.push_back(R3); 5497e8d8bef9SDimitry Andric } else { 5498e8d8bef9SDimitry Andric packLanes(RayDir); 5499e8d8bef9SDimitry Andric packLanes(RayInvDir); 5500e8d8bef9SDimitry Andric } 550181ad6265SDimitry Andric } 5502e8d8bef9SDimitry Andric 5503349cc55cSDimitry Andric if (!UseNSA) { 5504349cc55cSDimitry Andric // Build a single vector containing all the operands so far prepared. 5505349cc55cSDimitry Andric LLT OpTy = LLT::fixed_vector(Ops.size(), 32); 5506349cc55cSDimitry Andric Register MergedOps = B.buildMerge(OpTy, Ops).getReg(0); 5507349cc55cSDimitry Andric Ops.clear(); 5508349cc55cSDimitry Andric Ops.push_back(MergedOps); 5509349cc55cSDimitry Andric } 5510349cc55cSDimitry Andric 5511e8d8bef9SDimitry Andric auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) 5512e8d8bef9SDimitry Andric .addDef(DstReg) 5513e8d8bef9SDimitry Andric .addImm(Opcode); 5514e8d8bef9SDimitry Andric 5515e8d8bef9SDimitry Andric for (Register R : Ops) { 5516e8d8bef9SDimitry Andric MIB.addUse(R); 5517e8d8bef9SDimitry Andric } 5518e8d8bef9SDimitry Andric 5519e8d8bef9SDimitry Andric MIB.addUse(TDescr) 5520e8d8bef9SDimitry Andric .addImm(IsA16 ? 
1 : 0) 5521e8d8bef9SDimitry Andric .cloneMemRefs(MI); 5522e8d8bef9SDimitry Andric 5523e8d8bef9SDimitry Andric MI.eraseFromParent(); 5524e8d8bef9SDimitry Andric return true; 5525e8d8bef9SDimitry Andric } 5526e8d8bef9SDimitry Andric 552781ad6265SDimitry Andric bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI, 552881ad6265SDimitry Andric MachineIRBuilder &B) const { 552981ad6265SDimitry Andric unsigned Opc; 553081ad6265SDimitry Andric int RoundMode = MI.getOperand(2).getImm(); 553181ad6265SDimitry Andric 553281ad6265SDimitry Andric if (RoundMode == (int)RoundingMode::TowardPositive) 553381ad6265SDimitry Andric Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD; 553481ad6265SDimitry Andric else if (RoundMode == (int)RoundingMode::TowardNegative) 553581ad6265SDimitry Andric Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD; 553681ad6265SDimitry Andric else 553781ad6265SDimitry Andric return false; 553881ad6265SDimitry Andric 553981ad6265SDimitry Andric B.buildInstr(Opc) 554081ad6265SDimitry Andric .addDef(MI.getOperand(0).getReg()) 554181ad6265SDimitry Andric .addUse(MI.getOperand(1).getReg()); 554281ad6265SDimitry Andric 554304eeddc0SDimitry Andric MI.eraseFromParent(); 554481ad6265SDimitry Andric 554504eeddc0SDimitry Andric return true; 554604eeddc0SDimitry Andric } 554704eeddc0SDimitry Andric 55485ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 55495ffd83dbSDimitry Andric MachineInstr &MI) const { 55505ffd83dbSDimitry Andric MachineIRBuilder &B = Helper.MIRBuilder; 55515ffd83dbSDimitry Andric MachineRegisterInfo &MRI = *B.getMRI(); 55525ffd83dbSDimitry Andric 55530b57cec5SDimitry Andric // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 
5554480093f4SDimitry Andric auto IntrID = MI.getIntrinsicID(); 5555480093f4SDimitry Andric switch (IntrID) { 5556480093f4SDimitry Andric case Intrinsic::amdgcn_if: 5557480093f4SDimitry Andric case Intrinsic::amdgcn_else: { 5558480093f4SDimitry Andric MachineInstr *Br = nullptr; 55595ffd83dbSDimitry Andric MachineBasicBlock *UncondBrTarget = nullptr; 5560e8d8bef9SDimitry Andric bool Negated = false; 5561e8d8bef9SDimitry Andric if (MachineInstr *BrCond = 5562e8d8bef9SDimitry Andric verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 55630b57cec5SDimitry Andric const SIRegisterInfo *TRI 55640b57cec5SDimitry Andric = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 55650b57cec5SDimitry Andric 55660b57cec5SDimitry Andric Register Def = MI.getOperand(1).getReg(); 55670b57cec5SDimitry Andric Register Use = MI.getOperand(3).getReg(); 5568480093f4SDimitry Andric 55695ffd83dbSDimitry Andric MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 5570e8d8bef9SDimitry Andric 5571e8d8bef9SDimitry Andric if (Negated) 5572e8d8bef9SDimitry Andric std::swap(CondBrTarget, UncondBrTarget); 5573e8d8bef9SDimitry Andric 55745ffd83dbSDimitry Andric B.setInsertPt(B.getMBB(), BrCond->getIterator()); 5575480093f4SDimitry Andric if (IntrID == Intrinsic::amdgcn_if) { 55760b57cec5SDimitry Andric B.buildInstr(AMDGPU::SI_IF) 55770b57cec5SDimitry Andric .addDef(Def) 55780b57cec5SDimitry Andric .addUse(Use) 55795ffd83dbSDimitry Andric .addMBB(UncondBrTarget); 5580480093f4SDimitry Andric } else { 5581480093f4SDimitry Andric B.buildInstr(AMDGPU::SI_ELSE) 5582480093f4SDimitry Andric .addDef(Def) 5583480093f4SDimitry Andric .addUse(Use) 5584e8d8bef9SDimitry Andric .addMBB(UncondBrTarget); 5585480093f4SDimitry Andric } 5586480093f4SDimitry Andric 55875ffd83dbSDimitry Andric if (Br) { 55885ffd83dbSDimitry Andric Br->getOperand(0).setMBB(CondBrTarget); 55895ffd83dbSDimitry Andric } else { 55905ffd83dbSDimitry Andric // The IRTranslator skips inserting the G_BR for 
fallthrough cases, but 55915ffd83dbSDimitry Andric // since we're swapping branch targets it needs to be reinserted. 55925ffd83dbSDimitry Andric // FIXME: IRTranslator should probably not do this 55935ffd83dbSDimitry Andric B.buildBr(*CondBrTarget); 55945ffd83dbSDimitry Andric } 55950b57cec5SDimitry Andric 55960b57cec5SDimitry Andric MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 55970b57cec5SDimitry Andric MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 55980b57cec5SDimitry Andric MI.eraseFromParent(); 55990b57cec5SDimitry Andric BrCond->eraseFromParent(); 56000b57cec5SDimitry Andric return true; 56010b57cec5SDimitry Andric } 56020b57cec5SDimitry Andric 56030b57cec5SDimitry Andric return false; 56040b57cec5SDimitry Andric } 56050b57cec5SDimitry Andric case Intrinsic::amdgcn_loop: { 5606480093f4SDimitry Andric MachineInstr *Br = nullptr; 56075ffd83dbSDimitry Andric MachineBasicBlock *UncondBrTarget = nullptr; 5608e8d8bef9SDimitry Andric bool Negated = false; 5609e8d8bef9SDimitry Andric if (MachineInstr *BrCond = 5610e8d8bef9SDimitry Andric verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 56110b57cec5SDimitry Andric const SIRegisterInfo *TRI 56120b57cec5SDimitry Andric = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 56130b57cec5SDimitry Andric 56145ffd83dbSDimitry Andric MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 56150b57cec5SDimitry Andric Register Reg = MI.getOperand(2).getReg(); 56165ffd83dbSDimitry Andric 5617e8d8bef9SDimitry Andric if (Negated) 5618e8d8bef9SDimitry Andric std::swap(CondBrTarget, UncondBrTarget); 5619e8d8bef9SDimitry Andric 56205ffd83dbSDimitry Andric B.setInsertPt(B.getMBB(), BrCond->getIterator()); 56210b57cec5SDimitry Andric B.buildInstr(AMDGPU::SI_LOOP) 56220b57cec5SDimitry Andric .addUse(Reg) 56235ffd83dbSDimitry Andric .addMBB(UncondBrTarget); 56245ffd83dbSDimitry Andric 56255ffd83dbSDimitry Andric if (Br) 56265ffd83dbSDimitry Andric Br->getOperand(0).setMBB(CondBrTarget); 
56275ffd83dbSDimitry Andric else 56285ffd83dbSDimitry Andric B.buildBr(*CondBrTarget); 56295ffd83dbSDimitry Andric 56300b57cec5SDimitry Andric MI.eraseFromParent(); 56310b57cec5SDimitry Andric BrCond->eraseFromParent(); 56320b57cec5SDimitry Andric MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 56330b57cec5SDimitry Andric return true; 56340b57cec5SDimitry Andric } 56350b57cec5SDimitry Andric 56360b57cec5SDimitry Andric return false; 56370b57cec5SDimitry Andric } 56380b57cec5SDimitry Andric case Intrinsic::amdgcn_kernarg_segment_ptr: 56395ffd83dbSDimitry Andric if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 56405ffd83dbSDimitry Andric // This only makes sense to call in a kernel, so just lower to null. 56415ffd83dbSDimitry Andric B.buildConstant(MI.getOperand(0).getReg(), 0); 56425ffd83dbSDimitry Andric MI.eraseFromParent(); 56435ffd83dbSDimitry Andric return true; 56445ffd83dbSDimitry Andric } 56455ffd83dbSDimitry Andric 56460b57cec5SDimitry Andric return legalizePreloadedArgIntrin( 56470b57cec5SDimitry Andric MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 56480b57cec5SDimitry Andric case Intrinsic::amdgcn_implicitarg_ptr: 56490b57cec5SDimitry Andric return legalizeImplicitArgPtr(MI, MRI, B); 56500b57cec5SDimitry Andric case Intrinsic::amdgcn_workitem_id_x: 565181ad6265SDimitry Andric return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0, 56520b57cec5SDimitry Andric AMDGPUFunctionArgInfo::WORKITEM_ID_X); 56530b57cec5SDimitry Andric case Intrinsic::amdgcn_workitem_id_y: 565481ad6265SDimitry Andric return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1, 56550b57cec5SDimitry Andric AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 56560b57cec5SDimitry Andric case Intrinsic::amdgcn_workitem_id_z: 565781ad6265SDimitry Andric return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2, 56580b57cec5SDimitry Andric AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 56590b57cec5SDimitry Andric case Intrinsic::amdgcn_workgroup_id_x: 56600b57cec5SDimitry Andric return 
legalizePreloadedArgIntrin(MI, MRI, B, 56610b57cec5SDimitry Andric AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 56620b57cec5SDimitry Andric case Intrinsic::amdgcn_workgroup_id_y: 56630b57cec5SDimitry Andric return legalizePreloadedArgIntrin(MI, MRI, B, 56640b57cec5SDimitry Andric AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 56650b57cec5SDimitry Andric case Intrinsic::amdgcn_workgroup_id_z: 56660b57cec5SDimitry Andric return legalizePreloadedArgIntrin(MI, MRI, B, 56670b57cec5SDimitry Andric AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 5668*fcaf7f86SDimitry Andric case Intrinsic::amdgcn_lds_kernel_id: 5669*fcaf7f86SDimitry Andric return legalizePreloadedArgIntrin(MI, MRI, B, 5670*fcaf7f86SDimitry Andric AMDGPUFunctionArgInfo::LDS_KERNEL_ID); 56710b57cec5SDimitry Andric case Intrinsic::amdgcn_dispatch_ptr: 56720b57cec5SDimitry Andric return legalizePreloadedArgIntrin(MI, MRI, B, 56730b57cec5SDimitry Andric AMDGPUFunctionArgInfo::DISPATCH_PTR); 56740b57cec5SDimitry Andric case Intrinsic::amdgcn_queue_ptr: 56750b57cec5SDimitry Andric return legalizePreloadedArgIntrin(MI, MRI, B, 56760b57cec5SDimitry Andric AMDGPUFunctionArgInfo::QUEUE_PTR); 56770b57cec5SDimitry Andric case Intrinsic::amdgcn_implicit_buffer_ptr: 56780b57cec5SDimitry Andric return legalizePreloadedArgIntrin( 56790b57cec5SDimitry Andric MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 56800b57cec5SDimitry Andric case Intrinsic::amdgcn_dispatch_id: 56810b57cec5SDimitry Andric return legalizePreloadedArgIntrin(MI, MRI, B, 56820b57cec5SDimitry Andric AMDGPUFunctionArgInfo::DISPATCH_ID); 568381ad6265SDimitry Andric case Intrinsic::r600_read_ngroups_x: 568481ad6265SDimitry Andric // TODO: Emit error for hsa 568581ad6265SDimitry Andric return legalizeKernargMemParameter(MI, B, 568681ad6265SDimitry Andric SI::KernelInputOffsets::NGROUPS_X); 568781ad6265SDimitry Andric case Intrinsic::r600_read_ngroups_y: 568881ad6265SDimitry Andric return legalizeKernargMemParameter(MI, B, 568981ad6265SDimitry Andric 
SI::KernelInputOffsets::NGROUPS_Y); 569081ad6265SDimitry Andric case Intrinsic::r600_read_ngroups_z: 569181ad6265SDimitry Andric return legalizeKernargMemParameter(MI, B, 569281ad6265SDimitry Andric SI::KernelInputOffsets::NGROUPS_Z); 569381ad6265SDimitry Andric case Intrinsic::r600_read_local_size_x: 569481ad6265SDimitry Andric // TODO: Could insert G_ASSERT_ZEXT from s16 569581ad6265SDimitry Andric return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X); 569681ad6265SDimitry Andric case Intrinsic::r600_read_local_size_y: 569781ad6265SDimitry Andric // TODO: Could insert G_ASSERT_ZEXT from s16 569881ad6265SDimitry Andric return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Y); 569981ad6265SDimitry Andric // TODO: Could insert G_ASSERT_ZEXT from s16 570081ad6265SDimitry Andric case Intrinsic::r600_read_local_size_z: 570181ad6265SDimitry Andric return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z); 570281ad6265SDimitry Andric case Intrinsic::r600_read_global_size_x: 570381ad6265SDimitry Andric return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X); 570481ad6265SDimitry Andric case Intrinsic::r600_read_global_size_y: 570581ad6265SDimitry Andric return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y); 570681ad6265SDimitry Andric case Intrinsic::r600_read_global_size_z: 570781ad6265SDimitry Andric return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z); 57088bcb0991SDimitry Andric case Intrinsic::amdgcn_fdiv_fast: 57098bcb0991SDimitry Andric return legalizeFDIVFastIntrin(MI, MRI, B); 57108bcb0991SDimitry Andric case Intrinsic::amdgcn_is_shared: 57118bcb0991SDimitry Andric return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 57128bcb0991SDimitry Andric case Intrinsic::amdgcn_is_private: 57138bcb0991SDimitry Andric return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 57148bcb0991SDimitry Andric 
case Intrinsic::amdgcn_wavefrontsize: { 57158bcb0991SDimitry Andric B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 57168bcb0991SDimitry Andric MI.eraseFromParent(); 57178bcb0991SDimitry Andric return true; 57188bcb0991SDimitry Andric } 57195ffd83dbSDimitry Andric case Intrinsic::amdgcn_s_buffer_load: 5720e8d8bef9SDimitry Andric return legalizeSBufferLoad(Helper, MI); 57218bcb0991SDimitry Andric case Intrinsic::amdgcn_raw_buffer_store: 57225ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_store: 57235ffd83dbSDimitry Andric return legalizeBufferStore(MI, MRI, B, false, false); 57248bcb0991SDimitry Andric case Intrinsic::amdgcn_raw_buffer_store_format: 57255ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_store_format: 57265ffd83dbSDimitry Andric return legalizeBufferStore(MI, MRI, B, false, true); 57275ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_tbuffer_store: 57285ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_tbuffer_store: 57295ffd83dbSDimitry Andric return legalizeBufferStore(MI, MRI, B, true, true); 57305ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_load: 57315ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_load: 57325ffd83dbSDimitry Andric return legalizeBufferLoad(MI, MRI, B, false, false); 57335ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_load_format: 57345ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_load_format: 57355ffd83dbSDimitry Andric return legalizeBufferLoad(MI, MRI, B, true, false); 57365ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_tbuffer_load: 57375ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_tbuffer_load: 57385ffd83dbSDimitry Andric return legalizeBufferLoad(MI, MRI, B, true, true); 57395ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_swap: 57405ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_swap: 57415ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_add: 57425ffd83dbSDimitry Andric case 
Intrinsic::amdgcn_struct_buffer_atomic_add: 57435ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_sub: 57445ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_sub: 57455ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_smin: 57465ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_smin: 57475ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_umin: 57485ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_umin: 57495ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_smax: 57505ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_smax: 57515ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_umax: 57525ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_umax: 57535ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_and: 57545ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_and: 57555ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_or: 57565ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_or: 57575ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_xor: 57585ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_xor: 57595ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_inc: 57605ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_inc: 57615ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_dec: 57625ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_dec: 57635ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 57645ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 5765fe6060f1SDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_fmin: 5766fe6060f1SDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_fmin: 5767fe6060f1SDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_fmax: 5768fe6060f1SDimitry Andric case 
Intrinsic::amdgcn_struct_buffer_atomic_fmax: 57695ffd83dbSDimitry Andric return legalizeBufferAtomic(MI, B, IntrID); 577004eeddc0SDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 577104eeddc0SDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_fadd: { 577204eeddc0SDimitry Andric Register DstReg = MI.getOperand(0).getReg(); 577381ad6265SDimitry Andric if (!MRI.use_empty(DstReg) && 577481ad6265SDimitry Andric !AMDGPU::hasAtomicFaddRtnForTy(ST, MRI.getType(DstReg))) { 577504eeddc0SDimitry Andric Function &F = B.getMF().getFunction(); 577604eeddc0SDimitry Andric DiagnosticInfoUnsupported NoFpRet( 577704eeddc0SDimitry Andric F, "return versions of fp atomics not supported", B.getDebugLoc(), 577804eeddc0SDimitry Andric DS_Error); 577904eeddc0SDimitry Andric F.getContext().diagnose(NoFpRet); 578004eeddc0SDimitry Andric B.buildUndef(DstReg); 578104eeddc0SDimitry Andric MI.eraseFromParent(); 578204eeddc0SDimitry Andric return true; 578304eeddc0SDimitry Andric } 578404eeddc0SDimitry Andric 578504eeddc0SDimitry Andric return legalizeBufferAtomic(MI, B, IntrID); 578604eeddc0SDimitry Andric } 57875ffd83dbSDimitry Andric case Intrinsic::amdgcn_atomic_inc: 57885ffd83dbSDimitry Andric return legalizeAtomicIncDec(MI, B, true); 57895ffd83dbSDimitry Andric case Intrinsic::amdgcn_atomic_dec: 57905ffd83dbSDimitry Andric return legalizeAtomicIncDec(MI, B, false); 57915ffd83dbSDimitry Andric case Intrinsic::trap: 57925ffd83dbSDimitry Andric return legalizeTrapIntrinsic(MI, MRI, B); 57935ffd83dbSDimitry Andric case Intrinsic::debugtrap: 57945ffd83dbSDimitry Andric return legalizeDebugTrapIntrinsic(MI, MRI, B); 5795e8d8bef9SDimitry Andric case Intrinsic::amdgcn_rsq_clamp: 5796e8d8bef9SDimitry Andric return legalizeRsqClampIntrinsic(MI, MRI, B); 5797e8d8bef9SDimitry Andric case Intrinsic::amdgcn_ds_fadd: 5798e8d8bef9SDimitry Andric case Intrinsic::amdgcn_ds_fmin: 5799e8d8bef9SDimitry Andric case Intrinsic::amdgcn_ds_fmax: 5800e8d8bef9SDimitry Andric return 
legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID); 5801e8d8bef9SDimitry Andric case Intrinsic::amdgcn_image_bvh_intersect_ray: 5802e8d8bef9SDimitry Andric return legalizeBVHIntrinsic(MI, B); 58035ffd83dbSDimitry Andric default: { 58045ffd83dbSDimitry Andric if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 58055ffd83dbSDimitry Andric AMDGPU::getImageDimIntrinsicInfo(IntrID)) 58065ffd83dbSDimitry Andric return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr); 58070b57cec5SDimitry Andric return true; 58080b57cec5SDimitry Andric } 58095ffd83dbSDimitry Andric } 58100b57cec5SDimitry Andric 58110b57cec5SDimitry Andric return true; 58120b57cec5SDimitry Andric } 5813