//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;
// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
  "amdgpu-global-isel-new-legality",
  cl::desc("Use GlobalISel desired legality, rather than try to use "
           "rules compatible with selection patterns"),
  cl::init(false),
  cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 <<  Log2_32_Ceil(NElts);
  return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 <<  Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

/// \returns true if this is an odd sized vector which should widen by adding
/// an additional element. This is mostly to handle <3 x s16> -> <4 x s16>.
/// This excludes s1 vectors, which should always be scalarized.
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    if (!Ty.isVector())
      return false;

    const LLT EltTy = Ty.getElementType();
    const unsigned EltSize = EltTy.getSizeInBits();
    return Ty.getNumElements() % 2 != 0 &&
           EltSize > 1 && EltSize < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.getSizeInBits() % 32 == 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::pair(TypeIdx,
                     LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
  };
}
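// Break a wide vector into pieces of at most 64 bits each; e.g. both
// <3 x s32> and <4 x s32> are reduced to <2 x s32> by this mutation.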
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::pair(TypeIdx, LLT::scalarOrVector(
                                  ElementCount::getFixed(NewNumElts), EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
  };
}

static LLT getBitcastRegisterType(const LLT Ty) {
  const unsigned Size = Ty.getSizeInBits();

  if (Size <= 32) {
    // <2 x s8> -> s16
    // <4 x s8> -> s32
    return LLT::scalar(Size);
  }

  return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return std::pair(TypeIdx, getBitcastRegisterType(Ty));
  };
}
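// Bitcast to a vector of 32-bit elements with the same total size; e.g.
// <4 x s16> becomes <2 x s32>. The size must already be a multiple of 32.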
static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    assert(Size % 32 == 0);
    return std::pair(
        TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}
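// For example, s32, s64, v2s16 and v4s32 are register types under these
// rules, while s48 or <3 x s8> are not.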
// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

// If we have a truncating store or an extending load with a data size larger
// than 32-bits, we need to reduce to a 32-bit type.
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
  };
}
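// maxSizeForAddrSpace() below caps the memory access size per address space;
// e.g. without flat scratch a private access is limited to 32 bits, while a
// global load may be up to 512 bits wide.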
// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return ST.enableFlatScratch() ? 128 : 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if
    // they may alias scratch depending on the subtarget.
    return 128;
  }
}

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
  uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // Do not handle extending vector loads.
  if (Ty.isVector() && MemSize != RegSize)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;
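  // Legal raw memory sizes here are 8, 16, 32, 64 and 128 bits; 96 bits
  // needs dwordx3 support, and 256/512-bit accesses may be split up later.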
  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (AlignBits < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
                                                 Align(AlignBits / 8)))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this
// for now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;

  LLT EltTy = Ty.getElementType();
  if (EltTy.isPointer())
    return true;

  unsigned EltSize = EltTy.getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST,
                             const LegalityQuery &Query) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
         !loadStoreBitcastWorkaround(Ty);
}
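// With the bitcast workaround enabled (the default), e.g. a <8 x s16> load
// is not reported as directly legal; it is instead handled by bitcasting to
// <4 x s32> via shouldBitcastLoadStoreType() below.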
/// Return true if a load or store of the type should be lowered with a bitcast
/// to a different type.
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
                                       const LLT MemTy) {
  const unsigned MemSizeInBits = MemTy.getSizeInBits();
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
    return true;

  // Don't try to handle bitcasting vector ext loads for now.
  return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
         (Size <= 32 || isRegisterSize(Size)) &&
         !isRegisterVectorElementType(Ty.getElementType());
}

/// Return true if we should legalize a load by widening an odd sized memory
/// access up to the alignment. Note this is a case where the memory access
/// itself changes, not the size of the result register.
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
                            uint64_t AlignInBits, unsigned AddrSpace,
                            unsigned Opcode) {
  unsigned SizeInBits = MemoryTy.getSizeInBits();
  // We don't want to widen cases that are naturally legal.
  if (isPowerOf2_32(SizeInBits))
    return false;

  // If we have 96-bit memory operations, we shouldn't touch them. Note we may
  // end up widening these for a scalar load during RegBankSelect, since there
  // aren't 96-bit scalar loads.
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
    return false;

  if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
    return false;

  // A load is known dereferenceable up to the alignment, so it's legal to
  // widen to it.
  //
  // TODO: Could check dereferenceable for less aligned cases.
  unsigned RoundedSize = NextPowerOf2(SizeInBits);
  if (AlignInBits < RoundedSize)
    return false;
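  // For example, an odd-sized s24 load with 4-byte alignment is rounded up
  // to a full s32 load here, subject to the fast-access check below.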
  // Do not widen if it would introduce a slow unaligned load.
  const SITargetLowering *TLI = ST.getTargetLowering();
  unsigned Fast = 0;
  return TLI->allowsMisalignedMemoryAccessesImpl(
             RoundedSize, AddrSpace, Align(AlignInBits / 8),
             MachineMemOperand::MOLoad, &Fast) &&
         Fast;
}

static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
                            unsigned Opcode) {
  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
    return false;

  return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
                         Query.MMODescrs[0].AlignInBits,
                         Query.Types[1].getAddressSpace(), Opcode);
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
  :  ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);

  const LLT V2S8 = LLT::fixed_vector(2, 8);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  const LLT V4S16 = LLT::fixed_vector(4, 16);

  const LLT V2S32 = LLT::fixed_vector(2, 32);
  const LLT V3S32 = LLT::fixed_vector(3, 32);
  const LLT V4S32 = LLT::fixed_vector(4, 32);
  const LLT V5S32 = LLT::fixed_vector(5, 32);
  const LLT V6S32 = LLT::fixed_vector(6, 32);
  const LLT V7S32 = LLT::fixed_vector(7, 32);
  const LLT V8S32 = LLT::fixed_vector(8, 32);
  const LLT V9S32 = LLT::fixed_vector(9, 32);
  const LLT V10S32 = LLT::fixed_vector(10, 32);
  const LLT V11S32 = LLT::fixed_vector(11, 32);
  const LLT V12S32 = LLT::fixed_vector(12, 32);
  const LLT V13S32 = LLT::fixed_vector(13, 32);
  const LLT V14S32 = LLT::fixed_vector(14, 32);
  const LLT V15S32 = LLT::fixed_vector(15, 32);
  const LLT V16S32 = LLT::fixed_vector(16, 32);
  const LLT V32S32 = LLT::fixed_vector(32, 32);
  const LLT V2S64 = LLT::fixed_vector(2, 64);
  const LLT V3S64 = LLT::fixed_vector(3, 64);
  const LLT V4S64 = LLT::fixed_vector(4, 64);
  const LLT V5S64 = LLT::fixed_vector(5, 64);
  const LLT V6S64 = LLT::fixed_vector(6, 64);
  const LLT V7S64 = LLT::fixed_vector(7, 64);
  const LLT V8S64 = LLT::fixed_vector(8, 64);
  const LLT V16S64 = LLT::fixed_vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
  // s1 for VCC branches, s32 for SCC branches.
  getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .legalIf(isPointer(0))
    .clampScalar(0, S16, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .scalarize(0);

  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
    // Full set of gfx9 features.
    getActionDefinitionsBuilder({G_ADD, G_SUB})
      .legalFor({S32, S16, V2S16})
      .clampMaxNumElementsStrict(0, S16, 2)
      .scalarize(0)
      .minScalar(0, S16)
      .widenScalarToNextMultipleOf(0, 32)
      .maxScalar(0, S32);

    getActionDefinitionsBuilder(G_MUL)
      .legalFor({S32, S16, V2S16})
      .clampMaxNumElementsStrict(0, S16, 2)
      .scalarize(0)
      .minScalar(0, S16)
      .widenScalarToNextMultipleOf(0, 32)
      .custom();
    assert(ST.hasMad64_32());

    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
      .legalFor({S32, S16, V2S16}) // Clamp modifier
      .minScalarOrElt(0, S16)
      .clampMaxNumElementsStrict(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32)
      .lower();
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB})
      .legalFor({S32, S16})
      .minScalar(0, S16)
      .widenScalarToNextMultipleOf(0, 32)
      .maxScalar(0, S32)
      .scalarize(0);

    getActionDefinitionsBuilder(G_MUL)
      .legalFor({S32, S16})
      .scalarize(0)
      .minScalar(0, S16)
      .widenScalarToNextMultipleOf(0, 32)
      .custom();
    assert(ST.hasMad64_32());
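    // Wide multiplies are custom-lowered out of 32-bit pieces; the
    // hasMad64_32() assert above guards that path.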
    // Technically the saturating operations require clamp bit support, but
    // this was introduced at the same time as 16-bit operations.
    getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
      .legalFor({S32, S16}) // Clamp modifier
      .minScalar(0, S16)
      .scalarize(0)
      .widenScalarToNextPow2(0, 16)
      .lower();

    // We're just lowering this, but it helps get a better result to try to
    // coerce to the desired type first.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S16)
      .scalarize(0)
      .lower();
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB})
      .legalFor({S32})
      .widenScalarToNextMultipleOf(0, 32)
      .clampScalar(0, S32, S32)
      .scalarize(0);

    auto &Mul = getActionDefinitionsBuilder(G_MUL)
      .legalFor({S32})
      .scalarize(0)
      .minScalar(0, S32)
      .widenScalarToNextMultipleOf(0, 32);

    if (ST.hasMad64_32())
      Mul.custom();
    else
      Mul.maxScalar(0, S32);

    if (ST.hasIntClamp()) {
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .legalFor({S32}) // Clamp modifier.
        .scalarize(0)
        .minScalarOrElt(0, S32)
        .lower();
    } else {
      // Clamp bit support was added in VI, along with 16-bit operations.
      getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
        .minScalar(0, S32)
        .scalarize(0)
        .lower();
    }
    // FIXME: DAG expansion gets better results. The widening uses the smaller
    // range values and goes for the min/max lowering directly.
    getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
      .minScalar(0, S32)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(
      {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .maxScalar(0, S32);

  if (ST.hasVOP3PInsts()) {
    Mulh
      .clampMaxNumElements(0, S8, 2)
      .lowerFor({V2S8});
  }

  Mulh
    .scalarize(0)
    .lower();

  // Report legal for any types we can handle anywhere. For the cases only
  // legal on the SALU, RegBankSelect will be able to re-legalize.
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder(
      {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();
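  // For example, bitcasting <2 x s16> to s32 (or back) is legal here, since
  // both sides are register types of the same size.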
  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalIf(isPointer(0))
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
    .legalIf(isRegisterType(0))
    // s1 and s16 are special cases because they have legal operations on
    // them, but don't really occupy registers in the normal way.
    .legalFor({S1, S16})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, MaxScalar)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});

  // If the amount is divergent, we have to do a wave reduction to get the
  // maximum value, so this is expanded during RegBankSelect.
  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
    .legalFor({{PrivatePtr, S32}});

  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .customIf(typeIsNot(0, PrivatePtr));

  getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});

  auto &FPOpActions = getActionDefinitionsBuilder(
    {G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
     G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElementsStrict(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
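  // Note: the custom sin/cos action pre-scales the input to the hardware's
  // fractional period convention before emitting the native operations.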
  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElementsStrict(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
    .scalarize(0);

  auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
  if (ST.has16BitInsts()) {
    FSubActions
      // Use actual fsub instruction
      .legalFor({S32, S16})
      // Must use fadd + fneg
      .lowerFor({S64, V2S16});
  } else {
    FSubActions
      // Use actual fsub instruction
      .legalFor({S32})
      // Must use fadd + fneg
      .lowerFor({S64, S16, V2S16});
  }

  FSubActions
    .scalarize(0)
    .clampScalar(0, S32, S64);
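  // For example, an s64 G_FSUB is always lowered to G_FADD + G_FNEG, per the
  // lowerFor() lists above.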
  // Whether this is legal depends on the floating point mode for the
  // function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});
  FMad.scalarize(0)
      .lower();

  auto &FRem = getActionDefinitionsBuilder(G_FREM);
  if (ST.has16BitInsts()) {
    FRem.customFor({S16, S32, S64});
  } else {
    FRem.minScalar(0, S32)
        .customFor({S32, S64});
  }
  FRem.scalarize(0);
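  // The custom G_FREM lowering computes roughly x - y * trunc(x / y), since
  // there is no native frem instruction.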
  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In
    // unresolvable situations (like an invalid implicit use), we don't want
    // to infinite loop in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S32, S64}, {S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .minScalar(0, S32)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S32}, {S64, S64}})
    .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .widenScalarToNextPow2(0, 32)
       .scalarize(0)
       .lower();

  getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
    .customFor({S16, S32})
    .scalarize(0)
    .lower();

  // Lower roundeven into G_FRINT
  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalIf(all(isPointer(0), sameSize(0, 1)))
    .scalarize(0)
    .scalarSameSizeAs(1, 0);

  getActionDefinitionsBuilder(G_PTRMASK)
    .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
    .scalarSameSizeAs(1, 0)
    .scalarize(0);
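  // Note a G_PTR_ADD offset must match the pointer width; the
  // scalarSameSizeAs(1, 0) above rewrites the offset to the pointer's width.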
  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the
    // output, so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as
    // that is the allocatable register type that will be needed for the copy
    // from scc. This will be promoted during RegBankSelect, and we assume
    // something before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);
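  // The custom exp/log/pow lowering builds on the native exp2/log2
  // instructions, e.g. pow(x, y) is expanded as exp2(y * log2(x)).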
  getActionDefinitionsBuilder(G_FPOWI)
    .clampScalar(0, MinScalarFPTy, S32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .widenScalarToNextPow2(1, 32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32);

  // If no 16-bit instruction is available, lower into different instructions.
  if (ST.has16BitInsts())
    getActionDefinitionsBuilder(G_IS_FPCLASS)
      .legalForCartesianProduct({S1}, FPTypes16)
      .widenScalarToNextPow2(1)
      .scalarize(0)
      .lower();
  else
    getActionDefinitionsBuilder(G_IS_FPCLASS)
      .legalForCartesianProduct({S1}, FPTypesBase)
      .lowerFor({S1, S16})
      .widenScalarToNextPow2(1)
      .scalarize(0)
      .lower();

  // The hardware instructions return a different result on 0 than the
  // generic instructions expect. The hardware produces -1, but these produce
  // the bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .custom();
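  // e.g. the custom G_CTLZ action wraps the hardware count instruction,
  // clamping its -1 result for a zero input to the expected bit width.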
10295ffd83dbSDimitry Andric .widenScalarToNextPow2(0) 10305ffd83dbSDimitry Andric .clampScalar(0, S16, S32) 10315ffd83dbSDimitry Andric .scalarize(0); 10325ffd83dbSDimitry Andric 10330b57cec5SDimitry Andric if (ST.hasVOP3PInsts()) { 1034fe6060f1SDimitry Andric getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 10350b57cec5SDimitry Andric .legalFor({S32, S16, V2S16}) 10360b57cec5SDimitry Andric .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 10370b57cec5SDimitry Andric .clampMaxNumElements(0, S16, 2) 10385ffd83dbSDimitry Andric .minScalar(0, S16) 10390b57cec5SDimitry Andric .widenScalarToNextPow2(0) 10405ffd83dbSDimitry Andric .scalarize(0) 10415ffd83dbSDimitry Andric .lower(); 10420b57cec5SDimitry Andric } else { 1043fe6060f1SDimitry Andric getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 10440b57cec5SDimitry Andric .legalFor({S32, S16}) 10450b57cec5SDimitry Andric .widenScalarToNextPow2(0) 10465ffd83dbSDimitry Andric .minScalar(0, S16) 10475ffd83dbSDimitry Andric .scalarize(0) 10485ffd83dbSDimitry Andric .lower(); 10490b57cec5SDimitry Andric } 10500b57cec5SDimitry Andric } else { 10515ffd83dbSDimitry Andric // TODO: Should have same legality without v_perm_b32 10525ffd83dbSDimitry Andric getActionDefinitionsBuilder(G_BSWAP) 10535ffd83dbSDimitry Andric .legalFor({S32}) 10545ffd83dbSDimitry Andric .lowerIf(scalarNarrowerThan(0, 32)) 10555ffd83dbSDimitry Andric // FIXME: Fixing non-power-of-2 before clamp is workaround for 10565ffd83dbSDimitry Andric // narrowScalar limitation. 10575ffd83dbSDimitry Andric .widenScalarToNextPow2(0) 10585ffd83dbSDimitry Andric .maxScalar(0, S32) 10595ffd83dbSDimitry Andric .scalarize(0) 10605ffd83dbSDimitry Andric .lower(); 10615ffd83dbSDimitry Andric 1062fe6060f1SDimitry Andric getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS}) 10630b57cec5SDimitry Andric .legalFor({S32}) 10645ffd83dbSDimitry Andric .minScalar(0, S32) 10650b57cec5SDimitry Andric .widenScalarToNextPow2(0) 10665ffd83dbSDimitry Andric .scalarize(0) 10675ffd83dbSDimitry Andric .lower(); 10680b57cec5SDimitry Andric } 10690b57cec5SDimitry Andric 10700b57cec5SDimitry Andric getActionDefinitionsBuilder(G_INTTOPTR) 10710b57cec5SDimitry Andric // List the common cases 10720b57cec5SDimitry Andric .legalForCartesianProduct(AddrSpaces64, {S64}) 10730b57cec5SDimitry Andric .legalForCartesianProduct(AddrSpaces32, {S32}) 10740b57cec5SDimitry Andric .scalarize(0) 10750b57cec5SDimitry Andric // Accept any address space as long as the size matches 10760b57cec5SDimitry Andric .legalIf(sameSize(0, 1)) 10770b57cec5SDimitry Andric .widenScalarIf(smallerThan(1, 0), 10780b57cec5SDimitry Andric [](const LegalityQuery &Query) { 1079*bdd1243dSDimitry Andric return std::pair( 1080*bdd1243dSDimitry Andric 1, LLT::scalar(Query.Types[0].getSizeInBits())); 10810b57cec5SDimitry Andric }) 1082*bdd1243dSDimitry Andric .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) { 1083*bdd1243dSDimitry Andric return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 10840b57cec5SDimitry Andric }); 10850b57cec5SDimitry Andric 10860b57cec5SDimitry Andric getActionDefinitionsBuilder(G_PTRTOINT) 10870b57cec5SDimitry Andric // List the common cases 10880b57cec5SDimitry Andric .legalForCartesianProduct(AddrSpaces64, {S64}) 10890b57cec5SDimitry Andric .legalForCartesianProduct(AddrSpaces32, {S32}) 10900b57cec5SDimitry Andric .scalarize(0) 10910b57cec5SDimitry Andric // Accept any address space as long as the size matches 10920b57cec5SDimitry Andric 
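    // Mismatched sizes are mutated toward the pointer width rather than
    // rejected (sketch of the intent; the exact extension emitted afterwards
    // is an assumption here): converting a 32-bit LDS pointer to s64 first
    // becomes a ptrtoint to s32 via narrowScalarIf(largerThan(0, 1)) below,
    // with the upper half filled in by a separate extension.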
.legalIf(sameSize(0, 1)) 10930b57cec5SDimitry Andric .widenScalarIf(smallerThan(0, 1), 10940b57cec5SDimitry Andric [](const LegalityQuery &Query) { 1095*bdd1243dSDimitry Andric return std::pair( 1096*bdd1243dSDimitry Andric 0, LLT::scalar(Query.Types[1].getSizeInBits())); 10970b57cec5SDimitry Andric }) 1098*bdd1243dSDimitry Andric .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) { 1099*bdd1243dSDimitry Andric return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 11000b57cec5SDimitry Andric }); 11010b57cec5SDimitry Andric 11020b57cec5SDimitry Andric getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 11030b57cec5SDimitry Andric .scalarize(0) 11040b57cec5SDimitry Andric .custom(); 11050b57cec5SDimitry Andric 11065ffd83dbSDimitry Andric const auto needToSplitMemOp = [=](const LegalityQuery &Query, 11075ffd83dbSDimitry Andric bool IsLoad) -> bool { 11088bcb0991SDimitry Andric const LLT DstTy = Query.Types[0]; 11098bcb0991SDimitry Andric 11108bcb0991SDimitry Andric // Split vector extloads. 1111fe6060f1SDimitry Andric unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1112480093f4SDimitry Andric 11138bcb0991SDimitry Andric if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 11148bcb0991SDimitry Andric return true; 11158bcb0991SDimitry Andric 11168bcb0991SDimitry Andric const LLT PtrTy = Query.Types[1]; 11178bcb0991SDimitry Andric unsigned AS = PtrTy.getAddressSpace(); 11185ffd83dbSDimitry Andric if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 11198bcb0991SDimitry Andric return true; 11208bcb0991SDimitry Andric 11218bcb0991SDimitry Andric // Catch weird sized loads that don't evenly divide into the access sizes 11228bcb0991SDimitry Andric // TODO: May be able to widen depending on alignment etc. 11235ffd83dbSDimitry Andric unsigned NumRegs = (MemSize + 31) / 32; 11245ffd83dbSDimitry Andric if (NumRegs == 3) { 11255ffd83dbSDimitry Andric if (!ST.hasDwordx3LoadStores()) 11268bcb0991SDimitry Andric return true; 11275ffd83dbSDimitry Andric } else { 11285ffd83dbSDimitry Andric // If the alignment allows, these should have been widened. 11295ffd83dbSDimitry Andric if (!isPowerOf2_32(NumRegs)) 11305ffd83dbSDimitry Andric return true; 11315ffd83dbSDimitry Andric } 11328bcb0991SDimitry Andric 11338bcb0991SDimitry Andric return false; 11348bcb0991SDimitry Andric }; 11358bcb0991SDimitry Andric 1136e8d8bef9SDimitry Andric unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32; 1137e8d8bef9SDimitry Andric unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16; 1138e8d8bef9SDimitry Andric unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8; 11398bcb0991SDimitry Andric 11408bcb0991SDimitry Andric // TODO: Refine based on subtargets which support unaligned access or 128-bit 11418bcb0991SDimitry Andric // LDS 11428bcb0991SDimitry Andric // TODO: Unsupported flat for SI. 11438bcb0991SDimitry Andric 11448bcb0991SDimitry Andric for (unsigned Op : {G_LOAD, G_STORE}) { 11458bcb0991SDimitry Andric const bool IsStore = Op == G_STORE; 11468bcb0991SDimitry Andric 11478bcb0991SDimitry Andric auto &Actions = getActionDefinitionsBuilder(Op); 11485ffd83dbSDimitry Andric // Explicitly list some common cases. 11495ffd83dbSDimitry Andric // TODO: Does this help compile time at all? 
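    // Each row below reads {result type, pointer type, memory type, minimum
    // alignment in bits}. A hedged sketch of how one row is exercised
    // (PtrInfo and GlobalP are hypothetical locals):
    //   MachineMemOperand *MMO = MF.getMachineMemOperand(
    //       PtrInfo, MachineMemOperand::MOLoad, LLT::scalar(8), Align(1));
    //   B.buildLoad(LLT::scalar(32), GlobalP, *MMO); // {S32, GlobalPtr, S8, ...}
    // i.e. an extending byte load from the global address space.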
1150fe6060f1SDimitry Andric Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32}, 1151fe6060f1SDimitry Andric {V2S32, GlobalPtr, V2S32, GlobalAlign32}, 1152fe6060f1SDimitry Andric {V4S32, GlobalPtr, V4S32, GlobalAlign32}, 1153fe6060f1SDimitry Andric {S64, GlobalPtr, S64, GlobalAlign32}, 1154fe6060f1SDimitry Andric {V2S64, GlobalPtr, V2S64, GlobalAlign32}, 1155fe6060f1SDimitry Andric {V2S16, GlobalPtr, V2S16, GlobalAlign32}, 1156fe6060f1SDimitry Andric {S32, GlobalPtr, S8, GlobalAlign8}, 1157fe6060f1SDimitry Andric {S32, GlobalPtr, S16, GlobalAlign16}, 11588bcb0991SDimitry Andric 1159fe6060f1SDimitry Andric {S32, LocalPtr, S32, 32}, 1160fe6060f1SDimitry Andric {S64, LocalPtr, S64, 32}, 1161fe6060f1SDimitry Andric {V2S32, LocalPtr, V2S32, 32}, 1162fe6060f1SDimitry Andric {S32, LocalPtr, S8, 8}, 1163fe6060f1SDimitry Andric {S32, LocalPtr, S16, 16}, 1164fe6060f1SDimitry Andric {V2S16, LocalPtr, S32, 32}, 11658bcb0991SDimitry Andric 1166fe6060f1SDimitry Andric {S32, PrivatePtr, S32, 32}, 1167fe6060f1SDimitry Andric {S32, PrivatePtr, S8, 8}, 1168fe6060f1SDimitry Andric {S32, PrivatePtr, S16, 16}, 1169fe6060f1SDimitry Andric {V2S16, PrivatePtr, S32, 32}, 11708bcb0991SDimitry Andric 1171fe6060f1SDimitry Andric {S32, ConstantPtr, S32, GlobalAlign32}, 1172fe6060f1SDimitry Andric {V2S32, ConstantPtr, V2S32, GlobalAlign32}, 1173fe6060f1SDimitry Andric {V4S32, ConstantPtr, V4S32, GlobalAlign32}, 1174fe6060f1SDimitry Andric {S64, ConstantPtr, S64, GlobalAlign32}, 1175fe6060f1SDimitry Andric {V2S32, ConstantPtr, V2S32, GlobalAlign32}}); 11765ffd83dbSDimitry Andric Actions.legalIf( 11775ffd83dbSDimitry Andric [=](const LegalityQuery &Query) -> bool { 1178fe6060f1SDimitry Andric return isLoadStoreLegal(ST, Query); 11795ffd83dbSDimitry Andric }); 11805ffd83dbSDimitry Andric 11815ffd83dbSDimitry Andric // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 11825ffd83dbSDimitry Andric // 64-bits. 11835ffd83dbSDimitry Andric // 11845ffd83dbSDimitry Andric // TODO: Should generalize bitcast action into coerce, which will also cover 11855ffd83dbSDimitry Andric // inserting addrspacecasts. 11865ffd83dbSDimitry Andric Actions.customIf(typeIs(1, Constant32Ptr)); 11875ffd83dbSDimitry Andric 11885ffd83dbSDimitry Andric // Turn any illegal element vectors into something easier to deal 11895ffd83dbSDimitry Andric // with. These will ultimately produce 32-bit scalar shifts to extract the 11905ffd83dbSDimitry Andric // parts anyway. 11915ffd83dbSDimitry Andric // 11925ffd83dbSDimitry Andric // For odd 16-bit element vectors, prefer to split those into pieces with 11935ffd83dbSDimitry Andric // 16-bit vector parts. 11945ffd83dbSDimitry Andric Actions.bitcastIf( 11955ffd83dbSDimitry Andric [=](const LegalityQuery &Query) -> bool { 1196e8d8bef9SDimitry Andric return shouldBitcastLoadStoreType(ST, Query.Types[0], 1197fe6060f1SDimitry Andric Query.MMODescrs[0].MemoryTy); 11985ffd83dbSDimitry Andric }, bitcastToRegisterType(0)); 11995ffd83dbSDimitry Andric 1200e8d8bef9SDimitry Andric if (!IsStore) { 1201e8d8bef9SDimitry Andric // Widen suitably aligned loads by loading extra bytes. The standard 1202e8d8bef9SDimitry Andric // legalization actions can't properly express widening memory operands. 
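    // Example of the case handled by the custom action below (illustrative
    // numbers): an s24 load that is 4-byte aligned can be widened to a plain
    // s32 load plus a truncate, since the extra byte cannot cross the aligned
    // boundary; roughly
    //   auto Wide = B.buildLoad(S32, Ptr, *WideMMO); // WideMMO is hypothetical
    //   B.buildTrunc(Dst, Wide);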
1203e8d8bef9SDimitry Andric Actions.customIf([=](const LegalityQuery &Query) -> bool { 1204e8d8bef9SDimitry Andric return shouldWidenLoad(ST, Query, G_LOAD); 1205e8d8bef9SDimitry Andric }); 1206e8d8bef9SDimitry Andric } 1207e8d8bef9SDimitry Andric 1208e8d8bef9SDimitry Andric // FIXME: load/store narrowing should be moved to lower action 12098bcb0991SDimitry Andric Actions 12108bcb0991SDimitry Andric .narrowScalarIf( 12118bcb0991SDimitry Andric [=](const LegalityQuery &Query) -> bool { 12125ffd83dbSDimitry Andric return !Query.Types[0].isVector() && 12135ffd83dbSDimitry Andric needToSplitMemOp(Query, Op == G_LOAD); 12148bcb0991SDimitry Andric }, 12158bcb0991SDimitry Andric [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 12168bcb0991SDimitry Andric const LLT DstTy = Query.Types[0]; 12178bcb0991SDimitry Andric const LLT PtrTy = Query.Types[1]; 12188bcb0991SDimitry Andric 12198bcb0991SDimitry Andric const unsigned DstSize = DstTy.getSizeInBits(); 1220fe6060f1SDimitry Andric unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 12218bcb0991SDimitry Andric 12228bcb0991SDimitry Andric // Split extloads. 12238bcb0991SDimitry Andric if (DstSize > MemSize) 1224*bdd1243dSDimitry Andric return std::pair(0, LLT::scalar(MemSize)); 12258bcb0991SDimitry Andric 12265ffd83dbSDimitry Andric unsigned MaxSize = maxSizeForAddrSpace(ST, 12275ffd83dbSDimitry Andric PtrTy.getAddressSpace(), 12285ffd83dbSDimitry Andric Op == G_LOAD); 12298bcb0991SDimitry Andric if (MemSize > MaxSize) 1230*bdd1243dSDimitry Andric return std::pair(0, LLT::scalar(MaxSize)); 12318bcb0991SDimitry Andric 123204eeddc0SDimitry Andric uint64_t Align = Query.MMODescrs[0].AlignInBits; 1233*bdd1243dSDimitry Andric return std::pair(0, LLT::scalar(Align)); 12348bcb0991SDimitry Andric }) 12358bcb0991SDimitry Andric .fewerElementsIf( 12368bcb0991SDimitry Andric [=](const LegalityQuery &Query) -> bool { 12375ffd83dbSDimitry Andric return Query.Types[0].isVector() && 12385ffd83dbSDimitry Andric needToSplitMemOp(Query, Op == G_LOAD); 12398bcb0991SDimitry Andric }, 12408bcb0991SDimitry Andric [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 12418bcb0991SDimitry Andric const LLT DstTy = Query.Types[0]; 12428bcb0991SDimitry Andric const LLT PtrTy = Query.Types[1]; 12438bcb0991SDimitry Andric 12448bcb0991SDimitry Andric LLT EltTy = DstTy.getElementType(); 12455ffd83dbSDimitry Andric unsigned MaxSize = maxSizeForAddrSpace(ST, 12465ffd83dbSDimitry Andric PtrTy.getAddressSpace(), 12475ffd83dbSDimitry Andric Op == G_LOAD); 12485ffd83dbSDimitry Andric 12495ffd83dbSDimitry Andric // FIXME: Handle widened to power of 2 results better. This ends 12505ffd83dbSDimitry Andric // up scalarizing. 12515ffd83dbSDimitry Andric // FIXME: 3 element stores scalarized on SI 12528bcb0991SDimitry Andric 12538bcb0991SDimitry Andric // Split if it's too large for the address space. 
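              // e.g. (illustrative numbers): with a 128-bit cap for this
              // address space, a v8s32 load is reduced to v4s32 pieces by the
              // first branch below (MaxSize % EltSize == 0); the later
              // branches handle element counts and sizes that do not divide
              // evenly.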
1254fe6060f1SDimitry Andric unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); 1255fe6060f1SDimitry Andric if (MemSize > MaxSize) { 12568bcb0991SDimitry Andric unsigned NumElts = DstTy.getNumElements(); 12575ffd83dbSDimitry Andric unsigned EltSize = EltTy.getSizeInBits(); 12585ffd83dbSDimitry Andric 12595ffd83dbSDimitry Andric if (MaxSize % EltSize == 0) { 1260*bdd1243dSDimitry Andric return std::pair( 1261fe6060f1SDimitry Andric 0, LLT::scalarOrVector( 1262fe6060f1SDimitry Andric ElementCount::getFixed(MaxSize / EltSize), EltTy)); 12635ffd83dbSDimitry Andric } 12645ffd83dbSDimitry Andric 1265fe6060f1SDimitry Andric unsigned NumPieces = MemSize / MaxSize; 12668bcb0991SDimitry Andric 12678bcb0991SDimitry Andric // FIXME: Refine when odd breakdowns handled 12688bcb0991SDimitry Andric // The scalars will need to be re-legalized. 12698bcb0991SDimitry Andric if (NumPieces == 1 || NumPieces >= NumElts || 12708bcb0991SDimitry Andric NumElts % NumPieces != 0) 1271*bdd1243dSDimitry Andric return std::pair(0, EltTy); 12728bcb0991SDimitry Andric 1273*bdd1243dSDimitry Andric return std::pair(0, 1274*bdd1243dSDimitry Andric LLT::fixed_vector(NumElts / NumPieces, EltTy)); 12758bcb0991SDimitry Andric } 12768bcb0991SDimitry Andric 12775ffd83dbSDimitry Andric // FIXME: We could probably handle weird extending loads better. 12785ffd83dbSDimitry Andric if (DstTy.getSizeInBits() > MemSize) 1279*bdd1243dSDimitry Andric return std::pair(0, EltTy); 12805ffd83dbSDimitry Andric 12815ffd83dbSDimitry Andric unsigned EltSize = EltTy.getSizeInBits(); 12825ffd83dbSDimitry Andric unsigned DstSize = DstTy.getSizeInBits(); 12835ffd83dbSDimitry Andric if (!isPowerOf2_32(DstSize)) { 12845ffd83dbSDimitry Andric // We're probably decomposing an odd sized store. Try to split 12855ffd83dbSDimitry Andric // to the widest type. TODO: Account for alignment. As-is it 12865ffd83dbSDimitry Andric // should be OK, since the new parts will be further legalized. 12875ffd83dbSDimitry Andric unsigned FloorSize = PowerOf2Floor(DstSize); 1288*bdd1243dSDimitry Andric return std::pair( 1289fe6060f1SDimitry Andric 0, LLT::scalarOrVector( 1290fe6060f1SDimitry Andric ElementCount::getFixed(FloorSize / EltSize), EltTy)); 12915ffd83dbSDimitry Andric } 12925ffd83dbSDimitry Andric 12938bcb0991SDimitry Andric // May need relegalization for the scalars. 1294*bdd1243dSDimitry Andric return std::pair(0, EltTy); 12958bcb0991SDimitry Andric }) 1296fe6060f1SDimitry Andric .minScalar(0, S32) 1297fe6060f1SDimitry Andric .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32)) 12988bcb0991SDimitry Andric .widenScalarToNextPow2(0) 1299e8d8bef9SDimitry Andric .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) 1300e8d8bef9SDimitry Andric .lower(); 13018bcb0991SDimitry Andric } 13020b57cec5SDimitry Andric 1303fe6060f1SDimitry Andric // FIXME: Unaligned accesses not lowered. 
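  // Usage sketch (illustrative; the MMO construction is abbreviated): a
  // sign-extending byte load from global memory matches the
  // {S32, GlobalPtr, S8, 8} row below:
  //   B.buildLoadInstr(TargetOpcode::G_SEXTLOAD, LLT::scalar(32), Ptr, *MMO);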
13040b57cec5SDimitry Andric auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 1305fe6060f1SDimitry Andric .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8}, 1306fe6060f1SDimitry Andric {S32, GlobalPtr, S16, 2 * 8}, 1307fe6060f1SDimitry Andric {S32, LocalPtr, S8, 8}, 1308fe6060f1SDimitry Andric {S32, LocalPtr, S16, 16}, 1309fe6060f1SDimitry Andric {S32, PrivatePtr, S8, 8}, 1310fe6060f1SDimitry Andric {S32, PrivatePtr, S16, 16}, 1311fe6060f1SDimitry Andric {S32, ConstantPtr, S8, 8}, 1312fe6060f1SDimitry Andric {S32, ConstantPtr, S16, 2 * 8}}) 1313fe6060f1SDimitry Andric .legalIf( 1314fe6060f1SDimitry Andric [=](const LegalityQuery &Query) -> bool { 1315fe6060f1SDimitry Andric return isLoadStoreLegal(ST, Query); 1316fe6060f1SDimitry Andric }); 1317fe6060f1SDimitry Andric 13180b57cec5SDimitry Andric if (ST.hasFlatAddressSpace()) { 13198bcb0991SDimitry Andric ExtLoads.legalForTypesWithMemDesc( 1320fe6060f1SDimitry Andric {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}}); 13210b57cec5SDimitry Andric } 13220b57cec5SDimitry Andric 1323fe6060f1SDimitry Andric // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 1324fe6060f1SDimitry Andric // 64-bits. 1325fe6060f1SDimitry Andric // 1326fe6060f1SDimitry Andric // TODO: Should generalize bitcast action into coerce, which will also cover 1327fe6060f1SDimitry Andric // inserting addrspacecasts. 1328fe6060f1SDimitry Andric ExtLoads.customIf(typeIs(1, Constant32Ptr)); 1329fe6060f1SDimitry Andric 13300b57cec5SDimitry Andric ExtLoads.clampScalar(0, S32, S32) 13310b57cec5SDimitry Andric .widenScalarToNextPow2(0) 13320b57cec5SDimitry Andric .lower(); 13330b57cec5SDimitry Andric 13340b57cec5SDimitry Andric auto &Atomics = getActionDefinitionsBuilder( 13350b57cec5SDimitry Andric {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 13360b57cec5SDimitry Andric G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 13370b57cec5SDimitry Andric G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1338480093f4SDimitry Andric G_ATOMICRMW_UMIN}) 13390b57cec5SDimitry Andric .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1340e8d8bef9SDimitry Andric {S64, GlobalPtr}, {S64, LocalPtr}, 1341e8d8bef9SDimitry Andric {S32, RegionPtr}, {S64, RegionPtr}}); 13420b57cec5SDimitry Andric if (ST.hasFlatAddressSpace()) { 13430b57cec5SDimitry Andric Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 13440b57cec5SDimitry Andric } 13450b57cec5SDimitry Andric 1346fe6060f1SDimitry Andric auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD); 1347349cc55cSDimitry Andric if (ST.hasLDSFPAtomicAdd()) { 1348fe6060f1SDimitry Andric Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); 1349fe6060f1SDimitry Andric if (ST.hasGFX90AInsts()) 1350fe6060f1SDimitry Andric Atomic.legalFor({{S64, LocalPtr}}); 135181ad6265SDimitry Andric if (ST.hasGFX940Insts()) 135281ad6265SDimitry Andric Atomic.legalFor({{V2S16, LocalPtr}}); 13535ffd83dbSDimitry Andric } 1354fe6060f1SDimitry Andric if (ST.hasAtomicFaddInsts()) 1355fe6060f1SDimitry Andric Atomic.legalFor({{S32, GlobalPtr}}); 1356*bdd1243dSDimitry Andric if (ST.hasFlatAtomicFaddF32Inst()) 1357*bdd1243dSDimitry Andric Atomic.legalFor({{S32, FlatPtr}}); 13588bcb0991SDimitry Andric 135904eeddc0SDimitry Andric if (ST.hasGFX90AInsts()) { 136004eeddc0SDimitry Andric // These are legal with some caveats, and should have undergone expansion in 136104eeddc0SDimitry Andric // the IR in most situations 136204eeddc0SDimitry Andric // TODO: Move atomic expansion into legalizer 136304eeddc0SDimitry Andric Atomic.legalFor({ 
136404eeddc0SDimitry Andric {S32, GlobalPtr}, 136504eeddc0SDimitry Andric {S64, GlobalPtr}, 136604eeddc0SDimitry Andric {S64, FlatPtr} 136704eeddc0SDimitry Andric }); 136804eeddc0SDimitry Andric } 136904eeddc0SDimitry Andric 1370480093f4SDimitry Andric // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1371480093f4SDimitry Andric // demarshalling 1372480093f4SDimitry Andric getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1373480093f4SDimitry Andric .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1374480093f4SDimitry Andric {S32, FlatPtr}, {S64, FlatPtr}}) 1375480093f4SDimitry Andric .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1376480093f4SDimitry Andric {S32, RegionPtr}, {S64, RegionPtr}}); 13770b57cec5SDimitry Andric // TODO: Pointer types, any 32-bit or 64-bit vector 1378480093f4SDimitry Andric 1379480093f4SDimitry Andric // Condition should be s32 for scalar, s1 for vector. 13800b57cec5SDimitry Andric getActionDefinitionsBuilder(G_SELECT) 1381fe6060f1SDimitry Andric .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr, 1382fe6060f1SDimitry Andric LocalPtr, FlatPtr, PrivatePtr, 1383fe6060f1SDimitry Andric LLT::fixed_vector(2, LocalPtr), 1384fe6060f1SDimitry Andric LLT::fixed_vector(2, PrivatePtr)}, 1385fe6060f1SDimitry Andric {S1, S32}) 13860b57cec5SDimitry Andric .clampScalar(0, S16, S64) 13875ffd83dbSDimitry Andric .scalarize(1) 13880b57cec5SDimitry Andric .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 13890b57cec5SDimitry Andric .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 13900b57cec5SDimitry Andric .clampMaxNumElements(0, S32, 2) 13910b57cec5SDimitry Andric .clampMaxNumElements(0, LocalPtr, 2) 13920b57cec5SDimitry Andric .clampMaxNumElements(0, PrivatePtr, 2) 13930b57cec5SDimitry Andric .scalarize(0) 13940b57cec5SDimitry Andric .widenScalarToNextPow2(0) 1395480093f4SDimitry Andric .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 13960b57cec5SDimitry Andric 13970b57cec5SDimitry Andric // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 13980b57cec5SDimitry Andric // be more flexible with the shift amount type. 13990b57cec5SDimitry Andric auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 14000b57cec5SDimitry Andric .legalFor({{S32, S32}, {S64, S32}}); 14010b57cec5SDimitry Andric if (ST.has16BitInsts()) { 14020b57cec5SDimitry Andric if (ST.hasVOP3PInsts()) { 14035ffd83dbSDimitry Andric Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 14040b57cec5SDimitry Andric .clampMaxNumElements(0, S16, 2); 14050b57cec5SDimitry Andric } else 14065ffd83dbSDimitry Andric Shifts.legalFor({{S16, S16}}); 14070b57cec5SDimitry Andric 14085ffd83dbSDimitry Andric // TODO: Support 16-bit shift amounts for all types 14095ffd83dbSDimitry Andric Shifts.widenScalarIf( 14105ffd83dbSDimitry Andric [=](const LegalityQuery &Query) { 14115ffd83dbSDimitry Andric // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 14125ffd83dbSDimitry Andric // 32-bit amount. 
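          // e.g. an s16 value shifted by an s8 amount becomes s16-by-s16
          // here, while an s32-by-s8 shift is left alone and the later
          // clampScalar(1, S32, S32) widens its amount to s32 (sketch of the
          // intended interplay).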
14135ffd83dbSDimitry Andric const LLT ValTy = Query.Types[0]; 14145ffd83dbSDimitry Andric const LLT AmountTy = Query.Types[1]; 14155ffd83dbSDimitry Andric return ValTy.getSizeInBits() <= 16 && 14165ffd83dbSDimitry Andric AmountTy.getSizeInBits() < 16; 14175ffd83dbSDimitry Andric }, changeTo(1, S16)); 14185ffd83dbSDimitry Andric Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1419480093f4SDimitry Andric Shifts.clampScalar(1, S32, S32); 14200b57cec5SDimitry Andric Shifts.widenScalarToNextPow2(0, 16); 142104eeddc0SDimitry Andric Shifts.clampScalar(0, S16, S64); 1422e8d8bef9SDimitry Andric 1423e8d8bef9SDimitry Andric getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1424e8d8bef9SDimitry Andric .minScalar(0, S16) 1425e8d8bef9SDimitry Andric .scalarize(0) 1426e8d8bef9SDimitry Andric .lower(); 14270b57cec5SDimitry Andric } else { 14280b57cec5SDimitry Andric // Make sure we legalize the shift amount type first, as the general 14290b57cec5SDimitry Andric // expansion for the shifted type will produce much worse code if it hasn't 14300b57cec5SDimitry Andric // been truncated already. 14310b57cec5SDimitry Andric Shifts.clampScalar(1, S32, S32); 14320b57cec5SDimitry Andric Shifts.widenScalarToNextPow2(0, 32); 143304eeddc0SDimitry Andric Shifts.clampScalar(0, S32, S64); 1434e8d8bef9SDimitry Andric 1435e8d8bef9SDimitry Andric getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1436e8d8bef9SDimitry Andric .minScalar(0, S32) 1437e8d8bef9SDimitry Andric .scalarize(0) 1438e8d8bef9SDimitry Andric .lower(); 14390b57cec5SDimitry Andric } 14400b57cec5SDimitry Andric Shifts.scalarize(0); 14410b57cec5SDimitry Andric 14420b57cec5SDimitry Andric for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 14430b57cec5SDimitry Andric unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 14440b57cec5SDimitry Andric unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 14450b57cec5SDimitry Andric unsigned IdxTypeIdx = 2; 14460b57cec5SDimitry Andric 14470b57cec5SDimitry Andric getActionDefinitionsBuilder(Op) 14480b57cec5SDimitry Andric .customIf([=](const LegalityQuery &Query) { 14490b57cec5SDimitry Andric const LLT EltTy = Query.Types[EltTypeIdx]; 14500b57cec5SDimitry Andric const LLT VecTy = Query.Types[VecTypeIdx]; 14510b57cec5SDimitry Andric const LLT IdxTy = Query.Types[IdxTypeIdx]; 1452e8d8bef9SDimitry Andric const unsigned EltSize = EltTy.getSizeInBits(); 1453e8d8bef9SDimitry Andric return (EltSize == 32 || EltSize == 64) && 14540b57cec5SDimitry Andric VecTy.getSizeInBits() % 32 == 0 && 14555ffd83dbSDimitry Andric VecTy.getSizeInBits() <= MaxRegisterSize && 14560b57cec5SDimitry Andric IdxTy.getSizeInBits() == 32; 14570b57cec5SDimitry Andric }) 1458e8d8bef9SDimitry Andric .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)), 1459e8d8bef9SDimitry Andric bitcastToVectorElement32(VecTypeIdx)) 1460e8d8bef9SDimitry Andric //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1)) 1461e8d8bef9SDimitry Andric .bitcastIf( 1462e8d8bef9SDimitry Andric all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)), 1463e8d8bef9SDimitry Andric [=](const LegalityQuery &Query) { 1464e8d8bef9SDimitry Andric // For > 64-bit element types, try to turn this into a 64-bit 1465e8d8bef9SDimitry Andric // element vector since we may be able to do better indexing 1466e8d8bef9SDimitry Andric // if this is scalar. If not, fall back to 32. 
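            // e.g. (sketch, assuming the size checks above hold): a v2s128
            // vector is re-viewed as v4s64 since 128 % 64 == 0, while a
            // v3s96 vector falls back to v9s32.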
1467e8d8bef9SDimitry Andric const LLT EltTy = Query.Types[EltTypeIdx]; 1468e8d8bef9SDimitry Andric const LLT VecTy = Query.Types[VecTypeIdx]; 1469e8d8bef9SDimitry Andric const unsigned DstEltSize = EltTy.getSizeInBits(); 1470e8d8bef9SDimitry Andric const unsigned VecSize = VecTy.getSizeInBits(); 1471e8d8bef9SDimitry Andric 1472e8d8bef9SDimitry Andric const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32; 1473*bdd1243dSDimitry Andric return std::pair( 1474fe6060f1SDimitry Andric VecTypeIdx, 1475fe6060f1SDimitry Andric LLT::fixed_vector(VecSize / TargetEltSize, TargetEltSize)); 1476e8d8bef9SDimitry Andric }) 14770b57cec5SDimitry Andric .clampScalar(EltTypeIdx, S32, S64) 14780b57cec5SDimitry Andric .clampScalar(VecTypeIdx, S32, S64) 1479e8d8bef9SDimitry Andric .clampScalar(IdxTypeIdx, S32, S32) 1480e8d8bef9SDimitry Andric .clampMaxNumElements(VecTypeIdx, S32, 32) 1481e8d8bef9SDimitry Andric // TODO: Clamp elements for 64-bit vectors? 1482e8d8bef9SDimitry Andric // It should only be necessary with variable indexes. 1483e8d8bef9SDimitry Andric // As a last resort, lower to the stack 1484e8d8bef9SDimitry Andric .lower(); 14850b57cec5SDimitry Andric } 14860b57cec5SDimitry Andric 14870b57cec5SDimitry Andric getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 14880b57cec5SDimitry Andric .unsupportedIf([=](const LegalityQuery &Query) { 14890b57cec5SDimitry Andric const LLT &EltTy = Query.Types[1].getElementType(); 14900b57cec5SDimitry Andric return Query.Types[0] != EltTy; 14910b57cec5SDimitry Andric }); 14920b57cec5SDimitry Andric 14930b57cec5SDimitry Andric for (unsigned Op : {G_EXTRACT, G_INSERT}) { 14940b57cec5SDimitry Andric unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 14950b57cec5SDimitry Andric unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 14960b57cec5SDimitry Andric 14970b57cec5SDimitry Andric // FIXME: Doesn't handle extract of illegal sizes. 14980b57cec5SDimitry Andric getActionDefinitionsBuilder(Op) 14998bcb0991SDimitry Andric .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 15000eae32dcSDimitry Andric .lowerIf([=](const LegalityQuery &Query) { 15010eae32dcSDimitry Andric // Sub-vector(or single element) insert and extract. 15020eae32dcSDimitry Andric // TODO: verify immediate offset here since lower only works with 15030eae32dcSDimitry Andric // whole elements. 15040eae32dcSDimitry Andric const LLT BigTy = Query.Types[BigTyIdx]; 15050eae32dcSDimitry Andric return BigTy.isVector(); 15060eae32dcSDimitry Andric }) 15078bcb0991SDimitry Andric // FIXME: Multiples of 16 should not be legal. 
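      // Example reading of the rule below (sketch): extracting an s16 from an
      // s64 at a 16-bit offset is accepted, since 64 % 32 == 0 and
      // 16 % 16 == 0; that is exactly the case the FIXME above wants to
      // tighten to 32-bit multiples.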
15080b57cec5SDimitry Andric .legalIf([=](const LegalityQuery &Query) { 15090b57cec5SDimitry Andric const LLT BigTy = Query.Types[BigTyIdx]; 15100b57cec5SDimitry Andric const LLT LitTy = Query.Types[LitTyIdx]; 15110b57cec5SDimitry Andric return (BigTy.getSizeInBits() % 32 == 0) && 15120b57cec5SDimitry Andric (LitTy.getSizeInBits() % 16 == 0); 15130b57cec5SDimitry Andric }) 15140b57cec5SDimitry Andric .widenScalarIf( 15150b57cec5SDimitry Andric [=](const LegalityQuery &Query) { 15160b57cec5SDimitry Andric const LLT BigTy = Query.Types[BigTyIdx]; 15170b57cec5SDimitry Andric return (BigTy.getScalarSizeInBits() < 16); 15180b57cec5SDimitry Andric }, 15190b57cec5SDimitry Andric LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) 15200b57cec5SDimitry Andric .widenScalarIf( 15210b57cec5SDimitry Andric [=](const LegalityQuery &Query) { 15220b57cec5SDimitry Andric const LLT LitTy = Query.Types[LitTyIdx]; 15230b57cec5SDimitry Andric return (LitTy.getScalarSizeInBits() < 16); 15240b57cec5SDimitry Andric }, 15250b57cec5SDimitry Andric LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) 15260b57cec5SDimitry Andric .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 15270b57cec5SDimitry Andric .widenScalarToNextPow2(BigTyIdx, 32); 15280b57cec5SDimitry Andric 15290b57cec5SDimitry Andric } 15300b57cec5SDimitry Andric 15318bcb0991SDimitry Andric auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) 15320b57cec5SDimitry Andric .legalForCartesianProduct(AllS32Vectors, {S32}) 15330b57cec5SDimitry Andric .legalForCartesianProduct(AllS64Vectors, {S64}) 15348bcb0991SDimitry Andric .clampNumElements(0, V16S32, V32S32) 15358bcb0991SDimitry Andric .clampNumElements(0, V2S64, V16S64) 15368bcb0991SDimitry Andric .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); 15378bcb0991SDimitry Andric 15388bcb0991SDimitry Andric if (ST.hasScalarPackInsts()) { 15395ffd83dbSDimitry Andric BuildVector 15405ffd83dbSDimitry Andric // FIXME: Should probably widen s1 vectors straight to s32 15415ffd83dbSDimitry Andric .minScalarOrElt(0, S16) 1542*bdd1243dSDimitry Andric .minScalar(1, S16); 15435ffd83dbSDimitry Andric 15448bcb0991SDimitry Andric getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 15458bcb0991SDimitry Andric .legalFor({V2S16, S32}) 15468bcb0991SDimitry Andric .lower(); 15478bcb0991SDimitry Andric } else { 15485ffd83dbSDimitry Andric BuildVector.customFor({V2S16, S16}); 15495ffd83dbSDimitry Andric BuildVector.minScalarOrElt(0, S32); 15505ffd83dbSDimitry Andric 15518bcb0991SDimitry Andric getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 15525ffd83dbSDimitry Andric .customFor({V2S16, S32}) 15538bcb0991SDimitry Andric .lower(); 15548bcb0991SDimitry Andric } 15558bcb0991SDimitry Andric 15565ffd83dbSDimitry Andric BuildVector.legalIf(isRegisterType(0)); 15575ffd83dbSDimitry Andric 15585ffd83dbSDimitry Andric // FIXME: Clamp maximum size 15590b57cec5SDimitry Andric getActionDefinitionsBuilder(G_CONCAT_VECTORS) 1560e8d8bef9SDimitry Andric .legalIf(all(isRegisterType(0), isRegisterType(1))) 1561e8d8bef9SDimitry Andric .clampMaxNumElements(0, S32, 32) 1562e8d8bef9SDimitry Andric .clampMaxNumElements(1, S16, 2) // TODO: Make 4? 
      .clampMaxNumElements(0, S16, 64);

  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  // Merge/Unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
        const LLT &EltTy = Ty.getElementType();
        if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
          return true;
        if (!isPowerOf2_32(EltTy.getSizeInBits()))
          return true;
      }
      return false;
    };

    auto &Builder = getActionDefinitionsBuilder(Op)
      .legalIf(all(isRegisterType(0), isRegisterType(1)))
      .lowerFor({{S16, V2S16}})
      .lowerIf([=](const LegalityQuery &Query) {
        const LLT BigTy = Query.Types[BigTyIdx];
        return BigTy.getSizeInBits() == 32;
      })
      // Try to widen to s16 first for small types.
      // TODO: Only do this on targets with legal s16 shifts
      .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
      .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
      .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
                           elementTypeIs(1, S16)),
                       changeTo(1, V2S16))
      // Clamp the little scalar to s32-s512 and make it a power of 2. It's
      // not worth considering the multiples of 64 since 2*192 and 2*384 are
      // not valid.
      .clampScalar(LitTyIdx, S32, S512)
      .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
      // Break up vectors with weird elements into scalars
      .fewerElementsIf(
          [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
          scalarize(0))
      .fewerElementsIf(
          [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
          scalarize(1))
      .clampScalar(BigTyIdx, S32, MaxScalar);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
          // TODO: Use 16-bit shifts if legal for 8-bit values?
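          // For example (hedged sketch): merging two s8 pieces into an s16
          // widens each piece to s32 here, assembles the result with 32-bit
          // shift/or, and truncates back at the end.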
          [=](const LegalityQuery &Query) {
            const LLT Ty = Query.Types[LitTyIdx];
            return Ty.getSizeInBits() < 32;
          },
          changeTo(LitTyIdx, S32));
    }

    Builder.widenScalarIf(
        [=](const LegalityQuery &Query) {
          const LLT Ty = Query.Types[BigTyIdx];
          return !isPowerOf2_32(Ty.getSizeInBits()) &&
                 Ty.getSizeInBits() % 16 != 0;
        },
        [=](const LegalityQuery &Query) {
          // Pick the next power of 2, or a multiple of 64 over 128.
          // Whichever is smaller.
          const LLT &Ty = Query.Types[BigTyIdx];
          unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
          if (NewSizeInBits >= 256) {
            unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
            if (RoundedTo < NewSizeInBits)
              NewSizeInBits = RoundedTo;
          }
          return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
        })
        // Any vectors left are the wrong size. Scalarize them.
        .scalarize(0)
        .scalarize(1);
  }

  // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
  // RegBankSelect.
  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
    .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
      // Prefer to reduce vector widths for 16-bit vectors before lowering, to
      // get more vector shift opportunities, since we'll get those when
      // expanded.
      .clampMaxNumElementsStrict(0, S16, 2);
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
  } else {
    // Prefer to promote to s32 before lowering if we don't have 16-bit
    // shifts. This avoids a lot of intermediate truncate and extend
    // operations.
    SextInReg.lowerFor({{S32}, {S64}});
  }

  SextInReg
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .lower();

  getActionDefinitionsBuilder({G_ROTR, G_ROTL})
    .scalarize(0)
    .lower();

  // TODO: Only try to form v2s16 with legal packed instructions.
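  // Orientation (hedged): a scalar funnel shift concatenates its two inputs
  // and shifts right, so the {S32, S32} row below corresponds to what
  // v_alignbit_b32 provides, e.g.
  //   B.buildInstr(TargetOpcode::G_FSHR, {Dst32}, {Hi, Lo, Amt});
  // where Dst32, Hi, Lo, and Amt are hypothetical s32 registers.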
16755ffd83dbSDimitry Andric getActionDefinitionsBuilder(G_FSHR) 16765ffd83dbSDimitry Andric .legalFor({{S32, S32}}) 1677fe6060f1SDimitry Andric .lowerFor({{V2S16, V2S16}}) 16780eae32dcSDimitry Andric .clampMaxNumElementsStrict(0, S16, 2) 16795ffd83dbSDimitry Andric .scalarize(0) 16805ffd83dbSDimitry Andric .lower(); 1681480093f4SDimitry Andric 1682fe6060f1SDimitry Andric if (ST.hasVOP3PInsts()) { 1683fe6060f1SDimitry Andric getActionDefinitionsBuilder(G_FSHL) 1684fe6060f1SDimitry Andric .lowerFor({{V2S16, V2S16}}) 16850eae32dcSDimitry Andric .clampMaxNumElementsStrict(0, S16, 2) 1686fe6060f1SDimitry Andric .scalarize(0) 1687fe6060f1SDimitry Andric .lower(); 1688fe6060f1SDimitry Andric } else { 1689fe6060f1SDimitry Andric getActionDefinitionsBuilder(G_FSHL) 1690fe6060f1SDimitry Andric .scalarize(0) 1691fe6060f1SDimitry Andric .lower(); 1692fe6060f1SDimitry Andric } 1693fe6060f1SDimitry Andric 1694480093f4SDimitry Andric getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1695480093f4SDimitry Andric .legalFor({S64}); 1696480093f4SDimitry Andric 1697e8d8bef9SDimitry Andric getActionDefinitionsBuilder(G_FENCE) 1698e8d8bef9SDimitry Andric .alwaysLegal(); 1699e8d8bef9SDimitry Andric 1700fe6060f1SDimitry Andric getActionDefinitionsBuilder({G_SMULO, G_UMULO}) 1701fe6060f1SDimitry Andric .scalarize(0) 1702fe6060f1SDimitry Andric .minScalar(0, S32) 1703fe6060f1SDimitry Andric .lower(); 1704fe6060f1SDimitry Andric 1705fe6060f1SDimitry Andric getActionDefinitionsBuilder({G_SBFX, G_UBFX}) 1706fe6060f1SDimitry Andric .legalFor({{S32, S32}, {S64, S32}}) 1707fe6060f1SDimitry Andric .clampScalar(1, S32, S32) 1708fe6060f1SDimitry Andric .clampScalar(0, S32, S64) 1709fe6060f1SDimitry Andric .widenScalarToNextPow2(0) 1710fe6060f1SDimitry Andric .scalarize(0); 1711fe6060f1SDimitry Andric 17125ffd83dbSDimitry Andric getActionDefinitionsBuilder({ 17135ffd83dbSDimitry Andric // TODO: Verify V_BFI_B32 is generated from expanded bit ops 17145ffd83dbSDimitry Andric G_FCOPYSIGN, 17155ffd83dbSDimitry Andric 17165ffd83dbSDimitry Andric G_ATOMIC_CMPXCHG_WITH_SUCCESS, 1717e8d8bef9SDimitry Andric G_ATOMICRMW_NAND, 1718e8d8bef9SDimitry Andric G_ATOMICRMW_FSUB, 17195ffd83dbSDimitry Andric G_READ_REGISTER, 17205ffd83dbSDimitry Andric G_WRITE_REGISTER, 17215ffd83dbSDimitry Andric 17225ffd83dbSDimitry Andric G_SADDO, G_SSUBO, 17235ffd83dbSDimitry Andric 17245ffd83dbSDimitry Andric // TODO: Implement 1725fe6060f1SDimitry Andric G_FMINIMUM, G_FMAXIMUM}).lower(); 17265ffd83dbSDimitry Andric 1727349cc55cSDimitry Andric getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET}) 1728349cc55cSDimitry Andric .lower(); 1729349cc55cSDimitry Andric 1730480093f4SDimitry Andric getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 17315ffd83dbSDimitry Andric G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1732480093f4SDimitry Andric G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1733480093f4SDimitry Andric .unsupported(); 1734480093f4SDimitry Andric 1735fe6060f1SDimitry Andric getLegacyLegalizerInfo().computeTables(); 17360b57cec5SDimitry Andric verify(*ST.getInstrInfo()); 17370b57cec5SDimitry Andric } 17380b57cec5SDimitry Andric 17395ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, 17405ffd83dbSDimitry Andric MachineInstr &MI) const { 17415ffd83dbSDimitry Andric MachineIRBuilder &B = Helper.MIRBuilder; 17425ffd83dbSDimitry Andric MachineRegisterInfo &MRI = *B.getMRI(); 17435ffd83dbSDimitry Andric 17440b57cec5SDimitry Andric switch (MI.getOpcode()) { 17450b57cec5SDimitry Andric case 
TargetOpcode::G_ADDRSPACE_CAST: 17468bcb0991SDimitry Andric return legalizeAddrSpaceCast(MI, MRI, B); 17470b57cec5SDimitry Andric case TargetOpcode::G_FRINT: 17488bcb0991SDimitry Andric return legalizeFrint(MI, MRI, B); 17490b57cec5SDimitry Andric case TargetOpcode::G_FCEIL: 17508bcb0991SDimitry Andric return legalizeFceil(MI, MRI, B); 1751e8d8bef9SDimitry Andric case TargetOpcode::G_FREM: 1752e8d8bef9SDimitry Andric return legalizeFrem(MI, MRI, B); 17530b57cec5SDimitry Andric case TargetOpcode::G_INTRINSIC_TRUNC: 17548bcb0991SDimitry Andric return legalizeIntrinsicTrunc(MI, MRI, B); 17550b57cec5SDimitry Andric case TargetOpcode::G_SITOFP: 17568bcb0991SDimitry Andric return legalizeITOFP(MI, MRI, B, true); 17570b57cec5SDimitry Andric case TargetOpcode::G_UITOFP: 17588bcb0991SDimitry Andric return legalizeITOFP(MI, MRI, B, false); 17595ffd83dbSDimitry Andric case TargetOpcode::G_FPTOSI: 17605ffd83dbSDimitry Andric return legalizeFPTOI(MI, MRI, B, true); 17615ffd83dbSDimitry Andric case TargetOpcode::G_FPTOUI: 17625ffd83dbSDimitry Andric return legalizeFPTOI(MI, MRI, B, false); 17630b57cec5SDimitry Andric case TargetOpcode::G_FMINNUM: 17640b57cec5SDimitry Andric case TargetOpcode::G_FMAXNUM: 17650b57cec5SDimitry Andric case TargetOpcode::G_FMINNUM_IEEE: 17660b57cec5SDimitry Andric case TargetOpcode::G_FMAXNUM_IEEE: 17675ffd83dbSDimitry Andric return legalizeMinNumMaxNum(Helper, MI); 17680b57cec5SDimitry Andric case TargetOpcode::G_EXTRACT_VECTOR_ELT: 17698bcb0991SDimitry Andric return legalizeExtractVectorElt(MI, MRI, B); 17700b57cec5SDimitry Andric case TargetOpcode::G_INSERT_VECTOR_ELT: 17718bcb0991SDimitry Andric return legalizeInsertVectorElt(MI, MRI, B); 17728bcb0991SDimitry Andric case TargetOpcode::G_FSIN: 17738bcb0991SDimitry Andric case TargetOpcode::G_FCOS: 17748bcb0991SDimitry Andric return legalizeSinCos(MI, MRI, B); 17758bcb0991SDimitry Andric case TargetOpcode::G_GLOBAL_VALUE: 17768bcb0991SDimitry Andric return legalizeGlobalValue(MI, MRI, B); 17778bcb0991SDimitry Andric case TargetOpcode::G_LOAD: 1778fe6060f1SDimitry Andric case TargetOpcode::G_SEXTLOAD: 1779fe6060f1SDimitry Andric case TargetOpcode::G_ZEXTLOAD: 1780e8d8bef9SDimitry Andric return legalizeLoad(Helper, MI); 17818bcb0991SDimitry Andric case TargetOpcode::G_FMAD: 17828bcb0991SDimitry Andric return legalizeFMad(MI, MRI, B); 17838bcb0991SDimitry Andric case TargetOpcode::G_FDIV: 17848bcb0991SDimitry Andric return legalizeFDIV(MI, MRI, B); 17855ffd83dbSDimitry Andric case TargetOpcode::G_UDIV: 17865ffd83dbSDimitry Andric case TargetOpcode::G_UREM: 1787fe6060f1SDimitry Andric case TargetOpcode::G_UDIVREM: 1788fe6060f1SDimitry Andric return legalizeUnsignedDIV_REM(MI, MRI, B); 17895ffd83dbSDimitry Andric case TargetOpcode::G_SDIV: 17905ffd83dbSDimitry Andric case TargetOpcode::G_SREM: 1791fe6060f1SDimitry Andric case TargetOpcode::G_SDIVREM: 1792fe6060f1SDimitry Andric return legalizeSignedDIV_REM(MI, MRI, B); 1793480093f4SDimitry Andric case TargetOpcode::G_ATOMIC_CMPXCHG: 1794480093f4SDimitry Andric return legalizeAtomicCmpXChg(MI, MRI, B); 17955ffd83dbSDimitry Andric case TargetOpcode::G_FLOG: 17965ffd83dbSDimitry Andric return legalizeFlog(MI, B, numbers::ln2f); 17975ffd83dbSDimitry Andric case TargetOpcode::G_FLOG10: 17985ffd83dbSDimitry Andric return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 17995ffd83dbSDimitry Andric case TargetOpcode::G_FEXP: 18005ffd83dbSDimitry Andric return legalizeFExp(MI, B); 18015ffd83dbSDimitry Andric case TargetOpcode::G_FPOW: 18025ffd83dbSDimitry Andric return 
    legalizeFPow(MI, B);
  case TargetOpcode::G_FFLOOR:
    return legalizeFFloor(MI, MRI, B);
  case TargetOpcode::G_BUILD_VECTOR:
  case TargetOpcode::G_BUILD_VECTOR_TRUNC:
    return legalizeBuildVector(MI, MRI, B);
  case TargetOpcode::G_MUL:
    return legalizeMul(Helper, MI);
  case TargetOpcode::G_CTLZ:
  case TargetOpcode::G_CTTZ:
    return legalizeCTLZ_CTTZ(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
    return legalizeFPTruncRound(MI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);

  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // Note: this register is somewhat broken. When used as a 32-bit operand,
    // it only returns zeroes. The real value is in the upper 32 bits.
    // Thus, we must extract the high 32 bits.
    const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
                                       ? AMDGPU::SRC_SHARED_BASE
                                       : AMDGPU::SRC_PRIVATE_BASE;
    // FIXME: It would be more natural to emit a COPY here, but then copy
    // coalescing would kick in and it would think it's okay to use the "HI"
    // subregister (instead of extracting the HI 32 bits) which is an
    // artificial (unusable) register.
    // Register TableGen definitions would need an overhaul to get rid of the
    // artificial "HI" aperture registers and prevent this kind of issue from
    // happening.
    Register Dst = MRI.createGenericVirtualRegister(S64);
    MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
    B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
    return B.buildUnmerge(S32, Dst).getReg(1);
  }

  // TODO: can we be smarter about machine pointer info?
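  // Sketch of the two fallback paths that follow: without aperture registers,
  // the 32-bit aperture base is loaded from constant memory, either off the
  // implicit kernarg segment (code object v5) or from the amd_queue_t at
  // offset 0x40 (shared) / 0x44 (private) reached through the queue pointer.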
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  Register LoadAddr = MRI.createGenericVirtualRegister(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
  // For code object version 5, private_base and shared_base are passed through
  // implicit kernargs.
  if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) {
    AMDGPUTargetLowering::ImplicitParameter Param =
        AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
                                      : AMDGPUTargetLowering::PRIVATE_BASE;
    uint64_t Offset =
        ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);

    Register KernargPtrReg = MRI.createGenericVirtualRegister(
        LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

    if (!loadInputValue(KernargPtrReg, B,
                        AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
      return Register();

    MachineMemOperand *MMO = MF.getMachineMemOperand(
        PtrInfo,
        MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
            MachineMemOperand::MOInvariant,
        LLT::scalar(32), commonAlignment(Align(64), Offset));

    // Pointer address
    B.buildPtrAdd(LoadAddr, KernargPtrReg,
                  B.buildConstant(LLT::scalar(64), Offset).getReg(0));
    // Load address
    return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo,
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      LLT::scalar(32), commonAlignment(Align(64), StructOffset));

  B.buildPtrAdd(LoadAddr, QueuePtr,
                B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}

/// Return true if the value is a known valid address, such that a null check
/// is not necessary.
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
                           const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
  MachineInstr *Def = MRI.getVRegDef(Val);
  switch (Def->getOpcode()) {
  case AMDGPU::G_FRAME_INDEX:
  case AMDGPU::G_GLOBAL_VALUE:
  case AMDGPU::G_BLOCK_ADDR:
    return true;
  case AMDGPU::G_CONSTANT: {
    const ConstantInt *CI = Def->getOperand(1).getCImm();
    return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
  }
  default:
    return false;
  }
}

bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
      (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
       DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
    if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
      // Extract low 32-bits of the pointer.
      B.buildExtract(Dst, Src, 0);
      MI.eraseFromParent();
      return true;
    }

    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    // Extract low 32-bits of the pointer.
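    // The extracted half is then selected against the segment null constant,
    // so a null flat pointer maps onto the segment's null value (sketch of
    // the intent): dst = (src != flat-null) ? lo32(src) : segment-null.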
19705ffd83dbSDimitry Andric auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 19710b57cec5SDimitry Andric 19725ffd83dbSDimitry Andric auto CmpRes = 19735ffd83dbSDimitry Andric B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 19748bcb0991SDimitry Andric B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 19750b57cec5SDimitry Andric 19760b57cec5SDimitry Andric MI.eraseFromParent(); 19770b57cec5SDimitry Andric return true; 19780b57cec5SDimitry Andric } 19790b57cec5SDimitry Andric 198081ad6265SDimitry Andric if (DestAS == AMDGPUAS::FLAT_ADDRESS && 198181ad6265SDimitry Andric (SrcAS == AMDGPUAS::LOCAL_ADDRESS || 198281ad6265SDimitry Andric SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) { 19838bcb0991SDimitry Andric Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 19848bcb0991SDimitry Andric if (!ApertureReg.isValid()) 19858bcb0991SDimitry Andric return false; 19860b57cec5SDimitry Andric 19870b57cec5SDimitry Andric // Coerce the type of the low half of the result so we can use merge_values. 19885ffd83dbSDimitry Andric Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 19890b57cec5SDimitry Andric 19900b57cec5SDimitry Andric // TODO: Should we allow mismatched types but matching sizes in merges to 19910b57cec5SDimitry Andric // avoid the ptrtoint? 1992*bdd1243dSDimitry Andric auto BuildPtr = B.buildMergeLikeInstr(DstTy, {SrcAsInt, ApertureReg}); 199304eeddc0SDimitry Andric 199404eeddc0SDimitry Andric if (isKnownNonNull(Src, MRI, TM, SrcAS)) { 199504eeddc0SDimitry Andric B.buildCopy(Dst, BuildPtr); 199604eeddc0SDimitry Andric MI.eraseFromParent(); 199704eeddc0SDimitry Andric return true; 199804eeddc0SDimitry Andric } 199904eeddc0SDimitry Andric 200004eeddc0SDimitry Andric auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 200104eeddc0SDimitry Andric auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 200204eeddc0SDimitry Andric 200381ad6265SDimitry Andric auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, 200481ad6265SDimitry Andric SegmentNull.getReg(0)); 200504eeddc0SDimitry Andric 20065ffd83dbSDimitry Andric B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 20070b57cec5SDimitry Andric 20080b57cec5SDimitry Andric MI.eraseFromParent(); 20090b57cec5SDimitry Andric return true; 20100b57cec5SDimitry Andric } 20110b57cec5SDimitry Andric 201281ad6265SDimitry Andric if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && 201381ad6265SDimitry Andric SrcTy.getSizeInBits() == 64) { 201481ad6265SDimitry Andric // Truncate. 
201581ad6265SDimitry Andric B.buildExtract(Dst, Src, 0); 201681ad6265SDimitry Andric MI.eraseFromParent(); 201781ad6265SDimitry Andric return true; 201881ad6265SDimitry Andric } 201981ad6265SDimitry Andric 202081ad6265SDimitry Andric if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT && 202181ad6265SDimitry Andric DstTy.getSizeInBits() == 64) { 202281ad6265SDimitry Andric const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 202381ad6265SDimitry Andric uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 2024*bdd1243dSDimitry Andric auto PtrLo = B.buildPtrToInt(S32, Src); 2025*bdd1243dSDimitry Andric auto HighAddr = B.buildConstant(S32, AddrHiVal); 2026*bdd1243dSDimitry Andric B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr}); 202781ad6265SDimitry Andric MI.eraseFromParent(); 202881ad6265SDimitry Andric return true; 202981ad6265SDimitry Andric } 203081ad6265SDimitry Andric 203181ad6265SDimitry Andric DiagnosticInfoUnsupported InvalidAddrSpaceCast( 203281ad6265SDimitry Andric MF.getFunction(), "invalid addrspacecast", B.getDebugLoc()); 203381ad6265SDimitry Andric 203481ad6265SDimitry Andric LLVMContext &Ctx = MF.getFunction().getContext(); 203581ad6265SDimitry Andric Ctx.diagnose(InvalidAddrSpaceCast); 203681ad6265SDimitry Andric B.buildUndef(Dst); 203781ad6265SDimitry Andric MI.eraseFromParent(); 203881ad6265SDimitry Andric return true; 203981ad6265SDimitry Andric } 204081ad6265SDimitry Andric 20410b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeFrint( 20420b57cec5SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 20438bcb0991SDimitry Andric MachineIRBuilder &B) const { 20440b57cec5SDimitry Andric Register Src = MI.getOperand(1).getReg(); 20450b57cec5SDimitry Andric LLT Ty = MRI.getType(Src); 20460b57cec5SDimitry Andric assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 20470b57cec5SDimitry Andric 20480b57cec5SDimitry Andric APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 20490b57cec5SDimitry Andric APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 20500b57cec5SDimitry Andric 20518bcb0991SDimitry Andric auto C1 = B.buildFConstant(Ty, C1Val); 20528bcb0991SDimitry Andric auto CopySign = B.buildFCopysign(Ty, C1, Src); 20530b57cec5SDimitry Andric 20540b57cec5SDimitry Andric // TODO: Should this propagate fast-math-flags? 
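// The constants implement the usual round-to-nearest-even trick: for any
// |x| <= C2 (just below 2^52), the spacing between adjacent doubles around
// x + 2^52 is exactly 1.0, so adding and then subtracting copysign(2^52, x)
// leaves behind x rounded to an integer. Worked example (illustrative):
// 3.7 + 2^52 rounds to 2^52 + 4, and subtracting gives 4.0; 2.5 ties to even
// and yields 2.0. Inputs with |x| > C2 are already integral, so the select
// below returns them unchanged.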
20558bcb0991SDimitry Andric auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 20568bcb0991SDimitry Andric auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 20570b57cec5SDimitry Andric 20588bcb0991SDimitry Andric auto C2 = B.buildFConstant(Ty, C2Val); 20598bcb0991SDimitry Andric auto Fabs = B.buildFAbs(Ty, Src); 20600b57cec5SDimitry Andric 20618bcb0991SDimitry Andric auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 20628bcb0991SDimitry Andric B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 2063e8d8bef9SDimitry Andric MI.eraseFromParent(); 20640b57cec5SDimitry Andric return true; 20650b57cec5SDimitry Andric } 20660b57cec5SDimitry Andric 20670b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeFceil( 20680b57cec5SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 20690b57cec5SDimitry Andric MachineIRBuilder &B) const { 20700b57cec5SDimitry Andric 20710b57cec5SDimitry Andric const LLT S1 = LLT::scalar(1); 20720b57cec5SDimitry Andric const LLT S64 = LLT::scalar(64); 20730b57cec5SDimitry Andric 20740b57cec5SDimitry Andric Register Src = MI.getOperand(1).getReg(); 20750b57cec5SDimitry Andric assert(MRI.getType(Src) == S64); 20760b57cec5SDimitry Andric 20770b57cec5SDimitry Andric // result = trunc(src) 20780b57cec5SDimitry Andric // if (src > 0.0 && src != result) 20790b57cec5SDimitry Andric // result += 1.0 20800b57cec5SDimitry Andric 20815ffd83dbSDimitry Andric auto Trunc = B.buildIntrinsicTrunc(S64, Src); 20820b57cec5SDimitry Andric 20830b57cec5SDimitry Andric const auto Zero = B.buildFConstant(S64, 0.0); 20840b57cec5SDimitry Andric const auto One = B.buildFConstant(S64, 1.0); 20850b57cec5SDimitry Andric auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 20860b57cec5SDimitry Andric auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 20870b57cec5SDimitry Andric auto And = B.buildAnd(S1, Lt0, NeTrunc); 20880b57cec5SDimitry Andric auto Add = B.buildSelect(S64, And, One, Zero); 20890b57cec5SDimitry Andric 20900b57cec5SDimitry Andric // TODO: Should this propagate fast-math-flags? 
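// Worked example (illustrative): src = 1.2 gives trunc = 1.0; src is
// positive and differs from trunc, so 1.0 is added, producing
// ceil(1.2) = 2.0. For src = -1.2 the compare against zero fails, nothing is
// added, and trunc = -1.0 is already the ceiling. The select-based increment
// avoids a branch.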
20910b57cec5SDimitry Andric B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 209204eeddc0SDimitry Andric MI.eraseFromParent(); 20930b57cec5SDimitry Andric return true; 20940b57cec5SDimitry Andric } 20950b57cec5SDimitry Andric 2096e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeFrem( 2097e8d8bef9SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 2098e8d8bef9SDimitry Andric MachineIRBuilder &B) const { 2099e8d8bef9SDimitry Andric Register DstReg = MI.getOperand(0).getReg(); 2100e8d8bef9SDimitry Andric Register Src0Reg = MI.getOperand(1).getReg(); 2101e8d8bef9SDimitry Andric Register Src1Reg = MI.getOperand(2).getReg(); 2102e8d8bef9SDimitry Andric auto Flags = MI.getFlags(); 2103e8d8bef9SDimitry Andric LLT Ty = MRI.getType(DstReg); 2104e8d8bef9SDimitry Andric 2105e8d8bef9SDimitry Andric auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 2106e8d8bef9SDimitry Andric auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 2107e8d8bef9SDimitry Andric auto Neg = B.buildFNeg(Ty, Trunc, Flags); 2108e8d8bef9SDimitry Andric B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 2109e8d8bef9SDimitry Andric MI.eraseFromParent(); 2110e8d8bef9SDimitry Andric return true; 2111e8d8bef9SDimitry Andric } 2112e8d8bef9SDimitry Andric 2113e8d8bef9SDimitry Andric static MachineInstrBuilder extractF64Exponent(Register Hi, 21140b57cec5SDimitry Andric MachineIRBuilder &B) { 21150b57cec5SDimitry Andric const unsigned FractBits = 52; 21160b57cec5SDimitry Andric const unsigned ExpBits = 11; 21170b57cec5SDimitry Andric LLT S32 = LLT::scalar(32); 21180b57cec5SDimitry Andric 21190b57cec5SDimitry Andric auto Const0 = B.buildConstant(S32, FractBits - 32); 21200b57cec5SDimitry Andric auto Const1 = B.buildConstant(S32, ExpBits); 21210b57cec5SDimitry Andric 21220b57cec5SDimitry Andric auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 2123e8d8bef9SDimitry Andric .addUse(Hi) 21240b57cec5SDimitry Andric .addUse(Const0.getReg(0)) 21250b57cec5SDimitry Andric .addUse(Const1.getReg(0)); 21260b57cec5SDimitry Andric 21270b57cec5SDimitry Andric return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 21280b57cec5SDimitry Andric } 21290b57cec5SDimitry Andric 21300b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 21310b57cec5SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 21320b57cec5SDimitry Andric MachineIRBuilder &B) const { 21330b57cec5SDimitry Andric const LLT S1 = LLT::scalar(1); 21340b57cec5SDimitry Andric const LLT S32 = LLT::scalar(32); 21350b57cec5SDimitry Andric const LLT S64 = LLT::scalar(64); 21360b57cec5SDimitry Andric 21370b57cec5SDimitry Andric Register Src = MI.getOperand(1).getReg(); 21380b57cec5SDimitry Andric assert(MRI.getType(Src) == S64); 21390b57cec5SDimitry Andric 21400b57cec5SDimitry Andric // TODO: Should this use extract since the low half is unused? 21410b57cec5SDimitry Andric auto Unmerge = B.buildUnmerge({S32, S32}, Src); 21420b57cec5SDimitry Andric Register Hi = Unmerge.getReg(1); 21430b57cec5SDimitry Andric 21440b57cec5SDimitry Andric // Extract the upper half, since this is where we will find the sign and 21450b57cec5SDimitry Andric // exponent. 21460b57cec5SDimitry Andric auto Exp = extractF64Exponent(Hi, B); 21470b57cec5SDimitry Andric 21480b57cec5SDimitry Andric const unsigned FractBits = 52; 21490b57cec5SDimitry Andric 21500b57cec5SDimitry Andric // Extract the sign bit. 
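// In the IEEE-754 binary64 layout, bit 63 holds the sign, bits 62..52 the
// 11-bit exponent, and bits 51..0 the 52-bit fraction. After the unmerge
// above, the sign is therefore bit 31 of the high 32-bit half.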
21510b57cec5SDimitry Andric const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 21520b57cec5SDimitry Andric auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 21530b57cec5SDimitry Andric 21540b57cec5SDimitry Andric const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 21550b57cec5SDimitry Andric 21560b57cec5SDimitry Andric const auto Zero32 = B.buildConstant(S32, 0); 21570b57cec5SDimitry Andric 21580b57cec5SDimitry Andric // Extend back to 64-bits. 2159*bdd1243dSDimitry Andric auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit}); 21600b57cec5SDimitry Andric 21610b57cec5SDimitry Andric auto Shr = B.buildAShr(S64, FractMask, Exp); 21620b57cec5SDimitry Andric auto Not = B.buildNot(S64, Shr); 21630b57cec5SDimitry Andric auto Tmp0 = B.buildAnd(S64, Src, Not); 21640b57cec5SDimitry Andric auto FiftyOne = B.buildConstant(S32, FractBits - 1); 21650b57cec5SDimitry Andric 21660b57cec5SDimitry Andric auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 21670b57cec5SDimitry Andric auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 21680b57cec5SDimitry Andric 21690b57cec5SDimitry Andric auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 21700b57cec5SDimitry Andric B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 2171e8d8bef9SDimitry Andric MI.eraseFromParent(); 21720b57cec5SDimitry Andric return true; 21730b57cec5SDimitry Andric } 21740b57cec5SDimitry Andric 21750b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeITOFP( 21760b57cec5SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 21770b57cec5SDimitry Andric MachineIRBuilder &B, bool Signed) const { 21780b57cec5SDimitry Andric 21790b57cec5SDimitry Andric Register Dst = MI.getOperand(0).getReg(); 21800b57cec5SDimitry Andric Register Src = MI.getOperand(1).getReg(); 21810b57cec5SDimitry Andric 21820b57cec5SDimitry Andric const LLT S64 = LLT::scalar(64); 21830b57cec5SDimitry Andric const LLT S32 = LLT::scalar(32); 21840b57cec5SDimitry Andric 2185349cc55cSDimitry Andric assert(MRI.getType(Src) == S64); 21860b57cec5SDimitry Andric 21870b57cec5SDimitry Andric auto Unmerge = B.buildUnmerge({S32, S32}, Src); 2188349cc55cSDimitry Andric auto ThirtyTwo = B.buildConstant(S32, 32); 21890b57cec5SDimitry Andric 2190349cc55cSDimitry Andric if (MRI.getType(Dst) == S64) { 2191349cc55cSDimitry Andric auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1)) 2192349cc55cSDimitry Andric : B.buildUITOFP(S64, Unmerge.getReg(1)); 21930b57cec5SDimitry Andric 21940b57cec5SDimitry Andric auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 21950b57cec5SDimitry Andric auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 21960b57cec5SDimitry Andric .addUse(CvtHi.getReg(0)) 21970b57cec5SDimitry Andric .addUse(ThirtyTwo.getReg(0)); 21980b57cec5SDimitry Andric 21990b57cec5SDimitry Andric // TODO: Should this propagate fast-math-flags? 
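// The conversion is done piecewise: the high half is converted with the
// requested signedness and scaled by 2^32 via ldexp, then the always-unsigned
// low half is added. Worked example (illustrative, unsigned): src =
// 0x100000005 gives hi = 1, lo = 5, so the result is
// ldexp(1.0, 32) + 5.0 = 4294967301.0.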
22000b57cec5SDimitry Andric B.buildFAdd(Dst, LdExp, CvtLo); 22010b57cec5SDimitry Andric MI.eraseFromParent(); 22020b57cec5SDimitry Andric return true; 22030b57cec5SDimitry Andric } 22040b57cec5SDimitry Andric 2205349cc55cSDimitry Andric assert(MRI.getType(Dst) == S32); 2206349cc55cSDimitry Andric 2207349cc55cSDimitry Andric auto One = B.buildConstant(S32, 1); 2208349cc55cSDimitry Andric 2209349cc55cSDimitry Andric MachineInstrBuilder ShAmt; 2210349cc55cSDimitry Andric if (Signed) { 2211349cc55cSDimitry Andric auto ThirtyOne = B.buildConstant(S32, 31); 2212349cc55cSDimitry Andric auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1)); 2213349cc55cSDimitry Andric auto OppositeSign = B.buildAShr(S32, X, ThirtyOne); 2214349cc55cSDimitry Andric auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign); 2215349cc55cSDimitry Andric auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32}, 2216349cc55cSDimitry Andric /*HasSideEffects=*/false) 2217349cc55cSDimitry Andric .addUse(Unmerge.getReg(1)); 2218349cc55cSDimitry Andric auto LS2 = B.buildSub(S32, LS, One); 2219349cc55cSDimitry Andric ShAmt = B.buildUMin(S32, LS2, MaxShAmt); 2220349cc55cSDimitry Andric } else 2221349cc55cSDimitry Andric ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1)); 2222349cc55cSDimitry Andric auto Norm = B.buildShl(S64, Src, ShAmt); 2223349cc55cSDimitry Andric auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm); 2224349cc55cSDimitry Andric auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0)); 2225349cc55cSDimitry Andric auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust); 2226349cc55cSDimitry Andric auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2); 2227349cc55cSDimitry Andric auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt); 2228349cc55cSDimitry Andric B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst}, 2229349cc55cSDimitry Andric /*HasSideEffects=*/false) 2230349cc55cSDimitry Andric .addUse(FVal.getReg(0)) 2231349cc55cSDimitry Andric .addUse(Scale.getReg(0)); 2232349cc55cSDimitry Andric MI.eraseFromParent(); 2233349cc55cSDimitry Andric return true; 2234349cc55cSDimitry Andric } 2235349cc55cSDimitry Andric 22365ffd83dbSDimitry Andric // TODO: Copied from DAG implementation. Verify logic and document how this 22375ffd83dbSDimitry Andric // actually works. 
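// A worked example of the scheme below (unsigned f64 case, illustrative
// numbers): val = 2^33 + 7.25. Then tf = 2^33 + 7, hif = floor(tf * 2^-32) =
// 2.0, and lof = fma(hif, -2^32, tf) = 7.0. Converting the pieces yields
// hi = 2 and lo = 7, and merging {lo, hi} reconstitutes 0x200000007.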
2238fe6060f1SDimitry Andric bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
2239fe6060f1SDimitry Andric                                         MachineRegisterInfo &MRI,
2240fe6060f1SDimitry Andric                                         MachineIRBuilder &B,
2241fe6060f1SDimitry Andric                                         bool Signed) const {
22425ffd83dbSDimitry Andric 
22435ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
22445ffd83dbSDimitry Andric   Register Src = MI.getOperand(1).getReg();
22455ffd83dbSDimitry Andric 
22465ffd83dbSDimitry Andric   const LLT S64 = LLT::scalar(64);
22475ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
22485ffd83dbSDimitry Andric 
2249fe6060f1SDimitry Andric   const LLT SrcLT = MRI.getType(Src);
2250fe6060f1SDimitry Andric   assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
22515ffd83dbSDimitry Andric 
22525ffd83dbSDimitry Andric   unsigned Flags = MI.getFlags();
22535ffd83dbSDimitry Andric 
2254fe6060f1SDimitry Andric   // The basic idea of converting a floating point number into a pair of 32-bit
2255fe6060f1SDimitry Andric   // integers is illustrated as follows:
2256fe6060f1SDimitry Andric   //
2257fe6060f1SDimitry Andric   //   tf := trunc(val);
2258fe6060f1SDimitry Andric   //   hif := floor(tf * 2^-32);
2259fe6060f1SDimitry Andric   //   lof := tf - hif * 2^32; // lof is always positive due to floor.
2260fe6060f1SDimitry Andric   //   hi := fptoi(hif);
2261fe6060f1SDimitry Andric   //   lo := fptoi(lof);
2262fe6060f1SDimitry Andric   //
2263fe6060f1SDimitry Andric   auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2264fe6060f1SDimitry Andric   MachineInstrBuilder Sign;
2265fe6060f1SDimitry Andric   if (Signed && SrcLT == S32) {
2266fe6060f1SDimitry Andric     // However, a 32-bit floating point number has only a 23-bit mantissa,
2267fe6060f1SDimitry Andric     // which is not enough to hold all the significant bits of `lof` if val is
2268fe6060f1SDimitry Andric     // negative. To avoid the loss of precision, we take the absolute value
2269fe6060f1SDimitry Andric     // after truncating and flip the result back based on the original
2270fe6060f1SDimitry Andric     // signedness.
2271fe6060f1SDimitry Andric     Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2272fe6060f1SDimitry Andric     Trunc = B.buildFAbs(S32, Trunc, Flags);
2273fe6060f1SDimitry Andric   }
2274fe6060f1SDimitry Andric   MachineInstrBuilder K0, K1;
2275fe6060f1SDimitry Andric   if (SrcLT == S64) {
2276fe6060f1SDimitry Andric     K0 = B.buildFConstant(S64,
2277fe6060f1SDimitry Andric                           BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2278fe6060f1SDimitry Andric     K1 = B.buildFConstant(S64,
2279fe6060f1SDimitry Andric                           BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2280fe6060f1SDimitry Andric   } else {
2281fe6060f1SDimitry Andric     K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
2282fe6060f1SDimitry Andric     K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
2283fe6060f1SDimitry Andric   }
22845ffd83dbSDimitry Andric 
2285fe6060f1SDimitry Andric   auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2286fe6060f1SDimitry Andric   auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2287fe6060f1SDimitry Andric   auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
22885ffd83dbSDimitry Andric 
2289fe6060f1SDimitry Andric   auto Hi = (Signed && SrcLT == S64) ?
B.buildFPTOSI(S32, FloorMul) 2290fe6060f1SDimitry Andric : B.buildFPTOUI(S32, FloorMul); 22915ffd83dbSDimitry Andric auto Lo = B.buildFPTOUI(S32, Fma); 22925ffd83dbSDimitry Andric 2293fe6060f1SDimitry Andric if (Signed && SrcLT == S32) { 2294fe6060f1SDimitry Andric // Flip the result based on the signedness, which is either all 0s or 1s. 2295*bdd1243dSDimitry Andric Sign = B.buildMergeLikeInstr(S64, {Sign, Sign}); 2296fe6060f1SDimitry Andric // r := xor({lo, hi}, sign) - sign; 2297*bdd1243dSDimitry Andric B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign), 2298*bdd1243dSDimitry Andric Sign); 2299fe6060f1SDimitry Andric } else 2300*bdd1243dSDimitry Andric B.buildMergeLikeInstr(Dst, {Lo, Hi}); 23015ffd83dbSDimitry Andric MI.eraseFromParent(); 23025ffd83dbSDimitry Andric 23035ffd83dbSDimitry Andric return true; 23045ffd83dbSDimitry Andric } 23055ffd83dbSDimitry Andric 23065ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 23075ffd83dbSDimitry Andric MachineInstr &MI) const { 23085ffd83dbSDimitry Andric MachineFunction &MF = Helper.MIRBuilder.getMF(); 23090b57cec5SDimitry Andric const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 23100b57cec5SDimitry Andric 23110b57cec5SDimitry Andric const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 23120b57cec5SDimitry Andric MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 23130b57cec5SDimitry Andric 23140b57cec5SDimitry Andric // With ieee_mode disabled, the instructions have the correct behavior 23150b57cec5SDimitry Andric // already for G_FMINNUM/G_FMAXNUM 23160b57cec5SDimitry Andric if (!MFI->getMode().IEEE) 23170b57cec5SDimitry Andric return !IsIEEEOp; 23180b57cec5SDimitry Andric 23190b57cec5SDimitry Andric if (IsIEEEOp) 23200b57cec5SDimitry Andric return true; 23210b57cec5SDimitry Andric 23220b57cec5SDimitry Andric return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 23230b57cec5SDimitry Andric } 23240b57cec5SDimitry Andric 23250b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 23260b57cec5SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 23270b57cec5SDimitry Andric MachineIRBuilder &B) const { 23280b57cec5SDimitry Andric // TODO: Should move some of this into LegalizerHelper. 23290b57cec5SDimitry Andric 23300b57cec5SDimitry Andric // TODO: Promote dynamic indexing of s16 to s32 23315ffd83dbSDimitry Andric 23325ffd83dbSDimitry Andric // FIXME: Artifact combiner probably should have replaced the truncated 23335ffd83dbSDimitry Andric // constant before this, so we shouldn't need 2334349cc55cSDimitry Andric // getIConstantVRegValWithLookThrough. 2335*bdd1243dSDimitry Andric std::optional<ValueAndVReg> MaybeIdxVal = 2336349cc55cSDimitry Andric getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); 2337e8d8bef9SDimitry Andric if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 
23380b57cec5SDimitry Andric return true; 2339*bdd1243dSDimitry Andric const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue(); 23400b57cec5SDimitry Andric 23410b57cec5SDimitry Andric Register Dst = MI.getOperand(0).getReg(); 23420b57cec5SDimitry Andric Register Vec = MI.getOperand(1).getReg(); 23430b57cec5SDimitry Andric 23440b57cec5SDimitry Andric LLT VecTy = MRI.getType(Vec); 23450b57cec5SDimitry Andric LLT EltTy = VecTy.getElementType(); 23460b57cec5SDimitry Andric assert(EltTy == MRI.getType(Dst)); 23470b57cec5SDimitry Andric 234804eeddc0SDimitry Andric if (IdxVal < VecTy.getNumElements()) { 234904eeddc0SDimitry Andric auto Unmerge = B.buildUnmerge(EltTy, Vec); 235004eeddc0SDimitry Andric B.buildCopy(Dst, Unmerge.getReg(IdxVal)); 235104eeddc0SDimitry Andric } else { 23520b57cec5SDimitry Andric B.buildUndef(Dst); 235304eeddc0SDimitry Andric } 23540b57cec5SDimitry Andric 23550b57cec5SDimitry Andric MI.eraseFromParent(); 23560b57cec5SDimitry Andric return true; 23570b57cec5SDimitry Andric } 23580b57cec5SDimitry Andric 23590b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 23600b57cec5SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 23610b57cec5SDimitry Andric MachineIRBuilder &B) const { 23620b57cec5SDimitry Andric // TODO: Should move some of this into LegalizerHelper. 23630b57cec5SDimitry Andric 23640b57cec5SDimitry Andric // TODO: Promote dynamic indexing of s16 to s32 23655ffd83dbSDimitry Andric 23665ffd83dbSDimitry Andric // FIXME: Artifact combiner probably should have replaced the truncated 23675ffd83dbSDimitry Andric // constant before this, so we shouldn't need 2368349cc55cSDimitry Andric // getIConstantVRegValWithLookThrough. 2369*bdd1243dSDimitry Andric std::optional<ValueAndVReg> MaybeIdxVal = 2370349cc55cSDimitry Andric getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); 2371e8d8bef9SDimitry Andric if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 
23720b57cec5SDimitry Andric return true; 23730b57cec5SDimitry Andric 2374*bdd1243dSDimitry Andric const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue(); 23750b57cec5SDimitry Andric Register Dst = MI.getOperand(0).getReg(); 23760b57cec5SDimitry Andric Register Vec = MI.getOperand(1).getReg(); 23770b57cec5SDimitry Andric Register Ins = MI.getOperand(2).getReg(); 23780b57cec5SDimitry Andric 23790b57cec5SDimitry Andric LLT VecTy = MRI.getType(Vec); 23800b57cec5SDimitry Andric LLT EltTy = VecTy.getElementType(); 23810b57cec5SDimitry Andric assert(EltTy == MRI.getType(Ins)); 238204eeddc0SDimitry Andric (void)Ins; 23830b57cec5SDimitry Andric 238404eeddc0SDimitry Andric unsigned NumElts = VecTy.getNumElements(); 238504eeddc0SDimitry Andric if (IdxVal < NumElts) { 238604eeddc0SDimitry Andric SmallVector<Register, 8> SrcRegs; 238704eeddc0SDimitry Andric for (unsigned i = 0; i < NumElts; ++i) 238804eeddc0SDimitry Andric SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy)); 238904eeddc0SDimitry Andric B.buildUnmerge(SrcRegs, Vec); 239004eeddc0SDimitry Andric 239104eeddc0SDimitry Andric SrcRegs[IdxVal] = MI.getOperand(2).getReg(); 2392*bdd1243dSDimitry Andric B.buildMergeLikeInstr(Dst, SrcRegs); 239304eeddc0SDimitry Andric } else { 23940b57cec5SDimitry Andric B.buildUndef(Dst); 239504eeddc0SDimitry Andric } 23960b57cec5SDimitry Andric 23970b57cec5SDimitry Andric MI.eraseFromParent(); 23980b57cec5SDimitry Andric return true; 23990b57cec5SDimitry Andric } 24000b57cec5SDimitry Andric 24018bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeSinCos( 24028bcb0991SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 24038bcb0991SDimitry Andric MachineIRBuilder &B) const { 24048bcb0991SDimitry Andric 24058bcb0991SDimitry Andric Register DstReg = MI.getOperand(0).getReg(); 24068bcb0991SDimitry Andric Register SrcReg = MI.getOperand(1).getReg(); 24078bcb0991SDimitry Andric LLT Ty = MRI.getType(DstReg); 24088bcb0991SDimitry Andric unsigned Flags = MI.getFlags(); 24098bcb0991SDimitry Andric 24108bcb0991SDimitry Andric Register TrigVal; 24115ffd83dbSDimitry Andric auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 24128bcb0991SDimitry Andric if (ST.hasTrigReducedRange()) { 24138bcb0991SDimitry Andric auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 24148bcb0991SDimitry Andric TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 24158bcb0991SDimitry Andric .addUse(MulVal.getReg(0)) 24168bcb0991SDimitry Andric .setMIFlags(Flags).getReg(0); 24178bcb0991SDimitry Andric } else 24188bcb0991SDimitry Andric TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 24198bcb0991SDimitry Andric 24208bcb0991SDimitry Andric Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 
24218bcb0991SDimitry Andric Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 2422*bdd1243dSDimitry Andric B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg), false) 24238bcb0991SDimitry Andric .addUse(TrigVal) 24248bcb0991SDimitry Andric .setMIFlags(Flags); 24258bcb0991SDimitry Andric MI.eraseFromParent(); 24268bcb0991SDimitry Andric return true; 24278bcb0991SDimitry Andric } 24288bcb0991SDimitry Andric 24295ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 24305ffd83dbSDimitry Andric MachineIRBuilder &B, 24315ffd83dbSDimitry Andric const GlobalValue *GV, 24325ffd83dbSDimitry Andric int64_t Offset, 24335ffd83dbSDimitry Andric unsigned GAFlags) const { 24345ffd83dbSDimitry Andric assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 24358bcb0991SDimitry Andric // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 24368bcb0991SDimitry Andric // to the following code sequence: 24378bcb0991SDimitry Andric // 24388bcb0991SDimitry Andric // For constant address space: 24398bcb0991SDimitry Andric // s_getpc_b64 s[0:1] 24408bcb0991SDimitry Andric // s_add_u32 s0, s0, $symbol 24418bcb0991SDimitry Andric // s_addc_u32 s1, s1, 0 24428bcb0991SDimitry Andric // 24438bcb0991SDimitry Andric // s_getpc_b64 returns the address of the s_add_u32 instruction and then 24448bcb0991SDimitry Andric // a fixup or relocation is emitted to replace $symbol with a literal 24458bcb0991SDimitry Andric // constant, which is a pc-relative offset from the encoding of the $symbol 24468bcb0991SDimitry Andric // operand to the global variable. 24478bcb0991SDimitry Andric // 24488bcb0991SDimitry Andric // For global address space: 24498bcb0991SDimitry Andric // s_getpc_b64 s[0:1] 24508bcb0991SDimitry Andric // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 24518bcb0991SDimitry Andric // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 24528bcb0991SDimitry Andric // 24538bcb0991SDimitry Andric // s_getpc_b64 returns the address of the s_add_u32 instruction and then 24548bcb0991SDimitry Andric // fixups or relocations are emitted to replace $symbol@*@lo and 24558bcb0991SDimitry Andric // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 24568bcb0991SDimitry Andric // which is a 64-bit pc-relative offset from the encoding of the $symbol 24578bcb0991SDimitry Andric // operand to the global variable. 24588bcb0991SDimitry Andric // 24598bcb0991SDimitry Andric // What we want here is an offset from the value returned by s_getpc 24608bcb0991SDimitry Andric // (which is the address of the s_add_u32 instruction) to the global 24618bcb0991SDimitry Andric // variable, but since the encoding of $symbol starts 4 bytes after the start 24628bcb0991SDimitry Andric // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 24638bcb0991SDimitry Andric // small. This requires us to add 4 to the global variable offset in order to 2464e8d8bef9SDimitry Andric // compute the correct address. Similarly for the s_addc_u32 instruction, the 2465e8d8bef9SDimitry Andric // encoding of $symbol starts 12 bytes after the start of the s_add_u32 2466e8d8bef9SDimitry Andric // instruction. 24678bcb0991SDimitry Andric 24688bcb0991SDimitry Andric LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 24698bcb0991SDimitry Andric 24708bcb0991SDimitry Andric Register PCReg = PtrTy.getSizeInBits() != 32 ? 
DstReg : 24718bcb0991SDimitry Andric B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 24728bcb0991SDimitry Andric 24738bcb0991SDimitry Andric MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 24748bcb0991SDimitry Andric .addDef(PCReg); 24758bcb0991SDimitry Andric 24768bcb0991SDimitry Andric MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 24778bcb0991SDimitry Andric if (GAFlags == SIInstrInfo::MO_NONE) 24788bcb0991SDimitry Andric MIB.addImm(0); 24798bcb0991SDimitry Andric else 2480e8d8bef9SDimitry Andric MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); 24818bcb0991SDimitry Andric 24828bcb0991SDimitry Andric B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 24838bcb0991SDimitry Andric 24848bcb0991SDimitry Andric if (PtrTy.getSizeInBits() == 32) 24858bcb0991SDimitry Andric B.buildExtract(DstReg, PCReg, 0); 24868bcb0991SDimitry Andric return true; 24878bcb0991SDimitry Andric } 24888bcb0991SDimitry Andric 24898bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeGlobalValue( 24908bcb0991SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 24918bcb0991SDimitry Andric MachineIRBuilder &B) const { 24928bcb0991SDimitry Andric Register DstReg = MI.getOperand(0).getReg(); 24938bcb0991SDimitry Andric LLT Ty = MRI.getType(DstReg); 24948bcb0991SDimitry Andric unsigned AS = Ty.getAddressSpace(); 24958bcb0991SDimitry Andric 24968bcb0991SDimitry Andric const GlobalValue *GV = MI.getOperand(1).getGlobal(); 24978bcb0991SDimitry Andric MachineFunction &MF = B.getMF(); 24988bcb0991SDimitry Andric SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 24998bcb0991SDimitry Andric 25008bcb0991SDimitry Andric if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 2501fe6060f1SDimitry Andric if (!MFI->isModuleEntryFunction() && 2502fe6060f1SDimitry Andric !GV->getName().equals("llvm.amdgcn.module.lds")) { 25038bcb0991SDimitry Andric const Function &Fn = MF.getFunction(); 25048bcb0991SDimitry Andric DiagnosticInfoUnsupported BadLDSDecl( 25055ffd83dbSDimitry Andric Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 25065ffd83dbSDimitry Andric DS_Warning); 25078bcb0991SDimitry Andric Fn.getContext().diagnose(BadLDSDecl); 25085ffd83dbSDimitry Andric 25095ffd83dbSDimitry Andric // We currently don't have a way to correctly allocate LDS objects that 25105ffd83dbSDimitry Andric // aren't directly associated with a kernel. We do force inlining of 25115ffd83dbSDimitry Andric // functions that use local objects. However, if these dead functions are 25125ffd83dbSDimitry Andric // not eliminated, we don't want a compile time error. Just emit a warning 25135ffd83dbSDimitry Andric // and a trap, since there should be no callable path here. 25145ffd83dbSDimitry Andric B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 25155ffd83dbSDimitry Andric B.buildUndef(DstReg); 25165ffd83dbSDimitry Andric MI.eraseFromParent(); 25175ffd83dbSDimitry Andric return true; 25188bcb0991SDimitry Andric } 25198bcb0991SDimitry Andric 25208bcb0991SDimitry Andric // TODO: We could emit code to handle the initialization somewhere. 2521349cc55cSDimitry Andric // We ignore the initializer for now and legalize it to allow selection. 2522349cc55cSDimitry Andric // The initializer will anyway get errored out during assembly emission. 
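// Below this point, unless shouldUseLDSConstAddress routes the global down
// the MO_ABS32_LO relocation path, an LDS global collapses to a compile-time
// byte offset from the start of the kernel's LDS block. As a sketch, if
// allocateLDSGlobal were to place a hypothetical @shared.arr at offset 512,
// the G_GLOBAL_VALUE would become:
//
//   %addr:_(p3) = G_CONSTANT i32 512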
25235ffd83dbSDimitry Andric     const SITargetLowering *TLI = ST.getTargetLowering();
25245ffd83dbSDimitry Andric     if (!TLI->shouldUseLDSConstAddress(GV)) {
25255ffd83dbSDimitry Andric       MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
25265ffd83dbSDimitry Andric       return true; // Leave in place.
25275ffd83dbSDimitry Andric     }
25285ffd83dbSDimitry Andric 
2529e8d8bef9SDimitry Andric     if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
2530e8d8bef9SDimitry Andric       Type *Ty = GV->getValueType();
2531e8d8bef9SDimitry Andric       // HIP uses an unsized array `extern __shared__ T s[]` or a similar
2532e8d8bef9SDimitry Andric       // zero-sized type in other languages to declare dynamic shared memory
2533e8d8bef9SDimitry Andric       // whose size is not known at compile time. It is allocated by the
2534e8d8bef9SDimitry Andric       // runtime and placed directly after the statically allocated ones.
2535e8d8bef9SDimitry Andric       // All such variables share the same offset.
2536e8d8bef9SDimitry Andric       if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
2537e8d8bef9SDimitry Andric         // Adjust alignment for that dynamic shared memory array.
2538e8d8bef9SDimitry Andric         MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
2539e8d8bef9SDimitry Andric         LLT S32 = LLT::scalar(32);
2540e8d8bef9SDimitry Andric         auto Sz =
2541e8d8bef9SDimitry Andric             B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
2542e8d8bef9SDimitry Andric         B.buildIntToPtr(DstReg, Sz);
2543e8d8bef9SDimitry Andric         MI.eraseFromParent();
2544e8d8bef9SDimitry Andric         return true;
2545e8d8bef9SDimitry Andric       }
2546e8d8bef9SDimitry Andric     }
2547e8d8bef9SDimitry Andric 
2548349cc55cSDimitry Andric     B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(),
2549349cc55cSDimitry Andric                                                    *cast<GlobalVariable>(GV)));
25508bcb0991SDimitry Andric     MI.eraseFromParent();
25518bcb0991SDimitry Andric     return true;
25528bcb0991SDimitry Andric   }
25538bcb0991SDimitry Andric 
25548bcb0991SDimitry Andric   const SITargetLowering *TLI = ST.getTargetLowering();
25558bcb0991SDimitry Andric 
25568bcb0991SDimitry Andric   if (TLI->shouldEmitFixup(GV)) {
25578bcb0991SDimitry Andric     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
25588bcb0991SDimitry Andric     MI.eraseFromParent();
25598bcb0991SDimitry Andric     return true;
25608bcb0991SDimitry Andric   }
25618bcb0991SDimitry Andric 
25628bcb0991SDimitry Andric   if (TLI->shouldEmitPCReloc(GV)) {
25638bcb0991SDimitry Andric     buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
25648bcb0991SDimitry Andric     MI.eraseFromParent();
25658bcb0991SDimitry Andric     return true;
25668bcb0991SDimitry Andric   }
25678bcb0991SDimitry Andric 
25688bcb0991SDimitry Andric   LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
25698bcb0991SDimitry Andric   Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
25708bcb0991SDimitry Andric 
2571fe6060f1SDimitry Andric   LLT LoadTy = Ty.getSizeInBits() == 32 ?
PtrTy : Ty; 25728bcb0991SDimitry Andric MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 25738bcb0991SDimitry Andric MachinePointerInfo::getGOT(MF), 25748bcb0991SDimitry Andric MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 25758bcb0991SDimitry Andric MachineMemOperand::MOInvariant, 2576fe6060f1SDimitry Andric LoadTy, Align(8)); 25778bcb0991SDimitry Andric 25788bcb0991SDimitry Andric buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 25798bcb0991SDimitry Andric 25808bcb0991SDimitry Andric if (Ty.getSizeInBits() == 32) { 2581349cc55cSDimitry Andric // Truncate if this is a 32-bit constant address. 25828bcb0991SDimitry Andric auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 25838bcb0991SDimitry Andric B.buildExtract(DstReg, Load, 0); 25848bcb0991SDimitry Andric } else 25858bcb0991SDimitry Andric B.buildLoad(DstReg, GOTAddr, *GOTMMO); 25868bcb0991SDimitry Andric 25878bcb0991SDimitry Andric MI.eraseFromParent(); 25888bcb0991SDimitry Andric return true; 25898bcb0991SDimitry Andric } 25908bcb0991SDimitry Andric 2591e8d8bef9SDimitry Andric static LLT widenToNextPowerOf2(LLT Ty) { 2592e8d8bef9SDimitry Andric if (Ty.isVector()) 2593fe6060f1SDimitry Andric return Ty.changeElementCount( 2594fe6060f1SDimitry Andric ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements()))); 2595e8d8bef9SDimitry Andric return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2596e8d8bef9SDimitry Andric } 2597e8d8bef9SDimitry Andric 2598e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2599e8d8bef9SDimitry Andric MachineInstr &MI) const { 2600e8d8bef9SDimitry Andric MachineIRBuilder &B = Helper.MIRBuilder; 2601e8d8bef9SDimitry Andric MachineRegisterInfo &MRI = *B.getMRI(); 2602e8d8bef9SDimitry Andric GISelChangeObserver &Observer = Helper.Observer; 2603e8d8bef9SDimitry Andric 2604e8d8bef9SDimitry Andric Register PtrReg = MI.getOperand(1).getReg(); 2605e8d8bef9SDimitry Andric LLT PtrTy = MRI.getType(PtrReg); 2606e8d8bef9SDimitry Andric unsigned AddrSpace = PtrTy.getAddressSpace(); 2607e8d8bef9SDimitry Andric 2608e8d8bef9SDimitry Andric if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 26098bcb0991SDimitry Andric LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2610e8d8bef9SDimitry Andric auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 26118bcb0991SDimitry Andric Observer.changingInstr(MI); 26128bcb0991SDimitry Andric MI.getOperand(1).setReg(Cast.getReg(0)); 26138bcb0991SDimitry Andric Observer.changedInstr(MI); 26148bcb0991SDimitry Andric return true; 26158bcb0991SDimitry Andric } 26168bcb0991SDimitry Andric 2617fe6060f1SDimitry Andric if (MI.getOpcode() != AMDGPU::G_LOAD) 2618fe6060f1SDimitry Andric return false; 2619fe6060f1SDimitry Andric 2620e8d8bef9SDimitry Andric Register ValReg = MI.getOperand(0).getReg(); 2621e8d8bef9SDimitry Andric LLT ValTy = MRI.getType(ValReg); 2622e8d8bef9SDimitry Andric 2623e8d8bef9SDimitry Andric MachineMemOperand *MMO = *MI.memoperands_begin(); 2624e8d8bef9SDimitry Andric const unsigned ValSize = ValTy.getSizeInBits(); 2625fe6060f1SDimitry Andric const LLT MemTy = MMO->getMemoryType(); 2626e8d8bef9SDimitry Andric const Align MemAlign = MMO->getAlign(); 2627fe6060f1SDimitry Andric const unsigned MemSize = MemTy.getSizeInBits(); 262804eeddc0SDimitry Andric const uint64_t AlignInBits = 8 * MemAlign.value(); 2629e8d8bef9SDimitry Andric 2630e8d8bef9SDimitry Andric // Widen non-power-of-2 loads to the alignment if needed 2631fe6060f1SDimitry Andric if (shouldWidenLoad(ST, MemTy, 
AlignInBits, AddrSpace, MI.getOpcode())) { 2632e8d8bef9SDimitry Andric const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2633e8d8bef9SDimitry Andric 2634e8d8bef9SDimitry Andric // This was already the correct extending load result type, so just adjust 2635e8d8bef9SDimitry Andric // the memory type. 2636e8d8bef9SDimitry Andric if (WideMemSize == ValSize) { 2637e8d8bef9SDimitry Andric MachineFunction &MF = B.getMF(); 2638e8d8bef9SDimitry Andric 2639e8d8bef9SDimitry Andric MachineMemOperand *WideMMO = 2640e8d8bef9SDimitry Andric MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2641e8d8bef9SDimitry Andric Observer.changingInstr(MI); 2642e8d8bef9SDimitry Andric MI.setMemRefs(MF, {WideMMO}); 2643e8d8bef9SDimitry Andric Observer.changedInstr(MI); 2644e8d8bef9SDimitry Andric return true; 2645e8d8bef9SDimitry Andric } 2646e8d8bef9SDimitry Andric 2647e8d8bef9SDimitry Andric // Don't bother handling edge case that should probably never be produced. 2648e8d8bef9SDimitry Andric if (ValSize > WideMemSize) 2649e8d8bef9SDimitry Andric return false; 2650e8d8bef9SDimitry Andric 2651e8d8bef9SDimitry Andric LLT WideTy = widenToNextPowerOf2(ValTy); 2652e8d8bef9SDimitry Andric 2653e8d8bef9SDimitry Andric Register WideLoad; 2654e8d8bef9SDimitry Andric if (!WideTy.isVector()) { 2655e8d8bef9SDimitry Andric WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2656e8d8bef9SDimitry Andric B.buildTrunc(ValReg, WideLoad).getReg(0); 2657e8d8bef9SDimitry Andric } else { 2658e8d8bef9SDimitry Andric // Extract the subvector. 2659e8d8bef9SDimitry Andric 2660e8d8bef9SDimitry Andric if (isRegisterType(ValTy)) { 2661e8d8bef9SDimitry Andric // If this a case where G_EXTRACT is legal, use it. 2662e8d8bef9SDimitry Andric // (e.g. <3 x s32> -> <4 x s32>) 2663e8d8bef9SDimitry Andric WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2664e8d8bef9SDimitry Andric B.buildExtract(ValReg, WideLoad, 0); 2665e8d8bef9SDimitry Andric } else { 2666e8d8bef9SDimitry Andric // For cases where the widened type isn't a nice register value, unmerge 2667e8d8bef9SDimitry Andric // from a widened register (e.g. <3 x s16> -> <4 x s16>) 26680eae32dcSDimitry Andric WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 26690eae32dcSDimitry Andric B.buildDeleteTrailingVectorElements(ValReg, WideLoad); 2670e8d8bef9SDimitry Andric } 2671e8d8bef9SDimitry Andric } 2672e8d8bef9SDimitry Andric 2673e8d8bef9SDimitry Andric MI.eraseFromParent(); 2674e8d8bef9SDimitry Andric return true; 2675e8d8bef9SDimitry Andric } 2676e8d8bef9SDimitry Andric 2677e8d8bef9SDimitry Andric return false; 2678e8d8bef9SDimitry Andric } 2679e8d8bef9SDimitry Andric 26808bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFMad( 26818bcb0991SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 26828bcb0991SDimitry Andric MachineIRBuilder &B) const { 26838bcb0991SDimitry Andric LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 26848bcb0991SDimitry Andric assert(Ty.isScalar()); 26858bcb0991SDimitry Andric 2686480093f4SDimitry Andric MachineFunction &MF = B.getMF(); 2687480093f4SDimitry Andric const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2688480093f4SDimitry Andric 26898bcb0991SDimitry Andric // TODO: Always legal with future ftz flag. 26905ffd83dbSDimitry Andric // FIXME: Do we need just output? 
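// G_FMAD (multiply-add with the intermediate product rounded) maps onto the
// V_MAD/V_MAC family, which flushes denormal results, so it is only kept
// legal when denormal handling is disabled for the type. The fallback
// lowering is the unfused pair, roughly:
//
//   %t:_(s32) = G_FMUL %a, %b
//   %r:_(s32) = G_FADD %t, %c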
26915ffd83dbSDimitry Andric if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 26928bcb0991SDimitry Andric return true; 26935ffd83dbSDimitry Andric if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 26948bcb0991SDimitry Andric return true; 26958bcb0991SDimitry Andric 26968bcb0991SDimitry Andric MachineIRBuilder HelperBuilder(MI); 26978bcb0991SDimitry Andric GISelObserverWrapper DummyObserver; 26988bcb0991SDimitry Andric LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 26998bcb0991SDimitry Andric return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 27008bcb0991SDimitry Andric } 27018bcb0991SDimitry Andric 2702480093f4SDimitry Andric bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2703480093f4SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2704480093f4SDimitry Andric Register DstReg = MI.getOperand(0).getReg(); 2705480093f4SDimitry Andric Register PtrReg = MI.getOperand(1).getReg(); 2706480093f4SDimitry Andric Register CmpVal = MI.getOperand(2).getReg(); 2707480093f4SDimitry Andric Register NewVal = MI.getOperand(3).getReg(); 2708480093f4SDimitry Andric 2709e8d8bef9SDimitry Andric assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2710480093f4SDimitry Andric "this should not have been custom lowered"); 2711480093f4SDimitry Andric 2712480093f4SDimitry Andric LLT ValTy = MRI.getType(CmpVal); 2713fe6060f1SDimitry Andric LLT VecTy = LLT::fixed_vector(2, ValTy); 2714480093f4SDimitry Andric 2715480093f4SDimitry Andric Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2716480093f4SDimitry Andric 2717480093f4SDimitry Andric B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2718480093f4SDimitry Andric .addDef(DstReg) 2719480093f4SDimitry Andric .addUse(PtrReg) 2720480093f4SDimitry Andric .addUse(PackedVal) 2721480093f4SDimitry Andric .setMemRefs(MI.memoperands()); 2722480093f4SDimitry Andric 2723480093f4SDimitry Andric MI.eraseFromParent(); 2724480093f4SDimitry Andric return true; 2725480093f4SDimitry Andric } 2726480093f4SDimitry Andric 27275ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeFlog( 27285ffd83dbSDimitry Andric MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 27295ffd83dbSDimitry Andric Register Dst = MI.getOperand(0).getReg(); 27305ffd83dbSDimitry Andric Register Src = MI.getOperand(1).getReg(); 27315ffd83dbSDimitry Andric LLT Ty = B.getMRI()->getType(Dst); 27325ffd83dbSDimitry Andric unsigned Flags = MI.getFlags(); 27335ffd83dbSDimitry Andric 27345ffd83dbSDimitry Andric auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 27355ffd83dbSDimitry Andric auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 27365ffd83dbSDimitry Andric 27375ffd83dbSDimitry Andric B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 27385ffd83dbSDimitry Andric MI.eraseFromParent(); 27395ffd83dbSDimitry Andric return true; 27405ffd83dbSDimitry Andric } 27415ffd83dbSDimitry Andric 27425ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 27435ffd83dbSDimitry Andric MachineIRBuilder &B) const { 27445ffd83dbSDimitry Andric Register Dst = MI.getOperand(0).getReg(); 27455ffd83dbSDimitry Andric Register Src = MI.getOperand(1).getReg(); 27465ffd83dbSDimitry Andric unsigned Flags = MI.getFlags(); 27475ffd83dbSDimitry Andric LLT Ty = B.getMRI()->getType(Dst); 27485ffd83dbSDimitry Andric 27495ffd83dbSDimitry Andric auto K = B.buildFConstant(Ty, numbers::log2e); 27505ffd83dbSDimitry Andric auto Mul = 
B.buildFMul(Ty, Src, K, Flags); 27515ffd83dbSDimitry Andric B.buildFExp2(Dst, Mul, Flags); 27525ffd83dbSDimitry Andric MI.eraseFromParent(); 27535ffd83dbSDimitry Andric return true; 27545ffd83dbSDimitry Andric } 27555ffd83dbSDimitry Andric 27565ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 27575ffd83dbSDimitry Andric MachineIRBuilder &B) const { 27585ffd83dbSDimitry Andric Register Dst = MI.getOperand(0).getReg(); 27595ffd83dbSDimitry Andric Register Src0 = MI.getOperand(1).getReg(); 27605ffd83dbSDimitry Andric Register Src1 = MI.getOperand(2).getReg(); 27615ffd83dbSDimitry Andric unsigned Flags = MI.getFlags(); 27625ffd83dbSDimitry Andric LLT Ty = B.getMRI()->getType(Dst); 27635ffd83dbSDimitry Andric const LLT S16 = LLT::scalar(16); 27645ffd83dbSDimitry Andric const LLT S32 = LLT::scalar(32); 27655ffd83dbSDimitry Andric 27665ffd83dbSDimitry Andric if (Ty == S32) { 27675ffd83dbSDimitry Andric auto Log = B.buildFLog2(S32, Src0, Flags); 27685ffd83dbSDimitry Andric auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 27695ffd83dbSDimitry Andric .addUse(Log.getReg(0)) 27705ffd83dbSDimitry Andric .addUse(Src1) 27715ffd83dbSDimitry Andric .setMIFlags(Flags); 27725ffd83dbSDimitry Andric B.buildFExp2(Dst, Mul, Flags); 27735ffd83dbSDimitry Andric } else if (Ty == S16) { 27745ffd83dbSDimitry Andric // There's no f16 fmul_legacy, so we need to convert for it. 27755ffd83dbSDimitry Andric auto Log = B.buildFLog2(S16, Src0, Flags); 27765ffd83dbSDimitry Andric auto Ext0 = B.buildFPExt(S32, Log, Flags); 27775ffd83dbSDimitry Andric auto Ext1 = B.buildFPExt(S32, Src1, Flags); 27785ffd83dbSDimitry Andric auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 27795ffd83dbSDimitry Andric .addUse(Ext0.getReg(0)) 27805ffd83dbSDimitry Andric .addUse(Ext1.getReg(0)) 27815ffd83dbSDimitry Andric .setMIFlags(Flags); 27825ffd83dbSDimitry Andric 27835ffd83dbSDimitry Andric B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 27845ffd83dbSDimitry Andric } else 27855ffd83dbSDimitry Andric return false; 27865ffd83dbSDimitry Andric 27875ffd83dbSDimitry Andric MI.eraseFromParent(); 27885ffd83dbSDimitry Andric return true; 27895ffd83dbSDimitry Andric } 27905ffd83dbSDimitry Andric 27915ffd83dbSDimitry Andric // Find a source register, ignoring any possible source modifiers. 
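// For example (register names illustrative): given %m = G_FABS %x, or
// %m = G_FNEG of a G_FABS of %x, this returns %x; a register defined by
// neither modifier is returned unchanged.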
27925ffd83dbSDimitry Andric static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 27935ffd83dbSDimitry Andric Register ModSrc = OrigSrc; 27945ffd83dbSDimitry Andric if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 27955ffd83dbSDimitry Andric ModSrc = SrcFNeg->getOperand(1).getReg(); 27965ffd83dbSDimitry Andric if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 27975ffd83dbSDimitry Andric ModSrc = SrcFAbs->getOperand(1).getReg(); 27985ffd83dbSDimitry Andric } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 27995ffd83dbSDimitry Andric ModSrc = SrcFAbs->getOperand(1).getReg(); 28005ffd83dbSDimitry Andric return ModSrc; 28015ffd83dbSDimitry Andric } 28025ffd83dbSDimitry Andric 28035ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 28045ffd83dbSDimitry Andric MachineRegisterInfo &MRI, 28055ffd83dbSDimitry Andric MachineIRBuilder &B) const { 28065ffd83dbSDimitry Andric 28075ffd83dbSDimitry Andric const LLT S1 = LLT::scalar(1); 28085ffd83dbSDimitry Andric const LLT S64 = LLT::scalar(64); 28095ffd83dbSDimitry Andric Register Dst = MI.getOperand(0).getReg(); 28105ffd83dbSDimitry Andric Register OrigSrc = MI.getOperand(1).getReg(); 28115ffd83dbSDimitry Andric unsigned Flags = MI.getFlags(); 28125ffd83dbSDimitry Andric assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 28135ffd83dbSDimitry Andric "this should not have been custom lowered"); 28145ffd83dbSDimitry Andric 28155ffd83dbSDimitry Andric // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 28165ffd83dbSDimitry Andric // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 28175ffd83dbSDimitry Andric // efficient way to implement it is using V_FRACT_F64. The workaround for the 28185ffd83dbSDimitry Andric // V_FRACT bug is: 28195ffd83dbSDimitry Andric // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 28205ffd83dbSDimitry Andric // 28215ffd83dbSDimitry Andric // Convert floor(x) to (x - fract(x)) 28225ffd83dbSDimitry Andric 28235ffd83dbSDimitry Andric auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 28245ffd83dbSDimitry Andric .addUse(OrigSrc) 28255ffd83dbSDimitry Andric .setMIFlags(Flags); 28265ffd83dbSDimitry Andric 28275ffd83dbSDimitry Andric // Give source modifier matching some assistance before obscuring a foldable 28285ffd83dbSDimitry Andric // pattern. 28295ffd83dbSDimitry Andric 28305ffd83dbSDimitry Andric // TODO: We can avoid the neg on the fract? The input sign to fract 28315ffd83dbSDimitry Andric // shouldn't matter? 28325ffd83dbSDimitry Andric Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 28335ffd83dbSDimitry Andric 28345ffd83dbSDimitry Andric auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 28355ffd83dbSDimitry Andric 28365ffd83dbSDimitry Andric Register Min = MRI.createGenericVirtualRegister(S64); 28375ffd83dbSDimitry Andric 28385ffd83dbSDimitry Andric // We don't need to concern ourselves with the snan handling difference, so 28395ffd83dbSDimitry Andric // use the one which will directly select. 
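// Worked example of the expansion (illustrative): x = -0.5 gives
// fract(x) = 0.5, and x - 0.5 = -1.0 = floor(-0.5). Per the workaround
// formula above, the min against 0x3fefffffffffffff (the largest double
// below 1.0) only matters for inputs where the buggy V_FRACT could return
// exactly 1.0.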
28405ffd83dbSDimitry Andric const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 28415ffd83dbSDimitry Andric if (MFI->getMode().IEEE) 28425ffd83dbSDimitry Andric B.buildFMinNumIEEE(Min, Fract, Const, Flags); 28435ffd83dbSDimitry Andric else 28445ffd83dbSDimitry Andric B.buildFMinNum(Min, Fract, Const, Flags); 28455ffd83dbSDimitry Andric 28465ffd83dbSDimitry Andric Register CorrectedFract = Min; 28475ffd83dbSDimitry Andric if (!MI.getFlag(MachineInstr::FmNoNans)) { 28485ffd83dbSDimitry Andric auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 28495ffd83dbSDimitry Andric CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 28505ffd83dbSDimitry Andric } 28515ffd83dbSDimitry Andric 28525ffd83dbSDimitry Andric auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 28535ffd83dbSDimitry Andric B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 28545ffd83dbSDimitry Andric 28555ffd83dbSDimitry Andric MI.eraseFromParent(); 28565ffd83dbSDimitry Andric return true; 28575ffd83dbSDimitry Andric } 28585ffd83dbSDimitry Andric 28595ffd83dbSDimitry Andric // Turn an illegal packed v2s16 build vector into bit operations. 28605ffd83dbSDimitry Andric // TODO: This should probably be a bitcast action in LegalizerHelper. 28615ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeBuildVector( 28625ffd83dbSDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 28635ffd83dbSDimitry Andric Register Dst = MI.getOperand(0).getReg(); 28645ffd83dbSDimitry Andric const LLT S32 = LLT::scalar(32); 2865*bdd1243dSDimitry Andric const LLT S16 = LLT::scalar(16); 2866fe6060f1SDimitry Andric assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16)); 28675ffd83dbSDimitry Andric 28685ffd83dbSDimitry Andric Register Src0 = MI.getOperand(1).getReg(); 28695ffd83dbSDimitry Andric Register Src1 = MI.getOperand(2).getReg(); 28705ffd83dbSDimitry Andric 2871*bdd1243dSDimitry Andric if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) { 2872*bdd1243dSDimitry Andric assert(MRI.getType(Src0) == S32); 2873*bdd1243dSDimitry Andric Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0); 2874*bdd1243dSDimitry Andric Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0); 2875*bdd1243dSDimitry Andric } 2876*bdd1243dSDimitry Andric 2877*bdd1243dSDimitry Andric auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1}); 28785ffd83dbSDimitry Andric B.buildBitcast(Dst, Merge); 28795ffd83dbSDimitry Andric 28805ffd83dbSDimitry Andric MI.eraseFromParent(); 28815ffd83dbSDimitry Andric return true; 28825ffd83dbSDimitry Andric } 28835ffd83dbSDimitry Andric 288481ad6265SDimitry Andric // Build a big integer multiply or multiply-add using MAD_64_32 instructions. 288581ad6265SDimitry Andric // 288681ad6265SDimitry Andric // Source and accumulation registers must all be 32-bits. 288781ad6265SDimitry Andric // 288881ad6265SDimitry Andric // TODO: When the multiply is uniform, we should produce a code sequence 288981ad6265SDimitry Andric // that is better suited to instruction selection on the SALU. Instead of 289081ad6265SDimitry Andric // the outer loop going over parts of the result, the outer loop should go 289181ad6265SDimitry Andric // over parts of one of the factors. This should result in instruction 289281ad6265SDimitry Andric // selection that makes full use of S_ADDC_U32 instructions. 
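// Illustrative example, assuming a 64 x 64 -> 64 bit G_MUL split into 32-bit
// parts (Src0 = {a0, a1}, Src1 = {b0, b1}, least significant part first).
// The schoolbook expansion produced below is:
//
//   Accum[0] = lo(a0*b0)
//   Accum[1] = hi(a0*b0) + lo(a0*b1) + lo(a1*b0)
//
// with the high cross products discarded because they fall outside the
// result. Larger multiplies add further columns and the carries that
// propagate between them.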
289381ad6265SDimitry Andric void AMDGPULegalizerInfo::buildMultiply( 289481ad6265SDimitry Andric LegalizerHelper &Helper, MutableArrayRef<Register> Accum, 289581ad6265SDimitry Andric ArrayRef<Register> Src0, ArrayRef<Register> Src1, 289681ad6265SDimitry Andric bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const { 289781ad6265SDimitry Andric // Use (possibly empty) vectors of S1 registers to represent the set of 289881ad6265SDimitry Andric // carries from one pair of positions to the next. 289981ad6265SDimitry Andric using Carry = SmallVector<Register, 2>; 290081ad6265SDimitry Andric 290181ad6265SDimitry Andric MachineIRBuilder &B = Helper.MIRBuilder; 290281ad6265SDimitry Andric 290381ad6265SDimitry Andric const LLT S1 = LLT::scalar(1); 290481ad6265SDimitry Andric const LLT S32 = LLT::scalar(32); 290581ad6265SDimitry Andric const LLT S64 = LLT::scalar(64); 290681ad6265SDimitry Andric 290781ad6265SDimitry Andric Register Zero32; 290881ad6265SDimitry Andric Register Zero64; 290981ad6265SDimitry Andric 291081ad6265SDimitry Andric auto getZero32 = [&]() -> Register { 291181ad6265SDimitry Andric if (!Zero32) 291281ad6265SDimitry Andric Zero32 = B.buildConstant(S32, 0).getReg(0); 291381ad6265SDimitry Andric return Zero32; 291481ad6265SDimitry Andric }; 291581ad6265SDimitry Andric auto getZero64 = [&]() -> Register { 291681ad6265SDimitry Andric if (!Zero64) 291781ad6265SDimitry Andric Zero64 = B.buildConstant(S64, 0).getReg(0); 291881ad6265SDimitry Andric return Zero64; 291981ad6265SDimitry Andric }; 292081ad6265SDimitry Andric 292181ad6265SDimitry Andric // Merge the given carries into the 32-bit LocalAccum, which is modified 292281ad6265SDimitry Andric // in-place. 292381ad6265SDimitry Andric // 292481ad6265SDimitry Andric // Returns the carry-out, which is a single S1 register or null. 292581ad6265SDimitry Andric auto mergeCarry = 292681ad6265SDimitry Andric [&](Register &LocalAccum, const Carry &CarryIn) -> Register { 292781ad6265SDimitry Andric if (CarryIn.empty()) 292881ad6265SDimitry Andric return Register(); 292981ad6265SDimitry Andric 293081ad6265SDimitry Andric bool HaveCarryOut = true; 293181ad6265SDimitry Andric Register CarryAccum; 293281ad6265SDimitry Andric if (CarryIn.size() == 1) { 293381ad6265SDimitry Andric if (!LocalAccum) { 293481ad6265SDimitry Andric LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0); 293581ad6265SDimitry Andric return Register(); 293681ad6265SDimitry Andric } 293781ad6265SDimitry Andric 293881ad6265SDimitry Andric CarryAccum = getZero32(); 293981ad6265SDimitry Andric } else { 294081ad6265SDimitry Andric CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0); 294181ad6265SDimitry Andric for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) { 294281ad6265SDimitry Andric CarryAccum = 294381ad6265SDimitry Andric B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i]) 294481ad6265SDimitry Andric .getReg(0); 294581ad6265SDimitry Andric } 294681ad6265SDimitry Andric 294781ad6265SDimitry Andric if (!LocalAccum) { 294881ad6265SDimitry Andric LocalAccum = getZero32(); 294981ad6265SDimitry Andric HaveCarryOut = false; 295081ad6265SDimitry Andric } 295181ad6265SDimitry Andric } 295281ad6265SDimitry Andric 295381ad6265SDimitry Andric auto Add = 295481ad6265SDimitry Andric B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back()); 295581ad6265SDimitry Andric LocalAccum = Add.getReg(0); 295681ad6265SDimitry Andric return HaveCarryOut ? 
Add.getReg(1) : Register(); 295781ad6265SDimitry Andric }; 295881ad6265SDimitry Andric 295981ad6265SDimitry Andric // Build a multiply-add chain to compute 296081ad6265SDimitry Andric // 296181ad6265SDimitry Andric // LocalAccum + (partial products at DstIndex) 296281ad6265SDimitry Andric // + (opportunistic subset of CarryIn) 296381ad6265SDimitry Andric // 296481ad6265SDimitry Andric // LocalAccum is an array of one or two 32-bit registers that are updated 296581ad6265SDimitry Andric // in-place. The incoming registers may be null. 296681ad6265SDimitry Andric // 296781ad6265SDimitry Andric // In some edge cases, carry-ins can be consumed "for free". In that case, 296881ad6265SDimitry Andric // the consumed carry bits are removed from CarryIn in-place. 296981ad6265SDimitry Andric auto buildMadChain = 297081ad6265SDimitry Andric [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn) 297181ad6265SDimitry Andric -> Carry { 297281ad6265SDimitry Andric assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) || 297381ad6265SDimitry Andric (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1)); 297481ad6265SDimitry Andric 297581ad6265SDimitry Andric Carry CarryOut; 297681ad6265SDimitry Andric unsigned j0 = 0; 297781ad6265SDimitry Andric 297881ad6265SDimitry Andric // Use plain 32-bit multiplication for the most significant part of the 297981ad6265SDimitry Andric // result by default. 298081ad6265SDimitry Andric if (LocalAccum.size() == 1 && 298181ad6265SDimitry Andric (!UsePartialMad64_32 || !CarryIn.empty())) { 298281ad6265SDimitry Andric do { 298381ad6265SDimitry Andric unsigned j1 = DstIndex - j0; 298481ad6265SDimitry Andric auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]); 298581ad6265SDimitry Andric if (!LocalAccum[0]) { 298681ad6265SDimitry Andric LocalAccum[0] = Mul.getReg(0); 298781ad6265SDimitry Andric } else { 298881ad6265SDimitry Andric if (CarryIn.empty()) { 298981ad6265SDimitry Andric LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0); 299081ad6265SDimitry Andric } else { 299181ad6265SDimitry Andric LocalAccum[0] = 299281ad6265SDimitry Andric B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back()) 299381ad6265SDimitry Andric .getReg(0); 299481ad6265SDimitry Andric CarryIn.pop_back(); 299581ad6265SDimitry Andric } 299681ad6265SDimitry Andric } 299781ad6265SDimitry Andric ++j0; 299881ad6265SDimitry Andric } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty())); 299981ad6265SDimitry Andric } 300081ad6265SDimitry Andric 300181ad6265SDimitry Andric // Build full 64-bit multiplies. 
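// G_AMDGPU_MAD_U64_U32 computes s0 * s1 + acc as a full 64-bit value plus a
// one-bit carry-out, so the remaining partial products of this column simply
// chain through Tmp. Sketch for DstIndex == 1 with j0 starting at 0
// (illustrative):
//
//   Tmp = mad_u64_u32(Src0[0], Src1[1], Tmp)   // plus carry-out
//   Tmp = mad_u64_u32(Src0[1], Src1[0], Tmp)   // plus carry-out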
        if (j0 <= DstIndex) {
          bool HaveSmallAccum = false;
          Register Tmp;

          if (LocalAccum[0]) {
            if (LocalAccum.size() == 1) {
              Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
              HaveSmallAccum = true;
            } else if (LocalAccum[1]) {
              Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
              HaveSmallAccum = false;
            } else {
              Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
              HaveSmallAccum = true;
            }
          } else {
            assert(LocalAccum.size() == 1 || !LocalAccum[1]);
            Tmp = getZero64();
            HaveSmallAccum = true;
          }

          do {
            unsigned j1 = DstIndex - j0;
            auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
                                    {Src0[j0], Src1[j1], Tmp});
            Tmp = Mad.getReg(0);
            if (!HaveSmallAccum)
              CarryOut.push_back(Mad.getReg(1));
            HaveSmallAccum = false;
            ++j0;
          } while (j0 <= DstIndex);

          auto Unmerge = B.buildUnmerge(S32, Tmp);
          LocalAccum[0] = Unmerge.getReg(0);
          if (LocalAccum.size() > 1)
            LocalAccum[1] = Unmerge.getReg(1);
        }

        return CarryOut;
      };

  // Outer multiply loop, iterating over destination parts from least
  // significant to most significant parts.
  //
  // The columns of the following diagram correspond to the destination parts
  // affected by one iteration of the outer loop (ignoring boundary
  // conditions).
  //
  //   Dest index relative to 2 * i:      1 0 -1
  //                                      ------
  //   Carries from previous iteration:   e o
  //   Even-aligned partial product sum:  E E .
  //   Odd-aligned partial product sum:     O O
  //
  // 'o' is OddCarry, 'e' is EvenCarry.
  // EE and OO are computed from partial products via buildMadChain and use
  // accumulation where possible and appropriate.
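  //
  // As a concrete example (a sketch, not upstream documentation): for a
  // four-part product (Accum.size() == 4), the iteration with i == 1 updates
  // Accum[2..3] with the even-aligned partial products Src0[j] * Src1[2 - j],
  // and Accum[1..2] with the odd-aligned products Src0[j] * Src1[1 - j],
  // either in-place or through SeparateOddOut depending on
  // SeparateOddAlignedProducts.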
  //
  Register SeparateOddCarry;
  Carry EvenCarry;
  Carry OddCarry;

  for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
    Carry OddCarryIn = std::move(OddCarry);
    Carry EvenCarryIn = std::move(EvenCarry);
    OddCarry.clear();
    EvenCarry.clear();

    // Partial products at offset 2 * i.
    if (2 * i < Accum.size()) {
      auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
      EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
    }

    // Partial products at offset 2 * i - 1.
    if (i > 0) {
      if (!SeparateOddAlignedProducts) {
        auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
        OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
      } else {
        bool IsHighest = 2 * i >= Accum.size();
        Register SeparateOddOut[2];
        auto LocalAccum = MutableArrayRef(SeparateOddOut)
                              .take_front(IsHighest ? 1 : 2);
        OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);

        MachineInstr *Lo;

        if (i == 1) {
          if (!IsHighest)
            Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
          else
            Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
        } else {
          Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
                            SeparateOddCarry);
        }
        Accum[2 * i - 1] = Lo->getOperand(0).getReg();

        if (!IsHighest) {
          auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
                                 Lo->getOperand(1).getReg());
          Accum[2 * i] = Hi.getReg(0);
          SeparateOddCarry = Hi.getReg(1);
        }
      }
    }

    // Add in the carries from the previous iteration
    if (i > 0) {
      if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
        EvenCarryIn.push_back(CarryOut);

      if (2 * i < Accum.size()) {
        if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
          OddCarry.push_back(CarryOut);
      }
    }
  }
}

// Custom narrowing of wide multiplies using wide multiply-add instructions.
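//
// For example (sketch): an s64 G_MUL is decomposed into two 32-bit parts per
// operand and an s128 G_MUL into four; buildMultiply then combines the
// partial products with G_AMDGPU_MAD_U64_U32 chains.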
//
// TODO: If the multiply is followed by an addition, we should attempt to
// integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
                                      MachineInstr &MI) const {
  assert(ST.hasMad64_32());
  assert(MI.getOpcode() == TargetOpcode::G_MUL);

  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  Register DstReg = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();

  LLT Ty = MRI.getType(DstReg);
  assert(Ty.isScalar());

  unsigned Size = Ty.getSizeInBits();
  unsigned NumParts = Size / 32;
  assert((Size % 32) == 0);
  assert(NumParts >= 2);

  // Whether to use MAD_64_32 for partial products whose high half is
  // discarded. This avoids some ADD instructions but risks false dependency
  // stalls on some subtargets in some cases.
  const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;

  // Whether to compute odd-aligned partial products separately. This is
  // advisable on subtargets where the accumulator of MAD_64_32 must be placed
  // in an even-aligned VGPR.
  const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();

  LLT S32 = LLT::scalar(32);
  SmallVector<Register, 2> Src0Parts, Src1Parts;
  for (unsigned i = 0; i < NumParts; ++i) {
    Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
    Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
  }
  B.buildUnmerge(Src0Parts, Src0);
  B.buildUnmerge(Src1Parts, Src1);

  SmallVector<Register, 2> AccumRegs(NumParts);
  buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
                SeparateOddAlignedProducts);

  B.buildMergeLikeInstr(DstReg, AccumRegs);
  MI.eraseFromParent();
  return true;
}

// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
// ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
// case with a single min instruction instead of a compare+select.
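//
// Illustrative sketch (assuming the usual ffbh/ffbl semantics of returning -1
// for a zero input): a 32-bit G_CTLZ becomes
//
//   %tmp:_(s32) = G_AMDGPU_FFBH_U32 %src
//   %dst:_(s32) = G_UMIN %tmp, 32
//
// which yields 32 for a zero input and the plain leading-zero count otherwise.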
bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);

  unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
                        ? AMDGPU::G_AMDGPU_FFBH_U32
                        : AMDGPU::G_AMDGPU_FFBL_B32;
  auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
  B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));

  MI.eraseFromParent();
  return true;
}

// Check that this is a G_XOR x, -1
static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
  if (MI.getOpcode() != TargetOpcode::G_XOR)
    return false;
  auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
  return ConstVal && *ConstVal == -1;
}

// Return the use branch instruction, or null if the usage is invalid.
static MachineInstr *
verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
                  MachineBasicBlock *&UncondBrTarget, bool &Negated) {
  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))
    return nullptr;

  MachineBasicBlock *Parent = MI.getParent();
  MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);

  if (isNot(MRI, *UseMI)) {
    Register NegatedCond = UseMI->getOperand(0).getReg();
    if (!MRI.hasOneNonDBGUse(NegatedCond))
      return nullptr;

    // We're deleting the def of this value, so we need to remove it.
    eraseInstr(*UseMI, MRI);

    UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
    Negated = true;
  }

  if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
    return nullptr;

  // Make sure the cond br is followed by a G_BR, or is the last instruction.
  MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
  if (Next == Parent->end()) {
    MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
    if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
      return nullptr;
    UncondBrTarget = &*NextMBB;
  } else {
    if (Next->getOpcode() != AMDGPU::G_BR)
      return nullptr;
    Br = &*Next;
    UncondBrTarget = Br->getOperand(0).getMBB();
  }

  return UseMI;
}

bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
                                         const ArgDescriptor *Arg,
                                         const TargetRegisterClass *ArgRC,
                                         LLT ArgTy) const {
  MCRegister SrcReg = Arg->getRegister();
  assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
  assert(DstReg.isVirtual() && "Virtual register expected");

  Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
                                             *ArgRC, B.getDebugLoc(), ArgTy);
  if (Arg->isMasked()) {
    // TODO: Should we try to emit this once in the entry block?
    const LLT S32 = LLT::scalar(32);
    const unsigned Mask = Arg->getMask();
    const unsigned Shift = countTrailingZeros<unsigned>(Mask);

    Register AndMaskSrc = LiveIn;

    // TODO: Avoid clearing the high bits if we know workitem id y/z are always
    // 0.
    if (Shift != 0) {
      auto ShiftAmt = B.buildConstant(S32, Shift);
      AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
    }

    B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
  } else {
    B.buildCopy(DstReg, LiveIn);
  }

  return true;
}

bool AMDGPULegalizerInfo::loadInputValue(
    Register DstReg, MachineIRBuilder &B,
    AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  const ArgDescriptor *Arg;
  const TargetRegisterClass *ArgRC;
  LLT ArgTy;
  std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);

  if (!Arg) {
    if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
      // The intrinsic may appear when we have a zero-sized kernarg segment,
      // in which case the pointer argument may be missing and we use null.
      B.buildConstant(DstReg, 0);
      return true;
    }

    // It's undefined behavior if a function marked with the amdgpu-no-*
    // attributes uses the corresponding intrinsic.
    B.buildUndef(DstReg);
    return true;
  }

  if (!Arg->isRegister() || !Arg->getRegister().isValid())
    return false; // TODO: Handle these
  return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
}

bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
    AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
    return false;

  MI.eraseFromParent();
  return true;
}

static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
                                int64_t C) {
  B.buildConstant(MI.getOperand(0).getReg(), C);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
    unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
  unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
  if (MaxID == 0)
    return replaceWithConstant(B, MI, 0);

  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  const ArgDescriptor *Arg;
  const TargetRegisterClass *ArgRC;
  LLT ArgTy;
  std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);

  Register DstReg = MI.getOperand(0).getReg();
  if (!Arg) {
    // It's undefined behavior if a function marked with the amdgpu-no-*
    // attributes uses the corresponding intrinsic.
    B.buildUndef(DstReg);
    MI.eraseFromParent();
    return true;
  }

  if (Arg->isMasked()) {
    // Don't bother inserting AssertZext for packed IDs since we're emitting the
    // masking operations anyway.
    //
    // TODO: We could assert the top bit is 0 for the source copy.
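    //
    // (Sketch, assuming the usual packed layout: the X, Y and Z workitem IDs
    // occupy adjacent 10-bit fields of a single 32-bit input register, so the
    // masked path below expands to a shift of the field into place followed
    // by an AND with the field mask.)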
    if (!loadInputValue(DstReg, B, ArgType))
      return false;
  } else {
    Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
    if (!loadInputValue(TmpReg, B, ArgType))
      return false;
    B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID));
  }

  MI.eraseFromParent();
  return true;
}

Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
                                                     int64_t Offset) const {
  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
  Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);

  // TODO: If we passed in the base kernel offset we could have a better
  // alignment than 4, but we don't really need it.
  if (!loadInputValue(KernArgReg, B,
                      AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
    llvm_unreachable("failed to find kernarg segment ptr");

  auto COffset = B.buildConstant(LLT::scalar(64), Offset);
  // TODO: The pointer add below should get the nuw flag.
  return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
}

/// Legalize a value that's loaded from kernel arguments. This is only used by
/// legacy intrinsics.
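///
/// For example (illustrative, not an exhaustive list): r600-style dispatch
/// queries such as llvm.r600.read.ngroups.x are lowered to an s32 load from
/// the kernarg segment at a fixed implicit offset.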
bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
                                                      MachineIRBuilder &B,
                                                      uint64_t Offset,
                                                      Align Alignment) const {
  Register DstReg = MI.getOperand(0).getReg();

  assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
         "unexpected kernarg parameter type");

  Register Ptr = getKernargParameterPtr(B, Offset);
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
              MachineMemOperand::MODereferenceable |
                  MachineMemOperand::MOInvariant);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
                                       MachineRegisterInfo &MRI,
                                       MachineIRBuilder &B) const {
  Register Dst = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT S16 = LLT::scalar(16);
  LLT S32 = LLT::scalar(32);
  LLT S64 = LLT::scalar(64);

  if (DstTy == S16)
    return legalizeFDIV16(MI, MRI, B);
  if (DstTy == S32)
    return legalizeFDIV32(MI, MRI, B);
  if (DstTy == S64)
    return legalizeFDIV64(MI, MRI, B);

  return false;
}

void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
                                                        Register DstDivReg,
                                                        Register DstRemReg,
                                                        Register X,
                                                        Register Y) const {
  const LLT S1 = LLT::scalar(1);
  const LLT S32 = LLT::scalar(32);

  // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
  // algorithm used here.

  // Initial estimate of inv(y).
  auto FloatY = B.buildUITOFP(S32, Y);
  auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
  auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
  auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
  auto Z = B.buildFPTOUI(S32, ScaledY);

  // One round of UNR (unsigned Newton-Raphson refinement).
  auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
  auto NegYZ = B.buildMul(S32, NegY, Z);
  Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));

  // Quotient/remainder estimate.
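  // (Illustrative note: Z now roughly approximates 2^32 / Y, so the initial
  // quotient estimate below can undershoot the true quotient by a small
  // amount; the two conditional refinement steps that follow correct for
  // that.)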
  auto Q = B.buildUMulH(S32, X, Z);
  auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));

  // First quotient/remainder refinement.
  auto One = B.buildConstant(S32, 1);
  auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
  if (DstDivReg)
    Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
  R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);

  // Second quotient/remainder refinement.
  Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
  if (DstDivReg)
    B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);

  if (DstRemReg)
    B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
}

// Build integer reciprocal sequence around V_RCP_IFLAG_F32
//
// Return lo, hi of result
//
// %cvt.lo = G_UITOFP Val.lo
// %cvt.hi = G_UITOFP Val.hi
// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
// %rcp = G_AMDGPU_RCP_IFLAG %mad
// %mul1 = G_FMUL %rcp, 0x5f7ffffc
// %mul2 = G_FMUL %mul1, 2**(-32)
// %trunc = G_INTRINSIC_TRUNC %mul2
// %mad2 = G_FMAD %trunc, -(2**32), %mul1
// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
                                                       Register Val) {
  const LLT S32 = LLT::scalar(32);
  auto Unmerge = B.buildUnmerge(S32, Val);

  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));

  auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
                         B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);

  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
  auto Mul1 =
      B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));

  // 2**(-32)
  auto Mul2 =
      B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
  auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);

  // -(2**32)
  auto Mad2 = B.buildFMAD(S32, Trunc,
                          B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);

  auto ResultLo = B.buildFPTOUI(S32, Mad2);
  auto ResultHi = B.buildFPTOUI(S32, Trunc);

  return {ResultLo.getReg(0), ResultHi.getReg(0)};
}

void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
                                                        Register DstDivReg,
                                                        Register DstRemReg,
                                                        Register Numer,
                                                        Register Denom) const {
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S1 = LLT::scalar(1);
  Register RcpLo, RcpHi;

  std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);

  auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});

  auto Zero64 = B.buildConstant(S64, 0);
  auto NegDenom = B.buildSub(S64, Zero64, Denom);

  auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
  auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);

  auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
  Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
  Register MulHi1_Hi = UnmergeMulHi1.getReg(1);

  auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
  auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
  auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});

  auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
  auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
  auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
  Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
  Register MulHi2_Hi = UnmergeMulHi2.getReg(1);

  auto Zero32 = B.buildConstant(S32, 0);
  auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
  auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
  auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});

  auto UnmergeNumer = B.buildUnmerge(S32, Numer);
  Register NumerLo = UnmergeNumer.getReg(0);
  Register NumerHi = UnmergeNumer.getReg(1);

  auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
  auto Mul3 = B.buildMul(S64, Denom, MulHi3);
  auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
  Register Mul3_Lo = UnmergeMul3.getReg(0);
  Register Mul3_Hi = UnmergeMul3.getReg(1);
  auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
  auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
  auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
  auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});

  auto UnmergeDenom = B.buildUnmerge(S32, Denom);
  Register DenomLo = UnmergeDenom.getReg(0);
  Register DenomHi = UnmergeDenom.getReg(1);

  auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
  auto C1 = B.buildSExt(S32, CmpHi);

  auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
  auto C2 = B.buildSExt(S32, CmpLo);

  auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
  auto C3 = B.buildSelect(S32, CmpEq, C2, C1);

  // TODO: Here and below, portions of the code can be enclosed in if/endif
  // blocks. Currently the control flow is unconditional, and we have 4 selects
  // after the potential endif to substitute for PHIs.

  // if C3 != 0 ...
  auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
  auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
  auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
  auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});

  auto One64 = B.buildConstant(S64, 1);
  auto Add3 = B.buildAdd(S64, MulHi3, One64);

  auto C4 =
      B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
  auto C5 =
      B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
  auto C6 = B.buildSelect(
      S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);

  // if (C6 != 0)
  auto Add4 = B.buildAdd(S64, Add3, One64);
  auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);

  auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
  auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
  auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});

  // endif C6
  // endif C3

  if (DstDivReg) {
    auto Sel1 = B.buildSelect(
        S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
    B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
                  Sel1, MulHi3);
  }

  if (DstRemReg) {
    auto Sel2 = B.buildSelect(
        S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
    B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
                  Sel2, Sub1);
  }
}

bool AMDGPULegalizerInfo::legalizeUnsignedDIV_REM(MachineInstr &MI,
                                                  MachineRegisterInfo &MRI,
                                                  MachineIRBuilder &B) const {
  Register DstDivReg, DstRemReg;
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unexpected opcode!");
  case AMDGPU::G_UDIV: {
    DstDivReg = MI.getOperand(0).getReg();
    break;
  }
  case AMDGPU::G_UREM: {
    DstRemReg = MI.getOperand(0).getReg();
    break;
  }
  case AMDGPU::G_UDIVREM: {
    DstDivReg = MI.getOperand(0).getReg();
    DstRemReg = MI.getOperand(1).getReg();
    break;
  }
  }

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);
  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
  Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
  Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());

  if (Ty == S32)
    legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
  else if (Ty == S64)
    legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
  else
    return false;

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeSignedDIV_REM(MachineInstr &MI,
                                                MachineRegisterInfo &MRI,
                                                MachineIRBuilder &B) const {
  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  if (Ty != S32 && Ty != S64)
    return false;

  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
  Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
  Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();

  auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
  auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
  auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);

  LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);

  LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);

  Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
  switch (MI.getOpcode()) {
  default:
    llvm_unreachable("Unexpected opcode!");
  case AMDGPU::G_SDIV: {
    DstDivReg = MI.getOperand(0).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);
    break;
  }
  case AMDGPU::G_SREM: {
    DstRemReg = MI.getOperand(0).getReg();
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);
    break;
  }
  case AMDGPU::G_SDIVREM: {
    DstDivReg = MI.getOperand(0).getReg();
    DstRemReg = MI.getOperand(1).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);
    break;
  }
  }

  if (Ty == S32)
    legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
  else
    legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);

  if (DstDivReg) {
    auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
    auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
    B.buildSub(DstDivReg, SignXor, Sign);
  }

  if (DstRemReg) {
    auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
    auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
    B.buildSub(DstRemReg, SignXor, Sign);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
  uint16_t Flags = MI.getFlags();
  LLT ResTy = MRI.getType(Res);

  const MachineFunction &MF = B.getMF();
  bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
                            MI.getFlag(MachineInstr::FmAfn);

  if (!AllowInaccurateRcp)
    return false;

  if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
    // 1 / x -> RCP(x)
    if (CLHS->isExactlyValue(1.0)) {
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
          .addUse(RHS)
          .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }

    // -1 / x -> RCP( FNEG(x) )
    if (CLHS->isExactlyValue(-1.0)) {
      auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
          .addUse(FNeg.getReg(0))
          .setMIFlags(Flags);

      MI.eraseFromParent();
      return true;
    }
  }

  // x / y -> x * (1.0 / y)
  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
                 .addUse(RHS)
                 .setMIFlags(Flags);
  B.buildFMul(Res, LHS, RCP, Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
                                                   MachineRegisterInfo &MRI,
                                                   MachineIRBuilder &B) const {
  Register Res = MI.getOperand(0).getReg();
  Register X = MI.getOperand(1).getReg();
  Register Y = MI.getOperand(2).getReg();
  uint16_t Flags = MI.getFlags();
  LLT ResTy = MRI.getType(Res);

  const MachineFunction &MF = B.getMF();
  bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
                            MI.getFlag(MachineInstr::FmAfn);

  if (!AllowInaccurateRcp)
    return false;

  auto NegY = B.buildFNeg(ResTy, Y);
  auto One = B.buildFConstant(ResTy, 1.0);

  auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
               .addUse(Y)
               .setMIFlags(Flags);

  auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp0, R, R);

  auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp1, R, R);

  auto Ret = B.buildFMul(ResTy, X, R);
  auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);

  B.buildFMA(Res, Tmp2, R, Ret);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  if (legalizeFastUnsafeFDIV(MI, MRI, B))
    return true;

  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S16 = LLT::scalar(16);
  LLT S32 = LLT::scalar(32);

  auto LHSExt = B.buildFPExt(S32, LHS, Flags);
  auto RHSExt = B.buildFPExt(S32, RHS, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
                 .addUse(RHSExt.getReg(0))
                 .setMIFlags(Flags);

  auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
  auto RDst = B.buildFPTrunc(S16, QUOT, Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
      .addUse(RDst.getReg(0))
      .addUse(RHS)
      .addUse(LHS)
      .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}

// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
// to enable denorm mode; when it is false, disable denorm mode.
static void toggleSPDenormMode(bool Enable,
                               MachineIRBuilder &B,
                               const GCNSubtarget &ST,
                               AMDGPU::SIModeRegisterDefaults Mode) {
  // Set SP denorm mode to this value.
  unsigned SPDenormMode =
      Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();

  if (ST.hasDenormModeInst()) {
    // Preserve default FP64/FP16 denorm mode while updating FP32 mode.
    uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();

    uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
    B.buildInstr(AMDGPU::S_DENORM_MODE)
        .addImm(NewDenormModeValue);

  } else {
    // Select the FP32 bit field in the mode register.
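    //
    // (Sketch of the encoding: this builds the s_setreg immediate
    // hwreg(HW_REG_MODE, 4, 2), i.e. a 2-bit field at offset 4 of the MODE
    // register, which holds the FP32 denorm controls; WIDTH_M1 encodes the
    // field width minus one.)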
    unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
                                    (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
                                    (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);

    B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
        .addImm(SPDenormMode)
        .addImm(SPDenormModeBitField);
  }
}

bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  if (legalizeFastUnsafeFDIV(MI, MRI, B))
    return true;

  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();

  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S32, 1.0f);

  auto DenominatorScaled =
      B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
          .addUse(LHS)
          .addUse(RHS)
          .addImm(0)
          .setMIFlags(Flags);
  auto NumeratorScaled =
      B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
          .addUse(LHS)
          .addUse(RHS)
          .addImm(1)
          .setMIFlags(Flags);

  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
                       .addUse(DenominatorScaled.getReg(0))
                       .setMIFlags(Flags);
  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);

  // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
  // aren't modeled as reading it.
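  //
  // (Rationale, stated as an assumption of this sketch: the div_scale'd
  // intermediate values in the FMA chain below can be denormal, so FP32
  // denormals are temporarily enabled around the chain when the function's
  // mode would otherwise flush them.)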
  if (!Mode.allFP32Denormals())
    toggleSPDenormMode(true, B, ST, Mode);

  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

  if (!Mode.allFP32Denormals())
    toggleSPDenormMode(false, B, ST, Mode);

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
                  .addUse(Fma4.getReg(0))
                  .addUse(Fma1.getReg(0))
                  .addUse(Fma3.getReg(0))
                  .addUse(NumeratorScaled.getReg(1))
                  .setMIFlags(Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
      .addUse(Fmas.getReg(0))
      .addUse(RHS)
      .addUse(LHS)
      .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  if (legalizeFastUnsafeFDIV64(MI, MRI, B))
    return true;

  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S64 = LLT::scalar(64);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S64, 1.0);

  auto DivScale0 =
      B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
          .addUse(LHS)
          .addUse(RHS)
          .addImm(0)
          .setMIFlags(Flags);

  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);

  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
                 .addUse(DivScale0.getReg(0))
                 .setMIFlags(Flags);

  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
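
  // (Illustrative reminder: the trailing immediate selects what
  // amdgcn.div.scale scales: 0 requested the scaled denominator above, while
  // 1 here requests the scaled numerator.)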
  auto DivScale1 =
      B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
          .addUse(LHS)
          .addUse(RHS)
          .addImm(1)
          .setMIFlags(Flags);

  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);

  Register Scale;
  if (!ST.hasUsableDivScaleConditionOutput()) {
    // Workaround a hardware bug on SI where the condition output from div_scale
    // is not usable.

    LLT S32 = LLT::scalar(32);

    auto NumUnmerge = B.buildUnmerge(S32, LHS);
    auto DenUnmerge = B.buildUnmerge(S32, RHS);
    auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
    auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);

    auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
                              Scale1Unmerge.getReg(1));
    auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
                              Scale0Unmerge.getReg(1));
    Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
  } else {
    Scale = DivScale1.getReg(1);
  }

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
                  .addUse(Fma4.getReg(0))
                  .addUse(Fma3.getReg(0))
                  .addUse(Mul.getReg(0))
                  .addUse(Scale)
                  .setMIFlags(Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res), false)
      .addUse(Fmas.getReg(0))
      .addUse(RHS)
      .addUse(LHS)
      .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(2).getReg();
  Register RHS = MI.getOperand(3).getReg();
  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto Abs = B.buildFAbs(S32, RHS, Flags);
  const APFloat C0Val(1.0f);
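
  // (Sketch of the magic constants below: 0x6f800000 is 2^96 and 0x2f800000
  // is 2^-32 as IEEE-754 single-precision bit patterns. When |RHS| exceeds
  // 2^96, the denominator is pre-scaled by 2^-32 so that the reciprocal does
  // not overflow, and the same scale factor is reapplied to the final
  // product.)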
bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(2).getReg();
  Register RHS = MI.getOperand(3).getReg();
  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto Abs = B.buildFAbs(S32, RHS, Flags);
  const APFloat C0Val(1.0f);

  auto C0 = B.buildConstant(S32, 0x6f800000);
  auto C1 = B.buildConstant(S32, 0x2f800000);
  auto C2 = B.buildConstant(S32, FloatToBits(1.0f));

  auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);

  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
                 .addUse(Mul0.getReg(0))
                 .setMIFlags(Flags);

  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);

  B.buildFMul(Res, Sel, Mul1, Flags);

  MI.eraseFromParent();
  return true;
}

// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
// FIXME: Why do we handle this one but not other removed instructions?
//
// Reciprocal square root. The clamp prevents infinite results, clamping
// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
// +-max_float.
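//
// Sketch of the resulting expansion for f32 in IEEE mode (illustrative MIR,
// made-up value names):
//   %rsq = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %src
//   %min = G_FMINNUM_IEEE %rsq, +max_float
//   %dst = G_FMAXNUM_IEEE %min, -max_float
// so e.g. rsq(+0.0) = +inf clamps to +max_float and rsq(-0.0) = -inf clamps
// to -max_float. Non-IEEE mode uses G_FMINNUM/G_FMAXNUM instead.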
bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
                                                    MachineRegisterInfo &MRI,
                                                    MachineIRBuilder &B) const {
  if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
    return true;

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();

  LLT Ty = MRI.getType(Dst);

  const fltSemantics *FltSemantics;
  if (Ty == LLT::scalar(32))
    FltSemantics = &APFloat::IEEEsingle();
  else if (Ty == LLT::scalar(64))
    FltSemantics = &APFloat::IEEEdouble();
  else
    return false;

  auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
                 .addUse(Src)
                 .setMIFlags(Flags);

  // We don't need to concern ourselves with the snan handling difference;
  // the rsq has already quieted (or not) the input, so use the variant that
  // will directly select.
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  const bool UseIEEE = MFI->getMode().IEEE;

  auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
  auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
                            B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);

  auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));

  if (UseIEEE)
    B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
  else
    B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
  MI.eraseFromParent();
  return true;
}

static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
  switch (IID) {
  case Intrinsic::amdgcn_ds_fadd:
    return AMDGPU::G_ATOMICRMW_FADD;
  case Intrinsic::amdgcn_ds_fmin:
    return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
  case Intrinsic::amdgcn_ds_fmax:
    return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
  default:
    llvm_unreachable("not a DS FP intrinsic");
  }
}
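
// The legalization below rewrites the DS FP intrinsics in place into the
// generic opcodes returned above. Rough before/after shape (illustrative
// operands only, not exact MIR from this file):
//   %v = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.ds.fadd), %ptr, %data, <imms>
//   %v = G_ATOMICRMW_FADD %ptr, %data   ; trailing imms already captured in the MMO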
bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
                                                      MachineInstr &MI,
                                                      Intrinsic::ID IID) const {
  GISelChangeObserver &Observer = Helper.Observer;
  Observer.changingInstr(MI);

  MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));

  // The remaining operands were used to set fields in the MemOperand on
  // construction.
  for (int I = 6; I > 3; --I)
    MI.removeOperand(I);

  MI.removeOperand(1); // Remove the intrinsic ID.
  Observer.changedInstr(MI);
  return true;
}
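
// getImplicitArgPtr below computes the implicit kernel argument pointer as a
// fixed offset from the kernarg segment pointer. Sketch of the emitted MIR
// for a 64-bit pointer (illustrative, made-up register names):
//   %kernarg:_(p4) = <loaded KERNARG_SEGMENT_PTR input>
//   %off:_(s64)    = G_CONSTANT i64 <implicit parameter offset>
//   %dst:_(p4)     = G_PTR_ADD %kernarg, %off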
bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  uint64_t Offset =
      ST.getTargetLowering()->getImplicitParameterOffset(
          B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
  LLT DstTy = MRI.getType(DstReg);
  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());

  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
  if (!loadInputValue(KernargPtrReg, B,
                      AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
    return false;

  // FIXME: This should be nuw
  B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
  return true;
}

bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (!MFI->isEntryFunction()) {
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
  }

  Register DstReg = MI.getOperand(0).getReg();
  if (!getImplicitArgPtr(DstReg, MRI, B))
    return false;

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  Function &F = B.getMF().getFunction();
  std::optional<uint32_t> KnownSize =
      AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
  if (KnownSize.has_value())
    B.buildConstant(DstReg, *KnownSize);
  return false;
}

bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (!MFI->isEntryFunction()) {
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
  }

  Register DstReg = MI.getOperand(0).getReg();
  if (!getLDSKernelId(DstReg, MRI, B))
    return false;

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              unsigned AddrSpace) const {
  Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
  auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
  Register Hi32 = Unmerge.getReg(1);

  B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
  MI.eraseFromParent();
  return true;
}

// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
// offset (the offset that is included in bounds checking and swizzling, to be
// split between the instruction's voffset and immoffset fields) and soffset
// (the offset that is excluded from bounds checking and swizzling, to go in
// the instruction's soffset field). This function takes the first kind of
// offset and figures out how to split it between voffset and immoffset.
std::pair<Register, unsigned>
AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
                                        Register OrigOffset) const {
  const unsigned MaxImm = 4095;
  Register BaseReg;
  unsigned ImmOffset;
  const LLT S32 = LLT::scalar(32);
  MachineRegisterInfo &MRI = *B.getMRI();

  std::tie(BaseReg, ImmOffset) =
      AMDGPU::getBaseWithConstantOffset(MRI, OrigOffset);

  // If BaseReg is a pointer, convert it to int.
  if (MRI.getType(BaseReg).isPointer())
    BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);

  // If the immediate value is too big for the immoffset field, keep only the
  // low 12 bits there and copy/add the remainder, which is a multiple of
  // 4096, into the voffset; a multiple of 4096 stands more chance of being
  // CSEd with the copy/add for another similar load/store.
  // However, do not do that rounding down to a multiple of 4096 if that is a
  // negative number, as it appears to be illegal to have a negative offset
  // in the vgpr, even if adding the immediate offset makes it positive.
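  //
  // Worked examples of the split below (illustrative), with MaxImm = 4095:
  //   ImmOffset = 5000:           Overflow = 4096, ImmOffset becomes 904,
  //                               so 4096 is added into the voffset.
  //   ImmOffset = -100 (as u32):  Overflow would be negative, so the entire
  //                               value moves into the voffset add and the
  //                               immoffset field becomes 0.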
  unsigned Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    Overflow += ImmOffset;
    ImmOffset = 0;
  }

  if (Overflow != 0) {
    if (!BaseReg) {
      BaseReg = B.buildConstant(S32, Overflow).getReg(0);
    } else {
      auto OverflowVal = B.buildConstant(S32, Overflow);
      BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
    }
  }

  if (!BaseReg)
    BaseReg = B.buildConstant(S32, 0).getReg(0);

  return std::pair(BaseReg, ImmOffset);
}

/// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
                                          Register VOffset, Register SOffset,
                                          unsigned ImmOffset, Register VIndex,
                                          MachineRegisterInfo &MRI) const {
  std::optional<ValueAndVReg> MaybeVOffsetVal =
      getIConstantVRegValWithLookThrough(VOffset, MRI);
  std::optional<ValueAndVReg> MaybeSOffsetVal =
      getIConstantVRegValWithLookThrough(SOffset, MRI);
  std::optional<ValueAndVReg> MaybeVIndexVal =
      getIConstantVRegValWithLookThrough(VIndex, MRI);
  // If the combined VOffset + SOffset + ImmOffset + strided VIndex is
  // constant, update the MMO with that offset. The stride is unknown so we
  // can only do this if VIndex is constant 0.
  if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
      MaybeVIndexVal->Value == 0) {
    uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
                           MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
    MMO->setOffset(TotalOffset);
  } else {
    // We don't have a constant combined offset to use in the MMO. Give up.
    MMO->setValue((Value *)nullptr);
  }
}
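
// On subtargets with the unpacked D16 layout, each 16-bit element occupies
// the low half of its own 32-bit register. Sketch of the repack done by the
// next function for a <2 x s16> value (illustrative MIR, made-up names):
//   %lo:_(s16), %hi:_(s16) = G_UNMERGE_VALUES %data(<2 x s16>)
//   %e0:_(s32) = G_ANYEXT %lo
//   %e1:_(s32) = G_ANYEXT %hi
//   %wide:_(<2 x s32>) = G_BUILD_VECTOR %e0, %e1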
/// Handle register layout difference for f16 images for some subtargets.
Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
                                             MachineRegisterInfo &MRI,
                                             Register Reg,
                                             bool ImageStore) const {
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  LLT StoreVT = MRI.getType(Reg);
  assert(StoreVT.isVector() && StoreVT.getElementType() == S16);

  if (ST.hasUnpackedD16VMem()) {
    auto Unmerge = B.buildUnmerge(S16, Reg);

    SmallVector<Register, 4> WideRegs;
    for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
      WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));

    int NumElts = StoreVT.getNumElements();

    return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
        .getReg(0);
  }

  if (ImageStore && ST.hasImageStoreD16Bug()) {
    if (StoreVT.getNumElements() == 2) {
      SmallVector<Register, 4> PackedRegs;
      Reg = B.buildBitcast(S32, Reg).getReg(0);
      PackedRegs.push_back(Reg);
      PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
      return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
          .getReg(0);
    }

    if (StoreVT.getNumElements() == 3) {
      SmallVector<Register, 4> PackedRegs;
      auto Unmerge = B.buildUnmerge(S16, Reg);
      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
        PackedRegs.push_back(Unmerge.getReg(I));
      PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
      Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
      return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
    }

    if (StoreVT.getNumElements() == 4) {
      SmallVector<Register, 4> PackedRegs;
      Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
      auto Unmerge = B.buildUnmerge(S32, Reg);
      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
        PackedRegs.push_back(Unmerge.getReg(I));
      PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
      return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
          .getReg(0);
    }

    llvm_unreachable("invalid data type");
  }

  if (StoreVT == LLT::fixed_vector(3, S16)) {
    Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
              .getReg(0);
  }
  return Reg;
}

Register AMDGPULegalizerInfo::fixStoreSourceType(
    MachineIRBuilder &B, Register VData, bool IsFormat) const {
  MachineRegisterInfo *MRI = B.getMRI();
  LLT Ty = MRI->getType(VData);

  const LLT S16 = LLT::scalar(16);

  // Fixup illegal register types for i8 stores.
  if (Ty == LLT::scalar(8) || Ty == S16) {
    Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
    return AnyExt;
  }

  if (Ty.isVector()) {
    if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
      if (IsFormat)
        return handleD16VData(B, *MRI, VData);
    }
  }

  return VData;
}
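
// Operand shape being decoded below (sketch): the raw buffer store intrinsics
// carry (vdata, rsrc, voffset, soffset[, format], aux), while the struct
// variants insert a vindex after rsrc. Comparing getNumOperands() against
// NumVIndexOps is how the two forms are told apart.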
bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              bool IsTyped,
                                              bool IsFormat) const {
  Register VData = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(VData);
  LLT EltTy = Ty.getScalarType();
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const LLT S32 = LLT::scalar(32);

  VData = fixStoreSourceType(B, VData, IsFormat);
  Register RSrc = MI.getOperand(2).getReg();

  MachineMemOperand *MMO = *MI.memoperands_begin();
  const int MemSize = MMO->getSize();

  unsigned ImmOffset;

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(3).getReg();
    OpOffset = 1;
  } else {
    VIndex = B.buildConstant(S32, 0).getReg(0);
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();

  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
  updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);

  unsigned Opc;
  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
  } else {
    switch (MemSize) {
    case 1:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
      break;
    case 2:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
      break;
    }
  }

  auto MIB = B.buildInstr(Opc)
                 .addUse(VData)      // vdata
                 .addUse(RSrc)       // rsrc
                 .addUse(VIndex)     // vindex
                 .addUse(VOffset)    // voffset
                 .addUse(SOffset)    // soffset
                 .addImm(ImmOffset); // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)       // cachepolicy, swizzled buffer(imm)
      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
      .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}

static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
                            Register VIndex, Register VOffset, Register SOffset,
                            unsigned ImmOffset, unsigned Format,
                            unsigned AuxiliaryData, MachineMemOperand *MMO,
                            bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
  auto MIB = B.buildInstr(Opc)
                 .addDef(LoadDstReg) // vdata
                 .addUse(RSrc)       // rsrc
                 .addUse(VIndex)     // vindex
                 .addUse(VOffset)    // voffset
                 .addUse(SOffset)    // soffset
                 .addImm(ImmOffset); // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)       // cachepolicy, swizzled buffer(imm)
      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
      .addMemOperand(MMO);
}
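
// Buffer loads may carry a second def when TFE is requested: the hardware
// returns the data plus one extra status dword in a single contiguous result.
// Sketch of the TFE expansion for an s32 load (illustrative MIR, made-up
// names):
//   %wide:_(<2 x s32>) = G_AMDGPU_BUFFER_LOAD ...          ; data + status
//   %data:_(s32), %status:_(s32) = G_UNMERGE_VALUES %wide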
bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
                                             MachineRegisterInfo &MRI,
                                             MachineIRBuilder &B,
                                             bool IsFormat,
                                             bool IsTyped) const {
  // FIXME: Verifier should enforce 1 MMO for these intrinsics.
  MachineMemOperand *MMO = *MI.memoperands_begin();
  const LLT MemTy = MMO->getMemoryType();
  const LLT S32 = LLT::scalar(32);

  Register Dst = MI.getOperand(0).getReg();

  Register StatusDst;
  int OpOffset = 0;
  assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
  bool IsTFE = MI.getNumExplicitDefs() == 2;
  if (IsTFE) {
    StatusDst = MI.getOperand(1).getReg();
    ++OpOffset;
  }

  Register RSrc = MI.getOperand(2 + OpOffset).getReg();

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
  Register VIndex;
  if (HasVIndex) {
    VIndex = MI.getOperand(3 + OpOffset).getReg();
    ++OpOffset;
  } else {
    VIndex = B.buildConstant(S32, 0).getReg(0);
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
  unsigned ImmOffset;

  LLT Ty = MRI.getType(Dst);
  LLT EltTy = Ty.getScalarType();
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const bool Unpacked = ST.hasUnpackedD16VMem();

  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
  updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);

  unsigned Opc;

  // TODO: Support TFE for typed and narrow loads.
  if (IsTyped) {
    if (IsTFE)
      return false;
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
  } else if (IsFormat) {
    if (IsD16) {
      if (IsTFE)
        return false;
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
    } else {
      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
    }
  } else {
    if (IsTFE)
      return false;
    switch (MemTy.getSizeInBits()) {
    case 8:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
      break;
    case 16:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
      break;
    }
  }

  if (IsTFE) {
    unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
    unsigned NumLoadDWords = NumValueDWords + 1;
    LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32);
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
    buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    if (NumValueDWords == 1) {
      B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
    } else {
      SmallVector<Register, 5> LoadElts;
      for (unsigned I = 0; I != NumValueDWords; ++I)
        LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
      LoadElts.push_back(StatusDst);
      B.buildUnmerge(LoadElts, LoadDstReg);
      LoadElts.truncate(NumValueDWords);
      B.buildMergeLikeInstr(Dst, LoadElts);
    }
  } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
             (IsD16 && !Ty.isVector())) {
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
    buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    B.buildTrunc(Dst, LoadDstReg);
  } else if (Unpacked && IsD16 && Ty.isVector()) {
    LLT UnpackedTy = Ty.changeElementSize(32);
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
    buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    // FIXME: G_TRUNC should work, but legalization currently fails
    auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
    SmallVector<Register, 4> Repack;
    for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
      Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
    B.buildMergeLikeInstr(Dst, Repack);
  } else {
    buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
                    AuxiliaryData, MMO, IsTyped, HasVIndex, B);
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
                                               MachineIRBuilder &B,
                                               bool IsInc) const {
  unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
                         AMDGPU::G_AMDGPU_ATOMIC_DEC;
  B.buildInstr(Opc)
      .addDef(MI.getOperand(0).getReg())
      .addUse(MI.getOperand(2).getReg())
      .addUse(MI.getOperand(3).getReg())
      .cloneMemRefs(MI);
  MI.eraseFromParent();
  return true;
}

static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
  default:
    llvm_unreachable("unhandled atomic opcode");
  }
}
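
// Operand shape handled below (sketch): [dst,] vdata[, cmp], rsrc[, vindex],
// voffset, soffset, aux. HasReturn and IsCmpSwap shift the operand indices,
// which is what the OpOffset bookkeeping accounts for.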
bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
                                               MachineIRBuilder &B,
                                               Intrinsic::ID IID) const {
  const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
                         IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
  const bool HasReturn = MI.getNumExplicitDefs() != 0;

  Register Dst;

  int OpOffset = 0;
  if (HasReturn) {
    // A few FP atomics do not support return values.
    Dst = MI.getOperand(0).getReg();
  } else {
    OpOffset = -1;
  }

  Register VData = MI.getOperand(2 + OpOffset).getReg();
  Register CmpVal;

  if (IsCmpSwap) {
    CmpVal = MI.getOperand(3 + OpOffset).getReg();
    ++OpOffset;
  }

  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
  const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  if (HasVIndex) {
    VIndex = MI.getOperand(4 + OpOffset).getReg();
    ++OpOffset;
  } else {
    VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
  }

  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();

  MachineMemOperand *MMO = *MI.memoperands_begin();

  unsigned ImmOffset;
  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
  updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());

  auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));

  if (HasReturn)
    MIB.addDef(Dst);

  MIB.addUse(VData); // vdata

  if (IsCmpSwap)
    MIB.addReg(CmpVal);

  MIB.addUse(RSrc)                // rsrc
      .addUse(VIndex)             // vindex
      .addUse(VOffset)            // voffset
      .addUse(SOffset)            // soffset
      .addImm(ImmOffset)          // offset(imm)
      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
      .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}
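
// The next helper packs 16-bit image address components two per dword. For a
// 2D a16 sample with coordinates %u and %v, the loop emits one packed dword
// (illustrative MIR, made-up names):
//   %uv:_(<2 x s16>) = G_BUILD_VECTOR %u:_(s16), %v:_(s16)
// and a trailing odd component is instead paired with G_IMPLICIT_DEF.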
/// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
/// vector with s16 typed elements.
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
                                      SmallVectorImpl<Register> &PackedAddrs,
                                      unsigned ArgOffset,
                                      const AMDGPU::ImageDimIntrinsicInfo *Intr,
                                      bool IsA16, bool IsG16) {
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  auto EndIdx = Intr->VAddrEnd;

  for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
    MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
    if (!SrcOp.isReg())
      continue; // _L to _LZ may have eliminated this.

    Register AddrReg = SrcOp.getReg();

    if ((I < Intr->GradientStart) ||
        (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
        (I >= Intr->CoordStart && !IsA16)) {
      if ((I < Intr->GradientStart) && IsA16 &&
          (B.getMRI()->getType(AddrReg) == S16)) {
        assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
        // Special handling of bias when A16 is on. Bias is of type half but
        // occupies a full 32-bit slot.
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
               "Bias needs to be converted to 16 bit in A16 mode");
        // Handle any gradient or coordinate operands that should not be packed
        AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
        PackedAddrs.push_back(AddrReg);
      }
    } else {
      // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
      // derivatives dx/dh and dx/dv are packed with undef.
      if (((I + 1) >= EndIdx) ||
          ((Intr->NumGradients / 2) % 2 == 1 &&
           (I == static_cast<unsigned>(Intr->GradientStart +
                                       (Intr->NumGradients / 2) - 1) ||
            I == static_cast<unsigned>(Intr->GradientStart +
                                       Intr->NumGradients - 1))) ||
          // Check for _L to _LZ optimization
          !MI.getOperand(ArgOffset + I + 1).isReg()) {
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        PackedAddrs.push_back(
            B.buildBuildVector(
                 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
                .getReg(0));
        ++I;
      }
    }
  }
}
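
// For the non-16-bit path, separate s32 vaddr components may instead be merged
// into one vector register. e.g. three components %u, %v, %r become
// (illustrative):
//   %vaddr:_(<3 x s32>) = G_BUILD_VECTOR %u, %v, %r
// with the now-dead trailing operands rewritten to $noreg by the helper below.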
/// Convert from separate vaddr components to a single vector address register,
/// and replace the remaining operands with $noreg.
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
                                     int DimIdx, int NumVAddrs) {
  const LLT S32 = LLT::scalar(32);
  (void)S32;
  SmallVector<Register, 8> AddrRegs;
  for (int I = 0; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg()) {
      AddrRegs.push_back(SrcOp.getReg());
      assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
    }
  }

  int NumAddrRegs = AddrRegs.size();
  if (NumAddrRegs != 1) {
    auto VAddr =
        B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
    MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
  }

  for (int I = 1; I != NumVAddrs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
    if (SrcOp.isReg())
      MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
  }
}

/// Rewrite image intrinsics to use register layouts expected by the subtarget.
///
/// Depending on the subtarget, load/store with 16-bit element data need to be
/// rewritten to use the low half of 32-bit registers, or directly use a packed
/// layout. 16-bit addresses should also sometimes be packed into 32-bit
/// registers.
///
/// We don't want to directly select image instructions just yet, but also want
/// to expose all register repacking to the legalizer/combiners. We also don't
/// want a selected instruction entering RegBankSelect. In order to avoid
/// defining a multitude of intermediate image instructions, directly hack on
/// the intrinsic's arguments. In cases like a16 addresses, this requires
/// padding now-unnecessary arguments with $noreg.
bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
    MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
    const AMDGPU::ImageDimIntrinsicInfo *Intr) const {

  const MachineFunction &MF = *MI.getMF();
  const unsigned NumDefs = MI.getNumExplicitDefs();
  const unsigned ArgOffset = NumDefs + 1;
  bool IsTFE = NumDefs == 2;
  // We are only processing the operands of d16 image operations on subtargets
  // that use the unpacked register layout, or need to repack the TFE result.

  // TODO: Do we need to guard against already legalized intrinsics?
  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
      AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);

  MachineRegisterInfo *MRI = B.getMRI();
  const LLT S32 = LLT::scalar(32);
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::fixed_vector(2, 16);

  unsigned DMask = 0;
  Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
  LLT Ty = MRI->getType(VData);

  // Check for 16-bit addresses and pack them if so.
  LLT GradTy =
      MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
  LLT AddrTy =
      MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
  const bool IsG16 = GradTy == S16;
  const bool IsA16 = AddrTy == S16;
  const bool IsD16 = Ty.getScalarType() == S16;

  int DMaskLanes = 0;
  if (!BaseOpcode->Atomic) {
    DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
    if (BaseOpcode->Gather4) {
      DMaskLanes = 4;
    } else if (DMask != 0) {
      DMaskLanes = llvm::popcount(DMask);
    } else if (!IsTFE && !BaseOpcode->Store) {
      // If dmask is 0, this is a no-op load. This can be eliminated.
      B.buildUndef(MI.getOperand(0));
      MI.eraseFromParent();
      return true;
    }
  }

  Observer.changingInstr(MI);
  auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });

  const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
  const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
                                    : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
  unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;

  // Track that we legalized this
  MI.setDesc(B.getTII().get(NewOpcode));

  // With TFC on and dmask == 0 we are only expecting the error flag. Force
  // dmask to be at least 1, otherwise the instruction will fail.
  if (IsTFE && DMask == 0) {
    DMask = 0x1;
    DMaskLanes = 1;
    MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
  }

  if (BaseOpcode->Atomic) {
    Register VData0 = MI.getOperand(2).getReg();
    LLT Ty = MRI->getType(VData0);

    // TODO: Allow atomic swap and bit ops for v2s16/v4s16
    if (Ty.isVector())
      return false;

    if (BaseOpcode->AtomicX2) {
      Register VData1 = MI.getOperand(3).getReg();
      // The two values are packed in one register.
      LLT PackedTy = LLT::fixed_vector(2, Ty);
      auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
      MI.getOperand(2).setReg(Concat.getReg(0));
      MI.getOperand(3).setReg(AMDGPU::NoRegister);
    }
  }

  unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
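
  // Background for the address rewrite below (sketch): with the A16/G16
  // features, image addresses and/or gradients are supplied as 16-bit values
  // packed two per dword. packImage16bitOpsToDwords performs that packing,
  // and when the NSA (non-sequential address) encoding is not used the packed
  // dwords are further concatenated into a single vector register.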
  // Rewrite the addressing register layout before doing anything else.
  if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
    // 16 bit gradients are supported, but are tied to the A16 control
    // so both gradients and addresses must be 16 bit
    return false;
  }

  if (IsA16 && !ST.hasA16()) {
    // A16 not supported
    return false;
  }

  if (IsA16 || IsG16) {
    if (Intr->NumVAddrs > 1) {
      SmallVector<Register, 4> PackedRegs;

      packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
                                IsG16);

      // See also below in the non-a16 branch
      const bool UseNSA = ST.hasNSAEncoding() &&
                          PackedRegs.size() >= ST.getNSAThreshold(MF) &&
                          PackedRegs.size() <= ST.getNSAMaxSize();

      if (!UseNSA && PackedRegs.size() > 1) {
        LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
        auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
        PackedRegs[0] = Concat.getReg(0);
        PackedRegs.resize(1);
      }

      const unsigned NumPacked = PackedRegs.size();
      for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
        MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
        if (!SrcOp.isReg()) {
          assert(SrcOp.isImm() && SrcOp.getImm() == 0);
          continue;
        }

        assert(SrcOp.getReg() != AMDGPU::NoRegister);

        if (I - Intr->VAddrStart < NumPacked)
          SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
        else
          SrcOp.setReg(AMDGPU::NoRegister);
      }
    }
  } else {
    // If the register allocator cannot place the address registers contiguously
    // without introducing moves, then using the non-sequential address encoding
    // is always preferable, since it saves VALU instructions and is usually a
    // wash in terms of code size or even better.
    //
    // However, we currently have no way of hinting to the register allocator
    // that MIMG addresses should be placed contiguously when it is possible to
    // do so, so force non-NSA for the common 2-address case as a heuristic.
    //
    // SIShrinkInstructions will convert NSA encodings to non-NSA after register
    // allocation when possible.
    //
    // TODO: We can actually allow partial NSA where the final register is a
    // contiguous set of the remaining addresses.
    // This could help where there are more addresses than supported.
    const bool UseNSA = ST.hasNSAEncoding() &&
                        CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
                        CorrectedNumVAddrs <= ST.getNSAMaxSize();

    if (!UseNSA && Intr->NumVAddrs > 1)
      convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
                               Intr->NumVAddrs);
  }

  int Flags = 0;
  if (IsA16)
    Flags |= 1;
  if (IsG16)
    Flags |= 2;
  MI.addOperand(MachineOperand::CreateImm(Flags));

  if (BaseOpcode->Store) { // No TFE for stores?
    // TODO: Handle dmask trim
    if (!Ty.isVector() || !IsD16)
      return true;

    Register RepackedReg = handleD16VData(B, *MRI, VData, true);
    if (RepackedReg != VData) {
      MI.getOperand(1).setReg(RepackedReg);
    }

    return true;
  }

  Register DstReg = MI.getOperand(0).getReg();
  const LLT EltTy = Ty.getScalarType();
  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;

  // Confirm that the return type is large enough for the dmask specified.
  if (NumElts < DMaskLanes)
    return false;

  if (NumElts > 4 || DMaskLanes > 4)
    return false;

  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
  const LLT AdjustedTy =
      Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));

  // The raw dword aligned data component of the load. The only legal cases
  // where this matters should be when using the packed D16 format, for
  // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
  LLT RoundedTy;

  // S32 vector to cover all data, plus TFE result element.
  LLT TFETy;

  // Register type to use for each loaded component. Will be S32 or V2S16.
  LLT RegTy;

  if (IsD16 && ST.hasUnpackedD16VMem()) {
    RoundedTy =
        LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
    TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
    RegTy = S32;
  } else {
    unsigned EltSize = EltTy.getSizeInBits();
    unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
    unsigned RoundedSize = 32 * RoundedElts;
    RoundedTy = LLT::scalarOrVector(
        ElementCount::getFixed(RoundedSize / EltSize), EltSize);
    TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
    RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
  }

  // The return type does not need adjustment.
  // TODO: Should we change s16 case to s32 or <2 x s16>?
  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
    return true;

  Register Dst1Reg;

  // Insert after the instruction.
  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
  // s16> instead of s32, we would only need 1 bitcast instead of multiple.
  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;

  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);

  MI.getOperand(0).setReg(NewResultReg);

  // In the IR, TFE is supposed to be used with a 2 element struct return
  // type. The instruction really returns these two values in one contiguous
  // register, with one additional dword beyond the loaded data. Rewrite the
  // return type to use a single register result.

  if (IsTFE) {
    Dst1Reg = MI.getOperand(1).getReg();
    if (MRI->getType(Dst1Reg) != S32)
      return false;

    // TODO: Make sure the TFE operand bit is set.
    MI.removeOperand(1);

    // Handle the easy case that requires no repack instructions.
    if (Ty == S32) {
      B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
      return true;
    }
  }

  // Now figure out how to copy the new result register back into the old
  // result.
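  // E.g. a D16 <2 x s16> load with TFE uses a <2 x s32> NewResultReg: the
  // dwords are unmerged below, the trailing TFE dword is split off, and the
  // remaining data dword is cast back to <2 x s16>.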
  SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);

  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;

  if (ResultNumRegs == 1) {
    assert(!IsTFE);
    ResultRegs[0] = NewResultReg;
  } else {
    // We have to repack into a new vector of some kind.
    for (int I = 0; I != NumDataRegs; ++I)
      ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
    B.buildUnmerge(ResultRegs, NewResultReg);

    // Drop the final TFE element to get the data part. The TFE result is
    // directly written to the right place already.
    if (IsTFE)
      ResultRegs.resize(NumDataRegs);
  }

  // For an s16 scalar result, we form an s32 result with a truncate regardless
  // of packed vs. unpacked.
  if (IsD16 && !Ty.isVector()) {
    B.buildTrunc(DstReg, ResultRegs[0]);
    return true;
  }

  // Avoid a build/concat_vector of 1 entry.
  if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
    B.buildBitcast(DstReg, ResultRegs[0]);
    return true;
  }

  assert(Ty.isVector());

  if (IsD16) {
    // For packed D16 results with TFE enabled, all the data components are
    // S32. Cast back to the expected type.
    //
    // TODO: We don't really need to load s32 elements. We would only need one
    // cast for the TFE result if a multiple of v2s16 was used.
    if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildBitcast(V2S16, Reg).getReg(0);
    } else if (ST.hasUnpackedD16VMem()) {
      for (Register &Reg : ResultRegs)
        Reg = B.buildTrunc(S16, Reg).getReg(0);
    }
  }

  auto padWithUndef = [&](LLT Ty, int NumElts) {
    if (NumElts == 0)
      return;
    Register Undef = B.buildUndef(Ty).getReg(0);
    for (int I = 0; I != NumElts; ++I)
      ResultRegs.push_back(Undef);
  };

  // Pad out any elements eliminated due to the dmask.
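  // E.g. with dmask = 0x1 and a declared <4 x s32> result, only one component
  // is actually loaded; the other three lanes are filled with undef so the
  // build/concat_vector below still produces the full result type.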
  LLT ResTy = MRI->getType(ResultRegs[0]);
  if (!ResTy.isVector()) {
    padWithUndef(ResTy, NumElts - ResultRegs.size());
    B.buildBuildVector(DstReg, ResultRegs);
    return true;
  }

  assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
  const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;

  // Deal with the one annoying legal case.
  const LLT V3S16 = LLT::fixed_vector(3, 16);
  if (Ty == V3S16) {
    if (IsTFE) {
      if (ResultRegs.size() == 1) {
        NewResultReg = ResultRegs[0];
      } else if (ResultRegs.size() == 2) {
        LLT V4S16 = LLT::fixed_vector(4, 16);
        NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
      } else {
        return false;
      }
    }

    if (MRI->getType(DstReg).getNumElements() <
        MRI->getType(NewResultReg).getNumElements()) {
      B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
    } else {
      B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
    }
    return true;
  }

  padWithUndef(ResTy, RegsToCover - ResultRegs.size());
  B.buildConcatVectors(DstReg, ResultRegs);
  return true;
}

bool AMDGPULegalizerInfo::legalizeSBufferLoad(
    LegalizerHelper &Helper, MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  GISelChangeObserver &Observer = Helper.Observer;

  Register Dst = MI.getOperand(0).getReg();
  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Size = Ty.getSizeInBits();
  MachineFunction &MF = B.getMF();

  Observer.changingInstr(MI);

  if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
    Ty = getBitcastRegisterType(Ty);
    Helper.bitcastDst(MI, Ty, 0);
    Dst = MI.getOperand(0).getReg();
    B.setInsertPt(B.getMBB(), MI);
  }

  // FIXME: We don't really need this intermediate instruction. The intrinsic
  // should be fixed to have a memory operand. Since it's readnone, we're not
  // allowed to add one.
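  // Rewrite the intrinsic into the target pseudo G_AMDGPU_S_BUFFER_LOAD and
  // synthesize the memory operand that the intrinsic form could not carry.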
  MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
  MI.removeOperand(1); // Remove intrinsic ID

  // FIXME: When intrinsic definition is fixed, this should have an MMO already.
  // TODO: Should this use datalayout alignment?
  const unsigned MemSize = (Size + 7) / 8;
  const Align MemAlign(4);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
      MemSize, MemAlign);
  MI.addMemOperand(MF, MMO);

  // There are no 96-bit result scalar loads, but widening to 128-bit should
  // always be legal. We may need to restore this to a 96-bit result if it turns
  // out this needs to be converted to a vector load during RegBankSelect.
  if (!isPowerOf2_32(Size)) {
    if (Ty.isVector())
      Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
    else
      Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
  }

  Observer.changedInstr(MI);
  return true;
}

// TODO: Move to selection
bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
                                                MachineRegisterInfo &MRI,
                                                MachineIRBuilder &B) const {
  if (!ST.isTrapHandlerEnabled() ||
      ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
    return legalizeTrapEndpgm(MI, MRI, B);

  if (std::optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
    switch (*HsaAbiVer) {
    case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
    case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
      return legalizeTrapHsaQueuePtr(MI, MRI, B);
    case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
    case ELF::ELFABIVERSION_AMDGPU_HSA_V5:
      return ST.supportsGetDoorbellID() ?
          legalizeTrapHsa(MI, MRI, B) :
          legalizeTrapHsaQueuePtr(MI, MRI, B);
    }
  }

  llvm_unreachable("Unknown trap handler");
}

bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const LLT S64 = LLT::scalar(64);

  Register SGPR01(AMDGPU::SGPR0_SGPR1);
  // For code object version 5, queue_ptr is passed through implicit kernarg.
  if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) {
    AMDGPUTargetLowering::ImplicitParameter Param =
        AMDGPUTargetLowering::QUEUE_PTR;
    uint64_t Offset =
        ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);

    Register KernargPtrReg = MRI.createGenericVirtualRegister(
        LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

    if (!loadInputValue(KernargPtrReg, B,
                        AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
      return false;

    // TODO: can we be smarter about machine pointer info?
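    // The queue pointer sits at a fixed offset in the implicit kernel
    // arguments, so it is loaded as an s64 from KernargPtrReg + Offset and
    // copied into SGPR0_SGPR1 for the trap handler.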
    MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
    MachineMemOperand *MMO = MF.getMachineMemOperand(
        PtrInfo,
        MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
            MachineMemOperand::MOInvariant,
        LLT::scalar(64), commonAlignment(Align(64), Offset));

    // Pointer address
    Register LoadAddr = MRI.createGenericVirtualRegister(
        LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
    B.buildPtrAdd(LoadAddr, KernargPtrReg,
                  B.buildConstant(LLT::scalar(64), Offset).getReg(0));
    // Load address
    Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
    B.buildCopy(SGPR01, Temp);
    B.buildInstr(AMDGPU::S_TRAP)
        .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
        .addReg(SGPR01, RegState::Implicit);
    MI.eraseFromParent();
    return true;
  }

  // Pass the queue pointer to the trap handler as input, and insert the trap
  // instruction.
  // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
  Register LiveIn =
      MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
  if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
    return false;

  B.buildCopy(SGPR01, LiveIn);
  B.buildInstr(AMDGPU::S_TRAP)
      .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
      .addReg(SGPR01, RegState::Implicit);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeTrapHsa(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  B.buildInstr(AMDGPU::S_TRAP)
      .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // On a non-HSA path, or if the trap handler is disabled, report a warning
  // instead of inserting a trap.
  if (!ST.isTrapHandlerEnabled() ||
      ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
    DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
                                     "debugtrap handler not supported",
                                     MI.getDebugLoc(), DS_Warning);
    LLVMContext &Ctx = B.getMF().getFunction().getContext();
    Ctx.diagnose(NoTrap);
  } else {
    // Insert the debug-trap instruction.
    B.buildInstr(AMDGPU::S_TRAP)
        .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
  }

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
                                               MachineIRBuilder &B) const {
  MachineRegisterInfo &MRI = *B.getMRI();
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  const LLT V3S32 = LLT::fixed_vector(3, 32);

  Register DstReg = MI.getOperand(0).getReg();
  Register NodePtr = MI.getOperand(2).getReg();
  Register RayExtent = MI.getOperand(3).getReg();
  Register RayOrigin = MI.getOperand(4).getReg();
  Register RayDir = MI.getOperand(5).getReg();
  Register RayInvDir = MI.getOperand(6).getReg();
  Register TDescr = MI.getOperand(7).getReg();

  if (!ST.hasGFX10_AEncoding()) {
    DiagnosticInfoUnsupported BadIntrin(B.getMF().getFunction(),
                                        "intrinsic not supported on subtarget",
                                        MI.getDebugLoc());
    B.getMF().getFunction().getContext().diagnose(BadIntrin);
    return false;
  }

  const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
  const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
  const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
  const unsigned NumVDataDwords = 4;
  const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
  const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ?
                                                4 : 5) : NumVAddrDwords;
  const bool UseNSA = ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize();
  const unsigned BaseOpcodes[2][2] = {
      {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
      {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
       AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
  int Opcode;
  if (UseNSA) {
    Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
                                   IsGFX11Plus ? AMDGPU::MIMGEncGfx11NSA
                                               : AMDGPU::MIMGEncGfx10NSA,
                                   NumVDataDwords, NumVAddrDwords);
  } else {
    Opcode = AMDGPU::getMIMGOpcode(
        BaseOpcodes[Is64][IsA16],
        IsGFX11Plus ? AMDGPU::MIMGEncGfx11Default : AMDGPU::MIMGEncGfx10Default,
        NumVDataDwords, NumVAddrDwords);
  }
  assert(Opcode != -1);

  SmallVector<Register, 12> Ops;
  if (UseNSA && IsGFX11Plus) {
    auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
      auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
      auto Merged = B.buildMergeLikeInstr(
          V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
      Ops.push_back(Merged.getReg(0));
    };

    Ops.push_back(NodePtr);
    Ops.push_back(RayExtent);
    packLanes(RayOrigin);

    if (IsA16) {
      auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
      auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
      auto MergedDir = B.buildMergeLikeInstr(
          V3S32,
          {B.buildBitcast(
                S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
                                                   UnmergeRayDir.getReg(0)}))
               .getReg(0),
           B.buildBitcast(
                S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
                                                   UnmergeRayDir.getReg(1)}))
               .getReg(0),
           B.buildBitcast(
                S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
                                                   UnmergeRayDir.getReg(2)}))
               .getReg(0)});
      Ops.push_back(MergedDir.getReg(0));
    } else {
      packLanes(RayDir);
      packLanes(RayInvDir);
    }
  } else {
    if (Is64) {
      auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
      Ops.push_back(Unmerge.getReg(0));
      Ops.push_back(Unmerge.getReg(1));
    } else {
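      // A 32-bit node pointer fits in a single dword operand.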
      Ops.push_back(NodePtr);
    }
    Ops.push_back(RayExtent);

    auto packLanes = [&Ops, &S32, &B](Register Src) {
      auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
      Ops.push_back(Unmerge.getReg(0));
      Ops.push_back(Unmerge.getReg(1));
      Ops.push_back(Unmerge.getReg(2));
    };

    packLanes(RayOrigin);
    if (IsA16) {
      auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
      auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
      Register R1 = MRI.createGenericVirtualRegister(S32);
      Register R2 = MRI.createGenericVirtualRegister(S32);
      Register R3 = MRI.createGenericVirtualRegister(S32);
      B.buildMergeLikeInstr(R1,
                            {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
      B.buildMergeLikeInstr(
          R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
      B.buildMergeLikeInstr(
          R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
      Ops.push_back(R1);
      Ops.push_back(R2);
      Ops.push_back(R3);
    } else {
      packLanes(RayDir);
      packLanes(RayInvDir);
    }
  }

  if (!UseNSA) {
    // Build a single vector containing all the operands so far prepared.
    LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
    Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
    Ops.clear();
    Ops.push_back(MergedOps);
  }

  auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
    .addDef(DstReg)
    .addImm(Opcode);

  for (Register R : Ops) {
    MIB.addUse(R);
  }

  MIB.addUse(TDescr)
     .addImm(IsA16 ?
             1 : 0)
     .cloneMemRefs(MI);

  MI.eraseFromParent();
  return true;
}

bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
                                               MachineIRBuilder &B) const {
  unsigned Opc;
  int RoundMode = MI.getOperand(2).getImm();

  if (RoundMode == (int)RoundingMode::TowardPositive)
    Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
  else if (RoundMode == (int)RoundingMode::TowardNegative)
    Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
  else
    return false;

  B.buildInstr(Opc)
      .addDef(MI.getOperand(0).getReg())
      .addUse(MI.getOperand(1).getReg());

  MI.eraseFromParent();

  return true;
}

bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                            MachineInstr &MI) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
  auto IntrID = MI.getIntrinsicID();
  switch (IntrID) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      Register Def = MI.getOperand(1).getReg();
      Register Use = MI.getOperand(3).getReg();

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
          .addDef(Def)
          .addUse(Use)
          .addMBB(UncondBrTarget);
      } else {
        B.buildInstr(AMDGPU::SI_ELSE)
            .addDef(Def)
            .addUse(Use)
            .addMBB(UncondBrTarget);
      }

      if (Br) {
        Br->getOperand(0).setMBB(CondBrTarget);
      } else {
        // The IRTranslator skips inserting the G_BR for fallthrough cases, but
        // since we're swapping branch targets it needs to be reinserted.
        // FIXME: IRTranslator should probably not do this
        B.buildBr(*CondBrTarget);
      }

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_loop: {
    MachineInstr *Br = nullptr;
    MachineBasicBlock *UncondBrTarget = nullptr;
    bool Negated = false;
    if (MachineInstr *BrCond =
            verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
      const SIRegisterInfo *TRI
        = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());

      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
      Register Reg = MI.getOperand(2).getReg();

      if (Negated)
        std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)
        .addUse(Reg)
        .addMBB(UncondBrTarget);

      if (Br)
        Br->getOperand(0).setMBB(CondBrTarget);
      else
        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
      return true;
    }

    return false;
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr:
    if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
      // This only makes sense to call in a kernel, so just lower to null.
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();
      return true;
    }

    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
  case Intrinsic::amdgcn_implicitarg_ptr:
    return legalizeImplicitArgPtr(MI, MRI, B);
  case Intrinsic::amdgcn_workitem_id_x:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
  case Intrinsic::amdgcn_workitem_id_y:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
  case Intrinsic::amdgcn_workitem_id_z:
    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
  case Intrinsic::amdgcn_workgroup_id_x:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
  case Intrinsic::amdgcn_workgroup_id_y:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
  case Intrinsic::amdgcn_workgroup_id_z:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
  case Intrinsic::amdgcn_lds_kernel_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
  case Intrinsic::amdgcn_dispatch_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_PTR);
  case Intrinsic::amdgcn_queue_ptr:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::QUEUE_PTR);
  case Intrinsic::amdgcn_implicit_buffer_ptr:
    return legalizePreloadedArgIntrin(
      MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
  case Intrinsic::amdgcn_dispatch_id:
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::DISPATCH_ID);
  case Intrinsic::r600_read_ngroups_x:
    // TODO: Emit error for hsa
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_X);
  case Intrinsic::r600_read_ngroups_y:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_Y);
  case Intrinsic::r600_read_ngroups_z:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::NGROUPS_Z);
  case Intrinsic::r600_read_local_size_x:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::LOCAL_SIZE_X);
  case Intrinsic::r600_read_local_size_y:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::LOCAL_SIZE_Y);
  case Intrinsic::r600_read_local_size_z:
    // TODO: Could insert G_ASSERT_ZEXT from s16
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::LOCAL_SIZE_Z);
  case Intrinsic::r600_read_global_size_x:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::GLOBAL_SIZE_X);
  case Intrinsic::r600_read_global_size_y:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::GLOBAL_SIZE_Y);
  case Intrinsic::r600_read_global_size_z:
    return legalizeKernargMemParameter(MI, B,
                                       SI::KernelInputOffsets::GLOBAL_SIZE_Z);
  case Intrinsic::amdgcn_fdiv_fast:
    return legalizeFDIVFastIntrin(MI, MRI, B);
  case Intrinsic::amdgcn_is_shared:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
  case Intrinsic::amdgcn_is_private:
    return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
    return true;
  }
  case Intrinsic::amdgcn_s_buffer_load:
    return legalizeSBufferLoad(Helper, MI);
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
    return legalizeBufferStore(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
    return legalizeBufferStore(MI, MRI, B, false, true);
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
    return legalizeBufferStore(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
    return legalizeBufferLoad(MI, MRI, B, false, false);
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
    return legalizeBufferLoad(MI, MRI, B, true, false);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
    return legalizeBufferLoad(MI, MRI, B, true, true);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
    return legalizeBufferAtomic(MI, B, IntrID);
  case Intrinsic::amdgcn_atomic_inc:
    return legalizeAtomicIncDec(MI, B, true);
  case Intrinsic::amdgcn_atomic_dec:
    return legalizeAtomicIncDec(MI, B, false);
  case Intrinsic::trap:
    return legalizeTrapIntrinsic(MI, MRI, B);
  case Intrinsic::debugtrap:
    return legalizeDebugTrapIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_rsq_clamp:
    return legalizeRsqClampIntrinsic(MI, MRI, B);
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
    return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
    return legalizeBVHIntrinsic(MI, B);
  default: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
    return true;
  }
  }

  return true;
}