10b57cec5SDimitry Andric //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==// 20b57cec5SDimitry Andric // 30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information. 50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 60b57cec5SDimitry Andric // 70b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 80b57cec5SDimitry Andric /// \file 90b57cec5SDimitry Andric /// This file implements the targeting of the Machinelegalizer class for 100b57cec5SDimitry Andric /// AMDGPU. 110b57cec5SDimitry Andric /// \todo This should be generated by TableGen. 120b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 130b57cec5SDimitry Andric 145ffd83dbSDimitry Andric #include "AMDGPULegalizerInfo.h" 158bcb0991SDimitry Andric 160b57cec5SDimitry Andric #include "AMDGPU.h" 175ffd83dbSDimitry Andric #include "AMDGPUGlobalISelUtils.h" 18*e8d8bef9SDimitry Andric #include "AMDGPUInstrInfo.h" 190b57cec5SDimitry Andric #include "AMDGPUTargetMachine.h" 200b57cec5SDimitry Andric #include "SIMachineFunctionInfo.h" 215ffd83dbSDimitry Andric #include "llvm/ADT/ScopeExit.h" 220b57cec5SDimitry Andric #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" 235ffd83dbSDimitry Andric #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" 24*e8d8bef9SDimitry Andric #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" 258bcb0991SDimitry Andric #include "llvm/IR/DiagnosticInfo.h" 26*e8d8bef9SDimitry Andric #include "llvm/IR/IntrinsicsAMDGPU.h" 270b57cec5SDimitry Andric 280b57cec5SDimitry Andric #define DEBUG_TYPE "amdgpu-legalinfo" 290b57cec5SDimitry Andric 300b57cec5SDimitry Andric using namespace llvm; 310b57cec5SDimitry Andric using namespace LegalizeActions; 320b57cec5SDimitry Andric using namespace LegalizeMutations; 
330b57cec5SDimitry Andric using namespace LegalityPredicates; 345ffd83dbSDimitry Andric using namespace MIPatternMatch; 350b57cec5SDimitry Andric 365ffd83dbSDimitry Andric // Hack until load/store selection patterns support any tuple of legal types. 375ffd83dbSDimitry Andric static cl::opt<bool> EnableNewLegality( 385ffd83dbSDimitry Andric "amdgpu-global-isel-new-legality", 395ffd83dbSDimitry Andric cl::desc("Use GlobalISel desired legality, rather than try to use" 405ffd83dbSDimitry Andric "rules compatible with selection patterns"), 415ffd83dbSDimitry Andric cl::init(false), 425ffd83dbSDimitry Andric cl::ReallyHidden); 430b57cec5SDimitry Andric 445ffd83dbSDimitry Andric static constexpr unsigned MaxRegisterSize = 1024; 455ffd83dbSDimitry Andric 465ffd83dbSDimitry Andric // Round the number of elements to the next power of two elements 475ffd83dbSDimitry Andric static LLT getPow2VectorType(LLT Ty) { 485ffd83dbSDimitry Andric unsigned NElts = Ty.getNumElements(); 495ffd83dbSDimitry Andric unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts); 505ffd83dbSDimitry Andric return Ty.changeNumElements(Pow2NElts); 510b57cec5SDimitry Andric } 520b57cec5SDimitry Andric 535ffd83dbSDimitry Andric // Round the number of bits to the next power of two bits 545ffd83dbSDimitry Andric static LLT getPow2ScalarType(LLT Ty) { 555ffd83dbSDimitry Andric unsigned Bits = Ty.getSizeInBits(); 565ffd83dbSDimitry Andric unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits); 575ffd83dbSDimitry Andric return LLT::scalar(Pow2Bits); 588bcb0991SDimitry Andric } 598bcb0991SDimitry Andric 60*e8d8bef9SDimitry Andric /// \returs true if this is an odd sized vector which should widen by adding an 61*e8d8bef9SDimitry Andric /// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This 62*e8d8bef9SDimitry Andric /// excludes s1 vectors, which should always be scalarized. 
630b57cec5SDimitry Andric static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { 640b57cec5SDimitry Andric return [=](const LegalityQuery &Query) { 650b57cec5SDimitry Andric const LLT Ty = Query.Types[TypeIdx]; 66*e8d8bef9SDimitry Andric if (!Ty.isVector()) 67*e8d8bef9SDimitry Andric return false; 68*e8d8bef9SDimitry Andric 69*e8d8bef9SDimitry Andric const LLT EltTy = Ty.getElementType(); 70*e8d8bef9SDimitry Andric const unsigned EltSize = EltTy.getSizeInBits(); 71*e8d8bef9SDimitry Andric return Ty.getNumElements() % 2 != 0 && 72*e8d8bef9SDimitry Andric EltSize > 1 && EltSize < 32 && 738bcb0991SDimitry Andric Ty.getSizeInBits() % 32 != 0; 748bcb0991SDimitry Andric }; 758bcb0991SDimitry Andric } 768bcb0991SDimitry Andric 77*e8d8bef9SDimitry Andric static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) { 78*e8d8bef9SDimitry Andric return [=](const LegalityQuery &Query) { 79*e8d8bef9SDimitry Andric const LLT Ty = Query.Types[TypeIdx]; 80*e8d8bef9SDimitry Andric return Ty.getSizeInBits() % 32 == 0; 81*e8d8bef9SDimitry Andric }; 82*e8d8bef9SDimitry Andric } 83*e8d8bef9SDimitry Andric 848bcb0991SDimitry Andric static LegalityPredicate isWideVec16(unsigned TypeIdx) { 858bcb0991SDimitry Andric return [=](const LegalityQuery &Query) { 868bcb0991SDimitry Andric const LLT Ty = Query.Types[TypeIdx]; 878bcb0991SDimitry Andric const LLT EltTy = Ty.getScalarType(); 888bcb0991SDimitry Andric return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; 890b57cec5SDimitry Andric }; 900b57cec5SDimitry Andric } 910b57cec5SDimitry Andric 920b57cec5SDimitry Andric static LegalizeMutation oneMoreElement(unsigned TypeIdx) { 930b57cec5SDimitry Andric return [=](const LegalityQuery &Query) { 940b57cec5SDimitry Andric const LLT Ty = Query.Types[TypeIdx]; 950b57cec5SDimitry Andric const LLT EltTy = Ty.getElementType(); 960b57cec5SDimitry Andric return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy)); 970b57cec5SDimitry Andric }; 980b57cec5SDimitry Andric } 
990b57cec5SDimitry Andric 1000b57cec5SDimitry Andric static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { 1010b57cec5SDimitry Andric return [=](const LegalityQuery &Query) { 1020b57cec5SDimitry Andric const LLT Ty = Query.Types[TypeIdx]; 1030b57cec5SDimitry Andric const LLT EltTy = Ty.getElementType(); 1040b57cec5SDimitry Andric unsigned Size = Ty.getSizeInBits(); 1050b57cec5SDimitry Andric unsigned Pieces = (Size + 63) / 64; 1060b57cec5SDimitry Andric unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; 1070b57cec5SDimitry Andric return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy)); 1080b57cec5SDimitry Andric }; 1090b57cec5SDimitry Andric } 1100b57cec5SDimitry Andric 1118bcb0991SDimitry Andric // Increase the number of vector elements to reach the next multiple of 32-bit 1128bcb0991SDimitry Andric // type. 1138bcb0991SDimitry Andric static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 1148bcb0991SDimitry Andric return [=](const LegalityQuery &Query) { 1158bcb0991SDimitry Andric const LLT Ty = Query.Types[TypeIdx]; 1168bcb0991SDimitry Andric 1178bcb0991SDimitry Andric const LLT EltTy = Ty.getElementType(); 1188bcb0991SDimitry Andric const int Size = Ty.getSizeInBits(); 1198bcb0991SDimitry Andric const int EltSize = EltTy.getSizeInBits(); 1208bcb0991SDimitry Andric const int NextMul32 = (Size + 31) / 32; 1218bcb0991SDimitry Andric 1228bcb0991SDimitry Andric assert(EltSize < 32); 1238bcb0991SDimitry Andric 1248bcb0991SDimitry Andric const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 1258bcb0991SDimitry Andric return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy)); 1268bcb0991SDimitry Andric }; 1278bcb0991SDimitry Andric } 1288bcb0991SDimitry Andric 129*e8d8bef9SDimitry Andric static LLT getBitcastRegisterType(const LLT Ty) { 130*e8d8bef9SDimitry Andric const unsigned Size = Ty.getSizeInBits(); 1315ffd83dbSDimitry Andric 1325ffd83dbSDimitry Andric LLT CoercedTy; 1335ffd83dbSDimitry Andric if (Size 
<= 32) { 1345ffd83dbSDimitry Andric // <2 x s8> -> s16 1355ffd83dbSDimitry Andric // <4 x s8> -> s32 136*e8d8bef9SDimitry Andric return LLT::scalar(Size); 137*e8d8bef9SDimitry Andric } 1385ffd83dbSDimitry Andric 139*e8d8bef9SDimitry Andric return LLT::scalarOrVector(Size / 32, 32); 140*e8d8bef9SDimitry Andric } 141*e8d8bef9SDimitry Andric 142*e8d8bef9SDimitry Andric static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) { 143*e8d8bef9SDimitry Andric return [=](const LegalityQuery &Query) { 144*e8d8bef9SDimitry Andric const LLT Ty = Query.Types[TypeIdx]; 145*e8d8bef9SDimitry Andric return std::make_pair(TypeIdx, getBitcastRegisterType(Ty)); 146*e8d8bef9SDimitry Andric }; 147*e8d8bef9SDimitry Andric } 148*e8d8bef9SDimitry Andric 149*e8d8bef9SDimitry Andric static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) { 150*e8d8bef9SDimitry Andric return [=](const LegalityQuery &Query) { 151*e8d8bef9SDimitry Andric const LLT Ty = Query.Types[TypeIdx]; 152*e8d8bef9SDimitry Andric unsigned Size = Ty.getSizeInBits(); 153*e8d8bef9SDimitry Andric assert(Size % 32 == 0); 154*e8d8bef9SDimitry Andric return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32)); 1555ffd83dbSDimitry Andric }; 1565ffd83dbSDimitry Andric } 1575ffd83dbSDimitry Andric 1588bcb0991SDimitry Andric static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 1598bcb0991SDimitry Andric return [=](const LegalityQuery &Query) { 1608bcb0991SDimitry Andric const LLT QueryTy = Query.Types[TypeIdx]; 1618bcb0991SDimitry Andric return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 1628bcb0991SDimitry Andric }; 1638bcb0991SDimitry Andric } 1648bcb0991SDimitry Andric 1650b57cec5SDimitry Andric static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 1660b57cec5SDimitry Andric return [=](const LegalityQuery &Query) { 1670b57cec5SDimitry Andric const LLT QueryTy = Query.Types[TypeIdx]; 1680b57cec5SDimitry Andric return QueryTy.isVector() && 
QueryTy.getSizeInBits() > Size; 1690b57cec5SDimitry Andric }; 1700b57cec5SDimitry Andric } 1710b57cec5SDimitry Andric 1720b57cec5SDimitry Andric static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { 1730b57cec5SDimitry Andric return [=](const LegalityQuery &Query) { 1740b57cec5SDimitry Andric const LLT QueryTy = Query.Types[TypeIdx]; 1750b57cec5SDimitry Andric return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; 1760b57cec5SDimitry Andric }; 1770b57cec5SDimitry Andric } 1780b57cec5SDimitry Andric 1795ffd83dbSDimitry Andric static bool isRegisterSize(unsigned Size) { 1805ffd83dbSDimitry Andric return Size % 32 == 0 && Size <= MaxRegisterSize; 1815ffd83dbSDimitry Andric } 1825ffd83dbSDimitry Andric 1835ffd83dbSDimitry Andric static bool isRegisterVectorElementType(LLT EltTy) { 1845ffd83dbSDimitry Andric const int EltSize = EltTy.getSizeInBits(); 1855ffd83dbSDimitry Andric return EltSize == 16 || EltSize % 32 == 0; 1865ffd83dbSDimitry Andric } 1875ffd83dbSDimitry Andric 1885ffd83dbSDimitry Andric static bool isRegisterVectorType(LLT Ty) { 1890b57cec5SDimitry Andric const int EltSize = Ty.getElementType().getSizeInBits(); 1900b57cec5SDimitry Andric return EltSize == 32 || EltSize == 64 || 1910b57cec5SDimitry Andric (EltSize == 16 && Ty.getNumElements() % 2 == 0) || 1920b57cec5SDimitry Andric EltSize == 128 || EltSize == 256; 1930b57cec5SDimitry Andric } 1940b57cec5SDimitry Andric 1955ffd83dbSDimitry Andric static bool isRegisterType(LLT Ty) { 1965ffd83dbSDimitry Andric if (!isRegisterSize(Ty.getSizeInBits())) 1975ffd83dbSDimitry Andric return false; 1985ffd83dbSDimitry Andric 1995ffd83dbSDimitry Andric if (Ty.isVector()) 2005ffd83dbSDimitry Andric return isRegisterVectorType(Ty); 2015ffd83dbSDimitry Andric 2025ffd83dbSDimitry Andric return true; 2035ffd83dbSDimitry Andric } 2045ffd83dbSDimitry Andric 2055ffd83dbSDimitry Andric // Any combination of 32 or 64-bit elements up the maximum register size, and 2065ffd83dbSDimitry Andric // multiples of 
v2s16. 2075ffd83dbSDimitry Andric static LegalityPredicate isRegisterType(unsigned TypeIdx) { 2085ffd83dbSDimitry Andric return [=](const LegalityQuery &Query) { 2095ffd83dbSDimitry Andric return isRegisterType(Query.Types[TypeIdx]); 2108bcb0991SDimitry Andric }; 2118bcb0991SDimitry Andric } 2128bcb0991SDimitry Andric 2135ffd83dbSDimitry Andric static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { 2148bcb0991SDimitry Andric return [=](const LegalityQuery &Query) { 2155ffd83dbSDimitry Andric const LLT QueryTy = Query.Types[TypeIdx]; 2165ffd83dbSDimitry Andric if (!QueryTy.isVector()) 2175ffd83dbSDimitry Andric return false; 2185ffd83dbSDimitry Andric const LLT EltTy = QueryTy.getElementType(); 2195ffd83dbSDimitry Andric return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32; 2208bcb0991SDimitry Andric }; 2218bcb0991SDimitry Andric } 2228bcb0991SDimitry Andric 2238bcb0991SDimitry Andric static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) { 2248bcb0991SDimitry Andric return [=](const LegalityQuery &Query) { 2258bcb0991SDimitry Andric const LLT Ty = Query.Types[TypeIdx]; 2268bcb0991SDimitry Andric return !Ty.isVector() && Ty.getSizeInBits() > 32 && 2278bcb0991SDimitry Andric Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits(); 2280b57cec5SDimitry Andric }; 2290b57cec5SDimitry Andric } 2300b57cec5SDimitry Andric 2315ffd83dbSDimitry Andric // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 2325ffd83dbSDimitry Andric // handle some operations by just promoting the register during 2335ffd83dbSDimitry Andric // selection. There are also d16 loads on GFX9+ which preserve the high bits. 2345ffd83dbSDimitry Andric static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, 2355ffd83dbSDimitry Andric bool IsLoad) { 2365ffd83dbSDimitry Andric switch (AS) { 2375ffd83dbSDimitry Andric case AMDGPUAS::PRIVATE_ADDRESS: 2385ffd83dbSDimitry Andric // FIXME: Private element size. 
239*e8d8bef9SDimitry Andric return ST.enableFlatScratch() ? 128 : 32; 2405ffd83dbSDimitry Andric case AMDGPUAS::LOCAL_ADDRESS: 2415ffd83dbSDimitry Andric return ST.useDS128() ? 128 : 64; 2425ffd83dbSDimitry Andric case AMDGPUAS::GLOBAL_ADDRESS: 2435ffd83dbSDimitry Andric case AMDGPUAS::CONSTANT_ADDRESS: 2445ffd83dbSDimitry Andric case AMDGPUAS::CONSTANT_ADDRESS_32BIT: 2455ffd83dbSDimitry Andric // Treat constant and global as identical. SMRD loads are sometimes usable for 2465ffd83dbSDimitry Andric // global loads (ideally constant address space should be eliminated) 2475ffd83dbSDimitry Andric // depending on the context. Legality cannot be context dependent, but 2485ffd83dbSDimitry Andric // RegBankSelect can split the load as necessary depending on the pointer 2495ffd83dbSDimitry Andric // register bank/uniformity and if the memory is invariant or not written in a 2505ffd83dbSDimitry Andric // kernel. 2515ffd83dbSDimitry Andric return IsLoad ? 512 : 128; 2525ffd83dbSDimitry Andric default: 2535ffd83dbSDimitry Andric // Flat addresses may contextually need to be split to 32-bit parts if they 2545ffd83dbSDimitry Andric // may alias scratch depending on the subtarget. 
2555ffd83dbSDimitry Andric return 128; 2565ffd83dbSDimitry Andric } 2575ffd83dbSDimitry Andric } 2585ffd83dbSDimitry Andric 2595ffd83dbSDimitry Andric static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, 2605ffd83dbSDimitry Andric const LegalityQuery &Query, 2615ffd83dbSDimitry Andric unsigned Opcode) { 2625ffd83dbSDimitry Andric const LLT Ty = Query.Types[0]; 2635ffd83dbSDimitry Andric 2645ffd83dbSDimitry Andric // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD 2655ffd83dbSDimitry Andric const bool IsLoad = Opcode != AMDGPU::G_STORE; 2665ffd83dbSDimitry Andric 2675ffd83dbSDimitry Andric unsigned RegSize = Ty.getSizeInBits(); 2685ffd83dbSDimitry Andric unsigned MemSize = Query.MMODescrs[0].SizeInBits; 269*e8d8bef9SDimitry Andric unsigned AlignBits = Query.MMODescrs[0].AlignInBits; 2705ffd83dbSDimitry Andric unsigned AS = Query.Types[1].getAddressSpace(); 2715ffd83dbSDimitry Andric 2725ffd83dbSDimitry Andric // All of these need to be custom lowered to cast the pointer operand. 2735ffd83dbSDimitry Andric if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) 2745ffd83dbSDimitry Andric return false; 2755ffd83dbSDimitry Andric 2765ffd83dbSDimitry Andric // TODO: We should be able to widen loads if the alignment is high enough, but 2775ffd83dbSDimitry Andric // we also need to modify the memory access size. 2785ffd83dbSDimitry Andric #if 0 2795ffd83dbSDimitry Andric // Accept widening loads based on alignment. 2805ffd83dbSDimitry Andric if (IsLoad && MemSize < Size) 2815ffd83dbSDimitry Andric MemSize = std::max(MemSize, Align); 2825ffd83dbSDimitry Andric #endif 2835ffd83dbSDimitry Andric 2845ffd83dbSDimitry Andric // Only 1-byte and 2-byte to 32-bit extloads are valid. 
2855ffd83dbSDimitry Andric if (MemSize != RegSize && RegSize != 32) 2865ffd83dbSDimitry Andric return false; 2875ffd83dbSDimitry Andric 2885ffd83dbSDimitry Andric if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 2895ffd83dbSDimitry Andric return false; 2905ffd83dbSDimitry Andric 2915ffd83dbSDimitry Andric switch (MemSize) { 2925ffd83dbSDimitry Andric case 8: 2935ffd83dbSDimitry Andric case 16: 2945ffd83dbSDimitry Andric case 32: 2955ffd83dbSDimitry Andric case 64: 2965ffd83dbSDimitry Andric case 128: 2975ffd83dbSDimitry Andric break; 2985ffd83dbSDimitry Andric case 96: 2995ffd83dbSDimitry Andric if (!ST.hasDwordx3LoadStores()) 3005ffd83dbSDimitry Andric return false; 3015ffd83dbSDimitry Andric break; 3025ffd83dbSDimitry Andric case 256: 3035ffd83dbSDimitry Andric case 512: 3045ffd83dbSDimitry Andric // These may contextually need to be broken down. 3055ffd83dbSDimitry Andric break; 3065ffd83dbSDimitry Andric default: 3075ffd83dbSDimitry Andric return false; 3085ffd83dbSDimitry Andric } 3095ffd83dbSDimitry Andric 3105ffd83dbSDimitry Andric assert(RegSize >= MemSize); 3115ffd83dbSDimitry Andric 312*e8d8bef9SDimitry Andric if (AlignBits < MemSize) { 3135ffd83dbSDimitry Andric const SITargetLowering *TLI = ST.getTargetLowering(); 314*e8d8bef9SDimitry Andric if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, 315*e8d8bef9SDimitry Andric Align(AlignBits / 8))) 3165ffd83dbSDimitry Andric return false; 3175ffd83dbSDimitry Andric } 3185ffd83dbSDimitry Andric 3195ffd83dbSDimitry Andric return true; 3205ffd83dbSDimitry Andric } 3215ffd83dbSDimitry Andric 3225ffd83dbSDimitry Andric // The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so 3235ffd83dbSDimitry Andric // workaround this. Eventually it should ignore the type for loads and only care 3245ffd83dbSDimitry Andric // about the size. Return true in cases where we will workaround this for now by 3255ffd83dbSDimitry Andric // bitcasting. 
3265ffd83dbSDimitry Andric static bool loadStoreBitcastWorkaround(const LLT Ty) { 3275ffd83dbSDimitry Andric if (EnableNewLegality) 3285ffd83dbSDimitry Andric return false; 3295ffd83dbSDimitry Andric 3305ffd83dbSDimitry Andric const unsigned Size = Ty.getSizeInBits(); 3315ffd83dbSDimitry Andric if (Size <= 64) 3325ffd83dbSDimitry Andric return false; 3335ffd83dbSDimitry Andric if (!Ty.isVector()) 3345ffd83dbSDimitry Andric return true; 335*e8d8bef9SDimitry Andric 336*e8d8bef9SDimitry Andric LLT EltTy = Ty.getElementType(); 337*e8d8bef9SDimitry Andric if (EltTy.isPointer()) 338*e8d8bef9SDimitry Andric return true; 339*e8d8bef9SDimitry Andric 340*e8d8bef9SDimitry Andric unsigned EltSize = EltTy.getSizeInBits(); 3415ffd83dbSDimitry Andric return EltSize != 32 && EltSize != 64; 3425ffd83dbSDimitry Andric } 3435ffd83dbSDimitry Andric 3445ffd83dbSDimitry Andric static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query, 3455ffd83dbSDimitry Andric unsigned Opcode) { 3465ffd83dbSDimitry Andric const LLT Ty = Query.Types[0]; 3475ffd83dbSDimitry Andric return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) && 3485ffd83dbSDimitry Andric !loadStoreBitcastWorkaround(Ty); 3495ffd83dbSDimitry Andric } 3505ffd83dbSDimitry Andric 351*e8d8bef9SDimitry Andric /// Return true if a load or store of the type should be lowered with a bitcast 352*e8d8bef9SDimitry Andric /// to a different type. 
353*e8d8bef9SDimitry Andric static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, 354*e8d8bef9SDimitry Andric const unsigned MemSizeInBits) { 355*e8d8bef9SDimitry Andric const unsigned Size = Ty.getSizeInBits(); 356*e8d8bef9SDimitry Andric if (Size != MemSizeInBits) 357*e8d8bef9SDimitry Andric return Size <= 32 && Ty.isVector(); 358*e8d8bef9SDimitry Andric 359*e8d8bef9SDimitry Andric if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty)) 360*e8d8bef9SDimitry Andric return true; 361*e8d8bef9SDimitry Andric return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) && 362*e8d8bef9SDimitry Andric !isRegisterVectorElementType(Ty.getElementType()); 363*e8d8bef9SDimitry Andric } 364*e8d8bef9SDimitry Andric 365*e8d8bef9SDimitry Andric /// Return true if we should legalize a load by widening an odd sized memory 366*e8d8bef9SDimitry Andric /// access up to the alignment. Note this case when the memory access itself 367*e8d8bef9SDimitry Andric /// changes, not the size of the result register. 368*e8d8bef9SDimitry Andric static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits, 369*e8d8bef9SDimitry Andric unsigned AlignInBits, unsigned AddrSpace, 370*e8d8bef9SDimitry Andric unsigned Opcode) { 371*e8d8bef9SDimitry Andric // We don't want to widen cases that are naturally legal. 372*e8d8bef9SDimitry Andric if (isPowerOf2_32(SizeInBits)) 373*e8d8bef9SDimitry Andric return false; 374*e8d8bef9SDimitry Andric 375*e8d8bef9SDimitry Andric // If we have 96-bit memory operations, we shouldn't touch them. Note we may 376*e8d8bef9SDimitry Andric // end up widening these for a scalar load during RegBankSelect, since there 377*e8d8bef9SDimitry Andric // aren't 96-bit scalar loads. 
378*e8d8bef9SDimitry Andric if (SizeInBits == 96 && ST.hasDwordx3LoadStores()) 379*e8d8bef9SDimitry Andric return false; 380*e8d8bef9SDimitry Andric 381*e8d8bef9SDimitry Andric if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode)) 382*e8d8bef9SDimitry Andric return false; 383*e8d8bef9SDimitry Andric 384*e8d8bef9SDimitry Andric // A load is known dereferenceable up to the alignment, so it's legal to widen 385*e8d8bef9SDimitry Andric // to it. 386*e8d8bef9SDimitry Andric // 387*e8d8bef9SDimitry Andric // TODO: Could check dereferenceable for less aligned cases. 388*e8d8bef9SDimitry Andric unsigned RoundedSize = NextPowerOf2(SizeInBits); 389*e8d8bef9SDimitry Andric if (AlignInBits < RoundedSize) 390*e8d8bef9SDimitry Andric return false; 391*e8d8bef9SDimitry Andric 392*e8d8bef9SDimitry Andric // Do not widen if it would introduce a slow unaligned load. 393*e8d8bef9SDimitry Andric const SITargetLowering *TLI = ST.getTargetLowering(); 394*e8d8bef9SDimitry Andric bool Fast = false; 395*e8d8bef9SDimitry Andric return TLI->allowsMisalignedMemoryAccessesImpl( 396*e8d8bef9SDimitry Andric RoundedSize, AddrSpace, Align(AlignInBits / 8), 397*e8d8bef9SDimitry Andric MachineMemOperand::MOLoad, &Fast) && 398*e8d8bef9SDimitry Andric Fast; 399*e8d8bef9SDimitry Andric } 400*e8d8bef9SDimitry Andric 401*e8d8bef9SDimitry Andric static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query, 402*e8d8bef9SDimitry Andric unsigned Opcode) { 403*e8d8bef9SDimitry Andric if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic) 404*e8d8bef9SDimitry Andric return false; 405*e8d8bef9SDimitry Andric 406*e8d8bef9SDimitry Andric return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits, 407*e8d8bef9SDimitry Andric Query.MMODescrs[0].AlignInBits, 408*e8d8bef9SDimitry Andric Query.Types[1].getAddressSpace(), Opcode); 409*e8d8bef9SDimitry Andric } 410*e8d8bef9SDimitry Andric 4110b57cec5SDimitry Andric AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 
4120b57cec5SDimitry Andric const GCNTargetMachine &TM) 4130b57cec5SDimitry Andric : ST(ST_) { 4140b57cec5SDimitry Andric using namespace TargetOpcode; 4150b57cec5SDimitry Andric 4160b57cec5SDimitry Andric auto GetAddrSpacePtr = [&TM](unsigned AS) { 4170b57cec5SDimitry Andric return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 4180b57cec5SDimitry Andric }; 4190b57cec5SDimitry Andric 4200b57cec5SDimitry Andric const LLT S1 = LLT::scalar(1); 421*e8d8bef9SDimitry Andric const LLT S8 = LLT::scalar(8); 4220b57cec5SDimitry Andric const LLT S16 = LLT::scalar(16); 4230b57cec5SDimitry Andric const LLT S32 = LLT::scalar(32); 4240b57cec5SDimitry Andric const LLT S64 = LLT::scalar(64); 4250b57cec5SDimitry Andric const LLT S128 = LLT::scalar(128); 4260b57cec5SDimitry Andric const LLT S256 = LLT::scalar(256); 4275ffd83dbSDimitry Andric const LLT S512 = LLT::scalar(512); 4285ffd83dbSDimitry Andric const LLT MaxScalar = LLT::scalar(MaxRegisterSize); 4290b57cec5SDimitry Andric 430*e8d8bef9SDimitry Andric const LLT V2S8 = LLT::vector(2, 8); 4310b57cec5SDimitry Andric const LLT V2S16 = LLT::vector(2, 16); 4320b57cec5SDimitry Andric const LLT V4S16 = LLT::vector(4, 16); 4330b57cec5SDimitry Andric 4340b57cec5SDimitry Andric const LLT V2S32 = LLT::vector(2, 32); 4350b57cec5SDimitry Andric const LLT V3S32 = LLT::vector(3, 32); 4360b57cec5SDimitry Andric const LLT V4S32 = LLT::vector(4, 32); 4370b57cec5SDimitry Andric const LLT V5S32 = LLT::vector(5, 32); 4380b57cec5SDimitry Andric const LLT V6S32 = LLT::vector(6, 32); 4390b57cec5SDimitry Andric const LLT V7S32 = LLT::vector(7, 32); 4400b57cec5SDimitry Andric const LLT V8S32 = LLT::vector(8, 32); 4410b57cec5SDimitry Andric const LLT V9S32 = LLT::vector(9, 32); 4420b57cec5SDimitry Andric const LLT V10S32 = LLT::vector(10, 32); 4430b57cec5SDimitry Andric const LLT V11S32 = LLT::vector(11, 32); 4440b57cec5SDimitry Andric const LLT V12S32 = LLT::vector(12, 32); 4450b57cec5SDimitry Andric const LLT V13S32 = LLT::vector(13, 32); 
4460b57cec5SDimitry Andric const LLT V14S32 = LLT::vector(14, 32); 4470b57cec5SDimitry Andric const LLT V15S32 = LLT::vector(15, 32); 4480b57cec5SDimitry Andric const LLT V16S32 = LLT::vector(16, 32); 4498bcb0991SDimitry Andric const LLT V32S32 = LLT::vector(32, 32); 4500b57cec5SDimitry Andric 4510b57cec5SDimitry Andric const LLT V2S64 = LLT::vector(2, 64); 4520b57cec5SDimitry Andric const LLT V3S64 = LLT::vector(3, 64); 4530b57cec5SDimitry Andric const LLT V4S64 = LLT::vector(4, 64); 4540b57cec5SDimitry Andric const LLT V5S64 = LLT::vector(5, 64); 4550b57cec5SDimitry Andric const LLT V6S64 = LLT::vector(6, 64); 4560b57cec5SDimitry Andric const LLT V7S64 = LLT::vector(7, 64); 4570b57cec5SDimitry Andric const LLT V8S64 = LLT::vector(8, 64); 4588bcb0991SDimitry Andric const LLT V16S64 = LLT::vector(16, 64); 4590b57cec5SDimitry Andric 4600b57cec5SDimitry Andric std::initializer_list<LLT> AllS32Vectors = 4610b57cec5SDimitry Andric {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 4628bcb0991SDimitry Andric V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 4630b57cec5SDimitry Andric std::initializer_list<LLT> AllS64Vectors = 4648bcb0991SDimitry Andric {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 4650b57cec5SDimitry Andric 4660b57cec5SDimitry Andric const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 4670b57cec5SDimitry Andric const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 4688bcb0991SDimitry Andric const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 4690b57cec5SDimitry Andric const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 4708bcb0991SDimitry Andric const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 4710b57cec5SDimitry Andric const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 4720b57cec5SDimitry Andric const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 4730b57cec5SDimitry Andric 4740b57cec5SDimitry Andric const LLT 
CodePtr = FlatPtr; 4750b57cec5SDimitry Andric 4760b57cec5SDimitry Andric const std::initializer_list<LLT> AddrSpaces64 = { 4770b57cec5SDimitry Andric GlobalPtr, ConstantPtr, FlatPtr 4780b57cec5SDimitry Andric }; 4790b57cec5SDimitry Andric 4800b57cec5SDimitry Andric const std::initializer_list<LLT> AddrSpaces32 = { 4818bcb0991SDimitry Andric LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 4820b57cec5SDimitry Andric }; 4830b57cec5SDimitry Andric 4840b57cec5SDimitry Andric const std::initializer_list<LLT> FPTypesBase = { 4850b57cec5SDimitry Andric S32, S64 4860b57cec5SDimitry Andric }; 4870b57cec5SDimitry Andric 4880b57cec5SDimitry Andric const std::initializer_list<LLT> FPTypes16 = { 4890b57cec5SDimitry Andric S32, S64, S16 4900b57cec5SDimitry Andric }; 4910b57cec5SDimitry Andric 4920b57cec5SDimitry Andric const std::initializer_list<LLT> FPTypesPK16 = { 4930b57cec5SDimitry Andric S32, S64, S16, V2S16 4940b57cec5SDimitry Andric }; 4950b57cec5SDimitry Andric 4965ffd83dbSDimitry Andric const LLT MinScalarFPTy = ST.has16BitInsts() ? 
S16 : S32; 4975ffd83dbSDimitry Andric 498480093f4SDimitry Andric setAction({G_BRCOND, S1}, Legal); // VCC branches 499480093f4SDimitry Andric setAction({G_BRCOND, S32}, Legal); // SCC branches 5000b57cec5SDimitry Andric 5010b57cec5SDimitry Andric // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 5020b57cec5SDimitry Andric // elements for v3s16 5030b57cec5SDimitry Andric getActionDefinitionsBuilder(G_PHI) 504*e8d8bef9SDimitry Andric .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256}) 5050b57cec5SDimitry Andric .legalFor(AllS32Vectors) 5060b57cec5SDimitry Andric .legalFor(AllS64Vectors) 5070b57cec5SDimitry Andric .legalFor(AddrSpaces64) 5080b57cec5SDimitry Andric .legalFor(AddrSpaces32) 509*e8d8bef9SDimitry Andric .legalIf(isPointer(0)) 510*e8d8bef9SDimitry Andric .clampScalar(0, S16, S256) 5110b57cec5SDimitry Andric .widenScalarToNextPow2(0, 32) 5120b57cec5SDimitry Andric .clampMaxNumElements(0, S32, 16) 5130b57cec5SDimitry Andric .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 514*e8d8bef9SDimitry Andric .scalarize(0); 5150b57cec5SDimitry Andric 516*e8d8bef9SDimitry Andric if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { 517*e8d8bef9SDimitry Andric // Full set of gfx9 features. 
5185ffd83dbSDimitry Andric getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 5195ffd83dbSDimitry Andric .legalFor({S32, S16, V2S16}) 5205ffd83dbSDimitry Andric .clampScalar(0, S16, S32) 5215ffd83dbSDimitry Andric .clampMaxNumElements(0, S16, 2) 5225ffd83dbSDimitry Andric .scalarize(0) 5235ffd83dbSDimitry Andric .widenScalarToNextPow2(0, 32); 524*e8d8bef9SDimitry Andric 525*e8d8bef9SDimitry Andric getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) 526*e8d8bef9SDimitry Andric .legalFor({S32, S16, V2S16}) // Clamp modifier 527*e8d8bef9SDimitry Andric .minScalarOrElt(0, S16) 528*e8d8bef9SDimitry Andric .clampMaxNumElements(0, S16, 2) 529*e8d8bef9SDimitry Andric .scalarize(0) 530*e8d8bef9SDimitry Andric .widenScalarToNextPow2(0, 32) 531*e8d8bef9SDimitry Andric .lower(); 5325ffd83dbSDimitry Andric } else if (ST.has16BitInsts()) { 5330b57cec5SDimitry Andric getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 5340b57cec5SDimitry Andric .legalFor({S32, S16}) 5350b57cec5SDimitry Andric .clampScalar(0, S16, S32) 5365ffd83dbSDimitry Andric .scalarize(0) 537*e8d8bef9SDimitry Andric .widenScalarToNextPow2(0, 32); // FIXME: min should be 16 538*e8d8bef9SDimitry Andric 539*e8d8bef9SDimitry Andric // Technically the saturating operations require clamp bit support, but this 540*e8d8bef9SDimitry Andric // was introduced at the same time as 16-bit operations. 541*e8d8bef9SDimitry Andric getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 542*e8d8bef9SDimitry Andric .legalFor({S32, S16}) // Clamp modifier 543*e8d8bef9SDimitry Andric .minScalar(0, S16) 544*e8d8bef9SDimitry Andric .scalarize(0) 545*e8d8bef9SDimitry Andric .widenScalarToNextPow2(0, 16) 546*e8d8bef9SDimitry Andric .lower(); 547*e8d8bef9SDimitry Andric 548*e8d8bef9SDimitry Andric // We're just lowering this, but it helps get a better result to try to 549*e8d8bef9SDimitry Andric // coerce to the desired type first. 
550*e8d8bef9SDimitry Andric getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 551*e8d8bef9SDimitry Andric .minScalar(0, S16) 552*e8d8bef9SDimitry Andric .scalarize(0) 553*e8d8bef9SDimitry Andric .lower(); 5540b57cec5SDimitry Andric } else { 5550b57cec5SDimitry Andric getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 5560b57cec5SDimitry Andric .legalFor({S32}) 5570b57cec5SDimitry Andric .clampScalar(0, S32, S32) 5580b57cec5SDimitry Andric .scalarize(0); 559*e8d8bef9SDimitry Andric 560*e8d8bef9SDimitry Andric if (ST.hasIntClamp()) { 561*e8d8bef9SDimitry Andric getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 562*e8d8bef9SDimitry Andric .legalFor({S32}) // Clamp modifier. 563*e8d8bef9SDimitry Andric .scalarize(0) 564*e8d8bef9SDimitry Andric .minScalarOrElt(0, S32) 565*e8d8bef9SDimitry Andric .lower(); 566*e8d8bef9SDimitry Andric } else { 567*e8d8bef9SDimitry Andric // Clamp bit support was added in VI, along with 16-bit operations. 568*e8d8bef9SDimitry Andric getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) 569*e8d8bef9SDimitry Andric .minScalar(0, S32) 570*e8d8bef9SDimitry Andric .scalarize(0) 571*e8d8bef9SDimitry Andric .lower(); 5720b57cec5SDimitry Andric } 5730b57cec5SDimitry Andric 574*e8d8bef9SDimitry Andric // FIXME: DAG expansion gets better results. The widening uses the smaller 575*e8d8bef9SDimitry Andric // range values and goes for the min/max lowering directly. 
576*e8d8bef9SDimitry Andric getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) 577*e8d8bef9SDimitry Andric .minScalar(0, S32) 578*e8d8bef9SDimitry Andric .scalarize(0) 579*e8d8bef9SDimitry Andric .lower(); 580*e8d8bef9SDimitry Andric } 581*e8d8bef9SDimitry Andric 582480093f4SDimitry Andric getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM}) 5835ffd83dbSDimitry Andric .customFor({S32, S64}) 584480093f4SDimitry Andric .clampScalar(0, S32, S64) 585480093f4SDimitry Andric .widenScalarToNextPow2(0, 32) 586480093f4SDimitry Andric .scalarize(0); 587480093f4SDimitry Andric 588*e8d8bef9SDimitry Andric auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 5890b57cec5SDimitry Andric .legalFor({S32}) 590*e8d8bef9SDimitry Andric .maxScalarOrElt(0, S32); 591*e8d8bef9SDimitry Andric 592*e8d8bef9SDimitry Andric if (ST.hasVOP3PInsts()) { 593*e8d8bef9SDimitry Andric Mulh 594*e8d8bef9SDimitry Andric .clampMaxNumElements(0, S8, 2) 595*e8d8bef9SDimitry Andric .lowerFor({V2S8}); 596*e8d8bef9SDimitry Andric } 597*e8d8bef9SDimitry Andric 598*e8d8bef9SDimitry Andric Mulh 599*e8d8bef9SDimitry Andric .scalarize(0) 600*e8d8bef9SDimitry Andric .lower(); 6010b57cec5SDimitry Andric 6020b57cec5SDimitry Andric // Report legal for any types we can handle anywhere. For the cases only legal 6030b57cec5SDimitry Andric // on the SALU, RegBankSelect will be able to re-legalize. 
6040b57cec5SDimitry Andric getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 6050b57cec5SDimitry Andric .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 6060b57cec5SDimitry Andric .clampScalar(0, S32, S64) 6070b57cec5SDimitry Andric .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 6088bcb0991SDimitry Andric .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 6090b57cec5SDimitry Andric .widenScalarToNextPow2(0) 6100b57cec5SDimitry Andric .scalarize(0); 6110b57cec5SDimitry Andric 6128bcb0991SDimitry Andric getActionDefinitionsBuilder({G_UADDO, G_USUBO, 6130b57cec5SDimitry Andric G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 614480093f4SDimitry Andric .legalFor({{S32, S1}, {S32, S32}}) 6155ffd83dbSDimitry Andric .minScalar(0, S32) 6165ffd83dbSDimitry Andric // TODO: .scalarize(0) 6178bcb0991SDimitry Andric .lower(); 6180b57cec5SDimitry Andric 6190b57cec5SDimitry Andric getActionDefinitionsBuilder(G_BITCAST) 6200b57cec5SDimitry Andric // Don't worry about the size constraint. 
6218bcb0991SDimitry Andric .legalIf(all(isRegisterType(0), isRegisterType(1))) 6225ffd83dbSDimitry Andric .lower(); 6230b57cec5SDimitry Andric 6240b57cec5SDimitry Andric 6250b57cec5SDimitry Andric getActionDefinitionsBuilder(G_CONSTANT) 6268bcb0991SDimitry Andric .legalFor({S1, S32, S64, S16, GlobalPtr, 6270b57cec5SDimitry Andric LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 628*e8d8bef9SDimitry Andric .legalIf(isPointer(0)) 6290b57cec5SDimitry Andric .clampScalar(0, S32, S64) 630*e8d8bef9SDimitry Andric .widenScalarToNextPow2(0); 6310b57cec5SDimitry Andric 6325ffd83dbSDimitry Andric getActionDefinitionsBuilder(G_FCONSTANT) 6335ffd83dbSDimitry Andric .legalFor({S32, S64, S16}) 6345ffd83dbSDimitry Andric .clampScalar(0, S16, S64); 6358bcb0991SDimitry Andric 6365ffd83dbSDimitry Andric getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) 6375ffd83dbSDimitry Andric .legalIf(isRegisterType(0)) 6385ffd83dbSDimitry Andric // s1 and s16 are special cases because they have legal operations on 6395ffd83dbSDimitry Andric // them, but don't really occupy registers in the normal way. 6405ffd83dbSDimitry Andric .legalFor({S1, S16}) 6415ffd83dbSDimitry Andric .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 6425ffd83dbSDimitry Andric .clampScalarOrElt(0, S32, MaxScalar) 6435ffd83dbSDimitry Andric .widenScalarToNextPow2(0, 32) 6445ffd83dbSDimitry Andric .clampMaxNumElements(0, S32, 16); 6455ffd83dbSDimitry Andric 6465ffd83dbSDimitry Andric setAction({G_FRAME_INDEX, PrivatePtr}, Legal); 6475ffd83dbSDimitry Andric 6485ffd83dbSDimitry Andric // If the amount is divergent, we have to do a wave reduction to get the 6495ffd83dbSDimitry Andric // maximum value, so this is expanded during RegBankSelect. 
6505ffd83dbSDimitry Andric getActionDefinitionsBuilder(G_DYN_STACKALLOC) 6515ffd83dbSDimitry Andric .legalFor({{PrivatePtr, S32}}); 6525ffd83dbSDimitry Andric 6535ffd83dbSDimitry Andric getActionDefinitionsBuilder(G_GLOBAL_VALUE) 654*e8d8bef9SDimitry Andric .customIf(typeIsNot(0, PrivatePtr)); 655*e8d8bef9SDimitry Andric 6565ffd83dbSDimitry Andric setAction({G_BLOCK_ADDR, CodePtr}, Legal); 6570b57cec5SDimitry Andric 6580b57cec5SDimitry Andric auto &FPOpActions = getActionDefinitionsBuilder( 6598bcb0991SDimitry Andric { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 6600b57cec5SDimitry Andric .legalFor({S32, S64}); 6618bcb0991SDimitry Andric auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 6628bcb0991SDimitry Andric .customFor({S32, S64}); 6638bcb0991SDimitry Andric auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 6648bcb0991SDimitry Andric .customFor({S32, S64}); 6650b57cec5SDimitry Andric 6660b57cec5SDimitry Andric if (ST.has16BitInsts()) { 6670b57cec5SDimitry Andric if (ST.hasVOP3PInsts()) 6680b57cec5SDimitry Andric FPOpActions.legalFor({S16, V2S16}); 6690b57cec5SDimitry Andric else 6700b57cec5SDimitry Andric FPOpActions.legalFor({S16}); 6718bcb0991SDimitry Andric 6728bcb0991SDimitry Andric TrigActions.customFor({S16}); 6738bcb0991SDimitry Andric FDIVActions.customFor({S16}); 6740b57cec5SDimitry Andric } 6750b57cec5SDimitry Andric 6760b57cec5SDimitry Andric auto &MinNumMaxNum = getActionDefinitionsBuilder({ 6770b57cec5SDimitry Andric G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 6780b57cec5SDimitry Andric 6790b57cec5SDimitry Andric if (ST.hasVOP3PInsts()) { 6800b57cec5SDimitry Andric MinNumMaxNum.customFor(FPTypesPK16) 681480093f4SDimitry Andric .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 6820b57cec5SDimitry Andric .clampMaxNumElements(0, S16, 2) 6830b57cec5SDimitry Andric .clampScalar(0, S16, S64) 6840b57cec5SDimitry Andric .scalarize(0); 6850b57cec5SDimitry Andric } else if (ST.has16BitInsts()) { 6860b57cec5SDimitry 
Andric MinNumMaxNum.customFor(FPTypes16) 6870b57cec5SDimitry Andric .clampScalar(0, S16, S64) 6880b57cec5SDimitry Andric .scalarize(0); 6890b57cec5SDimitry Andric } else { 6900b57cec5SDimitry Andric MinNumMaxNum.customFor(FPTypesBase) 6910b57cec5SDimitry Andric .clampScalar(0, S32, S64) 6920b57cec5SDimitry Andric .scalarize(0); 6930b57cec5SDimitry Andric } 6940b57cec5SDimitry Andric 6950b57cec5SDimitry Andric if (ST.hasVOP3PInsts()) 6960b57cec5SDimitry Andric FPOpActions.clampMaxNumElements(0, S16, 2); 6978bcb0991SDimitry Andric 6980b57cec5SDimitry Andric FPOpActions 6990b57cec5SDimitry Andric .scalarize(0) 7000b57cec5SDimitry Andric .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 7010b57cec5SDimitry Andric 7028bcb0991SDimitry Andric TrigActions 7038bcb0991SDimitry Andric .scalarize(0) 7048bcb0991SDimitry Andric .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 7058bcb0991SDimitry Andric 7068bcb0991SDimitry Andric FDIVActions 7078bcb0991SDimitry Andric .scalarize(0) 7088bcb0991SDimitry Andric .clampScalar(0, ST.has16BitInsts() ? 
S16 : S32, S64); 7098bcb0991SDimitry Andric 7108bcb0991SDimitry Andric getActionDefinitionsBuilder({G_FNEG, G_FABS}) 7118bcb0991SDimitry Andric .legalFor(FPTypesPK16) 7128bcb0991SDimitry Andric .clampMaxNumElements(0, S16, 2) 7138bcb0991SDimitry Andric .scalarize(0) 7148bcb0991SDimitry Andric .clampScalar(0, S16, S64); 7158bcb0991SDimitry Andric 7160b57cec5SDimitry Andric if (ST.has16BitInsts()) { 7178bcb0991SDimitry Andric getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 7180b57cec5SDimitry Andric .legalFor({S32, S64, S16}) 7190b57cec5SDimitry Andric .scalarize(0) 7200b57cec5SDimitry Andric .clampScalar(0, S16, S64); 7210b57cec5SDimitry Andric } else { 7225ffd83dbSDimitry Andric getActionDefinitionsBuilder(G_FSQRT) 7235ffd83dbSDimitry Andric .legalFor({S32, S64}) 7245ffd83dbSDimitry Andric .scalarize(0) 7255ffd83dbSDimitry Andric .clampScalar(0, S32, S64); 7265ffd83dbSDimitry Andric 7275ffd83dbSDimitry Andric if (ST.hasFractBug()) { 7285ffd83dbSDimitry Andric getActionDefinitionsBuilder(G_FFLOOR) 7295ffd83dbSDimitry Andric .customFor({S64}) 7305ffd83dbSDimitry Andric .legalFor({S32, S64}) 7315ffd83dbSDimitry Andric .scalarize(0) 7325ffd83dbSDimitry Andric .clampScalar(0, S32, S64); 7335ffd83dbSDimitry Andric } else { 7345ffd83dbSDimitry Andric getActionDefinitionsBuilder(G_FFLOOR) 7350b57cec5SDimitry Andric .legalFor({S32, S64}) 7360b57cec5SDimitry Andric .scalarize(0) 7370b57cec5SDimitry Andric .clampScalar(0, S32, S64); 7380b57cec5SDimitry Andric } 7395ffd83dbSDimitry Andric } 7400b57cec5SDimitry Andric 7410b57cec5SDimitry Andric getActionDefinitionsBuilder(G_FPTRUNC) 7420b57cec5SDimitry Andric .legalFor({{S32, S64}, {S16, S32}}) 7435ffd83dbSDimitry Andric .scalarize(0) 7445ffd83dbSDimitry Andric .lower(); 7450b57cec5SDimitry Andric 7460b57cec5SDimitry Andric getActionDefinitionsBuilder(G_FPEXT) 7470b57cec5SDimitry Andric .legalFor({{S64, S32}, {S32, S16}}) 748*e8d8bef9SDimitry Andric .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) 7490b57cec5SDimitry Andric 
.scalarize(0); 7500b57cec5SDimitry Andric 7510b57cec5SDimitry Andric getActionDefinitionsBuilder(G_FSUB) 7520b57cec5SDimitry Andric // Use actual fsub instruction 7530b57cec5SDimitry Andric .legalFor({S32}) 7540b57cec5SDimitry Andric // Must use fadd + fneg 7550b57cec5SDimitry Andric .lowerFor({S64, S16, V2S16}) 7560b57cec5SDimitry Andric .scalarize(0) 7570b57cec5SDimitry Andric .clampScalar(0, S32, S64); 7580b57cec5SDimitry Andric 7598bcb0991SDimitry Andric // Whether this is legal depends on the floating point mode for the function. 7608bcb0991SDimitry Andric auto &FMad = getActionDefinitionsBuilder(G_FMAD); 7615ffd83dbSDimitry Andric if (ST.hasMadF16() && ST.hasMadMacF32Insts()) 7628bcb0991SDimitry Andric FMad.customFor({S32, S16}); 7635ffd83dbSDimitry Andric else if (ST.hasMadMacF32Insts()) 7648bcb0991SDimitry Andric FMad.customFor({S32}); 7655ffd83dbSDimitry Andric else if (ST.hasMadF16()) 7665ffd83dbSDimitry Andric FMad.customFor({S16}); 7678bcb0991SDimitry Andric FMad.scalarize(0) 7688bcb0991SDimitry Andric .lower(); 7698bcb0991SDimitry Andric 770*e8d8bef9SDimitry Andric auto &FRem = getActionDefinitionsBuilder(G_FREM); 771*e8d8bef9SDimitry Andric if (ST.has16BitInsts()) { 772*e8d8bef9SDimitry Andric FRem.customFor({S16, S32, S64}); 773*e8d8bef9SDimitry Andric } else { 774*e8d8bef9SDimitry Andric FRem.minScalar(0, S32) 775*e8d8bef9SDimitry Andric .customFor({S32, S64}); 776*e8d8bef9SDimitry Andric } 777*e8d8bef9SDimitry Andric FRem.scalarize(0); 778*e8d8bef9SDimitry Andric 7795ffd83dbSDimitry Andric // TODO: Do we need to clamp maximum bitwidth? 7805ffd83dbSDimitry Andric getActionDefinitionsBuilder(G_TRUNC) 7815ffd83dbSDimitry Andric .legalIf(isScalar(0)) 7825ffd83dbSDimitry Andric .legalFor({{V2S16, V2S32}}) 7835ffd83dbSDimitry Andric .clampMaxNumElements(0, S16, 2) 7845ffd83dbSDimitry Andric // Avoid scalarizing in cases that should be truly illegal. 
In unresolvable 7855ffd83dbSDimitry Andric // situations (like an invalid implicit use), we don't want to infinite loop 7865ffd83dbSDimitry Andric // in the legalizer. 7875ffd83dbSDimitry Andric .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) 7885ffd83dbSDimitry Andric .alwaysLegal(); 7895ffd83dbSDimitry Andric 7900b57cec5SDimitry Andric getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 7910b57cec5SDimitry Andric .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 7925ffd83dbSDimitry Andric {S32, S1}, {S64, S1}, {S16, S1}}) 793480093f4SDimitry Andric .scalarize(0) 7945ffd83dbSDimitry Andric .clampScalar(0, S32, S64) 7955ffd83dbSDimitry Andric .widenScalarToNextPow2(1, 32); 7960b57cec5SDimitry Andric 7978bcb0991SDimitry Andric // TODO: Split s1->s64 during regbankselect for VALU. 7988bcb0991SDimitry Andric auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 799480093f4SDimitry Andric .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 8000b57cec5SDimitry Andric .lowerFor({{S32, S64}}) 801480093f4SDimitry Andric .lowerIf(typeIs(1, S1)) 8028bcb0991SDimitry Andric .customFor({{S64, S64}}); 8038bcb0991SDimitry Andric if (ST.has16BitInsts()) 8048bcb0991SDimitry Andric IToFP.legalFor({{S16, S16}}); 8058bcb0991SDimitry Andric IToFP.clampScalar(1, S32, S64) 806*e8d8bef9SDimitry Andric .minScalar(0, S32) 8075ffd83dbSDimitry Andric .scalarize(0) 8085ffd83dbSDimitry Andric .widenScalarToNextPow2(1); 8090b57cec5SDimitry Andric 8108bcb0991SDimitry Andric auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 8115ffd83dbSDimitry Andric .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) 812*e8d8bef9SDimitry Andric .customFor({{S64, S64}}) 813*e8d8bef9SDimitry Andric .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); 8148bcb0991SDimitry Andric if (ST.has16BitInsts()) 8158bcb0991SDimitry Andric FPToI.legalFor({{S16, S16}}); 8168bcb0991SDimitry Andric else 8178bcb0991SDimitry Andric FPToI.minScalar(1, S32); 8188bcb0991SDimitry Andric 
8198bcb0991SDimitry Andric FPToI.minScalar(0, S32) 8205ffd83dbSDimitry Andric .scalarize(0) 8215ffd83dbSDimitry Andric .lower(); 8220b57cec5SDimitry Andric 823*e8d8bef9SDimitry Andric // Lower roundeven into G_FRINT 824*e8d8bef9SDimitry Andric getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN}) 825480093f4SDimitry Andric .scalarize(0) 826480093f4SDimitry Andric .lower(); 8270b57cec5SDimitry Andric 828480093f4SDimitry Andric if (ST.has16BitInsts()) { 829480093f4SDimitry Andric getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 830480093f4SDimitry Andric .legalFor({S16, S32, S64}) 831480093f4SDimitry Andric .clampScalar(0, S16, S64) 832480093f4SDimitry Andric .scalarize(0); 833480093f4SDimitry Andric } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 8340b57cec5SDimitry Andric getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 8350b57cec5SDimitry Andric .legalFor({S32, S64}) 8360b57cec5SDimitry Andric .clampScalar(0, S32, S64) 8370b57cec5SDimitry Andric .scalarize(0); 8380b57cec5SDimitry Andric } else { 8390b57cec5SDimitry Andric getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 8400b57cec5SDimitry Andric .legalFor({S32}) 8410b57cec5SDimitry Andric .customFor({S64}) 8420b57cec5SDimitry Andric .clampScalar(0, S32, S64) 8430b57cec5SDimitry Andric .scalarize(0); 8440b57cec5SDimitry Andric } 8450b57cec5SDimitry Andric 846480093f4SDimitry Andric getActionDefinitionsBuilder(G_PTR_ADD) 847*e8d8bef9SDimitry Andric .legalIf(all(isPointer(0), sameSize(0, 1))) 848*e8d8bef9SDimitry Andric .scalarize(0) 849*e8d8bef9SDimitry Andric .scalarSameSizeAs(1, 0); 8500b57cec5SDimitry Andric 8515ffd83dbSDimitry Andric getActionDefinitionsBuilder(G_PTRMASK) 852*e8d8bef9SDimitry Andric .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) 853*e8d8bef9SDimitry Andric .scalarSameSizeAs(1, 0) 8545ffd83dbSDimitry Andric .scalarize(0); 8550b57cec5SDimitry Andric 8560b57cec5SDimitry Andric auto &CmpBuilder = 
8570b57cec5SDimitry Andric getActionDefinitionsBuilder(G_ICMP) 858480093f4SDimitry Andric // The compare output type differs based on the register bank of the output, 859480093f4SDimitry Andric // so make both s1 and s32 legal. 860480093f4SDimitry Andric // 861480093f4SDimitry Andric // Scalar compares producing output in scc will be promoted to s32, as that 862480093f4SDimitry Andric // is the allocatable register type that will be needed for the copy from 863480093f4SDimitry Andric // scc. This will be promoted during RegBankSelect, and we assume something 864480093f4SDimitry Andric // before that won't try to use s32 result types. 865480093f4SDimitry Andric // 866480093f4SDimitry Andric // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 867480093f4SDimitry Andric // bank. 8680b57cec5SDimitry Andric .legalForCartesianProduct( 8690b57cec5SDimitry Andric {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 870480093f4SDimitry Andric .legalForCartesianProduct( 871480093f4SDimitry Andric {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 8720b57cec5SDimitry Andric if (ST.has16BitInsts()) { 8730b57cec5SDimitry Andric CmpBuilder.legalFor({{S1, S16}}); 8740b57cec5SDimitry Andric } 8750b57cec5SDimitry Andric 8760b57cec5SDimitry Andric CmpBuilder 8770b57cec5SDimitry Andric .widenScalarToNextPow2(1) 8780b57cec5SDimitry Andric .clampScalar(1, S32, S64) 8790b57cec5SDimitry Andric .scalarize(0) 880480093f4SDimitry Andric .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 8810b57cec5SDimitry Andric 8820b57cec5SDimitry Andric getActionDefinitionsBuilder(G_FCMP) 8830b57cec5SDimitry Andric .legalForCartesianProduct({S1}, ST.has16BitInsts() ? 
FPTypes16 : FPTypesBase) 8840b57cec5SDimitry Andric .widenScalarToNextPow2(1) 8850b57cec5SDimitry Andric .clampScalar(1, S32, S64) 8860b57cec5SDimitry Andric .scalarize(0); 8870b57cec5SDimitry Andric 8885ffd83dbSDimitry Andric // FIXME: fpow has a selection pattern that should move to custom lowering. 8895ffd83dbSDimitry Andric auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); 8905ffd83dbSDimitry Andric if (ST.has16BitInsts()) 8915ffd83dbSDimitry Andric Exp2Ops.legalFor({S32, S16}); 8925ffd83dbSDimitry Andric else 8935ffd83dbSDimitry Andric Exp2Ops.legalFor({S32}); 8945ffd83dbSDimitry Andric Exp2Ops.clampScalar(0, MinScalarFPTy, S32); 8955ffd83dbSDimitry Andric Exp2Ops.scalarize(0); 8965ffd83dbSDimitry Andric 8975ffd83dbSDimitry Andric auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); 8985ffd83dbSDimitry Andric if (ST.has16BitInsts()) 8995ffd83dbSDimitry Andric ExpOps.customFor({{S32}, {S16}}); 9005ffd83dbSDimitry Andric else 9015ffd83dbSDimitry Andric ExpOps.customFor({S32}); 9025ffd83dbSDimitry Andric ExpOps.clampScalar(0, MinScalarFPTy, S32) 9030b57cec5SDimitry Andric .scalarize(0); 9040b57cec5SDimitry Andric 905*e8d8bef9SDimitry Andric getActionDefinitionsBuilder(G_FPOWI) 906*e8d8bef9SDimitry Andric .clampScalar(0, MinScalarFPTy, S32) 907*e8d8bef9SDimitry Andric .lower(); 908*e8d8bef9SDimitry Andric 9090b57cec5SDimitry Andric // The 64-bit versions produce 32-bit results, but only on the SALU. 
9105ffd83dbSDimitry Andric getActionDefinitionsBuilder(G_CTPOP) 9110b57cec5SDimitry Andric .legalFor({{S32, S32}, {S32, S64}}) 9120b57cec5SDimitry Andric .clampScalar(0, S32, S32) 9130b57cec5SDimitry Andric .clampScalar(1, S32, S64) 9140b57cec5SDimitry Andric .scalarize(0) 9150b57cec5SDimitry Andric .widenScalarToNextPow2(0, 32) 9160b57cec5SDimitry Andric .widenScalarToNextPow2(1, 32); 9170b57cec5SDimitry Andric 9185ffd83dbSDimitry Andric // The hardware instructions return a different result on 0 than the generic 9195ffd83dbSDimitry Andric // instructions expect. The hardware produces -1, but these produce the 9205ffd83dbSDimitry Andric // bitwidth. 9215ffd83dbSDimitry Andric getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) 9225ffd83dbSDimitry Andric .scalarize(0) 9235ffd83dbSDimitry Andric .clampScalar(0, S32, S32) 9245ffd83dbSDimitry Andric .clampScalar(1, S32, S64) 9255ffd83dbSDimitry Andric .widenScalarToNextPow2(0, 32) 9265ffd83dbSDimitry Andric .widenScalarToNextPow2(1, 32) 9275ffd83dbSDimitry Andric .lower(); 9285ffd83dbSDimitry Andric 9295ffd83dbSDimitry Andric // The 64-bit versions produce 32-bit results, but only on the SALU. 
9305ffd83dbSDimitry Andric getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) 9315ffd83dbSDimitry Andric .legalFor({{S32, S32}, {S32, S64}}) 9325ffd83dbSDimitry Andric .clampScalar(0, S32, S32) 9335ffd83dbSDimitry Andric .clampScalar(1, S32, S64) 9345ffd83dbSDimitry Andric .scalarize(0) 9355ffd83dbSDimitry Andric .widenScalarToNextPow2(0, 32) 9365ffd83dbSDimitry Andric .widenScalarToNextPow2(1, 32); 9375ffd83dbSDimitry Andric 9385ffd83dbSDimitry Andric getActionDefinitionsBuilder(G_BITREVERSE) 9390b57cec5SDimitry Andric .legalFor({S32}) 9400b57cec5SDimitry Andric .clampScalar(0, S32, S32) 9410b57cec5SDimitry Andric .scalarize(0); 9420b57cec5SDimitry Andric 9430b57cec5SDimitry Andric if (ST.has16BitInsts()) { 9445ffd83dbSDimitry Andric getActionDefinitionsBuilder(G_BSWAP) 9455ffd83dbSDimitry Andric .legalFor({S16, S32, V2S16}) 9465ffd83dbSDimitry Andric .clampMaxNumElements(0, S16, 2) 9475ffd83dbSDimitry Andric // FIXME: Fixing non-power-of-2 before clamp is workaround for 9485ffd83dbSDimitry Andric // narrowScalar limitation. 
9495ffd83dbSDimitry Andric .widenScalarToNextPow2(0) 9505ffd83dbSDimitry Andric .clampScalar(0, S16, S32) 9515ffd83dbSDimitry Andric .scalarize(0); 9525ffd83dbSDimitry Andric 9530b57cec5SDimitry Andric if (ST.hasVOP3PInsts()) { 9540b57cec5SDimitry Andric getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 9550b57cec5SDimitry Andric .legalFor({S32, S16, V2S16}) 9560b57cec5SDimitry Andric .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 9570b57cec5SDimitry Andric .clampMaxNumElements(0, S16, 2) 9585ffd83dbSDimitry Andric .minScalar(0, S16) 9590b57cec5SDimitry Andric .widenScalarToNextPow2(0) 9605ffd83dbSDimitry Andric .scalarize(0) 9615ffd83dbSDimitry Andric .lower(); 9620b57cec5SDimitry Andric } else { 9630b57cec5SDimitry Andric getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 9640b57cec5SDimitry Andric .legalFor({S32, S16}) 9650b57cec5SDimitry Andric .widenScalarToNextPow2(0) 9665ffd83dbSDimitry Andric .minScalar(0, S16) 9675ffd83dbSDimitry Andric .scalarize(0) 9685ffd83dbSDimitry Andric .lower(); 9690b57cec5SDimitry Andric } 9700b57cec5SDimitry Andric } else { 9715ffd83dbSDimitry Andric // TODO: Should have same legality without v_perm_b32 9725ffd83dbSDimitry Andric getActionDefinitionsBuilder(G_BSWAP) 9735ffd83dbSDimitry Andric .legalFor({S32}) 9745ffd83dbSDimitry Andric .lowerIf(scalarNarrowerThan(0, 32)) 9755ffd83dbSDimitry Andric // FIXME: Fixing non-power-of-2 before clamp is workaround for 9765ffd83dbSDimitry Andric // narrowScalar limitation. 
9775ffd83dbSDimitry Andric .widenScalarToNextPow2(0) 9785ffd83dbSDimitry Andric .maxScalar(0, S32) 9795ffd83dbSDimitry Andric .scalarize(0) 9805ffd83dbSDimitry Andric .lower(); 9815ffd83dbSDimitry Andric 9820b57cec5SDimitry Andric getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 9830b57cec5SDimitry Andric .legalFor({S32}) 9845ffd83dbSDimitry Andric .minScalar(0, S32) 9850b57cec5SDimitry Andric .widenScalarToNextPow2(0) 9865ffd83dbSDimitry Andric .scalarize(0) 9875ffd83dbSDimitry Andric .lower(); 9880b57cec5SDimitry Andric } 9890b57cec5SDimitry Andric 9900b57cec5SDimitry Andric getActionDefinitionsBuilder(G_INTTOPTR) 9910b57cec5SDimitry Andric // List the common cases 9920b57cec5SDimitry Andric .legalForCartesianProduct(AddrSpaces64, {S64}) 9930b57cec5SDimitry Andric .legalForCartesianProduct(AddrSpaces32, {S32}) 9940b57cec5SDimitry Andric .scalarize(0) 9950b57cec5SDimitry Andric // Accept any address space as long as the size matches 9960b57cec5SDimitry Andric .legalIf(sameSize(0, 1)) 9970b57cec5SDimitry Andric .widenScalarIf(smallerThan(1, 0), 9980b57cec5SDimitry Andric [](const LegalityQuery &Query) { 9990b57cec5SDimitry Andric return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 10000b57cec5SDimitry Andric }) 10015ffd83dbSDimitry Andric .narrowScalarIf(largerThan(1, 0), 10020b57cec5SDimitry Andric [](const LegalityQuery &Query) { 10030b57cec5SDimitry Andric return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 10040b57cec5SDimitry Andric }); 10050b57cec5SDimitry Andric 10060b57cec5SDimitry Andric getActionDefinitionsBuilder(G_PTRTOINT) 10070b57cec5SDimitry Andric // List the common cases 10080b57cec5SDimitry Andric .legalForCartesianProduct(AddrSpaces64, {S64}) 10090b57cec5SDimitry Andric .legalForCartesianProduct(AddrSpaces32, {S32}) 10100b57cec5SDimitry Andric .scalarize(0) 10110b57cec5SDimitry Andric // Accept any address space as long as the size matches 10120b57cec5SDimitry Andric .legalIf(sameSize(0, 1)) 
10130b57cec5SDimitry Andric .widenScalarIf(smallerThan(0, 1), 10140b57cec5SDimitry Andric [](const LegalityQuery &Query) { 10150b57cec5SDimitry Andric return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 10160b57cec5SDimitry Andric }) 10170b57cec5SDimitry Andric .narrowScalarIf( 10185ffd83dbSDimitry Andric largerThan(0, 1), 10190b57cec5SDimitry Andric [](const LegalityQuery &Query) { 10200b57cec5SDimitry Andric return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 10210b57cec5SDimitry Andric }); 10220b57cec5SDimitry Andric 10230b57cec5SDimitry Andric getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 10240b57cec5SDimitry Andric .scalarize(0) 10250b57cec5SDimitry Andric .custom(); 10260b57cec5SDimitry Andric 10275ffd83dbSDimitry Andric const auto needToSplitMemOp = [=](const LegalityQuery &Query, 10285ffd83dbSDimitry Andric bool IsLoad) -> bool { 10298bcb0991SDimitry Andric const LLT DstTy = Query.Types[0]; 10308bcb0991SDimitry Andric 10318bcb0991SDimitry Andric // Split vector extloads. 
10328bcb0991SDimitry Andric unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1033*e8d8bef9SDimitry Andric unsigned AlignBits = Query.MMODescrs[0].AlignInBits; 1034480093f4SDimitry Andric 1035480093f4SDimitry Andric if (MemSize < DstTy.getSizeInBits()) 1036*e8d8bef9SDimitry Andric MemSize = std::max(MemSize, AlignBits); 1037480093f4SDimitry Andric 10388bcb0991SDimitry Andric if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 10398bcb0991SDimitry Andric return true; 10408bcb0991SDimitry Andric 10418bcb0991SDimitry Andric const LLT PtrTy = Query.Types[1]; 10428bcb0991SDimitry Andric unsigned AS = PtrTy.getAddressSpace(); 10435ffd83dbSDimitry Andric if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) 10448bcb0991SDimitry Andric return true; 10458bcb0991SDimitry Andric 10468bcb0991SDimitry Andric // Catch weird sized loads that don't evenly divide into the access sizes 10478bcb0991SDimitry Andric // TODO: May be able to widen depending on alignment etc. 10485ffd83dbSDimitry Andric unsigned NumRegs = (MemSize + 31) / 32; 10495ffd83dbSDimitry Andric if (NumRegs == 3) { 10505ffd83dbSDimitry Andric if (!ST.hasDwordx3LoadStores()) 10518bcb0991SDimitry Andric return true; 10525ffd83dbSDimitry Andric } else { 10535ffd83dbSDimitry Andric // If the alignment allows, these should have been widened. 10545ffd83dbSDimitry Andric if (!isPowerOf2_32(NumRegs)) 10555ffd83dbSDimitry Andric return true; 10565ffd83dbSDimitry Andric } 10578bcb0991SDimitry Andric 1058*e8d8bef9SDimitry Andric if (AlignBits < MemSize) { 10598bcb0991SDimitry Andric const SITargetLowering *TLI = ST.getTargetLowering(); 1060*e8d8bef9SDimitry Andric return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, 1061*e8d8bef9SDimitry Andric Align(AlignBits / 8)); 10628bcb0991SDimitry Andric } 10638bcb0991SDimitry Andric 10648bcb0991SDimitry Andric return false; 10658bcb0991SDimitry Andric }; 10668bcb0991SDimitry Andric 1067*e8d8bef9SDimitry Andric unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 
0 : 32; 1068*e8d8bef9SDimitry Andric unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16; 1069*e8d8bef9SDimitry Andric unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8; 10708bcb0991SDimitry Andric 10718bcb0991SDimitry Andric // TODO: Refine based on subtargets which support unaligned access or 128-bit 10728bcb0991SDimitry Andric // LDS 10738bcb0991SDimitry Andric // TODO: Unsupported flat for SI. 10748bcb0991SDimitry Andric 10758bcb0991SDimitry Andric for (unsigned Op : {G_LOAD, G_STORE}) { 10768bcb0991SDimitry Andric const bool IsStore = Op == G_STORE; 10778bcb0991SDimitry Andric 10788bcb0991SDimitry Andric auto &Actions = getActionDefinitionsBuilder(Op); 10795ffd83dbSDimitry Andric // Explicitly list some common cases. 10805ffd83dbSDimitry Andric // TODO: Does this help compile time at all? 10818bcb0991SDimitry Andric Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 10828bcb0991SDimitry Andric {V2S32, GlobalPtr, 64, GlobalAlign32}, 10838bcb0991SDimitry Andric {V4S32, GlobalPtr, 128, GlobalAlign32}, 10848bcb0991SDimitry Andric {S64, GlobalPtr, 64, GlobalAlign32}, 10858bcb0991SDimitry Andric {V2S64, GlobalPtr, 128, GlobalAlign32}, 10868bcb0991SDimitry Andric {V2S16, GlobalPtr, 32, GlobalAlign32}, 10878bcb0991SDimitry Andric {S32, GlobalPtr, 8, GlobalAlign8}, 10888bcb0991SDimitry Andric {S32, GlobalPtr, 16, GlobalAlign16}, 10898bcb0991SDimitry Andric 10908bcb0991SDimitry Andric {S32, LocalPtr, 32, 32}, 10918bcb0991SDimitry Andric {S64, LocalPtr, 64, 32}, 10928bcb0991SDimitry Andric {V2S32, LocalPtr, 64, 32}, 10938bcb0991SDimitry Andric {S32, LocalPtr, 8, 8}, 10948bcb0991SDimitry Andric {S32, LocalPtr, 16, 16}, 10958bcb0991SDimitry Andric {V2S16, LocalPtr, 32, 32}, 10968bcb0991SDimitry Andric 10978bcb0991SDimitry Andric {S32, PrivatePtr, 32, 32}, 10988bcb0991SDimitry Andric {S32, PrivatePtr, 8, 8}, 10998bcb0991SDimitry Andric {S32, PrivatePtr, 16, 16}, 11008bcb0991SDimitry Andric {V2S16, PrivatePtr, 32, 
32}, 11018bcb0991SDimitry Andric 11028bcb0991SDimitry Andric {S32, ConstantPtr, 32, GlobalAlign32}, 11038bcb0991SDimitry Andric {V2S32, ConstantPtr, 64, GlobalAlign32}, 11048bcb0991SDimitry Andric {V4S32, ConstantPtr, 128, GlobalAlign32}, 11058bcb0991SDimitry Andric {S64, ConstantPtr, 64, GlobalAlign32}, 11068bcb0991SDimitry Andric {V2S32, ConstantPtr, 32, GlobalAlign32}}); 11075ffd83dbSDimitry Andric Actions.legalIf( 11085ffd83dbSDimitry Andric [=](const LegalityQuery &Query) -> bool { 11095ffd83dbSDimitry Andric return isLoadStoreLegal(ST, Query, Op); 11105ffd83dbSDimitry Andric }); 11115ffd83dbSDimitry Andric 11125ffd83dbSDimitry Andric // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to 11135ffd83dbSDimitry Andric // 64-bits. 11145ffd83dbSDimitry Andric // 11155ffd83dbSDimitry Andric // TODO: Should generalize bitcast action into coerce, which will also cover 11165ffd83dbSDimitry Andric // inserting addrspacecasts. 11175ffd83dbSDimitry Andric Actions.customIf(typeIs(1, Constant32Ptr)); 11185ffd83dbSDimitry Andric 11195ffd83dbSDimitry Andric // Turn any illegal element vectors into something easier to deal 11205ffd83dbSDimitry Andric // with. These will ultimately produce 32-bit scalar shifts to extract the 11215ffd83dbSDimitry Andric // parts anyway. 11225ffd83dbSDimitry Andric // 11235ffd83dbSDimitry Andric // For odd 16-bit element vectors, prefer to split those into pieces with 11245ffd83dbSDimitry Andric // 16-bit vector parts. 11255ffd83dbSDimitry Andric Actions.bitcastIf( 11265ffd83dbSDimitry Andric [=](const LegalityQuery &Query) -> bool { 1127*e8d8bef9SDimitry Andric return shouldBitcastLoadStoreType(ST, Query.Types[0], 1128*e8d8bef9SDimitry Andric Query.MMODescrs[0].SizeInBits); 11295ffd83dbSDimitry Andric }, bitcastToRegisterType(0)); 11305ffd83dbSDimitry Andric 1131*e8d8bef9SDimitry Andric if (!IsStore) { 1132*e8d8bef9SDimitry Andric // Widen suitably aligned loads by loading extra bytes. 
The standard 1133*e8d8bef9SDimitry Andric // legalization actions can't properly express widening memory operands. 1134*e8d8bef9SDimitry Andric Actions.customIf([=](const LegalityQuery &Query) -> bool { 1135*e8d8bef9SDimitry Andric return shouldWidenLoad(ST, Query, G_LOAD); 1136*e8d8bef9SDimitry Andric }); 1137*e8d8bef9SDimitry Andric } 1138*e8d8bef9SDimitry Andric 1139*e8d8bef9SDimitry Andric // FIXME: load/store narrowing should be moved to lower action 11408bcb0991SDimitry Andric Actions 11418bcb0991SDimitry Andric .narrowScalarIf( 11428bcb0991SDimitry Andric [=](const LegalityQuery &Query) -> bool { 11435ffd83dbSDimitry Andric return !Query.Types[0].isVector() && 11445ffd83dbSDimitry Andric needToSplitMemOp(Query, Op == G_LOAD); 11458bcb0991SDimitry Andric }, 11468bcb0991SDimitry Andric [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 11478bcb0991SDimitry Andric const LLT DstTy = Query.Types[0]; 11488bcb0991SDimitry Andric const LLT PtrTy = Query.Types[1]; 11498bcb0991SDimitry Andric 11508bcb0991SDimitry Andric const unsigned DstSize = DstTy.getSizeInBits(); 11518bcb0991SDimitry Andric unsigned MemSize = Query.MMODescrs[0].SizeInBits; 11528bcb0991SDimitry Andric 11538bcb0991SDimitry Andric // Split extloads. 11548bcb0991SDimitry Andric if (DstSize > MemSize) 11558bcb0991SDimitry Andric return std::make_pair(0, LLT::scalar(MemSize)); 11568bcb0991SDimitry Andric 11575ffd83dbSDimitry Andric if (!isPowerOf2_32(DstSize)) { 11585ffd83dbSDimitry Andric // We're probably decomposing an odd sized store. Try to split 11595ffd83dbSDimitry Andric // to the widest type. TODO: Account for alignment. As-is it 11605ffd83dbSDimitry Andric // should be OK, since the new parts will be further legalized. 
11615ffd83dbSDimitry Andric unsigned FloorSize = PowerOf2Floor(DstSize); 11625ffd83dbSDimitry Andric return std::make_pair(0, LLT::scalar(FloorSize)); 11635ffd83dbSDimitry Andric } 11645ffd83dbSDimitry Andric 11658bcb0991SDimitry Andric if (DstSize > 32 && (DstSize % 32 != 0)) { 11668bcb0991SDimitry Andric // FIXME: Need a way to specify non-extload of larger size if 11678bcb0991SDimitry Andric // suitably aligned. 11688bcb0991SDimitry Andric return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 11698bcb0991SDimitry Andric } 11708bcb0991SDimitry Andric 11715ffd83dbSDimitry Andric unsigned MaxSize = maxSizeForAddrSpace(ST, 11725ffd83dbSDimitry Andric PtrTy.getAddressSpace(), 11735ffd83dbSDimitry Andric Op == G_LOAD); 11748bcb0991SDimitry Andric if (MemSize > MaxSize) 11758bcb0991SDimitry Andric return std::make_pair(0, LLT::scalar(MaxSize)); 11768bcb0991SDimitry Andric 11778bcb0991SDimitry Andric unsigned Align = Query.MMODescrs[0].AlignInBits; 11788bcb0991SDimitry Andric return std::make_pair(0, LLT::scalar(Align)); 11798bcb0991SDimitry Andric }) 11808bcb0991SDimitry Andric .fewerElementsIf( 11818bcb0991SDimitry Andric [=](const LegalityQuery &Query) -> bool { 11825ffd83dbSDimitry Andric return Query.Types[0].isVector() && 11835ffd83dbSDimitry Andric needToSplitMemOp(Query, Op == G_LOAD); 11848bcb0991SDimitry Andric }, 11858bcb0991SDimitry Andric [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 11868bcb0991SDimitry Andric const LLT DstTy = Query.Types[0]; 11878bcb0991SDimitry Andric const LLT PtrTy = Query.Types[1]; 11888bcb0991SDimitry Andric 11898bcb0991SDimitry Andric LLT EltTy = DstTy.getElementType(); 11905ffd83dbSDimitry Andric unsigned MaxSize = maxSizeForAddrSpace(ST, 11915ffd83dbSDimitry Andric PtrTy.getAddressSpace(), 11925ffd83dbSDimitry Andric Op == G_LOAD); 11935ffd83dbSDimitry Andric 11945ffd83dbSDimitry Andric // FIXME: Handle widened to power of 2 results better. This ends 11955ffd83dbSDimitry Andric // up scalarizing. 
11965ffd83dbSDimitry Andric // FIXME: 3 element stores scalarized on SI 11978bcb0991SDimitry Andric 11988bcb0991SDimitry Andric // Split if it's too large for the address space. 11998bcb0991SDimitry Andric if (Query.MMODescrs[0].SizeInBits > MaxSize) { 12008bcb0991SDimitry Andric unsigned NumElts = DstTy.getNumElements(); 12015ffd83dbSDimitry Andric unsigned EltSize = EltTy.getSizeInBits(); 12025ffd83dbSDimitry Andric 12035ffd83dbSDimitry Andric if (MaxSize % EltSize == 0) { 12045ffd83dbSDimitry Andric return std::make_pair( 12055ffd83dbSDimitry Andric 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy)); 12065ffd83dbSDimitry Andric } 12075ffd83dbSDimitry Andric 12088bcb0991SDimitry Andric unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 12098bcb0991SDimitry Andric 12108bcb0991SDimitry Andric // FIXME: Refine when odd breakdowns handled 12118bcb0991SDimitry Andric // The scalars will need to be re-legalized. 12128bcb0991SDimitry Andric if (NumPieces == 1 || NumPieces >= NumElts || 12138bcb0991SDimitry Andric NumElts % NumPieces != 0) 12148bcb0991SDimitry Andric return std::make_pair(0, EltTy); 12158bcb0991SDimitry Andric 12168bcb0991SDimitry Andric return std::make_pair(0, 12178bcb0991SDimitry Andric LLT::vector(NumElts / NumPieces, EltTy)); 12188bcb0991SDimitry Andric } 12198bcb0991SDimitry Andric 12205ffd83dbSDimitry Andric // FIXME: We could probably handle weird extending loads better. 12215ffd83dbSDimitry Andric unsigned MemSize = Query.MMODescrs[0].SizeInBits; 12225ffd83dbSDimitry Andric if (DstTy.getSizeInBits() > MemSize) 12235ffd83dbSDimitry Andric return std::make_pair(0, EltTy); 12245ffd83dbSDimitry Andric 12255ffd83dbSDimitry Andric unsigned EltSize = EltTy.getSizeInBits(); 12265ffd83dbSDimitry Andric unsigned DstSize = DstTy.getSizeInBits(); 12275ffd83dbSDimitry Andric if (!isPowerOf2_32(DstSize)) { 12285ffd83dbSDimitry Andric // We're probably decomposing an odd sized store. Try to split 12295ffd83dbSDimitry Andric // to the widest type. 
TODO: Account for alignment. As-is it 12305ffd83dbSDimitry Andric // should be OK, since the new parts will be further legalized. 12315ffd83dbSDimitry Andric unsigned FloorSize = PowerOf2Floor(DstSize); 12325ffd83dbSDimitry Andric return std::make_pair( 12335ffd83dbSDimitry Andric 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy)); 12345ffd83dbSDimitry Andric } 12355ffd83dbSDimitry Andric 12368bcb0991SDimitry Andric // Need to split because of alignment. 12378bcb0991SDimitry Andric unsigned Align = Query.MMODescrs[0].AlignInBits; 12388bcb0991SDimitry Andric if (EltSize > Align && 12398bcb0991SDimitry Andric (EltSize / Align < DstTy.getNumElements())) { 12408bcb0991SDimitry Andric return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 12418bcb0991SDimitry Andric } 12428bcb0991SDimitry Andric 12438bcb0991SDimitry Andric // May need relegalization for the scalars. 12448bcb0991SDimitry Andric return std::make_pair(0, EltTy); 12458bcb0991SDimitry Andric }) 1246*e8d8bef9SDimitry Andric .lowerIfMemSizeNotPow2() 12478bcb0991SDimitry Andric .minScalar(0, S32); 12488bcb0991SDimitry Andric 12498bcb0991SDimitry Andric if (IsStore) 12508bcb0991SDimitry Andric Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); 12518bcb0991SDimitry Andric 12528bcb0991SDimitry Andric Actions 12538bcb0991SDimitry Andric .widenScalarToNextPow2(0) 1254*e8d8bef9SDimitry Andric .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) 1255*e8d8bef9SDimitry Andric .lower(); 12568bcb0991SDimitry Andric } 12570b57cec5SDimitry Andric 12580b57cec5SDimitry Andric auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 12598bcb0991SDimitry Andric .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 12608bcb0991SDimitry Andric {S32, GlobalPtr, 16, 2 * 8}, 12610b57cec5SDimitry Andric {S32, LocalPtr, 8, 8}, 12628bcb0991SDimitry Andric {S32, LocalPtr, 16, 16}, 12630b57cec5SDimitry Andric {S32, PrivatePtr, 8, 8}, 12648bcb0991SDimitry Andric {S32, PrivatePtr, 16, 16}, 
12658bcb0991SDimitry Andric {S32, ConstantPtr, 8, 8}, 12668bcb0991SDimitry Andric {S32, ConstantPtr, 16, 2 * 8}}); 12670b57cec5SDimitry Andric if (ST.hasFlatAddressSpace()) { 12688bcb0991SDimitry Andric ExtLoads.legalForTypesWithMemDesc( 12698bcb0991SDimitry Andric {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 12700b57cec5SDimitry Andric } 12710b57cec5SDimitry Andric 12720b57cec5SDimitry Andric ExtLoads.clampScalar(0, S32, S32) 12730b57cec5SDimitry Andric .widenScalarToNextPow2(0) 12740b57cec5SDimitry Andric .unsupportedIfMemSizeNotPow2() 12750b57cec5SDimitry Andric .lower(); 12760b57cec5SDimitry Andric 12770b57cec5SDimitry Andric auto &Atomics = getActionDefinitionsBuilder( 12780b57cec5SDimitry Andric {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 12790b57cec5SDimitry Andric G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 12800b57cec5SDimitry Andric G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1281480093f4SDimitry Andric G_ATOMICRMW_UMIN}) 12820b57cec5SDimitry Andric .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 1283*e8d8bef9SDimitry Andric {S64, GlobalPtr}, {S64, LocalPtr}, 1284*e8d8bef9SDimitry Andric {S32, RegionPtr}, {S64, RegionPtr}}); 12850b57cec5SDimitry Andric if (ST.hasFlatAddressSpace()) { 12860b57cec5SDimitry Andric Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 12870b57cec5SDimitry Andric } 12880b57cec5SDimitry Andric 12895ffd83dbSDimitry Andric if (ST.hasLDSFPAtomics()) { 12908bcb0991SDimitry Andric getActionDefinitionsBuilder(G_ATOMICRMW_FADD) 1291*e8d8bef9SDimitry Andric .legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); 12925ffd83dbSDimitry Andric } 12938bcb0991SDimitry Andric 1294480093f4SDimitry Andric // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1295480093f4SDimitry Andric // demarshalling 1296480093f4SDimitry Andric getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1297480093f4SDimitry Andric .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1298480093f4SDimitry Andric {S32, FlatPtr}, {S64, 
FlatPtr}}) 1299480093f4SDimitry Andric .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1300480093f4SDimitry Andric {S32, RegionPtr}, {S64, RegionPtr}}); 13010b57cec5SDimitry Andric // TODO: Pointer types, any 32-bit or 64-bit vector 1302480093f4SDimitry Andric 1303480093f4SDimitry Andric // Condition should be s32 for scalar, s1 for vector. 13040b57cec5SDimitry Andric getActionDefinitionsBuilder(G_SELECT) 13050b57cec5SDimitry Andric .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, 13060b57cec5SDimitry Andric GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, 1307480093f4SDimitry Andric LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) 13080b57cec5SDimitry Andric .clampScalar(0, S16, S64) 13095ffd83dbSDimitry Andric .scalarize(1) 13100b57cec5SDimitry Andric .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 13110b57cec5SDimitry Andric .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 13120b57cec5SDimitry Andric .clampMaxNumElements(0, S32, 2) 13130b57cec5SDimitry Andric .clampMaxNumElements(0, LocalPtr, 2) 13140b57cec5SDimitry Andric .clampMaxNumElements(0, PrivatePtr, 2) 13150b57cec5SDimitry Andric .scalarize(0) 13160b57cec5SDimitry Andric .widenScalarToNextPow2(0) 1317480093f4SDimitry Andric .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 13180b57cec5SDimitry Andric 13190b57cec5SDimitry Andric // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 13200b57cec5SDimitry Andric // be more flexible with the shift amount type. 
13210b57cec5SDimitry Andric auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 13220b57cec5SDimitry Andric .legalFor({{S32, S32}, {S64, S32}}); 13230b57cec5SDimitry Andric if (ST.has16BitInsts()) { 13240b57cec5SDimitry Andric if (ST.hasVOP3PInsts()) { 13255ffd83dbSDimitry Andric Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 13260b57cec5SDimitry Andric .clampMaxNumElements(0, S16, 2); 13270b57cec5SDimitry Andric } else 13285ffd83dbSDimitry Andric Shifts.legalFor({{S16, S16}}); 13290b57cec5SDimitry Andric 13305ffd83dbSDimitry Andric // TODO: Support 16-bit shift amounts for all types 13315ffd83dbSDimitry Andric Shifts.widenScalarIf( 13325ffd83dbSDimitry Andric [=](const LegalityQuery &Query) { 13335ffd83dbSDimitry Andric // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 13345ffd83dbSDimitry Andric // 32-bit amount. 13355ffd83dbSDimitry Andric const LLT ValTy = Query.Types[0]; 13365ffd83dbSDimitry Andric const LLT AmountTy = Query.Types[1]; 13375ffd83dbSDimitry Andric return ValTy.getSizeInBits() <= 16 && 13385ffd83dbSDimitry Andric AmountTy.getSizeInBits() < 16; 13395ffd83dbSDimitry Andric }, changeTo(1, S16)); 13405ffd83dbSDimitry Andric Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1341480093f4SDimitry Andric Shifts.clampScalar(1, S32, S32); 13420b57cec5SDimitry Andric Shifts.clampScalar(0, S16, S64); 13430b57cec5SDimitry Andric Shifts.widenScalarToNextPow2(0, 16); 1344*e8d8bef9SDimitry Andric 1345*e8d8bef9SDimitry Andric getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1346*e8d8bef9SDimitry Andric .minScalar(0, S16) 1347*e8d8bef9SDimitry Andric .scalarize(0) 1348*e8d8bef9SDimitry Andric .lower(); 13490b57cec5SDimitry Andric } else { 13500b57cec5SDimitry Andric // Make sure we legalize the shift amount type first, as the general 13510b57cec5SDimitry Andric // expansion for the shifted type will produce much worse code if it hasn't 13520b57cec5SDimitry Andric // been truncated already. 
13530b57cec5SDimitry Andric Shifts.clampScalar(1, S32, S32); 13540b57cec5SDimitry Andric Shifts.clampScalar(0, S32, S64); 13550b57cec5SDimitry Andric Shifts.widenScalarToNextPow2(0, 32); 1356*e8d8bef9SDimitry Andric 1357*e8d8bef9SDimitry Andric getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) 1358*e8d8bef9SDimitry Andric .minScalar(0, S32) 1359*e8d8bef9SDimitry Andric .scalarize(0) 1360*e8d8bef9SDimitry Andric .lower(); 13610b57cec5SDimitry Andric } 13620b57cec5SDimitry Andric Shifts.scalarize(0); 13630b57cec5SDimitry Andric 13640b57cec5SDimitry Andric for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 13650b57cec5SDimitry Andric unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 13660b57cec5SDimitry Andric unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 13670b57cec5SDimitry Andric unsigned IdxTypeIdx = 2; 13680b57cec5SDimitry Andric 13690b57cec5SDimitry Andric getActionDefinitionsBuilder(Op) 13700b57cec5SDimitry Andric .customIf([=](const LegalityQuery &Query) { 13710b57cec5SDimitry Andric const LLT EltTy = Query.Types[EltTypeIdx]; 13720b57cec5SDimitry Andric const LLT VecTy = Query.Types[VecTypeIdx]; 13730b57cec5SDimitry Andric const LLT IdxTy = Query.Types[IdxTypeIdx]; 1374*e8d8bef9SDimitry Andric const unsigned EltSize = EltTy.getSizeInBits(); 1375*e8d8bef9SDimitry Andric return (EltSize == 32 || EltSize == 64) && 13760b57cec5SDimitry Andric VecTy.getSizeInBits() % 32 == 0 && 13775ffd83dbSDimitry Andric VecTy.getSizeInBits() <= MaxRegisterSize && 13780b57cec5SDimitry Andric IdxTy.getSizeInBits() == 32; 13790b57cec5SDimitry Andric }) 1380*e8d8bef9SDimitry Andric .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)), 1381*e8d8bef9SDimitry Andric bitcastToVectorElement32(VecTypeIdx)) 1382*e8d8bef9SDimitry Andric //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1)) 1383*e8d8bef9SDimitry Andric .bitcastIf( 1384*e8d8bef9SDimitry Andric all(sizeIsMultipleOf32(VecTypeIdx), 
scalarOrEltWiderThan(VecTypeIdx, 64)), 1385*e8d8bef9SDimitry Andric [=](const LegalityQuery &Query) { 1386*e8d8bef9SDimitry Andric // For > 64-bit element types, try to turn this into a 64-bit 1387*e8d8bef9SDimitry Andric // element vector since we may be able to do better indexing 1388*e8d8bef9SDimitry Andric // if this is scalar. If not, fall back to 32. 1389*e8d8bef9SDimitry Andric const LLT EltTy = Query.Types[EltTypeIdx]; 1390*e8d8bef9SDimitry Andric const LLT VecTy = Query.Types[VecTypeIdx]; 1391*e8d8bef9SDimitry Andric const unsigned DstEltSize = EltTy.getSizeInBits(); 1392*e8d8bef9SDimitry Andric const unsigned VecSize = VecTy.getSizeInBits(); 1393*e8d8bef9SDimitry Andric 1394*e8d8bef9SDimitry Andric const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32; 1395*e8d8bef9SDimitry Andric return std::make_pair( 1396*e8d8bef9SDimitry Andric VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize)); 1397*e8d8bef9SDimitry Andric }) 13980b57cec5SDimitry Andric .clampScalar(EltTypeIdx, S32, S64) 13990b57cec5SDimitry Andric .clampScalar(VecTypeIdx, S32, S64) 1400*e8d8bef9SDimitry Andric .clampScalar(IdxTypeIdx, S32, S32) 1401*e8d8bef9SDimitry Andric .clampMaxNumElements(VecTypeIdx, S32, 32) 1402*e8d8bef9SDimitry Andric // TODO: Clamp elements for 64-bit vectors? 1403*e8d8bef9SDimitry Andric // It should only be necessary with variable indexes. 
1404*e8d8bef9SDimitry Andric // As a last resort, lower to the stack 1405*e8d8bef9SDimitry Andric .lower(); 14060b57cec5SDimitry Andric } 14070b57cec5SDimitry Andric 14080b57cec5SDimitry Andric getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 14090b57cec5SDimitry Andric .unsupportedIf([=](const LegalityQuery &Query) { 14100b57cec5SDimitry Andric const LLT &EltTy = Query.Types[1].getElementType(); 14110b57cec5SDimitry Andric return Query.Types[0] != EltTy; 14120b57cec5SDimitry Andric }); 14130b57cec5SDimitry Andric 14140b57cec5SDimitry Andric for (unsigned Op : {G_EXTRACT, G_INSERT}) { 14150b57cec5SDimitry Andric unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 14160b57cec5SDimitry Andric unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 14170b57cec5SDimitry Andric 14180b57cec5SDimitry Andric // FIXME: Doesn't handle extract of illegal sizes. 14190b57cec5SDimitry Andric getActionDefinitionsBuilder(Op) 14208bcb0991SDimitry Andric .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 14218bcb0991SDimitry Andric // FIXME: Multiples of 16 should not be legal. 
14220b57cec5SDimitry Andric .legalIf([=](const LegalityQuery &Query) { 14230b57cec5SDimitry Andric const LLT BigTy = Query.Types[BigTyIdx]; 14240b57cec5SDimitry Andric const LLT LitTy = Query.Types[LitTyIdx]; 14250b57cec5SDimitry Andric return (BigTy.getSizeInBits() % 32 == 0) && 14260b57cec5SDimitry Andric (LitTy.getSizeInBits() % 16 == 0); 14270b57cec5SDimitry Andric }) 14280b57cec5SDimitry Andric .widenScalarIf( 14290b57cec5SDimitry Andric [=](const LegalityQuery &Query) { 14300b57cec5SDimitry Andric const LLT BigTy = Query.Types[BigTyIdx]; 14310b57cec5SDimitry Andric return (BigTy.getScalarSizeInBits() < 16); 14320b57cec5SDimitry Andric }, 14330b57cec5SDimitry Andric LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) 14340b57cec5SDimitry Andric .widenScalarIf( 14350b57cec5SDimitry Andric [=](const LegalityQuery &Query) { 14360b57cec5SDimitry Andric const LLT LitTy = Query.Types[LitTyIdx]; 14370b57cec5SDimitry Andric return (LitTy.getScalarSizeInBits() < 16); 14380b57cec5SDimitry Andric }, 14390b57cec5SDimitry Andric LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) 14400b57cec5SDimitry Andric .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 14410b57cec5SDimitry Andric .widenScalarToNextPow2(BigTyIdx, 32); 14420b57cec5SDimitry Andric 14430b57cec5SDimitry Andric } 14440b57cec5SDimitry Andric 14458bcb0991SDimitry Andric auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) 14460b57cec5SDimitry Andric .legalForCartesianProduct(AllS32Vectors, {S32}) 14470b57cec5SDimitry Andric .legalForCartesianProduct(AllS64Vectors, {S64}) 14488bcb0991SDimitry Andric .clampNumElements(0, V16S32, V32S32) 14498bcb0991SDimitry Andric .clampNumElements(0, V2S64, V16S64) 14508bcb0991SDimitry Andric .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); 14518bcb0991SDimitry Andric 14528bcb0991SDimitry Andric if (ST.hasScalarPackInsts()) { 14535ffd83dbSDimitry Andric BuildVector 14545ffd83dbSDimitry Andric // FIXME: Should probably widen 
s1 vectors straight to s32 14555ffd83dbSDimitry Andric .minScalarOrElt(0, S16) 14565ffd83dbSDimitry Andric // Widen source elements and produce a G_BUILD_VECTOR_TRUNC 14575ffd83dbSDimitry Andric .minScalar(1, S32); 14585ffd83dbSDimitry Andric 14598bcb0991SDimitry Andric getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 14608bcb0991SDimitry Andric .legalFor({V2S16, S32}) 14618bcb0991SDimitry Andric .lower(); 14625ffd83dbSDimitry Andric BuildVector.minScalarOrElt(0, S32); 14638bcb0991SDimitry Andric } else { 14645ffd83dbSDimitry Andric BuildVector.customFor({V2S16, S16}); 14655ffd83dbSDimitry Andric BuildVector.minScalarOrElt(0, S32); 14665ffd83dbSDimitry Andric 14678bcb0991SDimitry Andric getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 14685ffd83dbSDimitry Andric .customFor({V2S16, S32}) 14698bcb0991SDimitry Andric .lower(); 14708bcb0991SDimitry Andric } 14718bcb0991SDimitry Andric 14725ffd83dbSDimitry Andric BuildVector.legalIf(isRegisterType(0)); 14735ffd83dbSDimitry Andric 14745ffd83dbSDimitry Andric // FIXME: Clamp maximum size 14750b57cec5SDimitry Andric getActionDefinitionsBuilder(G_CONCAT_VECTORS) 1476*e8d8bef9SDimitry Andric .legalIf(all(isRegisterType(0), isRegisterType(1))) 1477*e8d8bef9SDimitry Andric .clampMaxNumElements(0, S32, 32) 1478*e8d8bef9SDimitry Andric .clampMaxNumElements(1, S16, 2) // TODO: Make 4? 1479*e8d8bef9SDimitry Andric .clampMaxNumElements(0, S16, 64); 14800b57cec5SDimitry Andric 14815ffd83dbSDimitry Andric // TODO: Don't fully scalarize v2s16 pieces? Or combine out thosse 14825ffd83dbSDimitry Andric // pre-legalize. 
14835ffd83dbSDimitry Andric if (ST.hasVOP3PInsts()) { 14845ffd83dbSDimitry Andric getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) 14855ffd83dbSDimitry Andric .customFor({V2S16, V2S16}) 14865ffd83dbSDimitry Andric .lower(); 14875ffd83dbSDimitry Andric } else 14888bcb0991SDimitry Andric getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 14898bcb0991SDimitry Andric 14900b57cec5SDimitry Andric // Merge/Unmerge 14910b57cec5SDimitry Andric for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 14920b57cec5SDimitry Andric unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 14930b57cec5SDimitry Andric unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 14940b57cec5SDimitry Andric 14950b57cec5SDimitry Andric auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 14965ffd83dbSDimitry Andric const LLT Ty = Query.Types[TypeIdx]; 14970b57cec5SDimitry Andric if (Ty.isVector()) { 14980b57cec5SDimitry Andric const LLT &EltTy = Ty.getElementType(); 14995ffd83dbSDimitry Andric if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) 15000b57cec5SDimitry Andric return true; 15010b57cec5SDimitry Andric if (!isPowerOf2_32(EltTy.getSizeInBits())) 15020b57cec5SDimitry Andric return true; 15030b57cec5SDimitry Andric } 15040b57cec5SDimitry Andric return false; 15050b57cec5SDimitry Andric }; 15060b57cec5SDimitry Andric 15078bcb0991SDimitry Andric auto &Builder = getActionDefinitionsBuilder(Op) 1508*e8d8bef9SDimitry Andric .legalIf(all(isRegisterType(0), isRegisterType(1))) 15095ffd83dbSDimitry Andric .lowerFor({{S16, V2S16}}) 15105ffd83dbSDimitry Andric .lowerIf([=](const LegalityQuery &Query) { 15115ffd83dbSDimitry Andric const LLT BigTy = Query.Types[BigTyIdx]; 15125ffd83dbSDimitry Andric return BigTy.getSizeInBits() == 32; 15135ffd83dbSDimitry Andric }) 15145ffd83dbSDimitry Andric // Try to widen to s16 first for small types. 
15155ffd83dbSDimitry Andric // TODO: Only do this on targets with legal s16 shifts 15165ffd83dbSDimitry Andric .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) 15170b57cec5SDimitry Andric .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 15188bcb0991SDimitry Andric .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 15198bcb0991SDimitry Andric .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 15208bcb0991SDimitry Andric elementTypeIs(1, S16)), 15218bcb0991SDimitry Andric changeTo(1, V2S16)) 15225ffd83dbSDimitry Andric // Clamp the little scalar to s8-s256 and make it a power of 2. It's not 15235ffd83dbSDimitry Andric // worth considering the multiples of 64 since 2*192 and 2*384 are not 15245ffd83dbSDimitry Andric // valid. 15255ffd83dbSDimitry Andric .clampScalar(LitTyIdx, S32, S512) 15265ffd83dbSDimitry Andric .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 15270b57cec5SDimitry Andric // Break up vectors with weird elements into scalars 15280b57cec5SDimitry Andric .fewerElementsIf( 15295ffd83dbSDimitry Andric [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, 15300b57cec5SDimitry Andric scalarize(0)) 15310b57cec5SDimitry Andric .fewerElementsIf( 15325ffd83dbSDimitry Andric [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, 15330b57cec5SDimitry Andric scalarize(1)) 15345ffd83dbSDimitry Andric .clampScalar(BigTyIdx, S32, MaxScalar); 15358bcb0991SDimitry Andric 15368bcb0991SDimitry Andric if (Op == G_MERGE_VALUES) { 15378bcb0991SDimitry Andric Builder.widenScalarIf( 15388bcb0991SDimitry Andric // TODO: Use 16-bit shifts if legal for 8-bit values? 
15390b57cec5SDimitry Andric [=](const LegalityQuery &Query) { 15408bcb0991SDimitry Andric const LLT Ty = Query.Types[LitTyIdx]; 15418bcb0991SDimitry Andric return Ty.getSizeInBits() < 32; 15428bcb0991SDimitry Andric }, 15438bcb0991SDimitry Andric changeTo(LitTyIdx, S32)); 15448bcb0991SDimitry Andric } 15458bcb0991SDimitry Andric 15468bcb0991SDimitry Andric Builder.widenScalarIf( 15478bcb0991SDimitry Andric [=](const LegalityQuery &Query) { 15488bcb0991SDimitry Andric const LLT Ty = Query.Types[BigTyIdx]; 15490b57cec5SDimitry Andric return !isPowerOf2_32(Ty.getSizeInBits()) && 15500b57cec5SDimitry Andric Ty.getSizeInBits() % 16 != 0; 15510b57cec5SDimitry Andric }, 15520b57cec5SDimitry Andric [=](const LegalityQuery &Query) { 15530b57cec5SDimitry Andric // Pick the next power of 2, or a multiple of 64 over 128. 15540b57cec5SDimitry Andric // Whichever is smaller. 15550b57cec5SDimitry Andric const LLT &Ty = Query.Types[BigTyIdx]; 15560b57cec5SDimitry Andric unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 15570b57cec5SDimitry Andric if (NewSizeInBits >= 256) { 15580b57cec5SDimitry Andric unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 15590b57cec5SDimitry Andric if (RoundedTo < NewSizeInBits) 15600b57cec5SDimitry Andric NewSizeInBits = RoundedTo; 15610b57cec5SDimitry Andric } 15620b57cec5SDimitry Andric return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 15630b57cec5SDimitry Andric }) 15640b57cec5SDimitry Andric // Any vectors left are the wrong size. Scalarize them. 15650b57cec5SDimitry Andric .scalarize(0) 15660b57cec5SDimitry Andric .scalarize(1); 15670b57cec5SDimitry Andric } 15680b57cec5SDimitry Andric 15695ffd83dbSDimitry Andric // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 15705ffd83dbSDimitry Andric // RegBankSelect. 
15715ffd83dbSDimitry Andric auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG) 15725ffd83dbSDimitry Andric .legalFor({{S32}, {S64}}); 15738bcb0991SDimitry Andric 15745ffd83dbSDimitry Andric if (ST.hasVOP3PInsts()) { 15755ffd83dbSDimitry Andric SextInReg.lowerFor({{V2S16}}) 15765ffd83dbSDimitry Andric // Prefer to reduce vector widths for 16-bit vectors before lowering, to 15775ffd83dbSDimitry Andric // get more vector shift opportunities, since we'll get those when 15785ffd83dbSDimitry Andric // expanded. 15795ffd83dbSDimitry Andric .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)); 15805ffd83dbSDimitry Andric } else if (ST.has16BitInsts()) { 15815ffd83dbSDimitry Andric SextInReg.lowerFor({{S32}, {S64}, {S16}}); 15825ffd83dbSDimitry Andric } else { 15835ffd83dbSDimitry Andric // Prefer to promote to s32 before lowering if we don't have 16-bit 15845ffd83dbSDimitry Andric // shifts. This avoid a lot of intermediate truncate and extend operations. 15855ffd83dbSDimitry Andric SextInReg.lowerFor({{S32}, {S64}}); 15865ffd83dbSDimitry Andric } 15875ffd83dbSDimitry Andric 15885ffd83dbSDimitry Andric SextInReg 15895ffd83dbSDimitry Andric .scalarize(0) 15905ffd83dbSDimitry Andric .clampScalar(0, S32, S64) 15915ffd83dbSDimitry Andric .lower(); 15925ffd83dbSDimitry Andric 15935ffd83dbSDimitry Andric getActionDefinitionsBuilder(G_FSHR) 15945ffd83dbSDimitry Andric .legalFor({{S32, S32}}) 15955ffd83dbSDimitry Andric .scalarize(0) 15965ffd83dbSDimitry Andric .lower(); 1597480093f4SDimitry Andric 1598480093f4SDimitry Andric getActionDefinitionsBuilder(G_READCYCLECOUNTER) 1599480093f4SDimitry Andric .legalFor({S64}); 1600480093f4SDimitry Andric 1601*e8d8bef9SDimitry Andric getActionDefinitionsBuilder(G_FENCE) 1602*e8d8bef9SDimitry Andric .alwaysLegal(); 1603*e8d8bef9SDimitry Andric 16045ffd83dbSDimitry Andric getActionDefinitionsBuilder({ 16055ffd83dbSDimitry Andric // TODO: Verify V_BFI_B32 is generated from expanded bit ops 16065ffd83dbSDimitry Andric 
G_FCOPYSIGN, 16075ffd83dbSDimitry Andric 16085ffd83dbSDimitry Andric G_ATOMIC_CMPXCHG_WITH_SUCCESS, 1609*e8d8bef9SDimitry Andric G_ATOMICRMW_NAND, 1610*e8d8bef9SDimitry Andric G_ATOMICRMW_FSUB, 16115ffd83dbSDimitry Andric G_READ_REGISTER, 16125ffd83dbSDimitry Andric G_WRITE_REGISTER, 16135ffd83dbSDimitry Andric 16145ffd83dbSDimitry Andric G_SADDO, G_SSUBO, 16155ffd83dbSDimitry Andric 16165ffd83dbSDimitry Andric // TODO: Implement 16175ffd83dbSDimitry Andric G_FMINIMUM, G_FMAXIMUM, 16185ffd83dbSDimitry Andric G_FSHL 16195ffd83dbSDimitry Andric }).lower(); 16205ffd83dbSDimitry Andric 1621480093f4SDimitry Andric getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, 16225ffd83dbSDimitry Andric G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, 1623480093f4SDimitry Andric G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) 1624480093f4SDimitry Andric .unsupported(); 1625480093f4SDimitry Andric 16260b57cec5SDimitry Andric computeTables(); 16270b57cec5SDimitry Andric verify(*ST.getInstrInfo()); 16280b57cec5SDimitry Andric } 16290b57cec5SDimitry Andric 16305ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, 16315ffd83dbSDimitry Andric MachineInstr &MI) const { 16325ffd83dbSDimitry Andric MachineIRBuilder &B = Helper.MIRBuilder; 16335ffd83dbSDimitry Andric MachineRegisterInfo &MRI = *B.getMRI(); 16345ffd83dbSDimitry Andric 16350b57cec5SDimitry Andric switch (MI.getOpcode()) { 16360b57cec5SDimitry Andric case TargetOpcode::G_ADDRSPACE_CAST: 16378bcb0991SDimitry Andric return legalizeAddrSpaceCast(MI, MRI, B); 16380b57cec5SDimitry Andric case TargetOpcode::G_FRINT: 16398bcb0991SDimitry Andric return legalizeFrint(MI, MRI, B); 16400b57cec5SDimitry Andric case TargetOpcode::G_FCEIL: 16418bcb0991SDimitry Andric return legalizeFceil(MI, MRI, B); 1642*e8d8bef9SDimitry Andric case TargetOpcode::G_FREM: 1643*e8d8bef9SDimitry Andric return legalizeFrem(MI, MRI, B); 16440b57cec5SDimitry Andric case TargetOpcode::G_INTRINSIC_TRUNC: 16458bcb0991SDimitry Andric 
    return legalizeIntrinsicTrunc(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOI(MI, MRI, B, true);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOI(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(Helper, MI);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return legalizeShuffleVector(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(Helper, MI);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
    return legalizeUDIV_UREM(MI, MRI, B);
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
    return legalizeSDIV_SREM(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  case TargetOpcode::G_FLOG:
    // log(x) = log2(x) * ln(2); the constant is folded into the lowering.
    return legalizeFlog(MI, B, numbers::ln2f);
  case TargetOpcode::G_FLOG10:
    // log10(x) = log2(x) * (ln(2) / ln(10)).
    return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
  case TargetOpcode::G_FEXP:
    return legalizeFExp(MI, B);
  case TargetOpcode::G_FPOW:
    return legalizeFPow(MI, B);
  case TargetOpcode::G_FFLOOR:
    return legalizeFFloor(MI, MRI, B);
  case TargetOpcode::G_BUILD_VECTOR:
    return legalizeBuildVector(MI, MRI, B);
  default:
    return false;
  }

  llvm_unreachable("expected switch to return");
}

// Produce a 32-bit register holding the "aperture" (the high half of a flat
// address) for the given segment address space (LDS or scratch). On targets
// with aperture registers this is read with S_GETREG_B32; otherwise it is
// loaded from known offsets in the queue descriptor. Returns an invalid
// Register if the queue pointer input cannot be loaded.
Register AMDGPULegalizerInfo::getSegmentAperture(
  unsigned AS,
  MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const LLT S32 = LLT::scalar(32);

  // Only the two segment address spaces have apertures.
  assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);

  if (ST.hasApertureRegs()) {
    // FIXME: Use inline constants (src_{shared, private}_base) instead of
    // getreg.
    unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
    unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
    // Pack the hwreg id, bit offset and width-minus-one into the immediate
    // operand expected by S_GETREG_B32.
    unsigned Encoding =
        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;

    Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_GETREG_B32)
      .addDef(GetReg)
      .addImm(Encoding);
    MRI.setType(GetReg, S32);

    // The field is read right-justified; shift it back into the high bits.
    auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
    return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
  }

  Register QueuePtr = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));

  if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
    return Register();

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  // TODO: can we be smarter about machine pointer info?
  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
  // The queue descriptor is invariant for the lifetime of the dispatch, so
  // the load may be freely rematerialized/hoisted.
  MachineMemOperand *MMO = MF.getMachineMemOperand(
    PtrInfo,
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant,
    4, commonAlignment(Align(64), StructOffset));

  Register LoadAddr;

  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}

// Custom legalization for G_ADDRSPACE_CAST. Handles: no-op casts (rewritten
// to G_BITCAST in place), casts to/from the 32-bit constant address space
// (truncate / merge with the known high bits), flat -> segment (extract low
// 32 bits with a null check), and segment -> flat (merge the low bits with
// the segment aperture, with a null check).
bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  MachineFunction &MF = B.getMF();

  const LLT S32 = LLT::scalar(32);
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned DestAS = DstTy.getAddressSpace();
  unsigned SrcAS = SrcTy.getAddressSpace();

  // TODO: Avoid reloading from the queue ptr for each cast, or at least each
  // vector element.
  assert(!DstTy.isVector());

  const AMDGPUTargetMachine &TM
    = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());

  if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
    // Same representation in both spaces: reuse MI as a bitcast, don't erase.
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
    return true;
  }

  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    // Truncate.
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
    uint32_t AddrHiVal = Info->get32BitAddressHighBits();

    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
    // another. Merge operands are required to be the same type, but creating an
    // extra ptrtoint would be kind of pointless.
    auto HighAddr = B.buildConstant(
      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
    B.buildMerge(Dst, {Src, HighAddr});
    MI.eraseFromParent();
    return true;
  }

  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    // Extract low 32-bits of the pointer.
    auto PtrLo32 = B.buildExtract(DstTy, Src, 0);

    // A flat null must map to the segment null value, which is not 0.
    auto CmpRes =
        B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));

    MI.eraseFromParent();
    return true;
  }

  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
    return false;

  if (!ST.hasFlatAddressSpace())
    return false;

  auto SegmentNull =
      B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
  auto FlatNull =
      B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
  if (!ApertureReg.isValid())
    return false;

  auto CmpRes =
      B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));

  // Coerce the type of the low half of the result so we can use merge_values.
  Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);

  // TODO: Should we allow mismatched types but matching sizes in merges to
  // avoid the ptrtoint?
  auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);

  MI.eraseFromParent();
  return true;
}

// Expand 64-bit G_FRINT (round to nearest integer, ties to even) using the
// classic add/subtract-magic-number trick: adding 2^52 (with the sign of the
// input) forces rounding at the integer position. Values with |x| already
// >= 2^52 - 0.5 are integers and are passed through unchanged.
bool AMDGPULegalizerInfo::legalizeFrint(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
  APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  // TODO: Should this propagate fast-math-flags?
  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();
  return true;
}

// Expand 64-bit G_FCEIL in terms of trunc: truncate toward zero, then add
// 1.0 when the input was positive and not already an integer.
bool AMDGPULegalizerInfo::legalizeFceil(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {

  const LLT S1 = LLT::scalar(1);
  const LLT S64 = LLT::scalar(64);

  Register Src = MI.getOperand(1).getReg();
  assert(MRI.getType(Src) == S64);

  // result = trunc(src)
  // if (src > 0.0 && src != result)
  //   result += 1.0

  auto Trunc = B.buildIntrinsicTrunc(S64, Src);

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);
  auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
  auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);
19070b57cec5SDimitry Andric // TODO: Should this propagate fast-math-flags? 19080b57cec5SDimitry Andric B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 19090b57cec5SDimitry Andric return true; 19100b57cec5SDimitry Andric } 19110b57cec5SDimitry Andric 1912*e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeFrem( 1913*e8d8bef9SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 1914*e8d8bef9SDimitry Andric MachineIRBuilder &B) const { 1915*e8d8bef9SDimitry Andric Register DstReg = MI.getOperand(0).getReg(); 1916*e8d8bef9SDimitry Andric Register Src0Reg = MI.getOperand(1).getReg(); 1917*e8d8bef9SDimitry Andric Register Src1Reg = MI.getOperand(2).getReg(); 1918*e8d8bef9SDimitry Andric auto Flags = MI.getFlags(); 1919*e8d8bef9SDimitry Andric LLT Ty = MRI.getType(DstReg); 1920*e8d8bef9SDimitry Andric 1921*e8d8bef9SDimitry Andric auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); 1922*e8d8bef9SDimitry Andric auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); 1923*e8d8bef9SDimitry Andric auto Neg = B.buildFNeg(Ty, Trunc, Flags); 1924*e8d8bef9SDimitry Andric B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); 1925*e8d8bef9SDimitry Andric MI.eraseFromParent(); 1926*e8d8bef9SDimitry Andric return true; 1927*e8d8bef9SDimitry Andric } 1928*e8d8bef9SDimitry Andric 1929*e8d8bef9SDimitry Andric static MachineInstrBuilder extractF64Exponent(Register Hi, 19300b57cec5SDimitry Andric MachineIRBuilder &B) { 19310b57cec5SDimitry Andric const unsigned FractBits = 52; 19320b57cec5SDimitry Andric const unsigned ExpBits = 11; 19330b57cec5SDimitry Andric LLT S32 = LLT::scalar(32); 19340b57cec5SDimitry Andric 19350b57cec5SDimitry Andric auto Const0 = B.buildConstant(S32, FractBits - 32); 19360b57cec5SDimitry Andric auto Const1 = B.buildConstant(S32, ExpBits); 19370b57cec5SDimitry Andric 19380b57cec5SDimitry Andric auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 1939*e8d8bef9SDimitry Andric .addUse(Hi) 19400b57cec5SDimitry Andric 
.addUse(Const0.getReg(0)) 19410b57cec5SDimitry Andric .addUse(Const1.getReg(0)); 19420b57cec5SDimitry Andric 19430b57cec5SDimitry Andric return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 19440b57cec5SDimitry Andric } 19450b57cec5SDimitry Andric 19460b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 19470b57cec5SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 19480b57cec5SDimitry Andric MachineIRBuilder &B) const { 19490b57cec5SDimitry Andric const LLT S1 = LLT::scalar(1); 19500b57cec5SDimitry Andric const LLT S32 = LLT::scalar(32); 19510b57cec5SDimitry Andric const LLT S64 = LLT::scalar(64); 19520b57cec5SDimitry Andric 19530b57cec5SDimitry Andric Register Src = MI.getOperand(1).getReg(); 19540b57cec5SDimitry Andric assert(MRI.getType(Src) == S64); 19550b57cec5SDimitry Andric 19560b57cec5SDimitry Andric // TODO: Should this use extract since the low half is unused? 19570b57cec5SDimitry Andric auto Unmerge = B.buildUnmerge({S32, S32}, Src); 19580b57cec5SDimitry Andric Register Hi = Unmerge.getReg(1); 19590b57cec5SDimitry Andric 19600b57cec5SDimitry Andric // Extract the upper half, since this is where we will find the sign and 19610b57cec5SDimitry Andric // exponent. 19620b57cec5SDimitry Andric auto Exp = extractF64Exponent(Hi, B); 19630b57cec5SDimitry Andric 19640b57cec5SDimitry Andric const unsigned FractBits = 52; 19650b57cec5SDimitry Andric 19660b57cec5SDimitry Andric // Extract the sign bit. 19670b57cec5SDimitry Andric const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 19680b57cec5SDimitry Andric auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 19690b57cec5SDimitry Andric 19700b57cec5SDimitry Andric const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 19710b57cec5SDimitry Andric 19720b57cec5SDimitry Andric const auto Zero32 = B.buildConstant(S32, 0); 19730b57cec5SDimitry Andric 19740b57cec5SDimitry Andric // Extend back to 64-bits. 
  auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});

  // Shifting the fraction mask right by the exponent leaves 1-bits exactly in
  // the positions that hold fractional data; AND with the complement clears
  // them, truncating toward zero.
  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  // Exponent < 0: |x| < 1, the result is +/-0 (just the sign bit).
  // Exponent > 51: no fractional bits exist, x is already an integer.
  auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
  auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  return true;
}

// Expand 64-bit int -> f64 conversion (G_SITOFP / G_UITOFP): convert the two
// 32-bit halves separately and combine as convert(hi) * 2^32 + convert(lo).
// Only the high half carries the sign; the low half is always unsigned.
bool AMDGPULegalizerInfo::legalizeITOFP(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  auto CvtHi = Signed ?
    B.buildSITOFP(S64, Unmerge.getReg(1)) :
    B.buildUITOFP(S64, Unmerge.getReg(1));

  auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));

  // ldexp(hi, 32) scales the converted high half by 2^32 exactly.
  auto ThirtyTwo = B.buildConstant(S32, 32);
  auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
    .addUse(CvtHi.getReg(0))
    .addUse(ThirtyTwo.getReg(0));

  // TODO: Should this propagate fast-math-flags?
  B.buildFAdd(Dst, LdExp, CvtLo);
  MI.eraseFromParent();
  return true;
}

// TODO: Copied from DAG implementation. Verify logic and document how this
// actually works.
// Expand f64 -> 64-bit int conversion (G_FPTOSI / G_FPTOUI): split the
// truncated value into a high part (scaled by 2^-32 and floored) and a low
// remainder recovered with an FMA, then convert each to 32 bits and merge.
bool AMDGPULegalizerInfo::legalizeFPTOI(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B, bool Signed) const {

  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();

  const LLT S64 = LLT::scalar(64);
  const LLT S32 = LLT::scalar(32);

  assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);

  unsigned Flags = MI.getFlags();

  auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
  // K0 = 2^-32 (0x3df0...), K1 = -2^32 (0xc1f0...): Fma = Trunc - Hi * 2^32.
  auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
  auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));

  auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
  auto FloorMul = B.buildFFloor(S64, Mul, Flags);
  auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);

  // Only the high half is sign-aware; the low remainder is always unsigned.
  auto Hi = Signed ?
    B.buildFPTOSI(S32, FloorMul) :
    B.buildFPTOUI(S32, FloorMul);
  auto Lo = B.buildFPTOUI(S32, Fma);

  B.buildMerge(Dst, { Lo, Hi });
  MI.eraseFromParent();

  return true;
}

// Decide how to legalize G_FMINNUM/G_FMAXNUM (and their _IEEE variants)
// based on the function's ieee_mode: returning true keeps the instruction,
// false reports failure, and otherwise the generic lowering is used.
bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
                                               MachineInstr &MI) const {
  MachineFunction &MF = Helper.MIRBuilder.getMF();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

  // With ieee_mode disabled, the instructions have the correct behavior
  // already for G_FMINNUM/G_FMAXNUM
  if (!MFI->getMode().IEEE)
    return !IsIEEEOp;

  if (IsIEEEOp)
    return true;

  return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
}

// Legalize G_EXTRACT_VECTOR_ELT: a constant index becomes a plain G_EXTRACT
// at a static bit offset (or G_UNDEF when out of range); dynamic indices are
// left for register-indexing selection.
bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.
  // TODO: Promote dynamic indexing of s16 to s32

  // FIXME: Artifact combiner probably should have replaced the truncated
  // constant before this, so we shouldn't need
  // getConstantVRegValWithLookThrough.
  Optional<ValueAndVReg> MaybeIdxVal =
      getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
  if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
    return true;
  const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();

  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Dst));

  // In-bounds constant index: extract at a static bit offset. Out-of-bounds
  // extracts are undefined, so produce G_IMPLICIT_DEF.
  if (IdxVal < VecTy.getNumElements())
    B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits())
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

// Legalize G_INSERT_VECTOR_ELT: a constant index becomes a plain G_INSERT at
// a static bit offset (or G_UNDEF when out of range); dynamic indices are
// left for register-indexing selection.
bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  // TODO: Should move some of this into LegalizerHelper.

  // TODO: Promote dynamic indexing of s16 to s32

  // FIXME: Artifact combiner probably should have replaced the truncated
  // constant before this, so we shouldn't need
  // getConstantVRegValWithLookThrough.
  Optional<ValueAndVReg> MaybeIdxVal =
      getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
  if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
    return true;

  int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
  Register Dst = MI.getOperand(0).getReg();
  Register Vec = MI.getOperand(1).getReg();
  Register Ins = MI.getOperand(2).getReg();

  LLT VecTy = MRI.getType(Vec);
  LLT EltTy = VecTy.getElementType();
  assert(EltTy == MRI.getType(Ins));

  // In-bounds constant index: insert at a static bit offset. Out-of-bounds
  // inserts are undefined, so produce G_IMPLICIT_DEF.
  if (IdxVal < VecTy.getNumElements())
    B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
  else
    B.buildUndef(Dst);

  MI.eraseFromParent();
  return true;
}

// Legalize G_SHUFFLE_VECTOR: <2 x s16> shuffles whose mask is representable
// by VOP3P are kept as-is; everything else falls back to the generic
// element-wise lowering in LegalizerHelper.
bool AMDGPULegalizerInfo::legalizeShuffleVector(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  const LLT V2S16 = LLT::vector(2, 16);

  Register Dst = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src0);

  if (SrcTy == V2S16 && DstTy == V2S16 &&
      AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
    return true;

  MachineIRBuilder HelperBuilder(MI);
  GISelObserverWrapper DummyObserver;
  LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
  return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
}

// Legalize G_FSIN/G_FCOS: scale the input by 1/(2*pi) (the hardware
// sin/cos intrinsics take revolutions, not radians), optionally take the
// fractional part on subtargets with reduced trig range, then emit the
// amdgcn sin/cos intrinsic.
bool AMDGPULegalizerInfo::legalizeSinCos(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned Flags = MI.getFlags();

  Register TrigVal;
  auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
  if (ST.hasTrigReducedRange()) {
    // Pre-reduce the argument to [0, 1) with v_fract for targets whose
    // sin/cos only accept a reduced range.
    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
      .addUse(MulVal.getReg(0))
      .setMIFlags(Flags).getReg(0);
  } else
    TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
  Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
    Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
  B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
    .addUse(TrigVal)
    .setMIFlags(Flags);
  MI.eraseFromParent();
  return true;
}

// Materialize a pc-relative address for global value GV (+Offset) into
// DstReg using SI_PC_ADD_REL_OFFSET. For 32-bit pointer types the 64-bit
// pc-relative result is computed in a temporary and the low half extracted.
bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
                                                  MachineIRBuilder &B,
                                                  const GlobalValue *GV,
                                                  int64_t Offset,
                                                  unsigned GAFlags) const {
  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
  // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
  // to the following code sequence:
  //
  // For constant address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol
  //   s_addc_u32 s1, s1, 0
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   a fixup or relocation is emitted to replace $symbol with a literal
  //   constant, which is a pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // For global address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   fixups or relocations are emitted to replace $symbol@*@lo and
  //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
  //   which is a 64-bit pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // What we want here is an offset from the value returned by s_getpc
  // (which is the address of the s_add_u32 instruction) to the global
  // variable, but since the encoding of $symbol starts 4 bytes after the start
  // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
  // small. This requires us to add 4 to the global variable offset in order to
  // compute the correct address. Similarly for the s_addc_u32 instruction, the
  // encoding of $symbol starts 12 bytes after the start of the s_add_u32
  // instruction.

  LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);

  // The pseudo always produces a 64-bit value; for a 32-bit destination,
  // compute into a scratch 64-bit register and extract the low half below.
  Register PCReg = PtrTy.getSizeInBits() != 32 ?
    DstReg :
    B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

  MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
    .addDef(PCReg);

  MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
  if (GAFlags == SIInstrInfo::MO_NONE)
    MIB.addImm(0);
  else
    MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);

  B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

  if (PtrTy.getSizeInBits() == 32)
    B.buildExtract(DstReg, PCReg, 0);
  return true;
}

// Custom legalization for G_GLOBAL_VALUE. LDS/region globals are resolved to
// static offsets (or the dynamic-LDS group-static-size intrinsic); other
// address spaces use pc-relative addressing (continued past this chunk).
bool AMDGPULegalizerInfo::legalizeGlobalValue(
  MachineInstr &MI, MachineRegisterInfo &MRI,
  MachineIRBuilder &B) const {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  unsigned AS = Ty.getAddressSpace();

  const GlobalValue *GV = MI.getOperand(1).getGlobal();
  MachineFunction &MF = B.getMF();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
    if (!MFI->isModuleEntryFunction()) {
      const Function &Fn = MF.getFunction();
      DiagnosticInfoUnsupported BadLDSDecl(
        Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
        DS_Warning);
      Fn.getContext().diagnose(BadLDSDecl);

      // We currently don't have a way to correctly allocate LDS objects that
      // aren't directly associated with a kernel. We do force inlining of
      // functions that use local objects. However, if these dead functions are
      // not eliminated, we don't want a compile time error. Just emit a warning
      // and a trap, since there should be no callable path here.
      B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
      B.buildUndef(DstReg);
      MI.eraseFromParent();
      return true;
    }

    // TODO: We could emit code to handle the initialization somewhere.
    if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      if (!TLI->shouldUseLDSConstAddress(GV)) {
        MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
        return true; // Leave in place;
      }

      if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
        Type *Ty = GV->getValueType();
        // HIP uses an unsized array `extern __shared__ T s[]` or similar
        // zero-sized type in other languages to declare the dynamic shared
        // memory which size is not known at the compile time. They will be
        // allocated by the runtime and placed directly after the static
        // allocated ones. They all share the same offset.
        if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
          // Adjust alignment for that dynamic shared memory array.
          MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
          LLT S32 = LLT::scalar(32);
          // The dynamic-LDS base is the total static LDS size, queried at
          // runtime via amdgcn.groupstaticsize.
          auto Sz =
            B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
          B.buildIntToPtr(DstReg, Sz);
          MI.eraseFromParent();
          return true;
        }
      }

      B.buildConstant(
        DstReg,
        MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
      MI.eraseFromParent();
      return true;
    }

    const Function &Fn = MF.getFunction();
    DiagnosticInfoUnsupported BadInit(
      Fn, "unsupported initializer for address space", MI.getDebugLoc());
    Fn.getContext().diagnose(BadInit);
    return true;
  }

  const SITargetLowering *TLI = ST.getTargetLowering();

  if (TLI->shouldEmitFixup(GV)) {
    buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
    MI.eraseFromParent();
    return true;
  }
23288bcb0991SDimitry Andric if (TLI->shouldEmitPCReloc(GV)) { 23298bcb0991SDimitry Andric buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 23308bcb0991SDimitry Andric MI.eraseFromParent(); 23318bcb0991SDimitry Andric return true; 23328bcb0991SDimitry Andric } 23338bcb0991SDimitry Andric 23348bcb0991SDimitry Andric LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 23358bcb0991SDimitry Andric Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 23368bcb0991SDimitry Andric 23378bcb0991SDimitry Andric MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 23388bcb0991SDimitry Andric MachinePointerInfo::getGOT(MF), 23398bcb0991SDimitry Andric MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 23408bcb0991SDimitry Andric MachineMemOperand::MOInvariant, 23415ffd83dbSDimitry Andric 8 /*Size*/, Align(8)); 23428bcb0991SDimitry Andric 23438bcb0991SDimitry Andric buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 23448bcb0991SDimitry Andric 23458bcb0991SDimitry Andric if (Ty.getSizeInBits() == 32) { 23468bcb0991SDimitry Andric // Truncate if this is a 32-bit constant adrdess. 
23478bcb0991SDimitry Andric auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 23488bcb0991SDimitry Andric B.buildExtract(DstReg, Load, 0); 23498bcb0991SDimitry Andric } else 23508bcb0991SDimitry Andric B.buildLoad(DstReg, GOTAddr, *GOTMMO); 23518bcb0991SDimitry Andric 23528bcb0991SDimitry Andric MI.eraseFromParent(); 23538bcb0991SDimitry Andric return true; 23548bcb0991SDimitry Andric } 23558bcb0991SDimitry Andric 2356*e8d8bef9SDimitry Andric static LLT widenToNextPowerOf2(LLT Ty) { 2357*e8d8bef9SDimitry Andric if (Ty.isVector()) 2358*e8d8bef9SDimitry Andric return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements())); 2359*e8d8bef9SDimitry Andric return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); 2360*e8d8bef9SDimitry Andric } 2361*e8d8bef9SDimitry Andric 2362*e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, 2363*e8d8bef9SDimitry Andric MachineInstr &MI) const { 2364*e8d8bef9SDimitry Andric MachineIRBuilder &B = Helper.MIRBuilder; 2365*e8d8bef9SDimitry Andric MachineRegisterInfo &MRI = *B.getMRI(); 2366*e8d8bef9SDimitry Andric GISelChangeObserver &Observer = Helper.Observer; 2367*e8d8bef9SDimitry Andric 2368*e8d8bef9SDimitry Andric Register PtrReg = MI.getOperand(1).getReg(); 2369*e8d8bef9SDimitry Andric LLT PtrTy = MRI.getType(PtrReg); 2370*e8d8bef9SDimitry Andric unsigned AddrSpace = PtrTy.getAddressSpace(); 2371*e8d8bef9SDimitry Andric 2372*e8d8bef9SDimitry Andric if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 23738bcb0991SDimitry Andric LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 2374*e8d8bef9SDimitry Andric auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); 23758bcb0991SDimitry Andric Observer.changingInstr(MI); 23768bcb0991SDimitry Andric MI.getOperand(1).setReg(Cast.getReg(0)); 23778bcb0991SDimitry Andric Observer.changedInstr(MI); 23788bcb0991SDimitry Andric return true; 23798bcb0991SDimitry Andric } 23808bcb0991SDimitry Andric 2381*e8d8bef9SDimitry Andric Register ValReg = 
MI.getOperand(0).getReg(); 2382*e8d8bef9SDimitry Andric LLT ValTy = MRI.getType(ValReg); 2383*e8d8bef9SDimitry Andric 2384*e8d8bef9SDimitry Andric MachineMemOperand *MMO = *MI.memoperands_begin(); 2385*e8d8bef9SDimitry Andric const unsigned ValSize = ValTy.getSizeInBits(); 2386*e8d8bef9SDimitry Andric const unsigned MemSize = 8 * MMO->getSize(); 2387*e8d8bef9SDimitry Andric const Align MemAlign = MMO->getAlign(); 2388*e8d8bef9SDimitry Andric const unsigned AlignInBits = 8 * MemAlign.value(); 2389*e8d8bef9SDimitry Andric 2390*e8d8bef9SDimitry Andric // Widen non-power-of-2 loads to the alignment if needed 2391*e8d8bef9SDimitry Andric if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) { 2392*e8d8bef9SDimitry Andric const unsigned WideMemSize = PowerOf2Ceil(MemSize); 2393*e8d8bef9SDimitry Andric 2394*e8d8bef9SDimitry Andric // This was already the correct extending load result type, so just adjust 2395*e8d8bef9SDimitry Andric // the memory type. 2396*e8d8bef9SDimitry Andric if (WideMemSize == ValSize) { 2397*e8d8bef9SDimitry Andric MachineFunction &MF = B.getMF(); 2398*e8d8bef9SDimitry Andric 2399*e8d8bef9SDimitry Andric MachineMemOperand *WideMMO = 2400*e8d8bef9SDimitry Andric MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); 2401*e8d8bef9SDimitry Andric Observer.changingInstr(MI); 2402*e8d8bef9SDimitry Andric MI.setMemRefs(MF, {WideMMO}); 2403*e8d8bef9SDimitry Andric Observer.changedInstr(MI); 2404*e8d8bef9SDimitry Andric return true; 2405*e8d8bef9SDimitry Andric } 2406*e8d8bef9SDimitry Andric 2407*e8d8bef9SDimitry Andric // Don't bother handling edge case that should probably never be produced. 
2408*e8d8bef9SDimitry Andric if (ValSize > WideMemSize) 2409*e8d8bef9SDimitry Andric return false; 2410*e8d8bef9SDimitry Andric 2411*e8d8bef9SDimitry Andric LLT WideTy = widenToNextPowerOf2(ValTy); 2412*e8d8bef9SDimitry Andric 2413*e8d8bef9SDimitry Andric Register WideLoad; 2414*e8d8bef9SDimitry Andric if (!WideTy.isVector()) { 2415*e8d8bef9SDimitry Andric WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2416*e8d8bef9SDimitry Andric B.buildTrunc(ValReg, WideLoad).getReg(0); 2417*e8d8bef9SDimitry Andric } else { 2418*e8d8bef9SDimitry Andric // Extract the subvector. 2419*e8d8bef9SDimitry Andric 2420*e8d8bef9SDimitry Andric if (isRegisterType(ValTy)) { 2421*e8d8bef9SDimitry Andric // If this a case where G_EXTRACT is legal, use it. 2422*e8d8bef9SDimitry Andric // (e.g. <3 x s32> -> <4 x s32>) 2423*e8d8bef9SDimitry Andric WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); 2424*e8d8bef9SDimitry Andric B.buildExtract(ValReg, WideLoad, 0); 2425*e8d8bef9SDimitry Andric } else { 2426*e8d8bef9SDimitry Andric // For cases where the widened type isn't a nice register value, unmerge 2427*e8d8bef9SDimitry Andric // from a widened register (e.g. 
<3 x s16> -> <4 x s16>) 2428*e8d8bef9SDimitry Andric B.setInsertPt(B.getMBB(), ++B.getInsertPt()); 2429*e8d8bef9SDimitry Andric WideLoad = Helper.widenWithUnmerge(WideTy, ValReg); 2430*e8d8bef9SDimitry Andric B.setInsertPt(B.getMBB(), MI.getIterator()); 2431*e8d8bef9SDimitry Andric B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0); 2432*e8d8bef9SDimitry Andric } 2433*e8d8bef9SDimitry Andric } 2434*e8d8bef9SDimitry Andric 2435*e8d8bef9SDimitry Andric MI.eraseFromParent(); 2436*e8d8bef9SDimitry Andric return true; 2437*e8d8bef9SDimitry Andric } 2438*e8d8bef9SDimitry Andric 2439*e8d8bef9SDimitry Andric return false; 2440*e8d8bef9SDimitry Andric } 2441*e8d8bef9SDimitry Andric 24428bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFMad( 24438bcb0991SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 24448bcb0991SDimitry Andric MachineIRBuilder &B) const { 24458bcb0991SDimitry Andric LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 24468bcb0991SDimitry Andric assert(Ty.isScalar()); 24478bcb0991SDimitry Andric 2448480093f4SDimitry Andric MachineFunction &MF = B.getMF(); 2449480093f4SDimitry Andric const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2450480093f4SDimitry Andric 24518bcb0991SDimitry Andric // TODO: Always legal with future ftz flag. 24525ffd83dbSDimitry Andric // FIXME: Do we need just output? 
24535ffd83dbSDimitry Andric if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 24548bcb0991SDimitry Andric return true; 24555ffd83dbSDimitry Andric if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 24568bcb0991SDimitry Andric return true; 24578bcb0991SDimitry Andric 24588bcb0991SDimitry Andric MachineIRBuilder HelperBuilder(MI); 24598bcb0991SDimitry Andric GISelObserverWrapper DummyObserver; 24608bcb0991SDimitry Andric LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 24618bcb0991SDimitry Andric return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 24628bcb0991SDimitry Andric } 24638bcb0991SDimitry Andric 2464480093f4SDimitry Andric bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2465480093f4SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2466480093f4SDimitry Andric Register DstReg = MI.getOperand(0).getReg(); 2467480093f4SDimitry Andric Register PtrReg = MI.getOperand(1).getReg(); 2468480093f4SDimitry Andric Register CmpVal = MI.getOperand(2).getReg(); 2469480093f4SDimitry Andric Register NewVal = MI.getOperand(3).getReg(); 2470480093f4SDimitry Andric 2471*e8d8bef9SDimitry Andric assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && 2472480093f4SDimitry Andric "this should not have been custom lowered"); 2473480093f4SDimitry Andric 2474480093f4SDimitry Andric LLT ValTy = MRI.getType(CmpVal); 2475480093f4SDimitry Andric LLT VecTy = LLT::vector(2, ValTy); 2476480093f4SDimitry Andric 2477480093f4SDimitry Andric Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2478480093f4SDimitry Andric 2479480093f4SDimitry Andric B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2480480093f4SDimitry Andric .addDef(DstReg) 2481480093f4SDimitry Andric .addUse(PtrReg) 2482480093f4SDimitry Andric .addUse(PackedVal) 2483480093f4SDimitry Andric .setMemRefs(MI.memoperands()); 2484480093f4SDimitry Andric 2485480093f4SDimitry Andric MI.eraseFromParent(); 
2486480093f4SDimitry Andric return true; 2487480093f4SDimitry Andric } 2488480093f4SDimitry Andric 24895ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeFlog( 24905ffd83dbSDimitry Andric MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 24915ffd83dbSDimitry Andric Register Dst = MI.getOperand(0).getReg(); 24925ffd83dbSDimitry Andric Register Src = MI.getOperand(1).getReg(); 24935ffd83dbSDimitry Andric LLT Ty = B.getMRI()->getType(Dst); 24945ffd83dbSDimitry Andric unsigned Flags = MI.getFlags(); 24955ffd83dbSDimitry Andric 24965ffd83dbSDimitry Andric auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 24975ffd83dbSDimitry Andric auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 24985ffd83dbSDimitry Andric 24995ffd83dbSDimitry Andric B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 25005ffd83dbSDimitry Andric MI.eraseFromParent(); 25015ffd83dbSDimitry Andric return true; 25025ffd83dbSDimitry Andric } 25035ffd83dbSDimitry Andric 25045ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 25055ffd83dbSDimitry Andric MachineIRBuilder &B) const { 25065ffd83dbSDimitry Andric Register Dst = MI.getOperand(0).getReg(); 25075ffd83dbSDimitry Andric Register Src = MI.getOperand(1).getReg(); 25085ffd83dbSDimitry Andric unsigned Flags = MI.getFlags(); 25095ffd83dbSDimitry Andric LLT Ty = B.getMRI()->getType(Dst); 25105ffd83dbSDimitry Andric 25115ffd83dbSDimitry Andric auto K = B.buildFConstant(Ty, numbers::log2e); 25125ffd83dbSDimitry Andric auto Mul = B.buildFMul(Ty, Src, K, Flags); 25135ffd83dbSDimitry Andric B.buildFExp2(Dst, Mul, Flags); 25145ffd83dbSDimitry Andric MI.eraseFromParent(); 25155ffd83dbSDimitry Andric return true; 25165ffd83dbSDimitry Andric } 25175ffd83dbSDimitry Andric 25185ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 25195ffd83dbSDimitry Andric MachineIRBuilder &B) const { 25205ffd83dbSDimitry Andric Register Dst = 
MI.getOperand(0).getReg(); 25215ffd83dbSDimitry Andric Register Src0 = MI.getOperand(1).getReg(); 25225ffd83dbSDimitry Andric Register Src1 = MI.getOperand(2).getReg(); 25235ffd83dbSDimitry Andric unsigned Flags = MI.getFlags(); 25245ffd83dbSDimitry Andric LLT Ty = B.getMRI()->getType(Dst); 25255ffd83dbSDimitry Andric const LLT S16 = LLT::scalar(16); 25265ffd83dbSDimitry Andric const LLT S32 = LLT::scalar(32); 25275ffd83dbSDimitry Andric 25285ffd83dbSDimitry Andric if (Ty == S32) { 25295ffd83dbSDimitry Andric auto Log = B.buildFLog2(S32, Src0, Flags); 25305ffd83dbSDimitry Andric auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 25315ffd83dbSDimitry Andric .addUse(Log.getReg(0)) 25325ffd83dbSDimitry Andric .addUse(Src1) 25335ffd83dbSDimitry Andric .setMIFlags(Flags); 25345ffd83dbSDimitry Andric B.buildFExp2(Dst, Mul, Flags); 25355ffd83dbSDimitry Andric } else if (Ty == S16) { 25365ffd83dbSDimitry Andric // There's no f16 fmul_legacy, so we need to convert for it. 25375ffd83dbSDimitry Andric auto Log = B.buildFLog2(S16, Src0, Flags); 25385ffd83dbSDimitry Andric auto Ext0 = B.buildFPExt(S32, Log, Flags); 25395ffd83dbSDimitry Andric auto Ext1 = B.buildFPExt(S32, Src1, Flags); 25405ffd83dbSDimitry Andric auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 25415ffd83dbSDimitry Andric .addUse(Ext0.getReg(0)) 25425ffd83dbSDimitry Andric .addUse(Ext1.getReg(0)) 25435ffd83dbSDimitry Andric .setMIFlags(Flags); 25445ffd83dbSDimitry Andric 25455ffd83dbSDimitry Andric B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 25465ffd83dbSDimitry Andric } else 25475ffd83dbSDimitry Andric return false; 25485ffd83dbSDimitry Andric 25495ffd83dbSDimitry Andric MI.eraseFromParent(); 25505ffd83dbSDimitry Andric return true; 25515ffd83dbSDimitry Andric } 25525ffd83dbSDimitry Andric 25535ffd83dbSDimitry Andric // Find a source register, ignoring any possible source modifiers. 
25545ffd83dbSDimitry Andric static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 25555ffd83dbSDimitry Andric Register ModSrc = OrigSrc; 25565ffd83dbSDimitry Andric if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 25575ffd83dbSDimitry Andric ModSrc = SrcFNeg->getOperand(1).getReg(); 25585ffd83dbSDimitry Andric if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 25595ffd83dbSDimitry Andric ModSrc = SrcFAbs->getOperand(1).getReg(); 25605ffd83dbSDimitry Andric } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 25615ffd83dbSDimitry Andric ModSrc = SrcFAbs->getOperand(1).getReg(); 25625ffd83dbSDimitry Andric return ModSrc; 25635ffd83dbSDimitry Andric } 25645ffd83dbSDimitry Andric 25655ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 25665ffd83dbSDimitry Andric MachineRegisterInfo &MRI, 25675ffd83dbSDimitry Andric MachineIRBuilder &B) const { 25685ffd83dbSDimitry Andric 25695ffd83dbSDimitry Andric const LLT S1 = LLT::scalar(1); 25705ffd83dbSDimitry Andric const LLT S64 = LLT::scalar(64); 25715ffd83dbSDimitry Andric Register Dst = MI.getOperand(0).getReg(); 25725ffd83dbSDimitry Andric Register OrigSrc = MI.getOperand(1).getReg(); 25735ffd83dbSDimitry Andric unsigned Flags = MI.getFlags(); 25745ffd83dbSDimitry Andric assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 25755ffd83dbSDimitry Andric "this should not have been custom lowered"); 25765ffd83dbSDimitry Andric 25775ffd83dbSDimitry Andric // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 25785ffd83dbSDimitry Andric // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 25795ffd83dbSDimitry Andric // efficient way to implement it is using V_FRACT_F64. The workaround for the 25805ffd83dbSDimitry Andric // V_FRACT bug is: 25815ffd83dbSDimitry Andric // fract(x) = isnan(x) ? 
x : min(V_FRACT(x), 0.99999999999999999) 25825ffd83dbSDimitry Andric // 25835ffd83dbSDimitry Andric // Convert floor(x) to (x - fract(x)) 25845ffd83dbSDimitry Andric 25855ffd83dbSDimitry Andric auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 25865ffd83dbSDimitry Andric .addUse(OrigSrc) 25875ffd83dbSDimitry Andric .setMIFlags(Flags); 25885ffd83dbSDimitry Andric 25895ffd83dbSDimitry Andric // Give source modifier matching some assistance before obscuring a foldable 25905ffd83dbSDimitry Andric // pattern. 25915ffd83dbSDimitry Andric 25925ffd83dbSDimitry Andric // TODO: We can avoid the neg on the fract? The input sign to fract 25935ffd83dbSDimitry Andric // shouldn't matter? 25945ffd83dbSDimitry Andric Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 25955ffd83dbSDimitry Andric 25965ffd83dbSDimitry Andric auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 25975ffd83dbSDimitry Andric 25985ffd83dbSDimitry Andric Register Min = MRI.createGenericVirtualRegister(S64); 25995ffd83dbSDimitry Andric 26005ffd83dbSDimitry Andric // We don't need to concern ourselves with the snan handling difference, so 26015ffd83dbSDimitry Andric // use the one which will directly select. 
26025ffd83dbSDimitry Andric const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 26035ffd83dbSDimitry Andric if (MFI->getMode().IEEE) 26045ffd83dbSDimitry Andric B.buildFMinNumIEEE(Min, Fract, Const, Flags); 26055ffd83dbSDimitry Andric else 26065ffd83dbSDimitry Andric B.buildFMinNum(Min, Fract, Const, Flags); 26075ffd83dbSDimitry Andric 26085ffd83dbSDimitry Andric Register CorrectedFract = Min; 26095ffd83dbSDimitry Andric if (!MI.getFlag(MachineInstr::FmNoNans)) { 26105ffd83dbSDimitry Andric auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); 26115ffd83dbSDimitry Andric CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 26125ffd83dbSDimitry Andric } 26135ffd83dbSDimitry Andric 26145ffd83dbSDimitry Andric auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 26155ffd83dbSDimitry Andric B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 26165ffd83dbSDimitry Andric 26175ffd83dbSDimitry Andric MI.eraseFromParent(); 26185ffd83dbSDimitry Andric return true; 26195ffd83dbSDimitry Andric } 26205ffd83dbSDimitry Andric 26215ffd83dbSDimitry Andric // Turn an illegal packed v2s16 build vector into bit operations. 26225ffd83dbSDimitry Andric // TODO: This should probably be a bitcast action in LegalizerHelper. 
26235ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeBuildVector( 26245ffd83dbSDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 26255ffd83dbSDimitry Andric Register Dst = MI.getOperand(0).getReg(); 26265ffd83dbSDimitry Andric const LLT S32 = LLT::scalar(32); 26275ffd83dbSDimitry Andric assert(MRI.getType(Dst) == LLT::vector(2, 16)); 26285ffd83dbSDimitry Andric 26295ffd83dbSDimitry Andric Register Src0 = MI.getOperand(1).getReg(); 26305ffd83dbSDimitry Andric Register Src1 = MI.getOperand(2).getReg(); 26315ffd83dbSDimitry Andric assert(MRI.getType(Src0) == LLT::scalar(16)); 26325ffd83dbSDimitry Andric 26335ffd83dbSDimitry Andric auto Merge = B.buildMerge(S32, {Src0, Src1}); 26345ffd83dbSDimitry Andric B.buildBitcast(Dst, Merge); 26355ffd83dbSDimitry Andric 26365ffd83dbSDimitry Andric MI.eraseFromParent(); 26375ffd83dbSDimitry Andric return true; 26385ffd83dbSDimitry Andric } 26395ffd83dbSDimitry Andric 2640*e8d8bef9SDimitry Andric // Check that this is a G_XOR x, -1 2641*e8d8bef9SDimitry Andric static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { 2642*e8d8bef9SDimitry Andric if (MI.getOpcode() != TargetOpcode::G_XOR) 2643*e8d8bef9SDimitry Andric return false; 2644*e8d8bef9SDimitry Andric auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); 2645*e8d8bef9SDimitry Andric return ConstVal && *ConstVal == -1; 2646*e8d8bef9SDimitry Andric } 2647*e8d8bef9SDimitry Andric 26480b57cec5SDimitry Andric // Return the use branch instruction, otherwise null if the usage is invalid. 
2649*e8d8bef9SDimitry Andric static MachineInstr * 2650*e8d8bef9SDimitry Andric verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, 2651*e8d8bef9SDimitry Andric MachineBasicBlock *&UncondBrTarget, bool &Negated) { 26520b57cec5SDimitry Andric Register CondDef = MI.getOperand(0).getReg(); 26530b57cec5SDimitry Andric if (!MRI.hasOneNonDBGUse(CondDef)) 26540b57cec5SDimitry Andric return nullptr; 26550b57cec5SDimitry Andric 26565ffd83dbSDimitry Andric MachineBasicBlock *Parent = MI.getParent(); 2657*e8d8bef9SDimitry Andric MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); 2658*e8d8bef9SDimitry Andric 2659*e8d8bef9SDimitry Andric if (isNot(MRI, *UseMI)) { 2660*e8d8bef9SDimitry Andric Register NegatedCond = UseMI->getOperand(0).getReg(); 2661*e8d8bef9SDimitry Andric if (!MRI.hasOneNonDBGUse(NegatedCond)) 2662*e8d8bef9SDimitry Andric return nullptr; 2663*e8d8bef9SDimitry Andric 2664*e8d8bef9SDimitry Andric // We're deleting the def of this value, so we need to remove it. 2665*e8d8bef9SDimitry Andric UseMI->eraseFromParent(); 2666*e8d8bef9SDimitry Andric 2667*e8d8bef9SDimitry Andric UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); 2668*e8d8bef9SDimitry Andric Negated = true; 2669*e8d8bef9SDimitry Andric } 2670*e8d8bef9SDimitry Andric 2671*e8d8bef9SDimitry Andric if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) 2672480093f4SDimitry Andric return nullptr; 2673480093f4SDimitry Andric 26745ffd83dbSDimitry Andric // Make sure the cond br is followed by a G_BR, or is the last instruction. 2675*e8d8bef9SDimitry Andric MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); 26765ffd83dbSDimitry Andric if (Next == Parent->end()) { 26775ffd83dbSDimitry Andric MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 26785ffd83dbSDimitry Andric if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 
26795ffd83dbSDimitry Andric return nullptr; 26805ffd83dbSDimitry Andric UncondBrTarget = &*NextMBB; 26815ffd83dbSDimitry Andric } else { 2682480093f4SDimitry Andric if (Next->getOpcode() != AMDGPU::G_BR) 2683480093f4SDimitry Andric return nullptr; 2684480093f4SDimitry Andric Br = &*Next; 26855ffd83dbSDimitry Andric UncondBrTarget = Br->getOperand(0).getMBB(); 2686480093f4SDimitry Andric } 2687480093f4SDimitry Andric 2688*e8d8bef9SDimitry Andric return UseMI; 26890b57cec5SDimitry Andric } 26900b57cec5SDimitry Andric 26910b57cec5SDimitry Andric bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 2692*e8d8bef9SDimitry Andric const ArgDescriptor *Arg, 2693*e8d8bef9SDimitry Andric const TargetRegisterClass *ArgRC, 2694*e8d8bef9SDimitry Andric LLT ArgTy) const { 2695*e8d8bef9SDimitry Andric MCRegister SrcReg = Arg->getRegister(); 2696*e8d8bef9SDimitry Andric assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); 26975ffd83dbSDimitry Andric assert(DstReg.isVirtual() && "Virtual register expected"); 26980b57cec5SDimitry Andric 2699*e8d8bef9SDimitry Andric Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC, 2700*e8d8bef9SDimitry Andric ArgTy); 27010b57cec5SDimitry Andric if (Arg->isMasked()) { 27020b57cec5SDimitry Andric // TODO: Should we try to emit this once in the entry block? 
27030b57cec5SDimitry Andric const LLT S32 = LLT::scalar(32); 27040b57cec5SDimitry Andric const unsigned Mask = Arg->getMask(); 27050b57cec5SDimitry Andric const unsigned Shift = countTrailingZeros<unsigned>(Mask); 27060b57cec5SDimitry Andric 27078bcb0991SDimitry Andric Register AndMaskSrc = LiveIn; 27088bcb0991SDimitry Andric 27098bcb0991SDimitry Andric if (Shift != 0) { 27100b57cec5SDimitry Andric auto ShiftAmt = B.buildConstant(S32, Shift); 27118bcb0991SDimitry Andric AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 27128bcb0991SDimitry Andric } 27138bcb0991SDimitry Andric 27148bcb0991SDimitry Andric B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 27155ffd83dbSDimitry Andric } else { 27160b57cec5SDimitry Andric B.buildCopy(DstReg, LiveIn); 27170b57cec5SDimitry Andric } 27180b57cec5SDimitry Andric 27190b57cec5SDimitry Andric return true; 27200b57cec5SDimitry Andric } 27210b57cec5SDimitry Andric 2722*e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::loadInputValue( 2723*e8d8bef9SDimitry Andric Register DstReg, MachineIRBuilder &B, 2724*e8d8bef9SDimitry Andric AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2725*e8d8bef9SDimitry Andric const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2726*e8d8bef9SDimitry Andric const ArgDescriptor *Arg; 2727*e8d8bef9SDimitry Andric const TargetRegisterClass *ArgRC; 2728*e8d8bef9SDimitry Andric LLT ArgTy; 2729*e8d8bef9SDimitry Andric std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); 2730*e8d8bef9SDimitry Andric 2731*e8d8bef9SDimitry Andric if (!Arg->isRegister() || !Arg->getRegister().isValid()) 2732*e8d8bef9SDimitry Andric return false; // TODO: Handle these 2733*e8d8bef9SDimitry Andric return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); 2734*e8d8bef9SDimitry Andric } 2735*e8d8bef9SDimitry Andric 27360b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 27375ffd83dbSDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 
MachineIRBuilder &B, 27380b57cec5SDimitry Andric AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2739*e8d8bef9SDimitry Andric if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) 27405ffd83dbSDimitry Andric return false; 27415ffd83dbSDimitry Andric 27420b57cec5SDimitry Andric MI.eraseFromParent(); 27430b57cec5SDimitry Andric return true; 27440b57cec5SDimitry Andric } 27450b57cec5SDimitry Andric 27468bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 27478bcb0991SDimitry Andric MachineRegisterInfo &MRI, 27488bcb0991SDimitry Andric MachineIRBuilder &B) const { 2749480093f4SDimitry Andric Register Dst = MI.getOperand(0).getReg(); 2750480093f4SDimitry Andric LLT DstTy = MRI.getType(Dst); 2751480093f4SDimitry Andric LLT S16 = LLT::scalar(16); 2752480093f4SDimitry Andric LLT S32 = LLT::scalar(32); 2753480093f4SDimitry Andric LLT S64 = LLT::scalar(64); 27548bcb0991SDimitry Andric 2755480093f4SDimitry Andric if (DstTy == S16) 2756480093f4SDimitry Andric return legalizeFDIV16(MI, MRI, B); 2757480093f4SDimitry Andric if (DstTy == S32) 2758480093f4SDimitry Andric return legalizeFDIV32(MI, MRI, B); 2759480093f4SDimitry Andric if (DstTy == S64) 2760480093f4SDimitry Andric return legalizeFDIV64(MI, MRI, B); 2761480093f4SDimitry Andric 27628bcb0991SDimitry Andric return false; 27638bcb0991SDimitry Andric } 27648bcb0991SDimitry Andric 27655ffd83dbSDimitry Andric void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 27665ffd83dbSDimitry Andric Register DstReg, 27675ffd83dbSDimitry Andric Register X, 27685ffd83dbSDimitry Andric Register Y, 27695ffd83dbSDimitry Andric bool IsDiv) const { 27705ffd83dbSDimitry Andric const LLT S1 = LLT::scalar(1); 27715ffd83dbSDimitry Andric const LLT S32 = LLT::scalar(32); 27725ffd83dbSDimitry Andric 27735ffd83dbSDimitry Andric // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 27745ffd83dbSDimitry Andric // algorithm used here. 
27755ffd83dbSDimitry Andric 27765ffd83dbSDimitry Andric // Initial estimate of inv(y). 27775ffd83dbSDimitry Andric auto FloatY = B.buildUITOFP(S32, Y); 27785ffd83dbSDimitry Andric auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 27795ffd83dbSDimitry Andric auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 27805ffd83dbSDimitry Andric auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 27815ffd83dbSDimitry Andric auto Z = B.buildFPTOUI(S32, ScaledY); 27825ffd83dbSDimitry Andric 27835ffd83dbSDimitry Andric // One round of UNR. 27845ffd83dbSDimitry Andric auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 27855ffd83dbSDimitry Andric auto NegYZ = B.buildMul(S32, NegY, Z); 27865ffd83dbSDimitry Andric Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 27875ffd83dbSDimitry Andric 27885ffd83dbSDimitry Andric // Quotient/remainder estimate. 27895ffd83dbSDimitry Andric auto Q = B.buildUMulH(S32, X, Z); 27905ffd83dbSDimitry Andric auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 27915ffd83dbSDimitry Andric 27925ffd83dbSDimitry Andric // First quotient/remainder refinement. 27935ffd83dbSDimitry Andric auto One = B.buildConstant(S32, 1); 27945ffd83dbSDimitry Andric auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 27955ffd83dbSDimitry Andric if (IsDiv) 27965ffd83dbSDimitry Andric Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 27975ffd83dbSDimitry Andric R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 27985ffd83dbSDimitry Andric 27995ffd83dbSDimitry Andric // Second quotient/remainder refinement. 
28005ffd83dbSDimitry Andric Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 28015ffd83dbSDimitry Andric if (IsDiv) 28025ffd83dbSDimitry Andric B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q); 28035ffd83dbSDimitry Andric else 28045ffd83dbSDimitry Andric B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R); 28055ffd83dbSDimitry Andric } 28065ffd83dbSDimitry Andric 28075ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI, 28085ffd83dbSDimitry Andric MachineRegisterInfo &MRI, 28095ffd83dbSDimitry Andric MachineIRBuilder &B) const { 28105ffd83dbSDimitry Andric const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 28115ffd83dbSDimitry Andric Register DstReg = MI.getOperand(0).getReg(); 28125ffd83dbSDimitry Andric Register Num = MI.getOperand(1).getReg(); 28135ffd83dbSDimitry Andric Register Den = MI.getOperand(2).getReg(); 28145ffd83dbSDimitry Andric legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 28155ffd83dbSDimitry Andric MI.eraseFromParent(); 28165ffd83dbSDimitry Andric return true; 28175ffd83dbSDimitry Andric } 28185ffd83dbSDimitry Andric 28195ffd83dbSDimitry Andric // Build integer reciprocal sequence arounud V_RCP_IFLAG_F32 28205ffd83dbSDimitry Andric // 28215ffd83dbSDimitry Andric // Return lo, hi of result 28225ffd83dbSDimitry Andric // 28235ffd83dbSDimitry Andric // %cvt.lo = G_UITOFP Val.lo 28245ffd83dbSDimitry Andric // %cvt.hi = G_UITOFP Val.hi 28255ffd83dbSDimitry Andric // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 28265ffd83dbSDimitry Andric // %rcp = G_AMDGPU_RCP_IFLAG %mad 28275ffd83dbSDimitry Andric // %mul1 = G_FMUL %rcp, 0x5f7ffffc 28285ffd83dbSDimitry Andric // %mul2 = G_FMUL %mul1, 2**(-32) 28295ffd83dbSDimitry Andric // %trunc = G_INTRINSIC_TRUNC %mul2 28305ffd83dbSDimitry Andric // %mad2 = G_FMAD %trunc, -(2**32), %mul1 28315ffd83dbSDimitry Andric // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 28325ffd83dbSDimitry Andric static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 
// Build integer reciprocal sequence around V_RCP_IFLAG_F32, operating on the
// two 32-bit halves of the 64-bit input. Returns {lo, hi} of the result.
// The hex immediates below are IEEE single-precision bit patterns.
static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
                                                       Register Val) {
  const LLT S32 = LLT::scalar(32);
  auto Unmerge = B.buildUnmerge(S32, Val);

  // Convert each 32-bit half to float separately.
  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));

  // Recombine: hi * 2**32 + lo (0x4f800000 == 2**32 as a float).
  auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
                         B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);

  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
  // Scale the reciprocal up (0x5f7ffffc is presumably just below 2**64 —
  // NOTE(review): confirm against the scalar expansion this mirrors).
  auto Mul1 =
      B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));

  // 2**(-32)
  auto Mul2 =
      B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
  auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);

  // -(2**32): subtract the high part back out to isolate the low part.
  auto Mad2 = B.buildFMAD(S32, Trunc,
                          B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);

  auto ResultLo = B.buildFPTOUI(S32, Mad2);
  auto ResultHi = B.buildFPTOUI(S32, Trunc);

  return {ResultLo.getReg(0), ResultHi.getReg(0)};
}
// Expand 64-bit unsigned division/remainder without a hardware divide:
// compute an approximate reciprocal of Denom (emitReciprocalU64), refine it
// with two correction rounds done on 32-bit halves with explicit carry
// chains, form the quotient estimate as umulh(Numer, reciprocal), then apply
// up to two conditional +1/-Denom corrections (conditions C3 and C6) using
// selects. Writes the quotient (IsDiv) or remainder into DstReg.
void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
                                                  Register DstReg,
                                                  Register Numer,
                                                  Register Denom,
                                                  bool IsDiv) const {
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S1 = LLT::scalar(1);
  Register RcpLo, RcpHi;

  std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);

  auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});

  auto Zero64 = B.buildConstant(S64, 0);
  auto NegDenom = B.buildSub(S64, Zero64, Denom);

  // First refinement round: error term = umulh(Rcp, Rcp * -Denom).
  auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
  auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);

  auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
  Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
  Register MulHi1_Hi = UnmergeMulHi1.getReg(1);

  // 64-bit add done as 32-bit halves with an explicit carry; Add1_HiNc is the
  // no-carry-in variant of the high half, used by the second round below.
  auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
  auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
  auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
  auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});

  // Second refinement round, same shape as the first.
  auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
  auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
  auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
  Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
  Register MulHi2_Hi = UnmergeMulHi2.getReg(1);

  auto Zero32 = B.buildConstant(S32, 0);
  auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
  auto Add2_HiC =
      B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
  auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
  auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});

  auto UnmergeNumer = B.buildUnmerge(S32, Numer);
  Register NumerLo = UnmergeNumer.getReg(0);
  Register NumerHi = UnmergeNumer.getReg(1);

  // Quotient estimate and corresponding remainder estimate.
  auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
  auto Mul3 = B.buildMul(S64, Denom, MulHi3);
  auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
  Register Mul3_Lo = UnmergeMul3.getReg(0);
  Register Mul3_Hi = UnmergeMul3.getReg(1);
  auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
  auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
  auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
  auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});

  auto UnmergeDenom = B.buildUnmerge(S32, Denom);
  Register DenomLo = UnmergeDenom.getReg(0);
  Register DenomHi = UnmergeDenom.getReg(1);

  // C3 = -1 iff remainder estimate >= Denom (64-bit compare built from the
  // 32-bit halves: use the low compare when the high halves are equal).
  auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
  auto C1 = B.buildSExt(S32, CmpHi);

  auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
  auto C2 = B.buildSExt(S32, CmpLo);

  auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
  auto C3 = B.buildSelect(S32, CmpEq, C2, C1);

  // TODO: Here and below portions of the code can be enclosed into if/endif.
  // Currently control flow is unconditional and we have 4 selects after
  // potential endif to substitute PHIs.

  // if C3 != 0 ...
  auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
  auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
  auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
  auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});

  auto One64 = B.buildConstant(S64, 1);
  auto Add3 = B.buildAdd(S64, MulHi3, One64);

  // Second correction condition C6, same 64-bit compare construction as C3.
  auto C4 =
      B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
  auto C5 =
      B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
  auto C6 = B.buildSelect(
      S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);

  // if (C6 != 0)
  auto Add4 = B.buildAdd(S64, Add3, One64);
  auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);

  auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
  auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
  auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});

  // endif C6
  // endif C3

  // Pick the correctly-corrected quotient (Add4/Add3/MulHi3) or remainder
  // (Sub3/Sub2/Sub1) based on the two correction conditions.
  if (IsDiv) {
    auto Sel1 = B.buildSelect(
        S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
    B.buildSelect(DstReg,
                  B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
  } else {
    auto Sel2 = B.buildSelect(
        S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
    B.buildSelect(DstReg,
                  B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
  }
}
C6 29605ffd83dbSDimitry Andric // endif C3 29615ffd83dbSDimitry Andric 29625ffd83dbSDimitry Andric if (IsDiv) { 29635ffd83dbSDimitry Andric auto Sel1 = B.buildSelect( 29645ffd83dbSDimitry Andric S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 29655ffd83dbSDimitry Andric B.buildSelect(DstReg, 29665ffd83dbSDimitry Andric B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3); 29675ffd83dbSDimitry Andric } else { 29685ffd83dbSDimitry Andric auto Sel2 = B.buildSelect( 29695ffd83dbSDimitry Andric S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 29705ffd83dbSDimitry Andric B.buildSelect(DstReg, 29715ffd83dbSDimitry Andric B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1); 29725ffd83dbSDimitry Andric } 29735ffd83dbSDimitry Andric } 29745ffd83dbSDimitry Andric 29755ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI, 29765ffd83dbSDimitry Andric MachineRegisterInfo &MRI, 29775ffd83dbSDimitry Andric MachineIRBuilder &B) const { 29785ffd83dbSDimitry Andric const LLT S64 = LLT::scalar(64); 29795ffd83dbSDimitry Andric const LLT S32 = LLT::scalar(32); 29805ffd83dbSDimitry Andric const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 29815ffd83dbSDimitry Andric Register DstReg = MI.getOperand(0).getReg(); 29825ffd83dbSDimitry Andric Register Num = MI.getOperand(1).getReg(); 29835ffd83dbSDimitry Andric Register Den = MI.getOperand(2).getReg(); 29845ffd83dbSDimitry Andric LLT Ty = MRI.getType(DstReg); 29855ffd83dbSDimitry Andric 29865ffd83dbSDimitry Andric if (Ty == S32) 29875ffd83dbSDimitry Andric legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 29885ffd83dbSDimitry Andric else if (Ty == S64) 29895ffd83dbSDimitry Andric legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv); 29905ffd83dbSDimitry Andric else 29915ffd83dbSDimitry Andric return false; 29925ffd83dbSDimitry Andric 29935ffd83dbSDimitry Andric MI.eraseFromParent(); 29945ffd83dbSDimitry Andric return true; 29955ffd83dbSDimitry Andric 
29965ffd83dbSDimitry Andric } 29975ffd83dbSDimitry Andric 29985ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, 29995ffd83dbSDimitry Andric MachineRegisterInfo &MRI, 30005ffd83dbSDimitry Andric MachineIRBuilder &B) const { 30015ffd83dbSDimitry Andric const LLT S64 = LLT::scalar(64); 30025ffd83dbSDimitry Andric const LLT S32 = LLT::scalar(32); 30035ffd83dbSDimitry Andric 30045ffd83dbSDimitry Andric Register DstReg = MI.getOperand(0).getReg(); 30055ffd83dbSDimitry Andric const LLT Ty = MRI.getType(DstReg); 30065ffd83dbSDimitry Andric if (Ty != S32 && Ty != S64) 30075ffd83dbSDimitry Andric return false; 30085ffd83dbSDimitry Andric 30095ffd83dbSDimitry Andric const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV; 30105ffd83dbSDimitry Andric 30115ffd83dbSDimitry Andric Register LHS = MI.getOperand(1).getReg(); 30125ffd83dbSDimitry Andric Register RHS = MI.getOperand(2).getReg(); 30135ffd83dbSDimitry Andric 30145ffd83dbSDimitry Andric auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 30155ffd83dbSDimitry Andric auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 30165ffd83dbSDimitry Andric auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 30175ffd83dbSDimitry Andric 30185ffd83dbSDimitry Andric LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 30195ffd83dbSDimitry Andric RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 30205ffd83dbSDimitry Andric 30215ffd83dbSDimitry Andric LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 30225ffd83dbSDimitry Andric RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 30235ffd83dbSDimitry Andric 30245ffd83dbSDimitry Andric Register UDivRem = MRI.createGenericVirtualRegister(Ty); 30255ffd83dbSDimitry Andric if (Ty == S32) 30265ffd83dbSDimitry Andric legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv); 30275ffd83dbSDimitry Andric else 30285ffd83dbSDimitry Andric legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv); 30295ffd83dbSDimitry Andric 30305ffd83dbSDimitry Andric Register Sign; 30315ffd83dbSDimitry 
Andric if (IsDiv) 30325ffd83dbSDimitry Andric Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 30335ffd83dbSDimitry Andric else 30345ffd83dbSDimitry Andric Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 30355ffd83dbSDimitry Andric 30365ffd83dbSDimitry Andric UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0); 30375ffd83dbSDimitry Andric B.buildSub(DstReg, UDivRem, Sign); 30385ffd83dbSDimitry Andric 30395ffd83dbSDimitry Andric MI.eraseFromParent(); 30405ffd83dbSDimitry Andric return true; 30415ffd83dbSDimitry Andric } 30425ffd83dbSDimitry Andric 30438bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 30448bcb0991SDimitry Andric MachineRegisterInfo &MRI, 30458bcb0991SDimitry Andric MachineIRBuilder &B) const { 30468bcb0991SDimitry Andric Register Res = MI.getOperand(0).getReg(); 30478bcb0991SDimitry Andric Register LHS = MI.getOperand(1).getReg(); 30488bcb0991SDimitry Andric Register RHS = MI.getOperand(2).getReg(); 30498bcb0991SDimitry Andric uint16_t Flags = MI.getFlags(); 30508bcb0991SDimitry Andric LLT ResTy = MRI.getType(Res); 30518bcb0991SDimitry Andric 30528bcb0991SDimitry Andric const MachineFunction &MF = B.getMF(); 3053*e8d8bef9SDimitry Andric bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || 3054*e8d8bef9SDimitry Andric MI.getFlag(MachineInstr::FmAfn); 30558bcb0991SDimitry Andric 3056*e8d8bef9SDimitry Andric if (!AllowInaccurateRcp) 30578bcb0991SDimitry Andric return false; 30588bcb0991SDimitry Andric 30598bcb0991SDimitry Andric if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 30608bcb0991SDimitry Andric // 1 / x -> RCP(x) 30618bcb0991SDimitry Andric if (CLHS->isExactlyValue(1.0)) { 30628bcb0991SDimitry Andric B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 30638bcb0991SDimitry Andric .addUse(RHS) 30648bcb0991SDimitry Andric .setMIFlags(Flags); 30658bcb0991SDimitry Andric 30668bcb0991SDimitry Andric MI.eraseFromParent(); 30678bcb0991SDimitry Andric return true; 30688bcb0991SDimitry 
// Fast 64-bit fdiv lowering: reciprocal plus two FMA refinement iterations,
// then a final FMA-based quotient correction. Only used when unsafe-fp-math
// or the afn flag allows the inaccurate rcp; otherwise returns false so the
// caller emits the precise sequence.
bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
                                                   MachineRegisterInfo &MRI,
                                                   MachineIRBuilder &B) const {
  Register Res = MI.getOperand(0).getReg();
  Register X = MI.getOperand(1).getReg();
  Register Y = MI.getOperand(2).getReg();
  uint16_t Flags = MI.getFlags();
  LLT ResTy = MRI.getType(Res);

  const MachineFunction &MF = B.getMF();
  bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
                            MI.getFlag(MachineInstr::FmAfn);

  if (!AllowInaccurateRcp)
    return false;

  auto NegY = B.buildFNeg(ResTy, Y);
  auto One = B.buildFConstant(ResTy, 1.0);

  // Initial approximation R ~= 1/Y.
  auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
    .addUse(Y)
    .setMIFlags(Flags);

  // Two refinement iterations: R = R + R * (1 - Y*R).
  auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp0, R, R);

  auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp1, R, R);

  // Quotient estimate and residual-based correction:
  // Ret = X*R; Res = Ret + R * (X - Y*Ret).
  auto Ret = B.buildFMul(ResTy, X, R);
  auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);

  B.buildFMA(Res, Tmp2, R, Ret);
  MI.eraseFromParent();
  return true;
}
// Lower 16-bit fdiv: try the fast unsafe path first; otherwise promote both
// operands to f32, take the reciprocal, multiply, truncate back to f16, and
// fix up the edge cases with amdgcn.div.fixup.
bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  if (legalizeFastUnsafeFDIV(MI, MRI, B))
    return true;

  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S16 = LLT::scalar(16);
  LLT S32 = LLT::scalar(32);

  // Do the arithmetic in f32 for extra precision.
  auto LHSExt = B.buildFPExt(S32, LHS, Flags);
  auto RHSExt = B.buildFPExt(S32, RHS, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
    .addUse(RHSExt.getReg(0))
    .setMIFlags(Flags);

  auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
  auto RDst = B.buildFPTrunc(S16, QUOT, Flags);

  // div.fixup takes (quotient estimate, denominator, numerator).
  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
    .addUse(RDst.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}
// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
// to enable denorm mode. When 'Enable' is false, disable denorm mode.
static void toggleSPDenormMode(bool Enable,
                               MachineIRBuilder &B,
                               const GCNSubtarget &ST,
                               AMDGPU::SIModeRegisterDefaults Mode) {
  // Set SP denorm mode to this value.
  unsigned SPDenormMode =
    Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();

  if (ST.hasDenormModeInst()) {
    // Preserve default FP64FP16 denorm mode while updating FP32 mode.
    uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();

    // S_DENORM_MODE immediate packs SP in the low 2 bits and DP above it.
    uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
    B.buildInstr(AMDGPU::S_DENORM_MODE)
      .addImm(NewDenormModeValue);

  } else {
    // Older targets: write the FP32 denorm bits via S_SETREG.
    // Select FP32 bit field in mode register (offset 4, width 2).
    unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
                                    (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
                                    (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);

    B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
      .addImm(SPDenormMode)
      .addImm(SPDenormModeBitField);
  }
}
// Precise 32-bit fdiv lowering: div.scale both operands, refine a reciprocal
// with an FMA chain (with FP32 denorm mode forced on around it when the
// function's mode flushes denorms), then div.fmas + div.fixup produce the
// final result.
bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  if (legalizeFastUnsafeFDIV(MI, MRI, B))
    return true;

  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();

  uint16_t Flags = MI.getFlags();

  LLT S32 = LLT::scalar(32);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S32, 1.0f);

  // div.scale with the immediate selecting which operand gets scaled
  // (0 = denominator, 1 = numerator); the S1 result feeds div.fmas.
  auto DenominatorScaled =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
      .addUse(LHS)
      .addUse(RHS)
      .addImm(0)
      .setMIFlags(Flags);
  auto NumeratorScaled =
    B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
      .addUse(LHS)
      .addUse(RHS)
      .addImm(1)
      .setMIFlags(Flags);

  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
                     .addUse(DenominatorScaled.getReg(0))
                     .setMIFlags(Flags);
  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);

  // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
  // aren't modeled as reading it.
  if (!Mode.allFP32Denormals())
    toggleSPDenormMode(true, B, ST, Mode);

  // Reciprocal refinement and quotient estimate with residual corrections.
  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

  if (!Mode.allFP32Denormals())
    toggleSPDenormMode(false, B, ST, Mode);

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
    .addUse(Fma4.getReg(0))
    .addUse(Fma1.getReg(0))
    .addUse(Fma3.getReg(0))
    .addUse(NumeratorScaled.getReg(1))
    .setMIFlags(Flags);

  // div.fixup takes (quotient estimate, denominator, numerator).
  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
    .addUse(Fmas.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}
// Precise 64-bit fdiv lowering, mirroring legalizeFDIV32: div.scale both
// operands, refine a reciprocal with an FMA chain, then div.fmas + div.fixup.
// Includes an SI workaround where div.scale's condition output is rebuilt
// from operand comparisons.
bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B) const {
  if (legalizeFastUnsafeFDIV64(MI, MRI, B))
    return true;

  Register Res = MI.getOperand(0).getReg();
  Register LHS = MI.getOperand(1).getReg();
  Register RHS = MI.getOperand(2).getReg();

  uint16_t Flags = MI.getFlags();

  LLT S64 = LLT::scalar(64);
  LLT S1 = LLT::scalar(1);

  auto One = B.buildFConstant(S64, 1.0);

  // Immediate selects which operand is scaled (0 = denominator).
  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
    .addUse(LHS)
    .addUse(RHS)
    .addImm(0)
    .setMIFlags(Flags);

  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);

  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
    .addUse(DivScale0.getReg(0))
    .setMIFlags(Flags);

  // Reciprocal refinement via FMA error terms.
  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);

  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
    .addUse(LHS)
    .addUse(RHS)
    .addImm(1)
    .setMIFlags(Flags);

  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);

  Register Scale;
  if (!ST.hasUsableDivScaleConditionOutput()) {
    // Workaround a hardware bug on SI where the condition output from div_scale
    // is not usable.

    LLT S32 = LLT::scalar(32);

    // Reconstruct the scale condition by comparing the high halves of the
    // operands against the high halves of the scaled values.
    auto NumUnmerge = B.buildUnmerge(S32, LHS);
    auto DenUnmerge = B.buildUnmerge(S32, RHS);
    auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
    auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);

    auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
                              Scale1Unmerge.getReg(1));
    auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
                              Scale0Unmerge.getReg(1));
    Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
  } else {
    Scale = DivScale1.getReg(1);
  }

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
    .addUse(Fma4.getReg(0))
    .addUse(Fma3.getReg(0))
    .addUse(Mul.getReg(0))
    .addUse(Scale)
    .setMIFlags(Flags);

  // div.fixup takes (quotient estimate, denominator, numerator).
  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
    .addUse(Fmas.getReg(0))
    .addUse(RHS)
    .addUse(LHS)
    .setMIFlags(Flags);

  MI.eraseFromParent();
  return true;
}
3337480093f4SDimitry Andric .addUse(LHS) 3338480093f4SDimitry Andric .setMIFlags(Flags); 3339480093f4SDimitry Andric 3340480093f4SDimitry Andric MI.eraseFromParent(); 3341480093f4SDimitry Andric return true; 3342480093f4SDimitry Andric } 3343480093f4SDimitry Andric 33448bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 33458bcb0991SDimitry Andric MachineRegisterInfo &MRI, 33468bcb0991SDimitry Andric MachineIRBuilder &B) const { 33478bcb0991SDimitry Andric Register Res = MI.getOperand(0).getReg(); 33488bcb0991SDimitry Andric Register LHS = MI.getOperand(2).getReg(); 33498bcb0991SDimitry Andric Register RHS = MI.getOperand(3).getReg(); 33508bcb0991SDimitry Andric uint16_t Flags = MI.getFlags(); 33518bcb0991SDimitry Andric 33528bcb0991SDimitry Andric LLT S32 = LLT::scalar(32); 33538bcb0991SDimitry Andric LLT S1 = LLT::scalar(1); 33548bcb0991SDimitry Andric 33558bcb0991SDimitry Andric auto Abs = B.buildFAbs(S32, RHS, Flags); 33568bcb0991SDimitry Andric const APFloat C0Val(1.0f); 33578bcb0991SDimitry Andric 33588bcb0991SDimitry Andric auto C0 = B.buildConstant(S32, 0x6f800000); 33598bcb0991SDimitry Andric auto C1 = B.buildConstant(S32, 0x2f800000); 33608bcb0991SDimitry Andric auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 33618bcb0991SDimitry Andric 33628bcb0991SDimitry Andric auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 33638bcb0991SDimitry Andric auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 33648bcb0991SDimitry Andric 33658bcb0991SDimitry Andric auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 33668bcb0991SDimitry Andric 33678bcb0991SDimitry Andric auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 33688bcb0991SDimitry Andric .addUse(Mul0.getReg(0)) 33698bcb0991SDimitry Andric .setMIFlags(Flags); 33708bcb0991SDimitry Andric 33718bcb0991SDimitry Andric auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 33728bcb0991SDimitry Andric 33738bcb0991SDimitry Andric B.buildFMul(Res, Sel, Mul1, 
Flags); 33748bcb0991SDimitry Andric 33758bcb0991SDimitry Andric MI.eraseFromParent(); 33768bcb0991SDimitry Andric return true; 33778bcb0991SDimitry Andric } 33788bcb0991SDimitry Andric 3379*e8d8bef9SDimitry Andric // Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. 3380*e8d8bef9SDimitry Andric // FIXME: Why do we handle this one but not other removed instructions? 3381*e8d8bef9SDimitry Andric // 3382*e8d8bef9SDimitry Andric // Reciprocal square root. The clamp prevents infinite results, clamping 3383*e8d8bef9SDimitry Andric // infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to 3384*e8d8bef9SDimitry Andric // +-max_float. 3385*e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, 3386*e8d8bef9SDimitry Andric MachineRegisterInfo &MRI, 3387*e8d8bef9SDimitry Andric MachineIRBuilder &B) const { 3388*e8d8bef9SDimitry Andric if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) 3389*e8d8bef9SDimitry Andric return true; 3390*e8d8bef9SDimitry Andric 3391*e8d8bef9SDimitry Andric Register Dst = MI.getOperand(0).getReg(); 3392*e8d8bef9SDimitry Andric Register Src = MI.getOperand(2).getReg(); 3393*e8d8bef9SDimitry Andric auto Flags = MI.getFlags(); 3394*e8d8bef9SDimitry Andric 3395*e8d8bef9SDimitry Andric LLT Ty = MRI.getType(Dst); 3396*e8d8bef9SDimitry Andric 3397*e8d8bef9SDimitry Andric const fltSemantics *FltSemantics; 3398*e8d8bef9SDimitry Andric if (Ty == LLT::scalar(32)) 3399*e8d8bef9SDimitry Andric FltSemantics = &APFloat::IEEEsingle(); 3400*e8d8bef9SDimitry Andric else if (Ty == LLT::scalar(64)) 3401*e8d8bef9SDimitry Andric FltSemantics = &APFloat::IEEEdouble(); 3402*e8d8bef9SDimitry Andric else 3403*e8d8bef9SDimitry Andric return false; 3404*e8d8bef9SDimitry Andric 3405*e8d8bef9SDimitry Andric auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) 3406*e8d8bef9SDimitry Andric .addUse(Src) 3407*e8d8bef9SDimitry Andric .setMIFlags(Flags); 3408*e8d8bef9SDimitry Andric 
3409*e8d8bef9SDimitry Andric // We don't need to concern ourselves with the snan handling difference, since 3410*e8d8bef9SDimitry Andric // the rsq quieted (or not) so use the one which will directly select. 3411*e8d8bef9SDimitry Andric const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 3412*e8d8bef9SDimitry Andric const bool UseIEEE = MFI->getMode().IEEE; 3413*e8d8bef9SDimitry Andric 3414*e8d8bef9SDimitry Andric auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); 3415*e8d8bef9SDimitry Andric auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : 3416*e8d8bef9SDimitry Andric B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); 3417*e8d8bef9SDimitry Andric 3418*e8d8bef9SDimitry Andric auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); 3419*e8d8bef9SDimitry Andric 3420*e8d8bef9SDimitry Andric if (UseIEEE) 3421*e8d8bef9SDimitry Andric B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); 3422*e8d8bef9SDimitry Andric else 3423*e8d8bef9SDimitry Andric B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); 3424*e8d8bef9SDimitry Andric MI.eraseFromParent(); 3425*e8d8bef9SDimitry Andric return true; 3426*e8d8bef9SDimitry Andric } 3427*e8d8bef9SDimitry Andric 3428*e8d8bef9SDimitry Andric static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { 3429*e8d8bef9SDimitry Andric switch (IID) { 3430*e8d8bef9SDimitry Andric case Intrinsic::amdgcn_ds_fadd: 3431*e8d8bef9SDimitry Andric return AMDGPU::G_ATOMICRMW_FADD; 3432*e8d8bef9SDimitry Andric case Intrinsic::amdgcn_ds_fmin: 3433*e8d8bef9SDimitry Andric return AMDGPU::G_AMDGPU_ATOMIC_FMIN; 3434*e8d8bef9SDimitry Andric case Intrinsic::amdgcn_ds_fmax: 3435*e8d8bef9SDimitry Andric return AMDGPU::G_AMDGPU_ATOMIC_FMAX; 3436*e8d8bef9SDimitry Andric default: 3437*e8d8bef9SDimitry Andric llvm_unreachable("not a DS FP intrinsic"); 3438*e8d8bef9SDimitry Andric } 3439*e8d8bef9SDimitry Andric } 3440*e8d8bef9SDimitry Andric 3441*e8d8bef9SDimitry Andric bool 
AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
                                                 MachineInstr &MI,
                                                 Intrinsic::ID IID) const {
  // Rewrite the intrinsic call in place into the corresponding generic atomic
  // opcode, notifying the observer since operands and the descriptor change.
  GISelChangeObserver &Observer = Helper.Observer;
  Observer.changingInstr(MI);

  MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));

  // The remaining operands were used to set fields in the MemOperand on
  // construction.
  for (int I = 6; I > 3; --I)
    MI.RemoveOperand(I);

  MI.RemoveOperand(1); // Remove the intrinsic ID.
  Observer.changedInstr(MI);
  return true;
}

/// Materialize into \p DstReg the address of the implicit kernel argument
/// area: the kernarg segment pointer plus the implicit parameter offset.
/// Returns false if the kernarg segment pointer could not be loaded.
bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
                                            MachineRegisterInfo &MRI,
                                            MachineIRBuilder &B) const {
  uint64_t Offset =
    ST.getTargetLowering()->getImplicitParameterOffset(
      B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
  LLT DstTy = MRI.getType(DstReg);
  // Offset type is an integer of the same width as the pointer.
  LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());

  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
  if (!loadInputValue(KernargPtrReg, B,
                      AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
    return false;

  // FIXME: This should be nuw
  B.buildPtrAdd(DstReg, KernargPtrReg,
                B.buildConstant(IdxTy, Offset).getReg(0));
  return true;
}

/// Legalize the implicit-arg-pointer intrinsic.  In a non-entry function the
/// pointer arrives as a preloaded argument; in an entry function it is
/// computed from the kernarg segment pointer.
bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
                                                 MachineRegisterInfo &MRI,
                                                 MachineIRBuilder &B) const {
  const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
  if (!MFI->isEntryFunction()) {
    return legalizePreloadedArgIntrin(MI, MRI, B,
                                      AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
  }

  Register DstReg = MI.getOperand(0).getReg();
  if (!getImplicitArgPtr(DstReg, MRI, B))
    return false;

  MI.eraseFromParent();
  return true;
}

/// Lower an address-space membership test: a flat pointer is in \p AddrSpace
/// iff the high 32 bits of the pointer equal that segment's aperture.
bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              unsigned AddrSpace) const {
  Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
  // Split the 64-bit pointer; compare only its high half.
  auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
  Register Hi32 = Unmerge.getReg(1);

  B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
  MI.eraseFromParent();
  return true;
}

// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
// offset (the offset that is included in bounds checking and swizzling, to be
// split between the instruction's voffset and immoffset fields) and soffset
// (the offset that is excluded from bounds checking and swizzling, to go in
// the instruction's soffset field). This function takes the first kind of
// offset and figures out how to split it between voffset and immoffset.
// Returns (voffset base register, immoffset, total constant offset folded).
std::tuple<Register, unsigned, unsigned>
AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
                                        Register OrigOffset) const {
  // Largest value the 12-bit immoffset field can hold.
  const unsigned MaxImm = 4095;
  Register BaseReg;
  unsigned TotalConstOffset;
  const LLT S32 = LLT::scalar(32);

  std::tie(BaseReg, TotalConstOffset) =
      AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);

  unsigned ImmOffset = TotalConstOffset;

  // If the immediate value is too big for the immoffset field, put the value
  // and -4096 into the immoffset field so that the value that is copied/added
  // for the voffset field is a multiple of 4096, and it stands more chance
  // of being CSEd with the copy/add for another similar load/store.
  // However, do not do that rounding down to a multiple of 4096 if that is a
  // negative number, as it appears to be illegal to have a negative offset
  // in the vgpr, even if adding the immediate offset makes it positive.
  unsigned Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    Overflow += ImmOffset;
    ImmOffset = 0;
  }

  // Fold any overflow into the variable (voffset) part.
  if (Overflow != 0) {
    if (!BaseReg) {
      BaseReg = B.buildConstant(S32, Overflow).getReg(0);
    } else {
      auto OverflowVal = B.buildConstant(S32, Overflow);
      BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
    }
  }

  // The instruction always needs a voffset register, even if zero.
  if (!BaseReg)
    BaseReg = B.buildConstant(S32, 0).getReg(0);

  return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
}

/// Handle register layout difference for f16 images for some subtargets.
Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
                                             MachineRegisterInfo &MRI,
                                             Register Reg,
                                             bool ImageStore) const {
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  LLT StoreVT = MRI.getType(Reg);
  assert(StoreVT.isVector() && StoreVT.getElementType() == S16);

  // Unpacked layout: each s16 element occupies its own dword.
  if (ST.hasUnpackedD16VMem()) {
    auto Unmerge = B.buildUnmerge(S16, Reg);

    SmallVector<Register, 4> WideRegs;
    for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
      WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));

    int NumElts = StoreVT.getNumElements();

    return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
  }

  // Subtargets with the image-store D16 bug need the data widened to a
  // dword-element vector (padding with undef) before the store.
  if (ImageStore && ST.hasImageStoreD16Bug()) {
    if (StoreVT.getNumElements() == 2) {
      SmallVector<Register, 4> PackedRegs;
      Reg = B.buildBitcast(S32, Reg).getReg(0);
      PackedRegs.push_back(Reg);
      PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
      return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0);
    }

    if (StoreVT.getNumElements() == 3) {
      SmallVector<Register, 4> PackedRegs;
      auto Unmerge = B.buildUnmerge(S16, Reg);
      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
        PackedRegs.push_back(Unmerge.getReg(I));
      // Pad <3 x s16> to <6 x s16> so it bitcasts cleanly to <3 x s32>.
      PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
      Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0);
      return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0);
    }

    if (StoreVT.getNumElements() == 4) {
      SmallVector<Register, 4> PackedRegs;
      Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0);
      auto Unmerge = B.buildUnmerge(S32, Reg);
      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
        PackedRegs.push_back(Unmerge.getReg(I));
      PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
      return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0);
    }

    llvm_unreachable("invalid data type");
  }

  return Reg;
}

/// Widen illegal narrow buffer-store sources: s8/s16 scalars are any-extended
/// to s32, and small s16 vectors are repacked for D16 format stores.
Register AMDGPULegalizerInfo::fixStoreSourceType(
  MachineIRBuilder &B, Register VData, bool IsFormat) const {
  MachineRegisterInfo *MRI = B.getMRI();
  LLT Ty = MRI->getType(VData);

  const LLT S16 = LLT::scalar(16);

  // Fixup illegal register types (s8/s16 scalars) for stores.
  if (Ty == LLT::scalar(8) || Ty == S16) {
    Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
    return AnyExt;
  }

  if (Ty.isVector()) {
    if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
      if (IsFormat)
        return handleD16VData(B, *MRI, VData);
    }
  }

  return VData;
}

/// Lower a raw/struct buffer store intrinsic (optionally typed or format)
/// into the corresponding G_AMDGPU_BUFFER_STORE* pseudo, splitting the
/// offset operand between voffset and the immediate offset field.
bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              MachineIRBuilder &B,
                                              bool IsTyped,
                                              bool IsFormat) const {
  Register VData = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(VData);
  LLT EltTy = Ty.getScalarType();
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const LLT S32 = LLT::scalar(32);

  VData = fixStoreSourceType(B, VData, IsFormat);
  Register RSrc = MI.getOperand(2).getReg();

  MachineMemOperand *MMO = *MI.memoperands_begin();
  const int MemSize = MMO->getSize();

  unsigned ImmOffset;
  unsigned TotalOffset;

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(3).getReg();
    OpOffset = 1;
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();

  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
  if (TotalOffset != 0)
    MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);

  unsigned Opc;
  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
  } else {
    // Non-format stores select by memory size (sub-dword get byte/short).
    switch (MemSize) {
    case 1:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
      break;
    case 2:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
      break;
    }
  }

  if (!VIndex)
    VIndex = B.buildConstant(S32, 0).getReg(0);

  auto MIB = B.buildInstr(Opc)
    .addUse(VData)              // vdata
    .addUse(RSrc)               // rsrc
    .addUse(VIndex)             // vindex
    .addUse(VOffset)            // voffset
    .addUse(SOffset)            // soffset
    .addImm(ImmOffset);         // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}

/// Lower a raw/struct buffer load intrinsic (optionally typed or format)
/// into the corresponding G_AMDGPU_BUFFER_LOAD* pseudo.  Sub-dword and
/// unpacked-D16 results are loaded into a wider register and then
/// truncated/repacked to the original result type.
bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
                                             MachineRegisterInfo &MRI,
                                             MachineIRBuilder &B,
                                             bool IsFormat,
                                             bool IsTyped) const {
  // FIXME: Verifier should enforce 1 MMO for these intrinsics.
  MachineMemOperand *MMO = *MI.memoperands_begin();
  const int MemSize = MMO->getSize();
  const LLT S32 = LLT::scalar(32);

  Register Dst = MI.getOperand(0).getReg();
  Register RSrc = MI.getOperand(2).getReg();

  // The typed intrinsics add an immediate after the registers.
  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  int OpOffset = 0;
  if (HasVIndex) {
    VIndex = MI.getOperand(3).getReg();
    OpOffset = 1;
  }

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

  unsigned Format = 0;
  if (IsTyped) {
    Format = MI.getOperand(5 + OpOffset).getImm();
    ++OpOffset;
  }

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
  unsigned ImmOffset;
  unsigned TotalOffset;

  LLT Ty = MRI.getType(Dst);
  LLT EltTy = Ty.getScalarType();
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const bool Unpacked = ST.hasUnpackedD16VMem();

  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
  if (TotalOffset != 0)
    MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);

  unsigned Opc;

  if (IsTyped) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
                  AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
  } else {
    // Non-format loads select by memory size (sub-dword get ubyte/ushort).
    switch (MemSize) {
    case 1:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
      break;
    case 2:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
      break;
    default:
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
      break;
    }
  }

  Register LoadDstReg;

  bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
  LLT UnpackedTy = Ty.changeElementSize(32);

  if (IsExtLoad)
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
  else if (Unpacked && IsD16 && Ty.isVector())
    LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
  else
    LoadDstReg = Dst;

  if (!VIndex)
    VIndex = B.buildConstant(S32, 0).getReg(0);

  auto MIB = B.buildInstr(Opc)
    .addDef(LoadDstReg)         // vdata
    .addUse(RSrc)               // rsrc
    .addUse(VIndex)             // vindex
    .addUse(VOffset)            // voffset
    .addUse(SOffset)            // soffset
    .addImm(ImmOffset);         // offset(imm)

  if (IsTyped)
    MIB.addImm(Format);

  MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  if (LoadDstReg != Dst) {
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());

    // Truncate the widened result back down for an extending load.
    if (IsExtLoad)
      B.buildTrunc(Dst, LoadDstReg);
    else {
      // Repack to original 16-bit vector result
      // FIXME: G_TRUNC should work, but legalization currently fails
      auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
      SmallVector<Register, 4> Repack;
      for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
        Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
      B.buildMerge(Dst, Repack);
    }
  }

  MI.eraseFromParent();
  return true;
}

/// Lower an atomic-inc/dec intrinsic to its dedicated generic opcode,
/// carrying over the memory operands.
bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
                                               MachineIRBuilder &B,
                                               bool IsInc) const {
  unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
                         AMDGPU::G_AMDGPU_ATOMIC_DEC;
  B.buildInstr(Opc)
    .addDef(MI.getOperand(0).getReg())
    .addUse(MI.getOperand(2).getReg())
    .addUse(MI.getOperand(3).getReg())
    .cloneMemRefs(MI);
  MI.eraseFromParent();
  return true;
}

/// Map a raw/struct buffer atomic intrinsic ID to the buffer-atomic pseudo
/// opcode it lowers to.
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
  default:
    llvm_unreachable("unhandled atomic opcode");
  }
}

/// Lower a raw/struct buffer atomic intrinsic into its buffer-atomic pseudo,
/// handling cmpswap's extra compare operand, the optional struct vindex, and
/// the no-return form some FP atomics use.
bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
                                               MachineIRBuilder &B,
                                               Intrinsic::ID IID) const {
  const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
                         IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
  const bool HasReturn = MI.getNumExplicitDefs() != 0;

  Register Dst;

  // When there is no def, every source operand index shifts down by one.
  int OpOffset = 0;
  if (HasReturn) {
    // A few FP atomics do not support return values.
    Dst = MI.getOperand(0).getReg();
  } else {
    OpOffset = -1;
  }

  Register VData = MI.getOperand(2 + OpOffset).getReg();
  Register CmpVal;

  if (IsCmpSwap) {
    CmpVal = MI.getOperand(3 + OpOffset).getReg();
    ++OpOffset;
  }

  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
  const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
  Register VIndex;
  if (HasVIndex) {
    VIndex = MI.getOperand(4 + OpOffset).getReg();
    ++OpOffset;
  }

  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();

  MachineMemOperand *MMO = *MI.memoperands_begin();

  // Split the offset between the voffset register and the immediate field.
  unsigned ImmOffset;
  unsigned TotalOffset;
  std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
  if (TotalOffset != 0)
    MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());

  if (!VIndex)
    VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);

  auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));

  if (HasReturn)
    MIB.addDef(Dst);

  MIB.addUse(VData); // vdata

  if (IsCmpSwap)
    MIB.addReg(CmpVal);

  MIB.addUse(RSrc)               // rsrc
     .addUse(VIndex)             // vindex
     .addUse(VOffset)            // voffset
     .addUse(SOffset)            // soffset
     .addImm(ImmOffset)          // offset(imm)
     .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
     .addImm(HasVIndex ? -1 : 0) // idxen(imm)
     .addMemOperand(MMO);

  MI.eraseFromParent();
  return true;
}

/// Pack the s16 typed address operands of image intrinsic \p MI into dword
/// sized <2 x s16> registers, appending them to \p PackedAddrs.
static void packImageA16AddressToDwords(
    MachineIRBuilder &B, MachineInstr &MI,
    SmallVectorImpl<Register> &PackedAddrs, unsigned ArgOffset,
    const AMDGPU::ImageDimIntrinsicInfo *Intr, unsigned EndIdx) {
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::vector(2, 16);

  for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
    MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
    if (!SrcOp.isReg())
      continue; // _L to _LZ may have eliminated this.

    Register AddrReg = SrcOp.getReg();

    if (I < Intr->GradientStart) {
      // Pre-gradient operands are already <2 x s16>-sized; just bitcast.
      AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
      PackedAddrs.push_back(AddrReg);
    } else {
      // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
      // derivatives dx/dh and dx/dv are packed with undef.
      if (((I + 1) >= EndIdx) ||
          ((Intr->NumGradients / 2) % 2 == 1 &&
           (I == static_cast<unsigned>(Intr->GradientStart +
                                       (Intr->NumGradients / 2) - 1) ||
            I == static_cast<unsigned>(Intr->GradientStart +
                                       Intr->NumGradients - 1))) ||
          // Check for _L to _LZ optimization
          !MI.getOperand(ArgOffset + I + 1).isReg()) {
        PackedAddrs.push_back(
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                .getReg(0));
      } else {
        // Pack this operand with the next one and skip it.
        PackedAddrs.push_back(
            B.buildBuildVector(
                 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
                .getReg(0));
        ++I;
      }
    }
  }
}

/// Convert from separate vaddr components to a single vector address register,
/// and replace the remaining operands with $noreg.
40295ffd83dbSDimitry Andric static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, 40305ffd83dbSDimitry Andric int DimIdx, int NumVAddrs) { 40315ffd83dbSDimitry Andric const LLT S32 = LLT::scalar(32); 40325ffd83dbSDimitry Andric 40335ffd83dbSDimitry Andric SmallVector<Register, 8> AddrRegs; 40345ffd83dbSDimitry Andric for (int I = 0; I != NumVAddrs; ++I) { 40355ffd83dbSDimitry Andric MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 40365ffd83dbSDimitry Andric if (SrcOp.isReg()) { 40375ffd83dbSDimitry Andric AddrRegs.push_back(SrcOp.getReg()); 40385ffd83dbSDimitry Andric assert(B.getMRI()->getType(SrcOp.getReg()) == S32); 40395ffd83dbSDimitry Andric } 40405ffd83dbSDimitry Andric } 40415ffd83dbSDimitry Andric 40425ffd83dbSDimitry Andric int NumAddrRegs = AddrRegs.size(); 40435ffd83dbSDimitry Andric if (NumAddrRegs != 1) { 40445ffd83dbSDimitry Andric // Round up to 8 elements for v5-v7 40455ffd83dbSDimitry Andric // FIXME: Missing intermediate sized register classes and instructions. 
40465ffd83dbSDimitry Andric if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) { 40475ffd83dbSDimitry Andric const int RoundedNumRegs = NextPowerOf2(NumAddrRegs); 40485ffd83dbSDimitry Andric auto Undef = B.buildUndef(S32); 40495ffd83dbSDimitry Andric AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0)); 40505ffd83dbSDimitry Andric NumAddrRegs = RoundedNumRegs; 40515ffd83dbSDimitry Andric } 40525ffd83dbSDimitry Andric 40535ffd83dbSDimitry Andric auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs); 40545ffd83dbSDimitry Andric MI.getOperand(DimIdx).setReg(VAddr.getReg(0)); 40555ffd83dbSDimitry Andric } 40565ffd83dbSDimitry Andric 40575ffd83dbSDimitry Andric for (int I = 1; I != NumVAddrs; ++I) { 40585ffd83dbSDimitry Andric MachineOperand &SrcOp = MI.getOperand(DimIdx + I); 40595ffd83dbSDimitry Andric if (SrcOp.isReg()) 40605ffd83dbSDimitry Andric MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister); 40615ffd83dbSDimitry Andric } 40625ffd83dbSDimitry Andric } 40635ffd83dbSDimitry Andric 40645ffd83dbSDimitry Andric /// Rewrite image intrinsics to use register layouts expected by the subtarget. 40655ffd83dbSDimitry Andric /// 40665ffd83dbSDimitry Andric /// Depending on the subtarget, load/store with 16-bit element data need to be 40675ffd83dbSDimitry Andric /// rewritten to use the low half of 32-bit registers, or directly use a packed 40685ffd83dbSDimitry Andric /// layout. 16-bit addresses should also sometimes be packed into 32-bit 40695ffd83dbSDimitry Andric /// registers. 40705ffd83dbSDimitry Andric /// 40715ffd83dbSDimitry Andric /// We don't want to directly select image instructions just yet, but also want 40725ffd83dbSDimitry Andric /// to exposes all register repacking to the legalizer/combiners. We also don't 40735ffd83dbSDimitry Andric /// want a selected instrution entering RegBankSelect. 
In order to avoid 40745ffd83dbSDimitry Andric /// defining a multitude of intermediate image instructions, directly hack on 40755ffd83dbSDimitry Andric /// the intrinsic's arguments. In cases like a16 addreses, this requires padding 40765ffd83dbSDimitry Andric /// now unnecessary arguments with $noreg. 40775ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeImageIntrinsic( 4078*e8d8bef9SDimitry Andric MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, 4079*e8d8bef9SDimitry Andric const AMDGPU::ImageDimIntrinsicInfo *Intr) const { 40805ffd83dbSDimitry Andric 4081*e8d8bef9SDimitry Andric const unsigned NumDefs = MI.getNumExplicitDefs(); 4082*e8d8bef9SDimitry Andric const unsigned ArgOffset = NumDefs + 1; 40835ffd83dbSDimitry Andric bool IsTFE = NumDefs == 2; 40845ffd83dbSDimitry Andric // We are only processing the operands of d16 image operations on subtargets 40855ffd83dbSDimitry Andric // that use the unpacked register layout, or need to repack the TFE result. 40865ffd83dbSDimitry Andric 40875ffd83dbSDimitry Andric // TODO: Do we need to guard against already legalized intrinsics? 40885ffd83dbSDimitry Andric const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = 4089*e8d8bef9SDimitry Andric AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); 40905ffd83dbSDimitry Andric 40915ffd83dbSDimitry Andric MachineRegisterInfo *MRI = B.getMRI(); 40925ffd83dbSDimitry Andric const LLT S32 = LLT::scalar(32); 40935ffd83dbSDimitry Andric const LLT S16 = LLT::scalar(16); 40945ffd83dbSDimitry Andric const LLT V2S16 = LLT::vector(2, 16); 40955ffd83dbSDimitry Andric 40965ffd83dbSDimitry Andric unsigned DMask = 0; 40975ffd83dbSDimitry Andric 40985ffd83dbSDimitry Andric // Check for 16 bit addresses and pack if true. 
4099*e8d8bef9SDimitry Andric LLT GradTy = 4100*e8d8bef9SDimitry Andric MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); 4101*e8d8bef9SDimitry Andric LLT AddrTy = 4102*e8d8bef9SDimitry Andric MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); 41035ffd83dbSDimitry Andric const bool IsG16 = GradTy == S16; 41045ffd83dbSDimitry Andric const bool IsA16 = AddrTy == S16; 41055ffd83dbSDimitry Andric 41065ffd83dbSDimitry Andric int DMaskLanes = 0; 41075ffd83dbSDimitry Andric if (!BaseOpcode->Atomic) { 4108*e8d8bef9SDimitry Andric DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); 41095ffd83dbSDimitry Andric if (BaseOpcode->Gather4) { 41105ffd83dbSDimitry Andric DMaskLanes = 4; 41115ffd83dbSDimitry Andric } else if (DMask != 0) { 41125ffd83dbSDimitry Andric DMaskLanes = countPopulation(DMask); 41135ffd83dbSDimitry Andric } else if (!IsTFE && !BaseOpcode->Store) { 41145ffd83dbSDimitry Andric // If dmask is 0, this is a no-op load. This can be eliminated. 41155ffd83dbSDimitry Andric B.buildUndef(MI.getOperand(0)); 41165ffd83dbSDimitry Andric MI.eraseFromParent(); 41175ffd83dbSDimitry Andric return true; 41185ffd83dbSDimitry Andric } 41195ffd83dbSDimitry Andric } 41205ffd83dbSDimitry Andric 41215ffd83dbSDimitry Andric Observer.changingInstr(MI); 41225ffd83dbSDimitry Andric auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); 41235ffd83dbSDimitry Andric 41245ffd83dbSDimitry Andric unsigned NewOpcode = NumDefs == 0 ? 
41255ffd83dbSDimitry Andric AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; 41265ffd83dbSDimitry Andric 41275ffd83dbSDimitry Andric // Track that we legalized this 41285ffd83dbSDimitry Andric MI.setDesc(B.getTII().get(NewOpcode)); 41295ffd83dbSDimitry Andric 41305ffd83dbSDimitry Andric // Expecting to get an error flag since TFC is on - and dmask is 0 Force 41315ffd83dbSDimitry Andric // dmask to be at least 1 otherwise the instruction will fail 41325ffd83dbSDimitry Andric if (IsTFE && DMask == 0) { 41335ffd83dbSDimitry Andric DMask = 0x1; 41345ffd83dbSDimitry Andric DMaskLanes = 1; 4135*e8d8bef9SDimitry Andric MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); 41365ffd83dbSDimitry Andric } 41375ffd83dbSDimitry Andric 41385ffd83dbSDimitry Andric if (BaseOpcode->Atomic) { 41395ffd83dbSDimitry Andric Register VData0 = MI.getOperand(2).getReg(); 41405ffd83dbSDimitry Andric LLT Ty = MRI->getType(VData0); 41415ffd83dbSDimitry Andric 41425ffd83dbSDimitry Andric // TODO: Allow atomic swap and bit ops for v2s16/v4s16 41435ffd83dbSDimitry Andric if (Ty.isVector()) 41445ffd83dbSDimitry Andric return false; 41455ffd83dbSDimitry Andric 41465ffd83dbSDimitry Andric if (BaseOpcode->AtomicX2) { 41475ffd83dbSDimitry Andric Register VData1 = MI.getOperand(3).getReg(); 41485ffd83dbSDimitry Andric // The two values are packed in one register. 
41495ffd83dbSDimitry Andric LLT PackedTy = LLT::vector(2, Ty); 41505ffd83dbSDimitry Andric auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); 41515ffd83dbSDimitry Andric MI.getOperand(2).setReg(Concat.getReg(0)); 41525ffd83dbSDimitry Andric MI.getOperand(3).setReg(AMDGPU::NoRegister); 41535ffd83dbSDimitry Andric } 41545ffd83dbSDimitry Andric } 41555ffd83dbSDimitry Andric 4156*e8d8bef9SDimitry Andric unsigned CorrectedNumVAddrs = Intr->NumVAddrs; 41575ffd83dbSDimitry Andric 41585ffd83dbSDimitry Andric // Optimize _L to _LZ when _L is zero 41595ffd83dbSDimitry Andric if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = 4160*e8d8bef9SDimitry Andric AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) { 41615ffd83dbSDimitry Andric const ConstantFP *ConstantLod; 41625ffd83dbSDimitry Andric 4163*e8d8bef9SDimitry Andric if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI, 4164*e8d8bef9SDimitry Andric m_GFCst(ConstantLod))) { 41655ffd83dbSDimitry Andric if (ConstantLod->isZero() || ConstantLod->isNegative()) { 41665ffd83dbSDimitry Andric // Set new opcode to _lz variant of _l, and change the intrinsic ID. 4167*e8d8bef9SDimitry Andric const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr = 4168*e8d8bef9SDimitry Andric AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ, 4169*e8d8bef9SDimitry Andric Intr->Dim); 41705ffd83dbSDimitry Andric 41715ffd83dbSDimitry Andric // The starting indexes should remain in the same place. 
41725ffd83dbSDimitry Andric --CorrectedNumVAddrs; 41735ffd83dbSDimitry Andric 4174*e8d8bef9SDimitry Andric MI.getOperand(MI.getNumExplicitDefs()) 4175*e8d8bef9SDimitry Andric .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr)); 4176*e8d8bef9SDimitry Andric MI.RemoveOperand(ArgOffset + Intr->LodIndex); 4177*e8d8bef9SDimitry Andric Intr = NewImageDimIntr; 41785ffd83dbSDimitry Andric } 41795ffd83dbSDimitry Andric } 41805ffd83dbSDimitry Andric } 41815ffd83dbSDimitry Andric 41825ffd83dbSDimitry Andric // Optimize _mip away, when 'lod' is zero 4183*e8d8bef9SDimitry Andric if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) { 41845ffd83dbSDimitry Andric int64_t ConstantLod; 4185*e8d8bef9SDimitry Andric if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI, 4186*e8d8bef9SDimitry Andric m_ICst(ConstantLod))) { 41875ffd83dbSDimitry Andric if (ConstantLod == 0) { 41885ffd83dbSDimitry Andric // TODO: Change intrinsic opcode and remove operand instead or replacing 41895ffd83dbSDimitry Andric // it with 0, as the _L to _LZ handling is done above. 4190*e8d8bef9SDimitry Andric MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0); 41915ffd83dbSDimitry Andric --CorrectedNumVAddrs; 41925ffd83dbSDimitry Andric } 41935ffd83dbSDimitry Andric } 41945ffd83dbSDimitry Andric } 41955ffd83dbSDimitry Andric 41965ffd83dbSDimitry Andric // Rewrite the addressing register layout before doing anything else. 
41975ffd83dbSDimitry Andric if (IsA16 || IsG16) { 41985ffd83dbSDimitry Andric if (IsA16) { 41995ffd83dbSDimitry Andric // Target must support the feature and gradients need to be 16 bit too 42005ffd83dbSDimitry Andric if (!ST.hasA16() || !IsG16) 42015ffd83dbSDimitry Andric return false; 42025ffd83dbSDimitry Andric } else if (!ST.hasG16()) 42035ffd83dbSDimitry Andric return false; 42045ffd83dbSDimitry Andric 4205*e8d8bef9SDimitry Andric if (Intr->NumVAddrs > 1) { 42065ffd83dbSDimitry Andric SmallVector<Register, 4> PackedRegs; 42075ffd83dbSDimitry Andric // Don't compress addresses for G16 4208*e8d8bef9SDimitry Andric const int PackEndIdx = IsA16 ? Intr->VAddrEnd : Intr->CoordStart; 4209*e8d8bef9SDimitry Andric packImageA16AddressToDwords(B, MI, PackedRegs, ArgOffset, Intr, 4210*e8d8bef9SDimitry Andric PackEndIdx); 42115ffd83dbSDimitry Andric 42125ffd83dbSDimitry Andric if (!IsA16) { 42135ffd83dbSDimitry Andric // Add uncompressed address 4214*e8d8bef9SDimitry Andric for (unsigned I = Intr->CoordStart; I < Intr->VAddrEnd; I++) { 4215*e8d8bef9SDimitry Andric int AddrReg = MI.getOperand(ArgOffset + I).getReg(); 42165ffd83dbSDimitry Andric assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32)); 42175ffd83dbSDimitry Andric PackedRegs.push_back(AddrReg); 42185ffd83dbSDimitry Andric } 42195ffd83dbSDimitry Andric } 42205ffd83dbSDimitry Andric 42215ffd83dbSDimitry Andric // See also below in the non-a16 branch 42225ffd83dbSDimitry Andric const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); 42235ffd83dbSDimitry Andric 42245ffd83dbSDimitry Andric if (!UseNSA && PackedRegs.size() > 1) { 42255ffd83dbSDimitry Andric LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); 42265ffd83dbSDimitry Andric auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); 42275ffd83dbSDimitry Andric PackedRegs[0] = Concat.getReg(0); 42285ffd83dbSDimitry Andric PackedRegs.resize(1); 42295ffd83dbSDimitry Andric } 42305ffd83dbSDimitry Andric 4231*e8d8bef9SDimitry Andric const 
unsigned NumPacked = PackedRegs.size(); 4232*e8d8bef9SDimitry Andric for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { 4233*e8d8bef9SDimitry Andric MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); 42345ffd83dbSDimitry Andric if (!SrcOp.isReg()) { 42355ffd83dbSDimitry Andric assert(SrcOp.isImm() && SrcOp.getImm() == 0); 42365ffd83dbSDimitry Andric continue; 42375ffd83dbSDimitry Andric } 42385ffd83dbSDimitry Andric 42395ffd83dbSDimitry Andric assert(SrcOp.getReg() != AMDGPU::NoRegister); 42405ffd83dbSDimitry Andric 4241*e8d8bef9SDimitry Andric if (I - Intr->VAddrStart < NumPacked) 4242*e8d8bef9SDimitry Andric SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); 42435ffd83dbSDimitry Andric else 42445ffd83dbSDimitry Andric SrcOp.setReg(AMDGPU::NoRegister); 42455ffd83dbSDimitry Andric } 42465ffd83dbSDimitry Andric } 42475ffd83dbSDimitry Andric } else { 42485ffd83dbSDimitry Andric // If the register allocator cannot place the address registers contiguously 42495ffd83dbSDimitry Andric // without introducing moves, then using the non-sequential address encoding 42505ffd83dbSDimitry Andric // is always preferable, since it saves VALU instructions and is usually a 42515ffd83dbSDimitry Andric // wash in terms of code size or even better. 42525ffd83dbSDimitry Andric // 42535ffd83dbSDimitry Andric // However, we currently have no way of hinting to the register allocator 42545ffd83dbSDimitry Andric // that MIMG addresses should be placed contiguously when it is possible to 42555ffd83dbSDimitry Andric // do so, so force non-NSA for the common 2-address case as a heuristic. 42565ffd83dbSDimitry Andric // 42575ffd83dbSDimitry Andric // SIShrinkInstructions will convert NSA encodings to non-NSA after register 42585ffd83dbSDimitry Andric // allocation when possible. 
42595ffd83dbSDimitry Andric const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); 42605ffd83dbSDimitry Andric 4261*e8d8bef9SDimitry Andric if (!UseNSA && Intr->NumVAddrs > 1) 4262*e8d8bef9SDimitry Andric convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, 4263*e8d8bef9SDimitry Andric Intr->NumVAddrs); 42645ffd83dbSDimitry Andric } 42655ffd83dbSDimitry Andric 42665ffd83dbSDimitry Andric int Flags = 0; 42675ffd83dbSDimitry Andric if (IsA16) 42685ffd83dbSDimitry Andric Flags |= 1; 42695ffd83dbSDimitry Andric if (IsG16) 42705ffd83dbSDimitry Andric Flags |= 2; 42715ffd83dbSDimitry Andric MI.addOperand(MachineOperand::CreateImm(Flags)); 42725ffd83dbSDimitry Andric 42735ffd83dbSDimitry Andric if (BaseOpcode->Store) { // No TFE for stores? 42745ffd83dbSDimitry Andric // TODO: Handle dmask trim 42755ffd83dbSDimitry Andric Register VData = MI.getOperand(1).getReg(); 42765ffd83dbSDimitry Andric LLT Ty = MRI->getType(VData); 42775ffd83dbSDimitry Andric if (!Ty.isVector() || Ty.getElementType() != S16) 42785ffd83dbSDimitry Andric return true; 42795ffd83dbSDimitry Andric 4280*e8d8bef9SDimitry Andric Register RepackedReg = handleD16VData(B, *MRI, VData, true); 42815ffd83dbSDimitry Andric if (RepackedReg != VData) { 42825ffd83dbSDimitry Andric MI.getOperand(1).setReg(RepackedReg); 42835ffd83dbSDimitry Andric } 42845ffd83dbSDimitry Andric 42855ffd83dbSDimitry Andric return true; 42865ffd83dbSDimitry Andric } 42875ffd83dbSDimitry Andric 42885ffd83dbSDimitry Andric Register DstReg = MI.getOperand(0).getReg(); 42895ffd83dbSDimitry Andric LLT Ty = MRI->getType(DstReg); 42905ffd83dbSDimitry Andric const LLT EltTy = Ty.getScalarType(); 42915ffd83dbSDimitry Andric const bool IsD16 = Ty.getScalarType() == S16; 42925ffd83dbSDimitry Andric const int NumElts = Ty.isVector() ? 
Ty.getNumElements() : 1; 42935ffd83dbSDimitry Andric 42945ffd83dbSDimitry Andric // Confirm that the return type is large enough for the dmask specified 42955ffd83dbSDimitry Andric if (NumElts < DMaskLanes) 42965ffd83dbSDimitry Andric return false; 42975ffd83dbSDimitry Andric 42985ffd83dbSDimitry Andric if (NumElts > 4 || DMaskLanes > 4) 42995ffd83dbSDimitry Andric return false; 43005ffd83dbSDimitry Andric 43015ffd83dbSDimitry Andric const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; 43025ffd83dbSDimitry Andric const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts); 43035ffd83dbSDimitry Andric 43045ffd83dbSDimitry Andric // The raw dword aligned data component of the load. The only legal cases 43055ffd83dbSDimitry Andric // where this matters should be when using the packed D16 format, for 43065ffd83dbSDimitry Andric // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, 43075ffd83dbSDimitry Andric LLT RoundedTy; 43085ffd83dbSDimitry Andric 43095ffd83dbSDimitry Andric // S32 vector to to cover all data, plus TFE result element. 43105ffd83dbSDimitry Andric LLT TFETy; 43115ffd83dbSDimitry Andric 43125ffd83dbSDimitry Andric // Register type to use for each loaded component. Will be S32 or V2S16. 
43135ffd83dbSDimitry Andric LLT RegTy; 43145ffd83dbSDimitry Andric 43155ffd83dbSDimitry Andric if (IsD16 && ST.hasUnpackedD16VMem()) { 43165ffd83dbSDimitry Andric RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32); 43175ffd83dbSDimitry Andric TFETy = LLT::vector(AdjustedNumElts + 1, 32); 43185ffd83dbSDimitry Andric RegTy = S32; 43195ffd83dbSDimitry Andric } else { 43205ffd83dbSDimitry Andric unsigned EltSize = EltTy.getSizeInBits(); 43215ffd83dbSDimitry Andric unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; 43225ffd83dbSDimitry Andric unsigned RoundedSize = 32 * RoundedElts; 43235ffd83dbSDimitry Andric RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize); 43245ffd83dbSDimitry Andric TFETy = LLT::vector(RoundedSize / 32 + 1, S32); 43255ffd83dbSDimitry Andric RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; 43265ffd83dbSDimitry Andric } 43275ffd83dbSDimitry Andric 43285ffd83dbSDimitry Andric // The return type does not need adjustment. 43295ffd83dbSDimitry Andric // TODO: Should we change s16 case to s32 or <2 x s16>? 43305ffd83dbSDimitry Andric if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) 43315ffd83dbSDimitry Andric return true; 43325ffd83dbSDimitry Andric 43335ffd83dbSDimitry Andric Register Dst1Reg; 43345ffd83dbSDimitry Andric 43355ffd83dbSDimitry Andric // Insert after the instruction. 43365ffd83dbSDimitry Andric B.setInsertPt(*MI.getParent(), ++MI.getIterator()); 43375ffd83dbSDimitry Andric 43385ffd83dbSDimitry Andric // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x 43395ffd83dbSDimitry Andric // s16> instead of s32, we would only need 1 bitcast instead of multiple. 43405ffd83dbSDimitry Andric const LLT LoadResultTy = IsTFE ? 
TFETy : RoundedTy; 43415ffd83dbSDimitry Andric const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; 43425ffd83dbSDimitry Andric 43435ffd83dbSDimitry Andric Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); 43445ffd83dbSDimitry Andric 43455ffd83dbSDimitry Andric MI.getOperand(0).setReg(NewResultReg); 43465ffd83dbSDimitry Andric 43475ffd83dbSDimitry Andric // In the IR, TFE is supposed to be used with a 2 element struct return 43485ffd83dbSDimitry Andric // type. The intruction really returns these two values in one contiguous 43495ffd83dbSDimitry Andric // register, with one additional dword beyond the loaded data. Rewrite the 43505ffd83dbSDimitry Andric // return type to use a single register result. 43515ffd83dbSDimitry Andric 43525ffd83dbSDimitry Andric if (IsTFE) { 43535ffd83dbSDimitry Andric Dst1Reg = MI.getOperand(1).getReg(); 43545ffd83dbSDimitry Andric if (MRI->getType(Dst1Reg) != S32) 43555ffd83dbSDimitry Andric return false; 43565ffd83dbSDimitry Andric 43575ffd83dbSDimitry Andric // TODO: Make sure the TFE operand bit is set. 43585ffd83dbSDimitry Andric MI.RemoveOperand(1); 43595ffd83dbSDimitry Andric 43605ffd83dbSDimitry Andric // Handle the easy case that requires no repack instructions. 43615ffd83dbSDimitry Andric if (Ty == S32) { 43625ffd83dbSDimitry Andric B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); 43635ffd83dbSDimitry Andric return true; 43645ffd83dbSDimitry Andric } 43655ffd83dbSDimitry Andric } 43665ffd83dbSDimitry Andric 43675ffd83dbSDimitry Andric // Now figure out how to copy the new result register back into the old 43685ffd83dbSDimitry Andric // result. 43695ffd83dbSDimitry Andric SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); 43705ffd83dbSDimitry Andric 43715ffd83dbSDimitry Andric const int NumDataRegs = IsTFE ? 
ResultNumRegs - 1 : ResultNumRegs; 43725ffd83dbSDimitry Andric 43735ffd83dbSDimitry Andric if (ResultNumRegs == 1) { 43745ffd83dbSDimitry Andric assert(!IsTFE); 43755ffd83dbSDimitry Andric ResultRegs[0] = NewResultReg; 43765ffd83dbSDimitry Andric } else { 43775ffd83dbSDimitry Andric // We have to repack into a new vector of some kind. 43785ffd83dbSDimitry Andric for (int I = 0; I != NumDataRegs; ++I) 43795ffd83dbSDimitry Andric ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); 43805ffd83dbSDimitry Andric B.buildUnmerge(ResultRegs, NewResultReg); 43815ffd83dbSDimitry Andric 43825ffd83dbSDimitry Andric // Drop the final TFE element to get the data part. The TFE result is 43835ffd83dbSDimitry Andric // directly written to the right place already. 43845ffd83dbSDimitry Andric if (IsTFE) 43855ffd83dbSDimitry Andric ResultRegs.resize(NumDataRegs); 43865ffd83dbSDimitry Andric } 43875ffd83dbSDimitry Andric 43885ffd83dbSDimitry Andric // For an s16 scalar result, we form an s32 result with a truncate regardless 43895ffd83dbSDimitry Andric // of packed vs. unpacked. 43905ffd83dbSDimitry Andric if (IsD16 && !Ty.isVector()) { 43915ffd83dbSDimitry Andric B.buildTrunc(DstReg, ResultRegs[0]); 43925ffd83dbSDimitry Andric return true; 43935ffd83dbSDimitry Andric } 43945ffd83dbSDimitry Andric 43955ffd83dbSDimitry Andric // Avoid a build/concat_vector of 1 entry. 43965ffd83dbSDimitry Andric if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { 43975ffd83dbSDimitry Andric B.buildBitcast(DstReg, ResultRegs[0]); 43985ffd83dbSDimitry Andric return true; 43995ffd83dbSDimitry Andric } 44005ffd83dbSDimitry Andric 44015ffd83dbSDimitry Andric assert(Ty.isVector()); 44025ffd83dbSDimitry Andric 44035ffd83dbSDimitry Andric if (IsD16) { 44045ffd83dbSDimitry Andric // For packed D16 results with TFE enabled, all the data components are 44055ffd83dbSDimitry Andric // S32. Cast back to the expected type. 
44065ffd83dbSDimitry Andric // 44075ffd83dbSDimitry Andric // TODO: We don't really need to use load s32 elements. We would only need one 44085ffd83dbSDimitry Andric // cast for the TFE result if a multiple of v2s16 was used. 44095ffd83dbSDimitry Andric if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { 44105ffd83dbSDimitry Andric for (Register &Reg : ResultRegs) 44115ffd83dbSDimitry Andric Reg = B.buildBitcast(V2S16, Reg).getReg(0); 44125ffd83dbSDimitry Andric } else if (ST.hasUnpackedD16VMem()) { 44135ffd83dbSDimitry Andric for (Register &Reg : ResultRegs) 44145ffd83dbSDimitry Andric Reg = B.buildTrunc(S16, Reg).getReg(0); 44155ffd83dbSDimitry Andric } 44165ffd83dbSDimitry Andric } 44175ffd83dbSDimitry Andric 44185ffd83dbSDimitry Andric auto padWithUndef = [&](LLT Ty, int NumElts) { 44195ffd83dbSDimitry Andric if (NumElts == 0) 44205ffd83dbSDimitry Andric return; 44215ffd83dbSDimitry Andric Register Undef = B.buildUndef(Ty).getReg(0); 44225ffd83dbSDimitry Andric for (int I = 0; I != NumElts; ++I) 44235ffd83dbSDimitry Andric ResultRegs.push_back(Undef); 44245ffd83dbSDimitry Andric }; 44255ffd83dbSDimitry Andric 44265ffd83dbSDimitry Andric // Pad out any elements eliminated due to the dmask. 44275ffd83dbSDimitry Andric LLT ResTy = MRI->getType(ResultRegs[0]); 44285ffd83dbSDimitry Andric if (!ResTy.isVector()) { 44295ffd83dbSDimitry Andric padWithUndef(ResTy, NumElts - ResultRegs.size()); 44305ffd83dbSDimitry Andric B.buildBuildVector(DstReg, ResultRegs); 44315ffd83dbSDimitry Andric return true; 44325ffd83dbSDimitry Andric } 44335ffd83dbSDimitry Andric 44345ffd83dbSDimitry Andric assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); 44355ffd83dbSDimitry Andric const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; 44365ffd83dbSDimitry Andric 44375ffd83dbSDimitry Andric // Deal with the one annoying legal case. 
44385ffd83dbSDimitry Andric const LLT V3S16 = LLT::vector(3, 16); 44395ffd83dbSDimitry Andric if (Ty == V3S16) { 44405ffd83dbSDimitry Andric padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); 44415ffd83dbSDimitry Andric auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); 44425ffd83dbSDimitry Andric B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); 44435ffd83dbSDimitry Andric return true; 44445ffd83dbSDimitry Andric } 44455ffd83dbSDimitry Andric 44465ffd83dbSDimitry Andric padWithUndef(ResTy, RegsToCover - ResultRegs.size()); 44475ffd83dbSDimitry Andric B.buildConcatVectors(DstReg, ResultRegs); 44485ffd83dbSDimitry Andric return true; 44495ffd83dbSDimitry Andric } 44505ffd83dbSDimitry Andric 44515ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeSBufferLoad( 4452*e8d8bef9SDimitry Andric LegalizerHelper &Helper, MachineInstr &MI) const { 4453*e8d8bef9SDimitry Andric MachineIRBuilder &B = Helper.MIRBuilder; 4454*e8d8bef9SDimitry Andric GISelChangeObserver &Observer = Helper.Observer; 4455*e8d8bef9SDimitry Andric 44565ffd83dbSDimitry Andric Register Dst = MI.getOperand(0).getReg(); 44575ffd83dbSDimitry Andric LLT Ty = B.getMRI()->getType(Dst); 44585ffd83dbSDimitry Andric unsigned Size = Ty.getSizeInBits(); 44595ffd83dbSDimitry Andric MachineFunction &MF = B.getMF(); 44605ffd83dbSDimitry Andric 44615ffd83dbSDimitry Andric Observer.changingInstr(MI); 44625ffd83dbSDimitry Andric 4463*e8d8bef9SDimitry Andric if (shouldBitcastLoadStoreType(ST, Ty, Size)) { 4464*e8d8bef9SDimitry Andric Ty = getBitcastRegisterType(Ty); 4465*e8d8bef9SDimitry Andric Helper.bitcastDst(MI, Ty, 0); 4466*e8d8bef9SDimitry Andric Dst = MI.getOperand(0).getReg(); 4467*e8d8bef9SDimitry Andric B.setInsertPt(B.getMBB(), MI); 4468*e8d8bef9SDimitry Andric } 4469*e8d8bef9SDimitry Andric 44705ffd83dbSDimitry Andric // FIXME: We don't really need this intermediate instruction. 
The intrinsic 44715ffd83dbSDimitry Andric // should be fixed to have a memory operand. Since it's readnone, we're not 44725ffd83dbSDimitry Andric // allowed to add one. 44735ffd83dbSDimitry Andric MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); 44745ffd83dbSDimitry Andric MI.RemoveOperand(1); // Remove intrinsic ID 44755ffd83dbSDimitry Andric 44765ffd83dbSDimitry Andric // FIXME: When intrinsic definition is fixed, this should have an MMO already. 44775ffd83dbSDimitry Andric // TODO: Should this use datalayout alignment? 44785ffd83dbSDimitry Andric const unsigned MemSize = (Size + 7) / 8; 44795ffd83dbSDimitry Andric const Align MemAlign(4); 44805ffd83dbSDimitry Andric MachineMemOperand *MMO = MF.getMachineMemOperand( 44815ffd83dbSDimitry Andric MachinePointerInfo(), 44825ffd83dbSDimitry Andric MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 44835ffd83dbSDimitry Andric MachineMemOperand::MOInvariant, 44845ffd83dbSDimitry Andric MemSize, MemAlign); 44855ffd83dbSDimitry Andric MI.addMemOperand(MF, MMO); 44865ffd83dbSDimitry Andric 44875ffd83dbSDimitry Andric // There are no 96-bit result scalar loads, but widening to 128-bit should 44885ffd83dbSDimitry Andric // always be legal. We may need to restore this to a 96-bit result if it turns 44895ffd83dbSDimitry Andric // out this needs to be converted to a vector load during RegBankSelect. 
44905ffd83dbSDimitry Andric if (!isPowerOf2_32(Size)) { 44915ffd83dbSDimitry Andric if (Ty.isVector()) 44925ffd83dbSDimitry Andric Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); 44935ffd83dbSDimitry Andric else 44945ffd83dbSDimitry Andric Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); 44955ffd83dbSDimitry Andric } 44965ffd83dbSDimitry Andric 44975ffd83dbSDimitry Andric Observer.changedInstr(MI); 44985ffd83dbSDimitry Andric return true; 44995ffd83dbSDimitry Andric } 45005ffd83dbSDimitry Andric 4501*e8d8bef9SDimitry Andric // TODO: Move to selection 45025ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, 45030b57cec5SDimitry Andric MachineRegisterInfo &MRI, 45040b57cec5SDimitry Andric MachineIRBuilder &B) const { 45055ffd83dbSDimitry Andric // Is non-HSA path or trap-handler disabled? then, insert s_endpgm instruction 45065ffd83dbSDimitry Andric if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 45075ffd83dbSDimitry Andric !ST.isTrapHandlerEnabled()) { 45085ffd83dbSDimitry Andric B.buildInstr(AMDGPU::S_ENDPGM).addImm(0); 45095ffd83dbSDimitry Andric } else { 45105ffd83dbSDimitry Andric // Pass queue pointer to trap handler as input, and insert trap instruction 45115ffd83dbSDimitry Andric // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 45125ffd83dbSDimitry Andric MachineRegisterInfo &MRI = *B.getMRI(); 4513*e8d8bef9SDimitry Andric 4514*e8d8bef9SDimitry Andric Register LiveIn = 4515*e8d8bef9SDimitry Andric MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 4516*e8d8bef9SDimitry Andric if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) 45175ffd83dbSDimitry Andric return false; 4518*e8d8bef9SDimitry Andric 4519*e8d8bef9SDimitry Andric Register SGPR01(AMDGPU::SGPR0_SGPR1); 45205ffd83dbSDimitry Andric B.buildCopy(SGPR01, LiveIn); 45215ffd83dbSDimitry Andric B.buildInstr(AMDGPU::S_TRAP) 45225ffd83dbSDimitry Andric 
.addImm(GCNSubtarget::TrapIDLLVMTrap) 45235ffd83dbSDimitry Andric .addReg(SGPR01, RegState::Implicit); 45245ffd83dbSDimitry Andric } 45255ffd83dbSDimitry Andric 45265ffd83dbSDimitry Andric MI.eraseFromParent(); 45275ffd83dbSDimitry Andric return true; 45285ffd83dbSDimitry Andric } 45295ffd83dbSDimitry Andric 45305ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( 45315ffd83dbSDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 45325ffd83dbSDimitry Andric // Is non-HSA path or trap-handler disabled? then, report a warning 45335ffd83dbSDimitry Andric // accordingly 45345ffd83dbSDimitry Andric if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa || 45355ffd83dbSDimitry Andric !ST.isTrapHandlerEnabled()) { 45365ffd83dbSDimitry Andric DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(), 45375ffd83dbSDimitry Andric "debugtrap handler not supported", 45385ffd83dbSDimitry Andric MI.getDebugLoc(), DS_Warning); 45395ffd83dbSDimitry Andric LLVMContext &Ctx = B.getMF().getFunction().getContext(); 45405ffd83dbSDimitry Andric Ctx.diagnose(NoTrap); 45415ffd83dbSDimitry Andric } else { 45425ffd83dbSDimitry Andric // Insert debug-trap instruction 45435ffd83dbSDimitry Andric B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap); 45445ffd83dbSDimitry Andric } 45455ffd83dbSDimitry Andric 45465ffd83dbSDimitry Andric MI.eraseFromParent(); 45475ffd83dbSDimitry Andric return true; 45485ffd83dbSDimitry Andric } 45495ffd83dbSDimitry Andric 4550*e8d8bef9SDimitry Andric bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, 4551*e8d8bef9SDimitry Andric MachineIRBuilder &B) const { 4552*e8d8bef9SDimitry Andric MachineRegisterInfo &MRI = *B.getMRI(); 4553*e8d8bef9SDimitry Andric const LLT S16 = LLT::scalar(16); 4554*e8d8bef9SDimitry Andric const LLT S32 = LLT::scalar(32); 4555*e8d8bef9SDimitry Andric 4556*e8d8bef9SDimitry Andric Register DstReg = MI.getOperand(0).getReg(); 
4557*e8d8bef9SDimitry Andric Register NodePtr = MI.getOperand(2).getReg(); 4558*e8d8bef9SDimitry Andric Register RayExtent = MI.getOperand(3).getReg(); 4559*e8d8bef9SDimitry Andric Register RayOrigin = MI.getOperand(4).getReg(); 4560*e8d8bef9SDimitry Andric Register RayDir = MI.getOperand(5).getReg(); 4561*e8d8bef9SDimitry Andric Register RayInvDir = MI.getOperand(6).getReg(); 4562*e8d8bef9SDimitry Andric Register TDescr = MI.getOperand(7).getReg(); 4563*e8d8bef9SDimitry Andric 4564*e8d8bef9SDimitry Andric bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; 4565*e8d8bef9SDimitry Andric bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; 4566*e8d8bef9SDimitry Andric unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa 4567*e8d8bef9SDimitry Andric : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa 4568*e8d8bef9SDimitry Andric : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa 4569*e8d8bef9SDimitry Andric : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa; 4570*e8d8bef9SDimitry Andric 4571*e8d8bef9SDimitry Andric SmallVector<Register, 12> Ops; 4572*e8d8bef9SDimitry Andric if (Is64) { 4573*e8d8bef9SDimitry Andric auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); 4574*e8d8bef9SDimitry Andric Ops.push_back(Unmerge.getReg(0)); 4575*e8d8bef9SDimitry Andric Ops.push_back(Unmerge.getReg(1)); 4576*e8d8bef9SDimitry Andric } else { 4577*e8d8bef9SDimitry Andric Ops.push_back(NodePtr); 4578*e8d8bef9SDimitry Andric } 4579*e8d8bef9SDimitry Andric Ops.push_back(RayExtent); 4580*e8d8bef9SDimitry Andric 4581*e8d8bef9SDimitry Andric auto packLanes = [&Ops, &S32, &B] (Register Src) { 4582*e8d8bef9SDimitry Andric auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src); 4583*e8d8bef9SDimitry Andric Ops.push_back(Unmerge.getReg(0)); 4584*e8d8bef9SDimitry Andric Ops.push_back(Unmerge.getReg(1)); 4585*e8d8bef9SDimitry Andric Ops.push_back(Unmerge.getReg(2)); 4586*e8d8bef9SDimitry Andric }; 4587*e8d8bef9SDimitry Andric 4588*e8d8bef9SDimitry Andric packLanes(RayOrigin); 
4589*e8d8bef9SDimitry Andric if (IsA16) { 4590*e8d8bef9SDimitry Andric auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir); 4591*e8d8bef9SDimitry Andric auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir); 4592*e8d8bef9SDimitry Andric Register R1 = MRI.createGenericVirtualRegister(S32); 4593*e8d8bef9SDimitry Andric Register R2 = MRI.createGenericVirtualRegister(S32); 4594*e8d8bef9SDimitry Andric Register R3 = MRI.createGenericVirtualRegister(S32); 4595*e8d8bef9SDimitry Andric B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); 4596*e8d8bef9SDimitry Andric B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); 4597*e8d8bef9SDimitry Andric B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); 4598*e8d8bef9SDimitry Andric Ops.push_back(R1); 4599*e8d8bef9SDimitry Andric Ops.push_back(R2); 4600*e8d8bef9SDimitry Andric Ops.push_back(R3); 4601*e8d8bef9SDimitry Andric } else { 4602*e8d8bef9SDimitry Andric packLanes(RayDir); 4603*e8d8bef9SDimitry Andric packLanes(RayInvDir); 4604*e8d8bef9SDimitry Andric } 4605*e8d8bef9SDimitry Andric 4606*e8d8bef9SDimitry Andric auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) 4607*e8d8bef9SDimitry Andric .addDef(DstReg) 4608*e8d8bef9SDimitry Andric .addImm(Opcode); 4609*e8d8bef9SDimitry Andric 4610*e8d8bef9SDimitry Andric for (Register R : Ops) { 4611*e8d8bef9SDimitry Andric MIB.addUse(R); 4612*e8d8bef9SDimitry Andric } 4613*e8d8bef9SDimitry Andric 4614*e8d8bef9SDimitry Andric MIB.addUse(TDescr) 4615*e8d8bef9SDimitry Andric .addImm(IsA16 ? 
1 : 0) 4616*e8d8bef9SDimitry Andric .cloneMemRefs(MI); 4617*e8d8bef9SDimitry Andric 4618*e8d8bef9SDimitry Andric MI.eraseFromParent(); 4619*e8d8bef9SDimitry Andric return true; 4620*e8d8bef9SDimitry Andric } 4621*e8d8bef9SDimitry Andric 46225ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, 46235ffd83dbSDimitry Andric MachineInstr &MI) const { 46245ffd83dbSDimitry Andric MachineIRBuilder &B = Helper.MIRBuilder; 46255ffd83dbSDimitry Andric MachineRegisterInfo &MRI = *B.getMRI(); 46265ffd83dbSDimitry Andric 46270b57cec5SDimitry Andric // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 4628480093f4SDimitry Andric auto IntrID = MI.getIntrinsicID(); 4629480093f4SDimitry Andric switch (IntrID) { 4630480093f4SDimitry Andric case Intrinsic::amdgcn_if: 4631480093f4SDimitry Andric case Intrinsic::amdgcn_else: { 4632480093f4SDimitry Andric MachineInstr *Br = nullptr; 46335ffd83dbSDimitry Andric MachineBasicBlock *UncondBrTarget = nullptr; 4634*e8d8bef9SDimitry Andric bool Negated = false; 4635*e8d8bef9SDimitry Andric if (MachineInstr *BrCond = 4636*e8d8bef9SDimitry Andric verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 46370b57cec5SDimitry Andric const SIRegisterInfo *TRI 46380b57cec5SDimitry Andric = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 46390b57cec5SDimitry Andric 46400b57cec5SDimitry Andric Register Def = MI.getOperand(1).getReg(); 46410b57cec5SDimitry Andric Register Use = MI.getOperand(3).getReg(); 4642480093f4SDimitry Andric 46435ffd83dbSDimitry Andric MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 4644*e8d8bef9SDimitry Andric 4645*e8d8bef9SDimitry Andric if (Negated) 4646*e8d8bef9SDimitry Andric std::swap(CondBrTarget, UncondBrTarget); 4647*e8d8bef9SDimitry Andric 46485ffd83dbSDimitry Andric B.setInsertPt(B.getMBB(), BrCond->getIterator()); 4649480093f4SDimitry Andric if (IntrID == Intrinsic::amdgcn_if) { 46500b57cec5SDimitry Andric 
B.buildInstr(AMDGPU::SI_IF) 46510b57cec5SDimitry Andric .addDef(Def) 46520b57cec5SDimitry Andric .addUse(Use) 46535ffd83dbSDimitry Andric .addMBB(UncondBrTarget); 4654480093f4SDimitry Andric } else { 4655480093f4SDimitry Andric B.buildInstr(AMDGPU::SI_ELSE) 4656480093f4SDimitry Andric .addDef(Def) 4657480093f4SDimitry Andric .addUse(Use) 4658*e8d8bef9SDimitry Andric .addMBB(UncondBrTarget); 4659480093f4SDimitry Andric } 4660480093f4SDimitry Andric 46615ffd83dbSDimitry Andric if (Br) { 46625ffd83dbSDimitry Andric Br->getOperand(0).setMBB(CondBrTarget); 46635ffd83dbSDimitry Andric } else { 46645ffd83dbSDimitry Andric // The IRTranslator skips inserting the G_BR for fallthrough cases, but 46655ffd83dbSDimitry Andric // since we're swapping branch targets it needs to be reinserted. 46665ffd83dbSDimitry Andric // FIXME: IRTranslator should probably not do this 46675ffd83dbSDimitry Andric B.buildBr(*CondBrTarget); 46685ffd83dbSDimitry Andric } 46690b57cec5SDimitry Andric 46700b57cec5SDimitry Andric MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 46710b57cec5SDimitry Andric MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 46720b57cec5SDimitry Andric MI.eraseFromParent(); 46730b57cec5SDimitry Andric BrCond->eraseFromParent(); 46740b57cec5SDimitry Andric return true; 46750b57cec5SDimitry Andric } 46760b57cec5SDimitry Andric 46770b57cec5SDimitry Andric return false; 46780b57cec5SDimitry Andric } 46790b57cec5SDimitry Andric case Intrinsic::amdgcn_loop: { 4680480093f4SDimitry Andric MachineInstr *Br = nullptr; 46815ffd83dbSDimitry Andric MachineBasicBlock *UncondBrTarget = nullptr; 4682*e8d8bef9SDimitry Andric bool Negated = false; 4683*e8d8bef9SDimitry Andric if (MachineInstr *BrCond = 4684*e8d8bef9SDimitry Andric verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { 46850b57cec5SDimitry Andric const SIRegisterInfo *TRI 46860b57cec5SDimitry Andric = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 46870b57cec5SDimitry Andric 46885ffd83dbSDimitry 
Andric MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); 46890b57cec5SDimitry Andric Register Reg = MI.getOperand(2).getReg(); 46905ffd83dbSDimitry Andric 4691*e8d8bef9SDimitry Andric if (Negated) 4692*e8d8bef9SDimitry Andric std::swap(CondBrTarget, UncondBrTarget); 4693*e8d8bef9SDimitry Andric 46945ffd83dbSDimitry Andric B.setInsertPt(B.getMBB(), BrCond->getIterator()); 46950b57cec5SDimitry Andric B.buildInstr(AMDGPU::SI_LOOP) 46960b57cec5SDimitry Andric .addUse(Reg) 46975ffd83dbSDimitry Andric .addMBB(UncondBrTarget); 46985ffd83dbSDimitry Andric 46995ffd83dbSDimitry Andric if (Br) 47005ffd83dbSDimitry Andric Br->getOperand(0).setMBB(CondBrTarget); 47015ffd83dbSDimitry Andric else 47025ffd83dbSDimitry Andric B.buildBr(*CondBrTarget); 47035ffd83dbSDimitry Andric 47040b57cec5SDimitry Andric MI.eraseFromParent(); 47050b57cec5SDimitry Andric BrCond->eraseFromParent(); 47060b57cec5SDimitry Andric MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 47070b57cec5SDimitry Andric return true; 47080b57cec5SDimitry Andric } 47090b57cec5SDimitry Andric 47100b57cec5SDimitry Andric return false; 47110b57cec5SDimitry Andric } 47120b57cec5SDimitry Andric case Intrinsic::amdgcn_kernarg_segment_ptr: 47135ffd83dbSDimitry Andric if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { 47145ffd83dbSDimitry Andric // This only makes sense to call in a kernel, so just lower to null. 
47155ffd83dbSDimitry Andric B.buildConstant(MI.getOperand(0).getReg(), 0); 47165ffd83dbSDimitry Andric MI.eraseFromParent(); 47175ffd83dbSDimitry Andric return true; 47185ffd83dbSDimitry Andric } 47195ffd83dbSDimitry Andric 47200b57cec5SDimitry Andric return legalizePreloadedArgIntrin( 47210b57cec5SDimitry Andric MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 47220b57cec5SDimitry Andric case Intrinsic::amdgcn_implicitarg_ptr: 47230b57cec5SDimitry Andric return legalizeImplicitArgPtr(MI, MRI, B); 47240b57cec5SDimitry Andric case Intrinsic::amdgcn_workitem_id_x: 47250b57cec5SDimitry Andric return legalizePreloadedArgIntrin(MI, MRI, B, 47260b57cec5SDimitry Andric AMDGPUFunctionArgInfo::WORKITEM_ID_X); 47270b57cec5SDimitry Andric case Intrinsic::amdgcn_workitem_id_y: 47280b57cec5SDimitry Andric return legalizePreloadedArgIntrin(MI, MRI, B, 47290b57cec5SDimitry Andric AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 47300b57cec5SDimitry Andric case Intrinsic::amdgcn_workitem_id_z: 47310b57cec5SDimitry Andric return legalizePreloadedArgIntrin(MI, MRI, B, 47320b57cec5SDimitry Andric AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 47330b57cec5SDimitry Andric case Intrinsic::amdgcn_workgroup_id_x: 47340b57cec5SDimitry Andric return legalizePreloadedArgIntrin(MI, MRI, B, 47350b57cec5SDimitry Andric AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 47360b57cec5SDimitry Andric case Intrinsic::amdgcn_workgroup_id_y: 47370b57cec5SDimitry Andric return legalizePreloadedArgIntrin(MI, MRI, B, 47380b57cec5SDimitry Andric AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 47390b57cec5SDimitry Andric case Intrinsic::amdgcn_workgroup_id_z: 47400b57cec5SDimitry Andric return legalizePreloadedArgIntrin(MI, MRI, B, 47410b57cec5SDimitry Andric AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 47420b57cec5SDimitry Andric case Intrinsic::amdgcn_dispatch_ptr: 47430b57cec5SDimitry Andric return legalizePreloadedArgIntrin(MI, MRI, B, 47440b57cec5SDimitry Andric AMDGPUFunctionArgInfo::DISPATCH_PTR); 47450b57cec5SDimitry Andric case 
Intrinsic::amdgcn_queue_ptr: 47460b57cec5SDimitry Andric return legalizePreloadedArgIntrin(MI, MRI, B, 47470b57cec5SDimitry Andric AMDGPUFunctionArgInfo::QUEUE_PTR); 47480b57cec5SDimitry Andric case Intrinsic::amdgcn_implicit_buffer_ptr: 47490b57cec5SDimitry Andric return legalizePreloadedArgIntrin( 47500b57cec5SDimitry Andric MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 47510b57cec5SDimitry Andric case Intrinsic::amdgcn_dispatch_id: 47520b57cec5SDimitry Andric return legalizePreloadedArgIntrin(MI, MRI, B, 47530b57cec5SDimitry Andric AMDGPUFunctionArgInfo::DISPATCH_ID); 47548bcb0991SDimitry Andric case Intrinsic::amdgcn_fdiv_fast: 47558bcb0991SDimitry Andric return legalizeFDIVFastIntrin(MI, MRI, B); 47568bcb0991SDimitry Andric case Intrinsic::amdgcn_is_shared: 47578bcb0991SDimitry Andric return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 47588bcb0991SDimitry Andric case Intrinsic::amdgcn_is_private: 47598bcb0991SDimitry Andric return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 47608bcb0991SDimitry Andric case Intrinsic::amdgcn_wavefrontsize: { 47618bcb0991SDimitry Andric B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 47628bcb0991SDimitry Andric MI.eraseFromParent(); 47638bcb0991SDimitry Andric return true; 47648bcb0991SDimitry Andric } 47655ffd83dbSDimitry Andric case Intrinsic::amdgcn_s_buffer_load: 4766*e8d8bef9SDimitry Andric return legalizeSBufferLoad(Helper, MI); 47678bcb0991SDimitry Andric case Intrinsic::amdgcn_raw_buffer_store: 47685ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_store: 47695ffd83dbSDimitry Andric return legalizeBufferStore(MI, MRI, B, false, false); 47708bcb0991SDimitry Andric case Intrinsic::amdgcn_raw_buffer_store_format: 47715ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_store_format: 47725ffd83dbSDimitry Andric return legalizeBufferStore(MI, MRI, B, false, true); 47735ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_tbuffer_store: 47745ffd83dbSDimitry 
Andric case Intrinsic::amdgcn_struct_tbuffer_store: 47755ffd83dbSDimitry Andric return legalizeBufferStore(MI, MRI, B, true, true); 47765ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_load: 47775ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_load: 47785ffd83dbSDimitry Andric return legalizeBufferLoad(MI, MRI, B, false, false); 47795ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_load_format: 47805ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_load_format: 47815ffd83dbSDimitry Andric return legalizeBufferLoad(MI, MRI, B, true, false); 47825ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_tbuffer_load: 47835ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_tbuffer_load: 47845ffd83dbSDimitry Andric return legalizeBufferLoad(MI, MRI, B, true, true); 47855ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_swap: 47865ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_swap: 47875ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_add: 47885ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_add: 47895ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_sub: 47905ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_sub: 47915ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_smin: 47925ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_smin: 47935ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_umin: 47945ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_umin: 47955ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_smax: 47965ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_smax: 47975ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_umax: 47985ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_umax: 47995ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_and: 48005ffd83dbSDimitry Andric case 
Intrinsic::amdgcn_struct_buffer_atomic_and: 48015ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_or: 48025ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_or: 48035ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_xor: 48045ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_xor: 48055ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_inc: 48065ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_inc: 48075ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_dec: 48085ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_dec: 4809*e8d8bef9SDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_fadd: 4810*e8d8bef9SDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_fadd: 48115ffd83dbSDimitry Andric case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: 48125ffd83dbSDimitry Andric case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: 48135ffd83dbSDimitry Andric return legalizeBufferAtomic(MI, B, IntrID); 48145ffd83dbSDimitry Andric case Intrinsic::amdgcn_atomic_inc: 48155ffd83dbSDimitry Andric return legalizeAtomicIncDec(MI, B, true); 48165ffd83dbSDimitry Andric case Intrinsic::amdgcn_atomic_dec: 48175ffd83dbSDimitry Andric return legalizeAtomicIncDec(MI, B, false); 48185ffd83dbSDimitry Andric case Intrinsic::trap: 48195ffd83dbSDimitry Andric return legalizeTrapIntrinsic(MI, MRI, B); 48205ffd83dbSDimitry Andric case Intrinsic::debugtrap: 48215ffd83dbSDimitry Andric return legalizeDebugTrapIntrinsic(MI, MRI, B); 4822*e8d8bef9SDimitry Andric case Intrinsic::amdgcn_rsq_clamp: 4823*e8d8bef9SDimitry Andric return legalizeRsqClampIntrinsic(MI, MRI, B); 4824*e8d8bef9SDimitry Andric case Intrinsic::amdgcn_ds_fadd: 4825*e8d8bef9SDimitry Andric case Intrinsic::amdgcn_ds_fmin: 4826*e8d8bef9SDimitry Andric case Intrinsic::amdgcn_ds_fmax: 4827*e8d8bef9SDimitry Andric return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID); 4828*e8d8bef9SDimitry Andric case 
Intrinsic::amdgcn_image_bvh_intersect_ray: 4829*e8d8bef9SDimitry Andric return legalizeBVHIntrinsic(MI, B); 48305ffd83dbSDimitry Andric default: { 48315ffd83dbSDimitry Andric if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = 48325ffd83dbSDimitry Andric AMDGPU::getImageDimIntrinsicInfo(IntrID)) 48335ffd83dbSDimitry Andric return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr); 48340b57cec5SDimitry Andric return true; 48350b57cec5SDimitry Andric } 48365ffd83dbSDimitry Andric } 48370b57cec5SDimitry Andric 48380b57cec5SDimitry Andric return true; 48390b57cec5SDimitry Andric } 4840