10b57cec5SDimitry Andric //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==// 20b57cec5SDimitry Andric // 30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information. 50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 60b57cec5SDimitry Andric // 70b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 80b57cec5SDimitry Andric /// \file 90b57cec5SDimitry Andric /// This file implements the targeting of the Machinelegalizer class for 100b57cec5SDimitry Andric /// AMDGPU. 110b57cec5SDimitry Andric /// \todo This should be generated by TableGen. 120b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 130b57cec5SDimitry Andric 14*8bcb0991SDimitry Andric #if defined(_MSC_VER) || defined(__MINGW32__) 15*8bcb0991SDimitry Andric // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI 16*8bcb0991SDimitry Andric // from the Visual C++ cmath / math.h headers: 17*8bcb0991SDimitry Andric // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019 18*8bcb0991SDimitry Andric #define _USE_MATH_DEFINES 19*8bcb0991SDimitry Andric #endif 20*8bcb0991SDimitry Andric 210b57cec5SDimitry Andric #include "AMDGPU.h" 220b57cec5SDimitry Andric #include "AMDGPULegalizerInfo.h" 230b57cec5SDimitry Andric #include "AMDGPUTargetMachine.h" 240b57cec5SDimitry Andric #include "SIMachineFunctionInfo.h" 250b57cec5SDimitry Andric #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" 260b57cec5SDimitry Andric #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" 270b57cec5SDimitry Andric #include "llvm/CodeGen/TargetOpcodes.h" 280b57cec5SDimitry Andric #include "llvm/CodeGen/ValueTypes.h" 290b57cec5SDimitry Andric #include "llvm/IR/DerivedTypes.h" 30*8bcb0991SDimitry Andric #include "llvm/IR/DiagnosticInfo.h" 310b57cec5SDimitry Andric #include "llvm/IR/Type.h" 320b57cec5SDimitry Andric #include "llvm/Support/Debug.h" 330b57cec5SDimitry Andric 340b57cec5SDimitry Andric #define DEBUG_TYPE "amdgpu-legalinfo" 350b57cec5SDimitry Andric 360b57cec5SDimitry Andric using namespace llvm; 370b57cec5SDimitry Andric using namespace LegalizeActions; 380b57cec5SDimitry Andric using namespace LegalizeMutations; 390b57cec5SDimitry Andric using namespace LegalityPredicates; 400b57cec5SDimitry Andric 410b57cec5SDimitry Andric 420b57cec5SDimitry Andric static LegalityPredicate isMultiple32(unsigned TypeIdx, 43*8bcb0991SDimitry Andric unsigned MaxSize = 1024) { 440b57cec5SDimitry Andric return [=](const LegalityQuery &Query) { 450b57cec5SDimitry Andric const LLT Ty = Query.Types[TypeIdx]; 460b57cec5SDimitry Andric const LLT EltTy = Ty.getScalarType(); 470b57cec5SDimitry Andric return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0; 480b57cec5SDimitry Andric }; 490b57cec5SDimitry Andric } 500b57cec5SDimitry Andric 51*8bcb0991SDimitry Andric static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) { 52*8bcb0991SDimitry Andric return [=](const LegalityQuery &Query) { 53*8bcb0991SDimitry Andric return Query.Types[TypeIdx].getSizeInBits() == Size; 54*8bcb0991SDimitry Andric }; 55*8bcb0991SDimitry Andric } 56*8bcb0991SDimitry Andric 570b57cec5SDimitry Andric static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { 580b57cec5SDimitry Andric return [=](const LegalityQuery &Query) { 590b57cec5SDimitry Andric const LLT Ty = Query.Types[TypeIdx]; 600b57cec5SDimitry Andric return Ty.isVector() && 610b57cec5SDimitry Andric Ty.getNumElements() % 2 != 0 && 62*8bcb0991SDimitry Andric Ty.getElementType().getSizeInBits() < 32 && 63*8bcb0991SDimitry Andric Ty.getSizeInBits() % 32 != 0; 64*8bcb0991SDimitry Andric }; 65*8bcb0991SDimitry Andric } 66*8bcb0991SDimitry Andric 67*8bcb0991SDimitry Andric static LegalityPredicate isWideVec16(unsigned TypeIdx) { 68*8bcb0991SDimitry Andric return [=](const LegalityQuery &Query) { 69*8bcb0991SDimitry Andric const LLT Ty = Query.Types[TypeIdx]; 70*8bcb0991SDimitry Andric const LLT EltTy = Ty.getScalarType(); 71*8bcb0991SDimitry Andric return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2; 720b57cec5SDimitry Andric }; 730b57cec5SDimitry Andric } 740b57cec5SDimitry Andric 750b57cec5SDimitry Andric static LegalizeMutation oneMoreElement(unsigned TypeIdx) { 760b57cec5SDimitry Andric return [=](const LegalityQuery &Query) { 770b57cec5SDimitry Andric const LLT Ty = Query.Types[TypeIdx]; 780b57cec5SDimitry Andric const LLT EltTy = Ty.getElementType(); 790b57cec5SDimitry Andric return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy)); 800b57cec5SDimitry Andric }; 810b57cec5SDimitry Andric } 820b57cec5SDimitry Andric 830b57cec5SDimitry Andric static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) { 840b57cec5SDimitry Andric return [=](const LegalityQuery &Query) { 850b57cec5SDimitry Andric const LLT Ty = Query.Types[TypeIdx]; 860b57cec5SDimitry Andric const LLT EltTy = Ty.getElementType(); 870b57cec5SDimitry Andric unsigned Size = Ty.getSizeInBits(); 880b57cec5SDimitry Andric unsigned Pieces = (Size + 63) / 64; 890b57cec5SDimitry Andric unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces; 900b57cec5SDimitry Andric return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy)); 910b57cec5SDimitry Andric }; 920b57cec5SDimitry Andric } 930b57cec5SDimitry Andric 94*8bcb0991SDimitry Andric // Increase the number of vector elements to reach the next multiple of 32-bit 95*8bcb0991SDimitry Andric // type. 96*8bcb0991SDimitry Andric static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 97*8bcb0991SDimitry Andric return [=](const LegalityQuery &Query) { 98*8bcb0991SDimitry Andric const LLT Ty = Query.Types[TypeIdx]; 99*8bcb0991SDimitry Andric 100*8bcb0991SDimitry Andric const LLT EltTy = Ty.getElementType(); 101*8bcb0991SDimitry Andric const int Size = Ty.getSizeInBits(); 102*8bcb0991SDimitry Andric const int EltSize = EltTy.getSizeInBits(); 103*8bcb0991SDimitry Andric const int NextMul32 = (Size + 31) / 32; 104*8bcb0991SDimitry Andric 105*8bcb0991SDimitry Andric assert(EltSize < 32); 106*8bcb0991SDimitry Andric 107*8bcb0991SDimitry Andric const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 108*8bcb0991SDimitry Andric return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy)); 109*8bcb0991SDimitry Andric }; 110*8bcb0991SDimitry Andric } 111*8bcb0991SDimitry Andric 112*8bcb0991SDimitry Andric static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 113*8bcb0991SDimitry Andric return [=](const LegalityQuery &Query) { 114*8bcb0991SDimitry Andric const LLT QueryTy = Query.Types[TypeIdx]; 115*8bcb0991SDimitry Andric return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 116*8bcb0991SDimitry Andric }; 117*8bcb0991SDimitry Andric } 118*8bcb0991SDimitry Andric 1190b57cec5SDimitry Andric static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 1200b57cec5SDimitry Andric return [=](const LegalityQuery &Query) { 1210b57cec5SDimitry Andric const LLT QueryTy = Query.Types[TypeIdx]; 1220b57cec5SDimitry Andric return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; 1230b57cec5SDimitry Andric }; 1240b57cec5SDimitry Andric } 1250b57cec5SDimitry Andric 1260b57cec5SDimitry Andric static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { 1270b57cec5SDimitry Andric return [=](const LegalityQuery &Query) { 1280b57cec5SDimitry Andric const LLT QueryTy = Query.Types[TypeIdx]; 1290b57cec5SDimitry Andric return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0; 1300b57cec5SDimitry Andric }; 1310b57cec5SDimitry Andric } 1320b57cec5SDimitry Andric 133*8bcb0991SDimitry Andric // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of 1340b57cec5SDimitry Andric // v2s16. 1350b57cec5SDimitry Andric static LegalityPredicate isRegisterType(unsigned TypeIdx) { 1360b57cec5SDimitry Andric return [=](const LegalityQuery &Query) { 1370b57cec5SDimitry Andric const LLT Ty = Query.Types[TypeIdx]; 1380b57cec5SDimitry Andric if (Ty.isVector()) { 1390b57cec5SDimitry Andric const int EltSize = Ty.getElementType().getSizeInBits(); 1400b57cec5SDimitry Andric return EltSize == 32 || EltSize == 64 || 1410b57cec5SDimitry Andric (EltSize == 16 && Ty.getNumElements() % 2 == 0) || 1420b57cec5SDimitry Andric EltSize == 128 || EltSize == 256; 1430b57cec5SDimitry Andric } 1440b57cec5SDimitry Andric 145*8bcb0991SDimitry Andric return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024; 146*8bcb0991SDimitry Andric }; 147*8bcb0991SDimitry Andric } 148*8bcb0991SDimitry Andric 149*8bcb0991SDimitry Andric static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) { 150*8bcb0991SDimitry Andric return [=](const LegalityQuery &Query) { 151*8bcb0991SDimitry Andric return Query.Types[TypeIdx].getElementType() == Type; 152*8bcb0991SDimitry Andric }; 153*8bcb0991SDimitry Andric } 154*8bcb0991SDimitry Andric 155*8bcb0991SDimitry Andric static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) { 156*8bcb0991SDimitry Andric return [=](const LegalityQuery &Query) { 157*8bcb0991SDimitry Andric const LLT Ty = Query.Types[TypeIdx]; 158*8bcb0991SDimitry Andric return !Ty.isVector() && Ty.getSizeInBits() > 32 && 159*8bcb0991SDimitry Andric Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits(); 1600b57cec5SDimitry Andric }; 1610b57cec5SDimitry Andric } 1620b57cec5SDimitry Andric 1630b57cec5SDimitry Andric AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 1640b57cec5SDimitry Andric const GCNTargetMachine &TM) 1650b57cec5SDimitry Andric : ST(ST_) { 1660b57cec5SDimitry Andric using namespace TargetOpcode; 1670b57cec5SDimitry Andric 1680b57cec5SDimitry Andric auto GetAddrSpacePtr = [&TM](unsigned AS) { 1690b57cec5SDimitry Andric return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 1700b57cec5SDimitry Andric }; 1710b57cec5SDimitry Andric 1720b57cec5SDimitry Andric const LLT S1 = LLT::scalar(1); 1730b57cec5SDimitry Andric const LLT S8 = LLT::scalar(8); 1740b57cec5SDimitry Andric const LLT S16 = LLT::scalar(16); 1750b57cec5SDimitry Andric const LLT S32 = LLT::scalar(32); 1760b57cec5SDimitry Andric const LLT S64 = LLT::scalar(64); 177*8bcb0991SDimitry Andric const LLT S96 = LLT::scalar(96); 1780b57cec5SDimitry Andric const LLT S128 = LLT::scalar(128); 1790b57cec5SDimitry Andric const LLT S256 = LLT::scalar(256); 180*8bcb0991SDimitry Andric const LLT S1024 = LLT::scalar(1024); 1810b57cec5SDimitry Andric 1820b57cec5SDimitry Andric const LLT V2S16 = LLT::vector(2, 16); 1830b57cec5SDimitry Andric const LLT V4S16 = LLT::vector(4, 16); 1840b57cec5SDimitry Andric 1850b57cec5SDimitry Andric const LLT V2S32 = LLT::vector(2, 32); 1860b57cec5SDimitry Andric const LLT V3S32 = LLT::vector(3, 32); 1870b57cec5SDimitry Andric const LLT V4S32 = LLT::vector(4, 32); 1880b57cec5SDimitry Andric const LLT V5S32 = LLT::vector(5, 32); 1890b57cec5SDimitry Andric const LLT V6S32 = LLT::vector(6, 32); 1900b57cec5SDimitry Andric const LLT V7S32 = LLT::vector(7, 32); 1910b57cec5SDimitry Andric const LLT V8S32 = LLT::vector(8, 32); 1920b57cec5SDimitry Andric const LLT V9S32 = LLT::vector(9, 32); 1930b57cec5SDimitry Andric const LLT V10S32 = LLT::vector(10, 32); 1940b57cec5SDimitry Andric const LLT V11S32 = LLT::vector(11, 32); 1950b57cec5SDimitry Andric const LLT V12S32 = LLT::vector(12, 32); 1960b57cec5SDimitry Andric const LLT V13S32 = LLT::vector(13, 32); 1970b57cec5SDimitry Andric const LLT V14S32 = LLT::vector(14, 32); 1980b57cec5SDimitry Andric const LLT V15S32 = LLT::vector(15, 32); 1990b57cec5SDimitry Andric const LLT V16S32 = LLT::vector(16, 32); 200*8bcb0991SDimitry Andric const LLT V32S32 = LLT::vector(32, 32); 2010b57cec5SDimitry Andric 2020b57cec5SDimitry Andric const LLT V2S64 = LLT::vector(2, 64); 2030b57cec5SDimitry Andric const LLT V3S64 = LLT::vector(3, 64); 2040b57cec5SDimitry Andric const LLT V4S64 = LLT::vector(4, 64); 2050b57cec5SDimitry Andric const LLT V5S64 = LLT::vector(5, 64); 2060b57cec5SDimitry Andric const LLT V6S64 = LLT::vector(6, 64); 2070b57cec5SDimitry Andric const LLT V7S64 = LLT::vector(7, 64); 2080b57cec5SDimitry Andric const LLT V8S64 = LLT::vector(8, 64); 209*8bcb0991SDimitry Andric const LLT V16S64 = LLT::vector(16, 64); 2100b57cec5SDimitry Andric 2110b57cec5SDimitry Andric std::initializer_list<LLT> AllS32Vectors = 2120b57cec5SDimitry Andric {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 213*8bcb0991SDimitry Andric V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 2140b57cec5SDimitry Andric std::initializer_list<LLT> AllS64Vectors = 215*8bcb0991SDimitry Andric {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 2160b57cec5SDimitry Andric 2170b57cec5SDimitry Andric const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 2180b57cec5SDimitry Andric const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 219*8bcb0991SDimitry Andric const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 2200b57cec5SDimitry Andric const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 221*8bcb0991SDimitry Andric const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 2220b57cec5SDimitry Andric const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 2230b57cec5SDimitry Andric const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 2240b57cec5SDimitry Andric 2250b57cec5SDimitry Andric const LLT CodePtr = FlatPtr; 2260b57cec5SDimitry Andric 2270b57cec5SDimitry Andric const std::initializer_list<LLT> AddrSpaces64 = { 2280b57cec5SDimitry Andric GlobalPtr, ConstantPtr, FlatPtr 2290b57cec5SDimitry Andric }; 2300b57cec5SDimitry Andric 2310b57cec5SDimitry Andric const std::initializer_list<LLT> AddrSpaces32 = { 232*8bcb0991SDimitry Andric LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 2330b57cec5SDimitry Andric }; 2340b57cec5SDimitry Andric 2350b57cec5SDimitry Andric const std::initializer_list<LLT> FPTypesBase = { 2360b57cec5SDimitry Andric S32, S64 2370b57cec5SDimitry Andric }; 2380b57cec5SDimitry Andric 2390b57cec5SDimitry Andric const std::initializer_list<LLT> FPTypes16 = { 2400b57cec5SDimitry Andric S32, S64, S16 2410b57cec5SDimitry Andric }; 2420b57cec5SDimitry Andric 2430b57cec5SDimitry Andric const std::initializer_list<LLT> FPTypesPK16 = { 2440b57cec5SDimitry Andric S32, S64, S16, V2S16 2450b57cec5SDimitry Andric }; 2460b57cec5SDimitry Andric 2470b57cec5SDimitry Andric setAction({G_BRCOND, S1}, Legal); 2480b57cec5SDimitry Andric 2490b57cec5SDimitry Andric // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 2500b57cec5SDimitry Andric // elements for v3s16 2510b57cec5SDimitry Andric getActionDefinitionsBuilder(G_PHI) 2520b57cec5SDimitry Andric .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256}) 2530b57cec5SDimitry Andric .legalFor(AllS32Vectors) 2540b57cec5SDimitry Andric .legalFor(AllS64Vectors) 2550b57cec5SDimitry Andric .legalFor(AddrSpaces64) 2560b57cec5SDimitry Andric .legalFor(AddrSpaces32) 2570b57cec5SDimitry Andric .clampScalar(0, S32, S256) 2580b57cec5SDimitry Andric .widenScalarToNextPow2(0, 32) 2590b57cec5SDimitry Andric .clampMaxNumElements(0, S32, 16) 2600b57cec5SDimitry Andric .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 2610b57cec5SDimitry Andric .legalIf(isPointer(0)); 2620b57cec5SDimitry Andric 2630b57cec5SDimitry Andric if (ST.has16BitInsts()) { 2640b57cec5SDimitry Andric getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 2650b57cec5SDimitry Andric .legalFor({S32, S16}) 2660b57cec5SDimitry Andric .clampScalar(0, S16, S32) 2670b57cec5SDimitry Andric .scalarize(0); 2680b57cec5SDimitry Andric } else { 2690b57cec5SDimitry Andric getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 2700b57cec5SDimitry Andric .legalFor({S32}) 2710b57cec5SDimitry Andric .clampScalar(0, S32, S32) 2720b57cec5SDimitry Andric .scalarize(0); 2730b57cec5SDimitry Andric } 2740b57cec5SDimitry Andric 2750b57cec5SDimitry Andric getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 2760b57cec5SDimitry Andric .legalFor({S32}) 2770b57cec5SDimitry Andric .clampScalar(0, S32, S32) 2780b57cec5SDimitry Andric .scalarize(0); 2790b57cec5SDimitry Andric 2800b57cec5SDimitry Andric // Report legal for any types we can handle anywhere. For the cases only legal 2810b57cec5SDimitry Andric // on the SALU, RegBankSelect will be able to re-legalize. 2820b57cec5SDimitry Andric getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 2830b57cec5SDimitry Andric .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 2840b57cec5SDimitry Andric .clampScalar(0, S32, S64) 2850b57cec5SDimitry Andric .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 286*8bcb0991SDimitry Andric .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 2870b57cec5SDimitry Andric .widenScalarToNextPow2(0) 2880b57cec5SDimitry Andric .scalarize(0); 2890b57cec5SDimitry Andric 290*8bcb0991SDimitry Andric getActionDefinitionsBuilder({G_UADDO, G_USUBO, 2910b57cec5SDimitry Andric G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 2920b57cec5SDimitry Andric .legalFor({{S32, S1}}) 293*8bcb0991SDimitry Andric .clampScalar(0, S32, S32) 294*8bcb0991SDimitry Andric .scalarize(0); // TODO: Implement. 295*8bcb0991SDimitry Andric 296*8bcb0991SDimitry Andric getActionDefinitionsBuilder({G_SADDO, G_SSUBO}) 297*8bcb0991SDimitry Andric .lower(); 2980b57cec5SDimitry Andric 2990b57cec5SDimitry Andric getActionDefinitionsBuilder(G_BITCAST) 3000b57cec5SDimitry Andric // Don't worry about the size constraint. 301*8bcb0991SDimitry Andric .legalIf(all(isRegisterType(0), isRegisterType(1))) 302*8bcb0991SDimitry Andric // FIXME: Testing hack 303*8bcb0991SDimitry Andric .legalForCartesianProduct({S16, LLT::vector(2, 8), }); 3040b57cec5SDimitry Andric 3050b57cec5SDimitry Andric getActionDefinitionsBuilder(G_FCONSTANT) 3060b57cec5SDimitry Andric .legalFor({S32, S64, S16}) 3070b57cec5SDimitry Andric .clampScalar(0, S16, S64); 3080b57cec5SDimitry Andric 3090b57cec5SDimitry Andric getActionDefinitionsBuilder(G_IMPLICIT_DEF) 310*8bcb0991SDimitry Andric .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr, 3110b57cec5SDimitry Andric ConstantPtr, LocalPtr, FlatPtr, PrivatePtr}) 3120b57cec5SDimitry Andric .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 313*8bcb0991SDimitry Andric .clampScalarOrElt(0, S32, S1024) 3140b57cec5SDimitry Andric .legalIf(isMultiple32(0)) 3150b57cec5SDimitry Andric .widenScalarToNextPow2(0, 32) 3160b57cec5SDimitry Andric .clampMaxNumElements(0, S32, 16); 3170b57cec5SDimitry Andric 3180b57cec5SDimitry Andric 3190b57cec5SDimitry Andric // FIXME: i1 operands to intrinsics should always be legal, but other i1 3200b57cec5SDimitry Andric // values may not be legal. We need to figure out how to distinguish 3210b57cec5SDimitry Andric // between these two scenarios. 3220b57cec5SDimitry Andric getActionDefinitionsBuilder(G_CONSTANT) 323*8bcb0991SDimitry Andric .legalFor({S1, S32, S64, S16, GlobalPtr, 3240b57cec5SDimitry Andric LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 3250b57cec5SDimitry Andric .clampScalar(0, S32, S64) 3260b57cec5SDimitry Andric .widenScalarToNextPow2(0) 3270b57cec5SDimitry Andric .legalIf(isPointer(0)); 3280b57cec5SDimitry Andric 3290b57cec5SDimitry Andric setAction({G_FRAME_INDEX, PrivatePtr}, Legal); 330*8bcb0991SDimitry Andric getActionDefinitionsBuilder(G_GLOBAL_VALUE) 331*8bcb0991SDimitry Andric .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr}); 332*8bcb0991SDimitry Andric 3330b57cec5SDimitry Andric 3340b57cec5SDimitry Andric auto &FPOpActions = getActionDefinitionsBuilder( 335*8bcb0991SDimitry Andric { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 3360b57cec5SDimitry Andric .legalFor({S32, S64}); 337*8bcb0991SDimitry Andric auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 338*8bcb0991SDimitry Andric .customFor({S32, S64}); 339*8bcb0991SDimitry Andric auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 340*8bcb0991SDimitry Andric .customFor({S32, S64}); 3410b57cec5SDimitry Andric 3420b57cec5SDimitry Andric if (ST.has16BitInsts()) { 3430b57cec5SDimitry Andric if (ST.hasVOP3PInsts()) 3440b57cec5SDimitry Andric FPOpActions.legalFor({S16, V2S16}); 3450b57cec5SDimitry Andric else 3460b57cec5SDimitry Andric FPOpActions.legalFor({S16}); 347*8bcb0991SDimitry Andric 348*8bcb0991SDimitry Andric TrigActions.customFor({S16}); 349*8bcb0991SDimitry Andric FDIVActions.customFor({S16}); 3500b57cec5SDimitry Andric } 3510b57cec5SDimitry Andric 3520b57cec5SDimitry Andric auto &MinNumMaxNum = getActionDefinitionsBuilder({ 3530b57cec5SDimitry Andric G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 3540b57cec5SDimitry Andric 3550b57cec5SDimitry Andric if (ST.hasVOP3PInsts()) { 3560b57cec5SDimitry Andric MinNumMaxNum.customFor(FPTypesPK16) 3570b57cec5SDimitry Andric .clampMaxNumElements(0, S16, 2) 3580b57cec5SDimitry Andric .clampScalar(0, S16, S64) 3590b57cec5SDimitry Andric .scalarize(0); 3600b57cec5SDimitry Andric } else if (ST.has16BitInsts()) { 3610b57cec5SDimitry Andric MinNumMaxNum.customFor(FPTypes16) 3620b57cec5SDimitry Andric .clampScalar(0, S16, S64) 3630b57cec5SDimitry Andric .scalarize(0); 3640b57cec5SDimitry Andric } else { 3650b57cec5SDimitry Andric MinNumMaxNum.customFor(FPTypesBase) 3660b57cec5SDimitry Andric .clampScalar(0, S32, S64) 3670b57cec5SDimitry Andric .scalarize(0); 3680b57cec5SDimitry Andric } 3690b57cec5SDimitry Andric 3700b57cec5SDimitry Andric if (ST.hasVOP3PInsts()) 3710b57cec5SDimitry Andric FPOpActions.clampMaxNumElements(0, S16, 2); 372*8bcb0991SDimitry Andric 3730b57cec5SDimitry Andric FPOpActions 3740b57cec5SDimitry Andric .scalarize(0) 3750b57cec5SDimitry Andric .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 3760b57cec5SDimitry Andric 377*8bcb0991SDimitry Andric TrigActions 378*8bcb0991SDimitry Andric .scalarize(0) 379*8bcb0991SDimitry Andric .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 380*8bcb0991SDimitry Andric 381*8bcb0991SDimitry Andric FDIVActions 382*8bcb0991SDimitry Andric .scalarize(0) 383*8bcb0991SDimitry Andric .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 384*8bcb0991SDimitry Andric 385*8bcb0991SDimitry Andric getActionDefinitionsBuilder({G_FNEG, G_FABS}) 386*8bcb0991SDimitry Andric .legalFor(FPTypesPK16) 387*8bcb0991SDimitry Andric .clampMaxNumElements(0, S16, 2) 388*8bcb0991SDimitry Andric .scalarize(0) 389*8bcb0991SDimitry Andric .clampScalar(0, S16, S64); 390*8bcb0991SDimitry Andric 391*8bcb0991SDimitry Andric // TODO: Implement 392*8bcb0991SDimitry Andric getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower(); 393*8bcb0991SDimitry Andric 3940b57cec5SDimitry Andric if (ST.has16BitInsts()) { 395*8bcb0991SDimitry Andric getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 3960b57cec5SDimitry Andric .legalFor({S32, S64, S16}) 3970b57cec5SDimitry Andric .scalarize(0) 3980b57cec5SDimitry Andric .clampScalar(0, S16, S64); 3990b57cec5SDimitry Andric } else { 400*8bcb0991SDimitry Andric getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 4010b57cec5SDimitry Andric .legalFor({S32, S64}) 4020b57cec5SDimitry Andric .scalarize(0) 4030b57cec5SDimitry Andric .clampScalar(0, S32, S64); 4040b57cec5SDimitry Andric } 4050b57cec5SDimitry Andric 4060b57cec5SDimitry Andric getActionDefinitionsBuilder(G_FPTRUNC) 4070b57cec5SDimitry Andric .legalFor({{S32, S64}, {S16, S32}}) 4080b57cec5SDimitry Andric .scalarize(0); 4090b57cec5SDimitry Andric 4100b57cec5SDimitry Andric getActionDefinitionsBuilder(G_FPEXT) 4110b57cec5SDimitry Andric .legalFor({{S64, S32}, {S32, S16}}) 4120b57cec5SDimitry Andric .lowerFor({{S64, S16}}) // FIXME: Implement 4130b57cec5SDimitry Andric .scalarize(0); 4140b57cec5SDimitry Andric 4150b57cec5SDimitry Andric // TODO: Verify V_BFI_B32 is generated from expanded bit ops. 4160b57cec5SDimitry Andric getActionDefinitionsBuilder(G_FCOPYSIGN).lower(); 4170b57cec5SDimitry Andric 4180b57cec5SDimitry Andric getActionDefinitionsBuilder(G_FSUB) 4190b57cec5SDimitry Andric // Use actual fsub instruction 4200b57cec5SDimitry Andric .legalFor({S32}) 4210b57cec5SDimitry Andric // Must use fadd + fneg 4220b57cec5SDimitry Andric .lowerFor({S64, S16, V2S16}) 4230b57cec5SDimitry Andric .scalarize(0) 4240b57cec5SDimitry Andric .clampScalar(0, S32, S64); 4250b57cec5SDimitry Andric 426*8bcb0991SDimitry Andric // Whether this is legal depends on the floating point mode for the function. 427*8bcb0991SDimitry Andric auto &FMad = getActionDefinitionsBuilder(G_FMAD); 428*8bcb0991SDimitry Andric if (ST.hasMadF16()) 429*8bcb0991SDimitry Andric FMad.customFor({S32, S16}); 430*8bcb0991SDimitry Andric else 431*8bcb0991SDimitry Andric FMad.customFor({S32}); 432*8bcb0991SDimitry Andric FMad.scalarize(0) 433*8bcb0991SDimitry Andric .lower(); 434*8bcb0991SDimitry Andric 4350b57cec5SDimitry Andric getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 4360b57cec5SDimitry Andric .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 4370b57cec5SDimitry Andric {S32, S1}, {S64, S1}, {S16, S1}, 438*8bcb0991SDimitry Andric {S96, S32}, 4390b57cec5SDimitry Andric // FIXME: Hack 4400b57cec5SDimitry Andric {S64, LLT::scalar(33)}, 4410b57cec5SDimitry Andric {S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}}) 4420b57cec5SDimitry Andric .scalarize(0); 4430b57cec5SDimitry Andric 444*8bcb0991SDimitry Andric // TODO: Split s1->s64 during regbankselect for VALU. 445*8bcb0991SDimitry Andric auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 446*8bcb0991SDimitry Andric .legalFor({{S32, S32}, {S64, S32}, {S16, S32}, {S32, S1}, {S16, S1}, {S64, S1}}) 4470b57cec5SDimitry Andric .lowerFor({{S32, S64}}) 448*8bcb0991SDimitry Andric .customFor({{S64, S64}}); 449*8bcb0991SDimitry Andric if (ST.has16BitInsts()) 450*8bcb0991SDimitry Andric IToFP.legalFor({{S16, S16}}); 451*8bcb0991SDimitry Andric IToFP.clampScalar(1, S32, S64) 4520b57cec5SDimitry Andric .scalarize(0); 4530b57cec5SDimitry Andric 454*8bcb0991SDimitry Andric auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 455*8bcb0991SDimitry Andric .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}); 456*8bcb0991SDimitry Andric if (ST.has16BitInsts()) 457*8bcb0991SDimitry Andric FPToI.legalFor({{S16, S16}}); 458*8bcb0991SDimitry Andric else 459*8bcb0991SDimitry Andric FPToI.minScalar(1, S32); 460*8bcb0991SDimitry Andric 461*8bcb0991SDimitry Andric FPToI.minScalar(0, S32) 4620b57cec5SDimitry Andric .scalarize(0); 4630b57cec5SDimitry Andric 4640b57cec5SDimitry Andric getActionDefinitionsBuilder(G_INTRINSIC_ROUND) 4650b57cec5SDimitry Andric .legalFor({S32, S64}) 4660b57cec5SDimitry Andric .scalarize(0); 4670b57cec5SDimitry Andric 4680b57cec5SDimitry Andric if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 4690b57cec5SDimitry Andric getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 4700b57cec5SDimitry Andric .legalFor({S32, S64}) 4710b57cec5SDimitry Andric .clampScalar(0, S32, S64) 4720b57cec5SDimitry Andric .scalarize(0); 4730b57cec5SDimitry Andric } else { 4740b57cec5SDimitry Andric getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 4750b57cec5SDimitry Andric .legalFor({S32}) 4760b57cec5SDimitry Andric .customFor({S64}) 4770b57cec5SDimitry Andric .clampScalar(0, S32, S64) 4780b57cec5SDimitry Andric .scalarize(0); 4790b57cec5SDimitry Andric } 4800b57cec5SDimitry Andric 4810b57cec5SDimitry Andric getActionDefinitionsBuilder(G_GEP) 4820b57cec5SDimitry Andric .legalForCartesianProduct(AddrSpaces64, {S64}) 4830b57cec5SDimitry Andric .legalForCartesianProduct(AddrSpaces32, {S32}) 4840b57cec5SDimitry Andric .scalarize(0); 4850b57cec5SDimitry Andric 486*8bcb0991SDimitry Andric getActionDefinitionsBuilder(G_PTR_MASK) 487*8bcb0991SDimitry Andric .scalarize(0) 488*8bcb0991SDimitry Andric .alwaysLegal(); 489*8bcb0991SDimitry Andric 4900b57cec5SDimitry Andric setAction({G_BLOCK_ADDR, CodePtr}, Legal); 4910b57cec5SDimitry Andric 4920b57cec5SDimitry Andric auto &CmpBuilder = 4930b57cec5SDimitry Andric getActionDefinitionsBuilder(G_ICMP) 4940b57cec5SDimitry Andric .legalForCartesianProduct( 4950b57cec5SDimitry Andric {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 4960b57cec5SDimitry Andric .legalFor({{S1, S32}, {S1, S64}}); 4970b57cec5SDimitry Andric if (ST.has16BitInsts()) { 4980b57cec5SDimitry Andric CmpBuilder.legalFor({{S1, S16}}); 4990b57cec5SDimitry Andric } 5000b57cec5SDimitry Andric 5010b57cec5SDimitry Andric CmpBuilder 5020b57cec5SDimitry Andric .widenScalarToNextPow2(1) 5030b57cec5SDimitry Andric .clampScalar(1, S32, S64) 5040b57cec5SDimitry Andric .scalarize(0) 5050b57cec5SDimitry Andric .legalIf(all(typeIs(0, S1), isPointer(1))); 5060b57cec5SDimitry Andric 5070b57cec5SDimitry Andric getActionDefinitionsBuilder(G_FCMP) 5080b57cec5SDimitry Andric .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 5090b57cec5SDimitry Andric .widenScalarToNextPow2(1) 5100b57cec5SDimitry Andric .clampScalar(1, S32, S64) 5110b57cec5SDimitry Andric .scalarize(0); 5120b57cec5SDimitry Andric 5130b57cec5SDimitry Andric // FIXME: fexp, flog2, flog10 needs to be custom lowered. 5140b57cec5SDimitry Andric getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2, 5150b57cec5SDimitry Andric G_FLOG, G_FLOG2, G_FLOG10}) 5160b57cec5SDimitry Andric .legalFor({S32}) 5170b57cec5SDimitry Andric .scalarize(0); 5180b57cec5SDimitry Andric 5190b57cec5SDimitry Andric // The 64-bit versions produce 32-bit results, but only on the SALU. 5200b57cec5SDimitry Andric getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF, 5210b57cec5SDimitry Andric G_CTTZ, G_CTTZ_ZERO_UNDEF, 5220b57cec5SDimitry Andric G_CTPOP}) 5230b57cec5SDimitry Andric .legalFor({{S32, S32}, {S32, S64}}) 5240b57cec5SDimitry Andric .clampScalar(0, S32, S32) 5250b57cec5SDimitry Andric .clampScalar(1, S32, S64) 5260b57cec5SDimitry Andric .scalarize(0) 5270b57cec5SDimitry Andric .widenScalarToNextPow2(0, 32) 5280b57cec5SDimitry Andric .widenScalarToNextPow2(1, 32); 5290b57cec5SDimitry Andric 5300b57cec5SDimitry Andric // TODO: Expand for > s32 531*8bcb0991SDimitry Andric getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE}) 5320b57cec5SDimitry Andric .legalFor({S32}) 5330b57cec5SDimitry Andric .clampScalar(0, S32, S32) 5340b57cec5SDimitry Andric .scalarize(0); 5350b57cec5SDimitry Andric 5360b57cec5SDimitry Andric if (ST.has16BitInsts()) { 5370b57cec5SDimitry Andric if (ST.hasVOP3PInsts()) { 5380b57cec5SDimitry Andric getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 5390b57cec5SDimitry Andric .legalFor({S32, S16, V2S16}) 5400b57cec5SDimitry Andric .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 5410b57cec5SDimitry Andric .clampMaxNumElements(0, S16, 2) 5420b57cec5SDimitry Andric .clampScalar(0, S16, S32) 5430b57cec5SDimitry Andric .widenScalarToNextPow2(0) 5440b57cec5SDimitry Andric .scalarize(0); 5450b57cec5SDimitry Andric } else { 5460b57cec5SDimitry Andric getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 5470b57cec5SDimitry Andric .legalFor({S32, S16}) 5480b57cec5SDimitry Andric .widenScalarToNextPow2(0) 5490b57cec5SDimitry Andric .clampScalar(0, S16, S32) 5500b57cec5SDimitry Andric .scalarize(0); 5510b57cec5SDimitry Andric } 5520b57cec5SDimitry Andric } else { 5530b57cec5SDimitry Andric getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 5540b57cec5SDimitry Andric .legalFor({S32}) 5550b57cec5SDimitry Andric .clampScalar(0, S32, S32) 5560b57cec5SDimitry Andric .widenScalarToNextPow2(0) 5570b57cec5SDimitry Andric .scalarize(0); 5580b57cec5SDimitry Andric } 5590b57cec5SDimitry Andric 5600b57cec5SDimitry Andric auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) { 5610b57cec5SDimitry Andric return [=](const LegalityQuery &Query) { 5620b57cec5SDimitry Andric return Query.Types[TypeIdx0].getSizeInBits() < 5630b57cec5SDimitry Andric Query.Types[TypeIdx1].getSizeInBits(); 5640b57cec5SDimitry Andric }; 5650b57cec5SDimitry Andric }; 5660b57cec5SDimitry Andric 5670b57cec5SDimitry Andric auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) { 5680b57cec5SDimitry Andric return [=](const LegalityQuery &Query) { 5690b57cec5SDimitry Andric return Query.Types[TypeIdx0].getSizeInBits() > 5700b57cec5SDimitry Andric Query.Types[TypeIdx1].getSizeInBits(); 5710b57cec5SDimitry Andric }; 5720b57cec5SDimitry Andric }; 5730b57cec5SDimitry Andric 5740b57cec5SDimitry Andric getActionDefinitionsBuilder(G_INTTOPTR) 5750b57cec5SDimitry Andric // List the common cases 5760b57cec5SDimitry Andric .legalForCartesianProduct(AddrSpaces64, {S64}) 5770b57cec5SDimitry Andric .legalForCartesianProduct(AddrSpaces32, {S32}) 5780b57cec5SDimitry Andric .scalarize(0) 5790b57cec5SDimitry Andric // Accept any address space as long as the size matches 5800b57cec5SDimitry Andric .legalIf(sameSize(0, 1)) 5810b57cec5SDimitry Andric .widenScalarIf(smallerThan(1, 0), 5820b57cec5SDimitry Andric [](const LegalityQuery &Query) { 5830b57cec5SDimitry Andric return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 5840b57cec5SDimitry Andric }) 5850b57cec5SDimitry Andric .narrowScalarIf(greaterThan(1, 0), 5860b57cec5SDimitry Andric [](const LegalityQuery &Query) { 5870b57cec5SDimitry Andric return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 5880b57cec5SDimitry Andric }); 5890b57cec5SDimitry Andric 5900b57cec5SDimitry Andric getActionDefinitionsBuilder(G_PTRTOINT) 5910b57cec5SDimitry Andric // List the common cases 5920b57cec5SDimitry Andric .legalForCartesianProduct(AddrSpaces64, {S64}) 5930b57cec5SDimitry Andric .legalForCartesianProduct(AddrSpaces32, {S32}) 5940b57cec5SDimitry Andric .scalarize(0) 5950b57cec5SDimitry Andric // Accept any address space as long as the size matches 5960b57cec5SDimitry Andric .legalIf(sameSize(0, 1)) 5970b57cec5SDimitry Andric .widenScalarIf(smallerThan(0, 1), 5980b57cec5SDimitry Andric [](const LegalityQuery &Query) { 5990b57cec5SDimitry Andric return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 6000b57cec5SDimitry Andric }) 6010b57cec5SDimitry Andric .narrowScalarIf( 6020b57cec5SDimitry Andric greaterThan(0, 1), 6030b57cec5SDimitry Andric [](const LegalityQuery &Query) { 6040b57cec5SDimitry Andric return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 6050b57cec5SDimitry Andric }); 6060b57cec5SDimitry Andric 6070b57cec5SDimitry Andric getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 6080b57cec5SDimitry Andric .scalarize(0) 6090b57cec5SDimitry Andric .custom(); 6100b57cec5SDimitry Andric 6110b57cec5SDimitry Andric // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 6120b57cec5SDimitry Andric // handle some operations by just promoting the register during 6130b57cec5SDimitry Andric // selection. There are also d16 loads on GFX9+ which preserve the high bits. 614*8bcb0991SDimitry Andric auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned { 615*8bcb0991SDimitry Andric switch (AS) { 616*8bcb0991SDimitry Andric // FIXME: Private element size. 617*8bcb0991SDimitry Andric case AMDGPUAS::PRIVATE_ADDRESS: 618*8bcb0991SDimitry Andric return 32; 619*8bcb0991SDimitry Andric // FIXME: Check subtarget 620*8bcb0991SDimitry Andric case AMDGPUAS::LOCAL_ADDRESS: 621*8bcb0991SDimitry Andric return ST.useDS128() ? 128 : 64; 6220b57cec5SDimitry Andric 623*8bcb0991SDimitry Andric // Treat constant and global as identical. SMRD loads are sometimes usable 624*8bcb0991SDimitry Andric // for global loads (ideally constant address space should be eliminated) 625*8bcb0991SDimitry Andric // depending on the context. Legality cannot be context dependent, but 626*8bcb0991SDimitry Andric // RegBankSelect can split the load as necessary depending on the pointer 627*8bcb0991SDimitry Andric // register bank/uniformity and if the memory is invariant or not written in 628*8bcb0991SDimitry Andric // a kernel. 629*8bcb0991SDimitry Andric case AMDGPUAS::CONSTANT_ADDRESS: 630*8bcb0991SDimitry Andric case AMDGPUAS::GLOBAL_ADDRESS: 631*8bcb0991SDimitry Andric return 512; 632*8bcb0991SDimitry Andric default: 633*8bcb0991SDimitry Andric return 128; 634*8bcb0991SDimitry Andric } 635*8bcb0991SDimitry Andric }; 636*8bcb0991SDimitry Andric 637*8bcb0991SDimitry Andric const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool { 638*8bcb0991SDimitry Andric const LLT DstTy = Query.Types[0]; 639*8bcb0991SDimitry Andric 640*8bcb0991SDimitry Andric // Split vector extloads. 641*8bcb0991SDimitry Andric unsigned MemSize = Query.MMODescrs[0].SizeInBits; 642*8bcb0991SDimitry Andric if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 643*8bcb0991SDimitry Andric return true; 644*8bcb0991SDimitry Andric 645*8bcb0991SDimitry Andric const LLT PtrTy = Query.Types[1]; 646*8bcb0991SDimitry Andric unsigned AS = PtrTy.getAddressSpace(); 647*8bcb0991SDimitry Andric if (MemSize > maxSizeForAddrSpace(AS)) 648*8bcb0991SDimitry Andric return true; 649*8bcb0991SDimitry Andric 650*8bcb0991SDimitry Andric // Catch weird sized loads that don't evenly divide into the access sizes 651*8bcb0991SDimitry Andric // TODO: May be able to widen depending on alignment etc. 652*8bcb0991SDimitry Andric unsigned NumRegs = MemSize / 32; 653*8bcb0991SDimitry Andric if (NumRegs == 3 && !ST.hasDwordx3LoadStores()) 654*8bcb0991SDimitry Andric return true; 655*8bcb0991SDimitry Andric 656*8bcb0991SDimitry Andric unsigned Align = Query.MMODescrs[0].AlignInBits; 657*8bcb0991SDimitry Andric if (Align < MemSize) { 658*8bcb0991SDimitry Andric const SITargetLowering *TLI = ST.getTargetLowering(); 659*8bcb0991SDimitry Andric return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8); 660*8bcb0991SDimitry Andric } 661*8bcb0991SDimitry Andric 662*8bcb0991SDimitry Andric return false; 663*8bcb0991SDimitry Andric }; 664*8bcb0991SDimitry Andric 665*8bcb0991SDimitry Andric unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32; 666*8bcb0991SDimitry Andric unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16; 667*8bcb0991SDimitry Andric unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8; 668*8bcb0991SDimitry Andric 669*8bcb0991SDimitry Andric // TODO: Refine based on subtargets which support unaligned access or 128-bit 670*8bcb0991SDimitry Andric // LDS 671*8bcb0991SDimitry Andric // TODO: Unsupported flat for SI. 672*8bcb0991SDimitry Andric 673*8bcb0991SDimitry Andric for (unsigned Op : {G_LOAD, G_STORE}) { 674*8bcb0991SDimitry Andric const bool IsStore = Op == G_STORE; 675*8bcb0991SDimitry Andric 676*8bcb0991SDimitry Andric auto &Actions = getActionDefinitionsBuilder(Op); 677*8bcb0991SDimitry Andric // Whitelist the common cases. 678*8bcb0991SDimitry Andric // TODO: Pointer loads 679*8bcb0991SDimitry Andric // TODO: Wide constant loads 680*8bcb0991SDimitry Andric // TODO: Only CI+ has 3x loads 681*8bcb0991SDimitry Andric // TODO: Loads to s16 on gfx9 682*8bcb0991SDimitry Andric Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 683*8bcb0991SDimitry Andric {V2S32, GlobalPtr, 64, GlobalAlign32}, 684*8bcb0991SDimitry Andric {V3S32, GlobalPtr, 96, GlobalAlign32}, 685*8bcb0991SDimitry Andric {S96, GlobalPtr, 96, GlobalAlign32}, 686*8bcb0991SDimitry Andric {V4S32, GlobalPtr, 128, GlobalAlign32}, 687*8bcb0991SDimitry Andric {S128, GlobalPtr, 128, GlobalAlign32}, 688*8bcb0991SDimitry Andric {S64, GlobalPtr, 64, GlobalAlign32}, 689*8bcb0991SDimitry Andric {V2S64, GlobalPtr, 128, GlobalAlign32}, 690*8bcb0991SDimitry Andric {V2S16, GlobalPtr, 32, GlobalAlign32}, 691*8bcb0991SDimitry Andric {S32, GlobalPtr, 8, GlobalAlign8}, 692*8bcb0991SDimitry Andric {S32, GlobalPtr, 16, GlobalAlign16}, 693*8bcb0991SDimitry Andric 694*8bcb0991SDimitry Andric {S32, LocalPtr, 32, 32}, 695*8bcb0991SDimitry Andric {S64, LocalPtr, 64, 32}, 696*8bcb0991SDimitry Andric {V2S32, LocalPtr, 64, 32}, 697*8bcb0991SDimitry Andric {S32, LocalPtr, 8, 8}, 698*8bcb0991SDimitry Andric {S32, LocalPtr, 16, 16}, 699*8bcb0991SDimitry Andric {V2S16, LocalPtr, 32, 32}, 700*8bcb0991SDimitry Andric 701*8bcb0991SDimitry Andric {S32, PrivatePtr, 32, 32}, 702*8bcb0991SDimitry Andric {S32, PrivatePtr, 8, 8}, 703*8bcb0991SDimitry Andric {S32, PrivatePtr, 16, 16}, 704*8bcb0991SDimitry Andric {V2S16, PrivatePtr, 32, 32}, 705*8bcb0991SDimitry Andric 706*8bcb0991SDimitry Andric {S32, FlatPtr, 32, GlobalAlign32}, 707*8bcb0991SDimitry Andric {S32, FlatPtr, 16, GlobalAlign16}, 708*8bcb0991SDimitry Andric {S32, FlatPtr, 8, GlobalAlign8}, 709*8bcb0991SDimitry Andric {V2S16, FlatPtr, 32, GlobalAlign32}, 710*8bcb0991SDimitry Andric 711*8bcb0991SDimitry Andric {S32, ConstantPtr, 32, GlobalAlign32}, 712*8bcb0991SDimitry Andric {V2S32, ConstantPtr, 64, GlobalAlign32}, 713*8bcb0991SDimitry Andric {V3S32, ConstantPtr, 96, GlobalAlign32}, 714*8bcb0991SDimitry Andric {V4S32, ConstantPtr, 128, GlobalAlign32}, 715*8bcb0991SDimitry Andric {S64, ConstantPtr, 64, GlobalAlign32}, 716*8bcb0991SDimitry Andric {S128, ConstantPtr, 128, GlobalAlign32}, 717*8bcb0991SDimitry Andric {V2S32, ConstantPtr, 32, GlobalAlign32}}); 718*8bcb0991SDimitry Andric Actions 719*8bcb0991SDimitry Andric .customIf(typeIs(1, Constant32Ptr)) 720*8bcb0991SDimitry Andric .narrowScalarIf( 721*8bcb0991SDimitry Andric [=](const LegalityQuery &Query) -> bool { 722*8bcb0991SDimitry Andric return !Query.Types[0].isVector() && needToSplitLoad(Query); 723*8bcb0991SDimitry Andric }, 724*8bcb0991SDimitry Andric [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 725*8bcb0991SDimitry Andric const LLT DstTy = Query.Types[0]; 726*8bcb0991SDimitry Andric const LLT PtrTy = Query.Types[1]; 727*8bcb0991SDimitry Andric 728*8bcb0991SDimitry Andric const unsigned DstSize = DstTy.getSizeInBits(); 729*8bcb0991SDimitry Andric unsigned MemSize = Query.MMODescrs[0].SizeInBits; 730*8bcb0991SDimitry Andric 731*8bcb0991SDimitry Andric // Split extloads. 732*8bcb0991SDimitry Andric if (DstSize > MemSize) 733*8bcb0991SDimitry Andric return std::make_pair(0, LLT::scalar(MemSize)); 734*8bcb0991SDimitry Andric 735*8bcb0991SDimitry Andric if (DstSize > 32 && (DstSize % 32 != 0)) { 736*8bcb0991SDimitry Andric // FIXME: Need a way to specify non-extload of larger size if 737*8bcb0991SDimitry Andric // suitably aligned. 738*8bcb0991SDimitry Andric return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 739*8bcb0991SDimitry Andric } 740*8bcb0991SDimitry Andric 741*8bcb0991SDimitry Andric unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace()); 742*8bcb0991SDimitry Andric if (MemSize > MaxSize) 743*8bcb0991SDimitry Andric return std::make_pair(0, LLT::scalar(MaxSize)); 744*8bcb0991SDimitry Andric 745*8bcb0991SDimitry Andric unsigned Align = Query.MMODescrs[0].AlignInBits; 746*8bcb0991SDimitry Andric return std::make_pair(0, LLT::scalar(Align)); 747*8bcb0991SDimitry Andric }) 748*8bcb0991SDimitry Andric .fewerElementsIf( 749*8bcb0991SDimitry Andric [=](const LegalityQuery &Query) -> bool { 750*8bcb0991SDimitry Andric return Query.Types[0].isVector() && needToSplitLoad(Query); 751*8bcb0991SDimitry Andric }, 752*8bcb0991SDimitry Andric [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 753*8bcb0991SDimitry Andric const LLT DstTy = Query.Types[0]; 754*8bcb0991SDimitry Andric const LLT PtrTy = Query.Types[1]; 755*8bcb0991SDimitry Andric 756*8bcb0991SDimitry Andric LLT EltTy = DstTy.getElementType(); 757*8bcb0991SDimitry Andric unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace()); 758*8bcb0991SDimitry Andric 759*8bcb0991SDimitry Andric // Split if it's too large for the address space. 760*8bcb0991SDimitry Andric if (Query.MMODescrs[0].SizeInBits > MaxSize) { 761*8bcb0991SDimitry Andric unsigned NumElts = DstTy.getNumElements(); 762*8bcb0991SDimitry Andric unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 763*8bcb0991SDimitry Andric 764*8bcb0991SDimitry Andric // FIXME: Refine when odd breakdowns handled 765*8bcb0991SDimitry Andric // The scalars will need to be re-legalized. 766*8bcb0991SDimitry Andric if (NumPieces == 1 || NumPieces >= NumElts || 767*8bcb0991SDimitry Andric NumElts % NumPieces != 0) 768*8bcb0991SDimitry Andric return std::make_pair(0, EltTy); 769*8bcb0991SDimitry Andric 770*8bcb0991SDimitry Andric return std::make_pair(0, 771*8bcb0991SDimitry Andric LLT::vector(NumElts / NumPieces, EltTy)); 772*8bcb0991SDimitry Andric } 773*8bcb0991SDimitry Andric 774*8bcb0991SDimitry Andric // Need to split because of alignment. 775*8bcb0991SDimitry Andric unsigned Align = Query.MMODescrs[0].AlignInBits; 776*8bcb0991SDimitry Andric unsigned EltSize = EltTy.getSizeInBits(); 777*8bcb0991SDimitry Andric if (EltSize > Align && 778*8bcb0991SDimitry Andric (EltSize / Align < DstTy.getNumElements())) { 779*8bcb0991SDimitry Andric return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 780*8bcb0991SDimitry Andric } 781*8bcb0991SDimitry Andric 782*8bcb0991SDimitry Andric // May need relegalization for the scalars. 783*8bcb0991SDimitry Andric return std::make_pair(0, EltTy); 784*8bcb0991SDimitry Andric }) 785*8bcb0991SDimitry Andric .minScalar(0, S32); 786*8bcb0991SDimitry Andric 787*8bcb0991SDimitry Andric if (IsStore) 788*8bcb0991SDimitry Andric Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); 789*8bcb0991SDimitry Andric 790*8bcb0991SDimitry Andric // TODO: Need a bitcast lower option? 791*8bcb0991SDimitry Andric Actions 792*8bcb0991SDimitry Andric .legalIf([=](const LegalityQuery &Query) { 793*8bcb0991SDimitry Andric const LLT Ty0 = Query.Types[0]; 7940b57cec5SDimitry Andric unsigned Size = Ty0.getSizeInBits(); 7950b57cec5SDimitry Andric unsigned MemSize = Query.MMODescrs[0].SizeInBits; 796*8bcb0991SDimitry Andric unsigned Align = Query.MMODescrs[0].AlignInBits; 797*8bcb0991SDimitry Andric 798*8bcb0991SDimitry Andric // No extending vector loads. 799*8bcb0991SDimitry Andric if (Size > MemSize && Ty0.isVector()) 8000b57cec5SDimitry Andric return false; 8010b57cec5SDimitry Andric 802*8bcb0991SDimitry Andric // FIXME: Widening store from alignment not valid. 803*8bcb0991SDimitry Andric if (MemSize < Size) 804*8bcb0991SDimitry Andric MemSize = std::max(MemSize, Align); 8050b57cec5SDimitry Andric 8060b57cec5SDimitry Andric switch (MemSize) { 8070b57cec5SDimitry Andric case 8: 8080b57cec5SDimitry Andric case 16: 8090b57cec5SDimitry Andric return Size == 32; 8100b57cec5SDimitry Andric case 32: 8110b57cec5SDimitry Andric case 64: 8120b57cec5SDimitry Andric case 128: 8130b57cec5SDimitry Andric return true; 8140b57cec5SDimitry Andric case 96: 8150b57cec5SDimitry Andric return ST.hasDwordx3LoadStores(); 8160b57cec5SDimitry Andric case 256: 8170b57cec5SDimitry Andric case 512: 818*8bcb0991SDimitry Andric return true; 8190b57cec5SDimitry Andric default: 8200b57cec5SDimitry Andric return false; 8210b57cec5SDimitry Andric } 8220b57cec5SDimitry Andric }) 823*8bcb0991SDimitry Andric .widenScalarToNextPow2(0) 824*8bcb0991SDimitry Andric // TODO: v3s32->v4s32 with alignment 825*8bcb0991SDimitry Andric .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)); 826*8bcb0991SDimitry Andric } 8270b57cec5SDimitry Andric 8280b57cec5SDimitry Andric auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 829*8bcb0991SDimitry Andric .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 830*8bcb0991SDimitry Andric {S32, GlobalPtr, 16, 2 * 8}, 8310b57cec5SDimitry Andric {S32, LocalPtr, 8, 8}, 832*8bcb0991SDimitry Andric {S32, LocalPtr, 16, 16}, 8330b57cec5SDimitry Andric {S32, PrivatePtr, 8, 8}, 834*8bcb0991SDimitry Andric {S32, PrivatePtr, 16, 16}, 835*8bcb0991SDimitry Andric {S32, ConstantPtr, 8, 8}, 836*8bcb0991SDimitry Andric {S32, ConstantPtr, 16, 2 * 8}}); 8370b57cec5SDimitry Andric if (ST.hasFlatAddressSpace()) { 838*8bcb0991SDimitry Andric ExtLoads.legalForTypesWithMemDesc( 839*8bcb0991SDimitry Andric {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 8400b57cec5SDimitry Andric } 8410b57cec5SDimitry Andric 8420b57cec5SDimitry Andric ExtLoads.clampScalar(0, S32, S32) 8430b57cec5SDimitry Andric .widenScalarToNextPow2(0) 8440b57cec5SDimitry Andric .unsupportedIfMemSizeNotPow2() 8450b57cec5SDimitry Andric .lower(); 8460b57cec5SDimitry Andric 8470b57cec5SDimitry Andric auto &Atomics = getActionDefinitionsBuilder( 8480b57cec5SDimitry Andric {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 8490b57cec5SDimitry Andric G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 8500b57cec5SDimitry Andric G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 8510b57cec5SDimitry Andric G_ATOMICRMW_UMIN, G_ATOMIC_CMPXCHG}) 8520b57cec5SDimitry Andric .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 8530b57cec5SDimitry Andric {S64, GlobalPtr}, {S64, LocalPtr}}); 8540b57cec5SDimitry Andric if (ST.hasFlatAddressSpace()) { 8550b57cec5SDimitry Andric Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 8560b57cec5SDimitry Andric } 8570b57cec5SDimitry Andric 858*8bcb0991SDimitry Andric getActionDefinitionsBuilder(G_ATOMICRMW_FADD) 859*8bcb0991SDimitry Andric .legalFor({{S32, LocalPtr}}); 860*8bcb0991SDimitry Andric 861*8bcb0991SDimitry Andric getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS) 862*8bcb0991SDimitry Andric .lower(); 863*8bcb0991SDimitry Andric 8640b57cec5SDimitry Andric // TODO: Pointer types, any 32-bit or 64-bit vector 8650b57cec5SDimitry Andric getActionDefinitionsBuilder(G_SELECT) 8660b57cec5SDimitry Andric .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, 8670b57cec5SDimitry Andric GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, 8680b57cec5SDimitry Andric LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1}) 8690b57cec5SDimitry Andric .clampScalar(0, S16, S64) 8700b57cec5SDimitry Andric .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 8710b57cec5SDimitry Andric .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 8720b57cec5SDimitry Andric .scalarize(1) 8730b57cec5SDimitry Andric .clampMaxNumElements(0, S32, 2) 8740b57cec5SDimitry Andric .clampMaxNumElements(0, LocalPtr, 2) 8750b57cec5SDimitry Andric .clampMaxNumElements(0, PrivatePtr, 2) 8760b57cec5SDimitry Andric .scalarize(0) 8770b57cec5SDimitry Andric .widenScalarToNextPow2(0) 8780b57cec5SDimitry Andric .legalIf(all(isPointer(0), typeIs(1, S1))); 8790b57cec5SDimitry Andric 8800b57cec5SDimitry Andric // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 8810b57cec5SDimitry Andric // be more flexible with the shift amount type. 8820b57cec5SDimitry Andric auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 8830b57cec5SDimitry Andric .legalFor({{S32, S32}, {S64, S32}}); 8840b57cec5SDimitry Andric if (ST.has16BitInsts()) { 8850b57cec5SDimitry Andric if (ST.hasVOP3PInsts()) { 8860b57cec5SDimitry Andric Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}}) 8870b57cec5SDimitry Andric .clampMaxNumElements(0, S16, 2); 8880b57cec5SDimitry Andric } else 8890b57cec5SDimitry Andric Shifts.legalFor({{S16, S32}, {S16, S16}}); 8900b57cec5SDimitry Andric 8910b57cec5SDimitry Andric Shifts.clampScalar(1, S16, S32); 8920b57cec5SDimitry Andric Shifts.clampScalar(0, S16, S64); 8930b57cec5SDimitry Andric Shifts.widenScalarToNextPow2(0, 16); 8940b57cec5SDimitry Andric } else { 8950b57cec5SDimitry Andric // Make sure we legalize the shift amount type first, as the general 8960b57cec5SDimitry Andric // expansion for the shifted type will produce much worse code if it hasn't 8970b57cec5SDimitry Andric // been truncated already. 8980b57cec5SDimitry Andric Shifts.clampScalar(1, S32, S32); 8990b57cec5SDimitry Andric Shifts.clampScalar(0, S32, S64); 9000b57cec5SDimitry Andric Shifts.widenScalarToNextPow2(0, 32); 9010b57cec5SDimitry Andric } 9020b57cec5SDimitry Andric Shifts.scalarize(0); 9030b57cec5SDimitry Andric 9040b57cec5SDimitry Andric for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 9050b57cec5SDimitry Andric unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 9060b57cec5SDimitry Andric unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 9070b57cec5SDimitry Andric unsigned IdxTypeIdx = 2; 9080b57cec5SDimitry Andric 9090b57cec5SDimitry Andric getActionDefinitionsBuilder(Op) 9100b57cec5SDimitry Andric .customIf([=](const LegalityQuery &Query) { 9110b57cec5SDimitry Andric const LLT EltTy = Query.Types[EltTypeIdx]; 9120b57cec5SDimitry Andric const LLT VecTy = Query.Types[VecTypeIdx]; 9130b57cec5SDimitry Andric const LLT IdxTy = Query.Types[IdxTypeIdx]; 9140b57cec5SDimitry Andric return (EltTy.getSizeInBits() == 16 || 9150b57cec5SDimitry Andric EltTy.getSizeInBits() % 32 == 0) && 9160b57cec5SDimitry Andric VecTy.getSizeInBits() % 32 == 0 && 917*8bcb0991SDimitry Andric VecTy.getSizeInBits() <= 1024 && 9180b57cec5SDimitry Andric IdxTy.getSizeInBits() == 32; 9190b57cec5SDimitry Andric }) 9200b57cec5SDimitry Andric .clampScalar(EltTypeIdx, S32, S64) 9210b57cec5SDimitry Andric .clampScalar(VecTypeIdx, S32, S64) 9220b57cec5SDimitry Andric .clampScalar(IdxTypeIdx, S32, S32); 9230b57cec5SDimitry Andric } 9240b57cec5SDimitry Andric 9250b57cec5SDimitry Andric getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 9260b57cec5SDimitry Andric .unsupportedIf([=](const LegalityQuery &Query) { 9270b57cec5SDimitry Andric const LLT &EltTy = Query.Types[1].getElementType(); 9280b57cec5SDimitry Andric return Query.Types[0] != EltTy; 9290b57cec5SDimitry Andric }); 9300b57cec5SDimitry Andric 9310b57cec5SDimitry Andric for (unsigned Op : {G_EXTRACT, G_INSERT}) { 9320b57cec5SDimitry Andric unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 9330b57cec5SDimitry Andric unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 9340b57cec5SDimitry Andric 9350b57cec5SDimitry Andric // FIXME: Doesn't handle extract of illegal sizes. 9360b57cec5SDimitry Andric getActionDefinitionsBuilder(Op) 937*8bcb0991SDimitry Andric .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 938*8bcb0991SDimitry Andric // FIXME: Multiples of 16 should not be legal. 9390b57cec5SDimitry Andric .legalIf([=](const LegalityQuery &Query) { 9400b57cec5SDimitry Andric const LLT BigTy = Query.Types[BigTyIdx]; 9410b57cec5SDimitry Andric const LLT LitTy = Query.Types[LitTyIdx]; 9420b57cec5SDimitry Andric return (BigTy.getSizeInBits() % 32 == 0) && 9430b57cec5SDimitry Andric (LitTy.getSizeInBits() % 16 == 0); 9440b57cec5SDimitry Andric }) 9450b57cec5SDimitry Andric .widenScalarIf( 9460b57cec5SDimitry Andric [=](const LegalityQuery &Query) { 9470b57cec5SDimitry Andric const LLT BigTy = Query.Types[BigTyIdx]; 9480b57cec5SDimitry Andric return (BigTy.getScalarSizeInBits() < 16); 9490b57cec5SDimitry Andric }, 9500b57cec5SDimitry Andric LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) 9510b57cec5SDimitry Andric .widenScalarIf( 9520b57cec5SDimitry Andric [=](const LegalityQuery &Query) { 9530b57cec5SDimitry Andric const LLT LitTy = Query.Types[LitTyIdx]; 9540b57cec5SDimitry Andric return (LitTy.getScalarSizeInBits() < 16); 9550b57cec5SDimitry Andric }, 9560b57cec5SDimitry Andric LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) 9570b57cec5SDimitry Andric .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 9580b57cec5SDimitry Andric .widenScalarToNextPow2(BigTyIdx, 32); 9590b57cec5SDimitry Andric 9600b57cec5SDimitry Andric } 9610b57cec5SDimitry Andric 962*8bcb0991SDimitry Andric auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) 9630b57cec5SDimitry Andric .legalForCartesianProduct(AllS32Vectors, {S32}) 9640b57cec5SDimitry Andric .legalForCartesianProduct(AllS64Vectors, {S64}) 965*8bcb0991SDimitry Andric .clampNumElements(0, V16S32, V32S32) 966*8bcb0991SDimitry Andric .clampNumElements(0, V2S64, V16S64) 967*8bcb0991SDimitry Andric .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); 968*8bcb0991SDimitry Andric 969*8bcb0991SDimitry Andric if (ST.hasScalarPackInsts()) 970*8bcb0991SDimitry Andric BuildVector.legalFor({V2S16, S32}); 971*8bcb0991SDimitry Andric 972*8bcb0991SDimitry Andric BuildVector 9730b57cec5SDimitry Andric .minScalarSameAs(1, 0) 9740b57cec5SDimitry Andric .legalIf(isRegisterType(0)) 9750b57cec5SDimitry Andric .minScalarOrElt(0, S32); 9760b57cec5SDimitry Andric 977*8bcb0991SDimitry Andric if (ST.hasScalarPackInsts()) { 978*8bcb0991SDimitry Andric getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 979*8bcb0991SDimitry Andric .legalFor({V2S16, S32}) 980*8bcb0991SDimitry Andric .lower(); 981*8bcb0991SDimitry Andric } else { 982*8bcb0991SDimitry Andric getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 983*8bcb0991SDimitry Andric .lower(); 984*8bcb0991SDimitry Andric } 985*8bcb0991SDimitry Andric 9860b57cec5SDimitry Andric getActionDefinitionsBuilder(G_CONCAT_VECTORS) 9870b57cec5SDimitry Andric .legalIf(isRegisterType(0)); 9880b57cec5SDimitry Andric 989*8bcb0991SDimitry Andric // TODO: Don't fully scalarize v2s16 pieces 990*8bcb0991SDimitry Andric getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 991*8bcb0991SDimitry Andric 9920b57cec5SDimitry Andric // Merge/Unmerge 9930b57cec5SDimitry Andric for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 9940b57cec5SDimitry Andric unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 9950b57cec5SDimitry Andric unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; 9960b57cec5SDimitry Andric 9970b57cec5SDimitry Andric auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 9980b57cec5SDimitry Andric const LLT &Ty = Query.Types[TypeIdx]; 9990b57cec5SDimitry Andric if (Ty.isVector()) { 10000b57cec5SDimitry Andric const LLT &EltTy = Ty.getElementType(); 10010b57cec5SDimitry Andric if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64) 10020b57cec5SDimitry Andric return true; 10030b57cec5SDimitry Andric if (!isPowerOf2_32(EltTy.getSizeInBits())) 10040b57cec5SDimitry Andric return true; 10050b57cec5SDimitry Andric } 10060b57cec5SDimitry Andric return false; 10070b57cec5SDimitry Andric }; 10080b57cec5SDimitry Andric 1009*8bcb0991SDimitry Andric auto &Builder = getActionDefinitionsBuilder(Op) 10100b57cec5SDimitry Andric .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 10110b57cec5SDimitry Andric // Clamp the little scalar to s8-s256 and make it a power of 2. It's not 10120b57cec5SDimitry Andric // worth considering the multiples of 64 since 2*192 and 2*384 are not 10130b57cec5SDimitry Andric // valid. 10140b57cec5SDimitry Andric .clampScalar(LitTyIdx, S16, S256) 10150b57cec5SDimitry Andric .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 1016*8bcb0991SDimitry Andric .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 1017*8bcb0991SDimitry Andric .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 1018*8bcb0991SDimitry Andric elementTypeIs(1, S16)), 1019*8bcb0991SDimitry Andric changeTo(1, V2S16)) 10200b57cec5SDimitry Andric // Break up vectors with weird elements into scalars 10210b57cec5SDimitry Andric .fewerElementsIf( 10220b57cec5SDimitry Andric [=](const LegalityQuery &Query) { return notValidElt(Query, 0); }, 10230b57cec5SDimitry Andric scalarize(0)) 10240b57cec5SDimitry Andric .fewerElementsIf( 10250b57cec5SDimitry Andric [=](const LegalityQuery &Query) { return notValidElt(Query, 1); }, 10260b57cec5SDimitry Andric scalarize(1)) 1027*8bcb0991SDimitry Andric .clampScalar(BigTyIdx, S32, S1024) 1028*8bcb0991SDimitry Andric .lowerFor({{S16, V2S16}}); 1029*8bcb0991SDimitry Andric 1030*8bcb0991SDimitry Andric if (Op == G_MERGE_VALUES) { 1031*8bcb0991SDimitry Andric Builder.widenScalarIf( 1032*8bcb0991SDimitry Andric // TODO: Use 16-bit shifts if legal for 8-bit values? 10330b57cec5SDimitry Andric [=](const LegalityQuery &Query) { 1034*8bcb0991SDimitry Andric const LLT Ty = Query.Types[LitTyIdx]; 1035*8bcb0991SDimitry Andric return Ty.getSizeInBits() < 32; 1036*8bcb0991SDimitry Andric }, 1037*8bcb0991SDimitry Andric changeTo(LitTyIdx, S32)); 1038*8bcb0991SDimitry Andric } 1039*8bcb0991SDimitry Andric 1040*8bcb0991SDimitry Andric Builder.widenScalarIf( 1041*8bcb0991SDimitry Andric [=](const LegalityQuery &Query) { 1042*8bcb0991SDimitry Andric const LLT Ty = Query.Types[BigTyIdx]; 10430b57cec5SDimitry Andric return !isPowerOf2_32(Ty.getSizeInBits()) && 10440b57cec5SDimitry Andric Ty.getSizeInBits() % 16 != 0; 10450b57cec5SDimitry Andric }, 10460b57cec5SDimitry Andric [=](const LegalityQuery &Query) { 10470b57cec5SDimitry Andric // Pick the next power of 2, or a multiple of 64 over 128. 10480b57cec5SDimitry Andric // Whichever is smaller. 10490b57cec5SDimitry Andric const LLT &Ty = Query.Types[BigTyIdx]; 10500b57cec5SDimitry Andric unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 10510b57cec5SDimitry Andric if (NewSizeInBits >= 256) { 10520b57cec5SDimitry Andric unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 10530b57cec5SDimitry Andric if (RoundedTo < NewSizeInBits) 10540b57cec5SDimitry Andric NewSizeInBits = RoundedTo; 10550b57cec5SDimitry Andric } 10560b57cec5SDimitry Andric return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 10570b57cec5SDimitry Andric }) 10580b57cec5SDimitry Andric .legalIf([=](const LegalityQuery &Query) { 10590b57cec5SDimitry Andric const LLT &BigTy = Query.Types[BigTyIdx]; 10600b57cec5SDimitry Andric const LLT &LitTy = Query.Types[LitTyIdx]; 10610b57cec5SDimitry Andric 10620b57cec5SDimitry Andric if (BigTy.isVector() && BigTy.getSizeInBits() < 32) 10630b57cec5SDimitry Andric return false; 10640b57cec5SDimitry Andric if (LitTy.isVector() && LitTy.getSizeInBits() < 32) 10650b57cec5SDimitry Andric return false; 10660b57cec5SDimitry Andric 10670b57cec5SDimitry Andric return BigTy.getSizeInBits() % 16 == 0 && 10680b57cec5SDimitry Andric LitTy.getSizeInBits() % 16 == 0 && 1069*8bcb0991SDimitry Andric BigTy.getSizeInBits() <= 1024; 10700b57cec5SDimitry Andric }) 10710b57cec5SDimitry Andric // Any vectors left are the wrong size. Scalarize them. 10720b57cec5SDimitry Andric .scalarize(0) 10730b57cec5SDimitry Andric .scalarize(1); 10740b57cec5SDimitry Andric } 10750b57cec5SDimitry Andric 1076*8bcb0991SDimitry Andric getActionDefinitionsBuilder(G_SEXT_INREG).lower(); 1077*8bcb0991SDimitry Andric 10780b57cec5SDimitry Andric computeTables(); 10790b57cec5SDimitry Andric verify(*ST.getInstrInfo()); 10800b57cec5SDimitry Andric } 10810b57cec5SDimitry Andric 10820b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI, 10830b57cec5SDimitry Andric MachineRegisterInfo &MRI, 1084*8bcb0991SDimitry Andric MachineIRBuilder &B, 10850b57cec5SDimitry Andric GISelChangeObserver &Observer) const { 10860b57cec5SDimitry Andric switch (MI.getOpcode()) { 10870b57cec5SDimitry Andric case TargetOpcode::G_ADDRSPACE_CAST: 1088*8bcb0991SDimitry Andric return legalizeAddrSpaceCast(MI, MRI, B); 10890b57cec5SDimitry Andric case TargetOpcode::G_FRINT: 1090*8bcb0991SDimitry Andric return legalizeFrint(MI, MRI, B); 10910b57cec5SDimitry Andric case TargetOpcode::G_FCEIL: 1092*8bcb0991SDimitry Andric return legalizeFceil(MI, MRI, B); 10930b57cec5SDimitry Andric case TargetOpcode::G_INTRINSIC_TRUNC: 1094*8bcb0991SDimitry Andric return legalizeIntrinsicTrunc(MI, MRI, B); 10950b57cec5SDimitry Andric case TargetOpcode::G_SITOFP: 1096*8bcb0991SDimitry Andric return legalizeITOFP(MI, MRI, B, true); 10970b57cec5SDimitry Andric case TargetOpcode::G_UITOFP: 1098*8bcb0991SDimitry Andric return legalizeITOFP(MI, MRI, B, false); 10990b57cec5SDimitry Andric case TargetOpcode::G_FMINNUM: 11000b57cec5SDimitry Andric case TargetOpcode::G_FMAXNUM: 11010b57cec5SDimitry Andric case TargetOpcode::G_FMINNUM_IEEE: 11020b57cec5SDimitry Andric case TargetOpcode::G_FMAXNUM_IEEE: 1103*8bcb0991SDimitry Andric return legalizeMinNumMaxNum(MI, MRI, B); 11040b57cec5SDimitry Andric case TargetOpcode::G_EXTRACT_VECTOR_ELT: 1105*8bcb0991SDimitry Andric return legalizeExtractVectorElt(MI, MRI, B); 11060b57cec5SDimitry Andric case TargetOpcode::G_INSERT_VECTOR_ELT: 1107*8bcb0991SDimitry Andric return legalizeInsertVectorElt(MI, MRI, B); 1108*8bcb0991SDimitry Andric case TargetOpcode::G_FSIN: 1109*8bcb0991SDimitry Andric case TargetOpcode::G_FCOS: 1110*8bcb0991SDimitry Andric return legalizeSinCos(MI, MRI, B); 1111*8bcb0991SDimitry Andric case TargetOpcode::G_GLOBAL_VALUE: 1112*8bcb0991SDimitry Andric return legalizeGlobalValue(MI, MRI, B); 1113*8bcb0991SDimitry Andric case TargetOpcode::G_LOAD: 1114*8bcb0991SDimitry Andric return legalizeLoad(MI, MRI, B, Observer); 1115*8bcb0991SDimitry Andric case TargetOpcode::G_FMAD: 1116*8bcb0991SDimitry Andric return legalizeFMad(MI, MRI, B); 1117*8bcb0991SDimitry Andric case TargetOpcode::G_FDIV: 1118*8bcb0991SDimitry Andric return legalizeFDIV(MI, MRI, B); 11190b57cec5SDimitry Andric default: 11200b57cec5SDimitry Andric return false; 11210b57cec5SDimitry Andric } 11220b57cec5SDimitry Andric 11230b57cec5SDimitry Andric llvm_unreachable("expected switch to return"); 11240b57cec5SDimitry Andric } 11250b57cec5SDimitry Andric 11260b57cec5SDimitry Andric Register AMDGPULegalizerInfo::getSegmentAperture( 11270b57cec5SDimitry Andric unsigned AS, 11280b57cec5SDimitry Andric MachineRegisterInfo &MRI, 1129*8bcb0991SDimitry Andric MachineIRBuilder &B) const { 1130*8bcb0991SDimitry Andric MachineFunction &MF = B.getMF(); 11310b57cec5SDimitry Andric const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 11320b57cec5SDimitry Andric const LLT S32 = LLT::scalar(32); 11330b57cec5SDimitry Andric 1134*8bcb0991SDimitry Andric assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 1135*8bcb0991SDimitry Andric 11360b57cec5SDimitry Andric if (ST.hasApertureRegs()) { 11370b57cec5SDimitry Andric // FIXME: Use inline constants (src_{shared, private}_base) instead of 11380b57cec5SDimitry Andric // getreg. 11390b57cec5SDimitry Andric unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 11400b57cec5SDimitry Andric AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 11410b57cec5SDimitry Andric AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 11420b57cec5SDimitry Andric unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 11430b57cec5SDimitry Andric AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 11440b57cec5SDimitry Andric AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 11450b57cec5SDimitry Andric unsigned Encoding = 11460b57cec5SDimitry Andric AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 11470b57cec5SDimitry Andric Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 11480b57cec5SDimitry Andric WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 11490b57cec5SDimitry Andric 11500b57cec5SDimitry Andric Register ApertureReg = MRI.createGenericVirtualRegister(S32); 11510b57cec5SDimitry Andric Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 11520b57cec5SDimitry Andric 1153*8bcb0991SDimitry Andric B.buildInstr(AMDGPU::S_GETREG_B32) 11540b57cec5SDimitry Andric .addDef(GetReg) 11550b57cec5SDimitry Andric .addImm(Encoding); 11560b57cec5SDimitry Andric MRI.setType(GetReg, S32); 11570b57cec5SDimitry Andric 1158*8bcb0991SDimitry Andric auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1159*8bcb0991SDimitry Andric B.buildInstr(TargetOpcode::G_SHL) 11600b57cec5SDimitry Andric .addDef(ApertureReg) 11610b57cec5SDimitry Andric .addUse(GetReg) 11620b57cec5SDimitry Andric .addUse(ShiftAmt.getReg(0)); 11630b57cec5SDimitry Andric 11640b57cec5SDimitry Andric return ApertureReg; 11650b57cec5SDimitry Andric } 11660b57cec5SDimitry Andric 11670b57cec5SDimitry Andric Register QueuePtr = MRI.createGenericVirtualRegister( 11680b57cec5SDimitry Andric LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 11690b57cec5SDimitry Andric 1170*8bcb0991SDimitry Andric const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1171*8bcb0991SDimitry Andric if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr)) 1172*8bcb0991SDimitry Andric return Register(); 11730b57cec5SDimitry Andric 11740b57cec5SDimitry Andric // Offset into amd_queue_t for group_segment_aperture_base_hi / 11750b57cec5SDimitry Andric // private_segment_aperture_base_hi. 11760b57cec5SDimitry Andric uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 11770b57cec5SDimitry Andric 11780b57cec5SDimitry Andric // FIXME: Don't use undef 11790b57cec5SDimitry Andric Value *V = UndefValue::get(PointerType::get( 11800b57cec5SDimitry Andric Type::getInt8Ty(MF.getFunction().getContext()), 11810b57cec5SDimitry Andric AMDGPUAS::CONSTANT_ADDRESS)); 11820b57cec5SDimitry Andric 11830b57cec5SDimitry Andric MachinePointerInfo PtrInfo(V, StructOffset); 11840b57cec5SDimitry Andric MachineMemOperand *MMO = MF.getMachineMemOperand( 11850b57cec5SDimitry Andric PtrInfo, 11860b57cec5SDimitry Andric MachineMemOperand::MOLoad | 11870b57cec5SDimitry Andric MachineMemOperand::MODereferenceable | 11880b57cec5SDimitry Andric MachineMemOperand::MOInvariant, 11890b57cec5SDimitry Andric 4, 11900b57cec5SDimitry Andric MinAlign(64, StructOffset)); 11910b57cec5SDimitry Andric 11920b57cec5SDimitry Andric Register LoadResult = MRI.createGenericVirtualRegister(S32); 11930b57cec5SDimitry Andric Register LoadAddr; 11940b57cec5SDimitry Andric 1195*8bcb0991SDimitry Andric B.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1196*8bcb0991SDimitry Andric B.buildLoad(LoadResult, LoadAddr, *MMO); 11970b57cec5SDimitry Andric return LoadResult; 11980b57cec5SDimitry Andric } 11990b57cec5SDimitry Andric 12000b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 12010b57cec5SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 1202*8bcb0991SDimitry Andric MachineIRBuilder &B) const { 1203*8bcb0991SDimitry Andric MachineFunction &MF = B.getMF(); 12040b57cec5SDimitry Andric 1205*8bcb0991SDimitry Andric B.setInstr(MI); 12060b57cec5SDimitry Andric 1207*8bcb0991SDimitry Andric const LLT S32 = LLT::scalar(32); 12080b57cec5SDimitry Andric Register Dst = MI.getOperand(0).getReg(); 12090b57cec5SDimitry Andric Register Src = MI.getOperand(1).getReg(); 12100b57cec5SDimitry Andric 12110b57cec5SDimitry Andric LLT DstTy = MRI.getType(Dst); 12120b57cec5SDimitry Andric LLT SrcTy = MRI.getType(Src); 12130b57cec5SDimitry Andric unsigned DestAS = DstTy.getAddressSpace(); 12140b57cec5SDimitry Andric unsigned SrcAS = SrcTy.getAddressSpace(); 12150b57cec5SDimitry Andric 12160b57cec5SDimitry Andric // TODO: Avoid reloading from the queue ptr for each cast, or at least each 12170b57cec5SDimitry Andric // vector element. 12180b57cec5SDimitry Andric assert(!DstTy.isVector()); 12190b57cec5SDimitry Andric 12200b57cec5SDimitry Andric const AMDGPUTargetMachine &TM 12210b57cec5SDimitry Andric = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 12220b57cec5SDimitry Andric 12230b57cec5SDimitry Andric const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 12240b57cec5SDimitry Andric if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) { 1225*8bcb0991SDimitry Andric MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 1226*8bcb0991SDimitry Andric return true; 1227*8bcb0991SDimitry Andric } 1228*8bcb0991SDimitry Andric 1229*8bcb0991SDimitry Andric if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1230*8bcb0991SDimitry Andric // Truncate. 1231*8bcb0991SDimitry Andric B.buildExtract(Dst, Src, 0); 1232*8bcb0991SDimitry Andric MI.eraseFromParent(); 1233*8bcb0991SDimitry Andric return true; 1234*8bcb0991SDimitry Andric } 1235*8bcb0991SDimitry Andric 1236*8bcb0991SDimitry Andric if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 1237*8bcb0991SDimitry Andric const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 1238*8bcb0991SDimitry Andric uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 1239*8bcb0991SDimitry Andric 1240*8bcb0991SDimitry Andric // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 1241*8bcb0991SDimitry Andric // another. Merge operands are required to be the same type, but creating an 1242*8bcb0991SDimitry Andric // extra ptrtoint would be kind of pointless. 1243*8bcb0991SDimitry Andric auto HighAddr = B.buildConstant( 1244*8bcb0991SDimitry Andric LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1245*8bcb0991SDimitry Andric B.buildMerge(Dst, {Src, HighAddr.getReg(0)}); 1246*8bcb0991SDimitry Andric MI.eraseFromParent(); 12470b57cec5SDimitry Andric return true; 12480b57cec5SDimitry Andric } 12490b57cec5SDimitry Andric 12500b57cec5SDimitry Andric if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 12510b57cec5SDimitry Andric assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 12520b57cec5SDimitry Andric DestAS == AMDGPUAS::PRIVATE_ADDRESS); 12530b57cec5SDimitry Andric unsigned NullVal = TM.getNullPointerValue(DestAS); 12540b57cec5SDimitry Andric 1255*8bcb0991SDimitry Andric auto SegmentNull = B.buildConstant(DstTy, NullVal); 1256*8bcb0991SDimitry Andric auto FlatNull = B.buildConstant(SrcTy, 0); 12570b57cec5SDimitry Andric 12580b57cec5SDimitry Andric Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy); 12590b57cec5SDimitry Andric 12600b57cec5SDimitry Andric // Extract low 32-bits of the pointer. 1261*8bcb0991SDimitry Andric B.buildExtract(PtrLo32, Src, 0); 12620b57cec5SDimitry Andric 12630b57cec5SDimitry Andric Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1)); 1264*8bcb0991SDimitry Andric B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0)); 1265*8bcb0991SDimitry Andric B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 12660b57cec5SDimitry Andric 12670b57cec5SDimitry Andric MI.eraseFromParent(); 12680b57cec5SDimitry Andric return true; 12690b57cec5SDimitry Andric } 12700b57cec5SDimitry Andric 1271*8bcb0991SDimitry Andric if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 1272*8bcb0991SDimitry Andric return false; 1273*8bcb0991SDimitry Andric 1274*8bcb0991SDimitry Andric if (!ST.hasFlatAddressSpace()) 1275*8bcb0991SDimitry Andric return false; 12760b57cec5SDimitry Andric 12770b57cec5SDimitry Andric auto SegmentNull = 1278*8bcb0991SDimitry Andric B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 12790b57cec5SDimitry Andric auto FlatNull = 1280*8bcb0991SDimitry Andric B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 12810b57cec5SDimitry Andric 1282*8bcb0991SDimitry Andric Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 1283*8bcb0991SDimitry Andric if (!ApertureReg.isValid()) 1284*8bcb0991SDimitry Andric return false; 12850b57cec5SDimitry Andric 12860b57cec5SDimitry Andric Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1)); 1287*8bcb0991SDimitry Andric B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0)); 12880b57cec5SDimitry Andric 12890b57cec5SDimitry Andric Register BuildPtr = MRI.createGenericVirtualRegister(DstTy); 12900b57cec5SDimitry Andric 12910b57cec5SDimitry Andric // Coerce the type of the low half of the result so we can use merge_values. 1292*8bcb0991SDimitry Andric Register SrcAsInt = MRI.createGenericVirtualRegister(S32); 1293*8bcb0991SDimitry Andric B.buildInstr(TargetOpcode::G_PTRTOINT) 12940b57cec5SDimitry Andric .addDef(SrcAsInt) 12950b57cec5SDimitry Andric .addUse(Src); 12960b57cec5SDimitry Andric 12970b57cec5SDimitry Andric // TODO: Should we allow mismatched types but matching sizes in merges to 12980b57cec5SDimitry Andric // avoid the ptrtoint? 1299*8bcb0991SDimitry Andric B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg}); 1300*8bcb0991SDimitry Andric B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0)); 13010b57cec5SDimitry Andric 13020b57cec5SDimitry Andric MI.eraseFromParent(); 13030b57cec5SDimitry Andric return true; 13040b57cec5SDimitry Andric } 13050b57cec5SDimitry Andric 13060b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeFrint( 13070b57cec5SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 1308*8bcb0991SDimitry Andric MachineIRBuilder &B) const { 1309*8bcb0991SDimitry Andric B.setInstr(MI); 13100b57cec5SDimitry Andric 13110b57cec5SDimitry Andric Register Src = MI.getOperand(1).getReg(); 13120b57cec5SDimitry Andric LLT Ty = MRI.getType(Src); 13130b57cec5SDimitry Andric assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 13140b57cec5SDimitry Andric 13150b57cec5SDimitry Andric APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 13160b57cec5SDimitry Andric APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 13170b57cec5SDimitry Andric 1318*8bcb0991SDimitry Andric auto C1 = B.buildFConstant(Ty, C1Val); 1319*8bcb0991SDimitry Andric auto CopySign = B.buildFCopysign(Ty, C1, Src); 13200b57cec5SDimitry Andric 13210b57cec5SDimitry Andric // TODO: Should this propagate fast-math-flags? 1322*8bcb0991SDimitry Andric auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 1323*8bcb0991SDimitry Andric auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 13240b57cec5SDimitry Andric 1325*8bcb0991SDimitry Andric auto C2 = B.buildFConstant(Ty, C2Val); 1326*8bcb0991SDimitry Andric auto Fabs = B.buildFAbs(Ty, Src); 13270b57cec5SDimitry Andric 1328*8bcb0991SDimitry Andric auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 1329*8bcb0991SDimitry Andric B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 13300b57cec5SDimitry Andric return true; 13310b57cec5SDimitry Andric } 13320b57cec5SDimitry Andric 13330b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeFceil( 13340b57cec5SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 13350b57cec5SDimitry Andric MachineIRBuilder &B) const { 13360b57cec5SDimitry Andric B.setInstr(MI); 13370b57cec5SDimitry Andric 13380b57cec5SDimitry Andric const LLT S1 = LLT::scalar(1); 13390b57cec5SDimitry Andric const LLT S64 = LLT::scalar(64); 13400b57cec5SDimitry Andric 13410b57cec5SDimitry Andric Register Src = MI.getOperand(1).getReg(); 13420b57cec5SDimitry Andric assert(MRI.getType(Src) == S64); 13430b57cec5SDimitry Andric 13440b57cec5SDimitry Andric // result = trunc(src) 13450b57cec5SDimitry Andric // if (src > 0.0 && src != result) 13460b57cec5SDimitry Andric // result += 1.0 13470b57cec5SDimitry Andric 13480b57cec5SDimitry Andric auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src}); 13490b57cec5SDimitry Andric 13500b57cec5SDimitry Andric const auto Zero = B.buildFConstant(S64, 0.0); 13510b57cec5SDimitry Andric const auto One = B.buildFConstant(S64, 1.0); 13520b57cec5SDimitry Andric auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 13530b57cec5SDimitry Andric auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 13540b57cec5SDimitry Andric auto And = B.buildAnd(S1, Lt0, NeTrunc); 13550b57cec5SDimitry Andric auto Add = B.buildSelect(S64, And, One, Zero); 13560b57cec5SDimitry Andric 13570b57cec5SDimitry Andric // TODO: Should this propagate fast-math-flags? 13580b57cec5SDimitry Andric B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 13590b57cec5SDimitry Andric return true; 13600b57cec5SDimitry Andric } 13610b57cec5SDimitry Andric 13620b57cec5SDimitry Andric static MachineInstrBuilder extractF64Exponent(unsigned Hi, 13630b57cec5SDimitry Andric MachineIRBuilder &B) { 13640b57cec5SDimitry Andric const unsigned FractBits = 52; 13650b57cec5SDimitry Andric const unsigned ExpBits = 11; 13660b57cec5SDimitry Andric LLT S32 = LLT::scalar(32); 13670b57cec5SDimitry Andric 13680b57cec5SDimitry Andric auto Const0 = B.buildConstant(S32, FractBits - 32); 13690b57cec5SDimitry Andric auto Const1 = B.buildConstant(S32, ExpBits); 13700b57cec5SDimitry Andric 13710b57cec5SDimitry Andric auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 13720b57cec5SDimitry Andric .addUse(Const0.getReg(0)) 13730b57cec5SDimitry Andric .addUse(Const1.getReg(0)); 13740b57cec5SDimitry Andric 13750b57cec5SDimitry Andric return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 13760b57cec5SDimitry Andric } 13770b57cec5SDimitry Andric 13780b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 13790b57cec5SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 13800b57cec5SDimitry Andric MachineIRBuilder &B) const { 13810b57cec5SDimitry Andric B.setInstr(MI); 13820b57cec5SDimitry Andric 13830b57cec5SDimitry Andric const LLT S1 = LLT::scalar(1); 13840b57cec5SDimitry Andric const LLT S32 = LLT::scalar(32); 13850b57cec5SDimitry Andric const LLT S64 = LLT::scalar(64); 13860b57cec5SDimitry Andric 13870b57cec5SDimitry Andric Register Src = MI.getOperand(1).getReg(); 13880b57cec5SDimitry Andric assert(MRI.getType(Src) == S64); 13890b57cec5SDimitry Andric 13900b57cec5SDimitry Andric // TODO: Should this use extract since the low half is unused? 13910b57cec5SDimitry Andric auto Unmerge = B.buildUnmerge({S32, S32}, Src); 13920b57cec5SDimitry Andric Register Hi = Unmerge.getReg(1); 13930b57cec5SDimitry Andric 13940b57cec5SDimitry Andric // Extract the upper half, since this is where we will find the sign and 13950b57cec5SDimitry Andric // exponent. 13960b57cec5SDimitry Andric auto Exp = extractF64Exponent(Hi, B); 13970b57cec5SDimitry Andric 13980b57cec5SDimitry Andric const unsigned FractBits = 52; 13990b57cec5SDimitry Andric 14000b57cec5SDimitry Andric // Extract the sign bit. 14010b57cec5SDimitry Andric const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 14020b57cec5SDimitry Andric auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 14030b57cec5SDimitry Andric 14040b57cec5SDimitry Andric const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 14050b57cec5SDimitry Andric 14060b57cec5SDimitry Andric const auto Zero32 = B.buildConstant(S32, 0); 14070b57cec5SDimitry Andric 14080b57cec5SDimitry Andric // Extend back to 64-bits. 14090b57cec5SDimitry Andric auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)}); 14100b57cec5SDimitry Andric 14110b57cec5SDimitry Andric auto Shr = B.buildAShr(S64, FractMask, Exp); 14120b57cec5SDimitry Andric auto Not = B.buildNot(S64, Shr); 14130b57cec5SDimitry Andric auto Tmp0 = B.buildAnd(S64, Src, Not); 14140b57cec5SDimitry Andric auto FiftyOne = B.buildConstant(S32, FractBits - 1); 14150b57cec5SDimitry Andric 14160b57cec5SDimitry Andric auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 14170b57cec5SDimitry Andric auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 14180b57cec5SDimitry Andric 14190b57cec5SDimitry Andric auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 14200b57cec5SDimitry Andric B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 14210b57cec5SDimitry Andric return true; 14220b57cec5SDimitry Andric } 14230b57cec5SDimitry Andric 14240b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeITOFP( 14250b57cec5SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 14260b57cec5SDimitry Andric MachineIRBuilder &B, bool Signed) const { 14270b57cec5SDimitry Andric B.setInstr(MI); 14280b57cec5SDimitry Andric 14290b57cec5SDimitry Andric Register Dst = MI.getOperand(0).getReg(); 14300b57cec5SDimitry Andric Register Src = MI.getOperand(1).getReg(); 14310b57cec5SDimitry Andric 14320b57cec5SDimitry Andric const LLT S64 = LLT::scalar(64); 14330b57cec5SDimitry Andric const LLT S32 = LLT::scalar(32); 14340b57cec5SDimitry Andric 14350b57cec5SDimitry Andric assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 14360b57cec5SDimitry Andric 14370b57cec5SDimitry Andric auto Unmerge = B.buildUnmerge({S32, S32}, Src); 14380b57cec5SDimitry Andric 14390b57cec5SDimitry Andric auto CvtHi = Signed ? 14400b57cec5SDimitry Andric B.buildSITOFP(S64, Unmerge.getReg(1)) : 14410b57cec5SDimitry Andric B.buildUITOFP(S64, Unmerge.getReg(1)); 14420b57cec5SDimitry Andric 14430b57cec5SDimitry Andric auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 14440b57cec5SDimitry Andric 14450b57cec5SDimitry Andric auto ThirtyTwo = B.buildConstant(S32, 32); 14460b57cec5SDimitry Andric auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 14470b57cec5SDimitry Andric .addUse(CvtHi.getReg(0)) 14480b57cec5SDimitry Andric .addUse(ThirtyTwo.getReg(0)); 14490b57cec5SDimitry Andric 14500b57cec5SDimitry Andric // TODO: Should this propagate fast-math-flags? 14510b57cec5SDimitry Andric B.buildFAdd(Dst, LdExp, CvtLo); 14520b57cec5SDimitry Andric MI.eraseFromParent(); 14530b57cec5SDimitry Andric return true; 14540b57cec5SDimitry Andric } 14550b57cec5SDimitry Andric 14560b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeMinNumMaxNum( 14570b57cec5SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 14580b57cec5SDimitry Andric MachineIRBuilder &B) const { 14590b57cec5SDimitry Andric MachineFunction &MF = B.getMF(); 14600b57cec5SDimitry Andric const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 14610b57cec5SDimitry Andric 14620b57cec5SDimitry Andric const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 14630b57cec5SDimitry Andric MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 14640b57cec5SDimitry Andric 14650b57cec5SDimitry Andric // With ieee_mode disabled, the instructions have the correct behavior 14660b57cec5SDimitry Andric // already for G_FMINNUM/G_FMAXNUM 14670b57cec5SDimitry Andric if (!MFI->getMode().IEEE) 14680b57cec5SDimitry Andric return !IsIEEEOp; 14690b57cec5SDimitry Andric 14700b57cec5SDimitry Andric if (IsIEEEOp) 14710b57cec5SDimitry Andric return true; 14720b57cec5SDimitry Andric 14730b57cec5SDimitry Andric MachineIRBuilder HelperBuilder(MI); 14740b57cec5SDimitry Andric GISelObserverWrapper DummyObserver; 14750b57cec5SDimitry Andric LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 1476*8bcb0991SDimitry Andric HelperBuilder.setInstr(MI); 14770b57cec5SDimitry Andric return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 14780b57cec5SDimitry Andric } 14790b57cec5SDimitry Andric 14800b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 14810b57cec5SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 14820b57cec5SDimitry Andric MachineIRBuilder &B) const { 14830b57cec5SDimitry Andric // TODO: Should move some of this into LegalizerHelper. 14840b57cec5SDimitry Andric 14850b57cec5SDimitry Andric // TODO: Promote dynamic indexing of s16 to s32 14860b57cec5SDimitry Andric // TODO: Dynamic s64 indexing is only legal for SGPR. 14870b57cec5SDimitry Andric Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI); 14880b57cec5SDimitry Andric if (!IdxVal) // Dynamic case will be selected to register indexing. 14890b57cec5SDimitry Andric return true; 14900b57cec5SDimitry Andric 14910b57cec5SDimitry Andric Register Dst = MI.getOperand(0).getReg(); 14920b57cec5SDimitry Andric Register Vec = MI.getOperand(1).getReg(); 14930b57cec5SDimitry Andric 14940b57cec5SDimitry Andric LLT VecTy = MRI.getType(Vec); 14950b57cec5SDimitry Andric LLT EltTy = VecTy.getElementType(); 14960b57cec5SDimitry Andric assert(EltTy == MRI.getType(Dst)); 14970b57cec5SDimitry Andric 14980b57cec5SDimitry Andric B.setInstr(MI); 14990b57cec5SDimitry Andric 15000b57cec5SDimitry Andric if (IdxVal.getValue() < VecTy.getNumElements()) 15010b57cec5SDimitry Andric B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits()); 15020b57cec5SDimitry Andric else 15030b57cec5SDimitry Andric B.buildUndef(Dst); 15040b57cec5SDimitry Andric 15050b57cec5SDimitry Andric MI.eraseFromParent(); 15060b57cec5SDimitry Andric return true; 15070b57cec5SDimitry Andric } 15080b57cec5SDimitry Andric 15090b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 15100b57cec5SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 15110b57cec5SDimitry Andric MachineIRBuilder &B) const { 15120b57cec5SDimitry Andric // TODO: Should move some of this into LegalizerHelper. 15130b57cec5SDimitry Andric 15140b57cec5SDimitry Andric // TODO: Promote dynamic indexing of s16 to s32 15150b57cec5SDimitry Andric // TODO: Dynamic s64 indexing is only legal for SGPR. 15160b57cec5SDimitry Andric Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI); 15170b57cec5SDimitry Andric if (!IdxVal) // Dynamic case will be selected to register indexing. 15180b57cec5SDimitry Andric return true; 15190b57cec5SDimitry Andric 15200b57cec5SDimitry Andric Register Dst = MI.getOperand(0).getReg(); 15210b57cec5SDimitry Andric Register Vec = MI.getOperand(1).getReg(); 15220b57cec5SDimitry Andric Register Ins = MI.getOperand(2).getReg(); 15230b57cec5SDimitry Andric 15240b57cec5SDimitry Andric LLT VecTy = MRI.getType(Vec); 15250b57cec5SDimitry Andric LLT EltTy = VecTy.getElementType(); 15260b57cec5SDimitry Andric assert(EltTy == MRI.getType(Ins)); 15270b57cec5SDimitry Andric 15280b57cec5SDimitry Andric B.setInstr(MI); 15290b57cec5SDimitry Andric 15300b57cec5SDimitry Andric if (IdxVal.getValue() < VecTy.getNumElements()) 15310b57cec5SDimitry Andric B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits()); 15320b57cec5SDimitry Andric else 15330b57cec5SDimitry Andric B.buildUndef(Dst); 15340b57cec5SDimitry Andric 15350b57cec5SDimitry Andric MI.eraseFromParent(); 15360b57cec5SDimitry Andric return true; 15370b57cec5SDimitry Andric } 15380b57cec5SDimitry Andric 1539*8bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeSinCos( 1540*8bcb0991SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 1541*8bcb0991SDimitry Andric MachineIRBuilder &B) const { 1542*8bcb0991SDimitry Andric B.setInstr(MI); 1543*8bcb0991SDimitry Andric 1544*8bcb0991SDimitry Andric Register DstReg = MI.getOperand(0).getReg(); 1545*8bcb0991SDimitry Andric Register SrcReg = MI.getOperand(1).getReg(); 1546*8bcb0991SDimitry Andric LLT Ty = MRI.getType(DstReg); 1547*8bcb0991SDimitry Andric unsigned Flags = MI.getFlags(); 1548*8bcb0991SDimitry Andric 1549*8bcb0991SDimitry Andric Register TrigVal; 1550*8bcb0991SDimitry Andric auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI); 1551*8bcb0991SDimitry Andric if (ST.hasTrigReducedRange()) { 1552*8bcb0991SDimitry Andric auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 1553*8bcb0991SDimitry Andric TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 1554*8bcb0991SDimitry Andric .addUse(MulVal.getReg(0)) 1555*8bcb0991SDimitry Andric .setMIFlags(Flags).getReg(0); 1556*8bcb0991SDimitry Andric } else 1557*8bcb0991SDimitry Andric TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 1558*8bcb0991SDimitry Andric 1559*8bcb0991SDimitry Andric Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 1560*8bcb0991SDimitry Andric Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 1561*8bcb0991SDimitry Andric B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 1562*8bcb0991SDimitry Andric .addUse(TrigVal) 1563*8bcb0991SDimitry Andric .setMIFlags(Flags); 1564*8bcb0991SDimitry Andric MI.eraseFromParent(); 1565*8bcb0991SDimitry Andric return true; 1566*8bcb0991SDimitry Andric } 1567*8bcb0991SDimitry Andric 1568*8bcb0991SDimitry Andric bool AMDGPULegalizerInfo::buildPCRelGlobalAddress( 1569*8bcb0991SDimitry Andric Register DstReg, LLT PtrTy, 1570*8bcb0991SDimitry Andric MachineIRBuilder &B, const GlobalValue *GV, 1571*8bcb0991SDimitry Andric unsigned Offset, unsigned GAFlags) const { 1572*8bcb0991SDimitry Andric // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 1573*8bcb0991SDimitry Andric // to the following code sequence: 1574*8bcb0991SDimitry Andric // 1575*8bcb0991SDimitry Andric // For constant address space: 1576*8bcb0991SDimitry Andric // s_getpc_b64 s[0:1] 1577*8bcb0991SDimitry Andric // s_add_u32 s0, s0, $symbol 1578*8bcb0991SDimitry Andric // s_addc_u32 s1, s1, 0 1579*8bcb0991SDimitry Andric // 1580*8bcb0991SDimitry Andric // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1581*8bcb0991SDimitry Andric // a fixup or relocation is emitted to replace $symbol with a literal 1582*8bcb0991SDimitry Andric // constant, which is a pc-relative offset from the encoding of the $symbol 1583*8bcb0991SDimitry Andric // operand to the global variable. 1584*8bcb0991SDimitry Andric // 1585*8bcb0991SDimitry Andric // For global address space: 1586*8bcb0991SDimitry Andric // s_getpc_b64 s[0:1] 1587*8bcb0991SDimitry Andric // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 1588*8bcb0991SDimitry Andric // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 1589*8bcb0991SDimitry Andric // 1590*8bcb0991SDimitry Andric // s_getpc_b64 returns the address of the s_add_u32 instruction and then 1591*8bcb0991SDimitry Andric // fixups or relocations are emitted to replace $symbol@*@lo and 1592*8bcb0991SDimitry Andric // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 1593*8bcb0991SDimitry Andric // which is a 64-bit pc-relative offset from the encoding of the $symbol 1594*8bcb0991SDimitry Andric // operand to the global variable. 1595*8bcb0991SDimitry Andric // 1596*8bcb0991SDimitry Andric // What we want here is an offset from the value returned by s_getpc 1597*8bcb0991SDimitry Andric // (which is the address of the s_add_u32 instruction) to the global 1598*8bcb0991SDimitry Andric // variable, but since the encoding of $symbol starts 4 bytes after the start 1599*8bcb0991SDimitry Andric // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 1600*8bcb0991SDimitry Andric // small. This requires us to add 4 to the global variable offset in order to 1601*8bcb0991SDimitry Andric // compute the correct address. 1602*8bcb0991SDimitry Andric 1603*8bcb0991SDimitry Andric LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1604*8bcb0991SDimitry Andric 1605*8bcb0991SDimitry Andric Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 1606*8bcb0991SDimitry Andric B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 1607*8bcb0991SDimitry Andric 1608*8bcb0991SDimitry Andric MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 1609*8bcb0991SDimitry Andric .addDef(PCReg); 1610*8bcb0991SDimitry Andric 1611*8bcb0991SDimitry Andric MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 1612*8bcb0991SDimitry Andric if (GAFlags == SIInstrInfo::MO_NONE) 1613*8bcb0991SDimitry Andric MIB.addImm(0); 1614*8bcb0991SDimitry Andric else 1615*8bcb0991SDimitry Andric MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 1616*8bcb0991SDimitry Andric 1617*8bcb0991SDimitry Andric B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 1618*8bcb0991SDimitry Andric 1619*8bcb0991SDimitry Andric if (PtrTy.getSizeInBits() == 32) 1620*8bcb0991SDimitry Andric B.buildExtract(DstReg, PCReg, 0); 1621*8bcb0991SDimitry Andric return true; 1622*8bcb0991SDimitry Andric } 1623*8bcb0991SDimitry Andric 1624*8bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeGlobalValue( 1625*8bcb0991SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 1626*8bcb0991SDimitry Andric MachineIRBuilder &B) const { 1627*8bcb0991SDimitry Andric Register DstReg = MI.getOperand(0).getReg(); 1628*8bcb0991SDimitry Andric LLT Ty = MRI.getType(DstReg); 1629*8bcb0991SDimitry Andric unsigned AS = Ty.getAddressSpace(); 1630*8bcb0991SDimitry Andric 1631*8bcb0991SDimitry Andric const GlobalValue *GV = MI.getOperand(1).getGlobal(); 1632*8bcb0991SDimitry Andric MachineFunction &MF = B.getMF(); 1633*8bcb0991SDimitry Andric SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1634*8bcb0991SDimitry Andric B.setInstr(MI); 1635*8bcb0991SDimitry Andric 1636*8bcb0991SDimitry Andric if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 1637*8bcb0991SDimitry Andric if (!MFI->isEntryFunction()) { 1638*8bcb0991SDimitry Andric const Function &Fn = MF.getFunction(); 1639*8bcb0991SDimitry Andric DiagnosticInfoUnsupported BadLDSDecl( 1640*8bcb0991SDimitry Andric Fn, "local memory global used by non-kernel function", MI.getDebugLoc()); 1641*8bcb0991SDimitry Andric Fn.getContext().diagnose(BadLDSDecl); 1642*8bcb0991SDimitry Andric } 1643*8bcb0991SDimitry Andric 1644*8bcb0991SDimitry Andric // TODO: We could emit code to handle the initialization somewhere. 1645*8bcb0991SDimitry Andric if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { 1646*8bcb0991SDimitry Andric B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV)); 1647*8bcb0991SDimitry Andric MI.eraseFromParent(); 1648*8bcb0991SDimitry Andric return true; 1649*8bcb0991SDimitry Andric } 1650*8bcb0991SDimitry Andric 1651*8bcb0991SDimitry Andric const Function &Fn = MF.getFunction(); 1652*8bcb0991SDimitry Andric DiagnosticInfoUnsupported BadInit( 1653*8bcb0991SDimitry Andric Fn, "unsupported initializer for address space", MI.getDebugLoc()); 1654*8bcb0991SDimitry Andric Fn.getContext().diagnose(BadInit); 1655*8bcb0991SDimitry Andric return true; 1656*8bcb0991SDimitry Andric } 1657*8bcb0991SDimitry Andric 1658*8bcb0991SDimitry Andric const SITargetLowering *TLI = ST.getTargetLowering(); 1659*8bcb0991SDimitry Andric 1660*8bcb0991SDimitry Andric if (TLI->shouldEmitFixup(GV)) { 1661*8bcb0991SDimitry Andric buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 1662*8bcb0991SDimitry Andric MI.eraseFromParent(); 1663*8bcb0991SDimitry Andric return true; 1664*8bcb0991SDimitry Andric } 1665*8bcb0991SDimitry Andric 1666*8bcb0991SDimitry Andric if (TLI->shouldEmitPCReloc(GV)) { 1667*8bcb0991SDimitry Andric buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 1668*8bcb0991SDimitry Andric MI.eraseFromParent(); 1669*8bcb0991SDimitry Andric return true; 1670*8bcb0991SDimitry Andric } 1671*8bcb0991SDimitry Andric 1672*8bcb0991SDimitry Andric LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1673*8bcb0991SDimitry Andric Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 1674*8bcb0991SDimitry Andric 1675*8bcb0991SDimitry Andric MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 1676*8bcb0991SDimitry Andric MachinePointerInfo::getGOT(MF), 1677*8bcb0991SDimitry Andric MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 1678*8bcb0991SDimitry Andric MachineMemOperand::MOInvariant, 1679*8bcb0991SDimitry Andric 8 /*Size*/, 8 /*Align*/); 1680*8bcb0991SDimitry Andric 1681*8bcb0991SDimitry Andric buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 1682*8bcb0991SDimitry Andric 1683*8bcb0991SDimitry Andric if (Ty.getSizeInBits() == 32) { 1684*8bcb0991SDimitry Andric // Truncate if this is a 32-bit constant adrdess. 1685*8bcb0991SDimitry Andric auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 1686*8bcb0991SDimitry Andric B.buildExtract(DstReg, Load, 0); 1687*8bcb0991SDimitry Andric } else 1688*8bcb0991SDimitry Andric B.buildLoad(DstReg, GOTAddr, *GOTMMO); 1689*8bcb0991SDimitry Andric 1690*8bcb0991SDimitry Andric MI.eraseFromParent(); 1691*8bcb0991SDimitry Andric return true; 1692*8bcb0991SDimitry Andric } 1693*8bcb0991SDimitry Andric 1694*8bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeLoad( 1695*8bcb0991SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 1696*8bcb0991SDimitry Andric MachineIRBuilder &B, GISelChangeObserver &Observer) const { 1697*8bcb0991SDimitry Andric B.setInstr(MI); 1698*8bcb0991SDimitry Andric LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 1699*8bcb0991SDimitry Andric auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg()); 1700*8bcb0991SDimitry Andric Observer.changingInstr(MI); 1701*8bcb0991SDimitry Andric MI.getOperand(1).setReg(Cast.getReg(0)); 1702*8bcb0991SDimitry Andric Observer.changedInstr(MI); 1703*8bcb0991SDimitry Andric return true; 1704*8bcb0991SDimitry Andric } 1705*8bcb0991SDimitry Andric 1706*8bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFMad( 1707*8bcb0991SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 1708*8bcb0991SDimitry Andric MachineIRBuilder &B) const { 1709*8bcb0991SDimitry Andric LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 1710*8bcb0991SDimitry Andric assert(Ty.isScalar()); 1711*8bcb0991SDimitry Andric 1712*8bcb0991SDimitry Andric // TODO: Always legal with future ftz flag. 1713*8bcb0991SDimitry Andric if (Ty == LLT::scalar(32) && !ST.hasFP32Denormals()) 1714*8bcb0991SDimitry Andric return true; 1715*8bcb0991SDimitry Andric if (Ty == LLT::scalar(16) && !ST.hasFP16Denormals()) 1716*8bcb0991SDimitry Andric return true; 1717*8bcb0991SDimitry Andric 1718*8bcb0991SDimitry Andric MachineFunction &MF = B.getMF(); 1719*8bcb0991SDimitry Andric 1720*8bcb0991SDimitry Andric MachineIRBuilder HelperBuilder(MI); 1721*8bcb0991SDimitry Andric GISelObserverWrapper DummyObserver; 1722*8bcb0991SDimitry Andric LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 1723*8bcb0991SDimitry Andric HelperBuilder.setMBB(*MI.getParent()); 1724*8bcb0991SDimitry Andric return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 1725*8bcb0991SDimitry Andric } 1726*8bcb0991SDimitry Andric 17270b57cec5SDimitry Andric // Return the use branch instruction, otherwise null if the usage is invalid. 17280b57cec5SDimitry Andric static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 17290b57cec5SDimitry Andric MachineRegisterInfo &MRI) { 17300b57cec5SDimitry Andric Register CondDef = MI.getOperand(0).getReg(); 17310b57cec5SDimitry Andric if (!MRI.hasOneNonDBGUse(CondDef)) 17320b57cec5SDimitry Andric return nullptr; 17330b57cec5SDimitry Andric 17340b57cec5SDimitry Andric MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 17350b57cec5SDimitry Andric return UseMI.getParent() == MI.getParent() && 17360b57cec5SDimitry Andric UseMI.getOpcode() == AMDGPU::G_BRCOND ? &UseMI : nullptr; 17370b57cec5SDimitry Andric } 17380b57cec5SDimitry Andric 17390b57cec5SDimitry Andric Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI, 17400b57cec5SDimitry Andric Register Reg, LLT Ty) const { 17410b57cec5SDimitry Andric Register LiveIn = MRI.getLiveInVirtReg(Reg); 17420b57cec5SDimitry Andric if (LiveIn) 17430b57cec5SDimitry Andric return LiveIn; 17440b57cec5SDimitry Andric 17450b57cec5SDimitry Andric Register NewReg = MRI.createGenericVirtualRegister(Ty); 17460b57cec5SDimitry Andric MRI.addLiveIn(Reg, NewReg); 17470b57cec5SDimitry Andric return NewReg; 17480b57cec5SDimitry Andric } 17490b57cec5SDimitry Andric 17500b57cec5SDimitry Andric bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 17510b57cec5SDimitry Andric const ArgDescriptor *Arg) const { 1752*8bcb0991SDimitry Andric if (!Arg->isRegister() || !Arg->getRegister().isValid()) 17530b57cec5SDimitry Andric return false; // TODO: Handle these 17540b57cec5SDimitry Andric 17550b57cec5SDimitry Andric assert(Arg->getRegister().isPhysical()); 17560b57cec5SDimitry Andric 17570b57cec5SDimitry Andric MachineRegisterInfo &MRI = *B.getMRI(); 17580b57cec5SDimitry Andric 17590b57cec5SDimitry Andric LLT Ty = MRI.getType(DstReg); 17600b57cec5SDimitry Andric Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty); 17610b57cec5SDimitry Andric 17620b57cec5SDimitry Andric if (Arg->isMasked()) { 17630b57cec5SDimitry Andric // TODO: Should we try to emit this once in the entry block? 17640b57cec5SDimitry Andric const LLT S32 = LLT::scalar(32); 17650b57cec5SDimitry Andric const unsigned Mask = Arg->getMask(); 17660b57cec5SDimitry Andric const unsigned Shift = countTrailingZeros<unsigned>(Mask); 17670b57cec5SDimitry Andric 1768*8bcb0991SDimitry Andric Register AndMaskSrc = LiveIn; 1769*8bcb0991SDimitry Andric 1770*8bcb0991SDimitry Andric if (Shift != 0) { 17710b57cec5SDimitry Andric auto ShiftAmt = B.buildConstant(S32, Shift); 1772*8bcb0991SDimitry Andric AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 1773*8bcb0991SDimitry Andric } 1774*8bcb0991SDimitry Andric 1775*8bcb0991SDimitry Andric B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 17760b57cec5SDimitry Andric } else 17770b57cec5SDimitry Andric B.buildCopy(DstReg, LiveIn); 17780b57cec5SDimitry Andric 17790b57cec5SDimitry Andric // Insert the argument copy if it doens't already exist. 17800b57cec5SDimitry Andric // FIXME: It seems EmitLiveInCopies isn't called anywhere? 17810b57cec5SDimitry Andric if (!MRI.getVRegDef(LiveIn)) { 1782*8bcb0991SDimitry Andric // FIXME: Should have scoped insert pt 1783*8bcb0991SDimitry Andric MachineBasicBlock &OrigInsBB = B.getMBB(); 1784*8bcb0991SDimitry Andric auto OrigInsPt = B.getInsertPt(); 1785*8bcb0991SDimitry Andric 17860b57cec5SDimitry Andric MachineBasicBlock &EntryMBB = B.getMF().front(); 17870b57cec5SDimitry Andric EntryMBB.addLiveIn(Arg->getRegister()); 17880b57cec5SDimitry Andric B.setInsertPt(EntryMBB, EntryMBB.begin()); 17890b57cec5SDimitry Andric B.buildCopy(LiveIn, Arg->getRegister()); 1790*8bcb0991SDimitry Andric 1791*8bcb0991SDimitry Andric B.setInsertPt(OrigInsBB, OrigInsPt); 17920b57cec5SDimitry Andric } 17930b57cec5SDimitry Andric 17940b57cec5SDimitry Andric return true; 17950b57cec5SDimitry Andric } 17960b57cec5SDimitry Andric 17970b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 17980b57cec5SDimitry Andric MachineInstr &MI, 17990b57cec5SDimitry Andric MachineRegisterInfo &MRI, 18000b57cec5SDimitry Andric MachineIRBuilder &B, 18010b57cec5SDimitry Andric AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 18020b57cec5SDimitry Andric B.setInstr(MI); 18030b57cec5SDimitry Andric 18040b57cec5SDimitry Andric const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 18050b57cec5SDimitry Andric 18060b57cec5SDimitry Andric const ArgDescriptor *Arg; 18070b57cec5SDimitry Andric const TargetRegisterClass *RC; 18080b57cec5SDimitry Andric std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType); 18090b57cec5SDimitry Andric if (!Arg) { 18100b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "Required arg register missing\n"); 18110b57cec5SDimitry Andric return false; 18120b57cec5SDimitry Andric } 18130b57cec5SDimitry Andric 18140b57cec5SDimitry Andric if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) { 18150b57cec5SDimitry Andric MI.eraseFromParent(); 18160b57cec5SDimitry Andric return true; 18170b57cec5SDimitry Andric } 18180b57cec5SDimitry Andric 18190b57cec5SDimitry Andric return false; 18200b57cec5SDimitry Andric } 18210b57cec5SDimitry Andric 1822*8bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 1823*8bcb0991SDimitry Andric MachineRegisterInfo &MRI, 1824*8bcb0991SDimitry Andric MachineIRBuilder &B) const { 1825*8bcb0991SDimitry Andric B.setInstr(MI); 1826*8bcb0991SDimitry Andric 1827*8bcb0991SDimitry Andric if (legalizeFastUnsafeFDIV(MI, MRI, B)) 1828*8bcb0991SDimitry Andric return true; 1829*8bcb0991SDimitry Andric 1830*8bcb0991SDimitry Andric return false; 1831*8bcb0991SDimitry Andric } 1832*8bcb0991SDimitry Andric 1833*8bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 1834*8bcb0991SDimitry Andric MachineRegisterInfo &MRI, 1835*8bcb0991SDimitry Andric MachineIRBuilder &B) const { 1836*8bcb0991SDimitry Andric Register Res = MI.getOperand(0).getReg(); 1837*8bcb0991SDimitry Andric Register LHS = MI.getOperand(1).getReg(); 1838*8bcb0991SDimitry Andric Register RHS = MI.getOperand(2).getReg(); 1839*8bcb0991SDimitry Andric 1840*8bcb0991SDimitry Andric uint16_t Flags = MI.getFlags(); 1841*8bcb0991SDimitry Andric 1842*8bcb0991SDimitry Andric LLT ResTy = MRI.getType(Res); 1843*8bcb0991SDimitry Andric LLT S32 = LLT::scalar(32); 1844*8bcb0991SDimitry Andric LLT S64 = LLT::scalar(64); 1845*8bcb0991SDimitry Andric 1846*8bcb0991SDimitry Andric const MachineFunction &MF = B.getMF(); 1847*8bcb0991SDimitry Andric bool Unsafe = 1848*8bcb0991SDimitry Andric MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 1849*8bcb0991SDimitry Andric 1850*8bcb0991SDimitry Andric if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 1851*8bcb0991SDimitry Andric return false; 1852*8bcb0991SDimitry Andric 1853*8bcb0991SDimitry Andric if (!Unsafe && ResTy == S32 && ST.hasFP32Denormals()) 1854*8bcb0991SDimitry Andric return false; 1855*8bcb0991SDimitry Andric 1856*8bcb0991SDimitry Andric if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 1857*8bcb0991SDimitry Andric // 1 / x -> RCP(x) 1858*8bcb0991SDimitry Andric if (CLHS->isExactlyValue(1.0)) { 1859*8bcb0991SDimitry Andric B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 1860*8bcb0991SDimitry Andric .addUse(RHS) 1861*8bcb0991SDimitry Andric .setMIFlags(Flags); 1862*8bcb0991SDimitry Andric 1863*8bcb0991SDimitry Andric MI.eraseFromParent(); 1864*8bcb0991SDimitry Andric return true; 1865*8bcb0991SDimitry Andric } 1866*8bcb0991SDimitry Andric 1867*8bcb0991SDimitry Andric // -1 / x -> RCP( FNEG(x) ) 1868*8bcb0991SDimitry Andric if (CLHS->isExactlyValue(-1.0)) { 1869*8bcb0991SDimitry Andric auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 1870*8bcb0991SDimitry Andric B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 1871*8bcb0991SDimitry Andric .addUse(FNeg.getReg(0)) 1872*8bcb0991SDimitry Andric .setMIFlags(Flags); 1873*8bcb0991SDimitry Andric 1874*8bcb0991SDimitry Andric MI.eraseFromParent(); 1875*8bcb0991SDimitry Andric return true; 1876*8bcb0991SDimitry Andric } 1877*8bcb0991SDimitry Andric } 1878*8bcb0991SDimitry Andric 1879*8bcb0991SDimitry Andric // x / y -> x * (1.0 / y) 1880*8bcb0991SDimitry Andric if (Unsafe) { 1881*8bcb0991SDimitry Andric auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 1882*8bcb0991SDimitry Andric .addUse(RHS) 1883*8bcb0991SDimitry Andric .setMIFlags(Flags); 1884*8bcb0991SDimitry Andric B.buildFMul(Res, LHS, RCP, Flags); 1885*8bcb0991SDimitry Andric 1886*8bcb0991SDimitry Andric MI.eraseFromParent(); 1887*8bcb0991SDimitry Andric return true; 1888*8bcb0991SDimitry Andric } 1889*8bcb0991SDimitry Andric 1890*8bcb0991SDimitry Andric return false; 1891*8bcb0991SDimitry Andric } 1892*8bcb0991SDimitry Andric 1893*8bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 1894*8bcb0991SDimitry Andric MachineRegisterInfo &MRI, 1895*8bcb0991SDimitry Andric MachineIRBuilder &B) const { 1896*8bcb0991SDimitry Andric B.setInstr(MI); 1897*8bcb0991SDimitry Andric Register Res = MI.getOperand(0).getReg(); 1898*8bcb0991SDimitry Andric Register LHS = MI.getOperand(2).getReg(); 1899*8bcb0991SDimitry Andric Register RHS = MI.getOperand(3).getReg(); 1900*8bcb0991SDimitry Andric uint16_t Flags = MI.getFlags(); 1901*8bcb0991SDimitry Andric 1902*8bcb0991SDimitry Andric LLT S32 = LLT::scalar(32); 1903*8bcb0991SDimitry Andric LLT S1 = LLT::scalar(1); 1904*8bcb0991SDimitry Andric 1905*8bcb0991SDimitry Andric auto Abs = B.buildFAbs(S32, RHS, Flags); 1906*8bcb0991SDimitry Andric const APFloat C0Val(1.0f); 1907*8bcb0991SDimitry Andric 1908*8bcb0991SDimitry Andric auto C0 = B.buildConstant(S32, 0x6f800000); 1909*8bcb0991SDimitry Andric auto C1 = B.buildConstant(S32, 0x2f800000); 1910*8bcb0991SDimitry Andric auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 1911*8bcb0991SDimitry Andric 1912*8bcb0991SDimitry Andric auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 1913*8bcb0991SDimitry Andric auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 1914*8bcb0991SDimitry Andric 1915*8bcb0991SDimitry Andric auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 1916*8bcb0991SDimitry Andric 1917*8bcb0991SDimitry Andric auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 1918*8bcb0991SDimitry Andric .addUse(Mul0.getReg(0)) 1919*8bcb0991SDimitry Andric .setMIFlags(Flags); 1920*8bcb0991SDimitry Andric 1921*8bcb0991SDimitry Andric auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 1922*8bcb0991SDimitry Andric 1923*8bcb0991SDimitry Andric B.buildFMul(Res, Sel, Mul1, Flags); 1924*8bcb0991SDimitry Andric 1925*8bcb0991SDimitry Andric MI.eraseFromParent(); 1926*8bcb0991SDimitry Andric return true; 1927*8bcb0991SDimitry Andric } 1928*8bcb0991SDimitry Andric 19290b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 19300b57cec5SDimitry Andric MachineRegisterInfo &MRI, 19310b57cec5SDimitry Andric MachineIRBuilder &B) const { 19320b57cec5SDimitry Andric const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 19330b57cec5SDimitry Andric if (!MFI->isEntryFunction()) { 19340b57cec5SDimitry Andric return legalizePreloadedArgIntrin(MI, MRI, B, 19350b57cec5SDimitry Andric AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 19360b57cec5SDimitry Andric } 19370b57cec5SDimitry Andric 19380b57cec5SDimitry Andric B.setInstr(MI); 19390b57cec5SDimitry Andric 19400b57cec5SDimitry Andric uint64_t Offset = 19410b57cec5SDimitry Andric ST.getTargetLowering()->getImplicitParameterOffset( 19420b57cec5SDimitry Andric B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 19430b57cec5SDimitry Andric Register DstReg = MI.getOperand(0).getReg(); 19440b57cec5SDimitry Andric LLT DstTy = MRI.getType(DstReg); 19450b57cec5SDimitry Andric LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 19460b57cec5SDimitry Andric 19470b57cec5SDimitry Andric const ArgDescriptor *Arg; 19480b57cec5SDimitry Andric const TargetRegisterClass *RC; 19490b57cec5SDimitry Andric std::tie(Arg, RC) 19500b57cec5SDimitry Andric = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 19510b57cec5SDimitry Andric if (!Arg) 19520b57cec5SDimitry Andric return false; 19530b57cec5SDimitry Andric 19540b57cec5SDimitry Andric Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 19550b57cec5SDimitry Andric if (!loadInputValue(KernargPtrReg, B, Arg)) 19560b57cec5SDimitry Andric return false; 19570b57cec5SDimitry Andric 19580b57cec5SDimitry Andric B.buildGEP(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 19590b57cec5SDimitry Andric MI.eraseFromParent(); 19600b57cec5SDimitry Andric return true; 19610b57cec5SDimitry Andric } 19620b57cec5SDimitry Andric 1963*8bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 1964*8bcb0991SDimitry Andric MachineRegisterInfo &MRI, 1965*8bcb0991SDimitry Andric MachineIRBuilder &B, 1966*8bcb0991SDimitry Andric unsigned AddrSpace) const { 1967*8bcb0991SDimitry Andric B.setInstr(MI); 1968*8bcb0991SDimitry Andric Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 1969*8bcb0991SDimitry Andric auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 1970*8bcb0991SDimitry Andric B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 1971*8bcb0991SDimitry Andric MI.eraseFromParent(); 1972*8bcb0991SDimitry Andric return true; 1973*8bcb0991SDimitry Andric } 1974*8bcb0991SDimitry Andric 1975*8bcb0991SDimitry Andric /// Handle register layout difference for f16 images for some subtargets. 1976*8bcb0991SDimitry Andric Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 1977*8bcb0991SDimitry Andric MachineRegisterInfo &MRI, 1978*8bcb0991SDimitry Andric Register Reg) const { 1979*8bcb0991SDimitry Andric if (!ST.hasUnpackedD16VMem()) 1980*8bcb0991SDimitry Andric return Reg; 1981*8bcb0991SDimitry Andric 1982*8bcb0991SDimitry Andric const LLT S16 = LLT::scalar(16); 1983*8bcb0991SDimitry Andric const LLT S32 = LLT::scalar(32); 1984*8bcb0991SDimitry Andric LLT StoreVT = MRI.getType(Reg); 1985*8bcb0991SDimitry Andric assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 1986*8bcb0991SDimitry Andric 1987*8bcb0991SDimitry Andric auto Unmerge = B.buildUnmerge(S16, Reg); 1988*8bcb0991SDimitry Andric 1989*8bcb0991SDimitry Andric SmallVector<Register, 4> WideRegs; 1990*8bcb0991SDimitry Andric for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 1991*8bcb0991SDimitry Andric WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 1992*8bcb0991SDimitry Andric 1993*8bcb0991SDimitry Andric int NumElts = StoreVT.getNumElements(); 1994*8bcb0991SDimitry Andric 1995*8bcb0991SDimitry Andric return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 1996*8bcb0991SDimitry Andric } 1997*8bcb0991SDimitry Andric 1998*8bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI, 1999*8bcb0991SDimitry Andric MachineRegisterInfo &MRI, 2000*8bcb0991SDimitry Andric MachineIRBuilder &B, 2001*8bcb0991SDimitry Andric bool IsFormat) const { 2002*8bcb0991SDimitry Andric // TODO: Reject f16 format on targets where unsupported. 2003*8bcb0991SDimitry Andric Register VData = MI.getOperand(1).getReg(); 2004*8bcb0991SDimitry Andric LLT Ty = MRI.getType(VData); 2005*8bcb0991SDimitry Andric 2006*8bcb0991SDimitry Andric B.setInstr(MI); 2007*8bcb0991SDimitry Andric 2008*8bcb0991SDimitry Andric const LLT S32 = LLT::scalar(32); 2009*8bcb0991SDimitry Andric const LLT S16 = LLT::scalar(16); 2010*8bcb0991SDimitry Andric 2011*8bcb0991SDimitry Andric // Fixup illegal register types for i8 stores. 2012*8bcb0991SDimitry Andric if (Ty == LLT::scalar(8) || Ty == S16) { 2013*8bcb0991SDimitry Andric Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 2014*8bcb0991SDimitry Andric MI.getOperand(1).setReg(AnyExt); 2015*8bcb0991SDimitry Andric return true; 2016*8bcb0991SDimitry Andric } 2017*8bcb0991SDimitry Andric 2018*8bcb0991SDimitry Andric if (Ty.isVector()) { 2019*8bcb0991SDimitry Andric if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 2020*8bcb0991SDimitry Andric if (IsFormat) 2021*8bcb0991SDimitry Andric MI.getOperand(1).setReg(handleD16VData(B, MRI, VData)); 2022*8bcb0991SDimitry Andric return true; 2023*8bcb0991SDimitry Andric } 2024*8bcb0991SDimitry Andric 2025*8bcb0991SDimitry Andric return Ty.getElementType() == S32 && Ty.getNumElements() <= 4; 2026*8bcb0991SDimitry Andric } 2027*8bcb0991SDimitry Andric 2028*8bcb0991SDimitry Andric return Ty == S32; 2029*8bcb0991SDimitry Andric } 2030*8bcb0991SDimitry Andric 20310b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI, 20320b57cec5SDimitry Andric MachineRegisterInfo &MRI, 20330b57cec5SDimitry Andric MachineIRBuilder &B) const { 20340b57cec5SDimitry Andric // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 2035*8bcb0991SDimitry Andric switch (MI.getIntrinsicID()) { 20360b57cec5SDimitry Andric case Intrinsic::amdgcn_if: { 20370b57cec5SDimitry Andric if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) { 20380b57cec5SDimitry Andric const SIRegisterInfo *TRI 20390b57cec5SDimitry Andric = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 20400b57cec5SDimitry Andric 20410b57cec5SDimitry Andric B.setInstr(*BrCond); 20420b57cec5SDimitry Andric Register Def = MI.getOperand(1).getReg(); 20430b57cec5SDimitry Andric Register Use = MI.getOperand(3).getReg(); 20440b57cec5SDimitry Andric B.buildInstr(AMDGPU::SI_IF) 20450b57cec5SDimitry Andric .addDef(Def) 20460b57cec5SDimitry Andric .addUse(Use) 20470b57cec5SDimitry Andric .addMBB(BrCond->getOperand(1).getMBB()); 20480b57cec5SDimitry Andric 20490b57cec5SDimitry Andric MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 20500b57cec5SDimitry Andric MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 20510b57cec5SDimitry Andric MI.eraseFromParent(); 20520b57cec5SDimitry Andric BrCond->eraseFromParent(); 20530b57cec5SDimitry Andric return true; 20540b57cec5SDimitry Andric } 20550b57cec5SDimitry Andric 20560b57cec5SDimitry Andric return false; 20570b57cec5SDimitry Andric } 20580b57cec5SDimitry Andric case Intrinsic::amdgcn_loop: { 20590b57cec5SDimitry Andric if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) { 20600b57cec5SDimitry Andric const SIRegisterInfo *TRI 20610b57cec5SDimitry Andric = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 20620b57cec5SDimitry Andric 20630b57cec5SDimitry Andric B.setInstr(*BrCond); 20640b57cec5SDimitry Andric Register Reg = MI.getOperand(2).getReg(); 20650b57cec5SDimitry Andric B.buildInstr(AMDGPU::SI_LOOP) 20660b57cec5SDimitry Andric .addUse(Reg) 20670b57cec5SDimitry Andric .addMBB(BrCond->getOperand(1).getMBB()); 20680b57cec5SDimitry Andric MI.eraseFromParent(); 20690b57cec5SDimitry Andric BrCond->eraseFromParent(); 20700b57cec5SDimitry Andric MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 20710b57cec5SDimitry Andric return true; 20720b57cec5SDimitry Andric } 20730b57cec5SDimitry Andric 20740b57cec5SDimitry Andric return false; 20750b57cec5SDimitry Andric } 20760b57cec5SDimitry Andric case Intrinsic::amdgcn_kernarg_segment_ptr: 20770b57cec5SDimitry Andric return legalizePreloadedArgIntrin( 20780b57cec5SDimitry Andric MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 20790b57cec5SDimitry Andric case Intrinsic::amdgcn_implicitarg_ptr: 20800b57cec5SDimitry Andric return legalizeImplicitArgPtr(MI, MRI, B); 20810b57cec5SDimitry Andric case Intrinsic::amdgcn_workitem_id_x: 20820b57cec5SDimitry Andric return legalizePreloadedArgIntrin(MI, MRI, B, 20830b57cec5SDimitry Andric AMDGPUFunctionArgInfo::WORKITEM_ID_X); 20840b57cec5SDimitry Andric case Intrinsic::amdgcn_workitem_id_y: 20850b57cec5SDimitry Andric return legalizePreloadedArgIntrin(MI, MRI, B, 20860b57cec5SDimitry Andric AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 20870b57cec5SDimitry Andric case Intrinsic::amdgcn_workitem_id_z: 20880b57cec5SDimitry Andric return legalizePreloadedArgIntrin(MI, MRI, B, 20890b57cec5SDimitry Andric AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 20900b57cec5SDimitry Andric case Intrinsic::amdgcn_workgroup_id_x: 20910b57cec5SDimitry Andric return legalizePreloadedArgIntrin(MI, MRI, B, 20920b57cec5SDimitry Andric AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 20930b57cec5SDimitry Andric case Intrinsic::amdgcn_workgroup_id_y: 20940b57cec5SDimitry Andric return legalizePreloadedArgIntrin(MI, MRI, B, 20950b57cec5SDimitry Andric AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 20960b57cec5SDimitry Andric case Intrinsic::amdgcn_workgroup_id_z: 20970b57cec5SDimitry Andric return legalizePreloadedArgIntrin(MI, MRI, B, 20980b57cec5SDimitry Andric AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 20990b57cec5SDimitry Andric case Intrinsic::amdgcn_dispatch_ptr: 21000b57cec5SDimitry Andric return legalizePreloadedArgIntrin(MI, MRI, B, 21010b57cec5SDimitry Andric AMDGPUFunctionArgInfo::DISPATCH_PTR); 21020b57cec5SDimitry Andric case Intrinsic::amdgcn_queue_ptr: 21030b57cec5SDimitry Andric return legalizePreloadedArgIntrin(MI, MRI, B, 21040b57cec5SDimitry Andric AMDGPUFunctionArgInfo::QUEUE_PTR); 21050b57cec5SDimitry Andric case Intrinsic::amdgcn_implicit_buffer_ptr: 21060b57cec5SDimitry Andric return legalizePreloadedArgIntrin( 21070b57cec5SDimitry Andric MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 21080b57cec5SDimitry Andric case Intrinsic::amdgcn_dispatch_id: 21090b57cec5SDimitry Andric return legalizePreloadedArgIntrin(MI, MRI, B, 21100b57cec5SDimitry Andric AMDGPUFunctionArgInfo::DISPATCH_ID); 2111*8bcb0991SDimitry Andric case Intrinsic::amdgcn_fdiv_fast: 2112*8bcb0991SDimitry Andric return legalizeFDIVFastIntrin(MI, MRI, B); 2113*8bcb0991SDimitry Andric case Intrinsic::amdgcn_is_shared: 2114*8bcb0991SDimitry Andric return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 2115*8bcb0991SDimitry Andric case Intrinsic::amdgcn_is_private: 2116*8bcb0991SDimitry Andric return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 2117*8bcb0991SDimitry Andric case Intrinsic::amdgcn_wavefrontsize: { 2118*8bcb0991SDimitry Andric B.setInstr(MI); 2119*8bcb0991SDimitry Andric B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 2120*8bcb0991SDimitry Andric MI.eraseFromParent(); 2121*8bcb0991SDimitry Andric return true; 2122*8bcb0991SDimitry Andric } 2123*8bcb0991SDimitry Andric case Intrinsic::amdgcn_raw_buffer_store: 2124*8bcb0991SDimitry Andric return legalizeRawBufferStore(MI, MRI, B, false); 2125*8bcb0991SDimitry Andric case Intrinsic::amdgcn_raw_buffer_store_format: 2126*8bcb0991SDimitry Andric return legalizeRawBufferStore(MI, MRI, B, true); 21270b57cec5SDimitry Andric default: 21280b57cec5SDimitry Andric return true; 21290b57cec5SDimitry Andric } 21300b57cec5SDimitry Andric 21310b57cec5SDimitry Andric return true; 21320b57cec5SDimitry Andric } 2133