10b57cec5SDimitry Andric //===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==// 20b57cec5SDimitry Andric // 30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information. 50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 60b57cec5SDimitry Andric // 70b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 80b57cec5SDimitry Andric /// \file 90b57cec5SDimitry Andric /// This file implements the targeting of the Machinelegalizer class for 100b57cec5SDimitry Andric /// AMDGPU. 110b57cec5SDimitry Andric /// \todo This should be generated by TableGen. 120b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 130b57cec5SDimitry Andric 148bcb0991SDimitry Andric #if defined(_MSC_VER) || defined(__MINGW32__) 158bcb0991SDimitry Andric // According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI 168bcb0991SDimitry Andric // from the Visual C++ cmath / math.h headers: 178bcb0991SDimitry Andric // https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019 188bcb0991SDimitry Andric #define _USE_MATH_DEFINES 198bcb0991SDimitry Andric #endif 208bcb0991SDimitry Andric 210b57cec5SDimitry Andric #include "AMDGPU.h" 220b57cec5SDimitry Andric #include "AMDGPULegalizerInfo.h" 230b57cec5SDimitry Andric #include "AMDGPUTargetMachine.h" 240b57cec5SDimitry Andric #include "SIMachineFunctionInfo.h" 250b57cec5SDimitry Andric #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" 260b57cec5SDimitry Andric #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" 270b57cec5SDimitry Andric #include "llvm/CodeGen/TargetOpcodes.h" 280b57cec5SDimitry Andric #include "llvm/CodeGen/ValueTypes.h" 290b57cec5SDimitry Andric #include "llvm/IR/DerivedTypes.h" 308bcb0991SDimitry Andric 
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;


/// Match a type whose total size is at most \p MaxSize bits and whose scalar
/// type (the element type for vectors) has a size that is a multiple of 32
/// bits.
static LegalityPredicate isMultiple32(unsigned TypeIdx,
                                      unsigned MaxSize = 1024) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
  };
}

/// Match a type whose total size is exactly \p Size bits.
static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    return Query.Types[TypeIdx].getSizeInBits() == Size;
  };
}

/// Match a vector with an odd number of sub-32-bit elements whose total size
/// is not a multiple of 32 bits.
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

/// Match a vector of 16-bit elements that has more than two elements.
static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    // The element count is only meaningful for vector types, so a scalar
    // (including a plain s16) can never match and must not reach
    // getNumElements().
    if (!Ty.isVector())
      return false;
    return Ty.getElementType().getSizeInBits() == 16 &&
           Ty.getNumElements() > 2;
  };
}

/// Mutation that grows the vector at \p TypeIdx by one element, keeping the
/// element type. The queried type is read with getElementType() and
/// getNumElements(), so it is expected to be a vector.
static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

/// Mutation that reduces the element count of the vector at \p TypeIdx. The
/// type is divided into ceil(size / 64) pieces and the new element count is
/// (NumElements + 1) / Pieces; the result is built with LLT::scalarOrVector.
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    // Number of 64-bit chunks needed to cover the whole type, rounded up.
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements so that the total size reaches the
// next multiple of 32 bits.
968bcb0991SDimitry Andric static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { 978bcb0991SDimitry Andric return [=](const LegalityQuery &Query) { 988bcb0991SDimitry Andric const LLT Ty = Query.Types[TypeIdx]; 998bcb0991SDimitry Andric 1008bcb0991SDimitry Andric const LLT EltTy = Ty.getElementType(); 1018bcb0991SDimitry Andric const int Size = Ty.getSizeInBits(); 1028bcb0991SDimitry Andric const int EltSize = EltTy.getSizeInBits(); 1038bcb0991SDimitry Andric const int NextMul32 = (Size + 31) / 32; 1048bcb0991SDimitry Andric 1058bcb0991SDimitry Andric assert(EltSize < 32); 1068bcb0991SDimitry Andric 1078bcb0991SDimitry Andric const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize; 1088bcb0991SDimitry Andric return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy)); 1098bcb0991SDimitry Andric }; 1108bcb0991SDimitry Andric } 1118bcb0991SDimitry Andric 1128bcb0991SDimitry Andric static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { 1138bcb0991SDimitry Andric return [=](const LegalityQuery &Query) { 1148bcb0991SDimitry Andric const LLT QueryTy = Query.Types[TypeIdx]; 1158bcb0991SDimitry Andric return QueryTy.isVector() && QueryTy.getSizeInBits() < Size; 1168bcb0991SDimitry Andric }; 1178bcb0991SDimitry Andric } 1188bcb0991SDimitry Andric 1190b57cec5SDimitry Andric static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) { 1200b57cec5SDimitry Andric return [=](const LegalityQuery &Query) { 1210b57cec5SDimitry Andric const LLT QueryTy = Query.Types[TypeIdx]; 1220b57cec5SDimitry Andric return QueryTy.isVector() && QueryTy.getSizeInBits() > Size; 1230b57cec5SDimitry Andric }; 1240b57cec5SDimitry Andric } 1250b57cec5SDimitry Andric 1260b57cec5SDimitry Andric static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { 1270b57cec5SDimitry Andric return [=](const LegalityQuery &Query) { 1280b57cec5SDimitry Andric const LLT QueryTy = Query.Types[TypeIdx]; 1290b57cec5SDimitry Andric return QueryTy.isVector() && 
QueryTy.getNumElements() % 2 != 0; 1300b57cec5SDimitry Andric }; 1310b57cec5SDimitry Andric } 1320b57cec5SDimitry Andric 1338bcb0991SDimitry Andric // Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of 1340b57cec5SDimitry Andric // v2s16. 1350b57cec5SDimitry Andric static LegalityPredicate isRegisterType(unsigned TypeIdx) { 1360b57cec5SDimitry Andric return [=](const LegalityQuery &Query) { 1370b57cec5SDimitry Andric const LLT Ty = Query.Types[TypeIdx]; 1380b57cec5SDimitry Andric if (Ty.isVector()) { 1390b57cec5SDimitry Andric const int EltSize = Ty.getElementType().getSizeInBits(); 1400b57cec5SDimitry Andric return EltSize == 32 || EltSize == 64 || 1410b57cec5SDimitry Andric (EltSize == 16 && Ty.getNumElements() % 2 == 0) || 1420b57cec5SDimitry Andric EltSize == 128 || EltSize == 256; 1430b57cec5SDimitry Andric } 1440b57cec5SDimitry Andric 1458bcb0991SDimitry Andric return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024; 1468bcb0991SDimitry Andric }; 1478bcb0991SDimitry Andric } 1488bcb0991SDimitry Andric 1498bcb0991SDimitry Andric static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) { 1508bcb0991SDimitry Andric return [=](const LegalityQuery &Query) { 1518bcb0991SDimitry Andric return Query.Types[TypeIdx].getElementType() == Type; 1528bcb0991SDimitry Andric }; 1538bcb0991SDimitry Andric } 1548bcb0991SDimitry Andric 1558bcb0991SDimitry Andric static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) { 1568bcb0991SDimitry Andric return [=](const LegalityQuery &Query) { 1578bcb0991SDimitry Andric const LLT Ty = Query.Types[TypeIdx]; 1588bcb0991SDimitry Andric return !Ty.isVector() && Ty.getSizeInBits() > 32 && 1598bcb0991SDimitry Andric Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits(); 1600b57cec5SDimitry Andric }; 1610b57cec5SDimitry Andric } 1620b57cec5SDimitry Andric 1630b57cec5SDimitry Andric AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, 1640b57cec5SDimitry Andric const 
GCNTargetMachine &TM) 1650b57cec5SDimitry Andric : ST(ST_) { 1660b57cec5SDimitry Andric using namespace TargetOpcode; 1670b57cec5SDimitry Andric 1680b57cec5SDimitry Andric auto GetAddrSpacePtr = [&TM](unsigned AS) { 1690b57cec5SDimitry Andric return LLT::pointer(AS, TM.getPointerSizeInBits(AS)); 1700b57cec5SDimitry Andric }; 1710b57cec5SDimitry Andric 1720b57cec5SDimitry Andric const LLT S1 = LLT::scalar(1); 1730b57cec5SDimitry Andric const LLT S8 = LLT::scalar(8); 1740b57cec5SDimitry Andric const LLT S16 = LLT::scalar(16); 1750b57cec5SDimitry Andric const LLT S32 = LLT::scalar(32); 1760b57cec5SDimitry Andric const LLT S64 = LLT::scalar(64); 1778bcb0991SDimitry Andric const LLT S96 = LLT::scalar(96); 1780b57cec5SDimitry Andric const LLT S128 = LLT::scalar(128); 1790b57cec5SDimitry Andric const LLT S256 = LLT::scalar(256); 1808bcb0991SDimitry Andric const LLT S1024 = LLT::scalar(1024); 1810b57cec5SDimitry Andric 1820b57cec5SDimitry Andric const LLT V2S16 = LLT::vector(2, 16); 1830b57cec5SDimitry Andric const LLT V4S16 = LLT::vector(4, 16); 1840b57cec5SDimitry Andric 1850b57cec5SDimitry Andric const LLT V2S32 = LLT::vector(2, 32); 1860b57cec5SDimitry Andric const LLT V3S32 = LLT::vector(3, 32); 1870b57cec5SDimitry Andric const LLT V4S32 = LLT::vector(4, 32); 1880b57cec5SDimitry Andric const LLT V5S32 = LLT::vector(5, 32); 1890b57cec5SDimitry Andric const LLT V6S32 = LLT::vector(6, 32); 1900b57cec5SDimitry Andric const LLT V7S32 = LLT::vector(7, 32); 1910b57cec5SDimitry Andric const LLT V8S32 = LLT::vector(8, 32); 1920b57cec5SDimitry Andric const LLT V9S32 = LLT::vector(9, 32); 1930b57cec5SDimitry Andric const LLT V10S32 = LLT::vector(10, 32); 1940b57cec5SDimitry Andric const LLT V11S32 = LLT::vector(11, 32); 1950b57cec5SDimitry Andric const LLT V12S32 = LLT::vector(12, 32); 1960b57cec5SDimitry Andric const LLT V13S32 = LLT::vector(13, 32); 1970b57cec5SDimitry Andric const LLT V14S32 = LLT::vector(14, 32); 1980b57cec5SDimitry Andric const LLT V15S32 = LLT::vector(15, 
32); 1990b57cec5SDimitry Andric const LLT V16S32 = LLT::vector(16, 32); 2008bcb0991SDimitry Andric const LLT V32S32 = LLT::vector(32, 32); 2010b57cec5SDimitry Andric 2020b57cec5SDimitry Andric const LLT V2S64 = LLT::vector(2, 64); 2030b57cec5SDimitry Andric const LLT V3S64 = LLT::vector(3, 64); 2040b57cec5SDimitry Andric const LLT V4S64 = LLT::vector(4, 64); 2050b57cec5SDimitry Andric const LLT V5S64 = LLT::vector(5, 64); 2060b57cec5SDimitry Andric const LLT V6S64 = LLT::vector(6, 64); 2070b57cec5SDimitry Andric const LLT V7S64 = LLT::vector(7, 64); 2080b57cec5SDimitry Andric const LLT V8S64 = LLT::vector(8, 64); 2098bcb0991SDimitry Andric const LLT V16S64 = LLT::vector(16, 64); 2100b57cec5SDimitry Andric 2110b57cec5SDimitry Andric std::initializer_list<LLT> AllS32Vectors = 2120b57cec5SDimitry Andric {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32, 2138bcb0991SDimitry Andric V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32}; 2140b57cec5SDimitry Andric std::initializer_list<LLT> AllS64Vectors = 2158bcb0991SDimitry Andric {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64}; 2160b57cec5SDimitry Andric 2170b57cec5SDimitry Andric const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS); 2180b57cec5SDimitry Andric const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS); 2198bcb0991SDimitry Andric const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT); 2200b57cec5SDimitry Andric const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS); 2218bcb0991SDimitry Andric const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS); 2220b57cec5SDimitry Andric const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS); 2230b57cec5SDimitry Andric const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS); 2240b57cec5SDimitry Andric 2250b57cec5SDimitry Andric const LLT CodePtr = FlatPtr; 2260b57cec5SDimitry Andric 2270b57cec5SDimitry Andric const std::initializer_list<LLT> AddrSpaces64 = { 
2280b57cec5SDimitry Andric GlobalPtr, ConstantPtr, FlatPtr 2290b57cec5SDimitry Andric }; 2300b57cec5SDimitry Andric 2310b57cec5SDimitry Andric const std::initializer_list<LLT> AddrSpaces32 = { 2328bcb0991SDimitry Andric LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr 2330b57cec5SDimitry Andric }; 2340b57cec5SDimitry Andric 2350b57cec5SDimitry Andric const std::initializer_list<LLT> FPTypesBase = { 2360b57cec5SDimitry Andric S32, S64 2370b57cec5SDimitry Andric }; 2380b57cec5SDimitry Andric 2390b57cec5SDimitry Andric const std::initializer_list<LLT> FPTypes16 = { 2400b57cec5SDimitry Andric S32, S64, S16 2410b57cec5SDimitry Andric }; 2420b57cec5SDimitry Andric 2430b57cec5SDimitry Andric const std::initializer_list<LLT> FPTypesPK16 = { 2440b57cec5SDimitry Andric S32, S64, S16, V2S16 2450b57cec5SDimitry Andric }; 2460b57cec5SDimitry Andric 247*480093f4SDimitry Andric setAction({G_BRCOND, S1}, Legal); // VCC branches 248*480093f4SDimitry Andric setAction({G_BRCOND, S32}, Legal); // SCC branches 2490b57cec5SDimitry Andric 2500b57cec5SDimitry Andric // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more 2510b57cec5SDimitry Andric // elements for v3s16 2520b57cec5SDimitry Andric getActionDefinitionsBuilder(G_PHI) 2530b57cec5SDimitry Andric .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256}) 2540b57cec5SDimitry Andric .legalFor(AllS32Vectors) 2550b57cec5SDimitry Andric .legalFor(AllS64Vectors) 2560b57cec5SDimitry Andric .legalFor(AddrSpaces64) 2570b57cec5SDimitry Andric .legalFor(AddrSpaces32) 2580b57cec5SDimitry Andric .clampScalar(0, S32, S256) 2590b57cec5SDimitry Andric .widenScalarToNextPow2(0, 32) 2600b57cec5SDimitry Andric .clampMaxNumElements(0, S32, 16) 2610b57cec5SDimitry Andric .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 2620b57cec5SDimitry Andric .legalIf(isPointer(0)); 2630b57cec5SDimitry Andric 2640b57cec5SDimitry Andric if (ST.has16BitInsts()) { 2650b57cec5SDimitry Andric getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 
2660b57cec5SDimitry Andric .legalFor({S32, S16}) 2670b57cec5SDimitry Andric .clampScalar(0, S16, S32) 2680b57cec5SDimitry Andric .scalarize(0); 2690b57cec5SDimitry Andric } else { 2700b57cec5SDimitry Andric getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) 2710b57cec5SDimitry Andric .legalFor({S32}) 2720b57cec5SDimitry Andric .clampScalar(0, S32, S32) 2730b57cec5SDimitry Andric .scalarize(0); 2740b57cec5SDimitry Andric } 2750b57cec5SDimitry Andric 276*480093f4SDimitry Andric // FIXME: Not really legal. Placeholder for custom lowering. 277*480093f4SDimitry Andric getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM}) 278*480093f4SDimitry Andric .legalFor({S32, S64}) 279*480093f4SDimitry Andric .clampScalar(0, S32, S64) 280*480093f4SDimitry Andric .widenScalarToNextPow2(0, 32) 281*480093f4SDimitry Andric .scalarize(0); 282*480093f4SDimitry Andric 2830b57cec5SDimitry Andric getActionDefinitionsBuilder({G_UMULH, G_SMULH}) 2840b57cec5SDimitry Andric .legalFor({S32}) 2850b57cec5SDimitry Andric .clampScalar(0, S32, S32) 2860b57cec5SDimitry Andric .scalarize(0); 2870b57cec5SDimitry Andric 2880b57cec5SDimitry Andric // Report legal for any types we can handle anywhere. For the cases only legal 2890b57cec5SDimitry Andric // on the SALU, RegBankSelect will be able to re-legalize. 
2900b57cec5SDimitry Andric getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) 2910b57cec5SDimitry Andric .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) 2920b57cec5SDimitry Andric .clampScalar(0, S32, S64) 2930b57cec5SDimitry Andric .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 2948bcb0991SDimitry Andric .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) 2950b57cec5SDimitry Andric .widenScalarToNextPow2(0) 2960b57cec5SDimitry Andric .scalarize(0); 2970b57cec5SDimitry Andric 2988bcb0991SDimitry Andric getActionDefinitionsBuilder({G_UADDO, G_USUBO, 2990b57cec5SDimitry Andric G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) 300*480093f4SDimitry Andric .legalFor({{S32, S1}, {S32, S32}}) 3018bcb0991SDimitry Andric .clampScalar(0, S32, S32) 3028bcb0991SDimitry Andric .scalarize(0); // TODO: Implement. 3038bcb0991SDimitry Andric 3048bcb0991SDimitry Andric getActionDefinitionsBuilder({G_SADDO, G_SSUBO}) 3058bcb0991SDimitry Andric .lower(); 3060b57cec5SDimitry Andric 3070b57cec5SDimitry Andric getActionDefinitionsBuilder(G_BITCAST) 3080b57cec5SDimitry Andric // Don't worry about the size constraint. 
3098bcb0991SDimitry Andric .legalIf(all(isRegisterType(0), isRegisterType(1))) 3108bcb0991SDimitry Andric // FIXME: Testing hack 3118bcb0991SDimitry Andric .legalForCartesianProduct({S16, LLT::vector(2, 8), }); 3120b57cec5SDimitry Andric 3130b57cec5SDimitry Andric getActionDefinitionsBuilder(G_FCONSTANT) 3140b57cec5SDimitry Andric .legalFor({S32, S64, S16}) 3150b57cec5SDimitry Andric .clampScalar(0, S16, S64); 3160b57cec5SDimitry Andric 3170b57cec5SDimitry Andric getActionDefinitionsBuilder(G_IMPLICIT_DEF) 3188bcb0991SDimitry Andric .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr, 3190b57cec5SDimitry Andric ConstantPtr, LocalPtr, FlatPtr, PrivatePtr}) 3200b57cec5SDimitry Andric .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 3218bcb0991SDimitry Andric .clampScalarOrElt(0, S32, S1024) 3220b57cec5SDimitry Andric .legalIf(isMultiple32(0)) 3230b57cec5SDimitry Andric .widenScalarToNextPow2(0, 32) 3240b57cec5SDimitry Andric .clampMaxNumElements(0, S32, 16); 3250b57cec5SDimitry Andric 3260b57cec5SDimitry Andric 3270b57cec5SDimitry Andric // FIXME: i1 operands to intrinsics should always be legal, but other i1 3280b57cec5SDimitry Andric // values may not be legal. We need to figure out how to distinguish 3290b57cec5SDimitry Andric // between these two scenarios. 
3300b57cec5SDimitry Andric getActionDefinitionsBuilder(G_CONSTANT) 3318bcb0991SDimitry Andric .legalFor({S1, S32, S64, S16, GlobalPtr, 3320b57cec5SDimitry Andric LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) 3330b57cec5SDimitry Andric .clampScalar(0, S32, S64) 3340b57cec5SDimitry Andric .widenScalarToNextPow2(0) 3350b57cec5SDimitry Andric .legalIf(isPointer(0)); 3360b57cec5SDimitry Andric 3370b57cec5SDimitry Andric setAction({G_FRAME_INDEX, PrivatePtr}, Legal); 3388bcb0991SDimitry Andric getActionDefinitionsBuilder(G_GLOBAL_VALUE) 3398bcb0991SDimitry Andric .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr}); 3408bcb0991SDimitry Andric 3410b57cec5SDimitry Andric 3420b57cec5SDimitry Andric auto &FPOpActions = getActionDefinitionsBuilder( 3438bcb0991SDimitry Andric { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) 3440b57cec5SDimitry Andric .legalFor({S32, S64}); 3458bcb0991SDimitry Andric auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS}) 3468bcb0991SDimitry Andric .customFor({S32, S64}); 3478bcb0991SDimitry Andric auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV) 3488bcb0991SDimitry Andric .customFor({S32, S64}); 3490b57cec5SDimitry Andric 3500b57cec5SDimitry Andric if (ST.has16BitInsts()) { 3510b57cec5SDimitry Andric if (ST.hasVOP3PInsts()) 3520b57cec5SDimitry Andric FPOpActions.legalFor({S16, V2S16}); 3530b57cec5SDimitry Andric else 3540b57cec5SDimitry Andric FPOpActions.legalFor({S16}); 3558bcb0991SDimitry Andric 3568bcb0991SDimitry Andric TrigActions.customFor({S16}); 3578bcb0991SDimitry Andric FDIVActions.customFor({S16}); 3580b57cec5SDimitry Andric } 3590b57cec5SDimitry Andric 3600b57cec5SDimitry Andric auto &MinNumMaxNum = getActionDefinitionsBuilder({ 3610b57cec5SDimitry Andric G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE}); 3620b57cec5SDimitry Andric 3630b57cec5SDimitry Andric if (ST.hasVOP3PInsts()) { 3640b57cec5SDimitry Andric MinNumMaxNum.customFor(FPTypesPK16) 365*480093f4SDimitry Andric 
.moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 3660b57cec5SDimitry Andric .clampMaxNumElements(0, S16, 2) 3670b57cec5SDimitry Andric .clampScalar(0, S16, S64) 3680b57cec5SDimitry Andric .scalarize(0); 3690b57cec5SDimitry Andric } else if (ST.has16BitInsts()) { 3700b57cec5SDimitry Andric MinNumMaxNum.customFor(FPTypes16) 3710b57cec5SDimitry Andric .clampScalar(0, S16, S64) 3720b57cec5SDimitry Andric .scalarize(0); 3730b57cec5SDimitry Andric } else { 3740b57cec5SDimitry Andric MinNumMaxNum.customFor(FPTypesBase) 3750b57cec5SDimitry Andric .clampScalar(0, S32, S64) 3760b57cec5SDimitry Andric .scalarize(0); 3770b57cec5SDimitry Andric } 3780b57cec5SDimitry Andric 3790b57cec5SDimitry Andric if (ST.hasVOP3PInsts()) 3800b57cec5SDimitry Andric FPOpActions.clampMaxNumElements(0, S16, 2); 3818bcb0991SDimitry Andric 3820b57cec5SDimitry Andric FPOpActions 3830b57cec5SDimitry Andric .scalarize(0) 3840b57cec5SDimitry Andric .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 3850b57cec5SDimitry Andric 3868bcb0991SDimitry Andric TrigActions 3878bcb0991SDimitry Andric .scalarize(0) 3888bcb0991SDimitry Andric .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64); 3898bcb0991SDimitry Andric 3908bcb0991SDimitry Andric FDIVActions 3918bcb0991SDimitry Andric .scalarize(0) 3928bcb0991SDimitry Andric .clampScalar(0, ST.has16BitInsts() ? 
S16 : S32, S64); 3938bcb0991SDimitry Andric 3948bcb0991SDimitry Andric getActionDefinitionsBuilder({G_FNEG, G_FABS}) 3958bcb0991SDimitry Andric .legalFor(FPTypesPK16) 3968bcb0991SDimitry Andric .clampMaxNumElements(0, S16, 2) 3978bcb0991SDimitry Andric .scalarize(0) 3988bcb0991SDimitry Andric .clampScalar(0, S16, S64); 3998bcb0991SDimitry Andric 4008bcb0991SDimitry Andric // TODO: Implement 4018bcb0991SDimitry Andric getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower(); 4028bcb0991SDimitry Andric 4030b57cec5SDimitry Andric if (ST.has16BitInsts()) { 4048bcb0991SDimitry Andric getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 4050b57cec5SDimitry Andric .legalFor({S32, S64, S16}) 4060b57cec5SDimitry Andric .scalarize(0) 4070b57cec5SDimitry Andric .clampScalar(0, S16, S64); 4080b57cec5SDimitry Andric } else { 4098bcb0991SDimitry Andric getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) 4100b57cec5SDimitry Andric .legalFor({S32, S64}) 4110b57cec5SDimitry Andric .scalarize(0) 4120b57cec5SDimitry Andric .clampScalar(0, S32, S64); 4130b57cec5SDimitry Andric } 4140b57cec5SDimitry Andric 4150b57cec5SDimitry Andric getActionDefinitionsBuilder(G_FPTRUNC) 4160b57cec5SDimitry Andric .legalFor({{S32, S64}, {S16, S32}}) 4170b57cec5SDimitry Andric .scalarize(0); 4180b57cec5SDimitry Andric 4190b57cec5SDimitry Andric getActionDefinitionsBuilder(G_FPEXT) 4200b57cec5SDimitry Andric .legalFor({{S64, S32}, {S32, S16}}) 4210b57cec5SDimitry Andric .lowerFor({{S64, S16}}) // FIXME: Implement 4220b57cec5SDimitry Andric .scalarize(0); 4230b57cec5SDimitry Andric 4240b57cec5SDimitry Andric // TODO: Verify V_BFI_B32 is generated from expanded bit ops. 
4250b57cec5SDimitry Andric getActionDefinitionsBuilder(G_FCOPYSIGN).lower(); 4260b57cec5SDimitry Andric 4270b57cec5SDimitry Andric getActionDefinitionsBuilder(G_FSUB) 4280b57cec5SDimitry Andric // Use actual fsub instruction 4290b57cec5SDimitry Andric .legalFor({S32}) 4300b57cec5SDimitry Andric // Must use fadd + fneg 4310b57cec5SDimitry Andric .lowerFor({S64, S16, V2S16}) 4320b57cec5SDimitry Andric .scalarize(0) 4330b57cec5SDimitry Andric .clampScalar(0, S32, S64); 4340b57cec5SDimitry Andric 4358bcb0991SDimitry Andric // Whether this is legal depends on the floating point mode for the function. 4368bcb0991SDimitry Andric auto &FMad = getActionDefinitionsBuilder(G_FMAD); 4378bcb0991SDimitry Andric if (ST.hasMadF16()) 4388bcb0991SDimitry Andric FMad.customFor({S32, S16}); 4398bcb0991SDimitry Andric else 4408bcb0991SDimitry Andric FMad.customFor({S32}); 4418bcb0991SDimitry Andric FMad.scalarize(0) 4428bcb0991SDimitry Andric .lower(); 4438bcb0991SDimitry Andric 4440b57cec5SDimitry Andric getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) 4450b57cec5SDimitry Andric .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, 4460b57cec5SDimitry Andric {S32, S1}, {S64, S1}, {S16, S1}, 4478bcb0991SDimitry Andric {S96, S32}, 4480b57cec5SDimitry Andric // FIXME: Hack 4490b57cec5SDimitry Andric {S64, LLT::scalar(33)}, 450*480093f4SDimitry Andric {S32, S8}, {S32, LLT::scalar(24)}}) 451*480093f4SDimitry Andric .scalarize(0) 452*480093f4SDimitry Andric .clampScalar(0, S32, S64); 4530b57cec5SDimitry Andric 4548bcb0991SDimitry Andric // TODO: Split s1->s64 during regbankselect for VALU. 
4558bcb0991SDimitry Andric auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) 456*480093f4SDimitry Andric .legalFor({{S32, S32}, {S64, S32}, {S16, S32}}) 4570b57cec5SDimitry Andric .lowerFor({{S32, S64}}) 458*480093f4SDimitry Andric .lowerIf(typeIs(1, S1)) 4598bcb0991SDimitry Andric .customFor({{S64, S64}}); 4608bcb0991SDimitry Andric if (ST.has16BitInsts()) 4618bcb0991SDimitry Andric IToFP.legalFor({{S16, S16}}); 4628bcb0991SDimitry Andric IToFP.clampScalar(1, S32, S64) 4630b57cec5SDimitry Andric .scalarize(0); 4640b57cec5SDimitry Andric 4658bcb0991SDimitry Andric auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) 4668bcb0991SDimitry Andric .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}); 4678bcb0991SDimitry Andric if (ST.has16BitInsts()) 4688bcb0991SDimitry Andric FPToI.legalFor({{S16, S16}}); 4698bcb0991SDimitry Andric else 4708bcb0991SDimitry Andric FPToI.minScalar(1, S32); 4718bcb0991SDimitry Andric 4728bcb0991SDimitry Andric FPToI.minScalar(0, S32) 4730b57cec5SDimitry Andric .scalarize(0); 4740b57cec5SDimitry Andric 4750b57cec5SDimitry Andric getActionDefinitionsBuilder(G_INTRINSIC_ROUND) 476*480093f4SDimitry Andric .scalarize(0) 477*480093f4SDimitry Andric .lower(); 4780b57cec5SDimitry Andric 479*480093f4SDimitry Andric if (ST.has16BitInsts()) { 480*480093f4SDimitry Andric getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 481*480093f4SDimitry Andric .legalFor({S16, S32, S64}) 482*480093f4SDimitry Andric .clampScalar(0, S16, S64) 483*480093f4SDimitry Andric .scalarize(0); 484*480093f4SDimitry Andric } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) { 4850b57cec5SDimitry Andric getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT}) 4860b57cec5SDimitry Andric .legalFor({S32, S64}) 4870b57cec5SDimitry Andric .clampScalar(0, S32, S64) 4880b57cec5SDimitry Andric .scalarize(0); 4890b57cec5SDimitry Andric } else { 4900b57cec5SDimitry Andric getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, 
G_FRINT}) 4910b57cec5SDimitry Andric .legalFor({S32}) 4920b57cec5SDimitry Andric .customFor({S64}) 4930b57cec5SDimitry Andric .clampScalar(0, S32, S64) 4940b57cec5SDimitry Andric .scalarize(0); 4950b57cec5SDimitry Andric } 4960b57cec5SDimitry Andric 497*480093f4SDimitry Andric getActionDefinitionsBuilder(G_PTR_ADD) 4980b57cec5SDimitry Andric .legalForCartesianProduct(AddrSpaces64, {S64}) 4990b57cec5SDimitry Andric .legalForCartesianProduct(AddrSpaces32, {S32}) 5000b57cec5SDimitry Andric .scalarize(0); 5010b57cec5SDimitry Andric 5028bcb0991SDimitry Andric getActionDefinitionsBuilder(G_PTR_MASK) 5038bcb0991SDimitry Andric .scalarize(0) 5048bcb0991SDimitry Andric .alwaysLegal(); 5058bcb0991SDimitry Andric 5060b57cec5SDimitry Andric setAction({G_BLOCK_ADDR, CodePtr}, Legal); 5070b57cec5SDimitry Andric 5080b57cec5SDimitry Andric auto &CmpBuilder = 5090b57cec5SDimitry Andric getActionDefinitionsBuilder(G_ICMP) 510*480093f4SDimitry Andric // The compare output type differs based on the register bank of the output, 511*480093f4SDimitry Andric // so make both s1 and s32 legal. 512*480093f4SDimitry Andric // 513*480093f4SDimitry Andric // Scalar compares producing output in scc will be promoted to s32, as that 514*480093f4SDimitry Andric // is the allocatable register type that will be needed for the copy from 515*480093f4SDimitry Andric // scc. This will be promoted during RegBankSelect, and we assume something 516*480093f4SDimitry Andric // before that won't try to use s32 result types. 517*480093f4SDimitry Andric // 518*480093f4SDimitry Andric // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg 519*480093f4SDimitry Andric // bank. 
5200b57cec5SDimitry Andric .legalForCartesianProduct( 5210b57cec5SDimitry Andric {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}) 522*480093f4SDimitry Andric .legalForCartesianProduct( 523*480093f4SDimitry Andric {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr}); 5240b57cec5SDimitry Andric if (ST.has16BitInsts()) { 5250b57cec5SDimitry Andric CmpBuilder.legalFor({{S1, S16}}); 5260b57cec5SDimitry Andric } 5270b57cec5SDimitry Andric 5280b57cec5SDimitry Andric CmpBuilder 5290b57cec5SDimitry Andric .widenScalarToNextPow2(1) 5300b57cec5SDimitry Andric .clampScalar(1, S32, S64) 5310b57cec5SDimitry Andric .scalarize(0) 532*480093f4SDimitry Andric .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1))); 5330b57cec5SDimitry Andric 5340b57cec5SDimitry Andric getActionDefinitionsBuilder(G_FCMP) 5350b57cec5SDimitry Andric .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase) 5360b57cec5SDimitry Andric .widenScalarToNextPow2(1) 5370b57cec5SDimitry Andric .clampScalar(1, S32, S64) 5380b57cec5SDimitry Andric .scalarize(0); 5390b57cec5SDimitry Andric 5400b57cec5SDimitry Andric // FIXME: fexp, flog2, flog10 needs to be custom lowered. 5410b57cec5SDimitry Andric getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2, 5420b57cec5SDimitry Andric G_FLOG, G_FLOG2, G_FLOG10}) 5430b57cec5SDimitry Andric .legalFor({S32}) 5440b57cec5SDimitry Andric .scalarize(0); 5450b57cec5SDimitry Andric 5460b57cec5SDimitry Andric // The 64-bit versions produce 32-bit results, but only on the SALU. 
5470b57cec5SDimitry Andric getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF, 5480b57cec5SDimitry Andric G_CTTZ, G_CTTZ_ZERO_UNDEF, 5490b57cec5SDimitry Andric G_CTPOP}) 5500b57cec5SDimitry Andric .legalFor({{S32, S32}, {S32, S64}}) 5510b57cec5SDimitry Andric .clampScalar(0, S32, S32) 5520b57cec5SDimitry Andric .clampScalar(1, S32, S64) 5530b57cec5SDimitry Andric .scalarize(0) 5540b57cec5SDimitry Andric .widenScalarToNextPow2(0, 32) 5550b57cec5SDimitry Andric .widenScalarToNextPow2(1, 32); 5560b57cec5SDimitry Andric 5570b57cec5SDimitry Andric // TODO: Expand for > s32 5588bcb0991SDimitry Andric getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE}) 5590b57cec5SDimitry Andric .legalFor({S32}) 5600b57cec5SDimitry Andric .clampScalar(0, S32, S32) 5610b57cec5SDimitry Andric .scalarize(0); 5620b57cec5SDimitry Andric 5630b57cec5SDimitry Andric if (ST.has16BitInsts()) { 5640b57cec5SDimitry Andric if (ST.hasVOP3PInsts()) { 5650b57cec5SDimitry Andric getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 5660b57cec5SDimitry Andric .legalFor({S32, S16, V2S16}) 5670b57cec5SDimitry Andric .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 5680b57cec5SDimitry Andric .clampMaxNumElements(0, S16, 2) 5690b57cec5SDimitry Andric .clampScalar(0, S16, S32) 5700b57cec5SDimitry Andric .widenScalarToNextPow2(0) 5710b57cec5SDimitry Andric .scalarize(0); 5720b57cec5SDimitry Andric } else { 5730b57cec5SDimitry Andric getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 5740b57cec5SDimitry Andric .legalFor({S32, S16}) 5750b57cec5SDimitry Andric .widenScalarToNextPow2(0) 5760b57cec5SDimitry Andric .clampScalar(0, S16, S32) 5770b57cec5SDimitry Andric .scalarize(0); 5780b57cec5SDimitry Andric } 5790b57cec5SDimitry Andric } else { 5800b57cec5SDimitry Andric getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) 5810b57cec5SDimitry Andric .legalFor({S32}) 5820b57cec5SDimitry Andric .clampScalar(0, S32, S32) 5830b57cec5SDimitry Andric .widenScalarToNextPow2(0) 
5840b57cec5SDimitry Andric .scalarize(0); 5850b57cec5SDimitry Andric } 5860b57cec5SDimitry Andric 5870b57cec5SDimitry Andric auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) { 5880b57cec5SDimitry Andric return [=](const LegalityQuery &Query) { 5890b57cec5SDimitry Andric return Query.Types[TypeIdx0].getSizeInBits() < 5900b57cec5SDimitry Andric Query.Types[TypeIdx1].getSizeInBits(); 5910b57cec5SDimitry Andric }; 5920b57cec5SDimitry Andric }; 5930b57cec5SDimitry Andric 5940b57cec5SDimitry Andric auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) { 5950b57cec5SDimitry Andric return [=](const LegalityQuery &Query) { 5960b57cec5SDimitry Andric return Query.Types[TypeIdx0].getSizeInBits() > 5970b57cec5SDimitry Andric Query.Types[TypeIdx1].getSizeInBits(); 5980b57cec5SDimitry Andric }; 5990b57cec5SDimitry Andric }; 6000b57cec5SDimitry Andric 6010b57cec5SDimitry Andric getActionDefinitionsBuilder(G_INTTOPTR) 6020b57cec5SDimitry Andric // List the common cases 6030b57cec5SDimitry Andric .legalForCartesianProduct(AddrSpaces64, {S64}) 6040b57cec5SDimitry Andric .legalForCartesianProduct(AddrSpaces32, {S32}) 6050b57cec5SDimitry Andric .scalarize(0) 6060b57cec5SDimitry Andric // Accept any address space as long as the size matches 6070b57cec5SDimitry Andric .legalIf(sameSize(0, 1)) 6080b57cec5SDimitry Andric .widenScalarIf(smallerThan(1, 0), 6090b57cec5SDimitry Andric [](const LegalityQuery &Query) { 6100b57cec5SDimitry Andric return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 6110b57cec5SDimitry Andric }) 6120b57cec5SDimitry Andric .narrowScalarIf(greaterThan(1, 0), 6130b57cec5SDimitry Andric [](const LegalityQuery &Query) { 6140b57cec5SDimitry Andric return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); 6150b57cec5SDimitry Andric }); 6160b57cec5SDimitry Andric 6170b57cec5SDimitry Andric getActionDefinitionsBuilder(G_PTRTOINT) 6180b57cec5SDimitry Andric // List the common cases 6190b57cec5SDimitry Andric 
.legalForCartesianProduct(AddrSpaces64, {S64}) 6200b57cec5SDimitry Andric .legalForCartesianProduct(AddrSpaces32, {S32}) 6210b57cec5SDimitry Andric .scalarize(0) 6220b57cec5SDimitry Andric // Accept any address space as long as the size matches 6230b57cec5SDimitry Andric .legalIf(sameSize(0, 1)) 6240b57cec5SDimitry Andric .widenScalarIf(smallerThan(0, 1), 6250b57cec5SDimitry Andric [](const LegalityQuery &Query) { 6260b57cec5SDimitry Andric return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 6270b57cec5SDimitry Andric }) 6280b57cec5SDimitry Andric .narrowScalarIf( 6290b57cec5SDimitry Andric greaterThan(0, 1), 6300b57cec5SDimitry Andric [](const LegalityQuery &Query) { 6310b57cec5SDimitry Andric return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); 6320b57cec5SDimitry Andric }); 6330b57cec5SDimitry Andric 6340b57cec5SDimitry Andric getActionDefinitionsBuilder(G_ADDRSPACE_CAST) 6350b57cec5SDimitry Andric .scalarize(0) 6360b57cec5SDimitry Andric .custom(); 6370b57cec5SDimitry Andric 6380b57cec5SDimitry Andric // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we 6390b57cec5SDimitry Andric // handle some operations by just promoting the register during 6400b57cec5SDimitry Andric // selection. There are also d16 loads on GFX9+ which preserve the high bits. 6418bcb0991SDimitry Andric auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned { 6428bcb0991SDimitry Andric switch (AS) { 6438bcb0991SDimitry Andric // FIXME: Private element size. 6448bcb0991SDimitry Andric case AMDGPUAS::PRIVATE_ADDRESS: 6458bcb0991SDimitry Andric return 32; 6468bcb0991SDimitry Andric // FIXME: Check subtarget 6478bcb0991SDimitry Andric case AMDGPUAS::LOCAL_ADDRESS: 6488bcb0991SDimitry Andric return ST.useDS128() ? 128 : 64; 6490b57cec5SDimitry Andric 6508bcb0991SDimitry Andric // Treat constant and global as identical. 
SMRD loads are sometimes usable 6518bcb0991SDimitry Andric // for global loads (ideally constant address space should be eliminated) 6528bcb0991SDimitry Andric // depending on the context. Legality cannot be context dependent, but 6538bcb0991SDimitry Andric // RegBankSelect can split the load as necessary depending on the pointer 6548bcb0991SDimitry Andric // register bank/uniformity and if the memory is invariant or not written in 6558bcb0991SDimitry Andric // a kernel. 6568bcb0991SDimitry Andric case AMDGPUAS::CONSTANT_ADDRESS: 6578bcb0991SDimitry Andric case AMDGPUAS::GLOBAL_ADDRESS: 6588bcb0991SDimitry Andric return 512; 6598bcb0991SDimitry Andric default: 6608bcb0991SDimitry Andric return 128; 6618bcb0991SDimitry Andric } 6628bcb0991SDimitry Andric }; 6638bcb0991SDimitry Andric 6648bcb0991SDimitry Andric const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool { 6658bcb0991SDimitry Andric const LLT DstTy = Query.Types[0]; 6668bcb0991SDimitry Andric 6678bcb0991SDimitry Andric // Split vector extloads. 6688bcb0991SDimitry Andric unsigned MemSize = Query.MMODescrs[0].SizeInBits; 669*480093f4SDimitry Andric unsigned Align = Query.MMODescrs[0].AlignInBits; 670*480093f4SDimitry Andric 671*480093f4SDimitry Andric if (MemSize < DstTy.getSizeInBits()) 672*480093f4SDimitry Andric MemSize = std::max(MemSize, Align); 673*480093f4SDimitry Andric 6748bcb0991SDimitry Andric if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) 6758bcb0991SDimitry Andric return true; 6768bcb0991SDimitry Andric 6778bcb0991SDimitry Andric const LLT PtrTy = Query.Types[1]; 6788bcb0991SDimitry Andric unsigned AS = PtrTy.getAddressSpace(); 6798bcb0991SDimitry Andric if (MemSize > maxSizeForAddrSpace(AS)) 6808bcb0991SDimitry Andric return true; 6818bcb0991SDimitry Andric 6828bcb0991SDimitry Andric // Catch weird sized loads that don't evenly divide into the access sizes 6838bcb0991SDimitry Andric // TODO: May be able to widen depending on alignment etc. 
6848bcb0991SDimitry Andric unsigned NumRegs = MemSize / 32; 6858bcb0991SDimitry Andric if (NumRegs == 3 && !ST.hasDwordx3LoadStores()) 6868bcb0991SDimitry Andric return true; 6878bcb0991SDimitry Andric 6888bcb0991SDimitry Andric if (Align < MemSize) { 6898bcb0991SDimitry Andric const SITargetLowering *TLI = ST.getTargetLowering(); 6908bcb0991SDimitry Andric return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8); 6918bcb0991SDimitry Andric } 6928bcb0991SDimitry Andric 6938bcb0991SDimitry Andric return false; 6948bcb0991SDimitry Andric }; 6958bcb0991SDimitry Andric 6968bcb0991SDimitry Andric unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32; 6978bcb0991SDimitry Andric unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16; 6988bcb0991SDimitry Andric unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8; 6998bcb0991SDimitry Andric 7008bcb0991SDimitry Andric // TODO: Refine based on subtargets which support unaligned access or 128-bit 7018bcb0991SDimitry Andric // LDS 7028bcb0991SDimitry Andric // TODO: Unsupported flat for SI. 7038bcb0991SDimitry Andric 7048bcb0991SDimitry Andric for (unsigned Op : {G_LOAD, G_STORE}) { 7058bcb0991SDimitry Andric const bool IsStore = Op == G_STORE; 7068bcb0991SDimitry Andric 7078bcb0991SDimitry Andric auto &Actions = getActionDefinitionsBuilder(Op); 7088bcb0991SDimitry Andric // Whitelist the common cases. 
7098bcb0991SDimitry Andric // TODO: Pointer loads 7108bcb0991SDimitry Andric // TODO: Wide constant loads 7118bcb0991SDimitry Andric // TODO: Only CI+ has 3x loads 7128bcb0991SDimitry Andric // TODO: Loads to s16 on gfx9 7138bcb0991SDimitry Andric Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, 7148bcb0991SDimitry Andric {V2S32, GlobalPtr, 64, GlobalAlign32}, 7158bcb0991SDimitry Andric {V3S32, GlobalPtr, 96, GlobalAlign32}, 7168bcb0991SDimitry Andric {S96, GlobalPtr, 96, GlobalAlign32}, 7178bcb0991SDimitry Andric {V4S32, GlobalPtr, 128, GlobalAlign32}, 7188bcb0991SDimitry Andric {S128, GlobalPtr, 128, GlobalAlign32}, 7198bcb0991SDimitry Andric {S64, GlobalPtr, 64, GlobalAlign32}, 7208bcb0991SDimitry Andric {V2S64, GlobalPtr, 128, GlobalAlign32}, 7218bcb0991SDimitry Andric {V2S16, GlobalPtr, 32, GlobalAlign32}, 7228bcb0991SDimitry Andric {S32, GlobalPtr, 8, GlobalAlign8}, 7238bcb0991SDimitry Andric {S32, GlobalPtr, 16, GlobalAlign16}, 7248bcb0991SDimitry Andric 7258bcb0991SDimitry Andric {S32, LocalPtr, 32, 32}, 7268bcb0991SDimitry Andric {S64, LocalPtr, 64, 32}, 7278bcb0991SDimitry Andric {V2S32, LocalPtr, 64, 32}, 7288bcb0991SDimitry Andric {S32, LocalPtr, 8, 8}, 7298bcb0991SDimitry Andric {S32, LocalPtr, 16, 16}, 7308bcb0991SDimitry Andric {V2S16, LocalPtr, 32, 32}, 7318bcb0991SDimitry Andric 7328bcb0991SDimitry Andric {S32, PrivatePtr, 32, 32}, 7338bcb0991SDimitry Andric {S32, PrivatePtr, 8, 8}, 7348bcb0991SDimitry Andric {S32, PrivatePtr, 16, 16}, 7358bcb0991SDimitry Andric {V2S16, PrivatePtr, 32, 32}, 7368bcb0991SDimitry Andric 7378bcb0991SDimitry Andric {S32, FlatPtr, 32, GlobalAlign32}, 7388bcb0991SDimitry Andric {S32, FlatPtr, 16, GlobalAlign16}, 7398bcb0991SDimitry Andric {S32, FlatPtr, 8, GlobalAlign8}, 7408bcb0991SDimitry Andric {V2S16, FlatPtr, 32, GlobalAlign32}, 7418bcb0991SDimitry Andric 7428bcb0991SDimitry Andric {S32, ConstantPtr, 32, GlobalAlign32}, 7438bcb0991SDimitry Andric {V2S32, ConstantPtr, 64, GlobalAlign32}, 
7448bcb0991SDimitry Andric {V3S32, ConstantPtr, 96, GlobalAlign32}, 7458bcb0991SDimitry Andric {V4S32, ConstantPtr, 128, GlobalAlign32}, 7468bcb0991SDimitry Andric {S64, ConstantPtr, 64, GlobalAlign32}, 7478bcb0991SDimitry Andric {S128, ConstantPtr, 128, GlobalAlign32}, 7488bcb0991SDimitry Andric {V2S32, ConstantPtr, 32, GlobalAlign32}}); 7498bcb0991SDimitry Andric Actions 7508bcb0991SDimitry Andric .customIf(typeIs(1, Constant32Ptr)) 7518bcb0991SDimitry Andric .narrowScalarIf( 7528bcb0991SDimitry Andric [=](const LegalityQuery &Query) -> bool { 7538bcb0991SDimitry Andric return !Query.Types[0].isVector() && needToSplitLoad(Query); 7548bcb0991SDimitry Andric }, 7558bcb0991SDimitry Andric [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 7568bcb0991SDimitry Andric const LLT DstTy = Query.Types[0]; 7578bcb0991SDimitry Andric const LLT PtrTy = Query.Types[1]; 7588bcb0991SDimitry Andric 7598bcb0991SDimitry Andric const unsigned DstSize = DstTy.getSizeInBits(); 7608bcb0991SDimitry Andric unsigned MemSize = Query.MMODescrs[0].SizeInBits; 7618bcb0991SDimitry Andric 7628bcb0991SDimitry Andric // Split extloads. 7638bcb0991SDimitry Andric if (DstSize > MemSize) 7648bcb0991SDimitry Andric return std::make_pair(0, LLT::scalar(MemSize)); 7658bcb0991SDimitry Andric 7668bcb0991SDimitry Andric if (DstSize > 32 && (DstSize % 32 != 0)) { 7678bcb0991SDimitry Andric // FIXME: Need a way to specify non-extload of larger size if 7688bcb0991SDimitry Andric // suitably aligned. 
7698bcb0991SDimitry Andric return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 7708bcb0991SDimitry Andric } 7718bcb0991SDimitry Andric 7728bcb0991SDimitry Andric unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace()); 7738bcb0991SDimitry Andric if (MemSize > MaxSize) 7748bcb0991SDimitry Andric return std::make_pair(0, LLT::scalar(MaxSize)); 7758bcb0991SDimitry Andric 7768bcb0991SDimitry Andric unsigned Align = Query.MMODescrs[0].AlignInBits; 7778bcb0991SDimitry Andric return std::make_pair(0, LLT::scalar(Align)); 7788bcb0991SDimitry Andric }) 7798bcb0991SDimitry Andric .fewerElementsIf( 7808bcb0991SDimitry Andric [=](const LegalityQuery &Query) -> bool { 7818bcb0991SDimitry Andric return Query.Types[0].isVector() && needToSplitLoad(Query); 7828bcb0991SDimitry Andric }, 7838bcb0991SDimitry Andric [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 7848bcb0991SDimitry Andric const LLT DstTy = Query.Types[0]; 7858bcb0991SDimitry Andric const LLT PtrTy = Query.Types[1]; 7868bcb0991SDimitry Andric 7878bcb0991SDimitry Andric LLT EltTy = DstTy.getElementType(); 7888bcb0991SDimitry Andric unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace()); 7898bcb0991SDimitry Andric 7908bcb0991SDimitry Andric // Split if it's too large for the address space. 7918bcb0991SDimitry Andric if (Query.MMODescrs[0].SizeInBits > MaxSize) { 7928bcb0991SDimitry Andric unsigned NumElts = DstTy.getNumElements(); 7938bcb0991SDimitry Andric unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 7948bcb0991SDimitry Andric 7958bcb0991SDimitry Andric // FIXME: Refine when odd breakdowns handled 7968bcb0991SDimitry Andric // The scalars will need to be re-legalized. 
7978bcb0991SDimitry Andric if (NumPieces == 1 || NumPieces >= NumElts || 7988bcb0991SDimitry Andric NumElts % NumPieces != 0) 7998bcb0991SDimitry Andric return std::make_pair(0, EltTy); 8008bcb0991SDimitry Andric 8018bcb0991SDimitry Andric return std::make_pair(0, 8028bcb0991SDimitry Andric LLT::vector(NumElts / NumPieces, EltTy)); 8038bcb0991SDimitry Andric } 8048bcb0991SDimitry Andric 8058bcb0991SDimitry Andric // Need to split because of alignment. 8068bcb0991SDimitry Andric unsigned Align = Query.MMODescrs[0].AlignInBits; 8078bcb0991SDimitry Andric unsigned EltSize = EltTy.getSizeInBits(); 8088bcb0991SDimitry Andric if (EltSize > Align && 8098bcb0991SDimitry Andric (EltSize / Align < DstTy.getNumElements())) { 8108bcb0991SDimitry Andric return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 8118bcb0991SDimitry Andric } 8128bcb0991SDimitry Andric 8138bcb0991SDimitry Andric // May need relegalization for the scalars. 8148bcb0991SDimitry Andric return std::make_pair(0, EltTy); 8158bcb0991SDimitry Andric }) 8168bcb0991SDimitry Andric .minScalar(0, S32); 8178bcb0991SDimitry Andric 8188bcb0991SDimitry Andric if (IsStore) 8198bcb0991SDimitry Andric Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); 8208bcb0991SDimitry Andric 8218bcb0991SDimitry Andric // TODO: Need a bitcast lower option? 8228bcb0991SDimitry Andric Actions 8238bcb0991SDimitry Andric .legalIf([=](const LegalityQuery &Query) { 8248bcb0991SDimitry Andric const LLT Ty0 = Query.Types[0]; 8250b57cec5SDimitry Andric unsigned Size = Ty0.getSizeInBits(); 8260b57cec5SDimitry Andric unsigned MemSize = Query.MMODescrs[0].SizeInBits; 8278bcb0991SDimitry Andric unsigned Align = Query.MMODescrs[0].AlignInBits; 8288bcb0991SDimitry Andric 8298bcb0991SDimitry Andric // FIXME: Widening store from alignment not valid. 
8308bcb0991SDimitry Andric if (MemSize < Size) 8318bcb0991SDimitry Andric MemSize = std::max(MemSize, Align); 8320b57cec5SDimitry Andric 833*480093f4SDimitry Andric // No extending vector loads. 834*480093f4SDimitry Andric if (Size > MemSize && Ty0.isVector()) 835*480093f4SDimitry Andric return false; 836*480093f4SDimitry Andric 8370b57cec5SDimitry Andric switch (MemSize) { 8380b57cec5SDimitry Andric case 8: 8390b57cec5SDimitry Andric case 16: 8400b57cec5SDimitry Andric return Size == 32; 8410b57cec5SDimitry Andric case 32: 8420b57cec5SDimitry Andric case 64: 8430b57cec5SDimitry Andric case 128: 8440b57cec5SDimitry Andric return true; 8450b57cec5SDimitry Andric case 96: 8460b57cec5SDimitry Andric return ST.hasDwordx3LoadStores(); 8470b57cec5SDimitry Andric case 256: 8480b57cec5SDimitry Andric case 512: 8498bcb0991SDimitry Andric return true; 8500b57cec5SDimitry Andric default: 8510b57cec5SDimitry Andric return false; 8520b57cec5SDimitry Andric } 8530b57cec5SDimitry Andric }) 8548bcb0991SDimitry Andric .widenScalarToNextPow2(0) 8558bcb0991SDimitry Andric // TODO: v3s32->v4s32 with alignment 8568bcb0991SDimitry Andric .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)); 8578bcb0991SDimitry Andric } 8580b57cec5SDimitry Andric 8590b57cec5SDimitry Andric auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 8608bcb0991SDimitry Andric .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 8618bcb0991SDimitry Andric {S32, GlobalPtr, 16, 2 * 8}, 8620b57cec5SDimitry Andric {S32, LocalPtr, 8, 8}, 8638bcb0991SDimitry Andric {S32, LocalPtr, 16, 16}, 8640b57cec5SDimitry Andric {S32, PrivatePtr, 8, 8}, 8658bcb0991SDimitry Andric {S32, PrivatePtr, 16, 16}, 8668bcb0991SDimitry Andric {S32, ConstantPtr, 8, 8}, 8678bcb0991SDimitry Andric {S32, ConstantPtr, 16, 2 * 8}}); 8680b57cec5SDimitry Andric if (ST.hasFlatAddressSpace()) { 8698bcb0991SDimitry Andric ExtLoads.legalForTypesWithMemDesc( 8708bcb0991SDimitry Andric {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 
16}}); 8710b57cec5SDimitry Andric } 8720b57cec5SDimitry Andric 8730b57cec5SDimitry Andric ExtLoads.clampScalar(0, S32, S32) 8740b57cec5SDimitry Andric .widenScalarToNextPow2(0) 8750b57cec5SDimitry Andric .unsupportedIfMemSizeNotPow2() 8760b57cec5SDimitry Andric .lower(); 8770b57cec5SDimitry Andric 8780b57cec5SDimitry Andric auto &Atomics = getActionDefinitionsBuilder( 8790b57cec5SDimitry Andric {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 8800b57cec5SDimitry Andric G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 8810b57cec5SDimitry Andric G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 882*480093f4SDimitry Andric G_ATOMICRMW_UMIN}) 8830b57cec5SDimitry Andric .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 8840b57cec5SDimitry Andric {S64, GlobalPtr}, {S64, LocalPtr}}); 8850b57cec5SDimitry Andric if (ST.hasFlatAddressSpace()) { 8860b57cec5SDimitry Andric Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 8870b57cec5SDimitry Andric } 8880b57cec5SDimitry Andric 8898bcb0991SDimitry Andric getActionDefinitionsBuilder(G_ATOMICRMW_FADD) 8908bcb0991SDimitry Andric .legalFor({{S32, LocalPtr}}); 8918bcb0991SDimitry Andric 892*480093f4SDimitry Andric // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 893*480093f4SDimitry Andric // demarshalling 894*480093f4SDimitry Andric getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 895*480093f4SDimitry Andric .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 896*480093f4SDimitry Andric {S32, FlatPtr}, {S64, FlatPtr}}) 897*480093f4SDimitry Andric .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 898*480093f4SDimitry Andric {S32, RegionPtr}, {S64, RegionPtr}}); 899*480093f4SDimitry Andric 9008bcb0991SDimitry Andric getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS) 9018bcb0991SDimitry Andric .lower(); 9028bcb0991SDimitry Andric 9030b57cec5SDimitry Andric // TODO: Pointer types, any 32-bit or 64-bit vector 904*480093f4SDimitry Andric 905*480093f4SDimitry Andric // Condition should be s32 for scalar, s1 
for vector. 9060b57cec5SDimitry Andric getActionDefinitionsBuilder(G_SELECT) 9070b57cec5SDimitry Andric .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, 9080b57cec5SDimitry Andric GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, 909*480093f4SDimitry Andric LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) 9100b57cec5SDimitry Andric .clampScalar(0, S16, S64) 9110b57cec5SDimitry Andric .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 9120b57cec5SDimitry Andric .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 9130b57cec5SDimitry Andric .scalarize(1) 9140b57cec5SDimitry Andric .clampMaxNumElements(0, S32, 2) 9150b57cec5SDimitry Andric .clampMaxNumElements(0, LocalPtr, 2) 9160b57cec5SDimitry Andric .clampMaxNumElements(0, PrivatePtr, 2) 9170b57cec5SDimitry Andric .scalarize(0) 9180b57cec5SDimitry Andric .widenScalarToNextPow2(0) 919*480093f4SDimitry Andric .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 9200b57cec5SDimitry Andric 9210b57cec5SDimitry Andric // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 9220b57cec5SDimitry Andric // be more flexible with the shift amount type. 
9230b57cec5SDimitry Andric auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 9240b57cec5SDimitry Andric .legalFor({{S32, S32}, {S64, S32}}); 9250b57cec5SDimitry Andric if (ST.has16BitInsts()) { 9260b57cec5SDimitry Andric if (ST.hasVOP3PInsts()) { 9270b57cec5SDimitry Andric Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}}) 9280b57cec5SDimitry Andric .clampMaxNumElements(0, S16, 2); 9290b57cec5SDimitry Andric } else 9300b57cec5SDimitry Andric Shifts.legalFor({{S16, S32}, {S16, S16}}); 9310b57cec5SDimitry Andric 932*480093f4SDimitry Andric // TODO: Support 16-bit shift amounts 933*480093f4SDimitry Andric Shifts.clampScalar(1, S32, S32); 9340b57cec5SDimitry Andric Shifts.clampScalar(0, S16, S64); 9350b57cec5SDimitry Andric Shifts.widenScalarToNextPow2(0, 16); 9360b57cec5SDimitry Andric } else { 9370b57cec5SDimitry Andric // Make sure we legalize the shift amount type first, as the general 9380b57cec5SDimitry Andric // expansion for the shifted type will produce much worse code if it hasn't 9390b57cec5SDimitry Andric // been truncated already. 9400b57cec5SDimitry Andric Shifts.clampScalar(1, S32, S32); 9410b57cec5SDimitry Andric Shifts.clampScalar(0, S32, S64); 9420b57cec5SDimitry Andric Shifts.widenScalarToNextPow2(0, 32); 9430b57cec5SDimitry Andric } 9440b57cec5SDimitry Andric Shifts.scalarize(0); 9450b57cec5SDimitry Andric 9460b57cec5SDimitry Andric for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 9470b57cec5SDimitry Andric unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 9480b57cec5SDimitry Andric unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 
0 : 1; 9490b57cec5SDimitry Andric unsigned IdxTypeIdx = 2; 9500b57cec5SDimitry Andric 9510b57cec5SDimitry Andric getActionDefinitionsBuilder(Op) 9520b57cec5SDimitry Andric .customIf([=](const LegalityQuery &Query) { 9530b57cec5SDimitry Andric const LLT EltTy = Query.Types[EltTypeIdx]; 9540b57cec5SDimitry Andric const LLT VecTy = Query.Types[VecTypeIdx]; 9550b57cec5SDimitry Andric const LLT IdxTy = Query.Types[IdxTypeIdx]; 9560b57cec5SDimitry Andric return (EltTy.getSizeInBits() == 16 || 9570b57cec5SDimitry Andric EltTy.getSizeInBits() % 32 == 0) && 9580b57cec5SDimitry Andric VecTy.getSizeInBits() % 32 == 0 && 9598bcb0991SDimitry Andric VecTy.getSizeInBits() <= 1024 && 9600b57cec5SDimitry Andric IdxTy.getSizeInBits() == 32; 9610b57cec5SDimitry Andric }) 9620b57cec5SDimitry Andric .clampScalar(EltTypeIdx, S32, S64) 9630b57cec5SDimitry Andric .clampScalar(VecTypeIdx, S32, S64) 9640b57cec5SDimitry Andric .clampScalar(IdxTypeIdx, S32, S32); 9650b57cec5SDimitry Andric } 9660b57cec5SDimitry Andric 9670b57cec5SDimitry Andric getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 9680b57cec5SDimitry Andric .unsupportedIf([=](const LegalityQuery &Query) { 9690b57cec5SDimitry Andric const LLT &EltTy = Query.Types[1].getElementType(); 9700b57cec5SDimitry Andric return Query.Types[0] != EltTy; 9710b57cec5SDimitry Andric }); 9720b57cec5SDimitry Andric 9730b57cec5SDimitry Andric for (unsigned Op : {G_EXTRACT, G_INSERT}) { 9740b57cec5SDimitry Andric unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 9750b57cec5SDimitry Andric unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 9760b57cec5SDimitry Andric 9770b57cec5SDimitry Andric // FIXME: Doesn't handle extract of illegal sizes. 9780b57cec5SDimitry Andric getActionDefinitionsBuilder(Op) 9798bcb0991SDimitry Andric .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 9808bcb0991SDimitry Andric // FIXME: Multiples of 16 should not be legal. 
9810b57cec5SDimitry Andric .legalIf([=](const LegalityQuery &Query) { 9820b57cec5SDimitry Andric const LLT BigTy = Query.Types[BigTyIdx]; 9830b57cec5SDimitry Andric const LLT LitTy = Query.Types[LitTyIdx]; 9840b57cec5SDimitry Andric return (BigTy.getSizeInBits() % 32 == 0) && 9850b57cec5SDimitry Andric (LitTy.getSizeInBits() % 16 == 0); 9860b57cec5SDimitry Andric }) 9870b57cec5SDimitry Andric .widenScalarIf( 9880b57cec5SDimitry Andric [=](const LegalityQuery &Query) { 9890b57cec5SDimitry Andric const LLT BigTy = Query.Types[BigTyIdx]; 9900b57cec5SDimitry Andric return (BigTy.getScalarSizeInBits() < 16); 9910b57cec5SDimitry Andric }, 9920b57cec5SDimitry Andric LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16)) 9930b57cec5SDimitry Andric .widenScalarIf( 9940b57cec5SDimitry Andric [=](const LegalityQuery &Query) { 9950b57cec5SDimitry Andric const LLT LitTy = Query.Types[LitTyIdx]; 9960b57cec5SDimitry Andric return (LitTy.getScalarSizeInBits() < 16); 9970b57cec5SDimitry Andric }, 9980b57cec5SDimitry Andric LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16)) 9990b57cec5SDimitry Andric .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 10000b57cec5SDimitry Andric .widenScalarToNextPow2(BigTyIdx, 32); 10010b57cec5SDimitry Andric 10020b57cec5SDimitry Andric } 10030b57cec5SDimitry Andric 10048bcb0991SDimitry Andric auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR) 10050b57cec5SDimitry Andric .legalForCartesianProduct(AllS32Vectors, {S32}) 10060b57cec5SDimitry Andric .legalForCartesianProduct(AllS64Vectors, {S64}) 10078bcb0991SDimitry Andric .clampNumElements(0, V16S32, V32S32) 10088bcb0991SDimitry Andric .clampNumElements(0, V2S64, V16S64) 10098bcb0991SDimitry Andric .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); 10108bcb0991SDimitry Andric 10118bcb0991SDimitry Andric if (ST.hasScalarPackInsts()) 10128bcb0991SDimitry Andric BuildVector.legalFor({V2S16, S32}); 10138bcb0991SDimitry Andric 10148bcb0991SDimitry Andric 
BuildVector 10150b57cec5SDimitry Andric .minScalarSameAs(1, 0) 10160b57cec5SDimitry Andric .legalIf(isRegisterType(0)) 10170b57cec5SDimitry Andric .minScalarOrElt(0, S32); 10180b57cec5SDimitry Andric 10198bcb0991SDimitry Andric if (ST.hasScalarPackInsts()) { 10208bcb0991SDimitry Andric getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 10218bcb0991SDimitry Andric .legalFor({V2S16, S32}) 10228bcb0991SDimitry Andric .lower(); 10238bcb0991SDimitry Andric } else { 10248bcb0991SDimitry Andric getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) 10258bcb0991SDimitry Andric .lower(); 10268bcb0991SDimitry Andric } 10278bcb0991SDimitry Andric 10280b57cec5SDimitry Andric getActionDefinitionsBuilder(G_CONCAT_VECTORS) 10290b57cec5SDimitry Andric .legalIf(isRegisterType(0)); 10300b57cec5SDimitry Andric 10318bcb0991SDimitry Andric // TODO: Don't fully scalarize v2s16 pieces 10328bcb0991SDimitry Andric getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); 10338bcb0991SDimitry Andric 10340b57cec5SDimitry Andric // Merge/Unmerge 10350b57cec5SDimitry Andric for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { 10360b57cec5SDimitry Andric unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1; 10370b57cec5SDimitry Andric unsigned LitTyIdx = Op == G_MERGE_VALUES ? 
1 : 0; 10380b57cec5SDimitry Andric 10390b57cec5SDimitry Andric auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { 10400b57cec5SDimitry Andric const LLT &Ty = Query.Types[TypeIdx]; 10410b57cec5SDimitry Andric if (Ty.isVector()) { 10420b57cec5SDimitry Andric const LLT &EltTy = Ty.getElementType(); 10430b57cec5SDimitry Andric if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64) 10440b57cec5SDimitry Andric return true; 10450b57cec5SDimitry Andric if (!isPowerOf2_32(EltTy.getSizeInBits())) 10460b57cec5SDimitry Andric return true; 10470b57cec5SDimitry Andric } 10480b57cec5SDimitry Andric return false; 10490b57cec5SDimitry Andric }; 10500b57cec5SDimitry Andric 10518bcb0991SDimitry Andric auto &Builder = getActionDefinitionsBuilder(Op) 10520b57cec5SDimitry Andric .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) 10530b57cec5SDimitry Andric // Clamp the little scalar to s8-s256 and make it a power of 2. It's not 10540b57cec5SDimitry Andric // worth considering the multiples of 64 since 2*192 and 2*384 are not 10550b57cec5SDimitry Andric // valid. 
10560b57cec5SDimitry Andric .clampScalar(LitTyIdx, S16, S256) 10570b57cec5SDimitry Andric .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) 10588bcb0991SDimitry Andric .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) 10598bcb0991SDimitry Andric .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), 10608bcb0991SDimitry Andric elementTypeIs(1, S16)), 10618bcb0991SDimitry Andric changeTo(1, V2S16)) 10620b57cec5SDimitry Andric // Break up vectors with weird elements into scalars 10630b57cec5SDimitry Andric .fewerElementsIf( 10640b57cec5SDimitry Andric [=](const LegalityQuery &Query) { return notValidElt(Query, 0); }, 10650b57cec5SDimitry Andric scalarize(0)) 10660b57cec5SDimitry Andric .fewerElementsIf( 10670b57cec5SDimitry Andric [=](const LegalityQuery &Query) { return notValidElt(Query, 1); }, 10680b57cec5SDimitry Andric scalarize(1)) 10698bcb0991SDimitry Andric .clampScalar(BigTyIdx, S32, S1024) 10708bcb0991SDimitry Andric .lowerFor({{S16, V2S16}}); 10718bcb0991SDimitry Andric 10728bcb0991SDimitry Andric if (Op == G_MERGE_VALUES) { 10738bcb0991SDimitry Andric Builder.widenScalarIf( 10748bcb0991SDimitry Andric // TODO: Use 16-bit shifts if legal for 8-bit values? 
10750b57cec5SDimitry Andric [=](const LegalityQuery &Query) { 10768bcb0991SDimitry Andric const LLT Ty = Query.Types[LitTyIdx]; 10778bcb0991SDimitry Andric return Ty.getSizeInBits() < 32; 10788bcb0991SDimitry Andric }, 10798bcb0991SDimitry Andric changeTo(LitTyIdx, S32)); 10808bcb0991SDimitry Andric } 10818bcb0991SDimitry Andric 10828bcb0991SDimitry Andric Builder.widenScalarIf( 10838bcb0991SDimitry Andric [=](const LegalityQuery &Query) { 10848bcb0991SDimitry Andric const LLT Ty = Query.Types[BigTyIdx]; 10850b57cec5SDimitry Andric return !isPowerOf2_32(Ty.getSizeInBits()) && 10860b57cec5SDimitry Andric Ty.getSizeInBits() % 16 != 0; 10870b57cec5SDimitry Andric }, 10880b57cec5SDimitry Andric [=](const LegalityQuery &Query) { 10890b57cec5SDimitry Andric // Pick the next power of 2, or a multiple of 64 over 128. 10900b57cec5SDimitry Andric // Whichever is smaller. 10910b57cec5SDimitry Andric const LLT &Ty = Query.Types[BigTyIdx]; 10920b57cec5SDimitry Andric unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 10930b57cec5SDimitry Andric if (NewSizeInBits >= 256) { 10940b57cec5SDimitry Andric unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 10950b57cec5SDimitry Andric if (RoundedTo < NewSizeInBits) 10960b57cec5SDimitry Andric NewSizeInBits = RoundedTo; 10970b57cec5SDimitry Andric } 10980b57cec5SDimitry Andric return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 10990b57cec5SDimitry Andric }) 11000b57cec5SDimitry Andric .legalIf([=](const LegalityQuery &Query) { 11010b57cec5SDimitry Andric const LLT &BigTy = Query.Types[BigTyIdx]; 11020b57cec5SDimitry Andric const LLT &LitTy = Query.Types[LitTyIdx]; 11030b57cec5SDimitry Andric 11040b57cec5SDimitry Andric if (BigTy.isVector() && BigTy.getSizeInBits() < 32) 11050b57cec5SDimitry Andric return false; 11060b57cec5SDimitry Andric if (LitTy.isVector() && LitTy.getSizeInBits() < 32) 11070b57cec5SDimitry Andric return false; 11080b57cec5SDimitry Andric 11090b57cec5SDimitry Andric return 
BigTy.getSizeInBits() % 16 == 0 &&
          LitTy.getSizeInBits() % 16 == 0 &&
          BigTy.getSizeInBits() <= 1024;
      })
      // Any vectors left are the wrong size. Scalarize them.
      .scalarize(0)
      .scalarize(1);
  }

  // Sign-extend-in-register has no dedicated rule; use the generic lowering.
  getActionDefinitionsBuilder(G_SEXT_INREG).lower();

  // Register reads/writes are handed to the generic lowering as well.
  getActionDefinitionsBuilder({G_READ_REGISTER, G_WRITE_REGISTER}).lower();

  // The cycle counter is only accepted as a 64-bit scalar result.
  getActionDefinitionsBuilder(G_READCYCLECOUNTER)
    .legalFor({S64});

  // Explicitly reject the opcodes this legalizer has no rules for.
  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
        G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
        G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
    .unsupported();

  // Finalize the rule tables, then check them against the instruction info.
  computeTables();
  verify(*ST.getInstrInfo());
}

/// Dispatch an instruction to the matching legalize* routine, selected by
/// opcode.
///
/// \param MI       The instruction to legalize.
/// \param MRI      Register info forwarded to the handler.
/// \param B        Builder forwarded to the handler.
/// \param Observer Change observer; only forwarded to legalizeLoad.
/// \returns the result of the selected handler, or false when the opcode has
/// no handler in this switch.
bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         MachineIRBuilder &B,
                                         GISelChangeObserver &Observer) const {
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_FRINT:
    return legalizeFrint(MI, MRI, B);
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_INTRINSIC_TRUNC:
    return legalizeIntrinsicTrunc(MI, MRI, B);
  // Signed and unsigned conversions share one routine; the flag selects which.
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, true);
  case TargetOpcode::G_UITOFP:
    return legalizeITOFP(MI, MRI, B, false);
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
    return legalizeMinNumMaxNum(MI, MRI, B);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return legalizeExtractVectorElt(MI, MRI, B);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return legalizeInsertVectorElt(MI, MRI, B);
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
    return legalizeSinCos(MI, MRI, B);
  case TargetOpcode::G_GLOBAL_VALUE:
    return legalizeGlobalValue(MI, MRI, B);
  case TargetOpcode::G_LOAD:
    return legalizeLoad(MI, MRI, B, Observer);
  case TargetOpcode::G_FMAD:
    return legalizeFMad(MI, MRI, B);
  case TargetOpcode::G_FDIV:
    return legalizeFDIV(MI, MRI, B);
  case TargetOpcode::G_ATOMIC_CMPXCHG:
    return legalizeAtomicCmpXChg(MI, MRI, B);
  default:
    return false;
  }

  // Every path through the switch returns, so this is never reached.
  llvm_unreachable("expected switch to return");
}
11790b57cec5SDimitry Andric 11800b57cec5SDimitry Andric Register AMDGPULegalizerInfo::getSegmentAperture( 11810b57cec5SDimitry Andric unsigned AS, 11820b57cec5SDimitry Andric MachineRegisterInfo &MRI, 11838bcb0991SDimitry Andric MachineIRBuilder &B) const { 11848bcb0991SDimitry Andric MachineFunction &MF = B.getMF(); 11850b57cec5SDimitry Andric const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 11860b57cec5SDimitry Andric const LLT S32 = LLT::scalar(32); 11870b57cec5SDimitry Andric 11888bcb0991SDimitry Andric assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 11898bcb0991SDimitry Andric 11900b57cec5SDimitry Andric if (ST.hasApertureRegs()) { 11910b57cec5SDimitry Andric // FIXME: Use inline constants (src_{shared, private}_base) instead of 11920b57cec5SDimitry Andric // getreg. 11930b57cec5SDimitry Andric unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 11940b57cec5SDimitry Andric AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 11950b57cec5SDimitry Andric AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 11960b57cec5SDimitry Andric unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 
11970b57cec5SDimitry Andric AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 11980b57cec5SDimitry Andric AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 11990b57cec5SDimitry Andric unsigned Encoding = 12000b57cec5SDimitry Andric AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 12010b57cec5SDimitry Andric Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 12020b57cec5SDimitry Andric WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 12030b57cec5SDimitry Andric 12040b57cec5SDimitry Andric Register ApertureReg = MRI.createGenericVirtualRegister(S32); 12050b57cec5SDimitry Andric Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 12060b57cec5SDimitry Andric 12078bcb0991SDimitry Andric B.buildInstr(AMDGPU::S_GETREG_B32) 12080b57cec5SDimitry Andric .addDef(GetReg) 12090b57cec5SDimitry Andric .addImm(Encoding); 12100b57cec5SDimitry Andric MRI.setType(GetReg, S32); 12110b57cec5SDimitry Andric 12128bcb0991SDimitry Andric auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 12138bcb0991SDimitry Andric B.buildInstr(TargetOpcode::G_SHL) 12140b57cec5SDimitry Andric .addDef(ApertureReg) 12150b57cec5SDimitry Andric .addUse(GetReg) 12160b57cec5SDimitry Andric .addUse(ShiftAmt.getReg(0)); 12170b57cec5SDimitry Andric 12180b57cec5SDimitry Andric return ApertureReg; 12190b57cec5SDimitry Andric } 12200b57cec5SDimitry Andric 12210b57cec5SDimitry Andric Register QueuePtr = MRI.createGenericVirtualRegister( 12220b57cec5SDimitry Andric LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 12230b57cec5SDimitry Andric 12248bcb0991SDimitry Andric const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 12258bcb0991SDimitry Andric if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr)) 12268bcb0991SDimitry Andric return Register(); 12270b57cec5SDimitry Andric 12280b57cec5SDimitry Andric // Offset into amd_queue_t for group_segment_aperture_base_hi / 12290b57cec5SDimitry Andric // private_segment_aperture_base_hi. 
12300b57cec5SDimitry Andric uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 12310b57cec5SDimitry Andric 1232*480093f4SDimitry Andric // TODO: can we be smarter about machine pointer info? 1233*480093f4SDimitry Andric MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 12340b57cec5SDimitry Andric MachineMemOperand *MMO = MF.getMachineMemOperand( 12350b57cec5SDimitry Andric PtrInfo, 12360b57cec5SDimitry Andric MachineMemOperand::MOLoad | 12370b57cec5SDimitry Andric MachineMemOperand::MODereferenceable | 12380b57cec5SDimitry Andric MachineMemOperand::MOInvariant, 12390b57cec5SDimitry Andric 4, 12400b57cec5SDimitry Andric MinAlign(64, StructOffset)); 12410b57cec5SDimitry Andric 12420b57cec5SDimitry Andric Register LoadResult = MRI.createGenericVirtualRegister(S32); 12430b57cec5SDimitry Andric Register LoadAddr; 12440b57cec5SDimitry Andric 1245*480093f4SDimitry Andric B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 12468bcb0991SDimitry Andric B.buildLoad(LoadResult, LoadAddr, *MMO); 12470b57cec5SDimitry Andric return LoadResult; 12480b57cec5SDimitry Andric } 12490b57cec5SDimitry Andric 12500b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 12510b57cec5SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 12528bcb0991SDimitry Andric MachineIRBuilder &B) const { 12538bcb0991SDimitry Andric MachineFunction &MF = B.getMF(); 12540b57cec5SDimitry Andric 12558bcb0991SDimitry Andric B.setInstr(MI); 12560b57cec5SDimitry Andric 12578bcb0991SDimitry Andric const LLT S32 = LLT::scalar(32); 12580b57cec5SDimitry Andric Register Dst = MI.getOperand(0).getReg(); 12590b57cec5SDimitry Andric Register Src = MI.getOperand(1).getReg(); 12600b57cec5SDimitry Andric 12610b57cec5SDimitry Andric LLT DstTy = MRI.getType(Dst); 12620b57cec5SDimitry Andric LLT SrcTy = MRI.getType(Src); 12630b57cec5SDimitry Andric unsigned DestAS = DstTy.getAddressSpace(); 12640b57cec5SDimitry Andric unsigned SrcAS = 
SrcTy.getAddressSpace(); 12650b57cec5SDimitry Andric 12660b57cec5SDimitry Andric // TODO: Avoid reloading from the queue ptr for each cast, or at least each 12670b57cec5SDimitry Andric // vector element. 12680b57cec5SDimitry Andric assert(!DstTy.isVector()); 12690b57cec5SDimitry Andric 12700b57cec5SDimitry Andric const AMDGPUTargetMachine &TM 12710b57cec5SDimitry Andric = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 12720b57cec5SDimitry Andric 12730b57cec5SDimitry Andric const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 12740b57cec5SDimitry Andric if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) { 12758bcb0991SDimitry Andric MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 12768bcb0991SDimitry Andric return true; 12778bcb0991SDimitry Andric } 12788bcb0991SDimitry Andric 12798bcb0991SDimitry Andric if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 12808bcb0991SDimitry Andric // Truncate. 12818bcb0991SDimitry Andric B.buildExtract(Dst, Src, 0); 12828bcb0991SDimitry Andric MI.eraseFromParent(); 12838bcb0991SDimitry Andric return true; 12848bcb0991SDimitry Andric } 12858bcb0991SDimitry Andric 12868bcb0991SDimitry Andric if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 12878bcb0991SDimitry Andric const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 12888bcb0991SDimitry Andric uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 12898bcb0991SDimitry Andric 12908bcb0991SDimitry Andric // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 12918bcb0991SDimitry Andric // another. Merge operands are required to be the same type, but creating an 12928bcb0991SDimitry Andric // extra ptrtoint would be kind of pointless. 
12938bcb0991SDimitry Andric auto HighAddr = B.buildConstant( 12948bcb0991SDimitry Andric LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 12958bcb0991SDimitry Andric B.buildMerge(Dst, {Src, HighAddr.getReg(0)}); 12968bcb0991SDimitry Andric MI.eraseFromParent(); 12970b57cec5SDimitry Andric return true; 12980b57cec5SDimitry Andric } 12990b57cec5SDimitry Andric 13000b57cec5SDimitry Andric if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 13010b57cec5SDimitry Andric assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 13020b57cec5SDimitry Andric DestAS == AMDGPUAS::PRIVATE_ADDRESS); 13030b57cec5SDimitry Andric unsigned NullVal = TM.getNullPointerValue(DestAS); 13040b57cec5SDimitry Andric 13058bcb0991SDimitry Andric auto SegmentNull = B.buildConstant(DstTy, NullVal); 13068bcb0991SDimitry Andric auto FlatNull = B.buildConstant(SrcTy, 0); 13070b57cec5SDimitry Andric 13080b57cec5SDimitry Andric Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy); 13090b57cec5SDimitry Andric 13100b57cec5SDimitry Andric // Extract low 32-bits of the pointer. 
13118bcb0991SDimitry Andric B.buildExtract(PtrLo32, Src, 0); 13120b57cec5SDimitry Andric 13130b57cec5SDimitry Andric Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1)); 13148bcb0991SDimitry Andric B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0)); 13158bcb0991SDimitry Andric B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 13160b57cec5SDimitry Andric 13170b57cec5SDimitry Andric MI.eraseFromParent(); 13180b57cec5SDimitry Andric return true; 13190b57cec5SDimitry Andric } 13200b57cec5SDimitry Andric 13218bcb0991SDimitry Andric if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 13228bcb0991SDimitry Andric return false; 13238bcb0991SDimitry Andric 13248bcb0991SDimitry Andric if (!ST.hasFlatAddressSpace()) 13258bcb0991SDimitry Andric return false; 13260b57cec5SDimitry Andric 13270b57cec5SDimitry Andric auto SegmentNull = 13288bcb0991SDimitry Andric B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 13290b57cec5SDimitry Andric auto FlatNull = 13308bcb0991SDimitry Andric B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 13310b57cec5SDimitry Andric 13328bcb0991SDimitry Andric Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 13338bcb0991SDimitry Andric if (!ApertureReg.isValid()) 13348bcb0991SDimitry Andric return false; 13350b57cec5SDimitry Andric 13360b57cec5SDimitry Andric Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1)); 13378bcb0991SDimitry Andric B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0)); 13380b57cec5SDimitry Andric 13390b57cec5SDimitry Andric Register BuildPtr = MRI.createGenericVirtualRegister(DstTy); 13400b57cec5SDimitry Andric 13410b57cec5SDimitry Andric // Coerce the type of the low half of the result so we can use merge_values. 
13428bcb0991SDimitry Andric Register SrcAsInt = MRI.createGenericVirtualRegister(S32); 13438bcb0991SDimitry Andric B.buildInstr(TargetOpcode::G_PTRTOINT) 13440b57cec5SDimitry Andric .addDef(SrcAsInt) 13450b57cec5SDimitry Andric .addUse(Src); 13460b57cec5SDimitry Andric 13470b57cec5SDimitry Andric // TODO: Should we allow mismatched types but matching sizes in merges to 13480b57cec5SDimitry Andric // avoid the ptrtoint? 13498bcb0991SDimitry Andric B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg}); 13508bcb0991SDimitry Andric B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0)); 13510b57cec5SDimitry Andric 13520b57cec5SDimitry Andric MI.eraseFromParent(); 13530b57cec5SDimitry Andric return true; 13540b57cec5SDimitry Andric } 13550b57cec5SDimitry Andric 13560b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeFrint( 13570b57cec5SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 13588bcb0991SDimitry Andric MachineIRBuilder &B) const { 13598bcb0991SDimitry Andric B.setInstr(MI); 13600b57cec5SDimitry Andric 13610b57cec5SDimitry Andric Register Src = MI.getOperand(1).getReg(); 13620b57cec5SDimitry Andric LLT Ty = MRI.getType(Src); 13630b57cec5SDimitry Andric assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 13640b57cec5SDimitry Andric 13650b57cec5SDimitry Andric APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 13660b57cec5SDimitry Andric APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 13670b57cec5SDimitry Andric 13688bcb0991SDimitry Andric auto C1 = B.buildFConstant(Ty, C1Val); 13698bcb0991SDimitry Andric auto CopySign = B.buildFCopysign(Ty, C1, Src); 13700b57cec5SDimitry Andric 13710b57cec5SDimitry Andric // TODO: Should this propagate fast-math-flags? 
13728bcb0991SDimitry Andric auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 13738bcb0991SDimitry Andric auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 13740b57cec5SDimitry Andric 13758bcb0991SDimitry Andric auto C2 = B.buildFConstant(Ty, C2Val); 13768bcb0991SDimitry Andric auto Fabs = B.buildFAbs(Ty, Src); 13770b57cec5SDimitry Andric 13788bcb0991SDimitry Andric auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 13798bcb0991SDimitry Andric B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 13800b57cec5SDimitry Andric return true; 13810b57cec5SDimitry Andric } 13820b57cec5SDimitry Andric 13830b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeFceil( 13840b57cec5SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 13850b57cec5SDimitry Andric MachineIRBuilder &B) const { 13860b57cec5SDimitry Andric B.setInstr(MI); 13870b57cec5SDimitry Andric 13880b57cec5SDimitry Andric const LLT S1 = LLT::scalar(1); 13890b57cec5SDimitry Andric const LLT S64 = LLT::scalar(64); 13900b57cec5SDimitry Andric 13910b57cec5SDimitry Andric Register Src = MI.getOperand(1).getReg(); 13920b57cec5SDimitry Andric assert(MRI.getType(Src) == S64); 13930b57cec5SDimitry Andric 13940b57cec5SDimitry Andric // result = trunc(src) 13950b57cec5SDimitry Andric // if (src > 0.0 && src != result) 13960b57cec5SDimitry Andric // result += 1.0 13970b57cec5SDimitry Andric 13980b57cec5SDimitry Andric auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src}); 13990b57cec5SDimitry Andric 14000b57cec5SDimitry Andric const auto Zero = B.buildFConstant(S64, 0.0); 14010b57cec5SDimitry Andric const auto One = B.buildFConstant(S64, 1.0); 14020b57cec5SDimitry Andric auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 14030b57cec5SDimitry Andric auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 14040b57cec5SDimitry Andric auto And = B.buildAnd(S1, Lt0, NeTrunc); 14050b57cec5SDimitry Andric auto Add = B.buildSelect(S64, And, One, Zero); 14060b57cec5SDimitry 
Andric 14070b57cec5SDimitry Andric // TODO: Should this propagate fast-math-flags? 14080b57cec5SDimitry Andric B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add); 14090b57cec5SDimitry Andric return true; 14100b57cec5SDimitry Andric } 14110b57cec5SDimitry Andric 14120b57cec5SDimitry Andric static MachineInstrBuilder extractF64Exponent(unsigned Hi, 14130b57cec5SDimitry Andric MachineIRBuilder &B) { 14140b57cec5SDimitry Andric const unsigned FractBits = 52; 14150b57cec5SDimitry Andric const unsigned ExpBits = 11; 14160b57cec5SDimitry Andric LLT S32 = LLT::scalar(32); 14170b57cec5SDimitry Andric 14180b57cec5SDimitry Andric auto Const0 = B.buildConstant(S32, FractBits - 32); 14190b57cec5SDimitry Andric auto Const1 = B.buildConstant(S32, ExpBits); 14200b57cec5SDimitry Andric 14210b57cec5SDimitry Andric auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) 14220b57cec5SDimitry Andric .addUse(Const0.getReg(0)) 14230b57cec5SDimitry Andric .addUse(Const1.getReg(0)); 14240b57cec5SDimitry Andric 14250b57cec5SDimitry Andric return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023)); 14260b57cec5SDimitry Andric } 14270b57cec5SDimitry Andric 14280b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( 14290b57cec5SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 14300b57cec5SDimitry Andric MachineIRBuilder &B) const { 14310b57cec5SDimitry Andric B.setInstr(MI); 14320b57cec5SDimitry Andric 14330b57cec5SDimitry Andric const LLT S1 = LLT::scalar(1); 14340b57cec5SDimitry Andric const LLT S32 = LLT::scalar(32); 14350b57cec5SDimitry Andric const LLT S64 = LLT::scalar(64); 14360b57cec5SDimitry Andric 14370b57cec5SDimitry Andric Register Src = MI.getOperand(1).getReg(); 14380b57cec5SDimitry Andric assert(MRI.getType(Src) == S64); 14390b57cec5SDimitry Andric 14400b57cec5SDimitry Andric // TODO: Should this use extract since the low half is unused? 
14410b57cec5SDimitry Andric auto Unmerge = B.buildUnmerge({S32, S32}, Src); 14420b57cec5SDimitry Andric Register Hi = Unmerge.getReg(1); 14430b57cec5SDimitry Andric 14440b57cec5SDimitry Andric // Extract the upper half, since this is where we will find the sign and 14450b57cec5SDimitry Andric // exponent. 14460b57cec5SDimitry Andric auto Exp = extractF64Exponent(Hi, B); 14470b57cec5SDimitry Andric 14480b57cec5SDimitry Andric const unsigned FractBits = 52; 14490b57cec5SDimitry Andric 14500b57cec5SDimitry Andric // Extract the sign bit. 14510b57cec5SDimitry Andric const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31); 14520b57cec5SDimitry Andric auto SignBit = B.buildAnd(S32, Hi, SignBitMask); 14530b57cec5SDimitry Andric 14540b57cec5SDimitry Andric const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1); 14550b57cec5SDimitry Andric 14560b57cec5SDimitry Andric const auto Zero32 = B.buildConstant(S32, 0); 14570b57cec5SDimitry Andric 14580b57cec5SDimitry Andric // Extend back to 64-bits. 
14590b57cec5SDimitry Andric auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)}); 14600b57cec5SDimitry Andric 14610b57cec5SDimitry Andric auto Shr = B.buildAShr(S64, FractMask, Exp); 14620b57cec5SDimitry Andric auto Not = B.buildNot(S64, Shr); 14630b57cec5SDimitry Andric auto Tmp0 = B.buildAnd(S64, Src, Not); 14640b57cec5SDimitry Andric auto FiftyOne = B.buildConstant(S32, FractBits - 1); 14650b57cec5SDimitry Andric 14660b57cec5SDimitry Andric auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 14670b57cec5SDimitry Andric auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 14680b57cec5SDimitry Andric 14690b57cec5SDimitry Andric auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 14700b57cec5SDimitry Andric B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 14710b57cec5SDimitry Andric return true; 14720b57cec5SDimitry Andric } 14730b57cec5SDimitry Andric 14740b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeITOFP( 14750b57cec5SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 14760b57cec5SDimitry Andric MachineIRBuilder &B, bool Signed) const { 14770b57cec5SDimitry Andric B.setInstr(MI); 14780b57cec5SDimitry Andric 14790b57cec5SDimitry Andric Register Dst = MI.getOperand(0).getReg(); 14800b57cec5SDimitry Andric Register Src = MI.getOperand(1).getReg(); 14810b57cec5SDimitry Andric 14820b57cec5SDimitry Andric const LLT S64 = LLT::scalar(64); 14830b57cec5SDimitry Andric const LLT S32 = LLT::scalar(32); 14840b57cec5SDimitry Andric 14850b57cec5SDimitry Andric assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 14860b57cec5SDimitry Andric 14870b57cec5SDimitry Andric auto Unmerge = B.buildUnmerge({S32, S32}, Src); 14880b57cec5SDimitry Andric 14890b57cec5SDimitry Andric auto CvtHi = Signed ? 
14900b57cec5SDimitry Andric B.buildSITOFP(S64, Unmerge.getReg(1)) : 14910b57cec5SDimitry Andric B.buildUITOFP(S64, Unmerge.getReg(1)); 14920b57cec5SDimitry Andric 14930b57cec5SDimitry Andric auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 14940b57cec5SDimitry Andric 14950b57cec5SDimitry Andric auto ThirtyTwo = B.buildConstant(S32, 32); 14960b57cec5SDimitry Andric auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 14970b57cec5SDimitry Andric .addUse(CvtHi.getReg(0)) 14980b57cec5SDimitry Andric .addUse(ThirtyTwo.getReg(0)); 14990b57cec5SDimitry Andric 15000b57cec5SDimitry Andric // TODO: Should this propagate fast-math-flags? 15010b57cec5SDimitry Andric B.buildFAdd(Dst, LdExp, CvtLo); 15020b57cec5SDimitry Andric MI.eraseFromParent(); 15030b57cec5SDimitry Andric return true; 15040b57cec5SDimitry Andric } 15050b57cec5SDimitry Andric 15060b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeMinNumMaxNum( 15070b57cec5SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 15080b57cec5SDimitry Andric MachineIRBuilder &B) const { 15090b57cec5SDimitry Andric MachineFunction &MF = B.getMF(); 15100b57cec5SDimitry Andric const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 15110b57cec5SDimitry Andric 15120b57cec5SDimitry Andric const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 15130b57cec5SDimitry Andric MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 15140b57cec5SDimitry Andric 15150b57cec5SDimitry Andric // With ieee_mode disabled, the instructions have the correct behavior 15160b57cec5SDimitry Andric // already for G_FMINNUM/G_FMAXNUM 15170b57cec5SDimitry Andric if (!MFI->getMode().IEEE) 15180b57cec5SDimitry Andric return !IsIEEEOp; 15190b57cec5SDimitry Andric 15200b57cec5SDimitry Andric if (IsIEEEOp) 15210b57cec5SDimitry Andric return true; 15220b57cec5SDimitry Andric 15230b57cec5SDimitry Andric MachineIRBuilder HelperBuilder(MI); 15240b57cec5SDimitry Andric GISelObserverWrapper DummyObserver; 15250b57cec5SDimitry 
Andric LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 15268bcb0991SDimitry Andric HelperBuilder.setInstr(MI); 15270b57cec5SDimitry Andric return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 15280b57cec5SDimitry Andric } 15290b57cec5SDimitry Andric 15300b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 15310b57cec5SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 15320b57cec5SDimitry Andric MachineIRBuilder &B) const { 15330b57cec5SDimitry Andric // TODO: Should move some of this into LegalizerHelper. 15340b57cec5SDimitry Andric 15350b57cec5SDimitry Andric // TODO: Promote dynamic indexing of s16 to s32 15360b57cec5SDimitry Andric // TODO: Dynamic s64 indexing is only legal for SGPR. 15370b57cec5SDimitry Andric Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI); 15380b57cec5SDimitry Andric if (!IdxVal) // Dynamic case will be selected to register indexing. 15390b57cec5SDimitry Andric return true; 15400b57cec5SDimitry Andric 15410b57cec5SDimitry Andric Register Dst = MI.getOperand(0).getReg(); 15420b57cec5SDimitry Andric Register Vec = MI.getOperand(1).getReg(); 15430b57cec5SDimitry Andric 15440b57cec5SDimitry Andric LLT VecTy = MRI.getType(Vec); 15450b57cec5SDimitry Andric LLT EltTy = VecTy.getElementType(); 15460b57cec5SDimitry Andric assert(EltTy == MRI.getType(Dst)); 15470b57cec5SDimitry Andric 15480b57cec5SDimitry Andric B.setInstr(MI); 15490b57cec5SDimitry Andric 15500b57cec5SDimitry Andric if (IdxVal.getValue() < VecTy.getNumElements()) 15510b57cec5SDimitry Andric B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits()); 15520b57cec5SDimitry Andric else 15530b57cec5SDimitry Andric B.buildUndef(Dst); 15540b57cec5SDimitry Andric 15550b57cec5SDimitry Andric MI.eraseFromParent(); 15560b57cec5SDimitry Andric return true; 15570b57cec5SDimitry Andric } 15580b57cec5SDimitry Andric 15590b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 
15600b57cec5SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 15610b57cec5SDimitry Andric MachineIRBuilder &B) const { 15620b57cec5SDimitry Andric // TODO: Should move some of this into LegalizerHelper. 15630b57cec5SDimitry Andric 15640b57cec5SDimitry Andric // TODO: Promote dynamic indexing of s16 to s32 15650b57cec5SDimitry Andric // TODO: Dynamic s64 indexing is only legal for SGPR. 15660b57cec5SDimitry Andric Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI); 15670b57cec5SDimitry Andric if (!IdxVal) // Dynamic case will be selected to register indexing. 15680b57cec5SDimitry Andric return true; 15690b57cec5SDimitry Andric 15700b57cec5SDimitry Andric Register Dst = MI.getOperand(0).getReg(); 15710b57cec5SDimitry Andric Register Vec = MI.getOperand(1).getReg(); 15720b57cec5SDimitry Andric Register Ins = MI.getOperand(2).getReg(); 15730b57cec5SDimitry Andric 15740b57cec5SDimitry Andric LLT VecTy = MRI.getType(Vec); 15750b57cec5SDimitry Andric LLT EltTy = VecTy.getElementType(); 15760b57cec5SDimitry Andric assert(EltTy == MRI.getType(Ins)); 15770b57cec5SDimitry Andric 15780b57cec5SDimitry Andric B.setInstr(MI); 15790b57cec5SDimitry Andric 15800b57cec5SDimitry Andric if (IdxVal.getValue() < VecTy.getNumElements()) 15810b57cec5SDimitry Andric B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits()); 15820b57cec5SDimitry Andric else 15830b57cec5SDimitry Andric B.buildUndef(Dst); 15840b57cec5SDimitry Andric 15850b57cec5SDimitry Andric MI.eraseFromParent(); 15860b57cec5SDimitry Andric return true; 15870b57cec5SDimitry Andric } 15880b57cec5SDimitry Andric 15898bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeSinCos( 15908bcb0991SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 15918bcb0991SDimitry Andric MachineIRBuilder &B) const { 15928bcb0991SDimitry Andric B.setInstr(MI); 15938bcb0991SDimitry Andric 15948bcb0991SDimitry Andric Register DstReg = MI.getOperand(0).getReg(); 15958bcb0991SDimitry Andric 
Register SrcReg = MI.getOperand(1).getReg(); 15968bcb0991SDimitry Andric LLT Ty = MRI.getType(DstReg); 15978bcb0991SDimitry Andric unsigned Flags = MI.getFlags(); 15988bcb0991SDimitry Andric 15998bcb0991SDimitry Andric Register TrigVal; 16008bcb0991SDimitry Andric auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI); 16018bcb0991SDimitry Andric if (ST.hasTrigReducedRange()) { 16028bcb0991SDimitry Andric auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 16038bcb0991SDimitry Andric TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 16048bcb0991SDimitry Andric .addUse(MulVal.getReg(0)) 16058bcb0991SDimitry Andric .setMIFlags(Flags).getReg(0); 16068bcb0991SDimitry Andric } else 16078bcb0991SDimitry Andric TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 16088bcb0991SDimitry Andric 16098bcb0991SDimitry Andric Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 16108bcb0991SDimitry Andric Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 16118bcb0991SDimitry Andric B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 16128bcb0991SDimitry Andric .addUse(TrigVal) 16138bcb0991SDimitry Andric .setMIFlags(Flags); 16148bcb0991SDimitry Andric MI.eraseFromParent(); 16158bcb0991SDimitry Andric return true; 16168bcb0991SDimitry Andric } 16178bcb0991SDimitry Andric 16188bcb0991SDimitry Andric bool AMDGPULegalizerInfo::buildPCRelGlobalAddress( 16198bcb0991SDimitry Andric Register DstReg, LLT PtrTy, 16208bcb0991SDimitry Andric MachineIRBuilder &B, const GlobalValue *GV, 16218bcb0991SDimitry Andric unsigned Offset, unsigned GAFlags) const { 16228bcb0991SDimitry Andric // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 16238bcb0991SDimitry Andric // to the following code sequence: 16248bcb0991SDimitry Andric // 16258bcb0991SDimitry Andric // For constant address space: 16268bcb0991SDimitry Andric // s_getpc_b64 s[0:1] 16278bcb0991SDimitry Andric // s_add_u32 s0, s0, $symbol 16288bcb0991SDimitry Andric 
// s_addc_u32 s1, s1, 0 16298bcb0991SDimitry Andric // 16308bcb0991SDimitry Andric // s_getpc_b64 returns the address of the s_add_u32 instruction and then 16318bcb0991SDimitry Andric // a fixup or relocation is emitted to replace $symbol with a literal 16328bcb0991SDimitry Andric // constant, which is a pc-relative offset from the encoding of the $symbol 16338bcb0991SDimitry Andric // operand to the global variable. 16348bcb0991SDimitry Andric // 16358bcb0991SDimitry Andric // For global address space: 16368bcb0991SDimitry Andric // s_getpc_b64 s[0:1] 16378bcb0991SDimitry Andric // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 16388bcb0991SDimitry Andric // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 16398bcb0991SDimitry Andric // 16408bcb0991SDimitry Andric // s_getpc_b64 returns the address of the s_add_u32 instruction and then 16418bcb0991SDimitry Andric // fixups or relocations are emitted to replace $symbol@*@lo and 16428bcb0991SDimitry Andric // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 16438bcb0991SDimitry Andric // which is a 64-bit pc-relative offset from the encoding of the $symbol 16448bcb0991SDimitry Andric // operand to the global variable. 16458bcb0991SDimitry Andric // 16468bcb0991SDimitry Andric // What we want here is an offset from the value returned by s_getpc 16478bcb0991SDimitry Andric // (which is the address of the s_add_u32 instruction) to the global 16488bcb0991SDimitry Andric // variable, but since the encoding of $symbol starts 4 bytes after the start 16498bcb0991SDimitry Andric // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 16508bcb0991SDimitry Andric // small. This requires us to add 4 to the global variable offset in order to 16518bcb0991SDimitry Andric // compute the correct address. 
16528bcb0991SDimitry Andric 16538bcb0991SDimitry Andric LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 16548bcb0991SDimitry Andric 16558bcb0991SDimitry Andric Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg : 16568bcb0991SDimitry Andric B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 16578bcb0991SDimitry Andric 16588bcb0991SDimitry Andric MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 16598bcb0991SDimitry Andric .addDef(PCReg); 16608bcb0991SDimitry Andric 16618bcb0991SDimitry Andric MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 16628bcb0991SDimitry Andric if (GAFlags == SIInstrInfo::MO_NONE) 16638bcb0991SDimitry Andric MIB.addImm(0); 16648bcb0991SDimitry Andric else 16658bcb0991SDimitry Andric MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 16668bcb0991SDimitry Andric 16678bcb0991SDimitry Andric B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 16688bcb0991SDimitry Andric 16698bcb0991SDimitry Andric if (PtrTy.getSizeInBits() == 32) 16708bcb0991SDimitry Andric B.buildExtract(DstReg, PCReg, 0); 16718bcb0991SDimitry Andric return true; 16728bcb0991SDimitry Andric } 16738bcb0991SDimitry Andric 16748bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeGlobalValue( 16758bcb0991SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 16768bcb0991SDimitry Andric MachineIRBuilder &B) const { 16778bcb0991SDimitry Andric Register DstReg = MI.getOperand(0).getReg(); 16788bcb0991SDimitry Andric LLT Ty = MRI.getType(DstReg); 16798bcb0991SDimitry Andric unsigned AS = Ty.getAddressSpace(); 16808bcb0991SDimitry Andric 16818bcb0991SDimitry Andric const GlobalValue *GV = MI.getOperand(1).getGlobal(); 16828bcb0991SDimitry Andric MachineFunction &MF = B.getMF(); 16838bcb0991SDimitry Andric SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 16848bcb0991SDimitry Andric B.setInstr(MI); 16858bcb0991SDimitry Andric 16868bcb0991SDimitry Andric if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == 
AMDGPUAS::REGION_ADDRESS) { 16878bcb0991SDimitry Andric if (!MFI->isEntryFunction()) { 16888bcb0991SDimitry Andric const Function &Fn = MF.getFunction(); 16898bcb0991SDimitry Andric DiagnosticInfoUnsupported BadLDSDecl( 16908bcb0991SDimitry Andric Fn, "local memory global used by non-kernel function", MI.getDebugLoc()); 16918bcb0991SDimitry Andric Fn.getContext().diagnose(BadLDSDecl); 16928bcb0991SDimitry Andric } 16938bcb0991SDimitry Andric 16948bcb0991SDimitry Andric // TODO: We could emit code to handle the initialization somewhere. 16958bcb0991SDimitry Andric if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { 16968bcb0991SDimitry Andric B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV)); 16978bcb0991SDimitry Andric MI.eraseFromParent(); 16988bcb0991SDimitry Andric return true; 16998bcb0991SDimitry Andric } 17008bcb0991SDimitry Andric 17018bcb0991SDimitry Andric const Function &Fn = MF.getFunction(); 17028bcb0991SDimitry Andric DiagnosticInfoUnsupported BadInit( 17038bcb0991SDimitry Andric Fn, "unsupported initializer for address space", MI.getDebugLoc()); 17048bcb0991SDimitry Andric Fn.getContext().diagnose(BadInit); 17058bcb0991SDimitry Andric return true; 17068bcb0991SDimitry Andric } 17078bcb0991SDimitry Andric 17088bcb0991SDimitry Andric const SITargetLowering *TLI = ST.getTargetLowering(); 17098bcb0991SDimitry Andric 17108bcb0991SDimitry Andric if (TLI->shouldEmitFixup(GV)) { 17118bcb0991SDimitry Andric buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 17128bcb0991SDimitry Andric MI.eraseFromParent(); 17138bcb0991SDimitry Andric return true; 17148bcb0991SDimitry Andric } 17158bcb0991SDimitry Andric 17168bcb0991SDimitry Andric if (TLI->shouldEmitPCReloc(GV)) { 17178bcb0991SDimitry Andric buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 17188bcb0991SDimitry Andric MI.eraseFromParent(); 17198bcb0991SDimitry Andric return true; 17208bcb0991SDimitry Andric } 17218bcb0991SDimitry Andric 17228bcb0991SDimitry Andric LLT 
PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 17238bcb0991SDimitry Andric Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 17248bcb0991SDimitry Andric 17258bcb0991SDimitry Andric MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 17268bcb0991SDimitry Andric MachinePointerInfo::getGOT(MF), 17278bcb0991SDimitry Andric MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 17288bcb0991SDimitry Andric MachineMemOperand::MOInvariant, 17298bcb0991SDimitry Andric 8 /*Size*/, 8 /*Align*/); 17308bcb0991SDimitry Andric 17318bcb0991SDimitry Andric buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 17328bcb0991SDimitry Andric 17338bcb0991SDimitry Andric if (Ty.getSizeInBits() == 32) { 17348bcb0991SDimitry Andric // Truncate if this is a 32-bit constant adrdess. 17358bcb0991SDimitry Andric auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 17368bcb0991SDimitry Andric B.buildExtract(DstReg, Load, 0); 17378bcb0991SDimitry Andric } else 17388bcb0991SDimitry Andric B.buildLoad(DstReg, GOTAddr, *GOTMMO); 17398bcb0991SDimitry Andric 17408bcb0991SDimitry Andric MI.eraseFromParent(); 17418bcb0991SDimitry Andric return true; 17428bcb0991SDimitry Andric } 17438bcb0991SDimitry Andric 17448bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeLoad( 17458bcb0991SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 17468bcb0991SDimitry Andric MachineIRBuilder &B, GISelChangeObserver &Observer) const { 17478bcb0991SDimitry Andric B.setInstr(MI); 17488bcb0991SDimitry Andric LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 17498bcb0991SDimitry Andric auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg()); 17508bcb0991SDimitry Andric Observer.changingInstr(MI); 17518bcb0991SDimitry Andric MI.getOperand(1).setReg(Cast.getReg(0)); 17528bcb0991SDimitry Andric Observer.changedInstr(MI); 17538bcb0991SDimitry Andric return true; 17548bcb0991SDimitry Andric } 17558bcb0991SDimitry Andric 17568bcb0991SDimitry 
Andric bool AMDGPULegalizerInfo::legalizeFMad( 17578bcb0991SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 17588bcb0991SDimitry Andric MachineIRBuilder &B) const { 17598bcb0991SDimitry Andric LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 17608bcb0991SDimitry Andric assert(Ty.isScalar()); 17618bcb0991SDimitry Andric 1762*480093f4SDimitry Andric MachineFunction &MF = B.getMF(); 1763*480093f4SDimitry Andric const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 1764*480093f4SDimitry Andric 17658bcb0991SDimitry Andric // TODO: Always legal with future ftz flag. 1766*480093f4SDimitry Andric if (Ty == LLT::scalar(32) && !MFI->getMode().FP32Denormals) 17678bcb0991SDimitry Andric return true; 1768*480093f4SDimitry Andric if (Ty == LLT::scalar(16) && !MFI->getMode().FP64FP16Denormals) 17698bcb0991SDimitry Andric return true; 17708bcb0991SDimitry Andric 17718bcb0991SDimitry Andric 17728bcb0991SDimitry Andric MachineIRBuilder HelperBuilder(MI); 17738bcb0991SDimitry Andric GISelObserverWrapper DummyObserver; 17748bcb0991SDimitry Andric LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 17758bcb0991SDimitry Andric HelperBuilder.setMBB(*MI.getParent()); 17768bcb0991SDimitry Andric return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 17778bcb0991SDimitry Andric } 17788bcb0991SDimitry Andric 1779*480093f4SDimitry Andric bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 1780*480093f4SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 1781*480093f4SDimitry Andric Register DstReg = MI.getOperand(0).getReg(); 1782*480093f4SDimitry Andric Register PtrReg = MI.getOperand(1).getReg(); 1783*480093f4SDimitry Andric Register CmpVal = MI.getOperand(2).getReg(); 1784*480093f4SDimitry Andric Register NewVal = MI.getOperand(3).getReg(); 1785*480093f4SDimitry Andric 1786*480093f4SDimitry Andric assert(SITargetLowering::isFlatGlobalAddrSpace( 1787*480093f4SDimitry Andric MRI.getType(PtrReg).getAddressSpace()) && 
1788*480093f4SDimitry Andric "this should not have been custom lowered"); 1789*480093f4SDimitry Andric 1790*480093f4SDimitry Andric LLT ValTy = MRI.getType(CmpVal); 1791*480093f4SDimitry Andric LLT VecTy = LLT::vector(2, ValTy); 1792*480093f4SDimitry Andric 1793*480093f4SDimitry Andric B.setInstr(MI); 1794*480093f4SDimitry Andric Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 1795*480093f4SDimitry Andric 1796*480093f4SDimitry Andric B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 1797*480093f4SDimitry Andric .addDef(DstReg) 1798*480093f4SDimitry Andric .addUse(PtrReg) 1799*480093f4SDimitry Andric .addUse(PackedVal) 1800*480093f4SDimitry Andric .setMemRefs(MI.memoperands()); 1801*480093f4SDimitry Andric 1802*480093f4SDimitry Andric MI.eraseFromParent(); 1803*480093f4SDimitry Andric return true; 1804*480093f4SDimitry Andric } 1805*480093f4SDimitry Andric 18060b57cec5SDimitry Andric // Return the use branch instruction, otherwise null if the usage is invalid. 18070b57cec5SDimitry Andric static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 1808*480093f4SDimitry Andric MachineRegisterInfo &MRI, 1809*480093f4SDimitry Andric MachineInstr *&Br) { 18100b57cec5SDimitry Andric Register CondDef = MI.getOperand(0).getReg(); 18110b57cec5SDimitry Andric if (!MRI.hasOneNonDBGUse(CondDef)) 18120b57cec5SDimitry Andric return nullptr; 18130b57cec5SDimitry Andric 18140b57cec5SDimitry Andric MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 1815*480093f4SDimitry Andric if (UseMI.getParent() != MI.getParent() || 1816*480093f4SDimitry Andric UseMI.getOpcode() != AMDGPU::G_BRCOND) 1817*480093f4SDimitry Andric return nullptr; 1818*480093f4SDimitry Andric 1819*480093f4SDimitry Andric // Make sure the cond br is followed by a G_BR 1820*480093f4SDimitry Andric MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); 1821*480093f4SDimitry Andric if (Next != MI.getParent()->end()) { 1822*480093f4SDimitry Andric if (Next->getOpcode() != 
AMDGPU::G_BR) 1823*480093f4SDimitry Andric return nullptr; 1824*480093f4SDimitry Andric Br = &*Next; 1825*480093f4SDimitry Andric } 1826*480093f4SDimitry Andric 1827*480093f4SDimitry Andric return &UseMI; 18280b57cec5SDimitry Andric } 18290b57cec5SDimitry Andric 18300b57cec5SDimitry Andric Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI, 18310b57cec5SDimitry Andric Register Reg, LLT Ty) const { 18320b57cec5SDimitry Andric Register LiveIn = MRI.getLiveInVirtReg(Reg); 18330b57cec5SDimitry Andric if (LiveIn) 18340b57cec5SDimitry Andric return LiveIn; 18350b57cec5SDimitry Andric 18360b57cec5SDimitry Andric Register NewReg = MRI.createGenericVirtualRegister(Ty); 18370b57cec5SDimitry Andric MRI.addLiveIn(Reg, NewReg); 18380b57cec5SDimitry Andric return NewReg; 18390b57cec5SDimitry Andric } 18400b57cec5SDimitry Andric 18410b57cec5SDimitry Andric bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 18420b57cec5SDimitry Andric const ArgDescriptor *Arg) const { 18438bcb0991SDimitry Andric if (!Arg->isRegister() || !Arg->getRegister().isValid()) 18440b57cec5SDimitry Andric return false; // TODO: Handle these 18450b57cec5SDimitry Andric 18460b57cec5SDimitry Andric assert(Arg->getRegister().isPhysical()); 18470b57cec5SDimitry Andric 18480b57cec5SDimitry Andric MachineRegisterInfo &MRI = *B.getMRI(); 18490b57cec5SDimitry Andric 18500b57cec5SDimitry Andric LLT Ty = MRI.getType(DstReg); 18510b57cec5SDimitry Andric Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty); 18520b57cec5SDimitry Andric 18530b57cec5SDimitry Andric if (Arg->isMasked()) { 18540b57cec5SDimitry Andric // TODO: Should we try to emit this once in the entry block? 
18550b57cec5SDimitry Andric const LLT S32 = LLT::scalar(32); 18560b57cec5SDimitry Andric const unsigned Mask = Arg->getMask(); 18570b57cec5SDimitry Andric const unsigned Shift = countTrailingZeros<unsigned>(Mask); 18580b57cec5SDimitry Andric 18598bcb0991SDimitry Andric Register AndMaskSrc = LiveIn; 18608bcb0991SDimitry Andric 18618bcb0991SDimitry Andric if (Shift != 0) { 18620b57cec5SDimitry Andric auto ShiftAmt = B.buildConstant(S32, Shift); 18638bcb0991SDimitry Andric AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 18648bcb0991SDimitry Andric } 18658bcb0991SDimitry Andric 18668bcb0991SDimitry Andric B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 18670b57cec5SDimitry Andric } else 18680b57cec5SDimitry Andric B.buildCopy(DstReg, LiveIn); 18690b57cec5SDimitry Andric 18700b57cec5SDimitry Andric // Insert the argument copy if it doens't already exist. 18710b57cec5SDimitry Andric // FIXME: It seems EmitLiveInCopies isn't called anywhere? 18720b57cec5SDimitry Andric if (!MRI.getVRegDef(LiveIn)) { 18738bcb0991SDimitry Andric // FIXME: Should have scoped insert pt 18748bcb0991SDimitry Andric MachineBasicBlock &OrigInsBB = B.getMBB(); 18758bcb0991SDimitry Andric auto OrigInsPt = B.getInsertPt(); 18768bcb0991SDimitry Andric 18770b57cec5SDimitry Andric MachineBasicBlock &EntryMBB = B.getMF().front(); 18780b57cec5SDimitry Andric EntryMBB.addLiveIn(Arg->getRegister()); 18790b57cec5SDimitry Andric B.setInsertPt(EntryMBB, EntryMBB.begin()); 18800b57cec5SDimitry Andric B.buildCopy(LiveIn, Arg->getRegister()); 18818bcb0991SDimitry Andric 18828bcb0991SDimitry Andric B.setInsertPt(OrigInsBB, OrigInsPt); 18830b57cec5SDimitry Andric } 18840b57cec5SDimitry Andric 18850b57cec5SDimitry Andric return true; 18860b57cec5SDimitry Andric } 18870b57cec5SDimitry Andric 18880b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 18890b57cec5SDimitry Andric MachineInstr &MI, 18900b57cec5SDimitry Andric MachineRegisterInfo &MRI, 
18910b57cec5SDimitry Andric MachineIRBuilder &B, 18920b57cec5SDimitry Andric AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 18930b57cec5SDimitry Andric B.setInstr(MI); 18940b57cec5SDimitry Andric 18950b57cec5SDimitry Andric const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 18960b57cec5SDimitry Andric 18970b57cec5SDimitry Andric const ArgDescriptor *Arg; 18980b57cec5SDimitry Andric const TargetRegisterClass *RC; 18990b57cec5SDimitry Andric std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType); 19000b57cec5SDimitry Andric if (!Arg) { 19010b57cec5SDimitry Andric LLVM_DEBUG(dbgs() << "Required arg register missing\n"); 19020b57cec5SDimitry Andric return false; 19030b57cec5SDimitry Andric } 19040b57cec5SDimitry Andric 19050b57cec5SDimitry Andric if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) { 19060b57cec5SDimitry Andric MI.eraseFromParent(); 19070b57cec5SDimitry Andric return true; 19080b57cec5SDimitry Andric } 19090b57cec5SDimitry Andric 19100b57cec5SDimitry Andric return false; 19110b57cec5SDimitry Andric } 19120b57cec5SDimitry Andric 19138bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 19148bcb0991SDimitry Andric MachineRegisterInfo &MRI, 19158bcb0991SDimitry Andric MachineIRBuilder &B) const { 19168bcb0991SDimitry Andric B.setInstr(MI); 1917*480093f4SDimitry Andric Register Dst = MI.getOperand(0).getReg(); 1918*480093f4SDimitry Andric LLT DstTy = MRI.getType(Dst); 1919*480093f4SDimitry Andric LLT S16 = LLT::scalar(16); 1920*480093f4SDimitry Andric LLT S32 = LLT::scalar(32); 1921*480093f4SDimitry Andric LLT S64 = LLT::scalar(64); 19228bcb0991SDimitry Andric 19238bcb0991SDimitry Andric if (legalizeFastUnsafeFDIV(MI, MRI, B)) 19248bcb0991SDimitry Andric return true; 19258bcb0991SDimitry Andric 1926*480093f4SDimitry Andric if (DstTy == S16) 1927*480093f4SDimitry Andric return legalizeFDIV16(MI, MRI, B); 1928*480093f4SDimitry Andric if (DstTy == S32) 1929*480093f4SDimitry Andric return 
legalizeFDIV32(MI, MRI, B); 1930*480093f4SDimitry Andric if (DstTy == S64) 1931*480093f4SDimitry Andric return legalizeFDIV64(MI, MRI, B); 1932*480093f4SDimitry Andric 19338bcb0991SDimitry Andric return false; 19348bcb0991SDimitry Andric } 19358bcb0991SDimitry Andric 19368bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 19378bcb0991SDimitry Andric MachineRegisterInfo &MRI, 19388bcb0991SDimitry Andric MachineIRBuilder &B) const { 19398bcb0991SDimitry Andric Register Res = MI.getOperand(0).getReg(); 19408bcb0991SDimitry Andric Register LHS = MI.getOperand(1).getReg(); 19418bcb0991SDimitry Andric Register RHS = MI.getOperand(2).getReg(); 19428bcb0991SDimitry Andric 19438bcb0991SDimitry Andric uint16_t Flags = MI.getFlags(); 19448bcb0991SDimitry Andric 19458bcb0991SDimitry Andric LLT ResTy = MRI.getType(Res); 19468bcb0991SDimitry Andric LLT S32 = LLT::scalar(32); 19478bcb0991SDimitry Andric LLT S64 = LLT::scalar(64); 19488bcb0991SDimitry Andric 19498bcb0991SDimitry Andric const MachineFunction &MF = B.getMF(); 19508bcb0991SDimitry Andric bool Unsafe = 19518bcb0991SDimitry Andric MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 19528bcb0991SDimitry Andric 19538bcb0991SDimitry Andric if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 19548bcb0991SDimitry Andric return false; 19558bcb0991SDimitry Andric 1956*480093f4SDimitry Andric if (!Unsafe && ResTy == S32 && 1957*480093f4SDimitry Andric MF.getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals) 19588bcb0991SDimitry Andric return false; 19598bcb0991SDimitry Andric 19608bcb0991SDimitry Andric if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 19618bcb0991SDimitry Andric // 1 / x -> RCP(x) 19628bcb0991SDimitry Andric if (CLHS->isExactlyValue(1.0)) { 19638bcb0991SDimitry Andric B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 19648bcb0991SDimitry Andric .addUse(RHS) 19658bcb0991SDimitry Andric .setMIFlags(Flags); 19668bcb0991SDimitry Andric 
19678bcb0991SDimitry Andric MI.eraseFromParent(); 19688bcb0991SDimitry Andric return true; 19698bcb0991SDimitry Andric } 19708bcb0991SDimitry Andric 19718bcb0991SDimitry Andric // -1 / x -> RCP( FNEG(x) ) 19728bcb0991SDimitry Andric if (CLHS->isExactlyValue(-1.0)) { 19738bcb0991SDimitry Andric auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 19748bcb0991SDimitry Andric B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 19758bcb0991SDimitry Andric .addUse(FNeg.getReg(0)) 19768bcb0991SDimitry Andric .setMIFlags(Flags); 19778bcb0991SDimitry Andric 19788bcb0991SDimitry Andric MI.eraseFromParent(); 19798bcb0991SDimitry Andric return true; 19808bcb0991SDimitry Andric } 19818bcb0991SDimitry Andric } 19828bcb0991SDimitry Andric 19838bcb0991SDimitry Andric // x / y -> x * (1.0 / y) 19848bcb0991SDimitry Andric if (Unsafe) { 19858bcb0991SDimitry Andric auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 19868bcb0991SDimitry Andric .addUse(RHS) 19878bcb0991SDimitry Andric .setMIFlags(Flags); 19888bcb0991SDimitry Andric B.buildFMul(Res, LHS, RCP, Flags); 19898bcb0991SDimitry Andric 19908bcb0991SDimitry Andric MI.eraseFromParent(); 19918bcb0991SDimitry Andric return true; 19928bcb0991SDimitry Andric } 19938bcb0991SDimitry Andric 19948bcb0991SDimitry Andric return false; 19958bcb0991SDimitry Andric } 19968bcb0991SDimitry Andric 1997*480093f4SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 1998*480093f4SDimitry Andric MachineRegisterInfo &MRI, 1999*480093f4SDimitry Andric MachineIRBuilder &B) const { 2000*480093f4SDimitry Andric B.setInstr(MI); 2001*480093f4SDimitry Andric Register Res = MI.getOperand(0).getReg(); 2002*480093f4SDimitry Andric Register LHS = MI.getOperand(1).getReg(); 2003*480093f4SDimitry Andric Register RHS = MI.getOperand(2).getReg(); 2004*480093f4SDimitry Andric 2005*480093f4SDimitry Andric uint16_t Flags = MI.getFlags(); 2006*480093f4SDimitry Andric 2007*480093f4SDimitry Andric LLT S16 = LLT::scalar(16); 
2008*480093f4SDimitry Andric LLT S32 = LLT::scalar(32); 2009*480093f4SDimitry Andric 2010*480093f4SDimitry Andric auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2011*480093f4SDimitry Andric auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2012*480093f4SDimitry Andric 2013*480093f4SDimitry Andric auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2014*480093f4SDimitry Andric .addUse(RHSExt.getReg(0)) 2015*480093f4SDimitry Andric .setMIFlags(Flags); 2016*480093f4SDimitry Andric 2017*480093f4SDimitry Andric auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2018*480093f4SDimitry Andric auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2019*480093f4SDimitry Andric 2020*480093f4SDimitry Andric B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2021*480093f4SDimitry Andric .addUse(RDst.getReg(0)) 2022*480093f4SDimitry Andric .addUse(RHS) 2023*480093f4SDimitry Andric .addUse(LHS) 2024*480093f4SDimitry Andric .setMIFlags(Flags); 2025*480093f4SDimitry Andric 2026*480093f4SDimitry Andric MI.eraseFromParent(); 2027*480093f4SDimitry Andric return true; 2028*480093f4SDimitry Andric } 2029*480093f4SDimitry Andric 2030*480093f4SDimitry Andric // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2031*480093f4SDimitry Andric // to enable denorm mode. When 'Enable' is false, disable denorm mode. 2032*480093f4SDimitry Andric static void toggleSPDenormMode(bool Enable, 2033*480093f4SDimitry Andric MachineIRBuilder &B, 2034*480093f4SDimitry Andric const GCNSubtarget &ST, 2035*480093f4SDimitry Andric AMDGPU::SIModeRegisterDefaults Mode) { 2036*480093f4SDimitry Andric // Set SP denorm mode to this value. 2037*480093f4SDimitry Andric unsigned SPDenormMode = 2038*480093f4SDimitry Andric Enable ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT; 2039*480093f4SDimitry Andric 2040*480093f4SDimitry Andric if (ST.hasDenormModeInst()) { 2041*480093f4SDimitry Andric // Preserve default FP64FP16 denorm mode while updating FP32 mode. 
2042*480093f4SDimitry Andric unsigned DPDenormModeDefault = Mode.FP64FP16Denormals 2043*480093f4SDimitry Andric ? FP_DENORM_FLUSH_NONE 2044*480093f4SDimitry Andric : FP_DENORM_FLUSH_IN_FLUSH_OUT; 2045*480093f4SDimitry Andric 2046*480093f4SDimitry Andric unsigned NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 2047*480093f4SDimitry Andric B.buildInstr(AMDGPU::S_DENORM_MODE) 2048*480093f4SDimitry Andric .addImm(NewDenormModeValue); 2049*480093f4SDimitry Andric 2050*480093f4SDimitry Andric } else { 2051*480093f4SDimitry Andric // Select FP32 bit field in mode register. 2052*480093f4SDimitry Andric unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2053*480093f4SDimitry Andric (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2054*480093f4SDimitry Andric (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2055*480093f4SDimitry Andric 2056*480093f4SDimitry Andric B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2057*480093f4SDimitry Andric .addImm(SPDenormMode) 2058*480093f4SDimitry Andric .addImm(SPDenormModeBitField); 2059*480093f4SDimitry Andric } 2060*480093f4SDimitry Andric } 2061*480093f4SDimitry Andric 2062*480093f4SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2063*480093f4SDimitry Andric MachineRegisterInfo &MRI, 2064*480093f4SDimitry Andric MachineIRBuilder &B) const { 2065*480093f4SDimitry Andric B.setInstr(MI); 2066*480093f4SDimitry Andric Register Res = MI.getOperand(0).getReg(); 2067*480093f4SDimitry Andric Register LHS = MI.getOperand(1).getReg(); 2068*480093f4SDimitry Andric Register RHS = MI.getOperand(2).getReg(); 2069*480093f4SDimitry Andric const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2070*480093f4SDimitry Andric AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2071*480093f4SDimitry Andric 2072*480093f4SDimitry Andric uint16_t Flags = MI.getFlags(); 2073*480093f4SDimitry Andric 2074*480093f4SDimitry Andric LLT S32 = LLT::scalar(32); 2075*480093f4SDimitry Andric LLT S1 = LLT::scalar(1); 
2076*480093f4SDimitry Andric 2077*480093f4SDimitry Andric auto One = B.buildFConstant(S32, 1.0f); 2078*480093f4SDimitry Andric 2079*480093f4SDimitry Andric auto DenominatorScaled = 2080*480093f4SDimitry Andric B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2081*480093f4SDimitry Andric .addUse(RHS) 2082*480093f4SDimitry Andric .addUse(LHS) 2083*480093f4SDimitry Andric .addImm(1) 2084*480093f4SDimitry Andric .setMIFlags(Flags); 2085*480093f4SDimitry Andric auto NumeratorScaled = 2086*480093f4SDimitry Andric B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2087*480093f4SDimitry Andric .addUse(LHS) 2088*480093f4SDimitry Andric .addUse(RHS) 2089*480093f4SDimitry Andric .addImm(0) 2090*480093f4SDimitry Andric .setMIFlags(Flags); 2091*480093f4SDimitry Andric 2092*480093f4SDimitry Andric auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2093*480093f4SDimitry Andric .addUse(DenominatorScaled.getReg(0)) 2094*480093f4SDimitry Andric .setMIFlags(Flags); 2095*480093f4SDimitry Andric auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 2096*480093f4SDimitry Andric 2097*480093f4SDimitry Andric // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 2098*480093f4SDimitry Andric // aren't modeled as reading it. 
2099*480093f4SDimitry Andric if (!Mode.FP32Denormals) 2100*480093f4SDimitry Andric toggleSPDenormMode(true, B, ST, Mode); 2101*480093f4SDimitry Andric 2102*480093f4SDimitry Andric auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 2103*480093f4SDimitry Andric auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 2104*480093f4SDimitry Andric auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 2105*480093f4SDimitry Andric auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 2106*480093f4SDimitry Andric auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 2107*480093f4SDimitry Andric auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 2108*480093f4SDimitry Andric 2109*480093f4SDimitry Andric if (!Mode.FP32Denormals) 2110*480093f4SDimitry Andric toggleSPDenormMode(false, B, ST, Mode); 2111*480093f4SDimitry Andric 2112*480093f4SDimitry Andric auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 2113*480093f4SDimitry Andric .addUse(Fma4.getReg(0)) 2114*480093f4SDimitry Andric .addUse(Fma1.getReg(0)) 2115*480093f4SDimitry Andric .addUse(Fma3.getReg(0)) 2116*480093f4SDimitry Andric .addUse(NumeratorScaled.getReg(1)) 2117*480093f4SDimitry Andric .setMIFlags(Flags); 2118*480093f4SDimitry Andric 2119*480093f4SDimitry Andric B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2120*480093f4SDimitry Andric .addUse(Fmas.getReg(0)) 2121*480093f4SDimitry Andric .addUse(RHS) 2122*480093f4SDimitry Andric .addUse(LHS) 2123*480093f4SDimitry Andric .setMIFlags(Flags); 2124*480093f4SDimitry Andric 2125*480093f4SDimitry Andric MI.eraseFromParent(); 2126*480093f4SDimitry Andric return true; 2127*480093f4SDimitry Andric } 2128*480093f4SDimitry Andric 2129*480093f4SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 2130*480093f4SDimitry Andric MachineRegisterInfo &MRI, 2131*480093f4SDimitry Andric MachineIRBuilder &B) const { 2132*480093f4SDimitry Andric B.setInstr(MI); 
2133*480093f4SDimitry Andric Register Res = MI.getOperand(0).getReg(); 2134*480093f4SDimitry Andric Register LHS = MI.getOperand(1).getReg(); 2135*480093f4SDimitry Andric Register RHS = MI.getOperand(2).getReg(); 2136*480093f4SDimitry Andric 2137*480093f4SDimitry Andric uint16_t Flags = MI.getFlags(); 2138*480093f4SDimitry Andric 2139*480093f4SDimitry Andric LLT S64 = LLT::scalar(64); 2140*480093f4SDimitry Andric LLT S1 = LLT::scalar(1); 2141*480093f4SDimitry Andric 2142*480093f4SDimitry Andric auto One = B.buildFConstant(S64, 1.0); 2143*480093f4SDimitry Andric 2144*480093f4SDimitry Andric auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 2145*480093f4SDimitry Andric .addUse(LHS) 2146*480093f4SDimitry Andric .addUse(RHS) 2147*480093f4SDimitry Andric .addImm(1) 2148*480093f4SDimitry Andric .setMIFlags(Flags); 2149*480093f4SDimitry Andric 2150*480093f4SDimitry Andric auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); 2151*480093f4SDimitry Andric 2152*480093f4SDimitry Andric auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false) 2153*480093f4SDimitry Andric .addUse(DivScale0.getReg(0)) 2154*480093f4SDimitry Andric .setMIFlags(Flags); 2155*480093f4SDimitry Andric 2156*480093f4SDimitry Andric auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags); 2157*480093f4SDimitry Andric auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags); 2158*480093f4SDimitry Andric auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags); 2159*480093f4SDimitry Andric 2160*480093f4SDimitry Andric auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 2161*480093f4SDimitry Andric .addUse(LHS) 2162*480093f4SDimitry Andric .addUse(RHS) 2163*480093f4SDimitry Andric .addImm(0) 2164*480093f4SDimitry Andric .setMIFlags(Flags); 2165*480093f4SDimitry Andric 2166*480093f4SDimitry Andric auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); 2167*480093f4SDimitry Andric auto Mul = B.buildMul(S64, DivScale1.getReg(0), 
Fma3, Flags); 2168*480093f4SDimitry Andric auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); 2169*480093f4SDimitry Andric 2170*480093f4SDimitry Andric Register Scale; 2171*480093f4SDimitry Andric if (!ST.hasUsableDivScaleConditionOutput()) { 2172*480093f4SDimitry Andric // Workaround a hardware bug on SI where the condition output from div_scale 2173*480093f4SDimitry Andric // is not usable. 2174*480093f4SDimitry Andric 2175*480093f4SDimitry Andric Scale = MRI.createGenericVirtualRegister(S1); 2176*480093f4SDimitry Andric 2177*480093f4SDimitry Andric LLT S32 = LLT::scalar(32); 2178*480093f4SDimitry Andric 2179*480093f4SDimitry Andric auto NumUnmerge = B.buildUnmerge(S32, LHS); 2180*480093f4SDimitry Andric auto DenUnmerge = B.buildUnmerge(S32, RHS); 2181*480093f4SDimitry Andric auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0); 2182*480093f4SDimitry Andric auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1); 2183*480093f4SDimitry Andric 2184*480093f4SDimitry Andric auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1), 2185*480093f4SDimitry Andric Scale1Unmerge.getReg(1)); 2186*480093f4SDimitry Andric auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), 2187*480093f4SDimitry Andric Scale0Unmerge.getReg(1)); 2188*480093f4SDimitry Andric B.buildXor(Scale, CmpNum, CmpDen); 2189*480093f4SDimitry Andric } else { 2190*480093f4SDimitry Andric Scale = DivScale1.getReg(1); 2191*480093f4SDimitry Andric } 2192*480093f4SDimitry Andric 2193*480093f4SDimitry Andric auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false) 2194*480093f4SDimitry Andric .addUse(Fma4.getReg(0)) 2195*480093f4SDimitry Andric .addUse(Fma3.getReg(0)) 2196*480093f4SDimitry Andric .addUse(Mul.getReg(0)) 2197*480093f4SDimitry Andric .addUse(Scale) 2198*480093f4SDimitry Andric .setMIFlags(Flags); 2199*480093f4SDimitry Andric 2200*480093f4SDimitry Andric B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false) 
2201*480093f4SDimitry Andric .addUse(Fmas.getReg(0)) 2202*480093f4SDimitry Andric .addUse(RHS) 2203*480093f4SDimitry Andric .addUse(LHS) 2204*480093f4SDimitry Andric .setMIFlags(Flags); 2205*480093f4SDimitry Andric 2206*480093f4SDimitry Andric MI.eraseFromParent(); 2207*480093f4SDimitry Andric return true; 2208*480093f4SDimitry Andric } 2209*480093f4SDimitry Andric 22108bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, 22118bcb0991SDimitry Andric MachineRegisterInfo &MRI, 22128bcb0991SDimitry Andric MachineIRBuilder &B) const { 22138bcb0991SDimitry Andric B.setInstr(MI); 22148bcb0991SDimitry Andric Register Res = MI.getOperand(0).getReg(); 22158bcb0991SDimitry Andric Register LHS = MI.getOperand(2).getReg(); 22168bcb0991SDimitry Andric Register RHS = MI.getOperand(3).getReg(); 22178bcb0991SDimitry Andric uint16_t Flags = MI.getFlags(); 22188bcb0991SDimitry Andric 22198bcb0991SDimitry Andric LLT S32 = LLT::scalar(32); 22208bcb0991SDimitry Andric LLT S1 = LLT::scalar(1); 22218bcb0991SDimitry Andric 22228bcb0991SDimitry Andric auto Abs = B.buildFAbs(S32, RHS, Flags); 22238bcb0991SDimitry Andric const APFloat C0Val(1.0f); 22248bcb0991SDimitry Andric 22258bcb0991SDimitry Andric auto C0 = B.buildConstant(S32, 0x6f800000); 22268bcb0991SDimitry Andric auto C1 = B.buildConstant(S32, 0x2f800000); 22278bcb0991SDimitry Andric auto C2 = B.buildConstant(S32, FloatToBits(1.0f)); 22288bcb0991SDimitry Andric 22298bcb0991SDimitry Andric auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags); 22308bcb0991SDimitry Andric auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags); 22318bcb0991SDimitry Andric 22328bcb0991SDimitry Andric auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags); 22338bcb0991SDimitry Andric 22348bcb0991SDimitry Andric auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 22358bcb0991SDimitry Andric .addUse(Mul0.getReg(0)) 22368bcb0991SDimitry Andric .setMIFlags(Flags); 22378bcb0991SDimitry Andric 
22388bcb0991SDimitry Andric auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags); 22398bcb0991SDimitry Andric 22408bcb0991SDimitry Andric B.buildFMul(Res, Sel, Mul1, Flags); 22418bcb0991SDimitry Andric 22428bcb0991SDimitry Andric MI.eraseFromParent(); 22438bcb0991SDimitry Andric return true; 22448bcb0991SDimitry Andric } 22458bcb0991SDimitry Andric 22460b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, 22470b57cec5SDimitry Andric MachineRegisterInfo &MRI, 22480b57cec5SDimitry Andric MachineIRBuilder &B) const { 22490b57cec5SDimitry Andric const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 22500b57cec5SDimitry Andric if (!MFI->isEntryFunction()) { 22510b57cec5SDimitry Andric return legalizePreloadedArgIntrin(MI, MRI, B, 22520b57cec5SDimitry Andric AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); 22530b57cec5SDimitry Andric } 22540b57cec5SDimitry Andric 22550b57cec5SDimitry Andric B.setInstr(MI); 22560b57cec5SDimitry Andric 22570b57cec5SDimitry Andric uint64_t Offset = 22580b57cec5SDimitry Andric ST.getTargetLowering()->getImplicitParameterOffset( 22590b57cec5SDimitry Andric B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); 22600b57cec5SDimitry Andric Register DstReg = MI.getOperand(0).getReg(); 22610b57cec5SDimitry Andric LLT DstTy = MRI.getType(DstReg); 22620b57cec5SDimitry Andric LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); 22630b57cec5SDimitry Andric 22640b57cec5SDimitry Andric const ArgDescriptor *Arg; 22650b57cec5SDimitry Andric const TargetRegisterClass *RC; 22660b57cec5SDimitry Andric std::tie(Arg, RC) 22670b57cec5SDimitry Andric = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 22680b57cec5SDimitry Andric if (!Arg) 22690b57cec5SDimitry Andric return false; 22700b57cec5SDimitry Andric 22710b57cec5SDimitry Andric Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); 22720b57cec5SDimitry Andric if (!loadInputValue(KernargPtrReg, B, Arg)) 22730b57cec5SDimitry Andric return 
false; 22740b57cec5SDimitry Andric 2275*480093f4SDimitry Andric B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); 22760b57cec5SDimitry Andric MI.eraseFromParent(); 22770b57cec5SDimitry Andric return true; 22780b57cec5SDimitry Andric } 22790b57cec5SDimitry Andric 22808bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, 22818bcb0991SDimitry Andric MachineRegisterInfo &MRI, 22828bcb0991SDimitry Andric MachineIRBuilder &B, 22838bcb0991SDimitry Andric unsigned AddrSpace) const { 22848bcb0991SDimitry Andric B.setInstr(MI); 22858bcb0991SDimitry Andric Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); 22868bcb0991SDimitry Andric auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); 22878bcb0991SDimitry Andric B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); 22888bcb0991SDimitry Andric MI.eraseFromParent(); 22898bcb0991SDimitry Andric return true; 22908bcb0991SDimitry Andric } 22918bcb0991SDimitry Andric 22928bcb0991SDimitry Andric /// Handle register layout difference for f16 images for some subtargets. 
22938bcb0991SDimitry Andric Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, 22948bcb0991SDimitry Andric MachineRegisterInfo &MRI, 22958bcb0991SDimitry Andric Register Reg) const { 22968bcb0991SDimitry Andric if (!ST.hasUnpackedD16VMem()) 22978bcb0991SDimitry Andric return Reg; 22988bcb0991SDimitry Andric 22998bcb0991SDimitry Andric const LLT S16 = LLT::scalar(16); 23008bcb0991SDimitry Andric const LLT S32 = LLT::scalar(32); 23018bcb0991SDimitry Andric LLT StoreVT = MRI.getType(Reg); 23028bcb0991SDimitry Andric assert(StoreVT.isVector() && StoreVT.getElementType() == S16); 23038bcb0991SDimitry Andric 23048bcb0991SDimitry Andric auto Unmerge = B.buildUnmerge(S16, Reg); 23058bcb0991SDimitry Andric 23068bcb0991SDimitry Andric SmallVector<Register, 4> WideRegs; 23078bcb0991SDimitry Andric for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) 23088bcb0991SDimitry Andric WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); 23098bcb0991SDimitry Andric 23108bcb0991SDimitry Andric int NumElts = StoreVT.getNumElements(); 23118bcb0991SDimitry Andric 23128bcb0991SDimitry Andric return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); 23138bcb0991SDimitry Andric } 23148bcb0991SDimitry Andric 23158bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI, 23168bcb0991SDimitry Andric MachineRegisterInfo &MRI, 23178bcb0991SDimitry Andric MachineIRBuilder &B, 23188bcb0991SDimitry Andric bool IsFormat) const { 23198bcb0991SDimitry Andric // TODO: Reject f16 format on targets where unsupported. 
23208bcb0991SDimitry Andric Register VData = MI.getOperand(1).getReg(); 23218bcb0991SDimitry Andric LLT Ty = MRI.getType(VData); 23228bcb0991SDimitry Andric 23238bcb0991SDimitry Andric B.setInstr(MI); 23248bcb0991SDimitry Andric 23258bcb0991SDimitry Andric const LLT S32 = LLT::scalar(32); 23268bcb0991SDimitry Andric const LLT S16 = LLT::scalar(16); 23278bcb0991SDimitry Andric 23288bcb0991SDimitry Andric // Fixup illegal register types for i8 stores. 23298bcb0991SDimitry Andric if (Ty == LLT::scalar(8) || Ty == S16) { 23308bcb0991SDimitry Andric Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); 23318bcb0991SDimitry Andric MI.getOperand(1).setReg(AnyExt); 23328bcb0991SDimitry Andric return true; 23338bcb0991SDimitry Andric } 23348bcb0991SDimitry Andric 23358bcb0991SDimitry Andric if (Ty.isVector()) { 23368bcb0991SDimitry Andric if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { 23378bcb0991SDimitry Andric if (IsFormat) 23388bcb0991SDimitry Andric MI.getOperand(1).setReg(handleD16VData(B, MRI, VData)); 23398bcb0991SDimitry Andric return true; 23408bcb0991SDimitry Andric } 23418bcb0991SDimitry Andric 23428bcb0991SDimitry Andric return Ty.getElementType() == S32 && Ty.getNumElements() <= 4; 23438bcb0991SDimitry Andric } 23448bcb0991SDimitry Andric 23458bcb0991SDimitry Andric return Ty == S32; 23468bcb0991SDimitry Andric } 23478bcb0991SDimitry Andric 23480b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI, 23490b57cec5SDimitry Andric MachineRegisterInfo &MRI, 23500b57cec5SDimitry Andric MachineIRBuilder &B) const { 23510b57cec5SDimitry Andric // Replace the use G_BRCOND with the exec manipulate and branch pseudos. 
2352*480093f4SDimitry Andric auto IntrID = MI.getIntrinsicID(); 2353*480093f4SDimitry Andric switch (IntrID) { 2354*480093f4SDimitry Andric case Intrinsic::amdgcn_if: 2355*480093f4SDimitry Andric case Intrinsic::amdgcn_else: { 2356*480093f4SDimitry Andric MachineInstr *Br = nullptr; 2357*480093f4SDimitry Andric if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) { 23580b57cec5SDimitry Andric const SIRegisterInfo *TRI 23590b57cec5SDimitry Andric = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 23600b57cec5SDimitry Andric 23610b57cec5SDimitry Andric B.setInstr(*BrCond); 23620b57cec5SDimitry Andric Register Def = MI.getOperand(1).getReg(); 23630b57cec5SDimitry Andric Register Use = MI.getOperand(3).getReg(); 2364*480093f4SDimitry Andric 2365*480093f4SDimitry Andric MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB(); 2366*480093f4SDimitry Andric if (Br) 2367*480093f4SDimitry Andric BrTarget = Br->getOperand(0).getMBB(); 2368*480093f4SDimitry Andric 2369*480093f4SDimitry Andric if (IntrID == Intrinsic::amdgcn_if) { 23700b57cec5SDimitry Andric B.buildInstr(AMDGPU::SI_IF) 23710b57cec5SDimitry Andric .addDef(Def) 23720b57cec5SDimitry Andric .addUse(Use) 2373*480093f4SDimitry Andric .addMBB(BrTarget); 2374*480093f4SDimitry Andric } else { 2375*480093f4SDimitry Andric B.buildInstr(AMDGPU::SI_ELSE) 2376*480093f4SDimitry Andric .addDef(Def) 2377*480093f4SDimitry Andric .addUse(Use) 2378*480093f4SDimitry Andric .addMBB(BrTarget) 2379*480093f4SDimitry Andric .addImm(0); 2380*480093f4SDimitry Andric } 2381*480093f4SDimitry Andric 2382*480093f4SDimitry Andric if (Br) 2383*480093f4SDimitry Andric Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB()); 23840b57cec5SDimitry Andric 23850b57cec5SDimitry Andric MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); 23860b57cec5SDimitry Andric MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); 23870b57cec5SDimitry Andric MI.eraseFromParent(); 23880b57cec5SDimitry Andric BrCond->eraseFromParent(); 
23890b57cec5SDimitry Andric return true; 23900b57cec5SDimitry Andric } 23910b57cec5SDimitry Andric 23920b57cec5SDimitry Andric return false; 23930b57cec5SDimitry Andric } 23940b57cec5SDimitry Andric case Intrinsic::amdgcn_loop: { 2395*480093f4SDimitry Andric MachineInstr *Br = nullptr; 2396*480093f4SDimitry Andric if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) { 23970b57cec5SDimitry Andric const SIRegisterInfo *TRI 23980b57cec5SDimitry Andric = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); 23990b57cec5SDimitry Andric 24000b57cec5SDimitry Andric B.setInstr(*BrCond); 2401*480093f4SDimitry Andric 2402*480093f4SDimitry Andric // FIXME: Need to adjust branch targets based on unconditional branch. 24030b57cec5SDimitry Andric Register Reg = MI.getOperand(2).getReg(); 24040b57cec5SDimitry Andric B.buildInstr(AMDGPU::SI_LOOP) 24050b57cec5SDimitry Andric .addUse(Reg) 24060b57cec5SDimitry Andric .addMBB(BrCond->getOperand(1).getMBB()); 24070b57cec5SDimitry Andric MI.eraseFromParent(); 24080b57cec5SDimitry Andric BrCond->eraseFromParent(); 24090b57cec5SDimitry Andric MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); 24100b57cec5SDimitry Andric return true; 24110b57cec5SDimitry Andric } 24120b57cec5SDimitry Andric 24130b57cec5SDimitry Andric return false; 24140b57cec5SDimitry Andric } 24150b57cec5SDimitry Andric case Intrinsic::amdgcn_kernarg_segment_ptr: 24160b57cec5SDimitry Andric return legalizePreloadedArgIntrin( 24170b57cec5SDimitry Andric MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); 24180b57cec5SDimitry Andric case Intrinsic::amdgcn_implicitarg_ptr: 24190b57cec5SDimitry Andric return legalizeImplicitArgPtr(MI, MRI, B); 24200b57cec5SDimitry Andric case Intrinsic::amdgcn_workitem_id_x: 24210b57cec5SDimitry Andric return legalizePreloadedArgIntrin(MI, MRI, B, 24220b57cec5SDimitry Andric AMDGPUFunctionArgInfo::WORKITEM_ID_X); 24230b57cec5SDimitry Andric case Intrinsic::amdgcn_workitem_id_y: 24240b57cec5SDimitry Andric return 
legalizePreloadedArgIntrin(MI, MRI, B, 24250b57cec5SDimitry Andric AMDGPUFunctionArgInfo::WORKITEM_ID_Y); 24260b57cec5SDimitry Andric case Intrinsic::amdgcn_workitem_id_z: 24270b57cec5SDimitry Andric return legalizePreloadedArgIntrin(MI, MRI, B, 24280b57cec5SDimitry Andric AMDGPUFunctionArgInfo::WORKITEM_ID_Z); 24290b57cec5SDimitry Andric case Intrinsic::amdgcn_workgroup_id_x: 24300b57cec5SDimitry Andric return legalizePreloadedArgIntrin(MI, MRI, B, 24310b57cec5SDimitry Andric AMDGPUFunctionArgInfo::WORKGROUP_ID_X); 24320b57cec5SDimitry Andric case Intrinsic::amdgcn_workgroup_id_y: 24330b57cec5SDimitry Andric return legalizePreloadedArgIntrin(MI, MRI, B, 24340b57cec5SDimitry Andric AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); 24350b57cec5SDimitry Andric case Intrinsic::amdgcn_workgroup_id_z: 24360b57cec5SDimitry Andric return legalizePreloadedArgIntrin(MI, MRI, B, 24370b57cec5SDimitry Andric AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); 24380b57cec5SDimitry Andric case Intrinsic::amdgcn_dispatch_ptr: 24390b57cec5SDimitry Andric return legalizePreloadedArgIntrin(MI, MRI, B, 24400b57cec5SDimitry Andric AMDGPUFunctionArgInfo::DISPATCH_PTR); 24410b57cec5SDimitry Andric case Intrinsic::amdgcn_queue_ptr: 24420b57cec5SDimitry Andric return legalizePreloadedArgIntrin(MI, MRI, B, 24430b57cec5SDimitry Andric AMDGPUFunctionArgInfo::QUEUE_PTR); 24440b57cec5SDimitry Andric case Intrinsic::amdgcn_implicit_buffer_ptr: 24450b57cec5SDimitry Andric return legalizePreloadedArgIntrin( 24460b57cec5SDimitry Andric MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR); 24470b57cec5SDimitry Andric case Intrinsic::amdgcn_dispatch_id: 24480b57cec5SDimitry Andric return legalizePreloadedArgIntrin(MI, MRI, B, 24490b57cec5SDimitry Andric AMDGPUFunctionArgInfo::DISPATCH_ID); 24508bcb0991SDimitry Andric case Intrinsic::amdgcn_fdiv_fast: 24518bcb0991SDimitry Andric return legalizeFDIVFastIntrin(MI, MRI, B); 24528bcb0991SDimitry Andric case Intrinsic::amdgcn_is_shared: 24538bcb0991SDimitry Andric return 
legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS); 24548bcb0991SDimitry Andric case Intrinsic::amdgcn_is_private: 24558bcb0991SDimitry Andric return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); 24568bcb0991SDimitry Andric case Intrinsic::amdgcn_wavefrontsize: { 24578bcb0991SDimitry Andric B.setInstr(MI); 24588bcb0991SDimitry Andric B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); 24598bcb0991SDimitry Andric MI.eraseFromParent(); 24608bcb0991SDimitry Andric return true; 24618bcb0991SDimitry Andric } 24628bcb0991SDimitry Andric case Intrinsic::amdgcn_raw_buffer_store: 24638bcb0991SDimitry Andric return legalizeRawBufferStore(MI, MRI, B, false); 24648bcb0991SDimitry Andric case Intrinsic::amdgcn_raw_buffer_store_format: 24658bcb0991SDimitry Andric return legalizeRawBufferStore(MI, MRI, B, true); 24660b57cec5SDimitry Andric default: 24670b57cec5SDimitry Andric return true; 24680b57cec5SDimitry Andric } 24690b57cec5SDimitry Andric 24700b57cec5SDimitry Andric return true; 24710b57cec5SDimitry Andric } 2472