//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPULegalizerInfo.h"

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

// Hack until load/store selection patterns support any tuple of legal types.
static cl::opt<bool> EnableNewLegality(
    "amdgpu-global-isel-new-legality",
    cl::desc("Use GlobalISel desired legality, rather than try to use "
             "rules compatible with selection patterns"),
    cl::init(false),
    cl::ReallyHidden);

static constexpr unsigned MaxRegisterSize = 1024;

// Round the number of elements to the next power of two elements
static LLT getPow2VectorType(LLT Ty) {
  unsigned NElts = Ty.getNumElements();
  unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
  return Ty.changeNumElements(Pow2NElts);
}

// Round the number of bits to the next power of two bits
static LLT getPow2ScalarType(LLT Ty) {
  unsigned Bits = Ty.getSizeInBits();
  unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
  return LLT::scalar(Pow2Bits);
}

static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.isVector() &&
           Ty.getNumElements() % 2 != 0 &&
           Ty.getElementType().getSizeInBits() < 32 &&
           Ty.getSizeInBits() % 32 != 0;
  };
}

static LegalityPredicate isWideVec16(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getScalarType();
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
  };
}

static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    return std::make_pair(TypeIdx, LLT::vector(Ty.getNumElements() + 1, EltTy));
  };
}

static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    const LLT EltTy = Ty.getElementType();
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    return std::make_pair(TypeIdx, LLT::scalarOrVector(NewNumElts, EltTy));
  };
}

// Increase the number of vector elements to reach the next multiple of 32-bit
// type.
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];

    const LLT EltTy = Ty.getElementType();
    const int Size = Ty.getSizeInBits();
    const int EltSize = EltTy.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;

    assert(EltSize < 32);

    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
  };
}

static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();

    LLT CoercedTy;
    if (Size <= 32) {
      // <2 x s8> -> s16
      // <4 x s8> -> s32
      CoercedTy = LLT::scalar(Size);
    } else
      CoercedTy = LLT::scalarOrVector(Size / 32, 32);

    return std::make_pair(TypeIdx, CoercedTy);
  };
}

static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
  };
}

static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
  };
}

static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
  };
}

static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

static bool isRegisterVectorElementType(LLT EltTy) {
  const int EltSize = EltTy.getSizeInBits();
  return EltSize == 16 || EltSize % 32 == 0;
}

static
bool isRegisterVectorType(LLT Ty) {
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
}

static bool isRegisterType(LLT Ty) {
  if (!isRegisterSize(Ty.getSizeInBits()))
    return false;

  if (Ty.isVector())
    return isRegisterVectorType(Ty);

  return true;
}

// Any combination of 32 or 64-bit elements up to the maximum register size,
// and multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    return isRegisterType(Query.Types[TypeIdx]);
  };
}

static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT QueryTy = Query.Types[TypeIdx];
    if (!QueryTy.isVector())
      return false;
    const LLT EltTy = QueryTy.getElementType();
    return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
  };
}

static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
  };
}

// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
                                    bool IsLoad) {
  switch (AS) {
  case AMDGPUAS::PRIVATE_ADDRESS:
    // FIXME: Private element size.
    return 32;
  case AMDGPUAS::LOCAL_ADDRESS:
    return ST.useDS128() ? 128 : 64;
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
    // Treat constant and global as identical. SMRD loads are sometimes usable
    // for global loads (ideally constant address space should be eliminated)
    // depending on the context. Legality cannot be context dependent, but
    // RegBankSelect can split the load as necessary depending on the pointer
    // register bank/uniformity and if the memory is invariant or not written
    // in a kernel.
    return IsLoad ? 512 : 128;
  default:
    // Flat addresses may contextually need to be split to 32-bit parts if they
    // may alias scratch depending on the subtarget.
    return 128;
  }
}

static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
                                 const LegalityQuery &Query,
                                 unsigned Opcode) {
  const LLT Ty = Query.Types[0];

  // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
  const bool IsLoad = Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();
  unsigned MemSize = Query.MMODescrs[0].SizeInBits;
  unsigned Align = Query.MMODescrs[0].AlignInBits;
  unsigned AS = Query.Types[1].getAddressSpace();

  // All of these need to be custom lowered to cast the pointer operand.
  if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;

  // TODO: We should be able to widen loads if the alignment is high enough,
  // but we also need to modify the memory access size.
#if 0
  // Accept widening loads based on alignment.
  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);
#endif

  // Only 1-byte and 2-byte to 32-bit extloads are valid.
  if (MemSize != RegSize && RegSize != 32)
    return false;

  if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
    return false;

  switch (MemSize) {
  case 8:
  case 16:
  case 32:
  case 64:
  case 128:
    break;
  case 96:
    if (!ST.hasDwordx3LoadStores())
      return false;
    break;
  case 256:
  case 512:
    // These may contextually need to be broken down.
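    // (RegBankSelect can still split these based on the pointer register bank
    // and uniformity, as described above for the constant/global case.)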
    break;
  default:
    return false;
  }

  assert(RegSize >= MemSize);

  if (Align < MemSize) {
    const SITargetLowering *TLI = ST.getTargetLowering();
    if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8))
      return false;
  }

  return true;
}

// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// work around this. Eventually it should ignore the type for loads and only
// care about the size. Return true in cases where we will work around this
// for now by bitcasting.
static bool loadStoreBitcastWorkaround(const LLT Ty) {
  if (EnableNewLegality)
    return false;

  const unsigned Size = Ty.getSizeInBits();
  if (Size <= 64)
    return false;
  if (!Ty.isVector())
    return true;
  unsigned EltSize = Ty.getElementType().getSizeInBits();
  return EltSize != 32 && EltSize != 64;
}

static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
                             unsigned Opcode) {
  const LLT Ty = Query.Types[0];
  return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
         !loadStoreBitcastWorkaround(Ty);
}

AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                         const GCNTargetMachine &TM)
    : ST(ST_) {
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {
    return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
  };

  const LLT S1 = LLT::scalar(1);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT S128 = LLT::scalar(128);
  const LLT S256 = LLT::scalar(256);
  const LLT S512 = LLT::scalar(512);
  const LLT MaxScalar = LLT::scalar(MaxRegisterSize);

  const LLT V2S16 = LLT::vector(2, 16);
  const LLT V4S16 = LLT::vector(4, 16);

  const LLT V2S32 = LLT::vector(2, 32);
  const LLT V3S32 = LLT::vector(3, 32);
  const LLT V4S32 = LLT::vector(4, 32);
  const LLT V5S32 = LLT::vector(5, 32);
  const LLT V6S32 = LLT::vector(6, 32);
  const LLT V7S32 = LLT::vector(7, 32);
  const LLT V8S32 = LLT::vector(8, 32);
  const LLT V9S32 = LLT::vector(9, 32);
  const LLT V10S32 = LLT::vector(10, 32);
  const LLT V11S32 = LLT::vector(11, 32);
  const LLT V12S32 = LLT::vector(12, 32);
  const LLT V13S32 = LLT::vector(13, 32);
  const LLT V14S32 = LLT::vector(14, 32);
  const LLT V15S32 = LLT::vector(15, 32);
  const LLT V16S32 = LLT::vector(16, 32);
  const LLT V32S32 = LLT::vector(32, 32);

  const LLT V2S64 = LLT::vector(2, 64);
  const LLT V3S64 = LLT::vector(3, 64);
  const LLT V4S64 = LLT::vector(4, 64);
  const LLT V5S64 = LLT::vector(5, 64);
  const LLT V6S64 = LLT::vector(6, 64);
  const LLT V7S64 = LLT::vector(7, 64);
  const LLT V8S64 = LLT::vector(8, 64);
  const LLT V16S64 = LLT::vector(16, 64);

  std::initializer_list<LLT> AllS32Vectors =
    {V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
     V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
  std::initializer_list<LLT> AllS64Vectors =
    {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};

  const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
  const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
  const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
  const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
  const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
  const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
  const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> FPTypesBase = {
    S32, S64
  };

  const std::initializer_list<LLT> FPTypes16 = {
    S32, S64, S16
  };

  const std::initializer_list<LLT> FPTypesPK16 = {
    S32, S64, S16, V2S16
  };
  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;

  setAction({G_BRCOND, S1}, Legal); // VCC branches
  setAction({G_BRCOND, S32}, Legal); // SCC branches

  // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
  // elements for v3s16
  getActionDefinitionsBuilder(G_PHI)
    .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
    .legalFor(AllS32Vectors)
    .legalFor(AllS64Vectors)
    .legalFor(AddrSpaces64)
    .legalFor(AddrSpaces32)
    .clampScalar(0, S32, S256)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .legalIf(isPointer(0));

  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16, V2S16})
      .clampScalar(0, S16, S32)
      .clampMaxNumElements(0, S16, 2)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);
  } else if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32, S16})
      .clampScalar(0, S16, S32)
      .scalarize(0)
      .widenScalarToNextPow2(0, 32);
  } else {
    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
      .legalFor({S32})
      .clampScalar(0, S32, S32)
      .scalarize(0);
  }

  // FIXME: Not really legal. Placeholder for custom lowering.
  getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
    .customFor({S32, S64})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UMULH, G_SMULH})
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  // Report legal for any types we can handle anywhere. For the cases only
  // legal on the SALU, RegBankSelect will be able to re-legalize.
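  // For example, a 64-bit G_AND is a single instruction on the SALU but must
  // be split into two 32-bit operations on the VALU.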
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
    .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
    .clampScalar(0, S32, S64)
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
    .widenScalarToNextPow2(0)
    .scalarize(0);

  getActionDefinitionsBuilder({G_UADDO, G_USUBO,
                               G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
    .legalFor({{S32, S1}, {S32, S32}})
    .minScalar(0, S32)
    // TODO: .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_BITCAST)
    // Don't worry about the size constraint.
    .legalIf(all(isRegisterType(0), isRegisterType(1)))
    .lower();

  getActionDefinitionsBuilder(G_CONSTANT)
    .legalFor({S1, S32, S64, S16, GlobalPtr,
               LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(0)
    .legalIf(isPointer(0));

  getActionDefinitionsBuilder(G_FCONSTANT)
    .legalFor({S32, S64, S16})
    .clampScalar(0, S16, S64);

  getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
    .legalIf(isRegisterType(0))
    // s1 and s16 are special cases because they have legal operations on
    // them, but don't really occupy registers in the normal way.
    .legalFor({S1, S16})
    .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
    .clampScalarOrElt(0, S32, MaxScalar)
    .widenScalarToNextPow2(0, 32)
    .clampMaxNumElements(0, S32, 16);

  setAction({G_FRAME_INDEX, PrivatePtr}, Legal);

  // If the amount is divergent, we have to do a wave reduction to get the
  // maximum value, so this is expanded during RegBankSelect.
  getActionDefinitionsBuilder(G_DYN_STACKALLOC)
    .legalFor({{PrivatePtr, S32}});

  getActionDefinitionsBuilder(G_GLOBAL_VALUE)
    .unsupportedFor({PrivatePtr})
    .custom();
  setAction({G_BLOCK_ADDR, CodePtr}, Legal);

  auto &FPOpActions = getActionDefinitionsBuilder(
    {G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
    .legalFor({S32, S64});
  auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
    .customFor({S32, S64});
  auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
    .customFor({S32, S64});

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16, V2S16});
    else
      FPOpActions.legalFor({S16});

    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  }

  auto &MinNumMaxNum = getActionDefinitionsBuilder({
      G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
      .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
      .clampMaxNumElements(0, S16, 2)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else {
    MinNumMaxNum.customFor(FPTypesBase)
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  if (ST.hasVOP3PInsts())
    FPOpActions.clampMaxNumElements(0, S16, 2);

  FPOpActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  TrigActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  FDIVActions
    .scalarize(0)
    .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);

  getActionDefinitionsBuilder({G_FNEG, G_FABS})
    .legalFor(FPTypesPK16)
    .clampMaxNumElements(0, S16, 2)
    .scalarize(0)
    .clampScalar(0, S16, S64);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
      .legalFor({S32, S64, S16})
      .scalarize(0)
      .clampScalar(0, S16, S64);
  } else {
    getActionDefinitionsBuilder(G_FSQRT)
      .legalFor({S32, S64})
      .scalarize(0)
      .clampScalar(0, S32, S64);

    if (ST.hasFractBug()) {
      getActionDefinitionsBuilder(G_FFLOOR)
        .customFor({S64})
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    } else {
      getActionDefinitionsBuilder(G_FFLOOR)
        .legalFor({S32, S64})
        .scalarize(0)
        .clampScalar(0, S32, S64);
    }
  }

  getActionDefinitionsBuilder(G_FPTRUNC)
    .legalFor({{S32, S64}, {S16, S32}})
    .scalarize(0)
    .lower();

  getActionDefinitionsBuilder(G_FPEXT)
    .legalFor({{S64, S32}, {S32, S16}})
    .lowerFor({{S64, S16}}) // FIXME: Implement
    .scalarize(0);

  getActionDefinitionsBuilder(G_FSUB)
    // Use actual fsub instruction
    .legalFor({S32})
    // Must use fadd + fneg
    .lowerFor({S64, S16, V2S16})
    .scalarize(0)
    .clampScalar(0, S32, S64);

  // Whether this is legal depends on the floating point mode for the function.
  auto &FMad = getActionDefinitionsBuilder(G_FMAD);
  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});
  FMad.scalarize(0)
      .lower();

  // TODO: Do we need to clamp maximum bitwidth?
  getActionDefinitionsBuilder(G_TRUNC)
    .legalIf(isScalar(0))
    .legalFor({{V2S16, V2S32}})
    .clampMaxNumElements(0, S16, 2)
    // Avoid scalarizing in cases that should be truly illegal. In unresolvable
    // situations (like an invalid implicit use), we don't want to infinite
    // loop in the legalizer.
    .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
    .alwaysLegal();

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
    .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
               {S32, S1}, {S64, S1}, {S16, S1}})
    .scalarize(0)
    .clampScalar(0, S32, S64)
    .widenScalarToNextPow2(1, 32);

  // TODO: Split s1->s64 during regbankselect for VALU.
  auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
    .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
    .lowerFor({{S32, S64}})
    .lowerIf(typeIs(1, S1))
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    IToFP.legalFor({{S16, S16}});
  IToFP.clampScalar(1, S32, S64)
       .scalarize(0)
       .widenScalarToNextPow2(1);

  auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
    .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
    .customFor({{S64, S64}});
  if (ST.has16BitInsts())
    FPToI.legalFor({{S16, S16}});
  else
    FPToI.minScalar(1, S32);

  FPToI.minScalar(0, S32)
       .scalarize(0)
       .lower();

  getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
    .scalarize(0)
    .lower();

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S16, S32, S64})
      .clampScalar(0, S16, S64)
      .scalarize(0);
  } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32, S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  } else {
    getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
      .legalFor({S32})
      .customFor({S64})
      .clampScalar(0, S32, S64)
      .scalarize(0);
  }

  // FIXME: Clamp offset operand.
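  // Only the pointer operand's type index is constrained below; the integer
  // offset operand is currently left unclamped.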
  getActionDefinitionsBuilder(G_PTR_ADD)
    .legalIf(isPointer(0))
    .scalarize(0);

  getActionDefinitionsBuilder(G_PTRMASK)
    .legalIf(typeInSet(1, {S64, S32}))
    .minScalar(1, S32)
    .maxScalarIf(sizeIs(0, 32), 1, S32)
    .maxScalarIf(sizeIs(0, 64), 1, S64)
    .scalarize(0);

  auto &CmpBuilder =
    getActionDefinitionsBuilder(G_ICMP)
    // The compare output type differs based on the register bank of the
    // output, so make both s1 and s32 legal.
    //
    // Scalar compares producing output in scc will be promoted to s32, as that
    // is the allocatable register type that will be needed for the copy from
    // scc. This will be promoted during RegBankSelect, and we assume something
    // before that won't try to use s32 result types.
    //
    // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
    // bank.
    .legalForCartesianProduct(
      {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
    .legalForCartesianProduct(
      {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
  }

  CmpBuilder
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));

  getActionDefinitionsBuilder(G_FCMP)
    .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
    .widenScalarToNextPow2(1)
    .clampScalar(1, S32, S64)
    .scalarize(0);

  // FIXME: fpow has a selection pattern that should move to custom lowering.
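  // exp2 and log2 map directly to hardware instructions; the remaining
  // exponential/logarithm opcodes below are custom lowered in terms of them.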
  auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
  if (ST.has16BitInsts())
    Exp2Ops.legalFor({S32, S16});
  else
    Exp2Ops.legalFor({S32});
  Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
  Exp2Ops.scalarize(0);

  auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
  else
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
        .scalarize(0);

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder(G_CTPOP)
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  // The hardware instructions return a different result on 0 than the generic
  // instructions expect. The hardware produces -1, but these produce the
  // bitwidth.
  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
    .scalarize(0)
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32)
    .lower();

  // The 64-bit versions produce 32-bit results, but only on the SALU.
  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
    .legalFor({{S32, S32}, {S32, S64}})
    .clampScalar(0, S32, S32)
    .clampScalar(1, S32, S64)
    .scalarize(0)
    .widenScalarToNextPow2(0, 32)
    .widenScalarToNextPow2(1, 32);

  getActionDefinitionsBuilder(G_BITREVERSE)
    .legalFor({S32})
    .clampScalar(0, S32, S32)
    .scalarize(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S16, S32, V2S16})
      .clampMaxNumElements(0, S16, 2)
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .clampScalar(0, S16, S32)
      .scalarize(0);

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16, V2S16})
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .clampMaxNumElements(0, S16, 2)
        .minScalar(0, S16)
        .widenScalarToNextPow2(0)
        .scalarize(0)
        .lower();
    } else {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
        .legalFor({S32, S16})
        .widenScalarToNextPow2(0)
        .minScalar(0, S16)
        .scalarize(0)
        .lower();
    }
  } else {
    // TODO: Should have same legality without v_perm_b32
    getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({S32})
      .lowerIf(scalarNarrowerThan(0, 32))
      // FIXME: Fixing non-power-of-2 before clamp is workaround for
      // narrowScalar limitation.
      .widenScalarToNextPow2(0)
      .maxScalar(0, S32)
      .scalarize(0)
      .lower();

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
      .legalFor({S32})
      .minScalar(0, S32)
      .widenScalarToNextPow2(0)
      .scalarize(0)
      .lower();
  }

  getActionDefinitionsBuilder(G_INTTOPTR)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      })
    .narrowScalarIf(largerThan(1, 0),
      [](const LegalityQuery &Query) {
        return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_PTRTOINT)
    // List the common cases
    .legalForCartesianProduct(AddrSpaces64, {S64})
    .legalForCartesianProduct(AddrSpaces32, {S32})
    .scalarize(0)
    // Accept any address space as long as the size matches
    .legalIf(sameSize(0, 1))
    .widenScalarIf(smallerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      })
    .narrowScalarIf(
      largerThan(0, 1),
      [](const LegalityQuery &Query) {
        return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
      });

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
    .scalarize(0)
    .custom();

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    const LLT DstTy = Query.Types[0];

    // Split vector extloads.
    unsigned MemSize = Query.MMODescrs[0].SizeInBits;
    unsigned Align = Query.MMODescrs[0].AlignInBits;

    if (MemSize < DstTy.getSizeInBits())
      MemSize = std::max(MemSize, Align);

    if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
      return true;

    const LLT PtrTy = Query.Types[1];
    unsigned AS = PtrTy.getAddressSpace();
    if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
      return true;

    // Catch weird sized loads that don't evenly divide into the access sizes
    // TODO: May be able to widen depending on alignment etc.
    unsigned NumRegs = (MemSize + 31) / 32;
    if (NumRegs == 3) {
      if (!ST.hasDwordx3LoadStores())
        return true;
    } else {
      // If the alignment allows, these should have been widened.
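      // A non-power-of-2 register count remaining at this point must be
      // split.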
      if (!isPowerOf2_32(NumRegs))
        return true;
    }

    if (Align < MemSize) {
      const SITargetLowering *TLI = ST.getTargetLowering();
      return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
    }

    return false;
  };

  const auto shouldWidenLoadResult = [=](const LegalityQuery &Query,
                                         unsigned Opc) -> bool {
    unsigned Size = Query.Types[0].getSizeInBits();
    if (isPowerOf2_32(Size))
      return false;

    if (Size == 96 && ST.hasDwordx3LoadStores())
      return false;

    unsigned AddrSpace = Query.Types[1].getAddressSpace();
    if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc))
      return false;

    unsigned Align = Query.MMODescrs[0].AlignInBits;
    unsigned RoundedSize = NextPowerOf2(Size);
    return (Align >= RoundedSize);
  };

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;

  // TODO: Refine based on subtargets which support unaligned access or 128-bit
  // LDS
  // TODO: Unsupported flat for SI.

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);
    // Explicitly list some common cases.
    // TODO: Does this help compile time at all?
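    // Each entry is {register type, pointer type, memory size in bits,
    // minimum alignment in bits}; an alignment of 0 means unrestricted.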
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
                                      {V2S32, GlobalPtr, 64, GlobalAlign32},
                                      {V4S32, GlobalPtr, 128, GlobalAlign32},
                                      {S64, GlobalPtr, 64, GlobalAlign32},
                                      {V2S64, GlobalPtr, 128, GlobalAlign32},
                                      {V2S16, GlobalPtr, 32, GlobalAlign32},
                                      {S32, GlobalPtr, 8, GlobalAlign8},
                                      {S32, GlobalPtr, 16, GlobalAlign16},

                                      {S32, LocalPtr, 32, 32},
                                      {S64, LocalPtr, 64, 32},
                                      {V2S32, LocalPtr, 64, 32},
                                      {S32, LocalPtr, 8, 8},
                                      {S32, LocalPtr, 16, 16},
                                      {V2S16, LocalPtr, 32, 32},

                                      {S32, PrivatePtr, 32, 32},
                                      {S32, PrivatePtr, 8, 8},
                                      {S32, PrivatePtr, 16, 16},
                                      {V2S16, PrivatePtr, 32, 32},

                                      {S32, ConstantPtr, 32, GlobalAlign32},
                                      {V2S32, ConstantPtr, 64, GlobalAlign32},
                                      {V4S32, ConstantPtr, 128, GlobalAlign32},
                                      {S64, ConstantPtr, 64, GlobalAlign32},
                                      {V2S32, ConstantPtr, 32, GlobalAlign32}});
    Actions.legalIf(
      [=](const LegalityQuery &Query) -> bool {
        return isLoadStoreLegal(ST, Query, Op);
      });

    // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
    // 64-bits.
    //
    // TODO: Should generalize bitcast action into coerce, which will also
    // cover inserting addrspacecasts.
    Actions.customIf(typeIs(1, Constant32Ptr));

    // Turn any illegal element vectors into something easier to deal
    // with. These will ultimately produce 32-bit scalar shifts to extract the
    // parts anyway.
    //
    // For odd 16-bit element vectors, prefer to split those into pieces with
    // 16-bit vector parts.
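    // For example, a <6 x s16> load is bitcast to <3 x s32> by
    // bitcastToRegisterType.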
    Actions.bitcastIf(
      [=](const LegalityQuery &Query) -> bool {
        const LLT Ty = Query.Types[0];
        const unsigned Size = Ty.getSizeInBits();

        if (Size != Query.MMODescrs[0].SizeInBits)
          return Size <= 32 && Ty.isVector();

        if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
          return true;
        return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
               !isRegisterVectorElementType(Ty.getElementType());
      }, bitcastToRegisterType(0));

    Actions
        .customIf(typeIs(1, Constant32Ptr))
        // Widen suitably aligned loads by loading extra elements.
        .moreElementsIf([=](const LegalityQuery &Query) {
            const LLT Ty = Query.Types[0];
            return Op == G_LOAD && Ty.isVector() &&
                   shouldWidenLoadResult(Query, Op);
          }, moreElementsToNextPow2(0))
        .widenScalarIf([=](const LegalityQuery &Query) {
            const LLT Ty = Query.Types[0];
            return Op == G_LOAD && !Ty.isVector() &&
                   shouldWidenLoadResult(Query, Op);
          }, widenScalarOrEltToNextPow2(0))
        .narrowScalarIf(
          [=](const LegalityQuery &Query) -> bool {
            return !Query.Types[0].isVector() &&
                   needToSplitMemOp(Query, Op == G_LOAD);
          },
          [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
            const LLT DstTy = Query.Types[0];
            const LLT PtrTy = Query.Types[1];

            const unsigned DstSize = DstTy.getSizeInBits();
            unsigned MemSize = Query.MMODescrs[0].SizeInBits;

            // Split extloads.
            if (DstSize > MemSize)
              return std::make_pair(0, LLT::scalar(MemSize));

            if (!isPowerOf2_32(DstSize)) {
              // We're probably decomposing an odd sized store. Try to split
              // to the widest type. TODO: Account for alignment. As-is it
              // should be OK, since the new parts will be further legalized.
              unsigned FloorSize = PowerOf2Floor(DstSize);
              return std::make_pair(0, LLT::scalar(FloorSize));
            }

            if (DstSize > 32 && (DstSize % 32 != 0)) {
              // FIXME: Need a way to specify non-extload of larger size if
              // suitably aligned.
10368bcb0991SDimitry Andric return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); 10378bcb0991SDimitry Andric } 10388bcb0991SDimitry Andric 1039*5ffd83dbSDimitry Andric unsigned MaxSize = maxSizeForAddrSpace(ST, 1040*5ffd83dbSDimitry Andric PtrTy.getAddressSpace(), 1041*5ffd83dbSDimitry Andric Op == G_LOAD); 10428bcb0991SDimitry Andric if (MemSize > MaxSize) 10438bcb0991SDimitry Andric return std::make_pair(0, LLT::scalar(MaxSize)); 10448bcb0991SDimitry Andric 10458bcb0991SDimitry Andric unsigned Align = Query.MMODescrs[0].AlignInBits; 10468bcb0991SDimitry Andric return std::make_pair(0, LLT::scalar(Align)); 10478bcb0991SDimitry Andric }) 10488bcb0991SDimitry Andric .fewerElementsIf( 10498bcb0991SDimitry Andric [=](const LegalityQuery &Query) -> bool { 1050*5ffd83dbSDimitry Andric return Query.Types[0].isVector() && 1051*5ffd83dbSDimitry Andric needToSplitMemOp(Query, Op == G_LOAD); 10528bcb0991SDimitry Andric }, 10538bcb0991SDimitry Andric [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { 10548bcb0991SDimitry Andric const LLT DstTy = Query.Types[0]; 10558bcb0991SDimitry Andric const LLT PtrTy = Query.Types[1]; 10568bcb0991SDimitry Andric 10578bcb0991SDimitry Andric LLT EltTy = DstTy.getElementType(); 1058*5ffd83dbSDimitry Andric unsigned MaxSize = maxSizeForAddrSpace(ST, 1059*5ffd83dbSDimitry Andric PtrTy.getAddressSpace(), 1060*5ffd83dbSDimitry Andric Op == G_LOAD); 1061*5ffd83dbSDimitry Andric 1062*5ffd83dbSDimitry Andric // FIXME: Handle widened to power of 2 results better. This ends 1063*5ffd83dbSDimitry Andric // up scalarizing. 1064*5ffd83dbSDimitry Andric // FIXME: 3 element stores scalarized on SI 10658bcb0991SDimitry Andric 10668bcb0991SDimitry Andric // Split if it's too large for the address space. 10678bcb0991SDimitry Andric if (Query.MMODescrs[0].SizeInBits > MaxSize) { 10688bcb0991SDimitry Andric unsigned NumElts = DstTy.getNumElements(); 1069*5ffd83dbSDimitry Andric unsigned EltSize = EltTy.getSizeInBits(); 1070*5ffd83dbSDimitry Andric 1071*5ffd83dbSDimitry Andric if (MaxSize % EltSize == 0) { 1072*5ffd83dbSDimitry Andric return std::make_pair( 1073*5ffd83dbSDimitry Andric 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy)); 1074*5ffd83dbSDimitry Andric } 1075*5ffd83dbSDimitry Andric 10768bcb0991SDimitry Andric unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; 10778bcb0991SDimitry Andric 10788bcb0991SDimitry Andric // FIXME: Refine when odd breakdowns handled 10798bcb0991SDimitry Andric // The scalars will need to be re-legalized. 10808bcb0991SDimitry Andric if (NumPieces == 1 || NumPieces >= NumElts || 10818bcb0991SDimitry Andric NumElts % NumPieces != 0) 10828bcb0991SDimitry Andric return std::make_pair(0, EltTy); 10838bcb0991SDimitry Andric 10848bcb0991SDimitry Andric return std::make_pair(0, 10858bcb0991SDimitry Andric LLT::vector(NumElts / NumPieces, EltTy)); 10868bcb0991SDimitry Andric } 10878bcb0991SDimitry Andric 1088*5ffd83dbSDimitry Andric // FIXME: We could probably handle weird extending loads better. 1089*5ffd83dbSDimitry Andric unsigned MemSize = Query.MMODescrs[0].SizeInBits; 1090*5ffd83dbSDimitry Andric if (DstTy.getSizeInBits() > MemSize) 1091*5ffd83dbSDimitry Andric return std::make_pair(0, EltTy); 1092*5ffd83dbSDimitry Andric 1093*5ffd83dbSDimitry Andric unsigned EltSize = EltTy.getSizeInBits(); 1094*5ffd83dbSDimitry Andric unsigned DstSize = DstTy.getSizeInBits(); 1095*5ffd83dbSDimitry Andric if (!isPowerOf2_32(DstSize)) { 1096*5ffd83dbSDimitry Andric // We're probably decomposing an odd sized store. 
Try to split 1097*5ffd83dbSDimitry Andric // to the widest type. TODO: Account for alignment. As-is it 1098*5ffd83dbSDimitry Andric // should be OK, since the new parts will be further legalized. 1099*5ffd83dbSDimitry Andric unsigned FloorSize = PowerOf2Floor(DstSize); 1100*5ffd83dbSDimitry Andric return std::make_pair( 1101*5ffd83dbSDimitry Andric 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy)); 1102*5ffd83dbSDimitry Andric } 1103*5ffd83dbSDimitry Andric 11048bcb0991SDimitry Andric // Need to split because of alignment. 11058bcb0991SDimitry Andric unsigned Align = Query.MMODescrs[0].AlignInBits; 11068bcb0991SDimitry Andric if (EltSize > Align && 11078bcb0991SDimitry Andric (EltSize / Align < DstTy.getNumElements())) { 11088bcb0991SDimitry Andric return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); 11098bcb0991SDimitry Andric } 11108bcb0991SDimitry Andric 11118bcb0991SDimitry Andric // May need relegalization for the scalars. 11128bcb0991SDimitry Andric return std::make_pair(0, EltTy); 11138bcb0991SDimitry Andric }) 11148bcb0991SDimitry Andric .minScalar(0, S32); 11158bcb0991SDimitry Andric 11168bcb0991SDimitry Andric if (IsStore) 11178bcb0991SDimitry Andric Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); 11188bcb0991SDimitry Andric 11198bcb0991SDimitry Andric // TODO: Need a bitcast lower option? 11208bcb0991SDimitry Andric Actions 11218bcb0991SDimitry Andric .widenScalarToNextPow2(0) 11228bcb0991SDimitry Andric .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)); 11238bcb0991SDimitry Andric } 11240b57cec5SDimitry Andric 11250b57cec5SDimitry Andric auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) 11268bcb0991SDimitry Andric .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8}, 11278bcb0991SDimitry Andric {S32, GlobalPtr, 16, 2 * 8}, 11280b57cec5SDimitry Andric {S32, LocalPtr, 8, 8}, 11298bcb0991SDimitry Andric {S32, LocalPtr, 16, 16}, 11300b57cec5SDimitry Andric {S32, PrivatePtr, 8, 8}, 11318bcb0991SDimitry Andric {S32, PrivatePtr, 16, 16}, 11328bcb0991SDimitry Andric {S32, ConstantPtr, 8, 8}, 11338bcb0991SDimitry Andric {S32, ConstantPtr, 16, 2 * 8}}); 11340b57cec5SDimitry Andric if (ST.hasFlatAddressSpace()) { 11358bcb0991SDimitry Andric ExtLoads.legalForTypesWithMemDesc( 11368bcb0991SDimitry Andric {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}}); 11370b57cec5SDimitry Andric } 11380b57cec5SDimitry Andric 11390b57cec5SDimitry Andric ExtLoads.clampScalar(0, S32, S32) 11400b57cec5SDimitry Andric .widenScalarToNextPow2(0) 11410b57cec5SDimitry Andric .unsupportedIfMemSizeNotPow2() 11420b57cec5SDimitry Andric .lower(); 11430b57cec5SDimitry Andric 11440b57cec5SDimitry Andric auto &Atomics = getActionDefinitionsBuilder( 11450b57cec5SDimitry Andric {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, 11460b57cec5SDimitry Andric G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR, 11470b57cec5SDimitry Andric G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, 1148480093f4SDimitry Andric G_ATOMICRMW_UMIN}) 11490b57cec5SDimitry Andric .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, 11500b57cec5SDimitry Andric {S64, GlobalPtr}, {S64, LocalPtr}}); 11510b57cec5SDimitry Andric if (ST.hasFlatAddressSpace()) { 11520b57cec5SDimitry Andric Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); 11530b57cec5SDimitry Andric } 11540b57cec5SDimitry Andric 1155*5ffd83dbSDimitry Andric if (ST.hasLDSFPAtomics()) { 11568bcb0991SDimitry Andric getActionDefinitionsBuilder(G_ATOMICRMW_FADD) 11578bcb0991SDimitry Andric .legalFor({{S32, LocalPtr}}); 
1158*5ffd83dbSDimitry Andric } 11598bcb0991SDimitry Andric 1160480093f4SDimitry Andric // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output 1161480093f4SDimitry Andric // demarshalling 1162480093f4SDimitry Andric getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) 1163480093f4SDimitry Andric .customFor({{S32, GlobalPtr}, {S64, GlobalPtr}, 1164480093f4SDimitry Andric {S32, FlatPtr}, {S64, FlatPtr}}) 1165480093f4SDimitry Andric .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, 1166480093f4SDimitry Andric {S32, RegionPtr}, {S64, RegionPtr}}); 11670b57cec5SDimitry Andric // TODO: Pointer types, any 32-bit or 64-bit vector 1168480093f4SDimitry Andric 1169480093f4SDimitry Andric // Condition should be s32 for scalar, s1 for vector. 11700b57cec5SDimitry Andric getActionDefinitionsBuilder(G_SELECT) 11710b57cec5SDimitry Andric .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, 11720b57cec5SDimitry Andric GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, 1173480093f4SDimitry Andric LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) 11740b57cec5SDimitry Andric .clampScalar(0, S16, S64) 1175*5ffd83dbSDimitry Andric .scalarize(1) 11760b57cec5SDimitry Andric .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) 11770b57cec5SDimitry Andric .fewerElementsIf(numElementsNotEven(0), scalarize(0)) 11780b57cec5SDimitry Andric .clampMaxNumElements(0, S32, 2) 11790b57cec5SDimitry Andric .clampMaxNumElements(0, LocalPtr, 2) 11800b57cec5SDimitry Andric .clampMaxNumElements(0, PrivatePtr, 2) 11810b57cec5SDimitry Andric .scalarize(0) 11820b57cec5SDimitry Andric .widenScalarToNextPow2(0) 1183480093f4SDimitry Andric .legalIf(all(isPointer(0), typeInSet(1, {S1, S32}))); 11840b57cec5SDimitry Andric 11850b57cec5SDimitry Andric // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can 11860b57cec5SDimitry Andric // be more flexible with the shift amount type. 11870b57cec5SDimitry Andric auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR}) 11880b57cec5SDimitry Andric .legalFor({{S32, S32}, {S64, S32}}); 11890b57cec5SDimitry Andric if (ST.has16BitInsts()) { 11900b57cec5SDimitry Andric if (ST.hasVOP3PInsts()) { 1191*5ffd83dbSDimitry Andric Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) 11920b57cec5SDimitry Andric .clampMaxNumElements(0, S16, 2); 11930b57cec5SDimitry Andric } else 1194*5ffd83dbSDimitry Andric Shifts.legalFor({{S16, S16}}); 11950b57cec5SDimitry Andric 1196*5ffd83dbSDimitry Andric // TODO: Support 16-bit shift amounts for all types 1197*5ffd83dbSDimitry Andric Shifts.widenScalarIf( 1198*5ffd83dbSDimitry Andric [=](const LegalityQuery &Query) { 1199*5ffd83dbSDimitry Andric // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a 1200*5ffd83dbSDimitry Andric // 32-bit amount. 
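// e.g. (shl s16, s8) gets an s16 amount here, while (shl s32, s8) falls
// through and the clampScalar below widens the amount to s32.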
1201*5ffd83dbSDimitry Andric const LLT ValTy = Query.Types[0]; 1202*5ffd83dbSDimitry Andric const LLT AmountTy = Query.Types[1]; 1203*5ffd83dbSDimitry Andric return ValTy.getSizeInBits() <= 16 && 1204*5ffd83dbSDimitry Andric AmountTy.getSizeInBits() < 16; 1205*5ffd83dbSDimitry Andric }, changeTo(1, S16)); 1206*5ffd83dbSDimitry Andric Shifts.maxScalarIf(typeIs(0, S16), 1, S16); 1207480093f4SDimitry Andric Shifts.clampScalar(1, S32, S32); 12080b57cec5SDimitry Andric Shifts.clampScalar(0, S16, S64); 12090b57cec5SDimitry Andric Shifts.widenScalarToNextPow2(0, 16); 12100b57cec5SDimitry Andric } else { 12110b57cec5SDimitry Andric // Make sure we legalize the shift amount type first, as the general 12120b57cec5SDimitry Andric // expansion for the shifted type will produce much worse code if it hasn't 12130b57cec5SDimitry Andric // been truncated already. 12140b57cec5SDimitry Andric Shifts.clampScalar(1, S32, S32); 12150b57cec5SDimitry Andric Shifts.clampScalar(0, S32, S64); 12160b57cec5SDimitry Andric Shifts.widenScalarToNextPow2(0, 32); 12170b57cec5SDimitry Andric } 12180b57cec5SDimitry Andric Shifts.scalarize(0); 12190b57cec5SDimitry Andric 12200b57cec5SDimitry Andric for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) { 12210b57cec5SDimitry Andric unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0; 12220b57cec5SDimitry Andric unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1; 12230b57cec5SDimitry Andric unsigned IdxTypeIdx = 2; 12240b57cec5SDimitry Andric 12250b57cec5SDimitry Andric getActionDefinitionsBuilder(Op) 12260b57cec5SDimitry Andric .customIf([=](const LegalityQuery &Query) { 12270b57cec5SDimitry Andric const LLT EltTy = Query.Types[EltTypeIdx]; 12280b57cec5SDimitry Andric const LLT VecTy = Query.Types[VecTypeIdx]; 12290b57cec5SDimitry Andric const LLT IdxTy = Query.Types[IdxTypeIdx]; 12300b57cec5SDimitry Andric return (EltTy.getSizeInBits() == 16 || 12310b57cec5SDimitry Andric EltTy.getSizeInBits() % 32 == 0) && 12320b57cec5SDimitry Andric VecTy.getSizeInBits() % 32 == 0 && 1233*5ffd83dbSDimitry Andric VecTy.getSizeInBits() <= MaxRegisterSize && 12340b57cec5SDimitry Andric IdxTy.getSizeInBits() == 32; 12350b57cec5SDimitry Andric }) 12360b57cec5SDimitry Andric .clampScalar(EltTypeIdx, S32, S64) 12370b57cec5SDimitry Andric .clampScalar(VecTypeIdx, S32, S64) 12380b57cec5SDimitry Andric .clampScalar(IdxTypeIdx, S32, S32); 12390b57cec5SDimitry Andric } 12400b57cec5SDimitry Andric 12410b57cec5SDimitry Andric getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) 12420b57cec5SDimitry Andric .unsupportedIf([=](const LegalityQuery &Query) { 12430b57cec5SDimitry Andric const LLT &EltTy = Query.Types[1].getElementType(); 12440b57cec5SDimitry Andric return Query.Types[0] != EltTy; 12450b57cec5SDimitry Andric }); 12460b57cec5SDimitry Andric 12470b57cec5SDimitry Andric for (unsigned Op : {G_EXTRACT, G_INSERT}) { 12480b57cec5SDimitry Andric unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0; 12490b57cec5SDimitry Andric unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1; 12500b57cec5SDimitry Andric 12510b57cec5SDimitry Andric // FIXME: Doesn't handle extract of illegal sizes. 12520b57cec5SDimitry Andric getActionDefinitionsBuilder(Op) 12538bcb0991SDimitry Andric .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32))) 12548bcb0991SDimitry Andric // FIXME: Multiples of 16 should not be legal. 
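// e.g. extracting an s16 from an s64 is accepted by the rule below, but an
// s8 from an s32 is not (8 is not a multiple of 16).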
12550b57cec5SDimitry Andric .legalIf([=](const LegalityQuery &Query) {
12560b57cec5SDimitry Andric const LLT BigTy = Query.Types[BigTyIdx];
12570b57cec5SDimitry Andric const LLT LitTy = Query.Types[LitTyIdx];
12580b57cec5SDimitry Andric return (BigTy.getSizeInBits() % 32 == 0) &&
12590b57cec5SDimitry Andric (LitTy.getSizeInBits() % 16 == 0);
12600b57cec5SDimitry Andric })
12610b57cec5SDimitry Andric .widenScalarIf(
12620b57cec5SDimitry Andric [=](const LegalityQuery &Query) {
12630b57cec5SDimitry Andric const LLT BigTy = Query.Types[BigTyIdx];
12640b57cec5SDimitry Andric return (BigTy.getScalarSizeInBits() < 16);
12650b57cec5SDimitry Andric },
12660b57cec5SDimitry Andric LegalizeMutations::widenScalarOrEltToNextPow2(BigTyIdx, 16))
12670b57cec5SDimitry Andric .widenScalarIf(
12680b57cec5SDimitry Andric [=](const LegalityQuery &Query) {
12690b57cec5SDimitry Andric const LLT LitTy = Query.Types[LitTyIdx];
12700b57cec5SDimitry Andric return (LitTy.getScalarSizeInBits() < 16);
12710b57cec5SDimitry Andric },
12720b57cec5SDimitry Andric LegalizeMutations::widenScalarOrEltToNextPow2(LitTyIdx, 16))
12730b57cec5SDimitry Andric .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
12740b57cec5SDimitry Andric .widenScalarToNextPow2(BigTyIdx, 32);
12750b57cec5SDimitry Andric 
12760b57cec5SDimitry Andric }
12770b57cec5SDimitry Andric 
12788bcb0991SDimitry Andric auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
12790b57cec5SDimitry Andric .legalForCartesianProduct(AllS32Vectors, {S32})
12800b57cec5SDimitry Andric .legalForCartesianProduct(AllS64Vectors, {S64})
12818bcb0991SDimitry Andric .clampNumElements(0, V16S32, V32S32)
12828bcb0991SDimitry Andric .clampNumElements(0, V2S64, V16S64)
12838bcb0991SDimitry Andric .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
12848bcb0991SDimitry Andric 
12858bcb0991SDimitry Andric if (ST.hasScalarPackInsts()) {
1286*5ffd83dbSDimitry Andric BuildVector
1287*5ffd83dbSDimitry Andric // FIXME: Should probably widen s1 vectors straight to s32
1288*5ffd83dbSDimitry Andric .minScalarOrElt(0, S16)
1289*5ffd83dbSDimitry Andric // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
1290*5ffd83dbSDimitry Andric .minScalar(1, S32);
1291*5ffd83dbSDimitry Andric 
12928bcb0991SDimitry Andric getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
12938bcb0991SDimitry Andric .legalFor({V2S16, S32})
12948bcb0991SDimitry Andric .lower();
1295*5ffd83dbSDimitry Andric BuildVector.minScalarOrElt(0, S32);
12968bcb0991SDimitry Andric } else {
1297*5ffd83dbSDimitry Andric BuildVector.customFor({V2S16, S16});
1298*5ffd83dbSDimitry Andric BuildVector.minScalarOrElt(0, S32);
1299*5ffd83dbSDimitry Andric 
13008bcb0991SDimitry Andric getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
1301*5ffd83dbSDimitry Andric .customFor({V2S16, S32})
13028bcb0991SDimitry Andric .lower();
13038bcb0991SDimitry Andric }
13048bcb0991SDimitry Andric 
1305*5ffd83dbSDimitry Andric BuildVector.legalIf(isRegisterType(0));
1306*5ffd83dbSDimitry Andric 
1307*5ffd83dbSDimitry Andric // FIXME: Clamp maximum size
13080b57cec5SDimitry Andric getActionDefinitionsBuilder(G_CONCAT_VECTORS)
13090b57cec5SDimitry Andric .legalIf(isRegisterType(0));
13100b57cec5SDimitry Andric 
1311*5ffd83dbSDimitry Andric // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
1312*5ffd83dbSDimitry Andric // pre-legalize.
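// e.g. a <2 x s16> shuffle whose mask can be matched by a VOP3P instruction
// is kept as-is by the custom rule below; all other shuffles are lowered to
// extracts and inserts.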
1313*5ffd83dbSDimitry Andric if (ST.hasVOP3PInsts()) {
1314*5ffd83dbSDimitry Andric getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
1315*5ffd83dbSDimitry Andric .customFor({V2S16, V2S16})
1316*5ffd83dbSDimitry Andric .lower();
1317*5ffd83dbSDimitry Andric } else
13188bcb0991SDimitry Andric getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
13198bcb0991SDimitry Andric 
13200b57cec5SDimitry Andric // Merge/Unmerge
13210b57cec5SDimitry Andric for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
13220b57cec5SDimitry Andric unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
13230b57cec5SDimitry Andric unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
13240b57cec5SDimitry Andric 
13250b57cec5SDimitry Andric auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
1326*5ffd83dbSDimitry Andric const LLT Ty = Query.Types[TypeIdx];
13270b57cec5SDimitry Andric if (Ty.isVector()) {
13280b57cec5SDimitry Andric const LLT &EltTy = Ty.getElementType();
1329*5ffd83dbSDimitry Andric if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
13300b57cec5SDimitry Andric return true;
13310b57cec5SDimitry Andric if (!isPowerOf2_32(EltTy.getSizeInBits()))
13320b57cec5SDimitry Andric return true;
13330b57cec5SDimitry Andric }
13340b57cec5SDimitry Andric return false;
13350b57cec5SDimitry Andric };
13360b57cec5SDimitry Andric 
13378bcb0991SDimitry Andric auto &Builder = getActionDefinitionsBuilder(Op)
1338*5ffd83dbSDimitry Andric .lowerFor({{S16, V2S16}})
1339*5ffd83dbSDimitry Andric .lowerIf([=](const LegalityQuery &Query) {
1340*5ffd83dbSDimitry Andric const LLT BigTy = Query.Types[BigTyIdx];
1341*5ffd83dbSDimitry Andric return BigTy.getSizeInBits() == 32;
1342*5ffd83dbSDimitry Andric })
1343*5ffd83dbSDimitry Andric // Try to widen to s16 first for small types.
1344*5ffd83dbSDimitry Andric // TODO: Only do this on targets with legal s16 shifts
1345*5ffd83dbSDimitry Andric .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
13460b57cec5SDimitry Andric .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
13478bcb0991SDimitry Andric .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
13488bcb0991SDimitry Andric .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
13498bcb0991SDimitry Andric elementTypeIs(1, S16)),
13508bcb0991SDimitry Andric changeTo(1, V2S16))
1351*5ffd83dbSDimitry Andric // Clamp the little scalar to s32-s512 and make it a power of 2. It's not
1352*5ffd83dbSDimitry Andric // worth considering the multiples of 64 since 2*192 and 2*384 are not
1353*5ffd83dbSDimitry Andric // valid.
1354*5ffd83dbSDimitry Andric .clampScalar(LitTyIdx, S32, S512)
1355*5ffd83dbSDimitry Andric .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
13560b57cec5SDimitry Andric // Break up vectors with weird elements into scalars
13570b57cec5SDimitry Andric .fewerElementsIf(
1358*5ffd83dbSDimitry Andric [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
13590b57cec5SDimitry Andric scalarize(0))
13600b57cec5SDimitry Andric .fewerElementsIf(
1361*5ffd83dbSDimitry Andric [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
13620b57cec5SDimitry Andric scalarize(1))
1363*5ffd83dbSDimitry Andric .clampScalar(BigTyIdx, S32, MaxScalar);
13648bcb0991SDimitry Andric 
13658bcb0991SDimitry Andric if (Op == G_MERGE_VALUES) {
13668bcb0991SDimitry Andric Builder.widenScalarIf(
13678bcb0991SDimitry Andric // TODO: Use 16-bit shifts if legal for 8-bit values?
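// e.g. a G_MERGE_VALUES of two s8 sources has each source widened to s32
// before the merge is expanded with shifts.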
13680b57cec5SDimitry Andric [=](const LegalityQuery &Query) { 13698bcb0991SDimitry Andric const LLT Ty = Query.Types[LitTyIdx]; 13708bcb0991SDimitry Andric return Ty.getSizeInBits() < 32; 13718bcb0991SDimitry Andric }, 13728bcb0991SDimitry Andric changeTo(LitTyIdx, S32)); 13738bcb0991SDimitry Andric } 13748bcb0991SDimitry Andric 13758bcb0991SDimitry Andric Builder.widenScalarIf( 13768bcb0991SDimitry Andric [=](const LegalityQuery &Query) { 13778bcb0991SDimitry Andric const LLT Ty = Query.Types[BigTyIdx]; 13780b57cec5SDimitry Andric return !isPowerOf2_32(Ty.getSizeInBits()) && 13790b57cec5SDimitry Andric Ty.getSizeInBits() % 16 != 0; 13800b57cec5SDimitry Andric }, 13810b57cec5SDimitry Andric [=](const LegalityQuery &Query) { 13820b57cec5SDimitry Andric // Pick the next power of 2, or a multiple of 64 over 128. 13830b57cec5SDimitry Andric // Whichever is smaller. 13840b57cec5SDimitry Andric const LLT &Ty = Query.Types[BigTyIdx]; 13850b57cec5SDimitry Andric unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1); 13860b57cec5SDimitry Andric if (NewSizeInBits >= 256) { 13870b57cec5SDimitry Andric unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1); 13880b57cec5SDimitry Andric if (RoundedTo < NewSizeInBits) 13890b57cec5SDimitry Andric NewSizeInBits = RoundedTo; 13900b57cec5SDimitry Andric } 13910b57cec5SDimitry Andric return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); 13920b57cec5SDimitry Andric }) 13930b57cec5SDimitry Andric .legalIf([=](const LegalityQuery &Query) { 13940b57cec5SDimitry Andric const LLT &BigTy = Query.Types[BigTyIdx]; 13950b57cec5SDimitry Andric const LLT &LitTy = Query.Types[LitTyIdx]; 13960b57cec5SDimitry Andric 13970b57cec5SDimitry Andric if (BigTy.isVector() && BigTy.getSizeInBits() < 32) 13980b57cec5SDimitry Andric return false; 13990b57cec5SDimitry Andric if (LitTy.isVector() && LitTy.getSizeInBits() < 32) 14000b57cec5SDimitry Andric return false; 14010b57cec5SDimitry Andric 14020b57cec5SDimitry Andric return BigTy.getSizeInBits() % 16 == 0 && 14030b57cec5SDimitry Andric LitTy.getSizeInBits() % 16 == 0 && 1404*5ffd83dbSDimitry Andric BigTy.getSizeInBits() <= MaxRegisterSize; 14050b57cec5SDimitry Andric }) 14060b57cec5SDimitry Andric // Any vectors left are the wrong size. Scalarize them. 14070b57cec5SDimitry Andric .scalarize(0) 14080b57cec5SDimitry Andric .scalarize(1); 14090b57cec5SDimitry Andric } 14100b57cec5SDimitry Andric 1411*5ffd83dbSDimitry Andric // S64 is only legal on SALU, and needs to be broken into 32-bit elements in 1412*5ffd83dbSDimitry Andric // RegBankSelect. 1413*5ffd83dbSDimitry Andric auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG) 1414*5ffd83dbSDimitry Andric .legalFor({{S32}, {S64}}); 14158bcb0991SDimitry Andric 1416*5ffd83dbSDimitry Andric if (ST.hasVOP3PInsts()) { 1417*5ffd83dbSDimitry Andric SextInReg.lowerFor({{V2S16}}) 1418*5ffd83dbSDimitry Andric // Prefer to reduce vector widths for 16-bit vectors before lowering, to 1419*5ffd83dbSDimitry Andric // get more vector shift opportunities, since we'll get those when 1420*5ffd83dbSDimitry Andric // expanded. 1421*5ffd83dbSDimitry Andric .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)); 1422*5ffd83dbSDimitry Andric } else if (ST.has16BitInsts()) { 1423*5ffd83dbSDimitry Andric SextInReg.lowerFor({{S32}, {S64}, {S16}}); 1424*5ffd83dbSDimitry Andric } else { 1425*5ffd83dbSDimitry Andric // Prefer to promote to s32 before lowering if we don't have 16-bit 1426*5ffd83dbSDimitry Andric // shifts. 
This avoids a lot of intermediate truncate and extend operations.
1427*5ffd83dbSDimitry Andric SextInReg.lowerFor({{S32}, {S64}});
1428*5ffd83dbSDimitry Andric }
1429*5ffd83dbSDimitry Andric 
1430*5ffd83dbSDimitry Andric // FIXME: Placeholder rule. Really depends on whether the clamp modifier is
1431*5ffd83dbSDimitry Andric // available, and is selectively legal for s16, s32, v2s16.
1432*5ffd83dbSDimitry Andric getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT, G_UADDSAT, G_USUBSAT})
1433*5ffd83dbSDimitry Andric .scalarize(0)
1434*5ffd83dbSDimitry Andric .clampScalar(0, S16, S32);
1435*5ffd83dbSDimitry Andric 
1436*5ffd83dbSDimitry Andric SextInReg
1437*5ffd83dbSDimitry Andric .scalarize(0)
1438*5ffd83dbSDimitry Andric .clampScalar(0, S32, S64)
1439*5ffd83dbSDimitry Andric .lower();
1440*5ffd83dbSDimitry Andric 
1441*5ffd83dbSDimitry Andric getActionDefinitionsBuilder(G_FSHR)
1442*5ffd83dbSDimitry Andric .legalFor({{S32, S32}})
1443*5ffd83dbSDimitry Andric .scalarize(0)
1444*5ffd83dbSDimitry Andric .lower();
1445480093f4SDimitry Andric 
1446480093f4SDimitry Andric getActionDefinitionsBuilder(G_READCYCLECOUNTER)
1447480093f4SDimitry Andric .legalFor({S64});
1448480093f4SDimitry Andric 
1449*5ffd83dbSDimitry Andric getActionDefinitionsBuilder({
1450*5ffd83dbSDimitry Andric // TODO: Verify V_BFI_B32 is generated from expanded bit ops
1451*5ffd83dbSDimitry Andric G_FCOPYSIGN,
1452*5ffd83dbSDimitry Andric 
1453*5ffd83dbSDimitry Andric G_ATOMIC_CMPXCHG_WITH_SUCCESS,
1454*5ffd83dbSDimitry Andric G_READ_REGISTER,
1455*5ffd83dbSDimitry Andric G_WRITE_REGISTER,
1456*5ffd83dbSDimitry Andric 
1457*5ffd83dbSDimitry Andric G_SADDO, G_SSUBO,
1458*5ffd83dbSDimitry Andric 
1459*5ffd83dbSDimitry Andric // TODO: Implement
1460*5ffd83dbSDimitry Andric G_FMINIMUM, G_FMAXIMUM,
1461*5ffd83dbSDimitry Andric G_FSHL
1462*5ffd83dbSDimitry Andric }).lower();
1463*5ffd83dbSDimitry Andric 
1464480093f4SDimitry Andric getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
1465*5ffd83dbSDimitry Andric G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
1466480093f4SDimitry Andric G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
1467480093f4SDimitry Andric .unsupported();
1468480093f4SDimitry Andric 
14690b57cec5SDimitry Andric computeTables();
14700b57cec5SDimitry Andric verify(*ST.getInstrInfo());
14710b57cec5SDimitry Andric }
14720b57cec5SDimitry Andric 
1473*5ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
1474*5ffd83dbSDimitry Andric MachineInstr &MI) const {
1475*5ffd83dbSDimitry Andric MachineIRBuilder &B = Helper.MIRBuilder;
1476*5ffd83dbSDimitry Andric MachineRegisterInfo &MRI = *B.getMRI();
1477*5ffd83dbSDimitry Andric GISelChangeObserver &Observer = Helper.Observer;
1478*5ffd83dbSDimitry Andric 
14790b57cec5SDimitry Andric switch (MI.getOpcode()) {
14800b57cec5SDimitry Andric case TargetOpcode::G_ADDRSPACE_CAST:
14818bcb0991SDimitry Andric return legalizeAddrSpaceCast(MI, MRI, B);
14820b57cec5SDimitry Andric case TargetOpcode::G_FRINT:
14838bcb0991SDimitry Andric return legalizeFrint(MI, MRI, B);
14840b57cec5SDimitry Andric case TargetOpcode::G_FCEIL:
14858bcb0991SDimitry Andric return legalizeFceil(MI, MRI, B);
14860b57cec5SDimitry Andric case TargetOpcode::G_INTRINSIC_TRUNC:
14878bcb0991SDimitry Andric return legalizeIntrinsicTrunc(MI, MRI, B);
14880b57cec5SDimitry Andric case TargetOpcode::G_SITOFP:
14898bcb0991SDimitry Andric return legalizeITOFP(MI, MRI, B, true);
14900b57cec5SDimitry Andric case TargetOpcode::G_UITOFP:
14918bcb0991SDimitry Andric return legalizeITOFP(MI, MRI, B, false);
1492*5ffd83dbSDimitry Andric case TargetOpcode::G_FPTOSI: 1493*5ffd83dbSDimitry Andric return legalizeFPTOI(MI, MRI, B, true); 1494*5ffd83dbSDimitry Andric case TargetOpcode::G_FPTOUI: 1495*5ffd83dbSDimitry Andric return legalizeFPTOI(MI, MRI, B, false); 14960b57cec5SDimitry Andric case TargetOpcode::G_FMINNUM: 14970b57cec5SDimitry Andric case TargetOpcode::G_FMAXNUM: 14980b57cec5SDimitry Andric case TargetOpcode::G_FMINNUM_IEEE: 14990b57cec5SDimitry Andric case TargetOpcode::G_FMAXNUM_IEEE: 1500*5ffd83dbSDimitry Andric return legalizeMinNumMaxNum(Helper, MI); 15010b57cec5SDimitry Andric case TargetOpcode::G_EXTRACT_VECTOR_ELT: 15028bcb0991SDimitry Andric return legalizeExtractVectorElt(MI, MRI, B); 15030b57cec5SDimitry Andric case TargetOpcode::G_INSERT_VECTOR_ELT: 15048bcb0991SDimitry Andric return legalizeInsertVectorElt(MI, MRI, B); 1505*5ffd83dbSDimitry Andric case TargetOpcode::G_SHUFFLE_VECTOR: 1506*5ffd83dbSDimitry Andric return legalizeShuffleVector(MI, MRI, B); 15078bcb0991SDimitry Andric case TargetOpcode::G_FSIN: 15088bcb0991SDimitry Andric case TargetOpcode::G_FCOS: 15098bcb0991SDimitry Andric return legalizeSinCos(MI, MRI, B); 15108bcb0991SDimitry Andric case TargetOpcode::G_GLOBAL_VALUE: 15118bcb0991SDimitry Andric return legalizeGlobalValue(MI, MRI, B); 15128bcb0991SDimitry Andric case TargetOpcode::G_LOAD: 15138bcb0991SDimitry Andric return legalizeLoad(MI, MRI, B, Observer); 15148bcb0991SDimitry Andric case TargetOpcode::G_FMAD: 15158bcb0991SDimitry Andric return legalizeFMad(MI, MRI, B); 15168bcb0991SDimitry Andric case TargetOpcode::G_FDIV: 15178bcb0991SDimitry Andric return legalizeFDIV(MI, MRI, B); 1518*5ffd83dbSDimitry Andric case TargetOpcode::G_UDIV: 1519*5ffd83dbSDimitry Andric case TargetOpcode::G_UREM: 1520*5ffd83dbSDimitry Andric return legalizeUDIV_UREM(MI, MRI, B); 1521*5ffd83dbSDimitry Andric case TargetOpcode::G_SDIV: 1522*5ffd83dbSDimitry Andric case TargetOpcode::G_SREM: 1523*5ffd83dbSDimitry Andric return legalizeSDIV_SREM(MI, MRI, B); 1524480093f4SDimitry Andric case TargetOpcode::G_ATOMIC_CMPXCHG: 1525480093f4SDimitry Andric return legalizeAtomicCmpXChg(MI, MRI, B); 1526*5ffd83dbSDimitry Andric case TargetOpcode::G_FLOG: 1527*5ffd83dbSDimitry Andric return legalizeFlog(MI, B, numbers::ln2f); 1528*5ffd83dbSDimitry Andric case TargetOpcode::G_FLOG10: 1529*5ffd83dbSDimitry Andric return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); 1530*5ffd83dbSDimitry Andric case TargetOpcode::G_FEXP: 1531*5ffd83dbSDimitry Andric return legalizeFExp(MI, B); 1532*5ffd83dbSDimitry Andric case TargetOpcode::G_FPOW: 1533*5ffd83dbSDimitry Andric return legalizeFPow(MI, B); 1534*5ffd83dbSDimitry Andric case TargetOpcode::G_FFLOOR: 1535*5ffd83dbSDimitry Andric return legalizeFFloor(MI, MRI, B); 1536*5ffd83dbSDimitry Andric case TargetOpcode::G_BUILD_VECTOR: 1537*5ffd83dbSDimitry Andric return legalizeBuildVector(MI, MRI, B); 15380b57cec5SDimitry Andric default: 15390b57cec5SDimitry Andric return false; 15400b57cec5SDimitry Andric } 15410b57cec5SDimitry Andric 15420b57cec5SDimitry Andric llvm_unreachable("expected switch to return"); 15430b57cec5SDimitry Andric } 15440b57cec5SDimitry Andric 15450b57cec5SDimitry Andric Register AMDGPULegalizerInfo::getSegmentAperture( 15460b57cec5SDimitry Andric unsigned AS, 15470b57cec5SDimitry Andric MachineRegisterInfo &MRI, 15488bcb0991SDimitry Andric MachineIRBuilder &B) const { 15498bcb0991SDimitry Andric MachineFunction &MF = B.getMF(); 15500b57cec5SDimitry Andric const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 
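// The aperture is the high 32 bits of the 64-bit flat address that maps
// this segment; the low 32 bits come from the segment offset itself.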
15510b57cec5SDimitry Andric const LLT S32 = LLT::scalar(32); 15520b57cec5SDimitry Andric 15538bcb0991SDimitry Andric assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS); 15548bcb0991SDimitry Andric 15550b57cec5SDimitry Andric if (ST.hasApertureRegs()) { 15560b57cec5SDimitry Andric // FIXME: Use inline constants (src_{shared, private}_base) instead of 15570b57cec5SDimitry Andric // getreg. 15580b57cec5SDimitry Andric unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ? 15590b57cec5SDimitry Andric AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE : 15600b57cec5SDimitry Andric AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE; 15610b57cec5SDimitry Andric unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ? 15620b57cec5SDimitry Andric AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE : 15630b57cec5SDimitry Andric AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE; 15640b57cec5SDimitry Andric unsigned Encoding = 15650b57cec5SDimitry Andric AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ | 15660b57cec5SDimitry Andric Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | 15670b57cec5SDimitry Andric WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; 15680b57cec5SDimitry Andric 15690b57cec5SDimitry Andric Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); 15700b57cec5SDimitry Andric 15718bcb0991SDimitry Andric B.buildInstr(AMDGPU::S_GETREG_B32) 15720b57cec5SDimitry Andric .addDef(GetReg) 15730b57cec5SDimitry Andric .addImm(Encoding); 15740b57cec5SDimitry Andric MRI.setType(GetReg, S32); 15750b57cec5SDimitry Andric 15768bcb0991SDimitry Andric auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); 1577*5ffd83dbSDimitry Andric return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); 15780b57cec5SDimitry Andric } 15790b57cec5SDimitry Andric 15800b57cec5SDimitry Andric Register QueuePtr = MRI.createGenericVirtualRegister( 15810b57cec5SDimitry Andric LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); 15820b57cec5SDimitry Andric 15838bcb0991SDimitry Andric const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 15848bcb0991SDimitry Andric if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr)) 15858bcb0991SDimitry Andric return Register(); 15860b57cec5SDimitry Andric 15870b57cec5SDimitry Andric // Offset into amd_queue_t for group_segment_aperture_base_hi / 15880b57cec5SDimitry Andric // private_segment_aperture_base_hi. 15890b57cec5SDimitry Andric uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; 15900b57cec5SDimitry Andric 1591480093f4SDimitry Andric // TODO: can we be smarter about machine pointer info? 
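// The aperture values live in the queue descriptor and never change, so
// the load is marked invariant and dereferenceable to let it be hoisted
// and CSE'd.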
1592480093f4SDimitry Andric MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); 15930b57cec5SDimitry Andric MachineMemOperand *MMO = MF.getMachineMemOperand( 15940b57cec5SDimitry Andric PtrInfo, 1595*5ffd83dbSDimitry Andric MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 15960b57cec5SDimitry Andric MachineMemOperand::MOInvariant, 1597*5ffd83dbSDimitry Andric 4, commonAlignment(Align(64), StructOffset)); 15980b57cec5SDimitry Andric 15990b57cec5SDimitry Andric Register LoadAddr; 16000b57cec5SDimitry Andric 1601480093f4SDimitry Andric B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); 1602*5ffd83dbSDimitry Andric return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); 16030b57cec5SDimitry Andric } 16040b57cec5SDimitry Andric 16050b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( 16060b57cec5SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 16078bcb0991SDimitry Andric MachineIRBuilder &B) const { 16088bcb0991SDimitry Andric MachineFunction &MF = B.getMF(); 16090b57cec5SDimitry Andric 16108bcb0991SDimitry Andric const LLT S32 = LLT::scalar(32); 16110b57cec5SDimitry Andric Register Dst = MI.getOperand(0).getReg(); 16120b57cec5SDimitry Andric Register Src = MI.getOperand(1).getReg(); 16130b57cec5SDimitry Andric 16140b57cec5SDimitry Andric LLT DstTy = MRI.getType(Dst); 16150b57cec5SDimitry Andric LLT SrcTy = MRI.getType(Src); 16160b57cec5SDimitry Andric unsigned DestAS = DstTy.getAddressSpace(); 16170b57cec5SDimitry Andric unsigned SrcAS = SrcTy.getAddressSpace(); 16180b57cec5SDimitry Andric 16190b57cec5SDimitry Andric // TODO: Avoid reloading from the queue ptr for each cast, or at least each 16200b57cec5SDimitry Andric // vector element. 16210b57cec5SDimitry Andric assert(!DstTy.isVector()); 16220b57cec5SDimitry Andric 16230b57cec5SDimitry Andric const AMDGPUTargetMachine &TM 16240b57cec5SDimitry Andric = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); 16250b57cec5SDimitry Andric 16260b57cec5SDimitry Andric const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 16270b57cec5SDimitry Andric if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) { 16288bcb0991SDimitry Andric MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); 16298bcb0991SDimitry Andric return true; 16308bcb0991SDimitry Andric } 16318bcb0991SDimitry Andric 16328bcb0991SDimitry Andric if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 16338bcb0991SDimitry Andric // Truncate. 16348bcb0991SDimitry Andric B.buildExtract(Dst, Src, 0); 16358bcb0991SDimitry Andric MI.eraseFromParent(); 16368bcb0991SDimitry Andric return true; 16378bcb0991SDimitry Andric } 16388bcb0991SDimitry Andric 16398bcb0991SDimitry Andric if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { 16408bcb0991SDimitry Andric const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); 16418bcb0991SDimitry Andric uint32_t AddrHiVal = Info->get32BitAddressHighBits(); 16428bcb0991SDimitry Andric 16438bcb0991SDimitry Andric // FIXME: This is a bit ugly due to creating a merge of 2 pointers to 16448bcb0991SDimitry Andric // another. Merge operands are required to be the same type, but creating an 16458bcb0991SDimitry Andric // extra ptrtoint would be kind of pointless. 
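// Build the full 64-bit pointer by pairing the 32-bit source with the
// known constant high bits.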
16468bcb0991SDimitry Andric auto HighAddr = B.buildConstant( 16478bcb0991SDimitry Andric LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); 1648*5ffd83dbSDimitry Andric B.buildMerge(Dst, {Src, HighAddr}); 16498bcb0991SDimitry Andric MI.eraseFromParent(); 16500b57cec5SDimitry Andric return true; 16510b57cec5SDimitry Andric } 16520b57cec5SDimitry Andric 16530b57cec5SDimitry Andric if (SrcAS == AMDGPUAS::FLAT_ADDRESS) { 16540b57cec5SDimitry Andric assert(DestAS == AMDGPUAS::LOCAL_ADDRESS || 16550b57cec5SDimitry Andric DestAS == AMDGPUAS::PRIVATE_ADDRESS); 16560b57cec5SDimitry Andric unsigned NullVal = TM.getNullPointerValue(DestAS); 16570b57cec5SDimitry Andric 16588bcb0991SDimitry Andric auto SegmentNull = B.buildConstant(DstTy, NullVal); 16598bcb0991SDimitry Andric auto FlatNull = B.buildConstant(SrcTy, 0); 16600b57cec5SDimitry Andric 16610b57cec5SDimitry Andric // Extract low 32-bits of the pointer. 1662*5ffd83dbSDimitry Andric auto PtrLo32 = B.buildExtract(DstTy, Src, 0); 16630b57cec5SDimitry Andric 1664*5ffd83dbSDimitry Andric auto CmpRes = 1665*5ffd83dbSDimitry Andric B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); 16668bcb0991SDimitry Andric B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); 16670b57cec5SDimitry Andric 16680b57cec5SDimitry Andric MI.eraseFromParent(); 16690b57cec5SDimitry Andric return true; 16700b57cec5SDimitry Andric } 16710b57cec5SDimitry Andric 16728bcb0991SDimitry Andric if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS) 16738bcb0991SDimitry Andric return false; 16748bcb0991SDimitry Andric 16758bcb0991SDimitry Andric if (!ST.hasFlatAddressSpace()) 16768bcb0991SDimitry Andric return false; 16770b57cec5SDimitry Andric 16780b57cec5SDimitry Andric auto SegmentNull = 16798bcb0991SDimitry Andric B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS)); 16800b57cec5SDimitry Andric auto FlatNull = 16818bcb0991SDimitry Andric B.buildConstant(DstTy, TM.getNullPointerValue(DestAS)); 16820b57cec5SDimitry Andric 16838bcb0991SDimitry Andric Register ApertureReg = getSegmentAperture(SrcAS, MRI, B); 16848bcb0991SDimitry Andric if (!ApertureReg.isValid()) 16858bcb0991SDimitry Andric return false; 16860b57cec5SDimitry Andric 1687*5ffd83dbSDimitry Andric auto CmpRes = 1688*5ffd83dbSDimitry Andric B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); 16890b57cec5SDimitry Andric 16900b57cec5SDimitry Andric // Coerce the type of the low half of the result so we can use merge_values. 1691*5ffd83dbSDimitry Andric Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); 16920b57cec5SDimitry Andric 16930b57cec5SDimitry Andric // TODO: Should we allow mismatched types but matching sizes in merges to 16940b57cec5SDimitry Andric // avoid the ptrtoint? 
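// flat pointer = {low = segment offset, high = aperture base}; the select
// below maps the segment null value to the flat null value.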
1695*5ffd83dbSDimitry Andric auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); 1696*5ffd83dbSDimitry Andric B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); 16970b57cec5SDimitry Andric 16980b57cec5SDimitry Andric MI.eraseFromParent(); 16990b57cec5SDimitry Andric return true; 17000b57cec5SDimitry Andric } 17010b57cec5SDimitry Andric 17020b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeFrint( 17030b57cec5SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 17048bcb0991SDimitry Andric MachineIRBuilder &B) const { 17050b57cec5SDimitry Andric Register Src = MI.getOperand(1).getReg(); 17060b57cec5SDimitry Andric LLT Ty = MRI.getType(Src); 17070b57cec5SDimitry Andric assert(Ty.isScalar() && Ty.getSizeInBits() == 64); 17080b57cec5SDimitry Andric 17090b57cec5SDimitry Andric APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52"); 17100b57cec5SDimitry Andric APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51"); 17110b57cec5SDimitry Andric 17128bcb0991SDimitry Andric auto C1 = B.buildFConstant(Ty, C1Val); 17138bcb0991SDimitry Andric auto CopySign = B.buildFCopysign(Ty, C1, Src); 17140b57cec5SDimitry Andric 17150b57cec5SDimitry Andric // TODO: Should this propagate fast-math-flags? 17168bcb0991SDimitry Andric auto Tmp1 = B.buildFAdd(Ty, Src, CopySign); 17178bcb0991SDimitry Andric auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign); 17180b57cec5SDimitry Andric 17198bcb0991SDimitry Andric auto C2 = B.buildFConstant(Ty, C2Val); 17208bcb0991SDimitry Andric auto Fabs = B.buildFAbs(Ty, Src); 17210b57cec5SDimitry Andric 17228bcb0991SDimitry Andric auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); 17238bcb0991SDimitry Andric B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); 17240b57cec5SDimitry Andric return true; 17250b57cec5SDimitry Andric } 17260b57cec5SDimitry Andric 17270b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeFceil( 17280b57cec5SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 17290b57cec5SDimitry Andric MachineIRBuilder &B) const { 17300b57cec5SDimitry Andric 17310b57cec5SDimitry Andric const LLT S1 = LLT::scalar(1); 17320b57cec5SDimitry Andric const LLT S64 = LLT::scalar(64); 17330b57cec5SDimitry Andric 17340b57cec5SDimitry Andric Register Src = MI.getOperand(1).getReg(); 17350b57cec5SDimitry Andric assert(MRI.getType(Src) == S64); 17360b57cec5SDimitry Andric 17370b57cec5SDimitry Andric // result = trunc(src) 17380b57cec5SDimitry Andric // if (src > 0.0 && src != result) 17390b57cec5SDimitry Andric // result += 1.0 17400b57cec5SDimitry Andric 1741*5ffd83dbSDimitry Andric auto Trunc = B.buildIntrinsicTrunc(S64, Src); 17420b57cec5SDimitry Andric 17430b57cec5SDimitry Andric const auto Zero = B.buildFConstant(S64, 0.0); 17440b57cec5SDimitry Andric const auto One = B.buildFConstant(S64, 1.0); 17450b57cec5SDimitry Andric auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero); 17460b57cec5SDimitry Andric auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc); 17470b57cec5SDimitry Andric auto And = B.buildAnd(S1, Lt0, NeTrunc); 17480b57cec5SDimitry Andric auto Add = B.buildSelect(S64, And, One, Zero); 17490b57cec5SDimitry Andric 17500b57cec5SDimitry Andric // TODO: Should this propagate fast-math-flags? 
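// e.g. for src = 2.5: Trunc = 2.0, src > 0 and src != Trunc, so Add = 1.0
// and the result is 3.0. For src = -2.5, Add = 0.0 and the result is -2.0.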
17510b57cec5SDimitry Andric B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
17520b57cec5SDimitry Andric return true;
17530b57cec5SDimitry Andric }
17540b57cec5SDimitry Andric 
17550b57cec5SDimitry Andric static MachineInstrBuilder extractF64Exponent(unsigned Hi,
17560b57cec5SDimitry Andric MachineIRBuilder &B) {
17570b57cec5SDimitry Andric const unsigned FractBits = 52;
17580b57cec5SDimitry Andric const unsigned ExpBits = 11;
17590b57cec5SDimitry Andric LLT S32 = LLT::scalar(32);
17600b57cec5SDimitry Andric 
17610b57cec5SDimitry Andric auto Const0 = B.buildConstant(S32, FractBits - 32);
17620b57cec5SDimitry Andric auto Const1 = B.buildConstant(S32, ExpBits);
17630b57cec5SDimitry Andric 
17640b57cec5SDimitry Andric auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
.addUse(Register(Hi))
17650b57cec5SDimitry Andric .addUse(Const0.getReg(0))
17660b57cec5SDimitry Andric .addUse(Const1.getReg(0));
17670b57cec5SDimitry Andric 
17680b57cec5SDimitry Andric return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
17690b57cec5SDimitry Andric }
17700b57cec5SDimitry Andric 
17710b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
17720b57cec5SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI,
17730b57cec5SDimitry Andric MachineIRBuilder &B) const {
17740b57cec5SDimitry Andric const LLT S1 = LLT::scalar(1);
17750b57cec5SDimitry Andric const LLT S32 = LLT::scalar(32);
17760b57cec5SDimitry Andric const LLT S64 = LLT::scalar(64);
17770b57cec5SDimitry Andric 
17780b57cec5SDimitry Andric Register Src = MI.getOperand(1).getReg();
17790b57cec5SDimitry Andric assert(MRI.getType(Src) == S64);
17800b57cec5SDimitry Andric 
17810b57cec5SDimitry Andric // TODO: Should this use extract since the low half is unused?
17820b57cec5SDimitry Andric auto Unmerge = B.buildUnmerge({S32, S32}, Src);
17830b57cec5SDimitry Andric Register Hi = Unmerge.getReg(1);
17840b57cec5SDimitry Andric 
17850b57cec5SDimitry Andric // Extract the upper half, since this is where we will find the sign and
17860b57cec5SDimitry Andric // exponent.
17870b57cec5SDimitry Andric auto Exp = extractF64Exponent(Hi, B);
17880b57cec5SDimitry Andric 
17890b57cec5SDimitry Andric const unsigned FractBits = 52;
17900b57cec5SDimitry Andric 
17910b57cec5SDimitry Andric // Extract the sign bit.
17920b57cec5SDimitry Andric const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
17930b57cec5SDimitry Andric auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
17940b57cec5SDimitry Andric 
17950b57cec5SDimitry Andric const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
17960b57cec5SDimitry Andric 
17970b57cec5SDimitry Andric const auto Zero32 = B.buildConstant(S32, 0);
17980b57cec5SDimitry Andric 
17990b57cec5SDimitry Andric // Extend back to 64-bits.
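// When the exponent is negative (|x| < 1) the result is a signed zero, so
// only the sign bit survives.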
1800*5ffd83dbSDimitry Andric auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); 18010b57cec5SDimitry Andric 18020b57cec5SDimitry Andric auto Shr = B.buildAShr(S64, FractMask, Exp); 18030b57cec5SDimitry Andric auto Not = B.buildNot(S64, Shr); 18040b57cec5SDimitry Andric auto Tmp0 = B.buildAnd(S64, Src, Not); 18050b57cec5SDimitry Andric auto FiftyOne = B.buildConstant(S32, FractBits - 1); 18060b57cec5SDimitry Andric 18070b57cec5SDimitry Andric auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32); 18080b57cec5SDimitry Andric auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne); 18090b57cec5SDimitry Andric 18100b57cec5SDimitry Andric auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); 18110b57cec5SDimitry Andric B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); 18120b57cec5SDimitry Andric return true; 18130b57cec5SDimitry Andric } 18140b57cec5SDimitry Andric 18150b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeITOFP( 18160b57cec5SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 18170b57cec5SDimitry Andric MachineIRBuilder &B, bool Signed) const { 18180b57cec5SDimitry Andric 18190b57cec5SDimitry Andric Register Dst = MI.getOperand(0).getReg(); 18200b57cec5SDimitry Andric Register Src = MI.getOperand(1).getReg(); 18210b57cec5SDimitry Andric 18220b57cec5SDimitry Andric const LLT S64 = LLT::scalar(64); 18230b57cec5SDimitry Andric const LLT S32 = LLT::scalar(32); 18240b57cec5SDimitry Andric 18250b57cec5SDimitry Andric assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 18260b57cec5SDimitry Andric 18270b57cec5SDimitry Andric auto Unmerge = B.buildUnmerge({S32, S32}, Src); 18280b57cec5SDimitry Andric 18290b57cec5SDimitry Andric auto CvtHi = Signed ? 18300b57cec5SDimitry Andric B.buildSITOFP(S64, Unmerge.getReg(1)) : 18310b57cec5SDimitry Andric B.buildUITOFP(S64, Unmerge.getReg(1)); 18320b57cec5SDimitry Andric 18330b57cec5SDimitry Andric auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0)); 18340b57cec5SDimitry Andric 18350b57cec5SDimitry Andric auto ThirtyTwo = B.buildConstant(S32, 32); 18360b57cec5SDimitry Andric auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false) 18370b57cec5SDimitry Andric .addUse(CvtHi.getReg(0)) 18380b57cec5SDimitry Andric .addUse(ThirtyTwo.getReg(0)); 18390b57cec5SDimitry Andric 18400b57cec5SDimitry Andric // TODO: Should this propagate fast-math-flags? 18410b57cec5SDimitry Andric B.buildFAdd(Dst, LdExp, CvtLo); 18420b57cec5SDimitry Andric MI.eraseFromParent(); 18430b57cec5SDimitry Andric return true; 18440b57cec5SDimitry Andric } 18450b57cec5SDimitry Andric 1846*5ffd83dbSDimitry Andric // TODO: Copied from DAG implementation. Verify logic and document how this 1847*5ffd83dbSDimitry Andric // actually works. 
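// In outline: K0 = 2^-32 and K1 = -2^32, so FloorMul = floor(trunc(x) * 2^-32)
// holds the high 32 bits of the result, and fma(FloorMul, K1, trunc(x))
// leaves the low 32 bits, which are converted unsigned and merged.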
1848*5ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeFPTOI( 18490b57cec5SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 1850*5ffd83dbSDimitry Andric MachineIRBuilder &B, bool Signed) const { 1851*5ffd83dbSDimitry Andric 1852*5ffd83dbSDimitry Andric Register Dst = MI.getOperand(0).getReg(); 1853*5ffd83dbSDimitry Andric Register Src = MI.getOperand(1).getReg(); 1854*5ffd83dbSDimitry Andric 1855*5ffd83dbSDimitry Andric const LLT S64 = LLT::scalar(64); 1856*5ffd83dbSDimitry Andric const LLT S32 = LLT::scalar(32); 1857*5ffd83dbSDimitry Andric 1858*5ffd83dbSDimitry Andric assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); 1859*5ffd83dbSDimitry Andric 1860*5ffd83dbSDimitry Andric unsigned Flags = MI.getFlags(); 1861*5ffd83dbSDimitry Andric 1862*5ffd83dbSDimitry Andric auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); 1863*5ffd83dbSDimitry Andric auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); 1864*5ffd83dbSDimitry Andric auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); 1865*5ffd83dbSDimitry Andric 1866*5ffd83dbSDimitry Andric auto Mul = B.buildFMul(S64, Trunc, K0, Flags); 1867*5ffd83dbSDimitry Andric auto FloorMul = B.buildFFloor(S64, Mul, Flags); 1868*5ffd83dbSDimitry Andric auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); 1869*5ffd83dbSDimitry Andric 1870*5ffd83dbSDimitry Andric auto Hi = Signed ? 1871*5ffd83dbSDimitry Andric B.buildFPTOSI(S32, FloorMul) : 1872*5ffd83dbSDimitry Andric B.buildFPTOUI(S32, FloorMul); 1873*5ffd83dbSDimitry Andric auto Lo = B.buildFPTOUI(S32, Fma); 1874*5ffd83dbSDimitry Andric 1875*5ffd83dbSDimitry Andric B.buildMerge(Dst, { Lo, Hi }); 1876*5ffd83dbSDimitry Andric MI.eraseFromParent(); 1877*5ffd83dbSDimitry Andric 1878*5ffd83dbSDimitry Andric return true; 1879*5ffd83dbSDimitry Andric } 1880*5ffd83dbSDimitry Andric 1881*5ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, 1882*5ffd83dbSDimitry Andric MachineInstr &MI) const { 1883*5ffd83dbSDimitry Andric MachineFunction &MF = Helper.MIRBuilder.getMF(); 18840b57cec5SDimitry Andric const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 18850b57cec5SDimitry Andric 18860b57cec5SDimitry Andric const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || 18870b57cec5SDimitry Andric MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; 18880b57cec5SDimitry Andric 18890b57cec5SDimitry Andric // With ieee_mode disabled, the instructions have the correct behavior 18900b57cec5SDimitry Andric // already for G_FMINNUM/G_FMAXNUM 18910b57cec5SDimitry Andric if (!MFI->getMode().IEEE) 18920b57cec5SDimitry Andric return !IsIEEEOp; 18930b57cec5SDimitry Andric 18940b57cec5SDimitry Andric if (IsIEEEOp) 18950b57cec5SDimitry Andric return true; 18960b57cec5SDimitry Andric 18970b57cec5SDimitry Andric return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; 18980b57cec5SDimitry Andric } 18990b57cec5SDimitry Andric 19000b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeExtractVectorElt( 19010b57cec5SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 19020b57cec5SDimitry Andric MachineIRBuilder &B) const { 19030b57cec5SDimitry Andric // TODO: Should move some of this into LegalizerHelper. 
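// A constant index is folded to a static G_EXTRACT at the element's bit
// offset; constant out-of-bounds indices fold to undef.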
19040b57cec5SDimitry Andric 19050b57cec5SDimitry Andric // TODO: Promote dynamic indexing of s16 to s32 1906*5ffd83dbSDimitry Andric 1907*5ffd83dbSDimitry Andric // FIXME: Artifact combiner probably should have replaced the truncated 1908*5ffd83dbSDimitry Andric // constant before this, so we shouldn't need 1909*5ffd83dbSDimitry Andric // getConstantVRegValWithLookThrough. 1910*5ffd83dbSDimitry Andric Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1911*5ffd83dbSDimitry Andric MI.getOperand(2).getReg(), MRI); 19120b57cec5SDimitry Andric if (!IdxVal) // Dynamic case will be selected to register indexing. 19130b57cec5SDimitry Andric return true; 19140b57cec5SDimitry Andric 19150b57cec5SDimitry Andric Register Dst = MI.getOperand(0).getReg(); 19160b57cec5SDimitry Andric Register Vec = MI.getOperand(1).getReg(); 19170b57cec5SDimitry Andric 19180b57cec5SDimitry Andric LLT VecTy = MRI.getType(Vec); 19190b57cec5SDimitry Andric LLT EltTy = VecTy.getElementType(); 19200b57cec5SDimitry Andric assert(EltTy == MRI.getType(Dst)); 19210b57cec5SDimitry Andric 1922*5ffd83dbSDimitry Andric if (IdxVal->Value < VecTy.getNumElements()) 1923*5ffd83dbSDimitry Andric B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); 19240b57cec5SDimitry Andric else 19250b57cec5SDimitry Andric B.buildUndef(Dst); 19260b57cec5SDimitry Andric 19270b57cec5SDimitry Andric MI.eraseFromParent(); 19280b57cec5SDimitry Andric return true; 19290b57cec5SDimitry Andric } 19300b57cec5SDimitry Andric 19310b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeInsertVectorElt( 19320b57cec5SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 19330b57cec5SDimitry Andric MachineIRBuilder &B) const { 19340b57cec5SDimitry Andric // TODO: Should move some of this into LegalizerHelper. 19350b57cec5SDimitry Andric 19360b57cec5SDimitry Andric // TODO: Promote dynamic indexing of s16 to s32 1937*5ffd83dbSDimitry Andric 1938*5ffd83dbSDimitry Andric // FIXME: Artifact combiner probably should have replaced the truncated 1939*5ffd83dbSDimitry Andric // constant before this, so we shouldn't need 1940*5ffd83dbSDimitry Andric // getConstantVRegValWithLookThrough. 1941*5ffd83dbSDimitry Andric Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( 1942*5ffd83dbSDimitry Andric MI.getOperand(3).getReg(), MRI); 19430b57cec5SDimitry Andric if (!IdxVal) // Dynamic case will be selected to register indexing. 
19440b57cec5SDimitry Andric return true; 19450b57cec5SDimitry Andric 19460b57cec5SDimitry Andric Register Dst = MI.getOperand(0).getReg(); 19470b57cec5SDimitry Andric Register Vec = MI.getOperand(1).getReg(); 19480b57cec5SDimitry Andric Register Ins = MI.getOperand(2).getReg(); 19490b57cec5SDimitry Andric 19500b57cec5SDimitry Andric LLT VecTy = MRI.getType(Vec); 19510b57cec5SDimitry Andric LLT EltTy = VecTy.getElementType(); 19520b57cec5SDimitry Andric assert(EltTy == MRI.getType(Ins)); 19530b57cec5SDimitry Andric 1954*5ffd83dbSDimitry Andric if (IdxVal->Value < VecTy.getNumElements()) 1955*5ffd83dbSDimitry Andric B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); 19560b57cec5SDimitry Andric else 19570b57cec5SDimitry Andric B.buildUndef(Dst); 19580b57cec5SDimitry Andric 19590b57cec5SDimitry Andric MI.eraseFromParent(); 19600b57cec5SDimitry Andric return true; 19610b57cec5SDimitry Andric } 19620b57cec5SDimitry Andric 1963*5ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeShuffleVector( 1964*5ffd83dbSDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 1965*5ffd83dbSDimitry Andric MachineIRBuilder &B) const { 1966*5ffd83dbSDimitry Andric const LLT V2S16 = LLT::vector(2, 16); 1967*5ffd83dbSDimitry Andric 1968*5ffd83dbSDimitry Andric Register Dst = MI.getOperand(0).getReg(); 1969*5ffd83dbSDimitry Andric Register Src0 = MI.getOperand(1).getReg(); 1970*5ffd83dbSDimitry Andric LLT DstTy = MRI.getType(Dst); 1971*5ffd83dbSDimitry Andric LLT SrcTy = MRI.getType(Src0); 1972*5ffd83dbSDimitry Andric 1973*5ffd83dbSDimitry Andric if (SrcTy == V2S16 && DstTy == V2S16 && 1974*5ffd83dbSDimitry Andric AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) 1975*5ffd83dbSDimitry Andric return true; 1976*5ffd83dbSDimitry Andric 1977*5ffd83dbSDimitry Andric MachineIRBuilder HelperBuilder(MI); 1978*5ffd83dbSDimitry Andric GISelObserverWrapper DummyObserver; 1979*5ffd83dbSDimitry Andric LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); 1980*5ffd83dbSDimitry Andric return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; 1981*5ffd83dbSDimitry Andric } 1982*5ffd83dbSDimitry Andric 19838bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeSinCos( 19848bcb0991SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 19858bcb0991SDimitry Andric MachineIRBuilder &B) const { 19868bcb0991SDimitry Andric 19878bcb0991SDimitry Andric Register DstReg = MI.getOperand(0).getReg(); 19888bcb0991SDimitry Andric Register SrcReg = MI.getOperand(1).getReg(); 19898bcb0991SDimitry Andric LLT Ty = MRI.getType(DstReg); 19908bcb0991SDimitry Andric unsigned Flags = MI.getFlags(); 19918bcb0991SDimitry Andric 19928bcb0991SDimitry Andric Register TrigVal; 1993*5ffd83dbSDimitry Andric auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); 19948bcb0991SDimitry Andric if (ST.hasTrigReducedRange()) { 19958bcb0991SDimitry Andric auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); 19968bcb0991SDimitry Andric TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) 19978bcb0991SDimitry Andric .addUse(MulVal.getReg(0)) 19988bcb0991SDimitry Andric .setMIFlags(Flags).getReg(0); 19998bcb0991SDimitry Andric } else 20008bcb0991SDimitry Andric TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0); 20018bcb0991SDimitry Andric 20028bcb0991SDimitry Andric Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ? 
20038bcb0991SDimitry Andric Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos; 20048bcb0991SDimitry Andric B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false) 20058bcb0991SDimitry Andric .addUse(TrigVal) 20068bcb0991SDimitry Andric .setMIFlags(Flags); 20078bcb0991SDimitry Andric MI.eraseFromParent(); 20088bcb0991SDimitry Andric return true; 20098bcb0991SDimitry Andric } 20108bcb0991SDimitry Andric 2011*5ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, 2012*5ffd83dbSDimitry Andric MachineIRBuilder &B, 2013*5ffd83dbSDimitry Andric const GlobalValue *GV, 2014*5ffd83dbSDimitry Andric int64_t Offset, 2015*5ffd83dbSDimitry Andric unsigned GAFlags) const { 2016*5ffd83dbSDimitry Andric assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); 20178bcb0991SDimitry Andric // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered 20188bcb0991SDimitry Andric // to the following code sequence: 20198bcb0991SDimitry Andric // 20208bcb0991SDimitry Andric // For constant address space: 20218bcb0991SDimitry Andric // s_getpc_b64 s[0:1] 20228bcb0991SDimitry Andric // s_add_u32 s0, s0, $symbol 20238bcb0991SDimitry Andric // s_addc_u32 s1, s1, 0 20248bcb0991SDimitry Andric // 20258bcb0991SDimitry Andric // s_getpc_b64 returns the address of the s_add_u32 instruction and then 20268bcb0991SDimitry Andric // a fixup or relocation is emitted to replace $symbol with a literal 20278bcb0991SDimitry Andric // constant, which is a pc-relative offset from the encoding of the $symbol 20288bcb0991SDimitry Andric // operand to the global variable. 20298bcb0991SDimitry Andric // 20308bcb0991SDimitry Andric // For global address space: 20318bcb0991SDimitry Andric // s_getpc_b64 s[0:1] 20328bcb0991SDimitry Andric // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo 20338bcb0991SDimitry Andric // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi 20348bcb0991SDimitry Andric // 20358bcb0991SDimitry Andric // s_getpc_b64 returns the address of the s_add_u32 instruction and then 20368bcb0991SDimitry Andric // fixups or relocations are emitted to replace $symbol@*@lo and 20378bcb0991SDimitry Andric // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, 20388bcb0991SDimitry Andric // which is a 64-bit pc-relative offset from the encoding of the $symbol 20398bcb0991SDimitry Andric // operand to the global variable. 20408bcb0991SDimitry Andric // 20418bcb0991SDimitry Andric // What we want here is an offset from the value returned by s_getpc 20428bcb0991SDimitry Andric // (which is the address of the s_add_u32 instruction) to the global 20438bcb0991SDimitry Andric // variable, but since the encoding of $symbol starts 4 bytes after the start 20448bcb0991SDimitry Andric // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too 20458bcb0991SDimitry Andric // small. This requires us to add 4 to the global variable offset in order to 20468bcb0991SDimitry Andric // compute the correct address. 20478bcb0991SDimitry Andric 20488bcb0991SDimitry Andric LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 20498bcb0991SDimitry Andric 20508bcb0991SDimitry Andric Register PCReg = PtrTy.getSizeInBits() != 32 ? 
DstReg : 20518bcb0991SDimitry Andric B.getMRI()->createGenericVirtualRegister(ConstPtrTy); 20528bcb0991SDimitry Andric 20538bcb0991SDimitry Andric MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET) 20548bcb0991SDimitry Andric .addDef(PCReg); 20558bcb0991SDimitry Andric 20568bcb0991SDimitry Andric MIB.addGlobalAddress(GV, Offset + 4, GAFlags); 20578bcb0991SDimitry Andric if (GAFlags == SIInstrInfo::MO_NONE) 20588bcb0991SDimitry Andric MIB.addImm(0); 20598bcb0991SDimitry Andric else 20608bcb0991SDimitry Andric MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); 20618bcb0991SDimitry Andric 20628bcb0991SDimitry Andric B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); 20638bcb0991SDimitry Andric 20648bcb0991SDimitry Andric if (PtrTy.getSizeInBits() == 32) 20658bcb0991SDimitry Andric B.buildExtract(DstReg, PCReg, 0); 20668bcb0991SDimitry Andric return true; 20678bcb0991SDimitry Andric } 20688bcb0991SDimitry Andric 20698bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeGlobalValue( 20708bcb0991SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 20718bcb0991SDimitry Andric MachineIRBuilder &B) const { 20728bcb0991SDimitry Andric Register DstReg = MI.getOperand(0).getReg(); 20738bcb0991SDimitry Andric LLT Ty = MRI.getType(DstReg); 20748bcb0991SDimitry Andric unsigned AS = Ty.getAddressSpace(); 20758bcb0991SDimitry Andric 20768bcb0991SDimitry Andric const GlobalValue *GV = MI.getOperand(1).getGlobal(); 20778bcb0991SDimitry Andric MachineFunction &MF = B.getMF(); 20788bcb0991SDimitry Andric SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 20798bcb0991SDimitry Andric 20808bcb0991SDimitry Andric if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { 20818bcb0991SDimitry Andric if (!MFI->isEntryFunction()) { 20828bcb0991SDimitry Andric const Function &Fn = MF.getFunction(); 20838bcb0991SDimitry Andric DiagnosticInfoUnsupported BadLDSDecl( 2084*5ffd83dbSDimitry Andric Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), 2085*5ffd83dbSDimitry Andric DS_Warning); 20868bcb0991SDimitry Andric Fn.getContext().diagnose(BadLDSDecl); 2087*5ffd83dbSDimitry Andric 2088*5ffd83dbSDimitry Andric // We currently don't have a way to correctly allocate LDS objects that 2089*5ffd83dbSDimitry Andric // aren't directly associated with a kernel. We do force inlining of 2090*5ffd83dbSDimitry Andric // functions that use local objects. However, if these dead functions are 2091*5ffd83dbSDimitry Andric // not eliminated, we don't want a compile time error. Just emit a warning 2092*5ffd83dbSDimitry Andric // and a trap, since there should be no callable path here. 2093*5ffd83dbSDimitry Andric B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); 2094*5ffd83dbSDimitry Andric B.buildUndef(DstReg); 2095*5ffd83dbSDimitry Andric MI.eraseFromParent(); 2096*5ffd83dbSDimitry Andric return true; 20978bcb0991SDimitry Andric } 20988bcb0991SDimitry Andric 20998bcb0991SDimitry Andric // TODO: We could emit code to handle the initialization somewhere. 
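// Explanatory sketch of the path below (the offset value is hypothetical):
// an LDS global without a defined initializer is lowered in one of two ways.
// Either the address operand is tagged MO_ABS32_LO and left in place, so
// selection emits the symbol's absolute 32-bit LDS offset via a relocation,
// or the statically assigned offset returned by allocateLDSGlobal is folded
// into a plain constant, roughly:
//   %dst:_(p3) = G_CONSTANT i32 16   ; @lds.buf placed at byte offset 16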
21008bcb0991SDimitry Andric if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { 2101*5ffd83dbSDimitry Andric const SITargetLowering *TLI = ST.getTargetLowering(); 2102*5ffd83dbSDimitry Andric if (!TLI->shouldUseLDSConstAddress(GV)) { 2103*5ffd83dbSDimitry Andric MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); 2104*5ffd83dbSDimitry Andric return true; // Leave in place. 2105*5ffd83dbSDimitry Andric } 2106*5ffd83dbSDimitry Andric 2107*5ffd83dbSDimitry Andric B.buildConstant( 2108*5ffd83dbSDimitry Andric DstReg, 2109*5ffd83dbSDimitry Andric MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV))); 21108bcb0991SDimitry Andric MI.eraseFromParent(); 21118bcb0991SDimitry Andric return true; 21128bcb0991SDimitry Andric } 21138bcb0991SDimitry Andric 21148bcb0991SDimitry Andric const Function &Fn = MF.getFunction(); 21158bcb0991SDimitry Andric DiagnosticInfoUnsupported BadInit( 21168bcb0991SDimitry Andric Fn, "unsupported initializer for address space", MI.getDebugLoc()); 21178bcb0991SDimitry Andric Fn.getContext().diagnose(BadInit); 21188bcb0991SDimitry Andric return true; 21198bcb0991SDimitry Andric } 21208bcb0991SDimitry Andric 21218bcb0991SDimitry Andric const SITargetLowering *TLI = ST.getTargetLowering(); 21228bcb0991SDimitry Andric 21238bcb0991SDimitry Andric if (TLI->shouldEmitFixup(GV)) { 21248bcb0991SDimitry Andric buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0); 21258bcb0991SDimitry Andric MI.eraseFromParent(); 21268bcb0991SDimitry Andric return true; 21278bcb0991SDimitry Andric } 21288bcb0991SDimitry Andric 21298bcb0991SDimitry Andric if (TLI->shouldEmitPCReloc(GV)) { 21308bcb0991SDimitry Andric buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32); 21318bcb0991SDimitry Andric MI.eraseFromParent(); 21328bcb0991SDimitry Andric return true; 21338bcb0991SDimitry Andric } 21348bcb0991SDimitry Andric 21358bcb0991SDimitry Andric LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 21368bcb0991SDimitry Andric Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); 21378bcb0991SDimitry Andric 21388bcb0991SDimitry Andric MachineMemOperand *GOTMMO = MF.getMachineMemOperand( 21398bcb0991SDimitry Andric MachinePointerInfo::getGOT(MF), 21408bcb0991SDimitry Andric MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | 21418bcb0991SDimitry Andric MachineMemOperand::MOInvariant, 2142*5ffd83dbSDimitry Andric 8 /*Size*/, Align(8)); 21438bcb0991SDimitry Andric 21448bcb0991SDimitry Andric buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); 21458bcb0991SDimitry Andric 21468bcb0991SDimitry Andric if (Ty.getSizeInBits() == 32) { 21478bcb0991SDimitry Andric // Truncate if this is a 32-bit constant address.
21488bcb0991SDimitry Andric auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO); 21498bcb0991SDimitry Andric B.buildExtract(DstReg, Load, 0); 21508bcb0991SDimitry Andric } else 21518bcb0991SDimitry Andric B.buildLoad(DstReg, GOTAddr, *GOTMMO); 21528bcb0991SDimitry Andric 21538bcb0991SDimitry Andric MI.eraseFromParent(); 21548bcb0991SDimitry Andric return true; 21558bcb0991SDimitry Andric } 21568bcb0991SDimitry Andric 21578bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeLoad( 21588bcb0991SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 21598bcb0991SDimitry Andric MachineIRBuilder &B, GISelChangeObserver &Observer) const { 21608bcb0991SDimitry Andric LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); 21618bcb0991SDimitry Andric auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg()); 21628bcb0991SDimitry Andric Observer.changingInstr(MI); 21638bcb0991SDimitry Andric MI.getOperand(1).setReg(Cast.getReg(0)); 21648bcb0991SDimitry Andric Observer.changedInstr(MI); 21658bcb0991SDimitry Andric return true; 21668bcb0991SDimitry Andric } 21678bcb0991SDimitry Andric 21688bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFMad( 21698bcb0991SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, 21708bcb0991SDimitry Andric MachineIRBuilder &B) const { 21718bcb0991SDimitry Andric LLT Ty = MRI.getType(MI.getOperand(0).getReg()); 21728bcb0991SDimitry Andric assert(Ty.isScalar()); 21738bcb0991SDimitry Andric 2174480093f4SDimitry Andric MachineFunction &MF = B.getMF(); 2175480093f4SDimitry Andric const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); 2176480093f4SDimitry Andric 21778bcb0991SDimitry Andric // TODO: Always legal with future ftz flag. 2178*5ffd83dbSDimitry Andric // FIXME: Do we need just output? 
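// Note on the denormal checks below (a sketch of the reasoning, not
// normative): the MAC/MAD instructions G_FMAD selects to flush denormal
// results, so keeping G_FMAD legal is only sound when the mode already
// flushes denormals at that width. Otherwise lowerFMad() decomposes it into
// the equivalent, denormal-preserving pair:
//   %t = G_FMUL %a, %b
//   %d = G_FADD %t, %c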
2179*5ffd83dbSDimitry Andric if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) 21808bcb0991SDimitry Andric return true; 2181*5ffd83dbSDimitry Andric if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) 21828bcb0991SDimitry Andric return true; 21838bcb0991SDimitry Andric 21848bcb0991SDimitry Andric MachineIRBuilder HelperBuilder(MI); 21858bcb0991SDimitry Andric GISelObserverWrapper DummyObserver; 21868bcb0991SDimitry Andric LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); 21878bcb0991SDimitry Andric return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; 21888bcb0991SDimitry Andric } 21898bcb0991SDimitry Andric 2190480093f4SDimitry Andric bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( 2191480093f4SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2192480093f4SDimitry Andric Register DstReg = MI.getOperand(0).getReg(); 2193480093f4SDimitry Andric Register PtrReg = MI.getOperand(1).getReg(); 2194480093f4SDimitry Andric Register CmpVal = MI.getOperand(2).getReg(); 2195480093f4SDimitry Andric Register NewVal = MI.getOperand(3).getReg(); 2196480093f4SDimitry Andric 2197480093f4SDimitry Andric assert(SITargetLowering::isFlatGlobalAddrSpace( 2198480093f4SDimitry Andric MRI.getType(PtrReg).getAddressSpace()) && 2199480093f4SDimitry Andric "this should not have been custom lowered"); 2200480093f4SDimitry Andric 2201480093f4SDimitry Andric LLT ValTy = MRI.getType(CmpVal); 2202480093f4SDimitry Andric LLT VecTy = LLT::vector(2, ValTy); 2203480093f4SDimitry Andric 2204480093f4SDimitry Andric Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); 2205480093f4SDimitry Andric 2206480093f4SDimitry Andric B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) 2207480093f4SDimitry Andric .addDef(DstReg) 2208480093f4SDimitry Andric .addUse(PtrReg) 2209480093f4SDimitry Andric .addUse(PackedVal) 2210480093f4SDimitry Andric .setMemRefs(MI.memoperands()); 2211480093f4SDimitry Andric 2212480093f4SDimitry Andric MI.eraseFromParent(); 2213480093f4SDimitry Andric return true; 2214480093f4SDimitry Andric } 2215480093f4SDimitry Andric 2216*5ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeFlog( 2217*5ffd83dbSDimitry Andric MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { 2218*5ffd83dbSDimitry Andric Register Dst = MI.getOperand(0).getReg(); 2219*5ffd83dbSDimitry Andric Register Src = MI.getOperand(1).getReg(); 2220*5ffd83dbSDimitry Andric LLT Ty = B.getMRI()->getType(Dst); 2221*5ffd83dbSDimitry Andric unsigned Flags = MI.getFlags(); 2222*5ffd83dbSDimitry Andric 2223*5ffd83dbSDimitry Andric auto Log2Operand = B.buildFLog2(Ty, Src, Flags); 2224*5ffd83dbSDimitry Andric auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); 2225*5ffd83dbSDimitry Andric 2226*5ffd83dbSDimitry Andric B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); 2227*5ffd83dbSDimitry Andric MI.eraseFromParent(); 2228*5ffd83dbSDimitry Andric return true; 2229*5ffd83dbSDimitry Andric } 2230*5ffd83dbSDimitry Andric 2231*5ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, 2232*5ffd83dbSDimitry Andric MachineIRBuilder &B) const { 2233*5ffd83dbSDimitry Andric Register Dst = MI.getOperand(0).getReg(); 2234*5ffd83dbSDimitry Andric Register Src = MI.getOperand(1).getReg(); 2235*5ffd83dbSDimitry Andric unsigned Flags = MI.getFlags(); 2236*5ffd83dbSDimitry Andric LLT Ty = B.getMRI()->getType(Dst); 2237*5ffd83dbSDimitry Andric 2238*5ffd83dbSDimitry Andric auto K = B.buildFConstant(Ty, 
numbers::log2e); 2239*5ffd83dbSDimitry Andric auto Mul = B.buildFMul(Ty, Src, K, Flags); 2240*5ffd83dbSDimitry Andric B.buildFExp2(Dst, Mul, Flags); 2241*5ffd83dbSDimitry Andric MI.eraseFromParent(); 2242*5ffd83dbSDimitry Andric return true; 2243*5ffd83dbSDimitry Andric } 2244*5ffd83dbSDimitry Andric 2245*5ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, 2246*5ffd83dbSDimitry Andric MachineIRBuilder &B) const { 2247*5ffd83dbSDimitry Andric Register Dst = MI.getOperand(0).getReg(); 2248*5ffd83dbSDimitry Andric Register Src0 = MI.getOperand(1).getReg(); 2249*5ffd83dbSDimitry Andric Register Src1 = MI.getOperand(2).getReg(); 2250*5ffd83dbSDimitry Andric unsigned Flags = MI.getFlags(); 2251*5ffd83dbSDimitry Andric LLT Ty = B.getMRI()->getType(Dst); 2252*5ffd83dbSDimitry Andric const LLT S16 = LLT::scalar(16); 2253*5ffd83dbSDimitry Andric const LLT S32 = LLT::scalar(32); 2254*5ffd83dbSDimitry Andric 2255*5ffd83dbSDimitry Andric if (Ty == S32) { 2256*5ffd83dbSDimitry Andric auto Log = B.buildFLog2(S32, Src0, Flags); 2257*5ffd83dbSDimitry Andric auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2258*5ffd83dbSDimitry Andric .addUse(Log.getReg(0)) 2259*5ffd83dbSDimitry Andric .addUse(Src1) 2260*5ffd83dbSDimitry Andric .setMIFlags(Flags); 2261*5ffd83dbSDimitry Andric B.buildFExp2(Dst, Mul, Flags); 2262*5ffd83dbSDimitry Andric } else if (Ty == S16) { 2263*5ffd83dbSDimitry Andric // There's no f16 fmul_legacy, so we need to convert for it. 2264*5ffd83dbSDimitry Andric auto Log = B.buildFLog2(S16, Src0, Flags); 2265*5ffd83dbSDimitry Andric auto Ext0 = B.buildFPExt(S32, Log, Flags); 2266*5ffd83dbSDimitry Andric auto Ext1 = B.buildFPExt(S32, Src1, Flags); 2267*5ffd83dbSDimitry Andric auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) 2268*5ffd83dbSDimitry Andric .addUse(Ext0.getReg(0)) 2269*5ffd83dbSDimitry Andric .addUse(Ext1.getReg(0)) 2270*5ffd83dbSDimitry Andric .setMIFlags(Flags); 2271*5ffd83dbSDimitry Andric 2272*5ffd83dbSDimitry Andric B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); 2273*5ffd83dbSDimitry Andric } else 2274*5ffd83dbSDimitry Andric return false; 2275*5ffd83dbSDimitry Andric 2276*5ffd83dbSDimitry Andric MI.eraseFromParent(); 2277*5ffd83dbSDimitry Andric return true; 2278*5ffd83dbSDimitry Andric } 2279*5ffd83dbSDimitry Andric 2280*5ffd83dbSDimitry Andric // Find a source register, ignoring any possible source modifiers. 
2281*5ffd83dbSDimitry Andric static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { 2282*5ffd83dbSDimitry Andric Register ModSrc = OrigSrc; 2283*5ffd83dbSDimitry Andric if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { 2284*5ffd83dbSDimitry Andric ModSrc = SrcFNeg->getOperand(1).getReg(); 2285*5ffd83dbSDimitry Andric if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2286*5ffd83dbSDimitry Andric ModSrc = SrcFAbs->getOperand(1).getReg(); 2287*5ffd83dbSDimitry Andric } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) 2288*5ffd83dbSDimitry Andric ModSrc = SrcFAbs->getOperand(1).getReg(); 2289*5ffd83dbSDimitry Andric return ModSrc; 2290*5ffd83dbSDimitry Andric } 2291*5ffd83dbSDimitry Andric 2292*5ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, 2293*5ffd83dbSDimitry Andric MachineRegisterInfo &MRI, 2294*5ffd83dbSDimitry Andric MachineIRBuilder &B) const { 2295*5ffd83dbSDimitry Andric 2296*5ffd83dbSDimitry Andric const LLT S1 = LLT::scalar(1); 2297*5ffd83dbSDimitry Andric const LLT S64 = LLT::scalar(64); 2298*5ffd83dbSDimitry Andric Register Dst = MI.getOperand(0).getReg(); 2299*5ffd83dbSDimitry Andric Register OrigSrc = MI.getOperand(1).getReg(); 2300*5ffd83dbSDimitry Andric unsigned Flags = MI.getFlags(); 2301*5ffd83dbSDimitry Andric assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && 2302*5ffd83dbSDimitry Andric "this should not have been custom lowered"); 2303*5ffd83dbSDimitry Andric 2304*5ffd83dbSDimitry Andric // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) 2305*5ffd83dbSDimitry Andric // is used instead. However, SI doesn't have V_FLOOR_F64, so the most 2306*5ffd83dbSDimitry Andric // efficient way to implement it is using V_FRACT_F64. The workaround for the 2307*5ffd83dbSDimitry Andric // V_FRACT bug is: 2308*5ffd83dbSDimitry Andric // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2309*5ffd83dbSDimitry Andric // 2310*5ffd83dbSDimitry Andric // Convert floor(x) to (x - fract(x)) 2311*5ffd83dbSDimitry Andric 2312*5ffd83dbSDimitry Andric auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) 2313*5ffd83dbSDimitry Andric .addUse(OrigSrc) 2314*5ffd83dbSDimitry Andric .setMIFlags(Flags); 2315*5ffd83dbSDimitry Andric 2316*5ffd83dbSDimitry Andric // Give source modifier matching some assistance before obscuring a foldable 2317*5ffd83dbSDimitry Andric // pattern. 2318*5ffd83dbSDimitry Andric 2319*5ffd83dbSDimitry Andric // TODO: We can avoid the neg on the fract? The input sign to fract 2320*5ffd83dbSDimitry Andric // shouldn't matter? 2321*5ffd83dbSDimitry Andric Register ModSrc = stripAnySourceMods(OrigSrc, MRI); 2322*5ffd83dbSDimitry Andric 2323*5ffd83dbSDimitry Andric auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); 2324*5ffd83dbSDimitry Andric 2325*5ffd83dbSDimitry Andric Register Min = MRI.createGenericVirtualRegister(S64); 2326*5ffd83dbSDimitry Andric 2327*5ffd83dbSDimitry Andric // We don't need to concern ourselves with the snan handling difference, so 2328*5ffd83dbSDimitry Andric // use the one which will directly select. 
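// Note on the clamp constant above (explanatory, matching the rationale of
// the DAG lowering): 0x3fefffffffffffff is 0x1.fffffffffffffp-1, i.e.
// 0.99999999999999989, the largest f64 strictly below 1.0. Buggy V_FRACT can
// round up to exactly 1.0 for inputs just under an integer, and the min()
// restores the fract(x) < 1.0 invariant that floor(x) = x - fract(x) needs.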
2329*5ffd83dbSDimitry Andric const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2330*5ffd83dbSDimitry Andric if (MFI->getMode().IEEE) 2331*5ffd83dbSDimitry Andric B.buildFMinNumIEEE(Min, Fract, Const, Flags); 2332*5ffd83dbSDimitry Andric else 2333*5ffd83dbSDimitry Andric B.buildFMinNum(Min, Fract, Const, Flags); 2334*5ffd83dbSDimitry Andric 2335*5ffd83dbSDimitry Andric Register CorrectedFract = Min; 2336*5ffd83dbSDimitry Andric if (!MI.getFlag(MachineInstr::FmNoNans)) { 2337*5ffd83dbSDimitry Andric auto IsNan = B.buildFCmp(CmpInst::FCMP_UNO, S1, ModSrc, ModSrc, Flags); 2338*5ffd83dbSDimitry Andric CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); 2339*5ffd83dbSDimitry Andric } 2340*5ffd83dbSDimitry Andric 2341*5ffd83dbSDimitry Andric auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); 2342*5ffd83dbSDimitry Andric B.buildFAdd(Dst, OrigSrc, NegFract, Flags); 2343*5ffd83dbSDimitry Andric 2344*5ffd83dbSDimitry Andric MI.eraseFromParent(); 2345*5ffd83dbSDimitry Andric return true; 2346*5ffd83dbSDimitry Andric } 2347*5ffd83dbSDimitry Andric 2348*5ffd83dbSDimitry Andric // Turn an illegal packed v2s16 build vector into bit operations. 2349*5ffd83dbSDimitry Andric // TODO: This should probably be a bitcast action in LegalizerHelper. 2350*5ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeBuildVector( 2351*5ffd83dbSDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { 2352*5ffd83dbSDimitry Andric Register Dst = MI.getOperand(0).getReg(); 2353*5ffd83dbSDimitry Andric const LLT S32 = LLT::scalar(32); 2354*5ffd83dbSDimitry Andric assert(MRI.getType(Dst) == LLT::vector(2, 16)); 2355*5ffd83dbSDimitry Andric 2356*5ffd83dbSDimitry Andric Register Src0 = MI.getOperand(1).getReg(); 2357*5ffd83dbSDimitry Andric Register Src1 = MI.getOperand(2).getReg(); 2358*5ffd83dbSDimitry Andric assert(MRI.getType(Src0) == LLT::scalar(16)); 2359*5ffd83dbSDimitry Andric 2360*5ffd83dbSDimitry Andric auto Merge = B.buildMerge(S32, {Src0, Src1}); 2361*5ffd83dbSDimitry Andric B.buildBitcast(Dst, Merge); 2362*5ffd83dbSDimitry Andric 2363*5ffd83dbSDimitry Andric MI.eraseFromParent(); 2364*5ffd83dbSDimitry Andric return true; 2365*5ffd83dbSDimitry Andric } 2366*5ffd83dbSDimitry Andric 23670b57cec5SDimitry Andric // Return the use branch instruction, or null if the usage is invalid. 23680b57cec5SDimitry Andric static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, 2369480093f4SDimitry Andric MachineRegisterInfo &MRI, 2370*5ffd83dbSDimitry Andric MachineInstr *&Br, 2371*5ffd83dbSDimitry Andric MachineBasicBlock *&UncondBrTarget) { 23720b57cec5SDimitry Andric Register CondDef = MI.getOperand(0).getReg(); 23730b57cec5SDimitry Andric if (!MRI.hasOneNonDBGUse(CondDef)) 23740b57cec5SDimitry Andric return nullptr; 23750b57cec5SDimitry Andric 2376*5ffd83dbSDimitry Andric MachineBasicBlock *Parent = MI.getParent(); 23770b57cec5SDimitry Andric MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); 2378*5ffd83dbSDimitry Andric if (UseMI.getParent() != Parent || 2379480093f4SDimitry Andric UseMI.getOpcode() != AMDGPU::G_BRCOND) 2380480093f4SDimitry Andric return nullptr; 2381480093f4SDimitry Andric 2382*5ffd83dbSDimitry Andric // Make sure the cond br is followed by a G_BR, or is the last instruction.
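// For orientation, the shape being matched looks roughly like this
// (illustrative MIR; register and block names are invented):
//   %cond:_(s1), ... = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), ...
//   G_BRCOND %cond(s1), %bb.flow
//   G_BR %bb.target        ; optional; if the block ends here instead, the
//                          ; layout successor becomes UncondBrTarget.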
2383480093f4SDimitry Andric MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); 2384*5ffd83dbSDimitry Andric if (Next == Parent->end()) { 2385*5ffd83dbSDimitry Andric MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); 2386*5ffd83dbSDimitry Andric if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. 2387*5ffd83dbSDimitry Andric return nullptr; 2388*5ffd83dbSDimitry Andric UncondBrTarget = &*NextMBB; 2389*5ffd83dbSDimitry Andric } else { 2390480093f4SDimitry Andric if (Next->getOpcode() != AMDGPU::G_BR) 2391480093f4SDimitry Andric return nullptr; 2392480093f4SDimitry Andric Br = &*Next; 2393*5ffd83dbSDimitry Andric UncondBrTarget = Br->getOperand(0).getMBB(); 2394480093f4SDimitry Andric } 2395480093f4SDimitry Andric 2396480093f4SDimitry Andric return &UseMI; 23970b57cec5SDimitry Andric } 23980b57cec5SDimitry Andric 2399*5ffd83dbSDimitry Andric Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B, 2400*5ffd83dbSDimitry Andric MachineRegisterInfo &MRI, 2401*5ffd83dbSDimitry Andric Register LiveIn, 2402*5ffd83dbSDimitry Andric Register PhyReg) const { 2403*5ffd83dbSDimitry Andric assert(PhyReg.isPhysical() && "Physical register expected"); 2404*5ffd83dbSDimitry Andric 2405*5ffd83dbSDimitry Andric // Insert the live-in copy, if required, by defining destination virtual 2406*5ffd83dbSDimitry Andric // register. 2407*5ffd83dbSDimitry Andric // FIXME: It seems EmitLiveInCopies isn't called anywhere? 2408*5ffd83dbSDimitry Andric if (!MRI.getVRegDef(LiveIn)) { 2409*5ffd83dbSDimitry Andric // FIXME: Should have scoped insert pt 2410*5ffd83dbSDimitry Andric MachineBasicBlock &OrigInsBB = B.getMBB(); 2411*5ffd83dbSDimitry Andric auto OrigInsPt = B.getInsertPt(); 2412*5ffd83dbSDimitry Andric 2413*5ffd83dbSDimitry Andric MachineBasicBlock &EntryMBB = B.getMF().front(); 2414*5ffd83dbSDimitry Andric EntryMBB.addLiveIn(PhyReg); 2415*5ffd83dbSDimitry Andric B.setInsertPt(EntryMBB, EntryMBB.begin()); 2416*5ffd83dbSDimitry Andric B.buildCopy(LiveIn, PhyReg); 2417*5ffd83dbSDimitry Andric 2418*5ffd83dbSDimitry Andric B.setInsertPt(OrigInsBB, OrigInsPt); 2419*5ffd83dbSDimitry Andric } 2420*5ffd83dbSDimitry Andric 2421*5ffd83dbSDimitry Andric return LiveIn; 2422*5ffd83dbSDimitry Andric } 2423*5ffd83dbSDimitry Andric 2424*5ffd83dbSDimitry Andric Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B, 2425*5ffd83dbSDimitry Andric MachineRegisterInfo &MRI, 2426*5ffd83dbSDimitry Andric Register PhyReg, LLT Ty, 2427*5ffd83dbSDimitry Andric bool InsertLiveInCopy) const { 2428*5ffd83dbSDimitry Andric assert(PhyReg.isPhysical() && "Physical register expected"); 2429*5ffd83dbSDimitry Andric 2430*5ffd83dbSDimitry Andric // Get or create the virtual live-in register. 2431*5ffd83dbSDimitry Andric Register LiveIn = MRI.getLiveInVirtReg(PhyReg); 2432*5ffd83dbSDimitry Andric if (!LiveIn) { 2433*5ffd83dbSDimitry Andric LiveIn = MRI.createGenericVirtualRegister(Ty); 2434*5ffd83dbSDimitry Andric MRI.addLiveIn(PhyReg, LiveIn); 2435*5ffd83dbSDimitry Andric } 2436*5ffd83dbSDimitry Andric 2437*5ffd83dbSDimitry Andric // When the actual copy required is from virtual register to physical 2438*5ffd83dbSDimitry Andric // register (to be inserted later), live-in copy insertion from physical 2439*5ffd83dbSDimitry Andric // register to virtual register is not required. 2440*5ffd83dbSDimitry Andric if (!InsertLiveInCopy) 24410b57cec5SDimitry Andric return LiveIn; 24420b57cec5SDimitry Andric 2443*5ffd83dbSDimitry Andric return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
2444*5ffd83dbSDimitry Andric } 2445*5ffd83dbSDimitry Andric 2446*5ffd83dbSDimitry Andric const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor( 2447*5ffd83dbSDimitry Andric MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 2448*5ffd83dbSDimitry Andric const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2449*5ffd83dbSDimitry Andric const ArgDescriptor *Arg; 2450*5ffd83dbSDimitry Andric const TargetRegisterClass *RC; 2451*5ffd83dbSDimitry Andric LLT ArgTy; 2452*5ffd83dbSDimitry Andric std::tie(Arg, RC, ArgTy) = MFI->getPreloadedValue(ArgType); 2453*5ffd83dbSDimitry Andric if (!Arg) { 2454*5ffd83dbSDimitry Andric LLVM_DEBUG(dbgs() << "Required arg register missing\n"); 2455*5ffd83dbSDimitry Andric return nullptr; 2456*5ffd83dbSDimitry Andric } 2457*5ffd83dbSDimitry Andric return Arg; 24580b57cec5SDimitry Andric } 24590b57cec5SDimitry Andric 24600b57cec5SDimitry Andric bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, 24610b57cec5SDimitry Andric const ArgDescriptor *Arg) const { 24628bcb0991SDimitry Andric if (!Arg->isRegister() || !Arg->getRegister().isValid()) 24630b57cec5SDimitry Andric return false; // TODO: Handle these 24640b57cec5SDimitry Andric 2465*5ffd83dbSDimitry Andric Register SrcReg = Arg->getRegister(); 2466*5ffd83dbSDimitry Andric assert(SrcReg.isPhysical() && "Physical register expected"); 2467*5ffd83dbSDimitry Andric assert(DstReg.isVirtual() && "Virtual register expected"); 24680b57cec5SDimitry Andric 24690b57cec5SDimitry Andric MachineRegisterInfo &MRI = *B.getMRI(); 24700b57cec5SDimitry Andric 24710b57cec5SDimitry Andric LLT Ty = MRI.getType(DstReg); 2472*5ffd83dbSDimitry Andric Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty); 24730b57cec5SDimitry Andric 24740b57cec5SDimitry Andric if (Arg->isMasked()) { 24750b57cec5SDimitry Andric // TODO: Should we try to emit this once in the entry block? 
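// Worked example for the masked path below (values are hypothetical): a
// workitem ID packed with Mask = 0x3ff00000 gives Shift = 20, so the value
// is unpacked as
//   %shifted = G_LSHR %livein, 20
//   %dst     = G_AND  %shifted, 0x3ff   ; Mask >> Shift
// which matches the lshr + and sequence emitted here.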
24760b57cec5SDimitry Andric const LLT S32 = LLT::scalar(32); 24770b57cec5SDimitry Andric const unsigned Mask = Arg->getMask(); 24780b57cec5SDimitry Andric const unsigned Shift = countTrailingZeros<unsigned>(Mask); 24790b57cec5SDimitry Andric 24808bcb0991SDimitry Andric Register AndMaskSrc = LiveIn; 24818bcb0991SDimitry Andric 24828bcb0991SDimitry Andric if (Shift != 0) { 24830b57cec5SDimitry Andric auto ShiftAmt = B.buildConstant(S32, Shift); 24848bcb0991SDimitry Andric AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0); 24858bcb0991SDimitry Andric } 24868bcb0991SDimitry Andric 24878bcb0991SDimitry Andric B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); 2488*5ffd83dbSDimitry Andric } else { 24890b57cec5SDimitry Andric B.buildCopy(DstReg, LiveIn); 24900b57cec5SDimitry Andric } 24910b57cec5SDimitry Andric 24920b57cec5SDimitry Andric return true; 24930b57cec5SDimitry Andric } 24940b57cec5SDimitry Andric 24950b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( 2496*5ffd83dbSDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, 24970b57cec5SDimitry Andric AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { 24980b57cec5SDimitry Andric 2499*5ffd83dbSDimitry Andric const ArgDescriptor *Arg = getArgDescriptor(B, ArgType); 2500*5ffd83dbSDimitry Andric if (!Arg) 25010b57cec5SDimitry Andric return false; 25020b57cec5SDimitry Andric 2503*5ffd83dbSDimitry Andric if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg)) 2504*5ffd83dbSDimitry Andric return false; 2505*5ffd83dbSDimitry Andric 25060b57cec5SDimitry Andric MI.eraseFromParent(); 25070b57cec5SDimitry Andric return true; 25080b57cec5SDimitry Andric } 25090b57cec5SDimitry Andric 25108bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, 25118bcb0991SDimitry Andric MachineRegisterInfo &MRI, 25128bcb0991SDimitry Andric MachineIRBuilder &B) const { 2513480093f4SDimitry Andric Register Dst = MI.getOperand(0).getReg(); 2514480093f4SDimitry Andric LLT DstTy = MRI.getType(Dst); 2515480093f4SDimitry Andric LLT S16 = LLT::scalar(16); 2516480093f4SDimitry Andric LLT S32 = LLT::scalar(32); 2517480093f4SDimitry Andric LLT S64 = LLT::scalar(64); 25188bcb0991SDimitry Andric 25198bcb0991SDimitry Andric if (legalizeFastUnsafeFDIV(MI, MRI, B)) 25208bcb0991SDimitry Andric return true; 25218bcb0991SDimitry Andric 2522480093f4SDimitry Andric if (DstTy == S16) 2523480093f4SDimitry Andric return legalizeFDIV16(MI, MRI, B); 2524480093f4SDimitry Andric if (DstTy == S32) 2525480093f4SDimitry Andric return legalizeFDIV32(MI, MRI, B); 2526480093f4SDimitry Andric if (DstTy == S64) 2527480093f4SDimitry Andric return legalizeFDIV64(MI, MRI, B); 2528480093f4SDimitry Andric 25298bcb0991SDimitry Andric return false; 25308bcb0991SDimitry Andric } 25318bcb0991SDimitry Andric 2532*5ffd83dbSDimitry Andric void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, 2533*5ffd83dbSDimitry Andric Register DstReg, 2534*5ffd83dbSDimitry Andric Register X, 2535*5ffd83dbSDimitry Andric Register Y, 2536*5ffd83dbSDimitry Andric bool IsDiv) const { 2537*5ffd83dbSDimitry Andric const LLT S1 = LLT::scalar(1); 2538*5ffd83dbSDimitry Andric const LLT S32 = LLT::scalar(32); 2539*5ffd83dbSDimitry Andric 2540*5ffd83dbSDimitry Andric // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the 2541*5ffd83dbSDimitry Andric // algorithm used here. 2542*5ffd83dbSDimitry Andric 2543*5ffd83dbSDimitry Andric // Initial estimate of inv(y). 
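// Explanatory sketch: Z approximates 2**32 / y computed in f32. The scale
// constant 0x4f7ffffe is the f32 value 2**32 - 512, just below 2**32;
// scaling by it rather than an exact 2**32 avoids overflow and biases Z
// low, and the single unsigned Newton-Raphson ("UNR") round below plus the
// two conditional refinements recover the exact quotient and remainder.
// E.g. for y = 7, rcp(7.0) * 0x4f7ffffe yields Z close to 2**32 / 7.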
2544*5ffd83dbSDimitry Andric auto FloatY = B.buildUITOFP(S32, Y); 2545*5ffd83dbSDimitry Andric auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); 2546*5ffd83dbSDimitry Andric auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); 2547*5ffd83dbSDimitry Andric auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); 2548*5ffd83dbSDimitry Andric auto Z = B.buildFPTOUI(S32, ScaledY); 2549*5ffd83dbSDimitry Andric 2550*5ffd83dbSDimitry Andric // One round of UNR. 2551*5ffd83dbSDimitry Andric auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); 2552*5ffd83dbSDimitry Andric auto NegYZ = B.buildMul(S32, NegY, Z); 2553*5ffd83dbSDimitry Andric Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); 2554*5ffd83dbSDimitry Andric 2555*5ffd83dbSDimitry Andric // Quotient/remainder estimate. 2556*5ffd83dbSDimitry Andric auto Q = B.buildUMulH(S32, X, Z); 2557*5ffd83dbSDimitry Andric auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); 2558*5ffd83dbSDimitry Andric 2559*5ffd83dbSDimitry Andric // First quotient/remainder refinement. 2560*5ffd83dbSDimitry Andric auto One = B.buildConstant(S32, 1); 2561*5ffd83dbSDimitry Andric auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2562*5ffd83dbSDimitry Andric if (IsDiv) 2563*5ffd83dbSDimitry Andric Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); 2564*5ffd83dbSDimitry Andric R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); 2565*5ffd83dbSDimitry Andric 2566*5ffd83dbSDimitry Andric // Second quotient/remainder refinement. 2567*5ffd83dbSDimitry Andric Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); 2568*5ffd83dbSDimitry Andric if (IsDiv) 2569*5ffd83dbSDimitry Andric B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q); 2570*5ffd83dbSDimitry Andric else 2571*5ffd83dbSDimitry Andric B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R); 2572*5ffd83dbSDimitry Andric } 2573*5ffd83dbSDimitry Andric 2574*5ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI, 2575*5ffd83dbSDimitry Andric MachineRegisterInfo &MRI, 2576*5ffd83dbSDimitry Andric MachineIRBuilder &B) const { 2577*5ffd83dbSDimitry Andric const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2578*5ffd83dbSDimitry Andric Register DstReg = MI.getOperand(0).getReg(); 2579*5ffd83dbSDimitry Andric Register Num = MI.getOperand(1).getReg(); 2580*5ffd83dbSDimitry Andric Register Den = MI.getOperand(2).getReg(); 2581*5ffd83dbSDimitry Andric legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2582*5ffd83dbSDimitry Andric MI.eraseFromParent(); 2583*5ffd83dbSDimitry Andric return true; 2584*5ffd83dbSDimitry Andric } 2585*5ffd83dbSDimitry Andric 2586*5ffd83dbSDimitry Andric // Build integer reciprocal sequence around V_RCP_IFLAG_F32 2587*5ffd83dbSDimitry Andric // 2588*5ffd83dbSDimitry Andric // Return lo, hi of result 2589*5ffd83dbSDimitry Andric // 2590*5ffd83dbSDimitry Andric // %cvt.lo = G_UITOFP Val.lo 2591*5ffd83dbSDimitry Andric // %cvt.hi = G_UITOFP Val.hi 2592*5ffd83dbSDimitry Andric // %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo 2593*5ffd83dbSDimitry Andric // %rcp = G_AMDGPU_RCP_IFLAG %mad 2594*5ffd83dbSDimitry Andric // %mul1 = G_FMUL %rcp, 0x5f7ffffc 2595*5ffd83dbSDimitry Andric // %mul2 = G_FMUL %mul1, 2**(-32) 2596*5ffd83dbSDimitry Andric // %trunc = G_INTRINSIC_TRUNC %mul2 2597*5ffd83dbSDimitry Andric // %mad2 = G_FMAD %trunc, -(2**32), %mul1 2598*5ffd83dbSDimitry Andric // return {G_FPTOUI %mad2, G_FPTOUI %trunc} 2599*5ffd83dbSDimitry Andric static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, 2600*5ffd83dbSDimitry
Andric Register Val) { 2601*5ffd83dbSDimitry Andric const LLT S32 = LLT::scalar(32); 2602*5ffd83dbSDimitry Andric auto Unmerge = B.buildUnmerge(S32, Val); 2603*5ffd83dbSDimitry Andric 2604*5ffd83dbSDimitry Andric auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); 2605*5ffd83dbSDimitry Andric auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); 2606*5ffd83dbSDimitry Andric 2607*5ffd83dbSDimitry Andric auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 2608*5ffd83dbSDimitry Andric B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); 2609*5ffd83dbSDimitry Andric 2610*5ffd83dbSDimitry Andric auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); 2611*5ffd83dbSDimitry Andric auto Mul1 = 2612*5ffd83dbSDimitry Andric B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); 2613*5ffd83dbSDimitry Andric 2614*5ffd83dbSDimitry Andric // 2**(-32) 2615*5ffd83dbSDimitry Andric auto Mul2 = 2616*5ffd83dbSDimitry Andric B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); 2617*5ffd83dbSDimitry Andric auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); 2618*5ffd83dbSDimitry Andric 2619*5ffd83dbSDimitry Andric // -(2**32) 2620*5ffd83dbSDimitry Andric auto Mad2 = B.buildFMAD(S32, Trunc, 2621*5ffd83dbSDimitry Andric B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); 2622*5ffd83dbSDimitry Andric 2623*5ffd83dbSDimitry Andric auto ResultLo = B.buildFPTOUI(S32, Mad2); 2624*5ffd83dbSDimitry Andric auto ResultHi = B.buildFPTOUI(S32, Trunc); 2625*5ffd83dbSDimitry Andric 2626*5ffd83dbSDimitry Andric return {ResultLo.getReg(0), ResultHi.getReg(0)}; 2627*5ffd83dbSDimitry Andric } 2628*5ffd83dbSDimitry Andric 2629*5ffd83dbSDimitry Andric void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B, 2630*5ffd83dbSDimitry Andric Register DstReg, 2631*5ffd83dbSDimitry Andric Register Numer, 2632*5ffd83dbSDimitry Andric Register Denom, 2633*5ffd83dbSDimitry Andric bool IsDiv) const { 2634*5ffd83dbSDimitry Andric const LLT S32 = LLT::scalar(32); 2635*5ffd83dbSDimitry Andric const LLT S64 = LLT::scalar(64); 2636*5ffd83dbSDimitry Andric const LLT S1 = LLT::scalar(1); 2637*5ffd83dbSDimitry Andric Register RcpLo, RcpHi; 2638*5ffd83dbSDimitry Andric 2639*5ffd83dbSDimitry Andric std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); 2640*5ffd83dbSDimitry Andric 2641*5ffd83dbSDimitry Andric auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); 2642*5ffd83dbSDimitry Andric 2643*5ffd83dbSDimitry Andric auto Zero64 = B.buildConstant(S64, 0); 2644*5ffd83dbSDimitry Andric auto NegDenom = B.buildSub(S64, Zero64, Denom); 2645*5ffd83dbSDimitry Andric 2646*5ffd83dbSDimitry Andric auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); 2647*5ffd83dbSDimitry Andric auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); 2648*5ffd83dbSDimitry Andric 2649*5ffd83dbSDimitry Andric auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); 2650*5ffd83dbSDimitry Andric Register MulHi1_Lo = UnmergeMulHi1.getReg(0); 2651*5ffd83dbSDimitry Andric Register MulHi1_Hi = UnmergeMulHi1.getReg(1); 2652*5ffd83dbSDimitry Andric 2653*5ffd83dbSDimitry Andric auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); 2654*5ffd83dbSDimitry Andric auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); 2655*5ffd83dbSDimitry Andric auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); 2656*5ffd83dbSDimitry Andric auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); 2657*5ffd83dbSDimitry Andric 2658*5ffd83dbSDimitry Andric auto MulLo2 = B.buildMul(S64, NegDenom, Add1); 2659*5ffd83dbSDimitry Andric auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); 
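// Second refinement (explanatory note): as with MulLo1/MulHi1 above, MulLo2
// is the error term -Denom * Add1 (mod 2**64) and MulHi2 = umulh(Add1,
// MulLo2) is the correction to fold back into the reciprocal estimate; the
// G_UADDO/G_UADDE chain below reassembles that 64-bit add from 32-bit
// halves with explicit carries.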
2660*5ffd83dbSDimitry Andric auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); 2661*5ffd83dbSDimitry Andric Register MulHi2_Lo = UnmergeMulHi2.getReg(0); 2662*5ffd83dbSDimitry Andric Register MulHi2_Hi = UnmergeMulHi2.getReg(1); 2663*5ffd83dbSDimitry Andric 2664*5ffd83dbSDimitry Andric auto Zero32 = B.buildConstant(S32, 0); 2665*5ffd83dbSDimitry Andric auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); 2666*5ffd83dbSDimitry Andric auto Add2_HiC = 2667*5ffd83dbSDimitry Andric B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); 2668*5ffd83dbSDimitry Andric auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); 2669*5ffd83dbSDimitry Andric auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); 2670*5ffd83dbSDimitry Andric 2671*5ffd83dbSDimitry Andric auto UnmergeNumer = B.buildUnmerge(S32, Numer); 2672*5ffd83dbSDimitry Andric Register NumerLo = UnmergeNumer.getReg(0); 2673*5ffd83dbSDimitry Andric Register NumerHi = UnmergeNumer.getReg(1); 2674*5ffd83dbSDimitry Andric 2675*5ffd83dbSDimitry Andric auto MulHi3 = B.buildUMulH(S64, Numer, Add2); 2676*5ffd83dbSDimitry Andric auto Mul3 = B.buildMul(S64, Denom, MulHi3); 2677*5ffd83dbSDimitry Andric auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); 2678*5ffd83dbSDimitry Andric Register Mul3_Lo = UnmergeMul3.getReg(0); 2679*5ffd83dbSDimitry Andric Register Mul3_Hi = UnmergeMul3.getReg(1); 2680*5ffd83dbSDimitry Andric auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); 2681*5ffd83dbSDimitry Andric auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); 2682*5ffd83dbSDimitry Andric auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); 2683*5ffd83dbSDimitry Andric auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); 2684*5ffd83dbSDimitry Andric 2685*5ffd83dbSDimitry Andric auto UnmergeDenom = B.buildUnmerge(S32, Denom); 2686*5ffd83dbSDimitry Andric Register DenomLo = UnmergeDenom.getReg(0); 2687*5ffd83dbSDimitry Andric Register DenomHi = UnmergeDenom.getReg(1); 2688*5ffd83dbSDimitry Andric 2689*5ffd83dbSDimitry Andric auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); 2690*5ffd83dbSDimitry Andric auto C1 = B.buildSExt(S32, CmpHi); 2691*5ffd83dbSDimitry Andric 2692*5ffd83dbSDimitry Andric auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); 2693*5ffd83dbSDimitry Andric auto C2 = B.buildSExt(S32, CmpLo); 2694*5ffd83dbSDimitry Andric 2695*5ffd83dbSDimitry Andric auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); 2696*5ffd83dbSDimitry Andric auto C3 = B.buildSelect(S32, CmpEq, C2, C1); 2697*5ffd83dbSDimitry Andric 2698*5ffd83dbSDimitry Andric // TODO: Here and below portions of the code can be enclosed into if/endif. 2699*5ffd83dbSDimitry Andric // Currently control flow is unconditional and we have 4 selects after 2700*5ffd83dbSDimitry Andric // potential endif to substitute PHIs. 2701*5ffd83dbSDimitry Andric 2702*5ffd83dbSDimitry Andric // if C3 != 0 ... 
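// Explanatory note on the select-based control flow: the quotient
// candidates are MulHi3, MulHi3 + 1 (Add3) and MulHi3 + 2 (Add4); the
// remainder candidates are Sub1, Sub1 - Denom (Sub2) and Sub2 - Denom
// (Sub3). C3 and C6 encode the usual "remainder >= denominator" tests,
// i.e. up to two conditional correction steps of long division, done with
// selects instead of branches.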
2703*5ffd83dbSDimitry Andric auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); 2704*5ffd83dbSDimitry Andric auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); 2705*5ffd83dbSDimitry Andric auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); 2706*5ffd83dbSDimitry Andric auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); 2707*5ffd83dbSDimitry Andric 2708*5ffd83dbSDimitry Andric auto One64 = B.buildConstant(S64, 1); 2709*5ffd83dbSDimitry Andric auto Add3 = B.buildAdd(S64, MulHi3, One64); 2710*5ffd83dbSDimitry Andric 2711*5ffd83dbSDimitry Andric auto C4 = 2712*5ffd83dbSDimitry Andric B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); 2713*5ffd83dbSDimitry Andric auto C5 = 2714*5ffd83dbSDimitry Andric B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); 2715*5ffd83dbSDimitry Andric auto C6 = B.buildSelect( 2716*5ffd83dbSDimitry Andric S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); 2717*5ffd83dbSDimitry Andric 2718*5ffd83dbSDimitry Andric // if (C6 != 0) 2719*5ffd83dbSDimitry Andric auto Add4 = B.buildAdd(S64, Add3, One64); 2720*5ffd83dbSDimitry Andric auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); 2721*5ffd83dbSDimitry Andric 2722*5ffd83dbSDimitry Andric auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); 2723*5ffd83dbSDimitry Andric auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); 2724*5ffd83dbSDimitry Andric auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); 2725*5ffd83dbSDimitry Andric 2726*5ffd83dbSDimitry Andric // endif C6 2727*5ffd83dbSDimitry Andric // endif C3 2728*5ffd83dbSDimitry Andric 2729*5ffd83dbSDimitry Andric if (IsDiv) { 2730*5ffd83dbSDimitry Andric auto Sel1 = B.buildSelect( 2731*5ffd83dbSDimitry Andric S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); 2732*5ffd83dbSDimitry Andric B.buildSelect(DstReg, 2733*5ffd83dbSDimitry Andric B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3); 2734*5ffd83dbSDimitry Andric } else { 2735*5ffd83dbSDimitry Andric auto Sel2 = B.buildSelect( 2736*5ffd83dbSDimitry Andric S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); 2737*5ffd83dbSDimitry Andric B.buildSelect(DstReg, 2738*5ffd83dbSDimitry Andric B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1); 2739*5ffd83dbSDimitry Andric } 2740*5ffd83dbSDimitry Andric } 2741*5ffd83dbSDimitry Andric 2742*5ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI, 2743*5ffd83dbSDimitry Andric MachineRegisterInfo &MRI, 2744*5ffd83dbSDimitry Andric MachineIRBuilder &B) const { 2745*5ffd83dbSDimitry Andric const LLT S64 = LLT::scalar(64); 2746*5ffd83dbSDimitry Andric const LLT S32 = LLT::scalar(32); 2747*5ffd83dbSDimitry Andric const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; 2748*5ffd83dbSDimitry Andric Register DstReg = MI.getOperand(0).getReg(); 2749*5ffd83dbSDimitry Andric Register Num = MI.getOperand(1).getReg(); 2750*5ffd83dbSDimitry Andric Register Den = MI.getOperand(2).getReg(); 2751*5ffd83dbSDimitry Andric LLT Ty = MRI.getType(DstReg); 2752*5ffd83dbSDimitry Andric 2753*5ffd83dbSDimitry Andric if (Ty == S32) 2754*5ffd83dbSDimitry Andric legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); 2755*5ffd83dbSDimitry Andric else if (Ty == S64) 2756*5ffd83dbSDimitry Andric legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv); 2757*5ffd83dbSDimitry Andric else 2758*5ffd83dbSDimitry Andric return false; 2759*5ffd83dbSDimitry Andric 2760*5ffd83dbSDimitry Andric 
MI.eraseFromParent(); 2761*5ffd83dbSDimitry Andric return true; 2762*5ffd83dbSDimitry Andric 2763*5ffd83dbSDimitry Andric } 2764*5ffd83dbSDimitry Andric 2765*5ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, 2766*5ffd83dbSDimitry Andric MachineRegisterInfo &MRI, 2767*5ffd83dbSDimitry Andric MachineIRBuilder &B) const { 2768*5ffd83dbSDimitry Andric const LLT S64 = LLT::scalar(64); 2769*5ffd83dbSDimitry Andric const LLT S32 = LLT::scalar(32); 2770*5ffd83dbSDimitry Andric 2771*5ffd83dbSDimitry Andric Register DstReg = MI.getOperand(0).getReg(); 2772*5ffd83dbSDimitry Andric const LLT Ty = MRI.getType(DstReg); 2773*5ffd83dbSDimitry Andric if (Ty != S32 && Ty != S64) 2774*5ffd83dbSDimitry Andric return false; 2775*5ffd83dbSDimitry Andric 2776*5ffd83dbSDimitry Andric const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV; 2777*5ffd83dbSDimitry Andric 2778*5ffd83dbSDimitry Andric Register LHS = MI.getOperand(1).getReg(); 2779*5ffd83dbSDimitry Andric Register RHS = MI.getOperand(2).getReg(); 2780*5ffd83dbSDimitry Andric 2781*5ffd83dbSDimitry Andric auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); 2782*5ffd83dbSDimitry Andric auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); 2783*5ffd83dbSDimitry Andric auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); 2784*5ffd83dbSDimitry Andric 2785*5ffd83dbSDimitry Andric LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); 2786*5ffd83dbSDimitry Andric RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); 2787*5ffd83dbSDimitry Andric 2788*5ffd83dbSDimitry Andric LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); 2789*5ffd83dbSDimitry Andric RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); 2790*5ffd83dbSDimitry Andric 2791*5ffd83dbSDimitry Andric Register UDivRem = MRI.createGenericVirtualRegister(Ty); 2792*5ffd83dbSDimitry Andric if (Ty == S32) 2793*5ffd83dbSDimitry Andric legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv); 2794*5ffd83dbSDimitry Andric else 2795*5ffd83dbSDimitry Andric legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv); 2796*5ffd83dbSDimitry Andric 2797*5ffd83dbSDimitry Andric Register Sign; 2798*5ffd83dbSDimitry Andric if (IsDiv) 2799*5ffd83dbSDimitry Andric Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); 2800*5ffd83dbSDimitry Andric else 2801*5ffd83dbSDimitry Andric Sign = LHSign.getReg(0); // Remainder sign is the same as LHS 2802*5ffd83dbSDimitry Andric 2803*5ffd83dbSDimitry Andric UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0); 2804*5ffd83dbSDimitry Andric B.buildSub(DstReg, UDivRem, Sign); 2805*5ffd83dbSDimitry Andric 2806*5ffd83dbSDimitry Andric MI.eraseFromParent(); 2807*5ffd83dbSDimitry Andric return true; 2808*5ffd83dbSDimitry Andric } 2809*5ffd83dbSDimitry Andric 28108bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, 28118bcb0991SDimitry Andric MachineRegisterInfo &MRI, 28128bcb0991SDimitry Andric MachineIRBuilder &B) const { 28138bcb0991SDimitry Andric Register Res = MI.getOperand(0).getReg(); 28148bcb0991SDimitry Andric Register LHS = MI.getOperand(1).getReg(); 28158bcb0991SDimitry Andric Register RHS = MI.getOperand(2).getReg(); 28168bcb0991SDimitry Andric 28178bcb0991SDimitry Andric uint16_t Flags = MI.getFlags(); 28188bcb0991SDimitry Andric 28198bcb0991SDimitry Andric LLT ResTy = MRI.getType(Res); 28208bcb0991SDimitry Andric LLT S32 = LLT::scalar(32); 28218bcb0991SDimitry Andric LLT S64 = LLT::scalar(64); 28228bcb0991SDimitry Andric 28238bcb0991SDimitry Andric const MachineFunction &MF = B.getMF(); 28248bcb0991SDimitry Andric bool Unsafe = 
28258bcb0991SDimitry Andric MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); 28268bcb0991SDimitry Andric 28278bcb0991SDimitry Andric if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) 28288bcb0991SDimitry Andric return false; 28298bcb0991SDimitry Andric 2830480093f4SDimitry Andric if (!Unsafe && ResTy == S32 && 2831*5ffd83dbSDimitry Andric MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) 28328bcb0991SDimitry Andric return false; 28338bcb0991SDimitry Andric 28348bcb0991SDimitry Andric if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { 28358bcb0991SDimitry Andric // 1 / x -> RCP(x) 28368bcb0991SDimitry Andric if (CLHS->isExactlyValue(1.0)) { 28378bcb0991SDimitry Andric B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 28388bcb0991SDimitry Andric .addUse(RHS) 28398bcb0991SDimitry Andric .setMIFlags(Flags); 28408bcb0991SDimitry Andric 28418bcb0991SDimitry Andric MI.eraseFromParent(); 28428bcb0991SDimitry Andric return true; 28438bcb0991SDimitry Andric } 28448bcb0991SDimitry Andric 28458bcb0991SDimitry Andric // -1 / x -> RCP( FNEG(x) ) 28468bcb0991SDimitry Andric if (CLHS->isExactlyValue(-1.0)) { 28478bcb0991SDimitry Andric auto FNeg = B.buildFNeg(ResTy, RHS, Flags); 28488bcb0991SDimitry Andric B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false) 28498bcb0991SDimitry Andric .addUse(FNeg.getReg(0)) 28508bcb0991SDimitry Andric .setMIFlags(Flags); 28518bcb0991SDimitry Andric 28528bcb0991SDimitry Andric MI.eraseFromParent(); 28538bcb0991SDimitry Andric return true; 28548bcb0991SDimitry Andric } 28558bcb0991SDimitry Andric } 28568bcb0991SDimitry Andric 28578bcb0991SDimitry Andric // x / y -> x * (1.0 / y) 28588bcb0991SDimitry Andric if (Unsafe) { 28598bcb0991SDimitry Andric auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) 28608bcb0991SDimitry Andric .addUse(RHS) 28618bcb0991SDimitry Andric .setMIFlags(Flags); 28628bcb0991SDimitry Andric B.buildFMul(Res, LHS, RCP, Flags); 28638bcb0991SDimitry Andric 28648bcb0991SDimitry Andric MI.eraseFromParent(); 28658bcb0991SDimitry Andric return true; 28668bcb0991SDimitry Andric } 28678bcb0991SDimitry Andric 28688bcb0991SDimitry Andric return false; 28698bcb0991SDimitry Andric } 28708bcb0991SDimitry Andric 2871480093f4SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, 2872480093f4SDimitry Andric MachineRegisterInfo &MRI, 2873480093f4SDimitry Andric MachineIRBuilder &B) const { 2874480093f4SDimitry Andric Register Res = MI.getOperand(0).getReg(); 2875480093f4SDimitry Andric Register LHS = MI.getOperand(1).getReg(); 2876480093f4SDimitry Andric Register RHS = MI.getOperand(2).getReg(); 2877480093f4SDimitry Andric 2878480093f4SDimitry Andric uint16_t Flags = MI.getFlags(); 2879480093f4SDimitry Andric 2880480093f4SDimitry Andric LLT S16 = LLT::scalar(16); 2881480093f4SDimitry Andric LLT S32 = LLT::scalar(32); 2882480093f4SDimitry Andric 2883480093f4SDimitry Andric auto LHSExt = B.buildFPExt(S32, LHS, Flags); 2884480093f4SDimitry Andric auto RHSExt = B.buildFPExt(S32, RHS, Flags); 2885480093f4SDimitry Andric 2886480093f4SDimitry Andric auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2887480093f4SDimitry Andric .addUse(RHSExt.getReg(0)) 2888480093f4SDimitry Andric .setMIFlags(Flags); 2889480093f4SDimitry Andric 2890480093f4SDimitry Andric auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags); 2891480093f4SDimitry Andric auto RDst = B.buildFPTrunc(S16, QUOT, Flags); 2892480093f4SDimitry Andric 2893480093f4SDimitry Andric B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, 
Res, false) 2894480093f4SDimitry Andric .addUse(RDst.getReg(0)) 2895480093f4SDimitry Andric .addUse(RHS) 2896480093f4SDimitry Andric .addUse(LHS) 2897480093f4SDimitry Andric .setMIFlags(Flags); 2898480093f4SDimitry Andric 2899480093f4SDimitry Andric MI.eraseFromParent(); 2900480093f4SDimitry Andric return true; 2901480093f4SDimitry Andric } 2902480093f4SDimitry Andric 2903480093f4SDimitry Andric // Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions 2904480093f4SDimitry Andric // to enable denorm mode. When 'Enable' is false, disable denorm mode. 2905480093f4SDimitry Andric static void toggleSPDenormMode(bool Enable, 2906480093f4SDimitry Andric MachineIRBuilder &B, 2907480093f4SDimitry Andric const GCNSubtarget &ST, 2908480093f4SDimitry Andric AMDGPU::SIModeRegisterDefaults Mode) { 2909480093f4SDimitry Andric // Set SP denorm mode to this value. 2910480093f4SDimitry Andric unsigned SPDenormMode = 2911*5ffd83dbSDimitry Andric Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); 2912480093f4SDimitry Andric 2913480093f4SDimitry Andric if (ST.hasDenormModeInst()) { 2914480093f4SDimitry Andric // Preserve default FP64FP16 denorm mode while updating FP32 mode. 2915*5ffd83dbSDimitry Andric uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); 2916480093f4SDimitry Andric 2917*5ffd83dbSDimitry Andric uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); 2918480093f4SDimitry Andric B.buildInstr(AMDGPU::S_DENORM_MODE) 2919480093f4SDimitry Andric .addImm(NewDenormModeValue); 2920480093f4SDimitry Andric 2921480093f4SDimitry Andric } else { 2922480093f4SDimitry Andric // Select FP32 bit field in mode register. 2923480093f4SDimitry Andric unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE | 2924480093f4SDimitry Andric (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | 2925480093f4SDimitry Andric (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); 2926480093f4SDimitry Andric 2927480093f4SDimitry Andric B.buildInstr(AMDGPU::S_SETREG_IMM32_B32) 2928480093f4SDimitry Andric .addImm(SPDenormMode) 2929480093f4SDimitry Andric .addImm(SPDenormModeBitField); 2930480093f4SDimitry Andric } 2931480093f4SDimitry Andric } 2932480093f4SDimitry Andric 2933480093f4SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, 2934480093f4SDimitry Andric MachineRegisterInfo &MRI, 2935480093f4SDimitry Andric MachineIRBuilder &B) const { 2936480093f4SDimitry Andric Register Res = MI.getOperand(0).getReg(); 2937480093f4SDimitry Andric Register LHS = MI.getOperand(1).getReg(); 2938480093f4SDimitry Andric Register RHS = MI.getOperand(2).getReg(); 2939480093f4SDimitry Andric const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); 2940480093f4SDimitry Andric AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode(); 2941480093f4SDimitry Andric 2942480093f4SDimitry Andric uint16_t Flags = MI.getFlags(); 2943480093f4SDimitry Andric 2944480093f4SDimitry Andric LLT S32 = LLT::scalar(32); 2945480093f4SDimitry Andric LLT S1 = LLT::scalar(1); 2946480093f4SDimitry Andric 2947480093f4SDimitry Andric auto One = B.buildFConstant(S32, 1.0f); 2948480093f4SDimitry Andric 2949480093f4SDimitry Andric auto DenominatorScaled = 2950480093f4SDimitry Andric B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2951480093f4SDimitry Andric .addUse(LHS) 2952*5ffd83dbSDimitry Andric .addUse(RHS) 2953*5ffd83dbSDimitry Andric .addImm(0) 2954480093f4SDimitry Andric .setMIFlags(Flags); 2955480093f4SDimitry Andric auto NumeratorScaled = 2956480093f4SDimitry Andric 
B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) 2957480093f4SDimitry Andric .addUse(LHS) 2958480093f4SDimitry Andric .addUse(RHS) 2959*5ffd83dbSDimitry Andric .addImm(1) 2960480093f4SDimitry Andric .setMIFlags(Flags); 2961480093f4SDimitry Andric 2962480093f4SDimitry Andric auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) 2963480093f4SDimitry Andric .addUse(DenominatorScaled.getReg(0)) 2964480093f4SDimitry Andric .setMIFlags(Flags); 2965480093f4SDimitry Andric auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags); 2966480093f4SDimitry Andric 2967480093f4SDimitry Andric // FIXME: Doesn't correctly model the FP mode switch, and the FP operations 2968480093f4SDimitry Andric // aren't modeled as reading it. 2969*5ffd83dbSDimitry Andric if (!Mode.allFP32Denormals()) 2970480093f4SDimitry Andric toggleSPDenormMode(true, B, ST, Mode); 2971480093f4SDimitry Andric 2972480093f4SDimitry Andric auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); 2973480093f4SDimitry Andric auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags); 2974480093f4SDimitry Andric auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags); 2975480093f4SDimitry Andric auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags); 2976480093f4SDimitry Andric auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); 2977480093f4SDimitry Andric auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); 2978480093f4SDimitry Andric 2979*5ffd83dbSDimitry Andric if (!Mode.allFP32Denormals()) 2980480093f4SDimitry Andric toggleSPDenormMode(false, B, ST, Mode); 2981480093f4SDimitry Andric 2982480093f4SDimitry Andric auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) 2983480093f4SDimitry Andric .addUse(Fma4.getReg(0)) 2984480093f4SDimitry Andric .addUse(Fma1.getReg(0)) 2985480093f4SDimitry Andric .addUse(Fma3.getReg(0)) 2986480093f4SDimitry Andric .addUse(NumeratorScaled.getReg(1)) 2987480093f4SDimitry Andric .setMIFlags(Flags); 2988480093f4SDimitry Andric 2989480093f4SDimitry Andric B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false) 2990480093f4SDimitry Andric .addUse(Fmas.getReg(0)) 2991480093f4SDimitry Andric .addUse(RHS) 2992480093f4SDimitry Andric .addUse(LHS) 2993480093f4SDimitry Andric .setMIFlags(Flags); 2994480093f4SDimitry Andric 2995480093f4SDimitry Andric MI.eraseFromParent(); 2996480093f4SDimitry Andric return true; 2997480093f4SDimitry Andric } 2998480093f4SDimitry Andric 2999480093f4SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, 3000480093f4SDimitry Andric MachineRegisterInfo &MRI, 3001480093f4SDimitry Andric MachineIRBuilder &B) const { 3002480093f4SDimitry Andric Register Res = MI.getOperand(0).getReg(); 3003480093f4SDimitry Andric Register LHS = MI.getOperand(1).getReg(); 3004480093f4SDimitry Andric Register RHS = MI.getOperand(2).getReg(); 3005480093f4SDimitry Andric 3006480093f4SDimitry Andric uint16_t Flags = MI.getFlags(); 3007480093f4SDimitry Andric 3008480093f4SDimitry Andric LLT S64 = LLT::scalar(64); 3009480093f4SDimitry Andric LLT S1 = LLT::scalar(1); 3010480093f4SDimitry Andric 3011480093f4SDimitry Andric auto One = B.buildFConstant(S64, 1.0); 3012480093f4SDimitry Andric 3013480093f4SDimitry Andric auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) 3014480093f4SDimitry Andric .addUse(LHS) 3015480093f4SDimitry Andric .addUse(RHS) 3016*5ffd83dbSDimitry Andric .addImm(0) 3017480093f4SDimitry Andric .setMIFlags(Flags); 3018480093f4SDimitry Andric 
3019480093f4SDimitry Andric   auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
3020480093f4SDimitry Andric
3021480093f4SDimitry Andric   auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
3022480093f4SDimitry Andric                  .addUse(DivScale0.getReg(0))
3023480093f4SDimitry Andric                  .setMIFlags(Flags);
3024480093f4SDimitry Andric
3025480093f4SDimitry Andric   auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
3026480093f4SDimitry Andric   auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
3027480093f4SDimitry Andric   auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
3028480093f4SDimitry Andric
3029480093f4SDimitry Andric   auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
3030480093f4SDimitry Andric                        .addUse(LHS)
3031480093f4SDimitry Andric                        .addUse(RHS)
3032*5ffd83dbSDimitry Andric                        .addImm(1)
3033480093f4SDimitry Andric                        .setMIFlags(Flags);
3034480093f4SDimitry Andric
3035480093f4SDimitry Andric   auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
3036*5ffd83dbSDimitry Andric   auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
3037480093f4SDimitry Andric   auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
3038480093f4SDimitry Andric
3039480093f4SDimitry Andric   Register Scale;
3040480093f4SDimitry Andric   if (!ST.hasUsableDivScaleConditionOutput()) {
3041480093f4SDimitry Andric     // Workaround a hardware bug on SI where the condition output from div_scale
3042480093f4SDimitry Andric     // is not usable.
3043480093f4SDimitry Andric
3044480093f4SDimitry Andric     LLT S32 = LLT::scalar(32);
3045480093f4SDimitry Andric
3046480093f4SDimitry Andric     auto NumUnmerge = B.buildUnmerge(S32, LHS);
3047480093f4SDimitry Andric     auto DenUnmerge = B.buildUnmerge(S32, RHS);
3048480093f4SDimitry Andric     auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
3049480093f4SDimitry Andric     auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
3050480093f4SDimitry Andric
3051480093f4SDimitry Andric     auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
3052480093f4SDimitry Andric                               Scale1Unmerge.getReg(1));
3053480093f4SDimitry Andric     auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
3054480093f4SDimitry Andric                               Scale0Unmerge.getReg(1));
3055*5ffd83dbSDimitry Andric     Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
3056480093f4SDimitry Andric   } else {
3057480093f4SDimitry Andric     Scale = DivScale1.getReg(1);
3058480093f4SDimitry Andric   }
3059480093f4SDimitry Andric
3060480093f4SDimitry Andric   auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
3061480093f4SDimitry Andric                   .addUse(Fma4.getReg(0))
3062480093f4SDimitry Andric                   .addUse(Fma3.getReg(0))
3063480093f4SDimitry Andric                   .addUse(Mul.getReg(0))
3064480093f4SDimitry Andric                   .addUse(Scale)
3065480093f4SDimitry Andric                   .setMIFlags(Flags);
3066480093f4SDimitry Andric
3067480093f4SDimitry Andric   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, makeArrayRef(Res), false)
3068480093f4SDimitry Andric       .addUse(Fmas.getReg(0))
3069480093f4SDimitry Andric       .addUse(RHS)
3070480093f4SDimitry Andric       .addUse(LHS)
3071480093f4SDimitry Andric       .setMIFlags(Flags);
3072480093f4SDimitry Andric
3073480093f4SDimitry Andric   MI.eraseFromParent();
3074480093f4SDimitry Andric   return true;
3075480093f4SDimitry Andric }
3076480093f4SDimitry Andric
30778bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
30788bcb0991SDimitry Andric                                                  MachineRegisterInfo &MRI,
30798bcb0991SDimitry Andric                                                  MachineIRBuilder &B) const {
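  // Informal sketch of the trick below: rcp alone can overflow or flush when
  // |rhs| is very large, so the denominator is conditionally pre-scaled. The
  // magic constants are f32 bit patterns: 0x6f800000 is 2.0^96 and 0x2f800000
  // is 2.0^-32. When |rhs| > 2^96 this computes
  //   res = s * (lhs * rcp(rhs * s))   with s = 2^-32,
  // which cancels to lhs / rhs while keeping rhs * s in range; otherwise
  // s is simply 1.0.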
30808bcb0991SDimitry Andric   Register Res = MI.getOperand(0).getReg();
30818bcb0991SDimitry Andric   Register LHS = MI.getOperand(2).getReg();
30828bcb0991SDimitry Andric   Register RHS = MI.getOperand(3).getReg();
30838bcb0991SDimitry Andric   uint16_t Flags = MI.getFlags();
30848bcb0991SDimitry Andric
30858bcb0991SDimitry Andric   LLT S32 = LLT::scalar(32);
30868bcb0991SDimitry Andric   LLT S1 = LLT::scalar(1);
30878bcb0991SDimitry Andric
30888bcb0991SDimitry Andric   auto Abs = B.buildFAbs(S32, RHS, Flags);
30898bcb0991SDimitry Andric   const APFloat C0Val(1.0f);
30908bcb0991SDimitry Andric
30918bcb0991SDimitry Andric   auto C0 = B.buildConstant(S32, 0x6f800000);
30928bcb0991SDimitry Andric   auto C1 = B.buildConstant(S32, 0x2f800000);
30938bcb0991SDimitry Andric   auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
30948bcb0991SDimitry Andric
30958bcb0991SDimitry Andric   auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
30968bcb0991SDimitry Andric   auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
30978bcb0991SDimitry Andric
30988bcb0991SDimitry Andric   auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
30998bcb0991SDimitry Andric
31008bcb0991SDimitry Andric   auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
31018bcb0991SDimitry Andric                  .addUse(Mul0.getReg(0))
31028bcb0991SDimitry Andric                  .setMIFlags(Flags);
31038bcb0991SDimitry Andric
31048bcb0991SDimitry Andric   auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
31058bcb0991SDimitry Andric
31068bcb0991SDimitry Andric   B.buildFMul(Res, Sel, Mul1, Flags);
31078bcb0991SDimitry Andric
31088bcb0991SDimitry Andric   MI.eraseFromParent();
31098bcb0991SDimitry Andric   return true;
31108bcb0991SDimitry Andric }
31118bcb0991SDimitry Andric
31120b57cec5SDimitry Andric bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
31130b57cec5SDimitry Andric                                                  MachineRegisterInfo &MRI,
31140b57cec5SDimitry Andric                                                  MachineIRBuilder &B) const {
31150b57cec5SDimitry Andric   const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
31160b57cec5SDimitry Andric   if (!MFI->isEntryFunction()) {
31170b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
31180b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
31190b57cec5SDimitry Andric   }
31200b57cec5SDimitry Andric
31210b57cec5SDimitry Andric   uint64_t Offset =
31220b57cec5SDimitry Andric       ST.getTargetLowering()->getImplicitParameterOffset(
31230b57cec5SDimitry Andric           B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
31240b57cec5SDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
31250b57cec5SDimitry Andric   LLT DstTy = MRI.getType(DstReg);
31260b57cec5SDimitry Andric   LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
31270b57cec5SDimitry Andric
31280b57cec5SDimitry Andric   const ArgDescriptor *Arg;
31290b57cec5SDimitry Andric   const TargetRegisterClass *RC;
3130*5ffd83dbSDimitry Andric   LLT ArgTy;
3131*5ffd83dbSDimitry Andric   std::tie(Arg, RC, ArgTy) =
3132*5ffd83dbSDimitry Andric       MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
31330b57cec5SDimitry Andric   if (!Arg)
31340b57cec5SDimitry Andric     return false;
31350b57cec5SDimitry Andric
31360b57cec5SDimitry Andric   Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
31370b57cec5SDimitry Andric   if (!loadInputValue(KernargPtrReg, B, Arg))
31380b57cec5SDimitry Andric     return false;
31390b57cec5SDimitry Andric
3140480093f4SDimitry Andric   B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
31410b57cec5SDimitry Andric   MI.eraseFromParent();
31420b57cec5SDimitry Andric   return true;
31430b57cec5SDimitry Andric }
31440b57cec5SDimitry Andric
31458bcb0991SDimitry Andric bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
31468bcb0991SDimitry Andric                                               MachineRegisterInfo &MRI,
31478bcb0991SDimitry Andric                                               MachineIRBuilder &B,
31488bcb0991SDimitry Andric                                               unsigned AddrSpace) const {
31498bcb0991SDimitry Andric   Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
31508bcb0991SDimitry Andric   auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
31518bcb0991SDimitry Andric   B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
31528bcb0991SDimitry Andric   MI.eraseFromParent();
31538bcb0991SDimitry Andric   return true;
31548bcb0991SDimitry Andric }
31558bcb0991SDimitry Andric
3156*5ffd83dbSDimitry Andric // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
3157*5ffd83dbSDimitry Andric // offset (the offset that is included in bounds checking and swizzling, to be
3158*5ffd83dbSDimitry Andric // split between the instruction's voffset and immoffset fields) and soffset
3159*5ffd83dbSDimitry Andric // (the offset that is excluded from bounds checking and swizzling, to go in
3160*5ffd83dbSDimitry Andric // the instruction's soffset field). This function takes the first kind of
3161*5ffd83dbSDimitry Andric // offset and figures out how to split it between voffset and immoffset.
3162*5ffd83dbSDimitry Andric std::tuple<Register, unsigned, unsigned>
3163*5ffd83dbSDimitry Andric AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
3164*5ffd83dbSDimitry Andric                                         Register OrigOffset) const {
3165*5ffd83dbSDimitry Andric   const unsigned MaxImm = 4095;
3166*5ffd83dbSDimitry Andric   Register BaseReg;
3167*5ffd83dbSDimitry Andric   unsigned TotalConstOffset;
3168*5ffd83dbSDimitry Andric   MachineInstr *OffsetDef;
3169*5ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
3170*5ffd83dbSDimitry Andric
3171*5ffd83dbSDimitry Andric   std::tie(BaseReg, TotalConstOffset, OffsetDef)
3172*5ffd83dbSDimitry Andric       = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
3173*5ffd83dbSDimitry Andric
3174*5ffd83dbSDimitry Andric   unsigned ImmOffset = TotalConstOffset;
3175*5ffd83dbSDimitry Andric
3176*5ffd83dbSDimitry Andric   // If the immediate value is too big for the immoffset field, put the value
3177*5ffd83dbSDimitry Andric   // and -4096 into the immoffset field so that the value that is copied/added
3178*5ffd83dbSDimitry Andric   // for the voffset field is a multiple of 4096, and it stands more chance
3179*5ffd83dbSDimitry Andric   // of being CSEd with the copy/add for another similar load/store.
3180*5ffd83dbSDimitry Andric   // However, do not do that rounding down to a multiple of 4096 if that is a
3181*5ffd83dbSDimitry Andric   // negative number, as it appears to be illegal to have a negative offset
3182*5ffd83dbSDimitry Andric   // in the vgpr, even if adding the immediate offset makes it positive.
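  // Worked example (illustrative): with MaxImm = 4095, a constant offset of
  // 8200 splits into Overflow = 8192 (folded into the voffset register) and
  // ImmOffset = 8. A negative constant such as -16 first yields
  // Overflow = 0xfffff000, which the fixup below detects; the entire -16 then
  // goes to the voffset add and ImmOffset becomes 0.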
3183*5ffd83dbSDimitry Andric   unsigned Overflow = ImmOffset & ~MaxImm;
3184*5ffd83dbSDimitry Andric   ImmOffset -= Overflow;
3185*5ffd83dbSDimitry Andric   if ((int32_t)Overflow < 0) {
3186*5ffd83dbSDimitry Andric     Overflow += ImmOffset;
3187*5ffd83dbSDimitry Andric     ImmOffset = 0;
3188*5ffd83dbSDimitry Andric   }
3189*5ffd83dbSDimitry Andric
3190*5ffd83dbSDimitry Andric   if (Overflow != 0) {
3191*5ffd83dbSDimitry Andric     if (!BaseReg) {
3192*5ffd83dbSDimitry Andric       BaseReg = B.buildConstant(S32, Overflow).getReg(0);
3193*5ffd83dbSDimitry Andric     } else {
3194*5ffd83dbSDimitry Andric       auto OverflowVal = B.buildConstant(S32, Overflow);
3195*5ffd83dbSDimitry Andric       BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
3196*5ffd83dbSDimitry Andric     }
3197*5ffd83dbSDimitry Andric   }
3198*5ffd83dbSDimitry Andric
3199*5ffd83dbSDimitry Andric   if (!BaseReg)
3200*5ffd83dbSDimitry Andric     BaseReg = B.buildConstant(S32, 0).getReg(0);
3201*5ffd83dbSDimitry Andric
3202*5ffd83dbSDimitry Andric   return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
3203*5ffd83dbSDimitry Andric }
3204*5ffd83dbSDimitry Andric
32058bcb0991SDimitry Andric /// Handle register layout difference for f16 images for some subtargets.
32068bcb0991SDimitry Andric Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
32078bcb0991SDimitry Andric                                              MachineRegisterInfo &MRI,
32088bcb0991SDimitry Andric                                              Register Reg) const {
32098bcb0991SDimitry Andric   if (!ST.hasUnpackedD16VMem())
32108bcb0991SDimitry Andric     return Reg;
32118bcb0991SDimitry Andric
32128bcb0991SDimitry Andric   const LLT S16 = LLT::scalar(16);
32138bcb0991SDimitry Andric   const LLT S32 = LLT::scalar(32);
32148bcb0991SDimitry Andric   LLT StoreVT = MRI.getType(Reg);
32158bcb0991SDimitry Andric   assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
32168bcb0991SDimitry Andric
32178bcb0991SDimitry Andric   auto Unmerge = B.buildUnmerge(S16, Reg);
32188bcb0991SDimitry Andric
32198bcb0991SDimitry Andric   SmallVector<Register, 4> WideRegs;
32208bcb0991SDimitry Andric   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
32218bcb0991SDimitry Andric     WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
32228bcb0991SDimitry Andric
32238bcb0991SDimitry Andric   int NumElts = StoreVT.getNumElements();
32248bcb0991SDimitry Andric
32258bcb0991SDimitry Andric   return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
32268bcb0991SDimitry Andric }
32278bcb0991SDimitry Andric
3228*5ffd83dbSDimitry Andric Register AMDGPULegalizerInfo::fixStoreSourceType(
3229*5ffd83dbSDimitry Andric     MachineIRBuilder &B, Register VData, bool IsFormat) const {
3230*5ffd83dbSDimitry Andric   MachineRegisterInfo *MRI = B.getMRI();
3231*5ffd83dbSDimitry Andric   LLT Ty = MRI->getType(VData);
32328bcb0991SDimitry Andric
32338bcb0991SDimitry Andric   const LLT S16 = LLT::scalar(16);
32348bcb0991SDimitry Andric
32358bcb0991SDimitry Andric   // Fixup illegal register types for 8-bit and 16-bit stores.
32368bcb0991SDimitry Andric   if (Ty == LLT::scalar(8) || Ty == S16) {
32378bcb0991SDimitry Andric     Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
3238*5ffd83dbSDimitry Andric     return AnyExt;
32398bcb0991SDimitry Andric   }
32408bcb0991SDimitry Andric
32418bcb0991SDimitry Andric   if (Ty.isVector()) {
32428bcb0991SDimitry Andric     if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
32438bcb0991SDimitry Andric       if (IsFormat)
3244*5ffd83dbSDimitry Andric         return handleD16VData(B, *MRI, VData);
3245*5ffd83dbSDimitry Andric     }
3246*5ffd83dbSDimitry Andric   }
3247*5ffd83dbSDimitry Andric
3248*5ffd83dbSDimitry Andric   return VData;
3249*5ffd83dbSDimitry Andric }
3250*5ffd83dbSDimitry Andric
3251*5ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
3252*5ffd83dbSDimitry Andric                                               MachineRegisterInfo &MRI,
3253*5ffd83dbSDimitry Andric                                               MachineIRBuilder &B,
3254*5ffd83dbSDimitry Andric                                               bool IsTyped,
3255*5ffd83dbSDimitry Andric                                               bool IsFormat) const {
3256*5ffd83dbSDimitry Andric   Register VData = MI.getOperand(1).getReg();
3257*5ffd83dbSDimitry Andric   LLT Ty = MRI.getType(VData);
3258*5ffd83dbSDimitry Andric   LLT EltTy = Ty.getScalarType();
3259*5ffd83dbSDimitry Andric   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3260*5ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
3261*5ffd83dbSDimitry Andric
3262*5ffd83dbSDimitry Andric   VData = fixStoreSourceType(B, VData, IsFormat);
3263*5ffd83dbSDimitry Andric   Register RSrc = MI.getOperand(2).getReg();
3264*5ffd83dbSDimitry Andric
3265*5ffd83dbSDimitry Andric   MachineMemOperand *MMO = *MI.memoperands_begin();
3266*5ffd83dbSDimitry Andric   const int MemSize = MMO->getSize();
3267*5ffd83dbSDimitry Andric
3268*5ffd83dbSDimitry Andric   unsigned ImmOffset;
3269*5ffd83dbSDimitry Andric   unsigned TotalOffset;
3270*5ffd83dbSDimitry Andric
3271*5ffd83dbSDimitry Andric   // The typed intrinsics add an immediate after the registers.
3272*5ffd83dbSDimitry Andric   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3273*5ffd83dbSDimitry Andric
3274*5ffd83dbSDimitry Andric   // The struct intrinsic variants add one additional operand over raw.
3275*5ffd83dbSDimitry Andric   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3276*5ffd83dbSDimitry Andric   Register VIndex;
3277*5ffd83dbSDimitry Andric   int OpOffset = 0;
3278*5ffd83dbSDimitry Andric   if (HasVIndex) {
3279*5ffd83dbSDimitry Andric     VIndex = MI.getOperand(3).getReg();
3280*5ffd83dbSDimitry Andric     OpOffset = 1;
3281*5ffd83dbSDimitry Andric   }
3282*5ffd83dbSDimitry Andric
3283*5ffd83dbSDimitry Andric   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3284*5ffd83dbSDimitry Andric   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3285*5ffd83dbSDimitry Andric
3286*5ffd83dbSDimitry Andric   unsigned Format = 0;
3287*5ffd83dbSDimitry Andric   if (IsTyped) {
3288*5ffd83dbSDimitry Andric     Format = MI.getOperand(5 + OpOffset).getImm();
3289*5ffd83dbSDimitry Andric     ++OpOffset;
3290*5ffd83dbSDimitry Andric   }
3291*5ffd83dbSDimitry Andric
3292*5ffd83dbSDimitry Andric   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3293*5ffd83dbSDimitry Andric
3294*5ffd83dbSDimitry Andric   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3295*5ffd83dbSDimitry Andric   if (TotalOffset != 0)
3296*5ffd83dbSDimitry Andric     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3297*5ffd83dbSDimitry Andric
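  // Note (informal): the raw.* intrinsic forms carry one operand fewer than
  // the struct.* forms, which is how HasVIndex was derived above. The generic
  // opcode chosen below then encodes the remaining flavour information:
  // tbuffer vs. format vs. plain, D16-ness, and, for plain stores, the
  // access size.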
3298*5ffd83dbSDimitry Andric   unsigned Opc;
3299*5ffd83dbSDimitry Andric   if (IsTyped) {
3300*5ffd83dbSDimitry Andric     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
3301*5ffd83dbSDimitry Andric                   AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
3302*5ffd83dbSDimitry Andric   } else if (IsFormat) {
3303*5ffd83dbSDimitry Andric     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
3304*5ffd83dbSDimitry Andric                   AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
3305*5ffd83dbSDimitry Andric   } else {
3306*5ffd83dbSDimitry Andric     switch (MemSize) {
3307*5ffd83dbSDimitry Andric     case 1:
3308*5ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
3309*5ffd83dbSDimitry Andric       break;
3310*5ffd83dbSDimitry Andric     case 2:
3311*5ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
3312*5ffd83dbSDimitry Andric       break;
3313*5ffd83dbSDimitry Andric     default:
3314*5ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
3315*5ffd83dbSDimitry Andric       break;
3316*5ffd83dbSDimitry Andric     }
3317*5ffd83dbSDimitry Andric   }
3318*5ffd83dbSDimitry Andric
3319*5ffd83dbSDimitry Andric   if (!VIndex)
3320*5ffd83dbSDimitry Andric     VIndex = B.buildConstant(S32, 0).getReg(0);
3321*5ffd83dbSDimitry Andric
3322*5ffd83dbSDimitry Andric   auto MIB = B.buildInstr(Opc)
3323*5ffd83dbSDimitry Andric                  .addUse(VData)              // vdata
3324*5ffd83dbSDimitry Andric                  .addUse(RSrc)               // rsrc
3325*5ffd83dbSDimitry Andric                  .addUse(VIndex)             // vindex
3326*5ffd83dbSDimitry Andric                  .addUse(VOffset)            // voffset
3327*5ffd83dbSDimitry Andric                  .addUse(SOffset)            // soffset
3328*5ffd83dbSDimitry Andric                  .addImm(ImmOffset);         // offset(imm)
3329*5ffd83dbSDimitry Andric
3330*5ffd83dbSDimitry Andric   if (IsTyped)
3331*5ffd83dbSDimitry Andric     MIB.addImm(Format);
3332*5ffd83dbSDimitry Andric
3333*5ffd83dbSDimitry Andric   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3334*5ffd83dbSDimitry Andric      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3335*5ffd83dbSDimitry Andric      .addMemOperand(MMO);
3336*5ffd83dbSDimitry Andric
3337*5ffd83dbSDimitry Andric   MI.eraseFromParent();
33388bcb0991SDimitry Andric   return true;
33398bcb0991SDimitry Andric }
33408bcb0991SDimitry Andric
3341*5ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
3342*5ffd83dbSDimitry Andric                                              MachineRegisterInfo &MRI,
3343*5ffd83dbSDimitry Andric                                              MachineIRBuilder &B,
3344*5ffd83dbSDimitry Andric                                              bool IsFormat,
3345*5ffd83dbSDimitry Andric                                              bool IsTyped) const {
3346*5ffd83dbSDimitry Andric   // FIXME: Verifier should enforce 1 MMO for these intrinsics.
3347*5ffd83dbSDimitry Andric   MachineMemOperand *MMO = *MI.memoperands_begin();
3348*5ffd83dbSDimitry Andric   const int MemSize = MMO->getSize();
3349*5ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
3350*5ffd83dbSDimitry Andric
3351*5ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
3352*5ffd83dbSDimitry Andric   Register RSrc = MI.getOperand(2).getReg();
3353*5ffd83dbSDimitry Andric
3354*5ffd83dbSDimitry Andric   // The typed intrinsics add an immediate after the registers.
3355*5ffd83dbSDimitry Andric   const unsigned NumVIndexOps = IsTyped ? 8 : 7;
3356*5ffd83dbSDimitry Andric
3357*5ffd83dbSDimitry Andric   // The struct intrinsic variants add one additional operand over raw.
3358*5ffd83dbSDimitry Andric   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3359*5ffd83dbSDimitry Andric   Register VIndex;
3360*5ffd83dbSDimitry Andric   int OpOffset = 0;
3361*5ffd83dbSDimitry Andric   if (HasVIndex) {
3362*5ffd83dbSDimitry Andric     VIndex = MI.getOperand(3).getReg();
3363*5ffd83dbSDimitry Andric     OpOffset = 1;
33648bcb0991SDimitry Andric   }
33658bcb0991SDimitry Andric
3366*5ffd83dbSDimitry Andric   Register VOffset = MI.getOperand(3 + OpOffset).getReg();
3367*5ffd83dbSDimitry Andric   Register SOffset = MI.getOperand(4 + OpOffset).getReg();
3368*5ffd83dbSDimitry Andric
3369*5ffd83dbSDimitry Andric   unsigned Format = 0;
3370*5ffd83dbSDimitry Andric   if (IsTyped) {
3371*5ffd83dbSDimitry Andric     Format = MI.getOperand(5 + OpOffset).getImm();
3372*5ffd83dbSDimitry Andric     ++OpOffset;
33738bcb0991SDimitry Andric   }
33748bcb0991SDimitry Andric
3375*5ffd83dbSDimitry Andric   unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
3376*5ffd83dbSDimitry Andric   unsigned ImmOffset;
3377*5ffd83dbSDimitry Andric   unsigned TotalOffset;
3378*5ffd83dbSDimitry Andric
3379*5ffd83dbSDimitry Andric   LLT Ty = MRI.getType(Dst);
3380*5ffd83dbSDimitry Andric   LLT EltTy = Ty.getScalarType();
3381*5ffd83dbSDimitry Andric   const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
3382*5ffd83dbSDimitry Andric   const bool Unpacked = ST.hasUnpackedD16VMem();
3383*5ffd83dbSDimitry Andric
3384*5ffd83dbSDimitry Andric   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3385*5ffd83dbSDimitry Andric   if (TotalOffset != 0)
3386*5ffd83dbSDimitry Andric     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
3387*5ffd83dbSDimitry Andric
3388*5ffd83dbSDimitry Andric   unsigned Opc;
3389*5ffd83dbSDimitry Andric
3390*5ffd83dbSDimitry Andric   if (IsTyped) {
3391*5ffd83dbSDimitry Andric     Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
3392*5ffd83dbSDimitry Andric                   AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
3393*5ffd83dbSDimitry Andric   } else if (IsFormat) {
3394*5ffd83dbSDimitry Andric     Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
3395*5ffd83dbSDimitry Andric                   AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
3396*5ffd83dbSDimitry Andric   } else {
3397*5ffd83dbSDimitry Andric     switch (MemSize) {
3398*5ffd83dbSDimitry Andric     case 1:
3399*5ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
3400*5ffd83dbSDimitry Andric       break;
3401*5ffd83dbSDimitry Andric     case 2:
3402*5ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
3403*5ffd83dbSDimitry Andric       break;
3404*5ffd83dbSDimitry Andric     default:
3405*5ffd83dbSDimitry Andric       Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
3406*5ffd83dbSDimitry Andric       break;
3407*5ffd83dbSDimitry Andric     }
3408*5ffd83dbSDimitry Andric   }
3409*5ffd83dbSDimitry Andric
3410*5ffd83dbSDimitry Andric   Register LoadDstReg;
3411*5ffd83dbSDimitry Andric
3412*5ffd83dbSDimitry Andric   bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
3413*5ffd83dbSDimitry Andric   LLT UnpackedTy = Ty.changeElementSize(32);
3414*5ffd83dbSDimitry Andric
3415*5ffd83dbSDimitry Andric   if (IsExtLoad)
3416*5ffd83dbSDimitry Andric     LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
3417*5ffd83dbSDimitry Andric   else if (Unpacked && IsD16 && Ty.isVector())
3418*5ffd83dbSDimitry Andric     LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
3419*5ffd83dbSDimitry Andric   else
3420*5ffd83dbSDimitry Andric     LoadDstReg = Dst;
3421*5ffd83dbSDimitry Andric
3422*5ffd83dbSDimitry Andric   if (!VIndex)
3423*5ffd83dbSDimitry Andric     VIndex = B.buildConstant(S32, 0).getReg(0);
3424*5ffd83dbSDimitry Andric
3425*5ffd83dbSDimitry Andric   auto MIB = B.buildInstr(Opc)
3426*5ffd83dbSDimitry Andric                  .addDef(LoadDstReg)         // vdata
3427*5ffd83dbSDimitry Andric                  .addUse(RSrc)               // rsrc
3428*5ffd83dbSDimitry Andric                  .addUse(VIndex)             // vindex
3429*5ffd83dbSDimitry Andric                  .addUse(VOffset)            // voffset
3430*5ffd83dbSDimitry Andric                  .addUse(SOffset)            // soffset
3431*5ffd83dbSDimitry Andric                  .addImm(ImmOffset);         // offset(imm)
3432*5ffd83dbSDimitry Andric
3433*5ffd83dbSDimitry Andric   if (IsTyped)
3434*5ffd83dbSDimitry Andric     MIB.addImm(Format);
3435*5ffd83dbSDimitry Andric
3436*5ffd83dbSDimitry Andric   MIB.addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3437*5ffd83dbSDimitry Andric      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3438*5ffd83dbSDimitry Andric      .addMemOperand(MMO);
3439*5ffd83dbSDimitry Andric
3440*5ffd83dbSDimitry Andric   if (LoadDstReg != Dst) {
3441*5ffd83dbSDimitry Andric     B.setInsertPt(B.getMBB(), ++B.getInsertPt());
3442*5ffd83dbSDimitry Andric
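    // Note (informal): LoadDstReg differs from Dst in exactly two cases. A
    // sub-dword scalar load (s8/s16) was performed into a 32-bit scratch
    // register, and an unpacked-d16 vector load (e.g. a <4 x s16> result on
    // such subtargets) was performed into a <4 x s32> register; both are
    // narrowed back to the original type here.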
3443*5ffd83dbSDimitry Andric     // The load was widened; narrow the result back to the expected type.
3444*5ffd83dbSDimitry Andric     if (IsExtLoad)
3445*5ffd83dbSDimitry Andric       B.buildTrunc(Dst, LoadDstReg);
3446*5ffd83dbSDimitry Andric     else {
3447*5ffd83dbSDimitry Andric       // Repack to original 16-bit vector result
3448*5ffd83dbSDimitry Andric       // FIXME: G_TRUNC should work, but legalization currently fails
3449*5ffd83dbSDimitry Andric       auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
3450*5ffd83dbSDimitry Andric       SmallVector<Register, 4> Repack;
3451*5ffd83dbSDimitry Andric       for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
3452*5ffd83dbSDimitry Andric         Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
3453*5ffd83dbSDimitry Andric       B.buildMerge(Dst, Repack);
3454*5ffd83dbSDimitry Andric     }
3455*5ffd83dbSDimitry Andric   }
3456*5ffd83dbSDimitry Andric
3457*5ffd83dbSDimitry Andric   MI.eraseFromParent();
3458*5ffd83dbSDimitry Andric   return true;
3459*5ffd83dbSDimitry Andric }
3460*5ffd83dbSDimitry Andric
3461*5ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
3462*5ffd83dbSDimitry Andric                                                MachineIRBuilder &B,
3463*5ffd83dbSDimitry Andric                                                bool IsInc) const {
3464*5ffd83dbSDimitry Andric   unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
3465*5ffd83dbSDimitry Andric                          AMDGPU::G_AMDGPU_ATOMIC_DEC;
3466*5ffd83dbSDimitry Andric   B.buildInstr(Opc)
3467*5ffd83dbSDimitry Andric       .addDef(MI.getOperand(0).getReg())
3468*5ffd83dbSDimitry Andric       .addUse(MI.getOperand(2).getReg())
3469*5ffd83dbSDimitry Andric       .addUse(MI.getOperand(3).getReg())
3470*5ffd83dbSDimitry Andric       .cloneMemRefs(MI);
3471*5ffd83dbSDimitry Andric   MI.eraseFromParent();
3472*5ffd83dbSDimitry Andric   return true;
3473*5ffd83dbSDimitry Andric }
3474*5ffd83dbSDimitry Andric
3475*5ffd83dbSDimitry Andric static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
3476*5ffd83dbSDimitry Andric   switch (IntrID) {
3477*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
3478*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
3479*5ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
3480*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_add:
3481*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_add:
3482*5ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
3483*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
3484*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
3485*5ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
3486*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
3487*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
3488*5ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
3489*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
3490*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
3491*5ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
3492*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
3493*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
3494*5ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
3495*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
3496*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
3497*5ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
3498*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_and:
3499*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_and:
3500*5ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
3501*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_or:
3502*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_or:
3503*5ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
3504*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
3505*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
3506*5ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
3507*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
3508*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
3509*5ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
3510*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
3511*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
3512*5ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
3513*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
3514*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
3515*5ffd83dbSDimitry Andric     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
3516*5ffd83dbSDimitry Andric   default:
3517*5ffd83dbSDimitry Andric     llvm_unreachable("unhandled atomic opcode");
3518*5ffd83dbSDimitry Andric   }
3519*5ffd83dbSDimitry Andric }
3520*5ffd83dbSDimitry Andric
3521*5ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
3522*5ffd83dbSDimitry Andric                                                MachineIRBuilder &B,
3523*5ffd83dbSDimitry Andric                                                Intrinsic::ID IID) const {
3524*5ffd83dbSDimitry Andric   const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
3525*5ffd83dbSDimitry Andric                          IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
3526*5ffd83dbSDimitry Andric
3527*5ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
3528*5ffd83dbSDimitry Andric   Register VData = MI.getOperand(2).getReg();
3529*5ffd83dbSDimitry Andric
3530*5ffd83dbSDimitry Andric   Register CmpVal;
3531*5ffd83dbSDimitry Andric   int OpOffset = 0;
3532*5ffd83dbSDimitry Andric
3533*5ffd83dbSDimitry Andric   if (IsCmpSwap) {
3534*5ffd83dbSDimitry Andric     CmpVal = MI.getOperand(3 + OpOffset).getReg();
3535*5ffd83dbSDimitry Andric     ++OpOffset;
3536*5ffd83dbSDimitry Andric   }
3537*5ffd83dbSDimitry Andric
3538*5ffd83dbSDimitry Andric   Register RSrc = MI.getOperand(3 + OpOffset).getReg();
3539*5ffd83dbSDimitry Andric   const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
3540*5ffd83dbSDimitry Andric
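  // Operand layout reminder (informal), after the result def and the
  // intrinsic ID: vdata, [cmp (cmpswap only),] rsrc, [vindex (struct only),]
  // voffset, soffset, aux. Hence NumVIndexOps above is 9 for the struct
  // cmpswap flavour and 8 for the other struct atomics.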
3541*5ffd83dbSDimitry Andric   // The struct intrinsic variants add one additional operand over raw.
3542*5ffd83dbSDimitry Andric   const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
3543*5ffd83dbSDimitry Andric   Register VIndex;
3544*5ffd83dbSDimitry Andric   if (HasVIndex) {
3545*5ffd83dbSDimitry Andric     VIndex = MI.getOperand(4 + OpOffset).getReg();
3546*5ffd83dbSDimitry Andric     ++OpOffset;
3547*5ffd83dbSDimitry Andric   }
3548*5ffd83dbSDimitry Andric
3549*5ffd83dbSDimitry Andric   Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3550*5ffd83dbSDimitry Andric   Register SOffset = MI.getOperand(5 + OpOffset).getReg();
3551*5ffd83dbSDimitry Andric   unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
3552*5ffd83dbSDimitry Andric
3553*5ffd83dbSDimitry Andric   MachineMemOperand *MMO = *MI.memoperands_begin();
3554*5ffd83dbSDimitry Andric
3555*5ffd83dbSDimitry Andric   unsigned ImmOffset;
3556*5ffd83dbSDimitry Andric   unsigned TotalOffset;
3557*5ffd83dbSDimitry Andric   std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
3558*5ffd83dbSDimitry Andric   if (TotalOffset != 0)
3559*5ffd83dbSDimitry Andric     MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
3560*5ffd83dbSDimitry Andric
3561*5ffd83dbSDimitry Andric   if (!VIndex)
3562*5ffd83dbSDimitry Andric     VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
3563*5ffd83dbSDimitry Andric
3564*5ffd83dbSDimitry Andric   auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
3565*5ffd83dbSDimitry Andric                  .addDef(Dst)
3566*5ffd83dbSDimitry Andric                  .addUse(VData); // vdata
3567*5ffd83dbSDimitry Andric
3568*5ffd83dbSDimitry Andric   if (IsCmpSwap)
3569*5ffd83dbSDimitry Andric     MIB.addReg(CmpVal);
3570*5ffd83dbSDimitry Andric
3571*5ffd83dbSDimitry Andric   MIB.addUse(RSrc)               // rsrc
3572*5ffd83dbSDimitry Andric      .addUse(VIndex)             // vindex
3573*5ffd83dbSDimitry Andric      .addUse(VOffset)            // voffset
3574*5ffd83dbSDimitry Andric      .addUse(SOffset)            // soffset
3575*5ffd83dbSDimitry Andric      .addImm(ImmOffset)          // offset(imm)
3576*5ffd83dbSDimitry Andric      .addImm(AuxiliaryData)      // cachepolicy, swizzled buffer(imm)
3577*5ffd83dbSDimitry Andric      .addImm(HasVIndex ? -1 : 0) // idxen(imm)
3578*5ffd83dbSDimitry Andric      .addMemOperand(MMO);
3579*5ffd83dbSDimitry Andric
3580*5ffd83dbSDimitry Andric   MI.eraseFromParent();
3581*5ffd83dbSDimitry Andric   return true;
3582*5ffd83dbSDimitry Andric }
3583*5ffd83dbSDimitry Andric
3584*5ffd83dbSDimitry Andric /// Pack the s16 typed address operands of \p MI into dword sized V2S16
3585*5ffd83dbSDimitry Andric /// vectors, appending the packed registers to \p PackedAddrs.
3586*5ffd83dbSDimitry Andric static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
3587*5ffd83dbSDimitry Andric                                         SmallVectorImpl<Register> &PackedAddrs,
3588*5ffd83dbSDimitry Andric                                         int AddrIdx, int DimIdx, int EndIdx,
3589*5ffd83dbSDimitry Andric                                         int NumGradients) {
3590*5ffd83dbSDimitry Andric   const LLT S16 = LLT::scalar(16);
3591*5ffd83dbSDimitry Andric   const LLT V2S16 = LLT::vector(2, 16);
3592*5ffd83dbSDimitry Andric
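  // Illustrative example (operand names assumed from the intrinsic
  // definitions): for image.sample.d.2d with 16-bit gradients and coords,
  // {dsdh, dtdh, dsdv, dtdv, s, t} packs into the three dwords
  // {dsdh,dtdh}, {dsdv,dtdv}, {s,t}. In the 1d case (NumGradients / 2 odd)
  // each lone per-direction gradient is instead paired with undef, matching
  // the checks in the loop below.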
3593*5ffd83dbSDimitry Andric   for (int I = AddrIdx; I < EndIdx; ++I) {
3594*5ffd83dbSDimitry Andric     MachineOperand &SrcOp = MI.getOperand(I);
3595*5ffd83dbSDimitry Andric     if (!SrcOp.isReg())
3596*5ffd83dbSDimitry Andric       continue; // _L to _LZ may have eliminated this.
3597*5ffd83dbSDimitry Andric
3598*5ffd83dbSDimitry Andric     Register AddrReg = SrcOp.getReg();
3599*5ffd83dbSDimitry Andric
3600*5ffd83dbSDimitry Andric     if (I < DimIdx) {
3601*5ffd83dbSDimitry Andric       AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
3602*5ffd83dbSDimitry Andric       PackedAddrs.push_back(AddrReg);
3603*5ffd83dbSDimitry Andric     } else {
3604*5ffd83dbSDimitry Andric       // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
3605*5ffd83dbSDimitry Andric       // derivatives dx/dh and dx/dv are packed with undef.
3606*5ffd83dbSDimitry Andric       if (((I + 1) >= EndIdx) ||
3607*5ffd83dbSDimitry Andric           ((NumGradients / 2) % 2 == 1 &&
3608*5ffd83dbSDimitry Andric            (I == DimIdx + (NumGradients / 2) - 1 ||
3609*5ffd83dbSDimitry Andric             I == DimIdx + NumGradients - 1)) ||
3610*5ffd83dbSDimitry Andric           // Check for _L to _LZ optimization
3611*5ffd83dbSDimitry Andric           !MI.getOperand(I + 1).isReg()) {
3612*5ffd83dbSDimitry Andric         PackedAddrs.push_back(
3613*5ffd83dbSDimitry Andric             B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
3614*5ffd83dbSDimitry Andric                 .getReg(0));
3615*5ffd83dbSDimitry Andric       } else {
3616*5ffd83dbSDimitry Andric         PackedAddrs.push_back(
3617*5ffd83dbSDimitry Andric             B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
3618*5ffd83dbSDimitry Andric                 .getReg(0));
3619*5ffd83dbSDimitry Andric         ++I;
3620*5ffd83dbSDimitry Andric       }
3621*5ffd83dbSDimitry Andric     }
3622*5ffd83dbSDimitry Andric   }
3623*5ffd83dbSDimitry Andric }
3624*5ffd83dbSDimitry Andric
3625*5ffd83dbSDimitry Andric /// Convert from separate vaddr components to a single vector address register,
3626*5ffd83dbSDimitry Andric /// and replace the remaining operands with $noreg.
3627*5ffd83dbSDimitry Andric static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
3628*5ffd83dbSDimitry Andric                                      int DimIdx, int NumVAddrs) {
3629*5ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
3630*5ffd83dbSDimitry Andric
3631*5ffd83dbSDimitry Andric   SmallVector<Register, 8> AddrRegs;
3632*5ffd83dbSDimitry Andric   for (int I = 0; I != NumVAddrs; ++I) {
3633*5ffd83dbSDimitry Andric     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3634*5ffd83dbSDimitry Andric     if (SrcOp.isReg()) {
3635*5ffd83dbSDimitry Andric       AddrRegs.push_back(SrcOp.getReg());
3636*5ffd83dbSDimitry Andric       assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
3637*5ffd83dbSDimitry Andric     }
3638*5ffd83dbSDimitry Andric   }
3639*5ffd83dbSDimitry Andric
3640*5ffd83dbSDimitry Andric   int NumAddrRegs = AddrRegs.size();
3641*5ffd83dbSDimitry Andric   if (NumAddrRegs != 1) {
3642*5ffd83dbSDimitry Andric     // Round up to 8 elements for v5-v7
3643*5ffd83dbSDimitry Andric     // FIXME: Missing intermediate sized register classes and instructions.
3644*5ffd83dbSDimitry Andric     if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
3645*5ffd83dbSDimitry Andric       const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
3646*5ffd83dbSDimitry Andric       auto Undef = B.buildUndef(S32);
3647*5ffd83dbSDimitry Andric       AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
3648*5ffd83dbSDimitry Andric       NumAddrRegs = RoundedNumRegs;
3649*5ffd83dbSDimitry Andric     }
3650*5ffd83dbSDimitry Andric
3651*5ffd83dbSDimitry Andric     auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
3652*5ffd83dbSDimitry Andric     MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
3653*5ffd83dbSDimitry Andric   }
3654*5ffd83dbSDimitry Andric
3655*5ffd83dbSDimitry Andric   for (int I = 1; I != NumVAddrs; ++I) {
3656*5ffd83dbSDimitry Andric     MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
3657*5ffd83dbSDimitry Andric     if (SrcOp.isReg())
3658*5ffd83dbSDimitry Andric       MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
3659*5ffd83dbSDimitry Andric   }
3660*5ffd83dbSDimitry Andric }
3661*5ffd83dbSDimitry Andric
3662*5ffd83dbSDimitry Andric /// Rewrite image intrinsics to use register layouts expected by the subtarget.
3663*5ffd83dbSDimitry Andric ///
3664*5ffd83dbSDimitry Andric /// Depending on the subtarget, load/store with 16-bit element data need to be
3665*5ffd83dbSDimitry Andric /// rewritten to use the low half of 32-bit registers, or directly use a packed
3666*5ffd83dbSDimitry Andric /// layout. 16-bit addresses should also sometimes be packed into 32-bit
3667*5ffd83dbSDimitry Andric /// registers.
3668*5ffd83dbSDimitry Andric ///
3669*5ffd83dbSDimitry Andric /// We don't want to directly select image instructions just yet, but also want
3670*5ffd83dbSDimitry Andric /// to expose all register repacking to the legalizer/combiners. We also don't
3671*5ffd83dbSDimitry Andric /// want a selected instruction entering RegBankSelect. In order to avoid
3672*5ffd83dbSDimitry Andric /// defining a multitude of intermediate image instructions, directly hack on
3673*5ffd83dbSDimitry Andric /// the intrinsic's arguments. In cases like a16 addresses, this requires
3674*5ffd83dbSDimitry Andric /// padding now unnecessary arguments with $noreg.
3675*5ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
3676*5ffd83dbSDimitry Andric     MachineInstr &MI, MachineIRBuilder &B,
3677*5ffd83dbSDimitry Andric     GISelChangeObserver &Observer,
3678*5ffd83dbSDimitry Andric     const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
3679*5ffd83dbSDimitry Andric
3680*5ffd83dbSDimitry Andric   const int NumDefs = MI.getNumExplicitDefs();
3681*5ffd83dbSDimitry Andric   bool IsTFE = NumDefs == 2;
3682*5ffd83dbSDimitry Andric   // We are only processing the operands of d16 image operations on subtargets
3683*5ffd83dbSDimitry Andric   // that use the unpacked register layout, or need to repack the TFE result.
3684*5ffd83dbSDimitry Andric
3685*5ffd83dbSDimitry Andric   // TODO: Do we need to guard against already legalized intrinsics?
3686*5ffd83dbSDimitry Andric   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
3687*5ffd83dbSDimitry Andric       AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
3688*5ffd83dbSDimitry Andric
3689*5ffd83dbSDimitry Andric   MachineRegisterInfo *MRI = B.getMRI();
3690*5ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
3691*5ffd83dbSDimitry Andric   const LLT S16 = LLT::scalar(16);
3692*5ffd83dbSDimitry Andric   const LLT V2S16 = LLT::vector(2, 16);
3693*5ffd83dbSDimitry Andric
3694*5ffd83dbSDimitry Andric   // Index of first address argument
3695*5ffd83dbSDimitry Andric   const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
3696*5ffd83dbSDimitry Andric
3697*5ffd83dbSDimitry Andric   int NumVAddrs, NumGradients;
3698*5ffd83dbSDimitry Andric   std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
3699*5ffd83dbSDimitry Andric   const int DMaskIdx = BaseOpcode->Atomic ? -1 :
3700*5ffd83dbSDimitry Andric                                             getDMaskIdx(BaseOpcode, NumDefs);
3701*5ffd83dbSDimitry Andric   unsigned DMask = 0;
3702*5ffd83dbSDimitry Andric
3703*5ffd83dbSDimitry Andric   // Check for 16-bit addresses and pack them if so.
3704*5ffd83dbSDimitry Andric   int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
3705*5ffd83dbSDimitry Andric   LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg());
3706*5ffd83dbSDimitry Andric   LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg());
3707*5ffd83dbSDimitry Andric   const bool IsG16 = GradTy == S16;
3708*5ffd83dbSDimitry Andric   const bool IsA16 = AddrTy == S16;
3709*5ffd83dbSDimitry Andric
3710*5ffd83dbSDimitry Andric   int DMaskLanes = 0;
3711*5ffd83dbSDimitry Andric   if (!BaseOpcode->Atomic) {
3712*5ffd83dbSDimitry Andric     DMask = MI.getOperand(DMaskIdx).getImm();
3713*5ffd83dbSDimitry Andric     if (BaseOpcode->Gather4) {
3714*5ffd83dbSDimitry Andric       DMaskLanes = 4;
3715*5ffd83dbSDimitry Andric     } else if (DMask != 0) {
3716*5ffd83dbSDimitry Andric       DMaskLanes = countPopulation(DMask);
3717*5ffd83dbSDimitry Andric     } else if (!IsTFE && !BaseOpcode->Store) {
3718*5ffd83dbSDimitry Andric       // If dmask is 0, this is a no-op load. This can be eliminated.
3719*5ffd83dbSDimitry Andric       B.buildUndef(MI.getOperand(0));
3720*5ffd83dbSDimitry Andric       MI.eraseFromParent();
3721*5ffd83dbSDimitry Andric       return true;
3722*5ffd83dbSDimitry Andric     }
3723*5ffd83dbSDimitry Andric   }
3724*5ffd83dbSDimitry Andric
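  // Example (informal): a non-gather load with dmask = 0b0101 returns two
  // components (x and z), so DMaskLanes = 2 drives the result repacking at
  // the end of this function; gather4 operations always return four lanes.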
3725*5ffd83dbSDimitry Andric   Observer.changingInstr(MI);
3726*5ffd83dbSDimitry Andric   auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
3727*5ffd83dbSDimitry Andric
3728*5ffd83dbSDimitry Andric   unsigned NewOpcode = NumDefs == 0 ?
3729*5ffd83dbSDimitry Andric       AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
3730*5ffd83dbSDimitry Andric
3731*5ffd83dbSDimitry Andric   // Track that we legalized this
3732*5ffd83dbSDimitry Andric   MI.setDesc(B.getTII().get(NewOpcode));
3733*5ffd83dbSDimitry Andric
3734*5ffd83dbSDimitry Andric   // Expecting to get an error flag since TFC is on - and dmask is 0.
3735*5ffd83dbSDimitry Andric   // Force dmask to be at least 1, otherwise the instruction will fail.
3736*5ffd83dbSDimitry Andric   if (IsTFE && DMask == 0) {
3737*5ffd83dbSDimitry Andric     DMask = 0x1;
3738*5ffd83dbSDimitry Andric     DMaskLanes = 1;
3739*5ffd83dbSDimitry Andric     MI.getOperand(DMaskIdx).setImm(DMask);
3740*5ffd83dbSDimitry Andric   }
3741*5ffd83dbSDimitry Andric
3742*5ffd83dbSDimitry Andric   if (BaseOpcode->Atomic) {
3743*5ffd83dbSDimitry Andric     Register VData0 = MI.getOperand(2).getReg();
3744*5ffd83dbSDimitry Andric     LLT Ty = MRI->getType(VData0);
3745*5ffd83dbSDimitry Andric
3746*5ffd83dbSDimitry Andric     // TODO: Allow atomic swap and bit ops for v2s16/v4s16
3747*5ffd83dbSDimitry Andric     if (Ty.isVector())
3748*5ffd83dbSDimitry Andric       return false;
3749*5ffd83dbSDimitry Andric
3750*5ffd83dbSDimitry Andric     if (BaseOpcode->AtomicX2) {
3751*5ffd83dbSDimitry Andric       Register VData1 = MI.getOperand(3).getReg();
3752*5ffd83dbSDimitry Andric       // The two values are packed in one register.
3753*5ffd83dbSDimitry Andric       LLT PackedTy = LLT::vector(2, Ty);
3754*5ffd83dbSDimitry Andric       auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
3755*5ffd83dbSDimitry Andric       MI.getOperand(2).setReg(Concat.getReg(0));
3756*5ffd83dbSDimitry Andric       MI.getOperand(3).setReg(AMDGPU::NoRegister);
3757*5ffd83dbSDimitry Andric     }
3758*5ffd83dbSDimitry Andric   }
3759*5ffd83dbSDimitry Andric
3760*5ffd83dbSDimitry Andric   int CorrectedNumVAddrs = NumVAddrs;
3761*5ffd83dbSDimitry Andric
3762*5ffd83dbSDimitry Andric   // Optimize _L to _LZ when _L is zero
3763*5ffd83dbSDimitry Andric   if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
3764*5ffd83dbSDimitry Andric           AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
3765*5ffd83dbSDimitry Andric     const ConstantFP *ConstantLod;
3766*5ffd83dbSDimitry Andric     const int LodIdx = AddrIdx + NumVAddrs - 1;
3767*5ffd83dbSDimitry Andric
3768*5ffd83dbSDimitry Andric     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
3769*5ffd83dbSDimitry Andric       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
3770*5ffd83dbSDimitry Andric         // Set new opcode to _lz variant of _l, and change the intrinsic ID.
3771*5ffd83dbSDimitry Andric         ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
3772*5ffd83dbSDimitry Andric             LZMappingInfo->LZ, ImageDimIntr->Dim);
3773*5ffd83dbSDimitry Andric
3774*5ffd83dbSDimitry Andric         // The starting indexes should remain in the same place.
3775*5ffd83dbSDimitry Andric         --NumVAddrs;
3776*5ffd83dbSDimitry Andric         --CorrectedNumVAddrs;
3777*5ffd83dbSDimitry Andric
3778*5ffd83dbSDimitry Andric         MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
3779*5ffd83dbSDimitry Andric             static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
3780*5ffd83dbSDimitry Andric         MI.RemoveOperand(LodIdx);
3781*5ffd83dbSDimitry Andric       }
3782*5ffd83dbSDimitry Andric     }
3783*5ffd83dbSDimitry Andric   }
3784*5ffd83dbSDimitry Andric
3785*5ffd83dbSDimitry Andric   // Optimize _mip away, when 'lod' is zero
3786*5ffd83dbSDimitry Andric   if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
3787*5ffd83dbSDimitry Andric     int64_t ConstantLod;
3788*5ffd83dbSDimitry Andric     const int LodIdx = AddrIdx + NumVAddrs - 1;
3789*5ffd83dbSDimitry Andric
3790*5ffd83dbSDimitry Andric     if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
3791*5ffd83dbSDimitry Andric       if (ConstantLod == 0) {
3792*5ffd83dbSDimitry Andric         // TODO: Change intrinsic opcode and remove operand instead of replacing
3793*5ffd83dbSDimitry Andric         // it with 0, as the _L to _LZ handling is done above.
3794*5ffd83dbSDimitry Andric         MI.getOperand(LodIdx).ChangeToImmediate(0);
3795*5ffd83dbSDimitry Andric         --CorrectedNumVAddrs;
3796*5ffd83dbSDimitry Andric       }
3797*5ffd83dbSDimitry Andric     }
3798*5ffd83dbSDimitry Andric   }
3799*5ffd83dbSDimitry Andric
3800*5ffd83dbSDimitry Andric   // Rewrite the addressing register layout before doing anything else.
3801*5ffd83dbSDimitry Andric   if (IsA16 || IsG16) {
3802*5ffd83dbSDimitry Andric     if (IsA16) {
3803*5ffd83dbSDimitry Andric       // Target must support the feature and gradients need to be 16 bit too
3804*5ffd83dbSDimitry Andric       if (!ST.hasA16() || !IsG16)
3805*5ffd83dbSDimitry Andric         return false;
3806*5ffd83dbSDimitry Andric     } else if (!ST.hasG16())
3807*5ffd83dbSDimitry Andric       return false;
3808*5ffd83dbSDimitry Andric
3809*5ffd83dbSDimitry Andric     if (NumVAddrs > 1) {
3810*5ffd83dbSDimitry Andric       SmallVector<Register, 4> PackedRegs;
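      // Note (informal): with A16, coordinates and gradients are all s16 and
      // the whole [AddrIdx, AddrIdx + NumVAddrs) range gets packed; with G16
      // only, just the gradient range [DimIdx, DimIdx + NumGradients) is
      // packed and the 32-bit coordinates are appended unpacked afterwards.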
3811*5ffd83dbSDimitry Andric       // Don't compress addresses for G16
3812*5ffd83dbSDimitry Andric       const int PackEndIdx =
3813*5ffd83dbSDimitry Andric           IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients);
3814*5ffd83dbSDimitry Andric       packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx,
3815*5ffd83dbSDimitry Andric                                   PackEndIdx, NumGradients);
3816*5ffd83dbSDimitry Andric
3817*5ffd83dbSDimitry Andric       if (!IsA16) {
3818*5ffd83dbSDimitry Andric         // Add uncompressed address
3819*5ffd83dbSDimitry Andric         for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) {
3820*5ffd83dbSDimitry Andric           int AddrReg = MI.getOperand(I).getReg();
3821*5ffd83dbSDimitry Andric           assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
3822*5ffd83dbSDimitry Andric           PackedRegs.push_back(AddrReg);
3823*5ffd83dbSDimitry Andric         }
3824*5ffd83dbSDimitry Andric       }
3825*5ffd83dbSDimitry Andric
3826*5ffd83dbSDimitry Andric       // See also below in the non-a16 branch
3827*5ffd83dbSDimitry Andric       const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
3828*5ffd83dbSDimitry Andric
3829*5ffd83dbSDimitry Andric       if (!UseNSA && PackedRegs.size() > 1) {
3830*5ffd83dbSDimitry Andric         LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
3831*5ffd83dbSDimitry Andric         auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
3832*5ffd83dbSDimitry Andric         PackedRegs[0] = Concat.getReg(0);
3833*5ffd83dbSDimitry Andric         PackedRegs.resize(1);
3834*5ffd83dbSDimitry Andric       }
3835*5ffd83dbSDimitry Andric
3836*5ffd83dbSDimitry Andric       const int NumPacked = PackedRegs.size();
3837*5ffd83dbSDimitry Andric       for (int I = 0; I != NumVAddrs; ++I) {
3838*5ffd83dbSDimitry Andric         MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
3839*5ffd83dbSDimitry Andric         if (!SrcOp.isReg()) {
3840*5ffd83dbSDimitry Andric           assert(SrcOp.isImm() && SrcOp.getImm() == 0);
3841*5ffd83dbSDimitry Andric           continue;
3842*5ffd83dbSDimitry Andric         }
3843*5ffd83dbSDimitry Andric
3844*5ffd83dbSDimitry Andric         assert(SrcOp.getReg() != AMDGPU::NoRegister);
3845*5ffd83dbSDimitry Andric
3846*5ffd83dbSDimitry Andric         if (I < NumPacked)
3847*5ffd83dbSDimitry Andric           SrcOp.setReg(PackedRegs[I]);
3848*5ffd83dbSDimitry Andric         else
3849*5ffd83dbSDimitry Andric           SrcOp.setReg(AMDGPU::NoRegister);
3850*5ffd83dbSDimitry Andric       }
3851*5ffd83dbSDimitry Andric     }
3852*5ffd83dbSDimitry Andric   } else {
3853*5ffd83dbSDimitry Andric     // If the register allocator cannot place the address registers contiguously
3854*5ffd83dbSDimitry Andric     // without introducing moves, then using the non-sequential address encoding
3855*5ffd83dbSDimitry Andric     // is always preferable, since it saves VALU instructions and is usually a
3856*5ffd83dbSDimitry Andric     // wash in terms of code size or even better.
3857*5ffd83dbSDimitry Andric     //
3858*5ffd83dbSDimitry Andric     // However, we currently have no way of hinting to the register allocator
3859*5ffd83dbSDimitry Andric     // that MIMG addresses should be placed contiguously when it is possible to
3860*5ffd83dbSDimitry Andric     // do so, so force non-NSA for the common 2-address case as a heuristic.
3861*5ffd83dbSDimitry Andric     //
3862*5ffd83dbSDimitry Andric     // SIShrinkInstructions will convert NSA encodings to non-NSA after register
3863*5ffd83dbSDimitry Andric     // allocation when possible.
3864*5ffd83dbSDimitry Andric     const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
3865*5ffd83dbSDimitry Andric
3866*5ffd83dbSDimitry Andric     if (!UseNSA && NumVAddrs > 1)
3867*5ffd83dbSDimitry Andric       convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
3868*5ffd83dbSDimitry Andric   }
3869*5ffd83dbSDimitry Andric
3870*5ffd83dbSDimitry Andric   int Flags = 0;
3871*5ffd83dbSDimitry Andric   if (IsA16)
3872*5ffd83dbSDimitry Andric     Flags |= 1;
3873*5ffd83dbSDimitry Andric   if (IsG16)
3874*5ffd83dbSDimitry Andric     Flags |= 2;
3875*5ffd83dbSDimitry Andric   MI.addOperand(MachineOperand::CreateImm(Flags));
3876*5ffd83dbSDimitry Andric
3877*5ffd83dbSDimitry Andric   if (BaseOpcode->Store) { // No TFE for stores?
3878*5ffd83dbSDimitry Andric     // TODO: Handle dmask trim
3879*5ffd83dbSDimitry Andric     Register VData = MI.getOperand(1).getReg();
3880*5ffd83dbSDimitry Andric     LLT Ty = MRI->getType(VData);
3881*5ffd83dbSDimitry Andric     if (!Ty.isVector() || Ty.getElementType() != S16)
3882*5ffd83dbSDimitry Andric       return true;
3883*5ffd83dbSDimitry Andric
3884*5ffd83dbSDimitry Andric     Register RepackedReg = handleD16VData(B, *MRI, VData);
3885*5ffd83dbSDimitry Andric     if (RepackedReg != VData) {
3886*5ffd83dbSDimitry Andric       MI.getOperand(1).setReg(RepackedReg);
3887*5ffd83dbSDimitry Andric     }
3888*5ffd83dbSDimitry Andric
3889*5ffd83dbSDimitry Andric     return true;
3890*5ffd83dbSDimitry Andric   }
3891*5ffd83dbSDimitry Andric
3892*5ffd83dbSDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
3893*5ffd83dbSDimitry Andric   LLT Ty = MRI->getType(DstReg);
3894*5ffd83dbSDimitry Andric   const LLT EltTy = Ty.getScalarType();
3895*5ffd83dbSDimitry Andric   const bool IsD16 = Ty.getScalarType() == S16;
3896*5ffd83dbSDimitry Andric   const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
3897*5ffd83dbSDimitry Andric
3898*5ffd83dbSDimitry Andric   // Confirm that the return type is large enough for the dmask specified
3899*5ffd83dbSDimitry Andric   if (NumElts < DMaskLanes)
3900*5ffd83dbSDimitry Andric     return false;
3901*5ffd83dbSDimitry Andric
3902*5ffd83dbSDimitry Andric   if (NumElts > 4 || DMaskLanes > 4)
3903*5ffd83dbSDimitry Andric     return false;
3904*5ffd83dbSDimitry Andric
3905*5ffd83dbSDimitry Andric   const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
3906*5ffd83dbSDimitry Andric   const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
3907*5ffd83dbSDimitry Andric
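  // Worked example (informal) for the types chosen below: a packed-d16 load
  // with DMaskLanes = 3 has AdjustedTy = <3 x s16> (48 bits), so
  // RoundedElts = 2, RoundedTy = <4 x s16>, TFETy = <3 x s32>, and
  // RegTy = V2S16 when TFE is off (S32 when it is on).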
3908*5ffd83dbSDimitry Andric   // The raw dword aligned data component of the load. The only legal cases
3909*5ffd83dbSDimitry Andric   // where this matters should be when using the packed D16 format, for
3910*5ffd83dbSDimitry Andric   // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
3911*5ffd83dbSDimitry Andric   LLT RoundedTy;
3912*5ffd83dbSDimitry Andric
3913*5ffd83dbSDimitry Andric   // S32 vector to cover all data, plus TFE result element.
3914*5ffd83dbSDimitry Andric   LLT TFETy;
3915*5ffd83dbSDimitry Andric
3916*5ffd83dbSDimitry Andric   // Register type to use for each loaded component. Will be S32 or V2S16.
3917*5ffd83dbSDimitry Andric   LLT RegTy;
3918*5ffd83dbSDimitry Andric
3919*5ffd83dbSDimitry Andric   if (IsD16 && ST.hasUnpackedD16VMem()) {
3920*5ffd83dbSDimitry Andric     RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
3921*5ffd83dbSDimitry Andric     TFETy = LLT::vector(AdjustedNumElts + 1, 32);
3922*5ffd83dbSDimitry Andric     RegTy = S32;
3923*5ffd83dbSDimitry Andric   } else {
3924*5ffd83dbSDimitry Andric     unsigned EltSize = EltTy.getSizeInBits();
3925*5ffd83dbSDimitry Andric     unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
3926*5ffd83dbSDimitry Andric     unsigned RoundedSize = 32 * RoundedElts;
3927*5ffd83dbSDimitry Andric     RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
3928*5ffd83dbSDimitry Andric     TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
3929*5ffd83dbSDimitry Andric     RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
3930*5ffd83dbSDimitry Andric   }
3931*5ffd83dbSDimitry Andric
3932*5ffd83dbSDimitry Andric   // The return type does not need adjustment.
3933*5ffd83dbSDimitry Andric   // TODO: Should we change s16 case to s32 or <2 x s16>?
3934*5ffd83dbSDimitry Andric   if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
3935*5ffd83dbSDimitry Andric     return true;
3936*5ffd83dbSDimitry Andric
3937*5ffd83dbSDimitry Andric   Register Dst1Reg;
3938*5ffd83dbSDimitry Andric
3939*5ffd83dbSDimitry Andric   // Insert after the instruction.
3940*5ffd83dbSDimitry Andric   B.setInsertPt(*MI.getParent(), ++MI.getIterator());
3941*5ffd83dbSDimitry Andric
3942*5ffd83dbSDimitry Andric   // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
3943*5ffd83dbSDimitry Andric   // s16> instead of s32, we would only need 1 bitcast instead of multiple.
3944*5ffd83dbSDimitry Andric   const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
3945*5ffd83dbSDimitry Andric   const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
3946*5ffd83dbSDimitry Andric
3947*5ffd83dbSDimitry Andric   Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
3948*5ffd83dbSDimitry Andric
3949*5ffd83dbSDimitry Andric   MI.getOperand(0).setReg(NewResultReg);
3950*5ffd83dbSDimitry Andric
3951*5ffd83dbSDimitry Andric   // In the IR, TFE is supposed to be used with a 2 element struct return
3952*5ffd83dbSDimitry Andric   // type. The instruction really returns these two values in one contiguous
3953*5ffd83dbSDimitry Andric   // register, with one additional dword beyond the loaded data. Rewrite the
3954*5ffd83dbSDimitry Andric   // return type to use a single register result.
3955*5ffd83dbSDimitry Andric
3956*5ffd83dbSDimitry Andric   if (IsTFE) {
3957*5ffd83dbSDimitry Andric     Dst1Reg = MI.getOperand(1).getReg();
3958*5ffd83dbSDimitry Andric     if (MRI->getType(Dst1Reg) != S32)
3959*5ffd83dbSDimitry Andric       return false;
3960*5ffd83dbSDimitry Andric
3961*5ffd83dbSDimitry Andric     // TODO: Make sure the TFE operand bit is set.
3962*5ffd83dbSDimitry Andric     MI.RemoveOperand(1);
3963*5ffd83dbSDimitry Andric
3964*5ffd83dbSDimitry Andric     // Handle the easy case that requires no repack instructions.
3965*5ffd83dbSDimitry Andric     if (Ty == S32) {
3966*5ffd83dbSDimitry Andric       B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
3967*5ffd83dbSDimitry Andric       return true;
3968*5ffd83dbSDimitry Andric     }
3969*5ffd83dbSDimitry Andric   }
3970*5ffd83dbSDimitry Andric
3971*5ffd83dbSDimitry Andric   // Now figure out how to copy the new result register back into the old
3972*5ffd83dbSDimitry Andric   // result.
3973*5ffd83dbSDimitry Andric   SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
3974*5ffd83dbSDimitry Andric
3975*5ffd83dbSDimitry Andric   const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
3976*5ffd83dbSDimitry Andric
3977*5ffd83dbSDimitry Andric   if (ResultNumRegs == 1) {
3978*5ffd83dbSDimitry Andric     assert(!IsTFE);
3979*5ffd83dbSDimitry Andric     ResultRegs[0] = NewResultReg;
3980*5ffd83dbSDimitry Andric   } else {
3981*5ffd83dbSDimitry Andric     // We have to repack into a new vector of some kind.
3982*5ffd83dbSDimitry Andric     for (int I = 0; I != NumDataRegs; ++I)
3983*5ffd83dbSDimitry Andric       ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
3984*5ffd83dbSDimitry Andric     B.buildUnmerge(ResultRegs, NewResultReg);
3985*5ffd83dbSDimitry Andric
3986*5ffd83dbSDimitry Andric     // Drop the final TFE element to get the data part. The TFE result is
3987*5ffd83dbSDimitry Andric     // directly written to the right place already.
3988*5ffd83dbSDimitry Andric     if (IsTFE)
3989*5ffd83dbSDimitry Andric       ResultRegs.resize(NumDataRegs);
3990*5ffd83dbSDimitry Andric   }
3991*5ffd83dbSDimitry Andric
3992*5ffd83dbSDimitry Andric   // For an s16 scalar result, we form an s32 result with a truncate regardless
3993*5ffd83dbSDimitry Andric   // of packed vs. unpacked.
3994*5ffd83dbSDimitry Andric   if (IsD16 && !Ty.isVector()) {
3995*5ffd83dbSDimitry Andric     B.buildTrunc(DstReg, ResultRegs[0]);
3996*5ffd83dbSDimitry Andric     return true;
3997*5ffd83dbSDimitry Andric   }
3998*5ffd83dbSDimitry Andric
3999*5ffd83dbSDimitry Andric   // Avoid a build/concat_vector of 1 entry.
4000*5ffd83dbSDimitry Andric   if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
4001*5ffd83dbSDimitry Andric     B.buildBitcast(DstReg, ResultRegs[0]);
4002*5ffd83dbSDimitry Andric     return true;
4003*5ffd83dbSDimitry Andric   }
4004*5ffd83dbSDimitry Andric
4005*5ffd83dbSDimitry Andric   assert(Ty.isVector());
4006*5ffd83dbSDimitry Andric
4007*5ffd83dbSDimitry Andric   if (IsD16) {
4008*5ffd83dbSDimitry Andric     // For packed D16 results with TFE enabled, all the data components are
4009*5ffd83dbSDimitry Andric     // S32. Cast back to the expected type.
4010*5ffd83dbSDimitry Andric     //
4011*5ffd83dbSDimitry Andric     // TODO: We don't really need to load s32 elements. We would only need one
4012*5ffd83dbSDimitry Andric     // cast for the TFE result if a multiple of v2s16 was used.
4013*5ffd83dbSDimitry Andric     if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
4014*5ffd83dbSDimitry Andric       for (Register &Reg : ResultRegs)
4015*5ffd83dbSDimitry Andric         Reg = B.buildBitcast(V2S16, Reg).getReg(0);
4016*5ffd83dbSDimitry Andric     } else if (ST.hasUnpackedD16VMem()) {
4017*5ffd83dbSDimitry Andric       for (Register &Reg : ResultRegs)
4018*5ffd83dbSDimitry Andric         Reg = B.buildTrunc(S16, Reg).getReg(0);
4019*5ffd83dbSDimitry Andric     }
4020*5ffd83dbSDimitry Andric   }
4021*5ffd83dbSDimitry Andric
4022*5ffd83dbSDimitry Andric   auto padWithUndef = [&](LLT Ty, int NumElts) {
4023*5ffd83dbSDimitry Andric     if (NumElts == 0)
4024*5ffd83dbSDimitry Andric       return;
4025*5ffd83dbSDimitry Andric     Register Undef = B.buildUndef(Ty).getReg(0);
4026*5ffd83dbSDimitry Andric     for (int I = 0; I != NumElts; ++I)
4027*5ffd83dbSDimitry Andric       ResultRegs.push_back(Undef);
4028*5ffd83dbSDimitry Andric   };
4029*5ffd83dbSDimitry Andric
4030*5ffd83dbSDimitry Andric   // Pad out any elements eliminated due to the dmask.
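  // For instance (informal), a dmask with two set bits loading into a v4f32
  // result yields two data registers; two undefs are appended so the final
  // build/concat still covers all four elements of the original result type.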
4031*5ffd83dbSDimitry Andric   LLT ResTy = MRI->getType(ResultRegs[0]);
4032*5ffd83dbSDimitry Andric   if (!ResTy.isVector()) {
4033*5ffd83dbSDimitry Andric     padWithUndef(ResTy, NumElts - ResultRegs.size());
4034*5ffd83dbSDimitry Andric     B.buildBuildVector(DstReg, ResultRegs);
4035*5ffd83dbSDimitry Andric     return true;
4036*5ffd83dbSDimitry Andric   }
4037*5ffd83dbSDimitry Andric
4038*5ffd83dbSDimitry Andric   assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
4039*5ffd83dbSDimitry Andric   const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
4040*5ffd83dbSDimitry Andric
4041*5ffd83dbSDimitry Andric   // Deal with the one annoying legal case.
4042*5ffd83dbSDimitry Andric   const LLT V3S16 = LLT::vector(3, 16);
4043*5ffd83dbSDimitry Andric   if (Ty == V3S16) {
4044*5ffd83dbSDimitry Andric     padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
4045*5ffd83dbSDimitry Andric     auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
4046*5ffd83dbSDimitry Andric     B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
4047*5ffd83dbSDimitry Andric     return true;
4048*5ffd83dbSDimitry Andric   }
4049*5ffd83dbSDimitry Andric
4050*5ffd83dbSDimitry Andric   padWithUndef(ResTy, RegsToCover - ResultRegs.size());
4051*5ffd83dbSDimitry Andric   B.buildConcatVectors(DstReg, ResultRegs);
4052*5ffd83dbSDimitry Andric   return true;
4053*5ffd83dbSDimitry Andric }
4054*5ffd83dbSDimitry Andric
4055*5ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeSBufferLoad(
4056*5ffd83dbSDimitry Andric     MachineInstr &MI, MachineIRBuilder &B,
4057*5ffd83dbSDimitry Andric     GISelChangeObserver &Observer) const {
4058*5ffd83dbSDimitry Andric   Register Dst = MI.getOperand(0).getReg();
4059*5ffd83dbSDimitry Andric   LLT Ty = B.getMRI()->getType(Dst);
4060*5ffd83dbSDimitry Andric   unsigned Size = Ty.getSizeInBits();
4061*5ffd83dbSDimitry Andric   MachineFunction &MF = B.getMF();
4062*5ffd83dbSDimitry Andric
4063*5ffd83dbSDimitry Andric   Observer.changingInstr(MI);
4064*5ffd83dbSDimitry Andric
4065*5ffd83dbSDimitry Andric   // FIXME: We don't really need this intermediate instruction. The intrinsic
4066*5ffd83dbSDimitry Andric   // should be fixed to have a memory operand. Since it's readnone, we're not
4067*5ffd83dbSDimitry Andric   // allowed to add one.
4068*5ffd83dbSDimitry Andric   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
4069*5ffd83dbSDimitry Andric   MI.RemoveOperand(1); // Remove intrinsic ID
4070*5ffd83dbSDimitry Andric
4071*5ffd83dbSDimitry Andric   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
4072*5ffd83dbSDimitry Andric   // TODO: Should this use datalayout alignment?
4073*5ffd83dbSDimitry Andric   const unsigned MemSize = (Size + 7) / 8;
4074*5ffd83dbSDimitry Andric   const Align MemAlign(4);
4075*5ffd83dbSDimitry Andric   MachineMemOperand *MMO = MF.getMachineMemOperand(
4076*5ffd83dbSDimitry Andric       MachinePointerInfo(),
4077*5ffd83dbSDimitry Andric       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
4078*5ffd83dbSDimitry Andric           MachineMemOperand::MOInvariant,
4079*5ffd83dbSDimitry Andric       MemSize, MemAlign);
4080*5ffd83dbSDimitry Andric   MI.addMemOperand(MF, MMO);
4081*5ffd83dbSDimitry Andric
4082*5ffd83dbSDimitry Andric   // There are no 96-bit result scalar loads, but widening to 128-bit should
4083*5ffd83dbSDimitry Andric   // always be legal. We may need to restore this to a 96-bit result if it turns
4084*5ffd83dbSDimitry Andric   // out this needs to be converted to a vector load during RegBankSelect.
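// E.g. (sketch): an s96 result is widened to s128 with widenScalarDst, and
// a <3 x s32> result becomes <4 x s32> via moreElementsVectorDst, using the
// power-of-2 rounding helpers.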
4085*5ffd83dbSDimitry Andric   if (!isPowerOf2_32(Size)) {
4086*5ffd83dbSDimitry Andric     LegalizerHelper Helper(MF, *this, Observer, B);
4087*5ffd83dbSDimitry Andric
4088*5ffd83dbSDimitry Andric     if (Ty.isVector())
4089*5ffd83dbSDimitry Andric       Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
4090*5ffd83dbSDimitry Andric     else
4091*5ffd83dbSDimitry Andric       Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
4092*5ffd83dbSDimitry Andric   }
4093*5ffd83dbSDimitry Andric
4094*5ffd83dbSDimitry Andric   Observer.changedInstr(MI);
4095*5ffd83dbSDimitry Andric   return true;
4096*5ffd83dbSDimitry Andric }
4097*5ffd83dbSDimitry Andric
4098*5ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
40990b57cec5SDimitry Andric                                                 MachineRegisterInfo &MRI,
41000b57cec5SDimitry Andric                                                 MachineIRBuilder &B) const {
4101*5ffd83dbSDimitry Andric   // On a non-HSA path, or if the trap handler is disabled, insert s_endpgm.
4102*5ffd83dbSDimitry Andric   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4103*5ffd83dbSDimitry Andric       !ST.isTrapHandlerEnabled()) {
4104*5ffd83dbSDimitry Andric     B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
4105*5ffd83dbSDimitry Andric   } else {
4106*5ffd83dbSDimitry Andric     // Pass the queue pointer to the trap handler as input, and insert the
4107*5ffd83dbSDimitry Andric     // trap instruction.
4108*5ffd83dbSDimitry Andric     // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
4109*5ffd83dbSDimitry Andric     const ArgDescriptor *Arg =
4110*5ffd83dbSDimitry Andric         getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
4111*5ffd83dbSDimitry Andric     if (!Arg)
4112*5ffd83dbSDimitry Andric       return false;
4113*5ffd83dbSDimitry Andric     MachineRegisterInfo &MRI = *B.getMRI();
4114*5ffd83dbSDimitry Andric     Register SGPR01(AMDGPU::SGPR0_SGPR1);
4115*5ffd83dbSDimitry Andric     Register LiveIn = getLiveInRegister(
4116*5ffd83dbSDimitry Andric         B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
4117*5ffd83dbSDimitry Andric         /*InsertLiveInCopy=*/false);
4118*5ffd83dbSDimitry Andric     if (!loadInputValue(LiveIn, B, Arg))
4119*5ffd83dbSDimitry Andric       return false;
4120*5ffd83dbSDimitry Andric     B.buildCopy(SGPR01, LiveIn);
4121*5ffd83dbSDimitry Andric     B.buildInstr(AMDGPU::S_TRAP)
4122*5ffd83dbSDimitry Andric         .addImm(GCNSubtarget::TrapIDLLVMTrap)
4123*5ffd83dbSDimitry Andric         .addReg(SGPR01, RegState::Implicit);
4124*5ffd83dbSDimitry Andric   }
4125*5ffd83dbSDimitry Andric
4126*5ffd83dbSDimitry Andric   MI.eraseFromParent();
4127*5ffd83dbSDimitry Andric   return true;
4128*5ffd83dbSDimitry Andric }
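// Rough shape of what the HSA trap path above emits (pseudo-MIR sketch;
// register and value names are illustrative):
//   $sgpr0_sgpr1 = COPY %queue_ptr                 ; trap handler input
//   S_TRAP <TrapIDLLVMTrap>, implicit $sgpr0_sgpr1
// On non-HSA paths, or with the trap handler disabled, the program is
// simply terminated with S_ENDPGM 0 instead.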
4129*5ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
4130*5ffd83dbSDimitry Andric     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4131*5ffd83dbSDimitry Andric   // On a non-HSA path, or if the trap handler is disabled, report a warning
4132*5ffd83dbSDimitry Andric   // instead of emitting a trap.
4133*5ffd83dbSDimitry Andric   if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4134*5ffd83dbSDimitry Andric       !ST.isTrapHandlerEnabled()) {
4135*5ffd83dbSDimitry Andric     DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
4136*5ffd83dbSDimitry Andric                                      "debugtrap handler not supported",
4137*5ffd83dbSDimitry Andric                                      MI.getDebugLoc(), DS_Warning);
4138*5ffd83dbSDimitry Andric     LLVMContext &Ctx = B.getMF().getFunction().getContext();
4139*5ffd83dbSDimitry Andric     Ctx.diagnose(NoTrap);
4140*5ffd83dbSDimitry Andric   } else {
4141*5ffd83dbSDimitry Andric     // Insert the debug-trap instruction.
4142*5ffd83dbSDimitry Andric     B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
4143*5ffd83dbSDimitry Andric   }
4144*5ffd83dbSDimitry Andric
4145*5ffd83dbSDimitry Andric   MI.eraseFromParent();
4146*5ffd83dbSDimitry Andric   return true;
4147*5ffd83dbSDimitry Andric }
4148*5ffd83dbSDimitry Andric
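// Note that unlike llvm.trap above, llvm.debugtrap needs no queue-pointer
// input: on HSA with the trap handler enabled it lowers to a bare S_TRAP
// with TrapIDLLVMDebugTrap, and otherwise the intrinsic is dropped after a
// DS_Warning diagnostic rather than failing legalization.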
4149*5ffd83dbSDimitry Andric bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
4150*5ffd83dbSDimitry Andric                                             MachineInstr &MI) const {
4151*5ffd83dbSDimitry Andric   MachineIRBuilder &B = Helper.MIRBuilder;
4152*5ffd83dbSDimitry Andric   MachineRegisterInfo &MRI = *B.getMRI();
4153*5ffd83dbSDimitry Andric
41540b57cec5SDimitry Andric   // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
4155480093f4SDimitry Andric   auto IntrID = MI.getIntrinsicID();
4156480093f4SDimitry Andric   switch (IntrID) {
4157480093f4SDimitry Andric   case Intrinsic::amdgcn_if:
4158480093f4SDimitry Andric   case Intrinsic::amdgcn_else: {
4159480093f4SDimitry Andric     MachineInstr *Br = nullptr;
4160*5ffd83dbSDimitry Andric     MachineBasicBlock *UncondBrTarget = nullptr;
4161*5ffd83dbSDimitry Andric     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
41620b57cec5SDimitry Andric       const SIRegisterInfo *TRI
41630b57cec5SDimitry Andric         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
41640b57cec5SDimitry Andric
41650b57cec5SDimitry Andric       Register Def = MI.getOperand(1).getReg();
41660b57cec5SDimitry Andric       Register Use = MI.getOperand(3).getReg();
4167480093f4SDimitry Andric
4168*5ffd83dbSDimitry Andric       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
4169*5ffd83dbSDimitry Andric       B.setInsertPt(B.getMBB(), BrCond->getIterator());
4170480093f4SDimitry Andric       if (IntrID == Intrinsic::amdgcn_if) {
41710b57cec5SDimitry Andric         B.buildInstr(AMDGPU::SI_IF)
41720b57cec5SDimitry Andric           .addDef(Def)
41730b57cec5SDimitry Andric           .addUse(Use)
4174*5ffd83dbSDimitry Andric           .addMBB(UncondBrTarget);
4175480093f4SDimitry Andric       } else {
4176480093f4SDimitry Andric         B.buildInstr(AMDGPU::SI_ELSE)
4177480093f4SDimitry Andric           .addDef(Def)
4178480093f4SDimitry Andric           .addUse(Use)
4179*5ffd83dbSDimitry Andric           .addMBB(UncondBrTarget)
4180480093f4SDimitry Andric           .addImm(0);
4181480093f4SDimitry Andric       }
4182480093f4SDimitry Andric
4183*5ffd83dbSDimitry Andric       if (Br) {
4184*5ffd83dbSDimitry Andric         Br->getOperand(0).setMBB(CondBrTarget);
4185*5ffd83dbSDimitry Andric       } else {
4186*5ffd83dbSDimitry Andric         // The IRTranslator skips inserting the G_BR for fallthrough cases, but
4187*5ffd83dbSDimitry Andric         // since we're swapping branch targets it needs to be reinserted.
4188*5ffd83dbSDimitry Andric         // FIXME: IRTranslator should probably not do this.
4189*5ffd83dbSDimitry Andric         B.buildBr(*CondBrTarget);
4190*5ffd83dbSDimitry Andric       }
41910b57cec5SDimitry Andric
41920b57cec5SDimitry Andric       MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
41930b57cec5SDimitry Andric       MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
41940b57cec5SDimitry Andric       MI.eraseFromParent();
41950b57cec5SDimitry Andric       BrCond->eraseFromParent();
41960b57cec5SDimitry Andric       return true;
41970b57cec5SDimitry Andric     }
41980b57cec5SDimitry Andric
41990b57cec5SDimitry Andric     return false;
42000b57cec5SDimitry Andric   }
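// CFG sketch for the amdgcn_if/else lowering above (block names are
// illustrative):
//   before: G_BRCOND %tok, %bb.A     ; followed by G_BR %bb.B, or fallthrough
//   after:  SI_IF %cond, %bb.B       ; exec-mask pseudo
//           G_BR %bb.A
// The conditional and unconditional targets swap; the same swap is
// performed for amdgcn_loop below.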
42010b57cec5SDimitry Andric   case Intrinsic::amdgcn_loop: {
4202480093f4SDimitry Andric     MachineInstr *Br = nullptr;
4203*5ffd83dbSDimitry Andric     MachineBasicBlock *UncondBrTarget = nullptr;
4204*5ffd83dbSDimitry Andric     if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
42050b57cec5SDimitry Andric       const SIRegisterInfo *TRI
42060b57cec5SDimitry Andric         = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
42070b57cec5SDimitry Andric
4208*5ffd83dbSDimitry Andric       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
42090b57cec5SDimitry Andric       Register Reg = MI.getOperand(2).getReg();
4210*5ffd83dbSDimitry Andric
4211*5ffd83dbSDimitry Andric       B.setInsertPt(B.getMBB(), BrCond->getIterator());
42120b57cec5SDimitry Andric       B.buildInstr(AMDGPU::SI_LOOP)
42130b57cec5SDimitry Andric         .addUse(Reg)
4214*5ffd83dbSDimitry Andric         .addMBB(UncondBrTarget);
4215*5ffd83dbSDimitry Andric
4216*5ffd83dbSDimitry Andric       if (Br)
4217*5ffd83dbSDimitry Andric         Br->getOperand(0).setMBB(CondBrTarget);
4218*5ffd83dbSDimitry Andric       else
4219*5ffd83dbSDimitry Andric         B.buildBr(*CondBrTarget);
4220*5ffd83dbSDimitry Andric
42210b57cec5SDimitry Andric       MI.eraseFromParent();
42220b57cec5SDimitry Andric       BrCond->eraseFromParent();
42230b57cec5SDimitry Andric       MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
42240b57cec5SDimitry Andric       return true;
42250b57cec5SDimitry Andric     }
42260b57cec5SDimitry Andric
42270b57cec5SDimitry Andric     return false;
42280b57cec5SDimitry Andric   }
42290b57cec5SDimitry Andric   case Intrinsic::amdgcn_kernarg_segment_ptr:
4230*5ffd83dbSDimitry Andric     if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
4231*5ffd83dbSDimitry Andric       // This only makes sense to call in a kernel, so just lower to null.
4232*5ffd83dbSDimitry Andric       B.buildConstant(MI.getOperand(0).getReg(), 0);
4233*5ffd83dbSDimitry Andric       MI.eraseFromParent();
4234*5ffd83dbSDimitry Andric       return true;
4235*5ffd83dbSDimitry Andric     }
4236*5ffd83dbSDimitry Andric
42370b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(
42380b57cec5SDimitry Andric       MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
42390b57cec5SDimitry Andric   case Intrinsic::amdgcn_implicitarg_ptr:
42400b57cec5SDimitry Andric     return legalizeImplicitArgPtr(MI, MRI, B);
42410b57cec5SDimitry Andric   case Intrinsic::amdgcn_workitem_id_x:
42420b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
42430b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
42440b57cec5SDimitry Andric   case Intrinsic::amdgcn_workitem_id_y:
42450b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
42460b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
42470b57cec5SDimitry Andric   case Intrinsic::amdgcn_workitem_id_z:
42480b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
42490b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
42500b57cec5SDimitry Andric   case Intrinsic::amdgcn_workgroup_id_x:
42510b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
42520b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
42530b57cec5SDimitry Andric   case Intrinsic::amdgcn_workgroup_id_y:
42540b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
42550b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
42560b57cec5SDimitry Andric   case Intrinsic::amdgcn_workgroup_id_z:
42570b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
42580b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
42590b57cec5SDimitry Andric   case Intrinsic::amdgcn_dispatch_ptr:
42600b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
42610b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::DISPATCH_PTR);
42620b57cec5SDimitry Andric   case Intrinsic::amdgcn_queue_ptr:
42630b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
42640b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::QUEUE_PTR);
42650b57cec5SDimitry Andric   case Intrinsic::amdgcn_implicit_buffer_ptr:
42660b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(
42670b57cec5SDimitry Andric       MI, MRI, B, AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
42680b57cec5SDimitry Andric   case Intrinsic::amdgcn_dispatch_id:
42690b57cec5SDimitry Andric     return legalizePreloadedArgIntrin(MI, MRI, B,
42700b57cec5SDimitry Andric                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
42718bcb0991SDimitry Andric   case Intrinsic::amdgcn_fdiv_fast:
42728bcb0991SDimitry Andric     return legalizeFDIVFastIntrin(MI, MRI, B);
42738bcb0991SDimitry Andric   case Intrinsic::amdgcn_is_shared:
42748bcb0991SDimitry Andric     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
42758bcb0991SDimitry Andric   case Intrinsic::amdgcn_is_private:
42768bcb0991SDimitry Andric     return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
42778bcb0991SDimitry Andric   case Intrinsic::amdgcn_wavefrontsize: {
42788bcb0991SDimitry Andric     B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
42798bcb0991SDimitry Andric     MI.eraseFromParent();
42808bcb0991SDimitry Andric     return true;
42818bcb0991SDimitry Andric   }
4282*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_s_buffer_load:
4283*5ffd83dbSDimitry Andric     return legalizeSBufferLoad(MI, B, Helper.Observer);
42848bcb0991SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_store:
4285*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_store:
4286*5ffd83dbSDimitry Andric     return legalizeBufferStore(MI, MRI, B, false, false);
42878bcb0991SDimitry Andric   case Intrinsic::amdgcn_raw_buffer_store_format:
4288*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_store_format:
4289*5ffd83dbSDimitry Andric     return legalizeBufferStore(MI, MRI, B, false, true);
4290*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_tbuffer_store:
4291*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_tbuffer_store:
4292*5ffd83dbSDimitry Andric     return legalizeBufferStore(MI, MRI, B, true, true);
4293*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_load:
4294*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_load:
4295*5ffd83dbSDimitry Andric     return legalizeBufferLoad(MI, MRI, B, false, false);
4296*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_load_format:
4297*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_load_format:
4298*5ffd83dbSDimitry Andric     return legalizeBufferLoad(MI, MRI, B, true, false);
4299*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_tbuffer_load:
4300*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_tbuffer_load:
4301*5ffd83dbSDimitry Andric     return legalizeBufferLoad(MI, MRI, B, true, true);
4302*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
4303*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
4304*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_add:
4305*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_add:
4306*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
4307*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
4308*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
4309*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
4310*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
4311*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
4312*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
4313*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
4314*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
4315*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
4316*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_and:
4317*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_and:
4318*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_or:
4319*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_or:
4320*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_xor:
4321*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_xor:
4322*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_inc:
4323*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_inc:
4324*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_dec:
4325*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_dec:
4326*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
4327*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
4328*5ffd83dbSDimitry Andric     return legalizeBufferAtomic(MI, B, IntrID);
4329*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_atomic_inc:
4330*5ffd83dbSDimitry Andric     return legalizeAtomicIncDec(MI, B, true);
4331*5ffd83dbSDimitry Andric   case Intrinsic::amdgcn_atomic_dec:
4332*5ffd83dbSDimitry Andric     return legalizeAtomicIncDec(MI, B, false);
4333*5ffd83dbSDimitry Andric   case Intrinsic::trap:
4334*5ffd83dbSDimitry Andric     return legalizeTrapIntrinsic(MI, MRI, B);
4335*5ffd83dbSDimitry Andric   case Intrinsic::debugtrap:
4336*5ffd83dbSDimitry Andric     return legalizeDebugTrapIntrinsic(MI, MRI, B);
4337*5ffd83dbSDimitry Andric   default: {
4338*5ffd83dbSDimitry Andric     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
4339*5ffd83dbSDimitry Andric             AMDGPU::getImageDimIntrinsicInfo(IntrID))
4340*5ffd83dbSDimitry Andric       return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
43410b57cec5SDimitry Andric     return true;
43420b57cec5SDimitry Andric   }
4343*5ffd83dbSDimitry Andric   }
43440b57cec5SDimitry Andric
43450b57cec5SDimitry Andric   return true;
43460b57cec5SDimitry Andric }
4347