1*06c3fb27SDimitry Andric //=== lib/CodeGen/GlobalISel/AMDGPUPostLegalizerCombiner.cpp --------------===// 25ffd83dbSDimitry Andric // 35ffd83dbSDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 45ffd83dbSDimitry Andric // See https://llvm.org/LICENSE.txt for license information. 55ffd83dbSDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 65ffd83dbSDimitry Andric // 75ffd83dbSDimitry Andric //===----------------------------------------------------------------------===// 85ffd83dbSDimitry Andric // 95ffd83dbSDimitry Andric // This pass does combining of machine instructions at the generic MI level, 105ffd83dbSDimitry Andric // after the legalizer. 115ffd83dbSDimitry Andric // 125ffd83dbSDimitry Andric //===----------------------------------------------------------------------===// 135ffd83dbSDimitry Andric 14e8d8bef9SDimitry Andric #include "AMDGPU.h" 15349cc55cSDimitry Andric #include "AMDGPUCombinerHelper.h" 165ffd83dbSDimitry Andric #include "AMDGPULegalizerInfo.h" 17e8d8bef9SDimitry Andric #include "GCNSubtarget.h" 18e8d8bef9SDimitry Andric #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 195ffd83dbSDimitry Andric #include "llvm/CodeGen/GlobalISel/Combiner.h" 205ffd83dbSDimitry Andric #include "llvm/CodeGen/GlobalISel/CombinerHelper.h" 215ffd83dbSDimitry Andric #include "llvm/CodeGen/GlobalISel/CombinerInfo.h" 22*06c3fb27SDimitry Andric #include "llvm/CodeGen/GlobalISel/GIMatchTableExecutor.h" 23*06c3fb27SDimitry Andric #include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h" 245ffd83dbSDimitry Andric #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" 255ffd83dbSDimitry Andric #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" 265ffd83dbSDimitry Andric #include "llvm/CodeGen/MachineDominators.h" 275ffd83dbSDimitry Andric #include "llvm/CodeGen/TargetPassConfig.h" 284824e7fdSDimitry Andric #include "llvm/IR/IntrinsicsAMDGPU.h" 29e8d8bef9SDimitry Andric #include "llvm/Target/TargetMachine.h" 
305ffd83dbSDimitry Andric 31*06c3fb27SDimitry Andric #define GET_GICOMBINER_DEPS 32*06c3fb27SDimitry Andric #include "AMDGPUGenPreLegalizeGICombiner.inc" 33*06c3fb27SDimitry Andric #undef GET_GICOMBINER_DEPS 34*06c3fb27SDimitry Andric 355ffd83dbSDimitry Andric #define DEBUG_TYPE "amdgpu-postlegalizer-combiner" 365ffd83dbSDimitry Andric 375ffd83dbSDimitry Andric using namespace llvm; 385ffd83dbSDimitry Andric using namespace MIPatternMatch; 395ffd83dbSDimitry Andric 40*06c3fb27SDimitry Andric namespace { 41*06c3fb27SDimitry Andric #define GET_GICOMBINER_TYPES 42*06c3fb27SDimitry Andric #include "AMDGPUGenPostLegalizeGICombiner.inc" 43*06c3fb27SDimitry Andric #undef GET_GICOMBINER_TYPES 44*06c3fb27SDimitry Andric 45*06c3fb27SDimitry Andric class AMDGPUPostLegalizerCombinerImpl : public GIMatchTableExecutor { 46e8d8bef9SDimitry Andric protected: 47*06c3fb27SDimitry Andric const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig; 48*06c3fb27SDimitry Andric 49e8d8bef9SDimitry Andric MachineIRBuilder &B; 50e8d8bef9SDimitry Andric MachineFunction &MF; 51e8d8bef9SDimitry Andric MachineRegisterInfo &MRI; 52*06c3fb27SDimitry Andric const GCNSubtarget &STI; 53*06c3fb27SDimitry Andric const SIInstrInfo &TII; 54349cc55cSDimitry Andric AMDGPUCombinerHelper &Helper; 55*06c3fb27SDimitry Andric GISelChangeObserver &Observer; 56e8d8bef9SDimitry Andric 57e8d8bef9SDimitry Andric public: 58*06c3fb27SDimitry Andric AMDGPUPostLegalizerCombinerImpl( 59*06c3fb27SDimitry Andric const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig, 60*06c3fb27SDimitry Andric MachineIRBuilder &B, AMDGPUCombinerHelper &Helper, 61*06c3fb27SDimitry Andric GISelChangeObserver &Observer); 62*06c3fb27SDimitry Andric 63*06c3fb27SDimitry Andric static const char *getName() { return "AMDGPUPostLegalizerCombinerImpl"; } 64*06c3fb27SDimitry Andric 65*06c3fb27SDimitry Andric bool tryCombineAll(MachineInstr &I) const; 66e8d8bef9SDimitry Andric 675ffd83dbSDimitry Andric struct FMinFMaxLegacyInfo { 
685ffd83dbSDimitry Andric Register LHS; 695ffd83dbSDimitry Andric Register RHS; 705ffd83dbSDimitry Andric Register True; 715ffd83dbSDimitry Andric Register False; 725ffd83dbSDimitry Andric CmpInst::Predicate Pred; 735ffd83dbSDimitry Andric }; 745ffd83dbSDimitry Andric 755ffd83dbSDimitry Andric // TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize 76*06c3fb27SDimitry Andric bool matchFMinFMaxLegacy(MachineInstr &MI, FMinFMaxLegacyInfo &Info) const; 77e8d8bef9SDimitry Andric void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI, 78*06c3fb27SDimitry Andric const FMinFMaxLegacyInfo &Info) const; 79e8d8bef9SDimitry Andric 80*06c3fb27SDimitry Andric bool matchUCharToFloat(MachineInstr &MI) const; 81*06c3fb27SDimitry Andric void applyUCharToFloat(MachineInstr &MI) const; 82e8d8bef9SDimitry Andric 83*06c3fb27SDimitry Andric bool 84*06c3fb27SDimitry Andric matchRcpSqrtToRsq(MachineInstr &MI, 85*06c3fb27SDimitry Andric std::function<void(MachineIRBuilder &)> &MatchInfo) const; 864824e7fdSDimitry Andric 87e8d8bef9SDimitry Andric // FIXME: Should be able to have 2 separate matchdatas rather than custom 88e8d8bef9SDimitry Andric // struct boilerplate. 89e8d8bef9SDimitry Andric struct CvtF32UByteMatchInfo { 90e8d8bef9SDimitry Andric Register CvtVal; 91e8d8bef9SDimitry Andric unsigned ShiftOffset; 92e8d8bef9SDimitry Andric }; 93e8d8bef9SDimitry Andric 94*06c3fb27SDimitry Andric bool matchCvtF32UByteN(MachineInstr &MI, 95*06c3fb27SDimitry Andric CvtF32UByteMatchInfo &MatchInfo) const; 96e8d8bef9SDimitry Andric void applyCvtF32UByteN(MachineInstr &MI, 97*06c3fb27SDimitry Andric const CvtF32UByteMatchInfo &MatchInfo) const; 98fe6060f1SDimitry Andric 99*06c3fb27SDimitry Andric bool matchRemoveFcanonicalize(MachineInstr &MI, Register &Reg) const; 100*06c3fb27SDimitry Andric 101*06c3fb27SDimitry Andric // Combine unsigned buffer load and signed extension instructions to generate 102*06c3fb27SDimitry Andric // signed buffer laod instructions. 
103*06c3fb27SDimitry Andric bool matchCombineSignExtendInReg(MachineInstr &MI, 104*06c3fb27SDimitry Andric MachineInstr *&MatchInfo) const; 105*06c3fb27SDimitry Andric void applyCombineSignExtendInReg(MachineInstr &MI, 106*06c3fb27SDimitry Andric MachineInstr *&MatchInfo) const; 107*06c3fb27SDimitry Andric 108*06c3fb27SDimitry Andric private: 109*06c3fb27SDimitry Andric #define GET_GICOMBINER_CLASS_MEMBERS 110*06c3fb27SDimitry Andric #define AMDGPUSubtarget GCNSubtarget 111*06c3fb27SDimitry Andric #include "AMDGPUGenPostLegalizeGICombiner.inc" 112*06c3fb27SDimitry Andric #undef GET_GICOMBINER_CLASS_MEMBERS 113*06c3fb27SDimitry Andric #undef AMDGPUSubtarget 114e8d8bef9SDimitry Andric }; 115e8d8bef9SDimitry Andric 116*06c3fb27SDimitry Andric #define GET_GICOMBINER_IMPL 117*06c3fb27SDimitry Andric #define AMDGPUSubtarget GCNSubtarget 118*06c3fb27SDimitry Andric #include "AMDGPUGenPostLegalizeGICombiner.inc" 119*06c3fb27SDimitry Andric #undef AMDGPUSubtarget 120*06c3fb27SDimitry Andric #undef GET_GICOMBINER_IMPL 121*06c3fb27SDimitry Andric 122*06c3fb27SDimitry Andric AMDGPUPostLegalizerCombinerImpl::AMDGPUPostLegalizerCombinerImpl( 123*06c3fb27SDimitry Andric const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig, 124*06c3fb27SDimitry Andric MachineIRBuilder &B, AMDGPUCombinerHelper &Helper, 125*06c3fb27SDimitry Andric GISelChangeObserver &Observer) 126*06c3fb27SDimitry Andric : RuleConfig(RuleConfig), B(B), MF(B.getMF()), MRI(*B.getMRI()), 127*06c3fb27SDimitry Andric STI(MF.getSubtarget<GCNSubtarget>()), TII(*STI.getInstrInfo()), 128*06c3fb27SDimitry Andric Helper(Helper), Observer(Observer), 129*06c3fb27SDimitry Andric #define GET_GICOMBINER_CONSTRUCTOR_INITS 130*06c3fb27SDimitry Andric #include "AMDGPUGenPostLegalizeGICombiner.inc" 131*06c3fb27SDimitry Andric #undef GET_GICOMBINER_CONSTRUCTOR_INITS 132*06c3fb27SDimitry Andric { 133*06c3fb27SDimitry Andric } 134*06c3fb27SDimitry Andric 135*06c3fb27SDimitry Andric bool 
AMDGPUPostLegalizerCombinerImpl::matchFMinFMaxLegacy( 136*06c3fb27SDimitry Andric MachineInstr &MI, FMinFMaxLegacyInfo &Info) const { 1375ffd83dbSDimitry Andric // FIXME: Type predicate on pattern 1385ffd83dbSDimitry Andric if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32)) 1395ffd83dbSDimitry Andric return false; 1405ffd83dbSDimitry Andric 1415ffd83dbSDimitry Andric Register Cond = MI.getOperand(1).getReg(); 1425ffd83dbSDimitry Andric if (!MRI.hasOneNonDBGUse(Cond) || 1435ffd83dbSDimitry Andric !mi_match(Cond, MRI, 1445ffd83dbSDimitry Andric m_GFCmp(m_Pred(Info.Pred), m_Reg(Info.LHS), m_Reg(Info.RHS)))) 1455ffd83dbSDimitry Andric return false; 1465ffd83dbSDimitry Andric 1475ffd83dbSDimitry Andric Info.True = MI.getOperand(2).getReg(); 1485ffd83dbSDimitry Andric Info.False = MI.getOperand(3).getReg(); 1495ffd83dbSDimitry Andric 150*06c3fb27SDimitry Andric // TODO: Handle case where the the selected value is an fneg and the compared 151*06c3fb27SDimitry Andric // constant is the negation of the selected value. 
1525ffd83dbSDimitry Andric if (!(Info.LHS == Info.True && Info.RHS == Info.False) && 1535ffd83dbSDimitry Andric !(Info.LHS == Info.False && Info.RHS == Info.True)) 1545ffd83dbSDimitry Andric return false; 1555ffd83dbSDimitry Andric 1565ffd83dbSDimitry Andric switch (Info.Pred) { 1575ffd83dbSDimitry Andric case CmpInst::FCMP_FALSE: 1585ffd83dbSDimitry Andric case CmpInst::FCMP_OEQ: 1595ffd83dbSDimitry Andric case CmpInst::FCMP_ONE: 1605ffd83dbSDimitry Andric case CmpInst::FCMP_ORD: 1615ffd83dbSDimitry Andric case CmpInst::FCMP_UNO: 1625ffd83dbSDimitry Andric case CmpInst::FCMP_UEQ: 1635ffd83dbSDimitry Andric case CmpInst::FCMP_UNE: 1645ffd83dbSDimitry Andric case CmpInst::FCMP_TRUE: 1655ffd83dbSDimitry Andric return false; 1665ffd83dbSDimitry Andric default: 1675ffd83dbSDimitry Andric return true; 1685ffd83dbSDimitry Andric } 1695ffd83dbSDimitry Andric } 1705ffd83dbSDimitry Andric 171*06c3fb27SDimitry Andric void AMDGPUPostLegalizerCombinerImpl::applySelectFCmpToFMinToFMaxLegacy( 172*06c3fb27SDimitry Andric MachineInstr &MI, const FMinFMaxLegacyInfo &Info) const { 173e8d8bef9SDimitry Andric B.setInstrAndDebugLoc(MI); 174e8d8bef9SDimitry Andric auto buildNewInst = [&MI, this](unsigned Opc, Register X, Register Y) { 175e8d8bef9SDimitry Andric B.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags()); 1765ffd83dbSDimitry Andric }; 1775ffd83dbSDimitry Andric 1785ffd83dbSDimitry Andric switch (Info.Pred) { 1795ffd83dbSDimitry Andric case CmpInst::FCMP_ULT: 1805ffd83dbSDimitry Andric case CmpInst::FCMP_ULE: 1815ffd83dbSDimitry Andric if (Info.LHS == Info.True) 1825ffd83dbSDimitry Andric buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS); 1835ffd83dbSDimitry Andric else 1845ffd83dbSDimitry Andric buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS); 1855ffd83dbSDimitry Andric break; 1865ffd83dbSDimitry Andric case CmpInst::FCMP_OLE: 1875ffd83dbSDimitry Andric case CmpInst::FCMP_OLT: { 1885ffd83dbSDimitry Andric // We need to permute the operands to 
get the correct NaN behavior. The 1895ffd83dbSDimitry Andric // selected operand is the second one based on the failing compare with NaN, 1905ffd83dbSDimitry Andric // so permute it based on the compare type the hardware uses. 1915ffd83dbSDimitry Andric if (Info.LHS == Info.True) 1925ffd83dbSDimitry Andric buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS); 1935ffd83dbSDimitry Andric else 1945ffd83dbSDimitry Andric buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS); 1955ffd83dbSDimitry Andric break; 1965ffd83dbSDimitry Andric } 1975ffd83dbSDimitry Andric case CmpInst::FCMP_UGE: 1985ffd83dbSDimitry Andric case CmpInst::FCMP_UGT: { 1995ffd83dbSDimitry Andric if (Info.LHS == Info.True) 2005ffd83dbSDimitry Andric buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS); 2015ffd83dbSDimitry Andric else 2025ffd83dbSDimitry Andric buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS); 2035ffd83dbSDimitry Andric break; 2045ffd83dbSDimitry Andric } 2055ffd83dbSDimitry Andric case CmpInst::FCMP_OGT: 2065ffd83dbSDimitry Andric case CmpInst::FCMP_OGE: { 2075ffd83dbSDimitry Andric if (Info.LHS == Info.True) 2085ffd83dbSDimitry Andric buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS); 2095ffd83dbSDimitry Andric else 2105ffd83dbSDimitry Andric buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS); 2115ffd83dbSDimitry Andric break; 2125ffd83dbSDimitry Andric } 2135ffd83dbSDimitry Andric default: 2145ffd83dbSDimitry Andric llvm_unreachable("predicate should not have matched"); 2155ffd83dbSDimitry Andric } 2165ffd83dbSDimitry Andric 2175ffd83dbSDimitry Andric MI.eraseFromParent(); 2185ffd83dbSDimitry Andric } 2195ffd83dbSDimitry Andric 220*06c3fb27SDimitry Andric bool AMDGPUPostLegalizerCombinerImpl::matchUCharToFloat( 221*06c3fb27SDimitry Andric MachineInstr &MI) const { 2225ffd83dbSDimitry Andric Register DstReg = MI.getOperand(0).getReg(); 2235ffd83dbSDimitry Andric 2245ffd83dbSDimitry Andric // TODO: We could try to 
match extracting the higher bytes, which would be 2255ffd83dbSDimitry Andric // easier if i8 vectors weren't promoted to i32 vectors, particularly after 2265ffd83dbSDimitry Andric // types are legalized. v4i8 -> v4f32 is probably the only case to worry 2275ffd83dbSDimitry Andric // about in practice. 2285ffd83dbSDimitry Andric LLT Ty = MRI.getType(DstReg); 2295ffd83dbSDimitry Andric if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) { 2305ffd83dbSDimitry Andric Register SrcReg = MI.getOperand(1).getReg(); 2315ffd83dbSDimitry Andric unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits(); 2325ffd83dbSDimitry Andric assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64); 2335ffd83dbSDimitry Andric const APInt Mask = APInt::getHighBitsSet(SrcSize, SrcSize - 8); 2345ffd83dbSDimitry Andric return Helper.getKnownBits()->maskedValueIsZero(SrcReg, Mask); 2355ffd83dbSDimitry Andric } 2365ffd83dbSDimitry Andric 2375ffd83dbSDimitry Andric return false; 2385ffd83dbSDimitry Andric } 2395ffd83dbSDimitry Andric 240*06c3fb27SDimitry Andric void AMDGPUPostLegalizerCombinerImpl::applyUCharToFloat( 241*06c3fb27SDimitry Andric MachineInstr &MI) const { 242e8d8bef9SDimitry Andric B.setInstrAndDebugLoc(MI); 2435ffd83dbSDimitry Andric 2445ffd83dbSDimitry Andric const LLT S32 = LLT::scalar(32); 2455ffd83dbSDimitry Andric 2465ffd83dbSDimitry Andric Register DstReg = MI.getOperand(0).getReg(); 2475ffd83dbSDimitry Andric Register SrcReg = MI.getOperand(1).getReg(); 248e8d8bef9SDimitry Andric LLT Ty = MRI.getType(DstReg); 249e8d8bef9SDimitry Andric LLT SrcTy = MRI.getType(SrcReg); 2505ffd83dbSDimitry Andric if (SrcTy != S32) 2515ffd83dbSDimitry Andric SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0); 2525ffd83dbSDimitry Andric 2535ffd83dbSDimitry Andric if (Ty == S32) { 254*06c3fb27SDimitry Andric B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg}, {SrcReg}, 255*06c3fb27SDimitry Andric MI.getFlags()); 2565ffd83dbSDimitry Andric } else { 257*06c3fb27SDimitry Andric auto Cvt0 = 
B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32}, {SrcReg}, 258*06c3fb27SDimitry Andric MI.getFlags()); 2595ffd83dbSDimitry Andric B.buildFPTrunc(DstReg, Cvt0, MI.getFlags()); 2605ffd83dbSDimitry Andric } 2615ffd83dbSDimitry Andric 2625ffd83dbSDimitry Andric MI.eraseFromParent(); 2635ffd83dbSDimitry Andric } 2645ffd83dbSDimitry Andric 265*06c3fb27SDimitry Andric bool AMDGPUPostLegalizerCombinerImpl::matchRcpSqrtToRsq( 266*06c3fb27SDimitry Andric MachineInstr &MI, 267*06c3fb27SDimitry Andric std::function<void(MachineIRBuilder &)> &MatchInfo) const { 2684824e7fdSDimitry Andric 2694824e7fdSDimitry Andric auto getRcpSrc = [=](const MachineInstr &MI) { 2704824e7fdSDimitry Andric MachineInstr *ResMI = nullptr; 2714824e7fdSDimitry Andric if (MI.getOpcode() == TargetOpcode::G_INTRINSIC && 2724824e7fdSDimitry Andric MI.getIntrinsicID() == Intrinsic::amdgcn_rcp) 2734824e7fdSDimitry Andric ResMI = MRI.getVRegDef(MI.getOperand(2).getReg()); 2744824e7fdSDimitry Andric 2754824e7fdSDimitry Andric return ResMI; 2764824e7fdSDimitry Andric }; 2774824e7fdSDimitry Andric 2784824e7fdSDimitry Andric auto getSqrtSrc = [=](const MachineInstr &MI) { 2794824e7fdSDimitry Andric MachineInstr *SqrtSrcMI = nullptr; 280bdd1243dSDimitry Andric auto Match = 2814824e7fdSDimitry Andric mi_match(MI.getOperand(0).getReg(), MRI, m_GFSqrt(m_MInstr(SqrtSrcMI))); 282bdd1243dSDimitry Andric (void)Match; 2834824e7fdSDimitry Andric return SqrtSrcMI; 2844824e7fdSDimitry Andric }; 2854824e7fdSDimitry Andric 2864824e7fdSDimitry Andric MachineInstr *RcpSrcMI = nullptr, *SqrtSrcMI = nullptr; 2874824e7fdSDimitry Andric // rcp(sqrt(x)) 2884824e7fdSDimitry Andric if ((RcpSrcMI = getRcpSrc(MI)) && (SqrtSrcMI = getSqrtSrc(*RcpSrcMI))) { 2894824e7fdSDimitry Andric MatchInfo = [SqrtSrcMI, &MI](MachineIRBuilder &B) { 2904824e7fdSDimitry Andric B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)}, false) 2914824e7fdSDimitry Andric .addUse(SqrtSrcMI->getOperand(0).getReg()) 2924824e7fdSDimitry Andric 
.setMIFlags(MI.getFlags()); 2934824e7fdSDimitry Andric }; 2944824e7fdSDimitry Andric return true; 2954824e7fdSDimitry Andric } 2964824e7fdSDimitry Andric 2974824e7fdSDimitry Andric // sqrt(rcp(x)) 2984824e7fdSDimitry Andric if ((SqrtSrcMI = getSqrtSrc(MI)) && (RcpSrcMI = getRcpSrc(*SqrtSrcMI))) { 2994824e7fdSDimitry Andric MatchInfo = [RcpSrcMI, &MI](MachineIRBuilder &B) { 3004824e7fdSDimitry Andric B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)}, false) 3014824e7fdSDimitry Andric .addUse(RcpSrcMI->getOperand(0).getReg()) 3024824e7fdSDimitry Andric .setMIFlags(MI.getFlags()); 3034824e7fdSDimitry Andric }; 3044824e7fdSDimitry Andric return true; 3054824e7fdSDimitry Andric } 3064824e7fdSDimitry Andric 3074824e7fdSDimitry Andric return false; 3084824e7fdSDimitry Andric } 3094824e7fdSDimitry Andric 310*06c3fb27SDimitry Andric bool AMDGPUPostLegalizerCombinerImpl::matchCvtF32UByteN( 311*06c3fb27SDimitry Andric MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) const { 3125ffd83dbSDimitry Andric Register SrcReg = MI.getOperand(1).getReg(); 3135ffd83dbSDimitry Andric 3145ffd83dbSDimitry Andric // Look through G_ZEXT. 
315bdd1243dSDimitry Andric bool IsShr = mi_match(SrcReg, MRI, m_GZExt(m_Reg(SrcReg))); 3165ffd83dbSDimitry Andric 3175ffd83dbSDimitry Andric Register Src0; 3185ffd83dbSDimitry Andric int64_t ShiftAmt; 319bdd1243dSDimitry Andric IsShr = mi_match(SrcReg, MRI, m_GLShr(m_Reg(Src0), m_ICst(ShiftAmt))); 3205ffd83dbSDimitry Andric if (IsShr || mi_match(SrcReg, MRI, m_GShl(m_Reg(Src0), m_ICst(ShiftAmt)))) { 3215ffd83dbSDimitry Andric const unsigned Offset = MI.getOpcode() - AMDGPU::G_AMDGPU_CVT_F32_UBYTE0; 3225ffd83dbSDimitry Andric 3235ffd83dbSDimitry Andric unsigned ShiftOffset = 8 * Offset; 3245ffd83dbSDimitry Andric if (IsShr) 3255ffd83dbSDimitry Andric ShiftOffset += ShiftAmt; 3265ffd83dbSDimitry Andric else 3275ffd83dbSDimitry Andric ShiftOffset -= ShiftAmt; 3285ffd83dbSDimitry Andric 3295ffd83dbSDimitry Andric MatchInfo.CvtVal = Src0; 3305ffd83dbSDimitry Andric MatchInfo.ShiftOffset = ShiftOffset; 3315ffd83dbSDimitry Andric return ShiftOffset < 32 && ShiftOffset >= 8 && (ShiftOffset % 8) == 0; 3325ffd83dbSDimitry Andric } 3335ffd83dbSDimitry Andric 3345ffd83dbSDimitry Andric // TODO: Simplify demanded bits. 
3355ffd83dbSDimitry Andric return false; 3365ffd83dbSDimitry Andric } 3375ffd83dbSDimitry Andric 338*06c3fb27SDimitry Andric void AMDGPUPostLegalizerCombinerImpl::applyCvtF32UByteN( 339*06c3fb27SDimitry Andric MachineInstr &MI, const CvtF32UByteMatchInfo &MatchInfo) const { 340e8d8bef9SDimitry Andric B.setInstrAndDebugLoc(MI); 3415ffd83dbSDimitry Andric unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8; 3425ffd83dbSDimitry Andric 3435ffd83dbSDimitry Andric const LLT S32 = LLT::scalar(32); 3445ffd83dbSDimitry Andric Register CvtSrc = MatchInfo.CvtVal; 345e8d8bef9SDimitry Andric LLT SrcTy = MRI.getType(MatchInfo.CvtVal); 3465ffd83dbSDimitry Andric if (SrcTy != S32) { 3475ffd83dbSDimitry Andric assert(SrcTy.isScalar() && SrcTy.getSizeInBits() >= 8); 3485ffd83dbSDimitry Andric CvtSrc = B.buildAnyExt(S32, CvtSrc).getReg(0); 3495ffd83dbSDimitry Andric } 3505ffd83dbSDimitry Andric 3515ffd83dbSDimitry Andric assert(MI.getOpcode() != NewOpc); 3525ffd83dbSDimitry Andric B.buildInstr(NewOpc, {MI.getOperand(0)}, {CvtSrc}, MI.getFlags()); 3535ffd83dbSDimitry Andric MI.eraseFromParent(); 3545ffd83dbSDimitry Andric } 3555ffd83dbSDimitry Andric 356*06c3fb27SDimitry Andric bool AMDGPUPostLegalizerCombinerImpl::matchRemoveFcanonicalize( 357*06c3fb27SDimitry Andric MachineInstr &MI, Register &Reg) const { 358fe6060f1SDimitry Andric const SITargetLowering *TLI = static_cast<const SITargetLowering *>( 359fe6060f1SDimitry Andric MF.getSubtarget().getTargetLowering()); 360fe6060f1SDimitry Andric Reg = MI.getOperand(1).getReg(); 361fe6060f1SDimitry Andric return TLI->isCanonicalized(Reg, MF); 362fe6060f1SDimitry Andric } 363fe6060f1SDimitry Andric 364*06c3fb27SDimitry Andric // The buffer_load_{i8, i16} intrinsics are intially lowered as buffer_load_{u8, 365*06c3fb27SDimitry Andric // u16} instructions. 
Here, the buffer_load_{u8, u16} instructions are combined 366*06c3fb27SDimitry Andric // with sign extension instrucions in order to generate buffer_load_{i8, i16} 367*06c3fb27SDimitry Andric // instructions. 368e8d8bef9SDimitry Andric 369*06c3fb27SDimitry Andric // Identify buffer_load_{u8, u16}. 370*06c3fb27SDimitry Andric bool AMDGPUPostLegalizerCombinerImpl::matchCombineSignExtendInReg( 371*06c3fb27SDimitry Andric MachineInstr &MI, MachineInstr *&SubwordBufferLoad) const { 372*06c3fb27SDimitry Andric Register Op0Reg = MI.getOperand(1).getReg(); 373*06c3fb27SDimitry Andric SubwordBufferLoad = MRI.getVRegDef(Op0Reg); 374bdd1243dSDimitry Andric 375*06c3fb27SDimitry Andric if (!MRI.hasOneNonDBGUse(Op0Reg)) 376*06c3fb27SDimitry Andric return false; 377e8d8bef9SDimitry Andric 378*06c3fb27SDimitry Andric // Check if the first operand of the sign extension is a subword buffer load 379*06c3fb27SDimitry Andric // instruction. 380*06c3fb27SDimitry Andric return SubwordBufferLoad->getOpcode() == AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE || 381*06c3fb27SDimitry Andric SubwordBufferLoad->getOpcode() == AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; 382*06c3fb27SDimitry Andric } 3835ffd83dbSDimitry Andric 384*06c3fb27SDimitry Andric // Combine buffer_load_{u8, u16} and the sign extension instruction to generate 385*06c3fb27SDimitry Andric // buffer_load_{i8, i16}. 386*06c3fb27SDimitry Andric void AMDGPUPostLegalizerCombinerImpl::applyCombineSignExtendInReg( 387*06c3fb27SDimitry Andric MachineInstr &MI, MachineInstr *&SubwordBufferLoad) const { 388*06c3fb27SDimitry Andric // Modify the opcode and the destination of buffer_load_{u8, u16}: 389*06c3fb27SDimitry Andric // Replace the opcode. 390*06c3fb27SDimitry Andric unsigned Opc = 391*06c3fb27SDimitry Andric SubwordBufferLoad->getOpcode() == AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE 392*06c3fb27SDimitry Andric ? 
AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE 393*06c3fb27SDimitry Andric : AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT; 394*06c3fb27SDimitry Andric SubwordBufferLoad->setDesc(TII.get(Opc)); 395*06c3fb27SDimitry Andric // Update the destination register of SubwordBufferLoad with the destination 396*06c3fb27SDimitry Andric // register of the sign extension. 397*06c3fb27SDimitry Andric Register SignExtendInsnDst = MI.getOperand(0).getReg(); 398*06c3fb27SDimitry Andric SubwordBufferLoad->getOperand(0).setReg(SignExtendInsnDst); 399*06c3fb27SDimitry Andric // Remove the sign extension. 400*06c3fb27SDimitry Andric MI.eraseFromParent(); 401*06c3fb27SDimitry Andric } 4025ffd83dbSDimitry Andric 403e8d8bef9SDimitry Andric class AMDGPUPostLegalizerCombinerInfo final : public CombinerInfo { 4045ffd83dbSDimitry Andric GISelKnownBits *KB; 4055ffd83dbSDimitry Andric MachineDominatorTree *MDT; 406*06c3fb27SDimitry Andric AMDGPUPostLegalizerCombinerImplRuleConfig RuleConfig; 4075ffd83dbSDimitry Andric 4085ffd83dbSDimitry Andric public: 409*06c3fb27SDimitry Andric AMDGPUPostLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize, 4105ffd83dbSDimitry Andric const AMDGPULegalizerInfo *LI, 4115ffd83dbSDimitry Andric GISelKnownBits *KB, MachineDominatorTree *MDT) 4125ffd83dbSDimitry Andric : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true, 4135ffd83dbSDimitry Andric /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize), 414*06c3fb27SDimitry Andric KB(KB), MDT(MDT) { 415*06c3fb27SDimitry Andric if (!RuleConfig.parseCommandLineOption()) 4165ffd83dbSDimitry Andric report_fatal_error("Invalid rule identifier"); 4175ffd83dbSDimitry Andric } 4185ffd83dbSDimitry Andric 4195ffd83dbSDimitry Andric bool combine(GISelChangeObserver &Observer, MachineInstr &MI, 4205ffd83dbSDimitry Andric MachineIRBuilder &B) const override; 4215ffd83dbSDimitry Andric }; 4225ffd83dbSDimitry Andric 4235ffd83dbSDimitry Andric bool AMDGPUPostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer, 
4245ffd83dbSDimitry Andric MachineInstr &MI, 4255ffd83dbSDimitry Andric MachineIRBuilder &B) const { 426bdd1243dSDimitry Andric AMDGPUCombinerHelper Helper(Observer, B, /*IsPreLegalize*/ false, KB, MDT, 427bdd1243dSDimitry Andric LInfo); 428*06c3fb27SDimitry Andric // TODO: Do not re-create the Impl on every inst, it should be per function. 429*06c3fb27SDimitry Andric AMDGPUPostLegalizerCombinerImpl Impl(RuleConfig, B, Helper, Observer); 430*06c3fb27SDimitry Andric Impl.setupMF(*MI.getMF(), KB); 4315ffd83dbSDimitry Andric 432*06c3fb27SDimitry Andric if (Impl.tryCombineAll(MI)) 4335ffd83dbSDimitry Andric return true; 4345ffd83dbSDimitry Andric 4355ffd83dbSDimitry Andric switch (MI.getOpcode()) { 4365ffd83dbSDimitry Andric case TargetOpcode::G_SHL: 4375ffd83dbSDimitry Andric case TargetOpcode::G_LSHR: 4385ffd83dbSDimitry Andric case TargetOpcode::G_ASHR: 4395ffd83dbSDimitry Andric // On some subtargets, 64-bit shift is a quarter rate instruction. In the 4405ffd83dbSDimitry Andric // common case, splitting this into a move and a 32-bit shift is faster and 4415ffd83dbSDimitry Andric // the same code size. 
4425ffd83dbSDimitry Andric return Helper.tryCombineShiftToUnmerge(MI, 32); 4435ffd83dbSDimitry Andric } 4445ffd83dbSDimitry Andric 4455ffd83dbSDimitry Andric return false; 4465ffd83dbSDimitry Andric } 4475ffd83dbSDimitry Andric 4485ffd83dbSDimitry Andric // Pass boilerplate 4495ffd83dbSDimitry Andric // ================ 4505ffd83dbSDimitry Andric 4515ffd83dbSDimitry Andric class AMDGPUPostLegalizerCombiner : public MachineFunctionPass { 4525ffd83dbSDimitry Andric public: 4535ffd83dbSDimitry Andric static char ID; 4545ffd83dbSDimitry Andric 4555ffd83dbSDimitry Andric AMDGPUPostLegalizerCombiner(bool IsOptNone = false); 4565ffd83dbSDimitry Andric 4575ffd83dbSDimitry Andric StringRef getPassName() const override { 4585ffd83dbSDimitry Andric return "AMDGPUPostLegalizerCombiner"; 4595ffd83dbSDimitry Andric } 4605ffd83dbSDimitry Andric 4615ffd83dbSDimitry Andric bool runOnMachineFunction(MachineFunction &MF) override; 4625ffd83dbSDimitry Andric 4635ffd83dbSDimitry Andric void getAnalysisUsage(AnalysisUsage &AU) const override; 4645ffd83dbSDimitry Andric private: 4655ffd83dbSDimitry Andric bool IsOptNone; 4665ffd83dbSDimitry Andric }; 4675ffd83dbSDimitry Andric } // end anonymous namespace 4685ffd83dbSDimitry Andric 4695ffd83dbSDimitry Andric void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { 4705ffd83dbSDimitry Andric AU.addRequired<TargetPassConfig>(); 4715ffd83dbSDimitry Andric AU.setPreservesCFG(); 4725ffd83dbSDimitry Andric getSelectionDAGFallbackAnalysisUsage(AU); 4735ffd83dbSDimitry Andric AU.addRequired<GISelKnownBitsAnalysis>(); 4745ffd83dbSDimitry Andric AU.addPreserved<GISelKnownBitsAnalysis>(); 4755ffd83dbSDimitry Andric if (!IsOptNone) { 4765ffd83dbSDimitry Andric AU.addRequired<MachineDominatorTree>(); 4775ffd83dbSDimitry Andric AU.addPreserved<MachineDominatorTree>(); 4785ffd83dbSDimitry Andric } 4795ffd83dbSDimitry Andric MachineFunctionPass::getAnalysisUsage(AU); 4805ffd83dbSDimitry Andric } 4815ffd83dbSDimitry Andric 
AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
    : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
  initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry());
}

bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  // Skip functions where GlobalISel instruction selection already failed;
  // the SelectionDAG fallback will handle them.
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;
  auto *TPC = &getAnalysis<TargetPassConfig>();
  const Function &F = MF.getFunction();
  // Combines are only enabled when optimizing and the function is not
  // opted out (optnone / skipFunction).
  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const AMDGPULegalizerInfo *LI =
      static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());

  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
  // The dominator tree is only requested above -O0 (see getAnalysisUsage),
  // so it must not be queried when IsOptNone.
  MachineDominatorTree *MDT =
      IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
  AMDGPUPostLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
                                         F.hasMinSize(), LI, KB, MDT);
  // No CSE info: this combiner runs without a CSE-aware builder.
  Combiner C(PCInfo, TPC);
  return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
}

char AMDGPUPostLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                      "Combine AMDGPU machine instrs after legalization", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                    "Combine AMDGPU machine instrs after legalization", false,
                    false)

namespace llvm {
// Factory used by the AMDGPU target pass pipeline.
FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone) {
  return new AMDGPUPostLegalizerCombiner(IsOptNone);
}
} // end namespace llvm