106c3fb27SDimitry Andric //=== lib/CodeGen/GlobalISel/AMDGPUPostLegalizerCombiner.cpp --------------===// 25ffd83dbSDimitry Andric // 35ffd83dbSDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 45ffd83dbSDimitry Andric // See https://llvm.org/LICENSE.txt for license information. 55ffd83dbSDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 65ffd83dbSDimitry Andric // 75ffd83dbSDimitry Andric //===----------------------------------------------------------------------===// 85ffd83dbSDimitry Andric // 95ffd83dbSDimitry Andric // This pass does combining of machine instructions at the generic MI level, 105ffd83dbSDimitry Andric // after the legalizer. 115ffd83dbSDimitry Andric // 125ffd83dbSDimitry Andric //===----------------------------------------------------------------------===// 135ffd83dbSDimitry Andric 14e8d8bef9SDimitry Andric #include "AMDGPU.h" 15349cc55cSDimitry Andric #include "AMDGPUCombinerHelper.h" 165ffd83dbSDimitry Andric #include "AMDGPULegalizerInfo.h" 17e8d8bef9SDimitry Andric #include "GCNSubtarget.h" 18e8d8bef9SDimitry Andric #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 195ffd83dbSDimitry Andric #include "llvm/CodeGen/GlobalISel/Combiner.h" 205ffd83dbSDimitry Andric #include "llvm/CodeGen/GlobalISel/CombinerHelper.h" 215ffd83dbSDimitry Andric #include "llvm/CodeGen/GlobalISel/CombinerInfo.h" 2206c3fb27SDimitry Andric #include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h" 235ffd83dbSDimitry Andric #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" 245f757f3fSDimitry Andric #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" 255ffd83dbSDimitry Andric #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" 265ffd83dbSDimitry Andric #include "llvm/CodeGen/MachineDominators.h" 275ffd83dbSDimitry Andric #include "llvm/CodeGen/TargetPassConfig.h" 284824e7fdSDimitry Andric #include "llvm/IR/IntrinsicsAMDGPU.h" 29e8d8bef9SDimitry Andric #include "llvm/Target/TargetMachine.h" 
305ffd83dbSDimitry Andric 3106c3fb27SDimitry Andric #define GET_GICOMBINER_DEPS 3206c3fb27SDimitry Andric #include "AMDGPUGenPreLegalizeGICombiner.inc" 3306c3fb27SDimitry Andric #undef GET_GICOMBINER_DEPS 3406c3fb27SDimitry Andric 355ffd83dbSDimitry Andric #define DEBUG_TYPE "amdgpu-postlegalizer-combiner" 365ffd83dbSDimitry Andric 375ffd83dbSDimitry Andric using namespace llvm; 385ffd83dbSDimitry Andric using namespace MIPatternMatch; 395ffd83dbSDimitry Andric 4006c3fb27SDimitry Andric namespace { 4106c3fb27SDimitry Andric #define GET_GICOMBINER_TYPES 4206c3fb27SDimitry Andric #include "AMDGPUGenPostLegalizeGICombiner.inc" 4306c3fb27SDimitry Andric #undef GET_GICOMBINER_TYPES 4406c3fb27SDimitry Andric 455f757f3fSDimitry Andric class AMDGPUPostLegalizerCombinerImpl : public Combiner { 46e8d8bef9SDimitry Andric protected: 4706c3fb27SDimitry Andric const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig; 4806c3fb27SDimitry Andric const GCNSubtarget &STI; 4906c3fb27SDimitry Andric const SIInstrInfo &TII; 505f757f3fSDimitry Andric // TODO: Make CombinerHelper methods const. 
515f757f3fSDimitry Andric mutable AMDGPUCombinerHelper Helper; 52e8d8bef9SDimitry Andric 53e8d8bef9SDimitry Andric public: 5406c3fb27SDimitry Andric AMDGPUPostLegalizerCombinerImpl( 555f757f3fSDimitry Andric MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC, 565f757f3fSDimitry Andric GISelKnownBits &KB, GISelCSEInfo *CSEInfo, 5706c3fb27SDimitry Andric const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig, 585f757f3fSDimitry Andric const GCNSubtarget &STI, MachineDominatorTree *MDT, 595f757f3fSDimitry Andric const LegalizerInfo *LI); 6006c3fb27SDimitry Andric 6106c3fb27SDimitry Andric static const char *getName() { return "AMDGPUPostLegalizerCombinerImpl"; } 6206c3fb27SDimitry Andric 635f757f3fSDimitry Andric bool tryCombineAllImpl(MachineInstr &I) const; 645f757f3fSDimitry Andric bool tryCombineAll(MachineInstr &I) const override; 65e8d8bef9SDimitry Andric 665ffd83dbSDimitry Andric struct FMinFMaxLegacyInfo { 675ffd83dbSDimitry Andric Register LHS; 685ffd83dbSDimitry Andric Register RHS; 695ffd83dbSDimitry Andric Register True; 705ffd83dbSDimitry Andric Register False; 715ffd83dbSDimitry Andric CmpInst::Predicate Pred; 725ffd83dbSDimitry Andric }; 735ffd83dbSDimitry Andric 745ffd83dbSDimitry Andric // TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize 7506c3fb27SDimitry Andric bool matchFMinFMaxLegacy(MachineInstr &MI, FMinFMaxLegacyInfo &Info) const; 76e8d8bef9SDimitry Andric void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI, 7706c3fb27SDimitry Andric const FMinFMaxLegacyInfo &Info) const; 78e8d8bef9SDimitry Andric 7906c3fb27SDimitry Andric bool matchUCharToFloat(MachineInstr &MI) const; 8006c3fb27SDimitry Andric void applyUCharToFloat(MachineInstr &MI) const; 81e8d8bef9SDimitry Andric 8206c3fb27SDimitry Andric bool 8306c3fb27SDimitry Andric matchRcpSqrtToRsq(MachineInstr &MI, 8406c3fb27SDimitry Andric std::function<void(MachineIRBuilder &)> &MatchInfo) const; 854824e7fdSDimitry Andric 86e8d8bef9SDimitry Andric // FIXME: 
Should be able to have 2 separate matchdatas rather than custom 87e8d8bef9SDimitry Andric // struct boilerplate. 88e8d8bef9SDimitry Andric struct CvtF32UByteMatchInfo { 89e8d8bef9SDimitry Andric Register CvtVal; 90e8d8bef9SDimitry Andric unsigned ShiftOffset; 91e8d8bef9SDimitry Andric }; 92e8d8bef9SDimitry Andric 9306c3fb27SDimitry Andric bool matchCvtF32UByteN(MachineInstr &MI, 9406c3fb27SDimitry Andric CvtF32UByteMatchInfo &MatchInfo) const; 95e8d8bef9SDimitry Andric void applyCvtF32UByteN(MachineInstr &MI, 9606c3fb27SDimitry Andric const CvtF32UByteMatchInfo &MatchInfo) const; 97fe6060f1SDimitry Andric 9806c3fb27SDimitry Andric bool matchRemoveFcanonicalize(MachineInstr &MI, Register &Reg) const; 9906c3fb27SDimitry Andric 10006c3fb27SDimitry Andric // Combine unsigned buffer load and signed extension instructions to generate 10106c3fb27SDimitry Andric // signed buffer laod instructions. 102*297eecfbSDimitry Andric bool matchCombineSignExtendInReg( 103*297eecfbSDimitry Andric MachineInstr &MI, std::pair<MachineInstr *, unsigned> &MatchInfo) const; 104*297eecfbSDimitry Andric void applyCombineSignExtendInReg( 105*297eecfbSDimitry Andric MachineInstr &MI, std::pair<MachineInstr *, unsigned> &MatchInfo) const; 10606c3fb27SDimitry Andric 1071db9f3b2SDimitry Andric // Find the s_mul_u64 instructions where the higher bits are either 1081db9f3b2SDimitry Andric // zero-extended or sign-extended. 1091db9f3b2SDimitry Andric bool matchCombine_s_mul_u64(MachineInstr &MI, unsigned &NewOpcode) const; 1101db9f3b2SDimitry Andric // Replace the s_mul_u64 instructions with S_MUL_I64_I32_PSEUDO if the higher 1111db9f3b2SDimitry Andric // 33 bits are sign extended and with S_MUL_U64_U32_PSEUDO if the higher 32 1121db9f3b2SDimitry Andric // bits are zero extended. 
1131db9f3b2SDimitry Andric void applyCombine_s_mul_u64(MachineInstr &MI, unsigned &NewOpcode) const; 1141db9f3b2SDimitry Andric 11506c3fb27SDimitry Andric private: 11606c3fb27SDimitry Andric #define GET_GICOMBINER_CLASS_MEMBERS 11706c3fb27SDimitry Andric #define AMDGPUSubtarget GCNSubtarget 11806c3fb27SDimitry Andric #include "AMDGPUGenPostLegalizeGICombiner.inc" 11906c3fb27SDimitry Andric #undef GET_GICOMBINER_CLASS_MEMBERS 12006c3fb27SDimitry Andric #undef AMDGPUSubtarget 121e8d8bef9SDimitry Andric }; 122e8d8bef9SDimitry Andric 12306c3fb27SDimitry Andric #define GET_GICOMBINER_IMPL 12406c3fb27SDimitry Andric #define AMDGPUSubtarget GCNSubtarget 12506c3fb27SDimitry Andric #include "AMDGPUGenPostLegalizeGICombiner.inc" 12606c3fb27SDimitry Andric #undef AMDGPUSubtarget 12706c3fb27SDimitry Andric #undef GET_GICOMBINER_IMPL 12806c3fb27SDimitry Andric 12906c3fb27SDimitry Andric AMDGPUPostLegalizerCombinerImpl::AMDGPUPostLegalizerCombinerImpl( 1305f757f3fSDimitry Andric MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC, 1315f757f3fSDimitry Andric GISelKnownBits &KB, GISelCSEInfo *CSEInfo, 13206c3fb27SDimitry Andric const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig, 1335f757f3fSDimitry Andric const GCNSubtarget &STI, MachineDominatorTree *MDT, const LegalizerInfo *LI) 1345f757f3fSDimitry Andric : Combiner(MF, CInfo, TPC, &KB, CSEInfo), RuleConfig(RuleConfig), STI(STI), 1355f757f3fSDimitry Andric TII(*STI.getInstrInfo()), 1365f757f3fSDimitry Andric Helper(Observer, B, /*IsPreLegalize*/ false, &KB, MDT, LI), 13706c3fb27SDimitry Andric #define GET_GICOMBINER_CONSTRUCTOR_INITS 13806c3fb27SDimitry Andric #include "AMDGPUGenPostLegalizeGICombiner.inc" 13906c3fb27SDimitry Andric #undef GET_GICOMBINER_CONSTRUCTOR_INITS 14006c3fb27SDimitry Andric { 14106c3fb27SDimitry Andric } 14206c3fb27SDimitry Andric 1435f757f3fSDimitry Andric bool AMDGPUPostLegalizerCombinerImpl::tryCombineAll(MachineInstr &MI) const { 1445f757f3fSDimitry Andric if 
(tryCombineAllImpl(MI)) 1455f757f3fSDimitry Andric return true; 1465f757f3fSDimitry Andric 1475f757f3fSDimitry Andric switch (MI.getOpcode()) { 1485f757f3fSDimitry Andric case TargetOpcode::G_SHL: 1495f757f3fSDimitry Andric case TargetOpcode::G_LSHR: 1505f757f3fSDimitry Andric case TargetOpcode::G_ASHR: 1515f757f3fSDimitry Andric // On some subtargets, 64-bit shift is a quarter rate instruction. In the 1525f757f3fSDimitry Andric // common case, splitting this into a move and a 32-bit shift is faster and 1535f757f3fSDimitry Andric // the same code size. 1545f757f3fSDimitry Andric return Helper.tryCombineShiftToUnmerge(MI, 32); 1555f757f3fSDimitry Andric } 1565f757f3fSDimitry Andric 1575f757f3fSDimitry Andric return false; 1585f757f3fSDimitry Andric } 1595f757f3fSDimitry Andric 16006c3fb27SDimitry Andric bool AMDGPUPostLegalizerCombinerImpl::matchFMinFMaxLegacy( 16106c3fb27SDimitry Andric MachineInstr &MI, FMinFMaxLegacyInfo &Info) const { 1625ffd83dbSDimitry Andric // FIXME: Type predicate on pattern 1635ffd83dbSDimitry Andric if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32)) 1645ffd83dbSDimitry Andric return false; 1655ffd83dbSDimitry Andric 1665ffd83dbSDimitry Andric Register Cond = MI.getOperand(1).getReg(); 1675ffd83dbSDimitry Andric if (!MRI.hasOneNonDBGUse(Cond) || 1685ffd83dbSDimitry Andric !mi_match(Cond, MRI, 1695ffd83dbSDimitry Andric m_GFCmp(m_Pred(Info.Pred), m_Reg(Info.LHS), m_Reg(Info.RHS)))) 1705ffd83dbSDimitry Andric return false; 1715ffd83dbSDimitry Andric 1725ffd83dbSDimitry Andric Info.True = MI.getOperand(2).getReg(); 1735ffd83dbSDimitry Andric Info.False = MI.getOperand(3).getReg(); 1745ffd83dbSDimitry Andric 17506c3fb27SDimitry Andric // TODO: Handle case where the the selected value is an fneg and the compared 17606c3fb27SDimitry Andric // constant is the negation of the selected value. 
1775ffd83dbSDimitry Andric if (!(Info.LHS == Info.True && Info.RHS == Info.False) && 1785ffd83dbSDimitry Andric !(Info.LHS == Info.False && Info.RHS == Info.True)) 1795ffd83dbSDimitry Andric return false; 1805ffd83dbSDimitry Andric 1815ffd83dbSDimitry Andric switch (Info.Pred) { 1825ffd83dbSDimitry Andric case CmpInst::FCMP_FALSE: 1835ffd83dbSDimitry Andric case CmpInst::FCMP_OEQ: 1845ffd83dbSDimitry Andric case CmpInst::FCMP_ONE: 1855ffd83dbSDimitry Andric case CmpInst::FCMP_ORD: 1865ffd83dbSDimitry Andric case CmpInst::FCMP_UNO: 1875ffd83dbSDimitry Andric case CmpInst::FCMP_UEQ: 1885ffd83dbSDimitry Andric case CmpInst::FCMP_UNE: 1895ffd83dbSDimitry Andric case CmpInst::FCMP_TRUE: 1905ffd83dbSDimitry Andric return false; 1915ffd83dbSDimitry Andric default: 1925ffd83dbSDimitry Andric return true; 1935ffd83dbSDimitry Andric } 1945ffd83dbSDimitry Andric } 1955ffd83dbSDimitry Andric 19606c3fb27SDimitry Andric void AMDGPUPostLegalizerCombinerImpl::applySelectFCmpToFMinToFMaxLegacy( 19706c3fb27SDimitry Andric MachineInstr &MI, const FMinFMaxLegacyInfo &Info) const { 198e8d8bef9SDimitry Andric B.setInstrAndDebugLoc(MI); 199e8d8bef9SDimitry Andric auto buildNewInst = [&MI, this](unsigned Opc, Register X, Register Y) { 200e8d8bef9SDimitry Andric B.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags()); 2015ffd83dbSDimitry Andric }; 2025ffd83dbSDimitry Andric 2035ffd83dbSDimitry Andric switch (Info.Pred) { 2045ffd83dbSDimitry Andric case CmpInst::FCMP_ULT: 2055ffd83dbSDimitry Andric case CmpInst::FCMP_ULE: 2065ffd83dbSDimitry Andric if (Info.LHS == Info.True) 2075ffd83dbSDimitry Andric buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS); 2085ffd83dbSDimitry Andric else 2095ffd83dbSDimitry Andric buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS); 2105ffd83dbSDimitry Andric break; 2115ffd83dbSDimitry Andric case CmpInst::FCMP_OLE: 2125ffd83dbSDimitry Andric case CmpInst::FCMP_OLT: { 2135ffd83dbSDimitry Andric // We need to permute the operands to 
get the correct NaN behavior. The 2145ffd83dbSDimitry Andric // selected operand is the second one based on the failing compare with NaN, 2155ffd83dbSDimitry Andric // so permute it based on the compare type the hardware uses. 2165ffd83dbSDimitry Andric if (Info.LHS == Info.True) 2175ffd83dbSDimitry Andric buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS); 2185ffd83dbSDimitry Andric else 2195ffd83dbSDimitry Andric buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS); 2205ffd83dbSDimitry Andric break; 2215ffd83dbSDimitry Andric } 2225ffd83dbSDimitry Andric case CmpInst::FCMP_UGE: 2235ffd83dbSDimitry Andric case CmpInst::FCMP_UGT: { 2245ffd83dbSDimitry Andric if (Info.LHS == Info.True) 2255ffd83dbSDimitry Andric buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS); 2265ffd83dbSDimitry Andric else 2275ffd83dbSDimitry Andric buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS); 2285ffd83dbSDimitry Andric break; 2295ffd83dbSDimitry Andric } 2305ffd83dbSDimitry Andric case CmpInst::FCMP_OGT: 2315ffd83dbSDimitry Andric case CmpInst::FCMP_OGE: { 2325ffd83dbSDimitry Andric if (Info.LHS == Info.True) 2335ffd83dbSDimitry Andric buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS); 2345ffd83dbSDimitry Andric else 2355ffd83dbSDimitry Andric buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS); 2365ffd83dbSDimitry Andric break; 2375ffd83dbSDimitry Andric } 2385ffd83dbSDimitry Andric default: 2395ffd83dbSDimitry Andric llvm_unreachable("predicate should not have matched"); 2405ffd83dbSDimitry Andric } 2415ffd83dbSDimitry Andric 2425ffd83dbSDimitry Andric MI.eraseFromParent(); 2435ffd83dbSDimitry Andric } 2445ffd83dbSDimitry Andric 24506c3fb27SDimitry Andric bool AMDGPUPostLegalizerCombinerImpl::matchUCharToFloat( 24606c3fb27SDimitry Andric MachineInstr &MI) const { 2475ffd83dbSDimitry Andric Register DstReg = MI.getOperand(0).getReg(); 2485ffd83dbSDimitry Andric 2495ffd83dbSDimitry Andric // TODO: We could try to 
match extracting the higher bytes, which would be 2505ffd83dbSDimitry Andric // easier if i8 vectors weren't promoted to i32 vectors, particularly after 2515ffd83dbSDimitry Andric // types are legalized. v4i8 -> v4f32 is probably the only case to worry 2525ffd83dbSDimitry Andric // about in practice. 2535ffd83dbSDimitry Andric LLT Ty = MRI.getType(DstReg); 2545ffd83dbSDimitry Andric if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) { 2555ffd83dbSDimitry Andric Register SrcReg = MI.getOperand(1).getReg(); 2565ffd83dbSDimitry Andric unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits(); 2575ffd83dbSDimitry Andric assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64); 2585ffd83dbSDimitry Andric const APInt Mask = APInt::getHighBitsSet(SrcSize, SrcSize - 8); 2595ffd83dbSDimitry Andric return Helper.getKnownBits()->maskedValueIsZero(SrcReg, Mask); 2605ffd83dbSDimitry Andric } 2615ffd83dbSDimitry Andric 2625ffd83dbSDimitry Andric return false; 2635ffd83dbSDimitry Andric } 2645ffd83dbSDimitry Andric 26506c3fb27SDimitry Andric void AMDGPUPostLegalizerCombinerImpl::applyUCharToFloat( 26606c3fb27SDimitry Andric MachineInstr &MI) const { 267e8d8bef9SDimitry Andric B.setInstrAndDebugLoc(MI); 2685ffd83dbSDimitry Andric 2695ffd83dbSDimitry Andric const LLT S32 = LLT::scalar(32); 2705ffd83dbSDimitry Andric 2715ffd83dbSDimitry Andric Register DstReg = MI.getOperand(0).getReg(); 2725ffd83dbSDimitry Andric Register SrcReg = MI.getOperand(1).getReg(); 273e8d8bef9SDimitry Andric LLT Ty = MRI.getType(DstReg); 274e8d8bef9SDimitry Andric LLT SrcTy = MRI.getType(SrcReg); 2755ffd83dbSDimitry Andric if (SrcTy != S32) 2765ffd83dbSDimitry Andric SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0); 2775ffd83dbSDimitry Andric 2785ffd83dbSDimitry Andric if (Ty == S32) { 27906c3fb27SDimitry Andric B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg}, {SrcReg}, 28006c3fb27SDimitry Andric MI.getFlags()); 2815ffd83dbSDimitry Andric } else { 28206c3fb27SDimitry Andric auto Cvt0 = 
B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32}, {SrcReg}, 28306c3fb27SDimitry Andric MI.getFlags()); 2845ffd83dbSDimitry Andric B.buildFPTrunc(DstReg, Cvt0, MI.getFlags()); 2855ffd83dbSDimitry Andric } 2865ffd83dbSDimitry Andric 2875ffd83dbSDimitry Andric MI.eraseFromParent(); 2885ffd83dbSDimitry Andric } 2895ffd83dbSDimitry Andric 29006c3fb27SDimitry Andric bool AMDGPUPostLegalizerCombinerImpl::matchRcpSqrtToRsq( 29106c3fb27SDimitry Andric MachineInstr &MI, 29206c3fb27SDimitry Andric std::function<void(MachineIRBuilder &)> &MatchInfo) const { 2935f757f3fSDimitry Andric auto getRcpSrc = [=](const MachineInstr &MI) -> MachineInstr * { 2945f757f3fSDimitry Andric if (!MI.getFlag(MachineInstr::FmContract)) 2955f757f3fSDimitry Andric return nullptr; 2964824e7fdSDimitry Andric 2975f757f3fSDimitry Andric if (auto *GI = dyn_cast<GIntrinsic>(&MI)) { 2985f757f3fSDimitry Andric if (GI->is(Intrinsic::amdgcn_rcp)) 2995f757f3fSDimitry Andric return MRI.getVRegDef(MI.getOperand(2).getReg()); 3005f757f3fSDimitry Andric } 3015f757f3fSDimitry Andric return nullptr; 3024824e7fdSDimitry Andric }; 3034824e7fdSDimitry Andric 3045f757f3fSDimitry Andric auto getSqrtSrc = [=](const MachineInstr &MI) -> MachineInstr * { 3055f757f3fSDimitry Andric if (!MI.getFlag(MachineInstr::FmContract)) 3065f757f3fSDimitry Andric return nullptr; 3074824e7fdSDimitry Andric MachineInstr *SqrtSrcMI = nullptr; 308bdd1243dSDimitry Andric auto Match = 3094824e7fdSDimitry Andric mi_match(MI.getOperand(0).getReg(), MRI, m_GFSqrt(m_MInstr(SqrtSrcMI))); 310bdd1243dSDimitry Andric (void)Match; 3114824e7fdSDimitry Andric return SqrtSrcMI; 3124824e7fdSDimitry Andric }; 3134824e7fdSDimitry Andric 3144824e7fdSDimitry Andric MachineInstr *RcpSrcMI = nullptr, *SqrtSrcMI = nullptr; 3154824e7fdSDimitry Andric // rcp(sqrt(x)) 3164824e7fdSDimitry Andric if ((RcpSrcMI = getRcpSrc(MI)) && (SqrtSrcMI = getSqrtSrc(*RcpSrcMI))) { 3174824e7fdSDimitry Andric MatchInfo = [SqrtSrcMI, &MI](MachineIRBuilder &B) { 
3185f757f3fSDimitry Andric B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)}) 3194824e7fdSDimitry Andric .addUse(SqrtSrcMI->getOperand(0).getReg()) 3204824e7fdSDimitry Andric .setMIFlags(MI.getFlags()); 3214824e7fdSDimitry Andric }; 3224824e7fdSDimitry Andric return true; 3234824e7fdSDimitry Andric } 3244824e7fdSDimitry Andric 3254824e7fdSDimitry Andric // sqrt(rcp(x)) 3264824e7fdSDimitry Andric if ((SqrtSrcMI = getSqrtSrc(MI)) && (RcpSrcMI = getRcpSrc(*SqrtSrcMI))) { 3274824e7fdSDimitry Andric MatchInfo = [RcpSrcMI, &MI](MachineIRBuilder &B) { 3285f757f3fSDimitry Andric B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)}) 3294824e7fdSDimitry Andric .addUse(RcpSrcMI->getOperand(0).getReg()) 3304824e7fdSDimitry Andric .setMIFlags(MI.getFlags()); 3314824e7fdSDimitry Andric }; 3324824e7fdSDimitry Andric return true; 3334824e7fdSDimitry Andric } 3344824e7fdSDimitry Andric return false; 3354824e7fdSDimitry Andric } 3364824e7fdSDimitry Andric 33706c3fb27SDimitry Andric bool AMDGPUPostLegalizerCombinerImpl::matchCvtF32UByteN( 33806c3fb27SDimitry Andric MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) const { 3395ffd83dbSDimitry Andric Register SrcReg = MI.getOperand(1).getReg(); 3405ffd83dbSDimitry Andric 3415ffd83dbSDimitry Andric // Look through G_ZEXT. 
342bdd1243dSDimitry Andric bool IsShr = mi_match(SrcReg, MRI, m_GZExt(m_Reg(SrcReg))); 3435ffd83dbSDimitry Andric 3445ffd83dbSDimitry Andric Register Src0; 3455ffd83dbSDimitry Andric int64_t ShiftAmt; 346bdd1243dSDimitry Andric IsShr = mi_match(SrcReg, MRI, m_GLShr(m_Reg(Src0), m_ICst(ShiftAmt))); 3475ffd83dbSDimitry Andric if (IsShr || mi_match(SrcReg, MRI, m_GShl(m_Reg(Src0), m_ICst(ShiftAmt)))) { 3485ffd83dbSDimitry Andric const unsigned Offset = MI.getOpcode() - AMDGPU::G_AMDGPU_CVT_F32_UBYTE0; 3495ffd83dbSDimitry Andric 3505ffd83dbSDimitry Andric unsigned ShiftOffset = 8 * Offset; 3515ffd83dbSDimitry Andric if (IsShr) 3525ffd83dbSDimitry Andric ShiftOffset += ShiftAmt; 3535ffd83dbSDimitry Andric else 3545ffd83dbSDimitry Andric ShiftOffset -= ShiftAmt; 3555ffd83dbSDimitry Andric 3565ffd83dbSDimitry Andric MatchInfo.CvtVal = Src0; 3575ffd83dbSDimitry Andric MatchInfo.ShiftOffset = ShiftOffset; 3585ffd83dbSDimitry Andric return ShiftOffset < 32 && ShiftOffset >= 8 && (ShiftOffset % 8) == 0; 3595ffd83dbSDimitry Andric } 3605ffd83dbSDimitry Andric 3615ffd83dbSDimitry Andric // TODO: Simplify demanded bits. 
3625ffd83dbSDimitry Andric return false; 3635ffd83dbSDimitry Andric } 3645ffd83dbSDimitry Andric 36506c3fb27SDimitry Andric void AMDGPUPostLegalizerCombinerImpl::applyCvtF32UByteN( 36606c3fb27SDimitry Andric MachineInstr &MI, const CvtF32UByteMatchInfo &MatchInfo) const { 367e8d8bef9SDimitry Andric B.setInstrAndDebugLoc(MI); 3685ffd83dbSDimitry Andric unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8; 3695ffd83dbSDimitry Andric 3705ffd83dbSDimitry Andric const LLT S32 = LLT::scalar(32); 3715ffd83dbSDimitry Andric Register CvtSrc = MatchInfo.CvtVal; 372e8d8bef9SDimitry Andric LLT SrcTy = MRI.getType(MatchInfo.CvtVal); 3735ffd83dbSDimitry Andric if (SrcTy != S32) { 3745ffd83dbSDimitry Andric assert(SrcTy.isScalar() && SrcTy.getSizeInBits() >= 8); 3755ffd83dbSDimitry Andric CvtSrc = B.buildAnyExt(S32, CvtSrc).getReg(0); 3765ffd83dbSDimitry Andric } 3775ffd83dbSDimitry Andric 3785ffd83dbSDimitry Andric assert(MI.getOpcode() != NewOpc); 3795ffd83dbSDimitry Andric B.buildInstr(NewOpc, {MI.getOperand(0)}, {CvtSrc}, MI.getFlags()); 3805ffd83dbSDimitry Andric MI.eraseFromParent(); 3815ffd83dbSDimitry Andric } 3825ffd83dbSDimitry Andric 38306c3fb27SDimitry Andric bool AMDGPUPostLegalizerCombinerImpl::matchRemoveFcanonicalize( 38406c3fb27SDimitry Andric MachineInstr &MI, Register &Reg) const { 385fe6060f1SDimitry Andric const SITargetLowering *TLI = static_cast<const SITargetLowering *>( 386fe6060f1SDimitry Andric MF.getSubtarget().getTargetLowering()); 387fe6060f1SDimitry Andric Reg = MI.getOperand(1).getReg(); 388fe6060f1SDimitry Andric return TLI->isCanonicalized(Reg, MF); 389fe6060f1SDimitry Andric } 390fe6060f1SDimitry Andric 39106c3fb27SDimitry Andric // The buffer_load_{i8, i16} intrinsics are intially lowered as buffer_load_{u8, 39206c3fb27SDimitry Andric // u16} instructions. 
// u16} instructions. Here, the buffer_load_{u8, u16} instructions are combined
// with sign extension instructions in order to generate buffer_load_{i8, i16}
// instructions.

// Identify buffer_load_{u8, u16}: on success, MatchData holds the load
// instruction and the signed-load opcode it should be rewritten to.
bool AMDGPUPostLegalizerCombinerImpl::matchCombineSignExtendInReg(
    MachineInstr &MI, std::pair<MachineInstr *, unsigned> &MatchData) const {
  // The apply step rewrites the load in place, so this is only safe when the
  // sign extension is the sole non-debug user of the loaded value.
  Register LoadReg = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(LoadReg))
    return false;

  // Check if the first operand of the sign extension is a subword buffer load
  // instruction.
  MachineInstr *LoadMI = MRI.getVRegDef(LoadReg);
  int64_t Width = MI.getOperand(2).getImm();
  switch (LoadMI->getOpcode()) {
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
    // Only fold when the extension width matches the 8 bits the load defines.
    MatchData = {LoadMI, AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE};
    return Width == 8;
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
    // Likewise, a 16-bit load only folds with a 16-bit extension.
    MatchData = {LoadMI, AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT};
    return Width == 16;
  }
  return false;
}

// Combine buffer_load_{u8, u16} and the sign extension instruction to generate
// buffer_load_{i8, i16}.
void AMDGPUPostLegalizerCombinerImpl::applyCombineSignExtendInReg(
    MachineInstr &MI, std::pair<MachineInstr *, unsigned> &MatchData) const {
  // Rewrite the unsigned load into its signed counterpart in place.
  auto [LoadMI, NewOpcode] = MatchData;
  LoadMI->setDesc(TII.get(NewOpcode));
  // Update the destination register of the load with the destination register
  // of the sign extension.
  Register SignExtendInsnDst = MI.getOperand(0).getReg();
  LoadMI->getOperand(0).setReg(SignExtendInsnDst);
  // Remove the sign extension.
  MI.eraseFromParent();
}

// Match a 64-bit G_MUL whose operands are provably narrow: if both operands
// have at least 32 known leading zero bits it can become an unsigned 32x32
// multiply; if both have at least 33 sign bits, a signed 32x32 multiply.
bool AMDGPUPostLegalizerCombinerImpl::matchCombine_s_mul_u64(
    MachineInstr &MI, unsigned &NewOpcode) const {
  Register Src0 = MI.getOperand(1).getReg();
  Register Src1 = MI.getOperand(2).getReg();
  if (MRI.getType(Src0) != LLT::scalar(64))
    return false;

  // Both inputs zero-extended from 32 bits -> unsigned narrow multiply.
  if (KB->getKnownBits(Src1).countMinLeadingZeros() >= 32 &&
      KB->getKnownBits(Src0).countMinLeadingZeros() >= 32) {
    NewOpcode = AMDGPU::G_AMDGPU_S_MUL_U64_U32;
    return true;
  }

  // Both inputs sign-extended from 32 bits -> signed narrow multiply.
  if (KB->computeNumSignBits(Src1) >= 33 &&
      KB->computeNumSignBits(Src0) >= 33) {
    NewOpcode = AMDGPU::G_AMDGPU_S_MUL_I64_I32;
    return true;
  }
  return false;
}

void AMDGPUPostLegalizerCombinerImpl::applyCombine_s_mul_u64(
    MachineInstr &MI, unsigned &NewOpcode) const {
  // Operands are unchanged; only the opcode is swapped.
  Helper.replaceOpcodeWith(MI, NewOpcode);
}

// Pass boilerplate
// ================

// Legacy MachineFunctionPass wrapper that drives the combiner implementation
// above over a function.
class AMDGPUPostLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUPostLegalizerCombiner(bool IsOptNone = false);

  StringRef getPassName() const override {
    return "AMDGPUPostLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  // At -O0 the dominator tree is neither required nor preserved.
  bool IsOptNone;
  AMDGPUPostLegalizerCombinerImplRuleConfig RuleConfig;
};
} // end anonymous namespace

void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  if (!IsOptNone) {
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
  }
  MachineFunctionPass::getAnalysisUsage(AU);
}

AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
    : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
  initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry());

  // Reject malformed -*-disable-rule style rule filters up front.
  if (!RuleConfig.parseCommandLineOption())
    report_fatal_error("Invalid rule identifier");
}

bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  // Don't touch functions the GlobalISel pipeline has already given up on.
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;
  auto *TPC = &getAnalysis<TargetPassConfig>();
  const Function &F = MF.getFunction();
  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const AMDGPULegalizerInfo *LI =
      static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());

  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
  MachineDominatorTree *MDT =
      IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();

  // Post-legalizer: illegal ops produced by combines must be re-legalized.
  CombinerInfo CInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
                     LI, EnableOpt, F.hasOptSize(), F.hasMinSize());

  AMDGPUPostLegalizerCombinerImpl Impl(MF, CInfo, TPC, *KB, /*CSEInfo*/ nullptr,
                                       RuleConfig, ST, MDT, LI);
  return Impl.combineMachineInstrs();
}

char AMDGPUPostLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                      "Combine AMDGPU machine instrs after legalization", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                    "Combine AMDGPU machine instrs after legalization", false,
                    false)

namespace llvm {
FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone) {
  return new AMDGPUPostLegalizerCombiner(IsOptNone);
}
} // end namespace llvm