//=== lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp -------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does combining of machine instructions at the generic MI level,
// after the legalizer.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUCombinerHelper.h"
#include "AMDGPULegalizerInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutor.h"
#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Target/TargetMachine.h"

#define GET_GICOMBINER_DEPS
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef GET_GICOMBINER_DEPS

#define DEBUG_TYPE "amdgpu-postlegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;

namespace {
#define GET_GICOMBINER_TYPES
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef GET_GICOMBINER_TYPES

class AMDGPUPostLegalizerCombinerImpl : public GIMatchTableExecutor {
protected:
  const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig;

  MachineIRBuilder &B;
  MachineFunction &MF;
  MachineRegisterInfo &MRI;
  const GCNSubtarget &STI;
  const SIInstrInfo &TII;
  AMDGPUCombinerHelper &Helper;
  GISelChangeObserver &Observer;

public:
  AMDGPUPostLegalizerCombinerImpl(
      const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig,
      MachineIRBuilder &B, AMDGPUCombinerHelper &Helper,
      GISelChangeObserver &Observer);

  static const char *getName() { return "AMDGPUPostLegalizerCombinerImpl"; }

  bool tryCombineAll(MachineInstr &I) const;

  struct FMinFMaxLegacyInfo {
    Register LHS;
    Register RHS;
    Register True;
    Register False;
    CmpInst::Predicate Pred;
  };

  // TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
  bool matchFMinFMaxLegacy(MachineInstr &MI, FMinFMaxLegacyInfo &Info) const;
  void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,
                                         const FMinFMaxLegacyInfo &Info) const;

  bool matchUCharToFloat(MachineInstr &MI) const;
  void applyUCharToFloat(MachineInstr &MI) const;

  bool
  matchRcpSqrtToRsq(MachineInstr &MI,
                    std::function<void(MachineIRBuilder &)> &MatchInfo) const;

  // FIXME: Should be able to have 2 separate matchdatas rather than custom
  // struct boilerplate.
  struct CvtF32UByteMatchInfo {
    Register CvtVal;
    unsigned ShiftOffset;
  };

  bool matchCvtF32UByteN(MachineInstr &MI,
                         CvtF32UByteMatchInfo &MatchInfo) const;
  void applyCvtF32UByteN(MachineInstr &MI,
                         const CvtF32UByteMatchInfo &MatchInfo) const;

  bool matchRemoveFcanonicalize(MachineInstr &MI, Register &Reg) const;

  // Combine unsigned buffer load and sign extension instructions to generate
  // signed buffer load instructions.
  bool matchCombineSignExtendInReg(MachineInstr &MI,
                                   MachineInstr *&MatchInfo) const;
  void applyCombineSignExtendInReg(MachineInstr &MI,
                                   MachineInstr *&MatchInfo) const;

private:
#define GET_GICOMBINER_CLASS_MEMBERS
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef GET_GICOMBINER_CLASS_MEMBERS
#undef AMDGPUSubtarget
};

#define GET_GICOMBINER_IMPL
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUSubtarget
#undef GET_GICOMBINER_IMPL

AMDGPUPostLegalizerCombinerImpl::AMDGPUPostLegalizerCombinerImpl(
    const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig,
    MachineIRBuilder &B, AMDGPUCombinerHelper &Helper,
    GISelChangeObserver &Observer)
    : RuleConfig(RuleConfig), B(B), MF(B.getMF()), MRI(*B.getMRI()),
      STI(MF.getSubtarget<GCNSubtarget>()), TII(*STI.getInstrInfo()),
      Helper(Helper), Observer(Observer),
#define GET_GICOMBINER_CONSTRUCTOR_INITS
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef GET_GICOMBINER_CONSTRUCTOR_INITS
{
}

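// Match a 32-bit G_SELECT whose condition is a single-use floating-point
// compare of the two selected values. Equality-style and ordered/unordered
// tests are rejected; the remaining relational predicates are turned into
// G_AMDGPU_FMIN_LEGACY / G_AMDGPU_FMAX_LEGACY by the apply step below, with
// the operands ordered to preserve the legacy NaN semantics.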
bool AMDGPUPostLegalizerCombinerImpl::matchFMinFMaxLegacy(
    MachineInstr &MI, FMinFMaxLegacyInfo &Info) const {
  // FIXME: Type predicate on pattern
  if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32))
    return false;

  Register Cond = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(Cond) ||
      !mi_match(Cond, MRI,
                m_GFCmp(m_Pred(Info.Pred), m_Reg(Info.LHS), m_Reg(Info.RHS))))
    return false;

  Info.True = MI.getOperand(2).getReg();
  Info.False = MI.getOperand(3).getReg();

  // TODO: Handle the case where the selected value is an fneg and the compared
  // constant is the negation of the selected value.
  if (!(Info.LHS == Info.True && Info.RHS == Info.False) &&
      !(Info.LHS == Info.False && Info.RHS == Info.True))
    return false;

  switch (Info.Pred) {
  case CmpInst::FCMP_FALSE:
  case CmpInst::FCMP_OEQ:
  case CmpInst::FCMP_ONE:
  case CmpInst::FCMP_ORD:
  case CmpInst::FCMP_UNO:
  case CmpInst::FCMP_UEQ:
  case CmpInst::FCMP_UNE:
  case CmpInst::FCMP_TRUE:
    return false;
  default:
    return true;
  }
}

void AMDGPUPostLegalizerCombinerImpl::applySelectFCmpToFMinToFMaxLegacy(
    MachineInstr &MI, const FMinFMaxLegacyInfo &Info) const {
  B.setInstrAndDebugLoc(MI);
  auto buildNewInst = [&MI, this](unsigned Opc, Register X, Register Y) {
    B.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());
  };

  switch (Info.Pred) {
  case CmpInst::FCMP_ULT:
  case CmpInst::FCMP_ULE:
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    break;
  case CmpInst::FCMP_OLE:
  case CmpInst::FCMP_OLT: {
    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with NaN,
    // so permute it based on the compare type the hardware uses.
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  case CmpInst::FCMP_UGE:
  case CmpInst::FCMP_UGT: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    break;
  }
  case CmpInst::FCMP_OGT:
  case CmpInst::FCMP_OGE: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  default:
    llvm_unreachable("predicate should not have matched");
  }

  MI.eraseFromParent();
}

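// Match an integer-to-float conversion producing f32 or f16 whose source value
// has all bits above the low byte known to be zero, so the conversion can use
// the hardware's unsigned byte-to-float instruction.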
bool AMDGPUPostLegalizerCombinerImpl::matchUCharToFloat(
    MachineInstr &MI) const {
  Register DstReg = MI.getOperand(0).getReg();

  // TODO: We could try to match extracting the higher bytes, which would be
  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
  // about in practice.
  LLT Ty = MRI.getType(DstReg);
  if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) {
    Register SrcReg = MI.getOperand(1).getReg();
    unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
    assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64);
    const APInt Mask = APInt::getHighBitsSet(SrcSize, SrcSize - 8);
    return Helper.getKnownBits()->maskedValueIsZero(SrcReg, Mask);
  }

  return false;
}

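// Rewrite the matched conversion as G_AMDGPU_CVT_F32_UBYTE0. The source is
// any-extended or truncated to 32 bits if needed, and an f16 result is
// produced by truncating the f32 conversion.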
void AMDGPUPostLegalizerCombinerImpl::applyUCharToFloat(
    MachineInstr &MI) const {
  B.setInstrAndDebugLoc(MI);

  const LLT S32 = LLT::scalar(32);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy != S32)
    SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);

  if (Ty == S32) {
    B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg}, {SrcReg},
                 MI.getFlags());
  } else {
    auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32}, {SrcReg},
                             MI.getFlags());
    B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
  }

  MI.eraseFromParent();
}

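// Fold a composition of the amdgcn.rcp intrinsic and G_FSQRT, in either order
// (rcp(sqrt(x)) or sqrt(rcp(x))), into a single amdgcn.rsq intrinsic.
// MatchInfo receives a callback that builds the replacement. Illustrative MIR
// (types and register names are arbitrary):
//   %s:_(s32) = G_FSQRT %x
//   %r:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), %s
// becomes
//   %r:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x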
bool AMDGPUPostLegalizerCombinerImpl::matchRcpSqrtToRsq(
    MachineInstr &MI,
    std::function<void(MachineIRBuilder &)> &MatchInfo) const {

  auto getRcpSrc = [=](const MachineInstr &MI) {
    MachineInstr *ResMI = nullptr;
    if (MI.getOpcode() == TargetOpcode::G_INTRINSIC &&
        MI.getIntrinsicID() == Intrinsic::amdgcn_rcp)
      ResMI = MRI.getVRegDef(MI.getOperand(2).getReg());

    return ResMI;
  };

  auto getSqrtSrc = [=](const MachineInstr &MI) {
    MachineInstr *SqrtSrcMI = nullptr;
    auto Match =
        mi_match(MI.getOperand(0).getReg(), MRI, m_GFSqrt(m_MInstr(SqrtSrcMI)));
    (void)Match;
    return SqrtSrcMI;
  };

  MachineInstr *RcpSrcMI = nullptr, *SqrtSrcMI = nullptr;
  // rcp(sqrt(x))
  if ((RcpSrcMI = getRcpSrc(MI)) && (SqrtSrcMI = getSqrtSrc(*RcpSrcMI))) {
    MatchInfo = [SqrtSrcMI, &MI](MachineIRBuilder &B) {
      B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)}, false)
          .addUse(SqrtSrcMI->getOperand(0).getReg())
          .setMIFlags(MI.getFlags());
    };
    return true;
  }

  // sqrt(rcp(x))
  if ((SqrtSrcMI = getSqrtSrc(MI)) && (RcpSrcMI = getRcpSrc(*SqrtSrcMI))) {
    MatchInfo = [RcpSrcMI, &MI](MachineIRBuilder &B) {
      B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)}, false)
          .addUse(RcpSrcMI->getOperand(0).getReg())
          .setMIFlags(MI.getFlags());
    };
    return true;
  }

  return false;
}

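// Match a G_AMDGPU_CVT_F32_UBYTE<N> whose source is a constant shift (possibly
// behind a G_ZEXT) and fold the shift amount into the byte index. MatchInfo
// records the unshifted value and the combined byte offset; the combine only
// fires when the result still selects a whole byte (1-3) of a 32-bit value.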
bool AMDGPUPostLegalizerCombinerImpl::matchCvtF32UByteN(
    MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) const {
  Register SrcReg = MI.getOperand(1).getReg();

  // Look through G_ZEXT.
  bool IsShr = mi_match(SrcReg, MRI, m_GZExt(m_Reg(SrcReg)));

  Register Src0;
  int64_t ShiftAmt;
  IsShr = mi_match(SrcReg, MRI, m_GLShr(m_Reg(Src0), m_ICst(ShiftAmt)));
  if (IsShr || mi_match(SrcReg, MRI, m_GShl(m_Reg(Src0), m_ICst(ShiftAmt)))) {
    const unsigned Offset = MI.getOpcode() - AMDGPU::G_AMDGPU_CVT_F32_UBYTE0;

    unsigned ShiftOffset = 8 * Offset;
    if (IsShr)
      ShiftOffset += ShiftAmt;
    else
      ShiftOffset -= ShiftAmt;

    MatchInfo.CvtVal = Src0;
    MatchInfo.ShiftOffset = ShiftOffset;
    return ShiftOffset < 32 && ShiftOffset >= 8 && (ShiftOffset % 8) == 0;
  }

  // TODO: Simplify demanded bits.
  return false;
}

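// Replace the conversion with G_AMDGPU_CVT_F32_UBYTE<ShiftOffset / 8> reading
// the unshifted source, any-extending the source to 32 bits first if needed.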
void AMDGPUPostLegalizerCombinerImpl::applyCvtF32UByteN(
    MachineInstr &MI, const CvtF32UByteMatchInfo &MatchInfo) const {
  B.setInstrAndDebugLoc(MI);
  unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8;

  const LLT S32 = LLT::scalar(32);
  Register CvtSrc = MatchInfo.CvtVal;
  LLT SrcTy = MRI.getType(MatchInfo.CvtVal);
  if (SrcTy != S32) {
    assert(SrcTy.isScalar() && SrcTy.getSizeInBits() >= 8);
    CvtSrc = B.buildAnyExt(S32, CvtSrc).getReg(0);
  }

  assert(MI.getOpcode() != NewOpc);
  B.buildInstr(NewOpc, {MI.getOperand(0)}, {CvtSrc}, MI.getFlags());
  MI.eraseFromParent();
}

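// A G_FCANONICALIZE of a value the target already considers canonical is a
// no-op; capture the source register in Reg so the canonicalize can simply be
// replaced by it.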
bool AMDGPUPostLegalizerCombinerImpl::matchRemoveFcanonicalize(
    MachineInstr &MI, Register &Reg) const {
  const SITargetLowering *TLI = static_cast<const SITargetLowering *>(
      MF.getSubtarget().getTargetLowering());
  Reg = MI.getOperand(1).getReg();
  return TLI->isCanonicalized(Reg, MF);
}

// The buffer_load_{i8, i16} intrinsics are initially lowered as
// buffer_load_{u8, u16} instructions. Here, the buffer_load_{u8, u16}
// instructions are combined with sign extension instructions in order to
// generate buffer_load_{i8, i16} instructions.
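// For example (illustrative MIR, register names are arbitrary):
//   %v:_(s32) = G_AMDGPU_BUFFER_LOAD_UBYTE %rsrc, ...
//   %d:_(s32) = G_SEXT_INREG %v, 8
// is rewritten so that the load itself produces the sign-extended value:
//   %d:_(s32) = G_AMDGPU_BUFFER_LOAD_SBYTE %rsrc, ...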

// Identify buffer_load_{u8, u16}.
bool AMDGPUPostLegalizerCombinerImpl::matchCombineSignExtendInReg(
    MachineInstr &MI, MachineInstr *&SubwordBufferLoad) const {
  Register Op0Reg = MI.getOperand(1).getReg();
  SubwordBufferLoad = MRI.getVRegDef(Op0Reg);

  if (!MRI.hasOneNonDBGUse(Op0Reg))
    return false;

  // Check if the first operand of the sign extension is a subword buffer load
  // instruction.
  return SubwordBufferLoad->getOpcode() == AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE ||
         SubwordBufferLoad->getOpcode() == AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
}

// Combine buffer_load_{u8, u16} and the sign extension instruction to generate
// buffer_load_{i8, i16}.
void AMDGPUPostLegalizerCombinerImpl::applyCombineSignExtendInReg(
    MachineInstr &MI, MachineInstr *&SubwordBufferLoad) const {
  // Modify the opcode and the destination of buffer_load_{u8, u16}:
  // Replace the opcode.
  unsigned Opc =
      SubwordBufferLoad->getOpcode() == AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE
          ? AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE
          : AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT;
  SubwordBufferLoad->setDesc(TII.get(Opc));
  // Update the destination register of SubwordBufferLoad with the destination
  // register of the sign extension.
  Register SignExtendInsnDst = MI.getOperand(0).getReg();
  SubwordBufferLoad->getOperand(0).setReg(SignExtendInsnDst);
  // Remove the sign extension.
  MI.eraseFromParent();
}

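// Glue between the generic Combiner driver and the generated Impl: holds the
// rule configuration and the analyses (known bits, dominator tree) and
// dispatches each candidate instruction to the Impl in combine() below.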
class AMDGPUPostLegalizerCombinerInfo final : public CombinerInfo {
  GISelKnownBits *KB;
  MachineDominatorTree *MDT;
  AMDGPUPostLegalizerCombinerImplRuleConfig RuleConfig;

public:
  AMDGPUPostLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
                                  const AMDGPULegalizerInfo *LI,
                                  GISelKnownBits *KB, MachineDominatorTree *MDT)
      : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
                     /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
        KB(KB), MDT(MDT) {
    if (!RuleConfig.parseCommandLineOption())
      report_fatal_error("Invalid rule identifier");
  }

  bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
               MachineIRBuilder &B) const override;
};

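// Called by the Combiner driver for each instruction in the function. Returns
// true if MI was combined (and possibly erased).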
bool AMDGPUPostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
                                              MachineInstr &MI,
                                              MachineIRBuilder &B) const {
  AMDGPUCombinerHelper Helper(Observer, B, /*IsPreLegalize*/ false, KB, MDT,
                              LInfo);
  // TODO: Do not re-create the Impl on every inst, it should be per function.
  AMDGPUPostLegalizerCombinerImpl Impl(RuleConfig, B, Helper, Observer);
  Impl.setupMF(*MI.getMF(), KB);

  if (Impl.tryCombineAll(MI))
    return true;

  switch (MI.getOpcode()) {
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_ASHR:
    // On some subtargets, 64-bit shift is a quarter rate instruction. In the
    // common case, splitting this into a move and a 32-bit shift is faster and
    // the same code size.
    return Helper.tryCombineShiftToUnmerge(MI, 32);
  }

  return false;
}

// Pass boilerplate
// ================

class AMDGPUPostLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUPostLegalizerCombiner(bool IsOptNone = false);

  StringRef getPassName() const override {
    return "AMDGPUPostLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  bool IsOptNone;
};
} // end anonymous namespace

void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  if (!IsOptNone) {
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
  }
  MachineFunctionPass::getAnalysisUsage(AU);
}

AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
    : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
  initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry());
}

bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;
  auto *TPC = &getAnalysis<TargetPassConfig>();
  const Function &F = MF.getFunction();
  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const AMDGPULegalizerInfo *LI =
      static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());

  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
  MachineDominatorTree *MDT =
      IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
  AMDGPUPostLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
                                         F.hasMinSize(), LI, KB, MDT);
  Combiner C(PCInfo, TPC);
  return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
}

char AMDGPUPostLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                      "Combine AMDGPU machine instrs after legalization", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                    "Combine AMDGPU machine instrs after legalization", false,
                    false)

namespace llvm {
FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone) {
  return new AMDGPUPostLegalizerCombiner(IsOptNone);
}
} // end namespace llvm