xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp (revision 297eecfb02bb25902531dbb5c3b9a88caf8adf29)
106c3fb27SDimitry Andric //=== lib/CodeGen/GlobalISel/AMDGPUPostLegalizerCombiner.cpp --------------===//
25ffd83dbSDimitry Andric //
35ffd83dbSDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
45ffd83dbSDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
55ffd83dbSDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
65ffd83dbSDimitry Andric //
75ffd83dbSDimitry Andric //===----------------------------------------------------------------------===//
85ffd83dbSDimitry Andric //
95ffd83dbSDimitry Andric // This pass does combining of machine instructions at the generic MI level,
105ffd83dbSDimitry Andric // after the legalizer.
115ffd83dbSDimitry Andric //
125ffd83dbSDimitry Andric //===----------------------------------------------------------------------===//
135ffd83dbSDimitry Andric 
14e8d8bef9SDimitry Andric #include "AMDGPU.h"
15349cc55cSDimitry Andric #include "AMDGPUCombinerHelper.h"
165ffd83dbSDimitry Andric #include "AMDGPULegalizerInfo.h"
17e8d8bef9SDimitry Andric #include "GCNSubtarget.h"
18e8d8bef9SDimitry Andric #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
195ffd83dbSDimitry Andric #include "llvm/CodeGen/GlobalISel/Combiner.h"
205ffd83dbSDimitry Andric #include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
215ffd83dbSDimitry Andric #include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
2206c3fb27SDimitry Andric #include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
235ffd83dbSDimitry Andric #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
245f757f3fSDimitry Andric #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
255ffd83dbSDimitry Andric #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
265ffd83dbSDimitry Andric #include "llvm/CodeGen/MachineDominators.h"
275ffd83dbSDimitry Andric #include "llvm/CodeGen/TargetPassConfig.h"
284824e7fdSDimitry Andric #include "llvm/IR/IntrinsicsAMDGPU.h"
29e8d8bef9SDimitry Andric #include "llvm/Target/TargetMachine.h"
305ffd83dbSDimitry Andric 
3106c3fb27SDimitry Andric #define GET_GICOMBINER_DEPS
3206c3fb27SDimitry Andric #include "AMDGPUGenPreLegalizeGICombiner.inc"
3306c3fb27SDimitry Andric #undef GET_GICOMBINER_DEPS
3406c3fb27SDimitry Andric 
355ffd83dbSDimitry Andric #define DEBUG_TYPE "amdgpu-postlegalizer-combiner"
365ffd83dbSDimitry Andric 
375ffd83dbSDimitry Andric using namespace llvm;
385ffd83dbSDimitry Andric using namespace MIPatternMatch;
395ffd83dbSDimitry Andric 
4006c3fb27SDimitry Andric namespace {
4106c3fb27SDimitry Andric #define GET_GICOMBINER_TYPES
4206c3fb27SDimitry Andric #include "AMDGPUGenPostLegalizeGICombiner.inc"
4306c3fb27SDimitry Andric #undef GET_GICOMBINER_TYPES
4406c3fb27SDimitry Andric 
455f757f3fSDimitry Andric class AMDGPUPostLegalizerCombinerImpl : public Combiner {
46e8d8bef9SDimitry Andric protected:
4706c3fb27SDimitry Andric   const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig;
4806c3fb27SDimitry Andric   const GCNSubtarget &STI;
4906c3fb27SDimitry Andric   const SIInstrInfo &TII;
505f757f3fSDimitry Andric   // TODO: Make CombinerHelper methods const.
515f757f3fSDimitry Andric   mutable AMDGPUCombinerHelper Helper;
52e8d8bef9SDimitry Andric 
53e8d8bef9SDimitry Andric public:
5406c3fb27SDimitry Andric   AMDGPUPostLegalizerCombinerImpl(
555f757f3fSDimitry Andric       MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
565f757f3fSDimitry Andric       GISelKnownBits &KB, GISelCSEInfo *CSEInfo,
5706c3fb27SDimitry Andric       const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig,
585f757f3fSDimitry Andric       const GCNSubtarget &STI, MachineDominatorTree *MDT,
595f757f3fSDimitry Andric       const LegalizerInfo *LI);
6006c3fb27SDimitry Andric 
6106c3fb27SDimitry Andric   static const char *getName() { return "AMDGPUPostLegalizerCombinerImpl"; }
6206c3fb27SDimitry Andric 
635f757f3fSDimitry Andric   bool tryCombineAllImpl(MachineInstr &I) const;
645f757f3fSDimitry Andric   bool tryCombineAll(MachineInstr &I) const override;
65e8d8bef9SDimitry Andric 
665ffd83dbSDimitry Andric   struct FMinFMaxLegacyInfo {
675ffd83dbSDimitry Andric     Register LHS;
685ffd83dbSDimitry Andric     Register RHS;
695ffd83dbSDimitry Andric     Register True;
705ffd83dbSDimitry Andric     Register False;
715ffd83dbSDimitry Andric     CmpInst::Predicate Pred;
725ffd83dbSDimitry Andric   };
735ffd83dbSDimitry Andric 
745ffd83dbSDimitry Andric   // TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
7506c3fb27SDimitry Andric   bool matchFMinFMaxLegacy(MachineInstr &MI, FMinFMaxLegacyInfo &Info) const;
76e8d8bef9SDimitry Andric   void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,
7706c3fb27SDimitry Andric                                          const FMinFMaxLegacyInfo &Info) const;
78e8d8bef9SDimitry Andric 
7906c3fb27SDimitry Andric   bool matchUCharToFloat(MachineInstr &MI) const;
8006c3fb27SDimitry Andric   void applyUCharToFloat(MachineInstr &MI) const;
81e8d8bef9SDimitry Andric 
8206c3fb27SDimitry Andric   bool
8306c3fb27SDimitry Andric   matchRcpSqrtToRsq(MachineInstr &MI,
8406c3fb27SDimitry Andric                     std::function<void(MachineIRBuilder &)> &MatchInfo) const;
854824e7fdSDimitry Andric 
86e8d8bef9SDimitry Andric   // FIXME: Should be able to have 2 separate matchdatas rather than custom
87e8d8bef9SDimitry Andric   // struct boilerplate.
88e8d8bef9SDimitry Andric   struct CvtF32UByteMatchInfo {
89e8d8bef9SDimitry Andric     Register CvtVal;
90e8d8bef9SDimitry Andric     unsigned ShiftOffset;
91e8d8bef9SDimitry Andric   };
92e8d8bef9SDimitry Andric 
9306c3fb27SDimitry Andric   bool matchCvtF32UByteN(MachineInstr &MI,
9406c3fb27SDimitry Andric                          CvtF32UByteMatchInfo &MatchInfo) const;
95e8d8bef9SDimitry Andric   void applyCvtF32UByteN(MachineInstr &MI,
9606c3fb27SDimitry Andric                          const CvtF32UByteMatchInfo &MatchInfo) const;
97fe6060f1SDimitry Andric 
9806c3fb27SDimitry Andric   bool matchRemoveFcanonicalize(MachineInstr &MI, Register &Reg) const;
9906c3fb27SDimitry Andric 
10006c3fb27SDimitry Andric   // Combine unsigned buffer load and signed extension instructions to generate
10106c3fb27SDimitry Andric   // signed buffer laod instructions.
102*297eecfbSDimitry Andric   bool matchCombineSignExtendInReg(
103*297eecfbSDimitry Andric       MachineInstr &MI, std::pair<MachineInstr *, unsigned> &MatchInfo) const;
104*297eecfbSDimitry Andric   void applyCombineSignExtendInReg(
105*297eecfbSDimitry Andric       MachineInstr &MI, std::pair<MachineInstr *, unsigned> &MatchInfo) const;
10606c3fb27SDimitry Andric 
1071db9f3b2SDimitry Andric   // Find the s_mul_u64 instructions where the higher bits are either
1081db9f3b2SDimitry Andric   // zero-extended or sign-extended.
1091db9f3b2SDimitry Andric   bool matchCombine_s_mul_u64(MachineInstr &MI, unsigned &NewOpcode) const;
1101db9f3b2SDimitry Andric   // Replace the s_mul_u64 instructions with S_MUL_I64_I32_PSEUDO if the higher
1111db9f3b2SDimitry Andric   // 33 bits are sign extended and with S_MUL_U64_U32_PSEUDO if the higher 32
1121db9f3b2SDimitry Andric   // bits are zero extended.
1131db9f3b2SDimitry Andric   void applyCombine_s_mul_u64(MachineInstr &MI, unsigned &NewOpcode) const;
1141db9f3b2SDimitry Andric 
11506c3fb27SDimitry Andric private:
11606c3fb27SDimitry Andric #define GET_GICOMBINER_CLASS_MEMBERS
11706c3fb27SDimitry Andric #define AMDGPUSubtarget GCNSubtarget
11806c3fb27SDimitry Andric #include "AMDGPUGenPostLegalizeGICombiner.inc"
11906c3fb27SDimitry Andric #undef GET_GICOMBINER_CLASS_MEMBERS
12006c3fb27SDimitry Andric #undef AMDGPUSubtarget
121e8d8bef9SDimitry Andric };
122e8d8bef9SDimitry Andric 
12306c3fb27SDimitry Andric #define GET_GICOMBINER_IMPL
12406c3fb27SDimitry Andric #define AMDGPUSubtarget GCNSubtarget
12506c3fb27SDimitry Andric #include "AMDGPUGenPostLegalizeGICombiner.inc"
12606c3fb27SDimitry Andric #undef AMDGPUSubtarget
12706c3fb27SDimitry Andric #undef GET_GICOMBINER_IMPL
12806c3fb27SDimitry Andric 
12906c3fb27SDimitry Andric AMDGPUPostLegalizerCombinerImpl::AMDGPUPostLegalizerCombinerImpl(
1305f757f3fSDimitry Andric     MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
1315f757f3fSDimitry Andric     GISelKnownBits &KB, GISelCSEInfo *CSEInfo,
13206c3fb27SDimitry Andric     const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig,
1335f757f3fSDimitry Andric     const GCNSubtarget &STI, MachineDominatorTree *MDT, const LegalizerInfo *LI)
1345f757f3fSDimitry Andric     : Combiner(MF, CInfo, TPC, &KB, CSEInfo), RuleConfig(RuleConfig), STI(STI),
1355f757f3fSDimitry Andric       TII(*STI.getInstrInfo()),
1365f757f3fSDimitry Andric       Helper(Observer, B, /*IsPreLegalize*/ false, &KB, MDT, LI),
13706c3fb27SDimitry Andric #define GET_GICOMBINER_CONSTRUCTOR_INITS
13806c3fb27SDimitry Andric #include "AMDGPUGenPostLegalizeGICombiner.inc"
13906c3fb27SDimitry Andric #undef GET_GICOMBINER_CONSTRUCTOR_INITS
14006c3fb27SDimitry Andric {
14106c3fb27SDimitry Andric }
14206c3fb27SDimitry Andric 
1435f757f3fSDimitry Andric bool AMDGPUPostLegalizerCombinerImpl::tryCombineAll(MachineInstr &MI) const {
1445f757f3fSDimitry Andric   if (tryCombineAllImpl(MI))
1455f757f3fSDimitry Andric     return true;
1465f757f3fSDimitry Andric 
1475f757f3fSDimitry Andric   switch (MI.getOpcode()) {
1485f757f3fSDimitry Andric   case TargetOpcode::G_SHL:
1495f757f3fSDimitry Andric   case TargetOpcode::G_LSHR:
1505f757f3fSDimitry Andric   case TargetOpcode::G_ASHR:
1515f757f3fSDimitry Andric     // On some subtargets, 64-bit shift is a quarter rate instruction. In the
1525f757f3fSDimitry Andric     // common case, splitting this into a move and a 32-bit shift is faster and
1535f757f3fSDimitry Andric     // the same code size.
1545f757f3fSDimitry Andric     return Helper.tryCombineShiftToUnmerge(MI, 32);
1555f757f3fSDimitry Andric   }
1565f757f3fSDimitry Andric 
1575f757f3fSDimitry Andric   return false;
1585f757f3fSDimitry Andric }
1595f757f3fSDimitry Andric 
16006c3fb27SDimitry Andric bool AMDGPUPostLegalizerCombinerImpl::matchFMinFMaxLegacy(
16106c3fb27SDimitry Andric     MachineInstr &MI, FMinFMaxLegacyInfo &Info) const {
1625ffd83dbSDimitry Andric   // FIXME: Type predicate on pattern
1635ffd83dbSDimitry Andric   if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32))
1645ffd83dbSDimitry Andric     return false;
1655ffd83dbSDimitry Andric 
1665ffd83dbSDimitry Andric   Register Cond = MI.getOperand(1).getReg();
1675ffd83dbSDimitry Andric   if (!MRI.hasOneNonDBGUse(Cond) ||
1685ffd83dbSDimitry Andric       !mi_match(Cond, MRI,
1695ffd83dbSDimitry Andric                 m_GFCmp(m_Pred(Info.Pred), m_Reg(Info.LHS), m_Reg(Info.RHS))))
1705ffd83dbSDimitry Andric     return false;
1715ffd83dbSDimitry Andric 
1725ffd83dbSDimitry Andric   Info.True = MI.getOperand(2).getReg();
1735ffd83dbSDimitry Andric   Info.False = MI.getOperand(3).getReg();
1745ffd83dbSDimitry Andric 
17506c3fb27SDimitry Andric   // TODO: Handle case where the the selected value is an fneg and the compared
17606c3fb27SDimitry Andric   // constant is the negation of the selected value.
1775ffd83dbSDimitry Andric   if (!(Info.LHS == Info.True && Info.RHS == Info.False) &&
1785ffd83dbSDimitry Andric       !(Info.LHS == Info.False && Info.RHS == Info.True))
1795ffd83dbSDimitry Andric     return false;
1805ffd83dbSDimitry Andric 
1815ffd83dbSDimitry Andric   switch (Info.Pred) {
1825ffd83dbSDimitry Andric   case CmpInst::FCMP_FALSE:
1835ffd83dbSDimitry Andric   case CmpInst::FCMP_OEQ:
1845ffd83dbSDimitry Andric   case CmpInst::FCMP_ONE:
1855ffd83dbSDimitry Andric   case CmpInst::FCMP_ORD:
1865ffd83dbSDimitry Andric   case CmpInst::FCMP_UNO:
1875ffd83dbSDimitry Andric   case CmpInst::FCMP_UEQ:
1885ffd83dbSDimitry Andric   case CmpInst::FCMP_UNE:
1895ffd83dbSDimitry Andric   case CmpInst::FCMP_TRUE:
1905ffd83dbSDimitry Andric     return false;
1915ffd83dbSDimitry Andric   default:
1925ffd83dbSDimitry Andric     return true;
1935ffd83dbSDimitry Andric   }
1945ffd83dbSDimitry Andric }
1955ffd83dbSDimitry Andric 
19606c3fb27SDimitry Andric void AMDGPUPostLegalizerCombinerImpl::applySelectFCmpToFMinToFMaxLegacy(
19706c3fb27SDimitry Andric     MachineInstr &MI, const FMinFMaxLegacyInfo &Info) const {
198e8d8bef9SDimitry Andric   B.setInstrAndDebugLoc(MI);
199e8d8bef9SDimitry Andric   auto buildNewInst = [&MI, this](unsigned Opc, Register X, Register Y) {
200e8d8bef9SDimitry Andric     B.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());
2015ffd83dbSDimitry Andric   };
2025ffd83dbSDimitry Andric 
2035ffd83dbSDimitry Andric   switch (Info.Pred) {
2045ffd83dbSDimitry Andric   case CmpInst::FCMP_ULT:
2055ffd83dbSDimitry Andric   case CmpInst::FCMP_ULE:
2065ffd83dbSDimitry Andric     if (Info.LHS == Info.True)
2075ffd83dbSDimitry Andric       buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
2085ffd83dbSDimitry Andric     else
2095ffd83dbSDimitry Andric       buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
2105ffd83dbSDimitry Andric     break;
2115ffd83dbSDimitry Andric   case CmpInst::FCMP_OLE:
2125ffd83dbSDimitry Andric   case CmpInst::FCMP_OLT: {
2135ffd83dbSDimitry Andric     // We need to permute the operands to get the correct NaN behavior. The
2145ffd83dbSDimitry Andric     // selected operand is the second one based on the failing compare with NaN,
2155ffd83dbSDimitry Andric     // so permute it based on the compare type the hardware uses.
2165ffd83dbSDimitry Andric     if (Info.LHS == Info.True)
2175ffd83dbSDimitry Andric       buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
2185ffd83dbSDimitry Andric     else
2195ffd83dbSDimitry Andric       buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
2205ffd83dbSDimitry Andric     break;
2215ffd83dbSDimitry Andric   }
2225ffd83dbSDimitry Andric   case CmpInst::FCMP_UGE:
2235ffd83dbSDimitry Andric   case CmpInst::FCMP_UGT: {
2245ffd83dbSDimitry Andric     if (Info.LHS == Info.True)
2255ffd83dbSDimitry Andric       buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
2265ffd83dbSDimitry Andric     else
2275ffd83dbSDimitry Andric       buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
2285ffd83dbSDimitry Andric     break;
2295ffd83dbSDimitry Andric   }
2305ffd83dbSDimitry Andric   case CmpInst::FCMP_OGT:
2315ffd83dbSDimitry Andric   case CmpInst::FCMP_OGE: {
2325ffd83dbSDimitry Andric     if (Info.LHS == Info.True)
2335ffd83dbSDimitry Andric       buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
2345ffd83dbSDimitry Andric     else
2355ffd83dbSDimitry Andric       buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
2365ffd83dbSDimitry Andric     break;
2375ffd83dbSDimitry Andric   }
2385ffd83dbSDimitry Andric   default:
2395ffd83dbSDimitry Andric     llvm_unreachable("predicate should not have matched");
2405ffd83dbSDimitry Andric   }
2415ffd83dbSDimitry Andric 
2425ffd83dbSDimitry Andric   MI.eraseFromParent();
2435ffd83dbSDimitry Andric }
2445ffd83dbSDimitry Andric 
24506c3fb27SDimitry Andric bool AMDGPUPostLegalizerCombinerImpl::matchUCharToFloat(
24606c3fb27SDimitry Andric     MachineInstr &MI) const {
2475ffd83dbSDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
2485ffd83dbSDimitry Andric 
2495ffd83dbSDimitry Andric   // TODO: We could try to match extracting the higher bytes, which would be
2505ffd83dbSDimitry Andric   // easier if i8 vectors weren't promoted to i32 vectors, particularly after
2515ffd83dbSDimitry Andric   // types are legalized. v4i8 -> v4f32 is probably the only case to worry
2525ffd83dbSDimitry Andric   // about in practice.
2535ffd83dbSDimitry Andric   LLT Ty = MRI.getType(DstReg);
2545ffd83dbSDimitry Andric   if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) {
2555ffd83dbSDimitry Andric     Register SrcReg = MI.getOperand(1).getReg();
2565ffd83dbSDimitry Andric     unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
2575ffd83dbSDimitry Andric     assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64);
2585ffd83dbSDimitry Andric     const APInt Mask = APInt::getHighBitsSet(SrcSize, SrcSize - 8);
2595ffd83dbSDimitry Andric     return Helper.getKnownBits()->maskedValueIsZero(SrcReg, Mask);
2605ffd83dbSDimitry Andric   }
2615ffd83dbSDimitry Andric 
2625ffd83dbSDimitry Andric   return false;
2635ffd83dbSDimitry Andric }
2645ffd83dbSDimitry Andric 
26506c3fb27SDimitry Andric void AMDGPUPostLegalizerCombinerImpl::applyUCharToFloat(
26606c3fb27SDimitry Andric     MachineInstr &MI) const {
267e8d8bef9SDimitry Andric   B.setInstrAndDebugLoc(MI);
2685ffd83dbSDimitry Andric 
2695ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
2705ffd83dbSDimitry Andric 
2715ffd83dbSDimitry Andric   Register DstReg = MI.getOperand(0).getReg();
2725ffd83dbSDimitry Andric   Register SrcReg = MI.getOperand(1).getReg();
273e8d8bef9SDimitry Andric   LLT Ty = MRI.getType(DstReg);
274e8d8bef9SDimitry Andric   LLT SrcTy = MRI.getType(SrcReg);
2755ffd83dbSDimitry Andric   if (SrcTy != S32)
2765ffd83dbSDimitry Andric     SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);
2775ffd83dbSDimitry Andric 
2785ffd83dbSDimitry Andric   if (Ty == S32) {
27906c3fb27SDimitry Andric     B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg}, {SrcReg},
28006c3fb27SDimitry Andric                  MI.getFlags());
2815ffd83dbSDimitry Andric   } else {
28206c3fb27SDimitry Andric     auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32}, {SrcReg},
28306c3fb27SDimitry Andric                              MI.getFlags());
2845ffd83dbSDimitry Andric     B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
2855ffd83dbSDimitry Andric   }
2865ffd83dbSDimitry Andric 
2875ffd83dbSDimitry Andric   MI.eraseFromParent();
2885ffd83dbSDimitry Andric }
2895ffd83dbSDimitry Andric 
29006c3fb27SDimitry Andric bool AMDGPUPostLegalizerCombinerImpl::matchRcpSqrtToRsq(
29106c3fb27SDimitry Andric     MachineInstr &MI,
29206c3fb27SDimitry Andric     std::function<void(MachineIRBuilder &)> &MatchInfo) const {
2935f757f3fSDimitry Andric   auto getRcpSrc = [=](const MachineInstr &MI) -> MachineInstr * {
2945f757f3fSDimitry Andric     if (!MI.getFlag(MachineInstr::FmContract))
2955f757f3fSDimitry Andric       return nullptr;
2964824e7fdSDimitry Andric 
2975f757f3fSDimitry Andric     if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
2985f757f3fSDimitry Andric       if (GI->is(Intrinsic::amdgcn_rcp))
2995f757f3fSDimitry Andric         return MRI.getVRegDef(MI.getOperand(2).getReg());
3005f757f3fSDimitry Andric     }
3015f757f3fSDimitry Andric     return nullptr;
3024824e7fdSDimitry Andric   };
3034824e7fdSDimitry Andric 
3045f757f3fSDimitry Andric   auto getSqrtSrc = [=](const MachineInstr &MI) -> MachineInstr * {
3055f757f3fSDimitry Andric     if (!MI.getFlag(MachineInstr::FmContract))
3065f757f3fSDimitry Andric       return nullptr;
3074824e7fdSDimitry Andric     MachineInstr *SqrtSrcMI = nullptr;
308bdd1243dSDimitry Andric     auto Match =
3094824e7fdSDimitry Andric         mi_match(MI.getOperand(0).getReg(), MRI, m_GFSqrt(m_MInstr(SqrtSrcMI)));
310bdd1243dSDimitry Andric     (void)Match;
3114824e7fdSDimitry Andric     return SqrtSrcMI;
3124824e7fdSDimitry Andric   };
3134824e7fdSDimitry Andric 
3144824e7fdSDimitry Andric   MachineInstr *RcpSrcMI = nullptr, *SqrtSrcMI = nullptr;
3154824e7fdSDimitry Andric   // rcp(sqrt(x))
3164824e7fdSDimitry Andric   if ((RcpSrcMI = getRcpSrc(MI)) && (SqrtSrcMI = getSqrtSrc(*RcpSrcMI))) {
3174824e7fdSDimitry Andric     MatchInfo = [SqrtSrcMI, &MI](MachineIRBuilder &B) {
3185f757f3fSDimitry Andric       B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)})
3194824e7fdSDimitry Andric           .addUse(SqrtSrcMI->getOperand(0).getReg())
3204824e7fdSDimitry Andric           .setMIFlags(MI.getFlags());
3214824e7fdSDimitry Andric     };
3224824e7fdSDimitry Andric     return true;
3234824e7fdSDimitry Andric   }
3244824e7fdSDimitry Andric 
3254824e7fdSDimitry Andric   // sqrt(rcp(x))
3264824e7fdSDimitry Andric   if ((SqrtSrcMI = getSqrtSrc(MI)) && (RcpSrcMI = getRcpSrc(*SqrtSrcMI))) {
3274824e7fdSDimitry Andric     MatchInfo = [RcpSrcMI, &MI](MachineIRBuilder &B) {
3285f757f3fSDimitry Andric       B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)})
3294824e7fdSDimitry Andric           .addUse(RcpSrcMI->getOperand(0).getReg())
3304824e7fdSDimitry Andric           .setMIFlags(MI.getFlags());
3314824e7fdSDimitry Andric     };
3324824e7fdSDimitry Andric     return true;
3334824e7fdSDimitry Andric   }
3344824e7fdSDimitry Andric   return false;
3354824e7fdSDimitry Andric }
3364824e7fdSDimitry Andric 
33706c3fb27SDimitry Andric bool AMDGPUPostLegalizerCombinerImpl::matchCvtF32UByteN(
33806c3fb27SDimitry Andric     MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) const {
3395ffd83dbSDimitry Andric   Register SrcReg = MI.getOperand(1).getReg();
3405ffd83dbSDimitry Andric 
3415ffd83dbSDimitry Andric   // Look through G_ZEXT.
342bdd1243dSDimitry Andric   bool IsShr = mi_match(SrcReg, MRI, m_GZExt(m_Reg(SrcReg)));
3435ffd83dbSDimitry Andric 
3445ffd83dbSDimitry Andric   Register Src0;
3455ffd83dbSDimitry Andric   int64_t ShiftAmt;
346bdd1243dSDimitry Andric   IsShr = mi_match(SrcReg, MRI, m_GLShr(m_Reg(Src0), m_ICst(ShiftAmt)));
3475ffd83dbSDimitry Andric   if (IsShr || mi_match(SrcReg, MRI, m_GShl(m_Reg(Src0), m_ICst(ShiftAmt)))) {
3485ffd83dbSDimitry Andric     const unsigned Offset = MI.getOpcode() - AMDGPU::G_AMDGPU_CVT_F32_UBYTE0;
3495ffd83dbSDimitry Andric 
3505ffd83dbSDimitry Andric     unsigned ShiftOffset = 8 * Offset;
3515ffd83dbSDimitry Andric     if (IsShr)
3525ffd83dbSDimitry Andric       ShiftOffset += ShiftAmt;
3535ffd83dbSDimitry Andric     else
3545ffd83dbSDimitry Andric       ShiftOffset -= ShiftAmt;
3555ffd83dbSDimitry Andric 
3565ffd83dbSDimitry Andric     MatchInfo.CvtVal = Src0;
3575ffd83dbSDimitry Andric     MatchInfo.ShiftOffset = ShiftOffset;
3585ffd83dbSDimitry Andric     return ShiftOffset < 32 && ShiftOffset >= 8 && (ShiftOffset % 8) == 0;
3595ffd83dbSDimitry Andric   }
3605ffd83dbSDimitry Andric 
3615ffd83dbSDimitry Andric   // TODO: Simplify demanded bits.
3625ffd83dbSDimitry Andric   return false;
3635ffd83dbSDimitry Andric }
3645ffd83dbSDimitry Andric 
36506c3fb27SDimitry Andric void AMDGPUPostLegalizerCombinerImpl::applyCvtF32UByteN(
36606c3fb27SDimitry Andric     MachineInstr &MI, const CvtF32UByteMatchInfo &MatchInfo) const {
367e8d8bef9SDimitry Andric   B.setInstrAndDebugLoc(MI);
3685ffd83dbSDimitry Andric   unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8;
3695ffd83dbSDimitry Andric 
3705ffd83dbSDimitry Andric   const LLT S32 = LLT::scalar(32);
3715ffd83dbSDimitry Andric   Register CvtSrc = MatchInfo.CvtVal;
372e8d8bef9SDimitry Andric   LLT SrcTy = MRI.getType(MatchInfo.CvtVal);
3735ffd83dbSDimitry Andric   if (SrcTy != S32) {
3745ffd83dbSDimitry Andric     assert(SrcTy.isScalar() && SrcTy.getSizeInBits() >= 8);
3755ffd83dbSDimitry Andric     CvtSrc = B.buildAnyExt(S32, CvtSrc).getReg(0);
3765ffd83dbSDimitry Andric   }
3775ffd83dbSDimitry Andric 
3785ffd83dbSDimitry Andric   assert(MI.getOpcode() != NewOpc);
3795ffd83dbSDimitry Andric   B.buildInstr(NewOpc, {MI.getOperand(0)}, {CvtSrc}, MI.getFlags());
3805ffd83dbSDimitry Andric   MI.eraseFromParent();
3815ffd83dbSDimitry Andric }
3825ffd83dbSDimitry Andric 
38306c3fb27SDimitry Andric bool AMDGPUPostLegalizerCombinerImpl::matchRemoveFcanonicalize(
38406c3fb27SDimitry Andric     MachineInstr &MI, Register &Reg) const {
385fe6060f1SDimitry Andric   const SITargetLowering *TLI = static_cast<const SITargetLowering *>(
386fe6060f1SDimitry Andric       MF.getSubtarget().getTargetLowering());
387fe6060f1SDimitry Andric   Reg = MI.getOperand(1).getReg();
388fe6060f1SDimitry Andric   return TLI->isCanonicalized(Reg, MF);
389fe6060f1SDimitry Andric }
390fe6060f1SDimitry Andric 
// The buffer_load_{i8, i16} intrinsics are initially lowered as
// buffer_load_{u8, u16} instructions. Here, the buffer_load_{u8, u16}
// instructions are combined with sign extension instructions in order to
// generate buffer_load_{i8, i16} instructions.
395e8d8bef9SDimitry Andric 
39606c3fb27SDimitry Andric // Identify buffer_load_{u8, u16}.
39706c3fb27SDimitry Andric bool AMDGPUPostLegalizerCombinerImpl::matchCombineSignExtendInReg(
398*297eecfbSDimitry Andric     MachineInstr &MI, std::pair<MachineInstr *, unsigned> &MatchData) const {
399*297eecfbSDimitry Andric   Register LoadReg = MI.getOperand(1).getReg();
400*297eecfbSDimitry Andric   if (!MRI.hasOneNonDBGUse(LoadReg))
40106c3fb27SDimitry Andric     return false;
402e8d8bef9SDimitry Andric 
40306c3fb27SDimitry Andric   // Check if the first operand of the sign extension is a subword buffer load
40406c3fb27SDimitry Andric   // instruction.
405*297eecfbSDimitry Andric   MachineInstr *LoadMI = MRI.getVRegDef(LoadReg);
406*297eecfbSDimitry Andric   int64_t Width = MI.getOperand(2).getImm();
407*297eecfbSDimitry Andric   switch (LoadMI->getOpcode()) {
408*297eecfbSDimitry Andric   case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
409*297eecfbSDimitry Andric     MatchData = {LoadMI, AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE};
410*297eecfbSDimitry Andric     return Width == 8;
411*297eecfbSDimitry Andric   case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
412*297eecfbSDimitry Andric     MatchData = {LoadMI, AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT};
413*297eecfbSDimitry Andric     return Width == 16;
414*297eecfbSDimitry Andric   }
415*297eecfbSDimitry Andric   return false;
41606c3fb27SDimitry Andric }
4175ffd83dbSDimitry Andric 
41806c3fb27SDimitry Andric // Combine buffer_load_{u8, u16} and the sign extension instruction to generate
41906c3fb27SDimitry Andric // buffer_load_{i8, i16}.
42006c3fb27SDimitry Andric void AMDGPUPostLegalizerCombinerImpl::applyCombineSignExtendInReg(
421*297eecfbSDimitry Andric     MachineInstr &MI, std::pair<MachineInstr *, unsigned> &MatchData) const {
422*297eecfbSDimitry Andric   auto [LoadMI, NewOpcode] = MatchData;
423*297eecfbSDimitry Andric   LoadMI->setDesc(TII.get(NewOpcode));
424*297eecfbSDimitry Andric   // Update the destination register of the load with the destination register
425*297eecfbSDimitry Andric   // of the sign extension.
42606c3fb27SDimitry Andric   Register SignExtendInsnDst = MI.getOperand(0).getReg();
427*297eecfbSDimitry Andric   LoadMI->getOperand(0).setReg(SignExtendInsnDst);
42806c3fb27SDimitry Andric   // Remove the sign extension.
42906c3fb27SDimitry Andric   MI.eraseFromParent();
43006c3fb27SDimitry Andric }
4315ffd83dbSDimitry Andric 
4321db9f3b2SDimitry Andric bool AMDGPUPostLegalizerCombinerImpl::matchCombine_s_mul_u64(
4331db9f3b2SDimitry Andric     MachineInstr &MI, unsigned &NewOpcode) const {
4341db9f3b2SDimitry Andric   Register Src0 = MI.getOperand(1).getReg();
4351db9f3b2SDimitry Andric   Register Src1 = MI.getOperand(2).getReg();
4361db9f3b2SDimitry Andric   if (MRI.getType(Src0) != LLT::scalar(64))
4371db9f3b2SDimitry Andric     return false;
4381db9f3b2SDimitry Andric 
4391db9f3b2SDimitry Andric   if (KB->getKnownBits(Src1).countMinLeadingZeros() >= 32 &&
4401db9f3b2SDimitry Andric       KB->getKnownBits(Src0).countMinLeadingZeros() >= 32) {
4411db9f3b2SDimitry Andric     NewOpcode = AMDGPU::G_AMDGPU_S_MUL_U64_U32;
4421db9f3b2SDimitry Andric     return true;
4431db9f3b2SDimitry Andric   }
4441db9f3b2SDimitry Andric 
4451db9f3b2SDimitry Andric   if (KB->computeNumSignBits(Src1) >= 33 &&
4461db9f3b2SDimitry Andric       KB->computeNumSignBits(Src0) >= 33) {
4471db9f3b2SDimitry Andric     NewOpcode = AMDGPU::G_AMDGPU_S_MUL_I64_I32;
4481db9f3b2SDimitry Andric     return true;
4491db9f3b2SDimitry Andric   }
4501db9f3b2SDimitry Andric   return false;
4511db9f3b2SDimitry Andric }
4521db9f3b2SDimitry Andric 
4531db9f3b2SDimitry Andric void AMDGPUPostLegalizerCombinerImpl::applyCombine_s_mul_u64(
4541db9f3b2SDimitry Andric     MachineInstr &MI, unsigned &NewOpcode) const {
4551db9f3b2SDimitry Andric   Helper.replaceOpcodeWith(MI, NewOpcode);
4561db9f3b2SDimitry Andric }
4571db9f3b2SDimitry Andric 
4585ffd83dbSDimitry Andric // Pass boilerplate
4595ffd83dbSDimitry Andric // ================
4605ffd83dbSDimitry Andric 
4615ffd83dbSDimitry Andric class AMDGPUPostLegalizerCombiner : public MachineFunctionPass {
4625ffd83dbSDimitry Andric public:
4635ffd83dbSDimitry Andric   static char ID;
4645ffd83dbSDimitry Andric 
4655ffd83dbSDimitry Andric   AMDGPUPostLegalizerCombiner(bool IsOptNone = false);
4665ffd83dbSDimitry Andric 
4675ffd83dbSDimitry Andric   StringRef getPassName() const override {
4685ffd83dbSDimitry Andric     return "AMDGPUPostLegalizerCombiner";
4695ffd83dbSDimitry Andric   }
4705ffd83dbSDimitry Andric 
4715ffd83dbSDimitry Andric   bool runOnMachineFunction(MachineFunction &MF) override;
4725ffd83dbSDimitry Andric 
4735ffd83dbSDimitry Andric   void getAnalysisUsage(AnalysisUsage &AU) const override;
4745f757f3fSDimitry Andric 
4755ffd83dbSDimitry Andric private:
4765ffd83dbSDimitry Andric   bool IsOptNone;
4775f757f3fSDimitry Andric   AMDGPUPostLegalizerCombinerImplRuleConfig RuleConfig;
4785ffd83dbSDimitry Andric };
4795ffd83dbSDimitry Andric } // end anonymous namespace
4805ffd83dbSDimitry Andric 
4815ffd83dbSDimitry Andric void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
4825ffd83dbSDimitry Andric   AU.addRequired<TargetPassConfig>();
4835ffd83dbSDimitry Andric   AU.setPreservesCFG();
4845ffd83dbSDimitry Andric   getSelectionDAGFallbackAnalysisUsage(AU);
4855ffd83dbSDimitry Andric   AU.addRequired<GISelKnownBitsAnalysis>();
4865ffd83dbSDimitry Andric   AU.addPreserved<GISelKnownBitsAnalysis>();
4875ffd83dbSDimitry Andric   if (!IsOptNone) {
4885ffd83dbSDimitry Andric     AU.addRequired<MachineDominatorTree>();
4895ffd83dbSDimitry Andric     AU.addPreserved<MachineDominatorTree>();
4905ffd83dbSDimitry Andric   }
4915ffd83dbSDimitry Andric   MachineFunctionPass::getAnalysisUsage(AU);
4925ffd83dbSDimitry Andric }
4935ffd83dbSDimitry Andric 
4945ffd83dbSDimitry Andric AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
4955ffd83dbSDimitry Andric     : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
4965ffd83dbSDimitry Andric   initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry());
4975f757f3fSDimitry Andric 
4985f757f3fSDimitry Andric   if (!RuleConfig.parseCommandLineOption())
4995f757f3fSDimitry Andric     report_fatal_error("Invalid rule identifier");
5005ffd83dbSDimitry Andric }
5015ffd83dbSDimitry Andric 
5025ffd83dbSDimitry Andric bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
5035ffd83dbSDimitry Andric   if (MF.getProperties().hasProperty(
5045ffd83dbSDimitry Andric           MachineFunctionProperties::Property::FailedISel))
5055ffd83dbSDimitry Andric     return false;
5065ffd83dbSDimitry Andric   auto *TPC = &getAnalysis<TargetPassConfig>();
5075ffd83dbSDimitry Andric   const Function &F = MF.getFunction();
5085ffd83dbSDimitry Andric   bool EnableOpt =
5095f757f3fSDimitry Andric       MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F);
5105ffd83dbSDimitry Andric 
5115ffd83dbSDimitry Andric   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
51206c3fb27SDimitry Andric   const AMDGPULegalizerInfo *LI =
51306c3fb27SDimitry Andric       static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());
5145ffd83dbSDimitry Andric 
5155ffd83dbSDimitry Andric   GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
5165ffd83dbSDimitry Andric   MachineDominatorTree *MDT =
5175ffd83dbSDimitry Andric       IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
5185f757f3fSDimitry Andric 
5195f757f3fSDimitry Andric   CombinerInfo CInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
5205f757f3fSDimitry Andric                      LI, EnableOpt, F.hasOptSize(), F.hasMinSize());
5215f757f3fSDimitry Andric 
5225f757f3fSDimitry Andric   AMDGPUPostLegalizerCombinerImpl Impl(MF, CInfo, TPC, *KB, /*CSEInfo*/ nullptr,
5235f757f3fSDimitry Andric                                        RuleConfig, ST, MDT, LI);
5245f757f3fSDimitry Andric   return Impl.combineMachineInstrs();
5255ffd83dbSDimitry Andric }
5265ffd83dbSDimitry Andric 
5275ffd83dbSDimitry Andric char AMDGPUPostLegalizerCombiner::ID = 0;
5285ffd83dbSDimitry Andric INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
52906c3fb27SDimitry Andric                       "Combine AMDGPU machine instrs after legalization", false,
53006c3fb27SDimitry Andric                       false)
5315ffd83dbSDimitry Andric INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
5325ffd83dbSDimitry Andric INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
5335ffd83dbSDimitry Andric INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
5345ffd83dbSDimitry Andric                     "Combine AMDGPU machine instrs after legalization", false,
5355ffd83dbSDimitry Andric                     false)
5365ffd83dbSDimitry Andric 
5375ffd83dbSDimitry Andric namespace llvm {
5385ffd83dbSDimitry Andric FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone) {
5395ffd83dbSDimitry Andric   return new AMDGPUPostLegalizerCombiner(IsOptNone);
5405ffd83dbSDimitry Andric }
5415ffd83dbSDimitry Andric } // end namespace llvm
542