15ffd83dbSDimitry Andric //=== lib/CodeGen/GlobalISel/AMDGPUPreLegalizerCombiner.cpp ---------------===// 25ffd83dbSDimitry Andric // 35ffd83dbSDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 45ffd83dbSDimitry Andric // See https://llvm.org/LICENSE.txt for license information. 55ffd83dbSDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 65ffd83dbSDimitry Andric // 75ffd83dbSDimitry Andric //===----------------------------------------------------------------------===// 85ffd83dbSDimitry Andric // 95ffd83dbSDimitry Andric // This pass does combining of machine instructions at the generic MI level, 105ffd83dbSDimitry Andric // before the legalizer. 115ffd83dbSDimitry Andric // 125ffd83dbSDimitry Andric //===----------------------------------------------------------------------===// 135ffd83dbSDimitry Andric 14e8d8bef9SDimitry Andric #include "AMDGPU.h" 15*fe6060f1SDimitry Andric #include "AMDGPULegalizerInfo.h" 16*fe6060f1SDimitry Andric #include "GCNSubtarget.h" 17*fe6060f1SDimitry Andric #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 185ffd83dbSDimitry Andric #include "llvm/CodeGen/GlobalISel/Combiner.h" 195ffd83dbSDimitry Andric #include "llvm/CodeGen/GlobalISel/CombinerHelper.h" 205ffd83dbSDimitry Andric #include "llvm/CodeGen/GlobalISel/CombinerInfo.h" 215ffd83dbSDimitry Andric #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" 225ffd83dbSDimitry Andric #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" 235ffd83dbSDimitry Andric #include "llvm/CodeGen/MachineDominators.h" 245ffd83dbSDimitry Andric #include "llvm/CodeGen/TargetPassConfig.h" 25e8d8bef9SDimitry Andric #include "llvm/Target/TargetMachine.h" 265ffd83dbSDimitry Andric 275ffd83dbSDimitry Andric #define DEBUG_TYPE "amdgpu-prelegalizer-combiner" 285ffd83dbSDimitry Andric 295ffd83dbSDimitry Andric using namespace llvm; 305ffd83dbSDimitry Andric using namespace MIPatternMatch; 315ffd83dbSDimitry Andric 32*fe6060f1SDimitry Andric class AMDGPUPreLegalizerCombinerHelper { 33*fe6060f1SDimitry Andric protected: 34*fe6060f1SDimitry Andric MachineIRBuilder &B; 35*fe6060f1SDimitry Andric MachineFunction &MF; 36*fe6060f1SDimitry Andric MachineRegisterInfo &MRI; 37*fe6060f1SDimitry Andric CombinerHelper &Helper; 38*fe6060f1SDimitry Andric 39*fe6060f1SDimitry Andric public: 40*fe6060f1SDimitry Andric AMDGPUPreLegalizerCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper) 41*fe6060f1SDimitry Andric : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper){}; 42*fe6060f1SDimitry Andric 43*fe6060f1SDimitry Andric struct ClampI64ToI16MatchInfo { 44*fe6060f1SDimitry Andric int64_t Cmp1 = 0; 45*fe6060f1SDimitry Andric int64_t Cmp2 = 0; 46*fe6060f1SDimitry Andric Register Origin; 47*fe6060f1SDimitry Andric }; 48*fe6060f1SDimitry Andric 49*fe6060f1SDimitry Andric bool matchClampI64ToI16(MachineInstr &MI, MachineRegisterInfo &MRI, 50*fe6060f1SDimitry Andric MachineFunction &MF, 51*fe6060f1SDimitry Andric ClampI64ToI16MatchInfo &MatchInfo); 52*fe6060f1SDimitry Andric 53*fe6060f1SDimitry Andric void applyClampI64ToI16(MachineInstr &MI, 54*fe6060f1SDimitry Andric const ClampI64ToI16MatchInfo &MatchInfo); 55*fe6060f1SDimitry Andric }; 56*fe6060f1SDimitry Andric 57*fe6060f1SDimitry Andric bool AMDGPUPreLegalizerCombinerHelper::matchClampI64ToI16( 58*fe6060f1SDimitry Andric MachineInstr &MI, MachineRegisterInfo &MRI, MachineFunction &MF, 59*fe6060f1SDimitry Andric ClampI64ToI16MatchInfo &MatchInfo) { 60*fe6060f1SDimitry Andric assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Invalid instruction!"); 61*fe6060f1SDimitry Andric 62*fe6060f1SDimitry Andric // Try to find a pattern where an i64 value should get clamped to short. 63*fe6060f1SDimitry Andric const LLT SrcType = MRI.getType(MI.getOperand(1).getReg()); 64*fe6060f1SDimitry Andric if (SrcType != LLT::scalar(64)) 65*fe6060f1SDimitry Andric return false; 66*fe6060f1SDimitry Andric 67*fe6060f1SDimitry Andric const LLT DstType = MRI.getType(MI.getOperand(0).getReg()); 68*fe6060f1SDimitry Andric if (DstType != LLT::scalar(16)) 69*fe6060f1SDimitry Andric return false; 70*fe6060f1SDimitry Andric 71*fe6060f1SDimitry Andric Register Base; 72*fe6060f1SDimitry Andric 73*fe6060f1SDimitry Andric auto IsApplicableForCombine = [&MatchInfo]() -> bool { 74*fe6060f1SDimitry Andric const auto Cmp1 = MatchInfo.Cmp1; 75*fe6060f1SDimitry Andric const auto Cmp2 = MatchInfo.Cmp2; 76*fe6060f1SDimitry Andric const auto Diff = std::abs(Cmp2 - Cmp1); 77*fe6060f1SDimitry Andric 78*fe6060f1SDimitry Andric // If the difference between both comparison values is 0 or 1, there is no 79*fe6060f1SDimitry Andric // need to clamp. 80*fe6060f1SDimitry Andric if (Diff == 0 || Diff == 1) 81*fe6060f1SDimitry Andric return false; 82*fe6060f1SDimitry Andric 83*fe6060f1SDimitry Andric const int64_t Min = std::numeric_limits<int16_t>::min(); 84*fe6060f1SDimitry Andric const int64_t Max = std::numeric_limits<int16_t>::max(); 85*fe6060f1SDimitry Andric 86*fe6060f1SDimitry Andric // Check if the comparison values are between SHORT_MIN and SHORT_MAX. 87*fe6060f1SDimitry Andric return ((Cmp2 >= Cmp1 && Cmp1 >= Min && Cmp2 <= Max) || 88*fe6060f1SDimitry Andric (Cmp1 >= Cmp2 && Cmp1 <= Max && Cmp2 >= Min)); 89*fe6060f1SDimitry Andric }; 90*fe6060f1SDimitry Andric 91*fe6060f1SDimitry Andric // Try to match a combination of min / max MIR opcodes. 92*fe6060f1SDimitry Andric if (mi_match(MI.getOperand(1).getReg(), MRI, 93*fe6060f1SDimitry Andric m_GSMin(m_Reg(Base), m_ICst(MatchInfo.Cmp1)))) { 94*fe6060f1SDimitry Andric if (mi_match(Base, MRI, 95*fe6060f1SDimitry Andric m_GSMax(m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2)))) { 96*fe6060f1SDimitry Andric return IsApplicableForCombine(); 97*fe6060f1SDimitry Andric } 98*fe6060f1SDimitry Andric } 99*fe6060f1SDimitry Andric 100*fe6060f1SDimitry Andric if (mi_match(MI.getOperand(1).getReg(), MRI, 101*fe6060f1SDimitry Andric m_GSMax(m_Reg(Base), m_ICst(MatchInfo.Cmp1)))) { 102*fe6060f1SDimitry Andric if (mi_match(Base, MRI, 103*fe6060f1SDimitry Andric m_GSMin(m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2)))) { 104*fe6060f1SDimitry Andric return IsApplicableForCombine(); 105*fe6060f1SDimitry Andric } 106*fe6060f1SDimitry Andric } 107*fe6060f1SDimitry Andric 108*fe6060f1SDimitry Andric return false; 109*fe6060f1SDimitry Andric } 110*fe6060f1SDimitry Andric 111*fe6060f1SDimitry Andric // We want to find a combination of instructions that 112*fe6060f1SDimitry Andric // gets generated when an i64 gets clamped to i16. 113*fe6060f1SDimitry Andric // The corresponding pattern is: 114*fe6060f1SDimitry Andric // G_MAX / G_MAX for i16 <= G_TRUNC i64. 115*fe6060f1SDimitry Andric // This can be efficiently written as following: 116*fe6060f1SDimitry Andric // v_cvt_pk_i16_i32 v0, v0, v1 117*fe6060f1SDimitry Andric // v_med3_i32 v0, Clamp_Min, v0, Clamp_Max 118*fe6060f1SDimitry Andric void AMDGPUPreLegalizerCombinerHelper::applyClampI64ToI16( 119*fe6060f1SDimitry Andric MachineInstr &MI, const ClampI64ToI16MatchInfo &MatchInfo) { 120*fe6060f1SDimitry Andric 121*fe6060f1SDimitry Andric Register Src = MatchInfo.Origin; 122*fe6060f1SDimitry Andric assert(MI.getParent()->getParent()->getRegInfo().getType(Src) == 123*fe6060f1SDimitry Andric LLT::scalar(64)); 124*fe6060f1SDimitry Andric const LLT S32 = LLT::scalar(32); 125*fe6060f1SDimitry Andric 126*fe6060f1SDimitry Andric B.setMBB(*MI.getParent()); 127*fe6060f1SDimitry Andric B.setInstrAndDebugLoc(MI); 128*fe6060f1SDimitry Andric 129*fe6060f1SDimitry Andric auto Unmerge = B.buildUnmerge(S32, Src); 130*fe6060f1SDimitry Andric 131*fe6060f1SDimitry Andric assert(MI.getOpcode() != AMDGPU::G_AMDGPU_CVT_PK_I16_I32); 132*fe6060f1SDimitry Andric 133*fe6060f1SDimitry Andric const LLT V2S16 = LLT::fixed_vector(2, 16); 134*fe6060f1SDimitry Andric auto CvtPk = 135*fe6060f1SDimitry Andric B.buildInstr(AMDGPU::G_AMDGPU_CVT_PK_I16_I32, {V2S16}, 136*fe6060f1SDimitry Andric {Unmerge.getReg(0), Unmerge.getReg(1)}, MI.getFlags()); 137*fe6060f1SDimitry Andric 138*fe6060f1SDimitry Andric auto MinBoundary = std::min(MatchInfo.Cmp1, MatchInfo.Cmp2); 139*fe6060f1SDimitry Andric auto MaxBoundary = std::max(MatchInfo.Cmp1, MatchInfo.Cmp2); 140*fe6060f1SDimitry Andric auto MinBoundaryDst = B.buildConstant(S32, MinBoundary); 141*fe6060f1SDimitry Andric auto MaxBoundaryDst = B.buildConstant(S32, MaxBoundary); 142*fe6060f1SDimitry Andric 143*fe6060f1SDimitry Andric auto Bitcast = B.buildBitcast({S32}, CvtPk); 144*fe6060f1SDimitry Andric 145*fe6060f1SDimitry Andric auto Med3 = B.buildInstr( 146*fe6060f1SDimitry Andric AMDGPU::G_AMDGPU_SMED3, {S32}, 147*fe6060f1SDimitry Andric {MinBoundaryDst.getReg(0), Bitcast.getReg(0), MaxBoundaryDst.getReg(0)}, 148*fe6060f1SDimitry Andric MI.getFlags()); 149*fe6060f1SDimitry Andric 150*fe6060f1SDimitry Andric B.buildTrunc(MI.getOperand(0).getReg(), Med3); 151*fe6060f1SDimitry Andric 152*fe6060f1SDimitry Andric MI.eraseFromParent(); 153*fe6060f1SDimitry Andric } 154*fe6060f1SDimitry Andric 155*fe6060f1SDimitry Andric class AMDGPUPreLegalizerCombinerHelperState { 156*fe6060f1SDimitry Andric protected: 157*fe6060f1SDimitry Andric CombinerHelper &Helper; 158*fe6060f1SDimitry Andric AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper; 159*fe6060f1SDimitry Andric 160*fe6060f1SDimitry Andric public: 161*fe6060f1SDimitry Andric AMDGPUPreLegalizerCombinerHelperState( 162*fe6060f1SDimitry Andric CombinerHelper &Helper, 163*fe6060f1SDimitry Andric AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper) 164*fe6060f1SDimitry Andric : Helper(Helper), PreLegalizerHelper(PreLegalizerHelper) {} 165*fe6060f1SDimitry Andric }; 166*fe6060f1SDimitry Andric 1675ffd83dbSDimitry Andric #define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS 1685ffd83dbSDimitry Andric #include "AMDGPUGenPreLegalizeGICombiner.inc" 1695ffd83dbSDimitry Andric #undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS 1705ffd83dbSDimitry Andric 1715ffd83dbSDimitry Andric namespace { 1725ffd83dbSDimitry Andric #define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H 1735ffd83dbSDimitry Andric #include "AMDGPUGenPreLegalizeGICombiner.inc" 1745ffd83dbSDimitry Andric #undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H 1755ffd83dbSDimitry Andric 176e8d8bef9SDimitry Andric class AMDGPUPreLegalizerCombinerInfo final : public CombinerInfo { 1775ffd83dbSDimitry Andric GISelKnownBits *KB; 1785ffd83dbSDimitry Andric MachineDominatorTree *MDT; 1795ffd83dbSDimitry Andric 1805ffd83dbSDimitry Andric public: 1815ffd83dbSDimitry Andric AMDGPUGenPreLegalizerCombinerHelperRuleConfig GeneratedRuleCfg; 1825ffd83dbSDimitry Andric 1835ffd83dbSDimitry Andric AMDGPUPreLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize, 1845ffd83dbSDimitry Andric GISelKnownBits *KB, MachineDominatorTree *MDT) 1855ffd83dbSDimitry Andric : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false, 1865ffd83dbSDimitry Andric /*LegalizerInfo*/ nullptr, EnableOpt, OptSize, MinSize), 1875ffd83dbSDimitry Andric KB(KB), MDT(MDT) { 1885ffd83dbSDimitry Andric if (!GeneratedRuleCfg.parseCommandLineOption()) 1895ffd83dbSDimitry Andric report_fatal_error("Invalid rule identifier"); 1905ffd83dbSDimitry Andric } 1915ffd83dbSDimitry Andric 1925ffd83dbSDimitry Andric virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI, 1935ffd83dbSDimitry Andric MachineIRBuilder &B) const override; 1945ffd83dbSDimitry Andric }; 1955ffd83dbSDimitry Andric 1965ffd83dbSDimitry Andric bool AMDGPUPreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer, 1975ffd83dbSDimitry Andric MachineInstr &MI, 1985ffd83dbSDimitry Andric MachineIRBuilder &B) const { 1995ffd83dbSDimitry Andric CombinerHelper Helper(Observer, B, KB, MDT); 200*fe6060f1SDimitry Andric AMDGPUPreLegalizerCombinerHelper PreLegalizerHelper(B, Helper); 201*fe6060f1SDimitry Andric AMDGPUGenPreLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper, 202*fe6060f1SDimitry Andric PreLegalizerHelper); 2035ffd83dbSDimitry Andric 2045ffd83dbSDimitry Andric if (Generated.tryCombineAll(Observer, MI, B, Helper)) 2055ffd83dbSDimitry Andric return true; 2065ffd83dbSDimitry Andric 2075ffd83dbSDimitry Andric switch (MI.getOpcode()) { 208*fe6060f1SDimitry Andric case TargetOpcode::G_MEMCPY_INLINE: 209*fe6060f1SDimitry Andric return Helper.tryEmitMemcpyInline(MI); 2105ffd83dbSDimitry Andric case TargetOpcode::G_CONCAT_VECTORS: 2115ffd83dbSDimitry Andric return Helper.tryCombineConcatVectors(MI); 2125ffd83dbSDimitry Andric case TargetOpcode::G_SHUFFLE_VECTOR: 2135ffd83dbSDimitry Andric return Helper.tryCombineShuffleVector(MI); 2145ffd83dbSDimitry Andric } 2155ffd83dbSDimitry Andric 2165ffd83dbSDimitry Andric return false; 2175ffd83dbSDimitry Andric } 2185ffd83dbSDimitry Andric 2195ffd83dbSDimitry Andric #define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP 2205ffd83dbSDimitry Andric #include "AMDGPUGenPreLegalizeGICombiner.inc" 2215ffd83dbSDimitry Andric #undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP 2225ffd83dbSDimitry Andric 2235ffd83dbSDimitry Andric // Pass boilerplate 2245ffd83dbSDimitry Andric // ================ 2255ffd83dbSDimitry Andric 2265ffd83dbSDimitry Andric class AMDGPUPreLegalizerCombiner : public MachineFunctionPass { 2275ffd83dbSDimitry Andric public: 2285ffd83dbSDimitry Andric static char ID; 2295ffd83dbSDimitry Andric 2305ffd83dbSDimitry Andric AMDGPUPreLegalizerCombiner(bool IsOptNone = false); 2315ffd83dbSDimitry Andric 2325ffd83dbSDimitry Andric StringRef getPassName() const override { 2335ffd83dbSDimitry Andric return "AMDGPUPreLegalizerCombiner"; 2345ffd83dbSDimitry Andric } 2355ffd83dbSDimitry Andric 2365ffd83dbSDimitry Andric bool runOnMachineFunction(MachineFunction &MF) override; 2375ffd83dbSDimitry Andric 2385ffd83dbSDimitry Andric void getAnalysisUsage(AnalysisUsage &AU) const override; 2395ffd83dbSDimitry Andric private: 2405ffd83dbSDimitry Andric bool IsOptNone; 2415ffd83dbSDimitry Andric }; 2425ffd83dbSDimitry Andric } // end anonymous namespace 2435ffd83dbSDimitry Andric 2445ffd83dbSDimitry Andric void AMDGPUPreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { 2455ffd83dbSDimitry Andric AU.addRequired<TargetPassConfig>(); 2465ffd83dbSDimitry Andric AU.setPreservesCFG(); 2475ffd83dbSDimitry Andric getSelectionDAGFallbackAnalysisUsage(AU); 2485ffd83dbSDimitry Andric AU.addRequired<GISelKnownBitsAnalysis>(); 2495ffd83dbSDimitry Andric AU.addPreserved<GISelKnownBitsAnalysis>(); 2505ffd83dbSDimitry Andric if (!IsOptNone) { 2515ffd83dbSDimitry Andric AU.addRequired<MachineDominatorTree>(); 2525ffd83dbSDimitry Andric AU.addPreserved<MachineDominatorTree>(); 2535ffd83dbSDimitry Andric } 254*fe6060f1SDimitry Andric 255*fe6060f1SDimitry Andric AU.addRequired<GISelCSEAnalysisWrapperPass>(); 256*fe6060f1SDimitry Andric AU.addPreserved<GISelCSEAnalysisWrapperPass>(); 2575ffd83dbSDimitry Andric MachineFunctionPass::getAnalysisUsage(AU); 2585ffd83dbSDimitry Andric } 2595ffd83dbSDimitry Andric 2605ffd83dbSDimitry Andric AMDGPUPreLegalizerCombiner::AMDGPUPreLegalizerCombiner(bool IsOptNone) 2615ffd83dbSDimitry Andric : MachineFunctionPass(ID), IsOptNone(IsOptNone) { 2625ffd83dbSDimitry Andric initializeAMDGPUPreLegalizerCombinerPass(*PassRegistry::getPassRegistry()); 2635ffd83dbSDimitry Andric } 2645ffd83dbSDimitry Andric 2655ffd83dbSDimitry Andric bool AMDGPUPreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { 2665ffd83dbSDimitry Andric if (MF.getProperties().hasProperty( 2675ffd83dbSDimitry Andric MachineFunctionProperties::Property::FailedISel)) 2685ffd83dbSDimitry Andric return false; 2695ffd83dbSDimitry Andric auto *TPC = &getAnalysis<TargetPassConfig>(); 2705ffd83dbSDimitry Andric const Function &F = MF.getFunction(); 2715ffd83dbSDimitry Andric bool EnableOpt = 2725ffd83dbSDimitry Andric MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F); 2735ffd83dbSDimitry Andric GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF); 2745ffd83dbSDimitry Andric MachineDominatorTree *MDT = 2755ffd83dbSDimitry Andric IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>(); 2765ffd83dbSDimitry Andric AMDGPUPreLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(), 2775ffd83dbSDimitry Andric F.hasMinSize(), KB, MDT); 278*fe6060f1SDimitry Andric // Enable CSE. 279*fe6060f1SDimitry Andric GISelCSEAnalysisWrapper &Wrapper = 280*fe6060f1SDimitry Andric getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper(); 281*fe6060f1SDimitry Andric auto *CSEInfo = &Wrapper.get(TPC->getCSEConfig()); 282*fe6060f1SDimitry Andric 2835ffd83dbSDimitry Andric Combiner C(PCInfo, TPC); 284*fe6060f1SDimitry Andric return C.combineMachineInstrs(MF, CSEInfo); 2855ffd83dbSDimitry Andric } 2865ffd83dbSDimitry Andric 2875ffd83dbSDimitry Andric char AMDGPUPreLegalizerCombiner::ID = 0; 2885ffd83dbSDimitry Andric INITIALIZE_PASS_BEGIN(AMDGPUPreLegalizerCombiner, DEBUG_TYPE, 2895ffd83dbSDimitry Andric "Combine AMDGPU machine instrs before legalization", 2905ffd83dbSDimitry Andric false, false) 2915ffd83dbSDimitry Andric INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) 2925ffd83dbSDimitry Andric INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis) 2935ffd83dbSDimitry Andric INITIALIZE_PASS_END(AMDGPUPreLegalizerCombiner, DEBUG_TYPE, 2945ffd83dbSDimitry Andric "Combine AMDGPU machine instrs before legalization", false, 2955ffd83dbSDimitry Andric false) 2965ffd83dbSDimitry Andric 2975ffd83dbSDimitry Andric namespace llvm { 2985ffd83dbSDimitry Andric FunctionPass *createAMDGPUPreLegalizeCombiner(bool IsOptNone) { 2995ffd83dbSDimitry Andric return new AMDGPUPreLegalizerCombiner(IsOptNone); 3005ffd83dbSDimitry Andric } 3015ffd83dbSDimitry Andric } // end namespace llvm 302