1 //=== lib/CodeGen/GlobalISel/AMDGPUPreLegalizerCombiner.cpp ---------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This pass does combining of machine instructions at the generic MI level, 10 // before the legalizer. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "AMDGPU.h" 15 #include "AMDGPULegalizerInfo.h" 16 #include "GCNSubtarget.h" 17 #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 18 #include "llvm/CodeGen/GlobalISel/Combiner.h" 19 #include "llvm/CodeGen/GlobalISel/CombinerHelper.h" 20 #include "llvm/CodeGen/GlobalISel/CombinerInfo.h" 21 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" 22 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" 23 #include "llvm/CodeGen/MachineDominators.h" 24 #include "llvm/CodeGen/TargetPassConfig.h" 25 #include "llvm/Target/TargetMachine.h" 26 27 #define DEBUG_TYPE "amdgpu-prelegalizer-combiner" 28 29 using namespace llvm; 30 using namespace MIPatternMatch; 31 32 class AMDGPUPreLegalizerCombinerHelper { 33 protected: 34 MachineIRBuilder &B; 35 MachineFunction &MF; 36 MachineRegisterInfo &MRI; 37 CombinerHelper &Helper; 38 39 public: 40 AMDGPUPreLegalizerCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper) 41 : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper){}; 42 43 struct ClampI64ToI16MatchInfo { 44 int64_t Cmp1 = 0; 45 int64_t Cmp2 = 0; 46 Register Origin; 47 }; 48 49 bool matchClampI64ToI16(MachineInstr &MI, MachineRegisterInfo &MRI, 50 MachineFunction &MF, 51 ClampI64ToI16MatchInfo &MatchInfo); 52 53 void applyClampI64ToI16(MachineInstr &MI, 54 const ClampI64ToI16MatchInfo &MatchInfo); 55 }; 56 57 bool AMDGPUPreLegalizerCombinerHelper::matchClampI64ToI16( 58 MachineInstr &MI, MachineRegisterInfo &MRI, MachineFunction &MF, 59 ClampI64ToI16MatchInfo &MatchInfo) { 60 assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Invalid instruction!"); 61 62 // Try to find a pattern where an i64 value should get clamped to short. 63 const LLT SrcType = MRI.getType(MI.getOperand(1).getReg()); 64 if (SrcType != LLT::scalar(64)) 65 return false; 66 67 const LLT DstType = MRI.getType(MI.getOperand(0).getReg()); 68 if (DstType != LLT::scalar(16)) 69 return false; 70 71 Register Base; 72 73 auto IsApplicableForCombine = [&MatchInfo]() -> bool { 74 const auto Cmp1 = MatchInfo.Cmp1; 75 const auto Cmp2 = MatchInfo.Cmp2; 76 const auto Diff = std::abs(Cmp2 - Cmp1); 77 78 // If the difference between both comparison values is 0 or 1, there is no 79 // need to clamp. 80 if (Diff == 0 || Diff == 1) 81 return false; 82 83 const int64_t Min = std::numeric_limits<int16_t>::min(); 84 const int64_t Max = std::numeric_limits<int16_t>::max(); 85 86 // Check if the comparison values are between SHORT_MIN and SHORT_MAX. 87 return ((Cmp2 >= Cmp1 && Cmp1 >= Min && Cmp2 <= Max) || 88 (Cmp1 >= Cmp2 && Cmp1 <= Max && Cmp2 >= Min)); 89 }; 90 91 // Try to match a combination of min / max MIR opcodes. 92 if (mi_match(MI.getOperand(1).getReg(), MRI, 93 m_GSMin(m_Reg(Base), m_ICst(MatchInfo.Cmp1)))) { 94 if (mi_match(Base, MRI, 95 m_GSMax(m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2)))) { 96 return IsApplicableForCombine(); 97 } 98 } 99 100 if (mi_match(MI.getOperand(1).getReg(), MRI, 101 m_GSMax(m_Reg(Base), m_ICst(MatchInfo.Cmp1)))) { 102 if (mi_match(Base, MRI, 103 m_GSMin(m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2)))) { 104 return IsApplicableForCombine(); 105 } 106 } 107 108 return false; 109 } 110 111 // We want to find a combination of instructions that 112 // gets generated when an i64 gets clamped to i16. 113 // The corresponding pattern is: 114 // G_MAX / G_MAX for i16 <= G_TRUNC i64. 115 // This can be efficiently written as following: 116 // v_cvt_pk_i16_i32 v0, v0, v1 117 // v_med3_i32 v0, Clamp_Min, v0, Clamp_Max 118 void AMDGPUPreLegalizerCombinerHelper::applyClampI64ToI16( 119 MachineInstr &MI, const ClampI64ToI16MatchInfo &MatchInfo) { 120 121 Register Src = MatchInfo.Origin; 122 assert(MI.getParent()->getParent()->getRegInfo().getType(Src) == 123 LLT::scalar(64)); 124 const LLT S32 = LLT::scalar(32); 125 126 B.setMBB(*MI.getParent()); 127 B.setInstrAndDebugLoc(MI); 128 129 auto Unmerge = B.buildUnmerge(S32, Src); 130 131 assert(MI.getOpcode() != AMDGPU::G_AMDGPU_CVT_PK_I16_I32); 132 133 const LLT V2S16 = LLT::fixed_vector(2, 16); 134 auto CvtPk = 135 B.buildInstr(AMDGPU::G_AMDGPU_CVT_PK_I16_I32, {V2S16}, 136 {Unmerge.getReg(0), Unmerge.getReg(1)}, MI.getFlags()); 137 138 auto MinBoundary = std::min(MatchInfo.Cmp1, MatchInfo.Cmp2); 139 auto MaxBoundary = std::max(MatchInfo.Cmp1, MatchInfo.Cmp2); 140 auto MinBoundaryDst = B.buildConstant(S32, MinBoundary); 141 auto MaxBoundaryDst = B.buildConstant(S32, MaxBoundary); 142 143 auto Bitcast = B.buildBitcast({S32}, CvtPk); 144 145 auto Med3 = B.buildInstr( 146 AMDGPU::G_AMDGPU_SMED3, {S32}, 147 {MinBoundaryDst.getReg(0), Bitcast.getReg(0), MaxBoundaryDst.getReg(0)}, 148 MI.getFlags()); 149 150 B.buildTrunc(MI.getOperand(0).getReg(), Med3); 151 152 MI.eraseFromParent(); 153 } 154 155 class AMDGPUPreLegalizerCombinerHelperState { 156 protected: 157 CombinerHelper &Helper; 158 AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper; 159 160 public: 161 AMDGPUPreLegalizerCombinerHelperState( 162 CombinerHelper &Helper, 163 AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper) 164 : Helper(Helper), PreLegalizerHelper(PreLegalizerHelper) {} 165 }; 166 167 #define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS 168 #include "AMDGPUGenPreLegalizeGICombiner.inc" 169 #undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS 170 171 namespace { 172 #define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H 173 #include "AMDGPUGenPreLegalizeGICombiner.inc" 174 #undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H 175 176 class AMDGPUPreLegalizerCombinerInfo final : public CombinerInfo { 177 GISelKnownBits *KB; 178 MachineDominatorTree *MDT; 179 180 public: 181 AMDGPUGenPreLegalizerCombinerHelperRuleConfig GeneratedRuleCfg; 182 183 AMDGPUPreLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize, 184 GISelKnownBits *KB, MachineDominatorTree *MDT) 185 : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false, 186 /*LegalizerInfo*/ nullptr, EnableOpt, OptSize, MinSize), 187 KB(KB), MDT(MDT) { 188 if (!GeneratedRuleCfg.parseCommandLineOption()) 189 report_fatal_error("Invalid rule identifier"); 190 } 191 192 virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI, 193 MachineIRBuilder &B) const override; 194 }; 195 196 bool AMDGPUPreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer, 197 MachineInstr &MI, 198 MachineIRBuilder &B) const { 199 CombinerHelper Helper(Observer, B, KB, MDT); 200 AMDGPUPreLegalizerCombinerHelper PreLegalizerHelper(B, Helper); 201 AMDGPUGenPreLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper, 202 PreLegalizerHelper); 203 204 if (Generated.tryCombineAll(Observer, MI, B, Helper)) 205 return true; 206 207 switch (MI.getOpcode()) { 208 case TargetOpcode::G_MEMCPY_INLINE: 209 return Helper.tryEmitMemcpyInline(MI); 210 case TargetOpcode::G_CONCAT_VECTORS: 211 return Helper.tryCombineConcatVectors(MI); 212 case TargetOpcode::G_SHUFFLE_VECTOR: 213 return Helper.tryCombineShuffleVector(MI); 214 } 215 216 return false; 217 } 218 219 #define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP 220 #include "AMDGPUGenPreLegalizeGICombiner.inc" 221 #undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP 222 223 // Pass boilerplate 224 // ================ 225 226 class AMDGPUPreLegalizerCombiner : public MachineFunctionPass { 227 public: 228 static char ID; 229 230 AMDGPUPreLegalizerCombiner(bool IsOptNone = false); 231 232 StringRef getPassName() const override { 233 return "AMDGPUPreLegalizerCombiner"; 234 } 235 236 bool runOnMachineFunction(MachineFunction &MF) override; 237 238 void getAnalysisUsage(AnalysisUsage &AU) const override; 239 private: 240 bool IsOptNone; 241 }; 242 } // end anonymous namespace 243 244 void AMDGPUPreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const { 245 AU.addRequired<TargetPassConfig>(); 246 AU.setPreservesCFG(); 247 getSelectionDAGFallbackAnalysisUsage(AU); 248 AU.addRequired<GISelKnownBitsAnalysis>(); 249 AU.addPreserved<GISelKnownBitsAnalysis>(); 250 if (!IsOptNone) { 251 AU.addRequired<MachineDominatorTree>(); 252 AU.addPreserved<MachineDominatorTree>(); 253 } 254 255 AU.addRequired<GISelCSEAnalysisWrapperPass>(); 256 AU.addPreserved<GISelCSEAnalysisWrapperPass>(); 257 MachineFunctionPass::getAnalysisUsage(AU); 258 } 259 260 AMDGPUPreLegalizerCombiner::AMDGPUPreLegalizerCombiner(bool IsOptNone) 261 : MachineFunctionPass(ID), IsOptNone(IsOptNone) { 262 initializeAMDGPUPreLegalizerCombinerPass(*PassRegistry::getPassRegistry()); 263 } 264 265 bool AMDGPUPreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { 266 if (MF.getProperties().hasProperty( 267 MachineFunctionProperties::Property::FailedISel)) 268 return false; 269 auto *TPC = &getAnalysis<TargetPassConfig>(); 270 const Function &F = MF.getFunction(); 271 bool EnableOpt = 272 MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F); 273 GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF); 274 MachineDominatorTree *MDT = 275 IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>(); 276 AMDGPUPreLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(), 277 F.hasMinSize(), KB, MDT); 278 // Enable CSE. 279 GISelCSEAnalysisWrapper &Wrapper = 280 getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper(); 281 auto *CSEInfo = &Wrapper.get(TPC->getCSEConfig()); 282 283 Combiner C(PCInfo, TPC); 284 return C.combineMachineInstrs(MF, CSEInfo); 285 } 286 287 char AMDGPUPreLegalizerCombiner::ID = 0; 288 INITIALIZE_PASS_BEGIN(AMDGPUPreLegalizerCombiner, DEBUG_TYPE, 289 "Combine AMDGPU machine instrs before legalization", 290 false, false) 291 INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) 292 INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis) 293 INITIALIZE_PASS_END(AMDGPUPreLegalizerCombiner, DEBUG_TYPE, 294 "Combine AMDGPU machine instrs before legalization", false, 295 false) 296 297 namespace llvm { 298 FunctionPass *createAMDGPUPreLegalizeCombiner(bool IsOptNone) { 299 return new AMDGPUPreLegalizerCombiner(IsOptNone); 300 } 301 } // end namespace llvm 302