//=== lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does combining of machine instructions at the generic MI level,
// before the legalizer.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUCombinerHelper.h"
#include "AMDGPULegalizerInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutor.h"
#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Target/TargetMachine.h"

// Extra includes required by the TableGen-generated combiner match table.
#define GET_GICOMBINER_DEPS
#include "AMDGPUGenPreLegalizeGICombiner.inc"
#undef GET_GICOMBINER_DEPS

#define DEBUG_TYPE "amdgpu-prelegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;
namespace {

// Generated type declarations (e.g. the RuleConfig type used below).
#define GET_GICOMBINER_TYPES
#include "AMDGPUGenPreLegalizeGICombiner.inc"
#undef GET_GICOMBINER_TYPES

/// Executes the generated pre-legalizer combine rules plus the hand-written
/// C++ match/apply pairs declared below (currently the i64->i16 clamp
/// combine). One instance is created per combined instruction by
/// AMDGPUPreLegalizerCombinerInfo::combine (see TODO there).
class AMDGPUPreLegalizerCombinerImpl : public GIMatchTableExecutor {
protected:
  // Enabled/disabled state for the generated rules (parsed from the
  // command line by the owning CombinerInfo).
  const AMDGPUPreLegalizerCombinerImplRuleConfig &RuleConfig;
  const GCNSubtarget &STI;
  GISelChangeObserver &Observer;
  // Builder used by the apply functions; MF/MRI are derived from it in the
  // constructor.
  MachineIRBuilder &B;
  MachineFunction &MF;
  MachineRegisterInfo &MRI;
  AMDGPUCombinerHelper &Helper;

public:
  AMDGPUPreLegalizerCombinerImpl(
      const AMDGPUPreLegalizerCombinerImplRuleConfig &RuleConfig,
      const GCNSubtarget &STI, GISelChangeObserver &Observer,
      MachineIRBuilder &B, AMDGPUCombinerHelper &Helper);

  static const char *getName() { return "AMDGPUPreLegalizerCombinerImpl"; }

  /// Generated entry point: tries every enabled combine rule on \p I.
  bool tryCombineAll(MachineInstr &I) const;

  /// State shared between matchClampI64ToI16 and applyClampI64ToI16.
  struct ClampI64ToI16MatchInfo {
    int64_t Cmp1 = 0;    // Constant operand of the outer min/max.
    int64_t Cmp2 = 0;    // Constant operand of the inner min/max.
    Register Origin;     // The original (unclamped) s64 value.
  };

  /// Matches a G_TRUNC s64 -> s16 fed by a G_SMIN/G_SMAX (in either
  /// nesting order) against constants, i.e. an i64 value clamped to i16.
  /// \p MRI / \p MF shadow the members; presumably this signature is
  /// required by the generated match-table glue — TODO confirm (\p MF is
  /// unused in the body).
  bool matchClampI64ToI16(MachineInstr &MI, const MachineRegisterInfo &MRI,
                          const MachineFunction &MF,
                          ClampI64ToI16MatchInfo &MatchInfo) const;

  /// Rewrites the matched clamp into CVT_PK_I16_I32 + SMED3 (see the
  /// comment block above the definition).
  void applyClampI64ToI16(MachineInstr &MI,
                          const ClampI64ToI16MatchInfo &MatchInfo) const;

private:
  // Generated per-rule members. AMDGPUSubtarget is temporarily remapped to
  // GCNSubtarget so the generated code binds to the GCN subtarget class.
#define GET_GICOMBINER_CLASS_MEMBERS
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenPreLegalizeGICombiner.inc"
#undef GET_GICOMBINER_CLASS_MEMBERS
#undef AMDGPUSubtarget
};

// Generated implementation of tryCombineAll and the rule predicates.
#define GET_GICOMBINER_IMPL
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenPreLegalizeGICombiner.inc"
#undef AMDGPUSubtarget
#undef GET_GICOMBINER_IMPL

AMDGPUPreLegalizerCombinerImpl::AMDGPUPreLegalizerCombinerImpl(
    const AMDGPUPreLegalizerCombinerImplRuleConfig &RuleConfig,
    const GCNSubtarget &STI, GISelChangeObserver &Observer, MachineIRBuilder &B,
    AMDGPUCombinerHelper &Helper)
    : RuleConfig(RuleConfig), STI(STI), Observer(Observer), B(B), MF(B.getMF()),
      MRI(*B.getMRI()), Helper(Helper),
      // Generated member initializers must come last in this init list.
#define GET_GICOMBINER_CONSTRUCTOR_INITS
#include "AMDGPUGenPreLegalizeGICombiner.inc"
#undef GET_GICOMBINER_CONSTRUCTOR_INITS
{
}

bool AMDGPUPreLegalizerCombinerImpl::matchClampI64ToI16(
    MachineInstr &MI, const MachineRegisterInfo &MRI, const MachineFunction &MF,
    ClampI64ToI16MatchInfo &MatchInfo) const {
  assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Invalid instruction!");

  // Try to find a pattern where an i64 value should get clamped to short.
  // Only a G_TRUNC from s64 ...
  const LLT SrcType = MRI.getType(MI.getOperand(1).getReg());
  if (SrcType != LLT::scalar(64))
    return false;

  // ... down to s16 is of interest.
  const LLT DstType = MRI.getType(MI.getOperand(0).getReg());
  if (DstType != LLT::scalar(16))
    return false;

  Register Base;

  // Decide, from the two clamp constants recorded in MatchInfo, whether
  // rewriting to the packed-conversion + med3 form is worthwhile and legal.
  auto IsApplicableForCombine = [&MatchInfo]() -> bool {
    const auto Cmp1 = MatchInfo.Cmp1;
    const auto Cmp2 = MatchInfo.Cmp2;
    const auto Diff = std::abs(Cmp2 - Cmp1);

    // If the difference between both comparison values is 0 or 1, there is no
    // need to clamp.
    if (Diff == 0 || Diff == 1)
      return false;

    const int64_t Min = std::numeric_limits<int16_t>::min();
    const int64_t Max = std::numeric_limits<int16_t>::max();

    // Check if the comparison values are between SHORT_MIN and SHORT_MAX.
    // (Both orderings of Cmp1/Cmp2 are accepted; the apply side takes
    // min/max of the pair, so order does not matter there.)
    return ((Cmp2 >= Cmp1 && Cmp1 >= Min && Cmp2 <= Max) ||
            (Cmp1 >= Cmp2 && Cmp1 <= Max && Cmp2 >= Min));
  };

  // Try to match a combination of min / max MIR opcodes:
  // first smin(smax(Origin, Cmp2), Cmp1) ...
  if (mi_match(MI.getOperand(1).getReg(), MRI,
               m_GSMin(m_Reg(Base), m_ICst(MatchInfo.Cmp1)))) {
    if (mi_match(Base, MRI,
                 m_GSMax(m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2)))) {
      return IsApplicableForCombine();
    }
  }

  // ... then the reverse nesting, smax(smin(Origin, Cmp2), Cmp1).
  if (mi_match(MI.getOperand(1).getReg(), MRI,
               m_GSMax(m_Reg(Base), m_ICst(MatchInfo.Cmp1)))) {
    if (mi_match(Base, MRI,
                 m_GSMin(m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2)))) {
      return IsApplicableForCombine();
    }
  }

  return false;
}

// We want to find a combination of instructions that
// gets generated when an i64 gets clamped to i16.
// The corresponding pattern is:
// G_SMIN / G_SMAX (in either nesting order) for i16 <= G_TRUNC i64.
// This can be efficiently written as following:
// v_cvt_pk_i16_i32 v0, v0, v1
// v_med3_i32 v0, Clamp_Min, v0, Clamp_Max
void AMDGPUPreLegalizerCombinerImpl::applyClampI64ToI16(
    MachineInstr &MI, const ClampI64ToI16MatchInfo &MatchInfo) const {

  Register Src = MatchInfo.Origin;
  assert(MI.getParent()->getParent()->getRegInfo().getType(Src) ==
         LLT::scalar(64));
  const LLT S32 = LLT::scalar(32);

  // All new instructions are inserted at MI, carrying its debug location.
  B.setInstrAndDebugLoc(MI);

  // Split the s64 source into its low/high s32 halves.
  auto Unmerge = B.buildUnmerge(S32, Src);

  assert(MI.getOpcode() != AMDGPU::G_AMDGPU_CVT_PK_I16_I32);

  // Pack both halves; presumably selected to v_cvt_pk_i16_i32 (see the
  // comment above) — confirm against the AMDGPU instruction selector.
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  auto CvtPk =
      B.buildInstr(AMDGPU::G_AMDGPU_CVT_PK_I16_I32, {V2S16},
                   {Unmerge.getReg(0), Unmerge.getReg(1)}, MI.getFlags());

  // The matcher accepts the two clamp constants in either order; normalize
  // them into a [min, max] pair here.
  auto MinBoundary = std::min(MatchInfo.Cmp1, MatchInfo.Cmp2);
  auto MaxBoundary = std::max(MatchInfo.Cmp1, MatchInfo.Cmp2);
  auto MinBoundaryDst = B.buildConstant(S32, MinBoundary);
  auto MaxBoundaryDst = B.buildConstant(S32, MaxBoundary);

  // Reinterpret the packed <2 x s16> as a single s32 for the med3.
  auto Bitcast = B.buildBitcast({S32}, CvtPk);

  // med3(min, x, max) performs the clamp in one instruction.
  auto Med3 = B.buildInstr(
      AMDGPU::G_AMDGPU_SMED3, {S32},
      {MinBoundaryDst.getReg(0), Bitcast.getReg(0), MaxBoundaryDst.getReg(0)},
      MI.getFlags());

  // Write the final s16 result into the original G_TRUNC's destination so
  // all users are preserved, then delete the matched instruction.
  B.buildTrunc(MI.getOperand(0).getReg(), Med3);

  MI.eraseFromParent();
}

/// CombinerInfo driving the generic Combiner loop for this pass: owns the
/// rule configuration and creates the Impl for each instruction visited.
class AMDGPUPreLegalizerCombinerInfo final : public CombinerInfo {
  GISelKnownBits *KB;
  MachineDominatorTree *MDT;   // Null at -O0 (IsOptNone).
  AMDGPUPreLegalizerCombinerImplRuleConfig RuleConfig;

public:
  AMDGPUPreLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
                                 GISelKnownBits *KB, MachineDominatorTree *MDT)
      : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
                     /*LegalizerInfo*/ nullptr, EnableOpt, OptSize, MinSize),
        KB(KB), MDT(MDT) {
    // Parse -amdgpuprelegalizercombiner-disable-rule style options; a bad
    // rule name is a hard error.
    if (!RuleConfig.parseCommandLineOption())
      report_fatal_error("Invalid rule identifier");
  }

  /// Called by the Combiner for every instruction; returns true if \p MI
  /// was combined (and the worklist should be revisited).
  bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
               MachineIRBuilder &B) const override;
};

bool AMDGPUPreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
                                             MachineInstr &MI,
                                             MachineIRBuilder &B) const {
  const auto *LI = MI.getMF()->getSubtarget().getLegalizerInfo();
  AMDGPUCombinerHelper Helper(Observer, B, /*IsPreLegalize*/ true, KB, MDT, LI);

  const GCNSubtarget &STI = MI.getMF()->getSubtarget<GCNSubtarget>();
  // TODO: Do not re-create the Impl on every inst, it should be per function.
  AMDGPUPreLegalizerCombinerImpl Impl(RuleConfig, STI, Observer, B, Helper);
  Impl.setupMF(*MI.getMF(), KB);

  // First give the generated rules (and the clamp combine) a chance.
  if (Impl.tryCombineAll(MI))
    return true;

  // A few generic combines are not table-driven and are dispatched by hand.
  switch (MI.getOpcode()) {
  case TargetOpcode::G_CONCAT_VECTORS:
    return Helper.tryCombineConcatVectors(MI);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return Helper.tryCombineShuffleVector(MI);
  }

  return false;
}

// Pass boilerplate
// ================

/// Legacy MachineFunctionPass wrapper that runs the pre-legalizer combiner
/// over a function. Created via createAMDGPUPreLegalizeCombiner.
class AMDGPUPreLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUPreLegalizerCombiner(bool IsOptNone = false);

  StringRef getPassName() const override {
    return "AMDGPUPreLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  // True at -O0: skip the dominator-tree requirement below.
  bool IsOptNone;
};
} // end anonymous namespace

void AMDGPUPreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  // The dominator tree is only used by some combines; don't compute it at
  // -O0.
  if (!IsOptNone) {
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
  }

  // CSE across combines (see runOnMachineFunction).
  AU.addRequired<GISelCSEAnalysisWrapperPass>();
  AU.addPreserved<GISelCSEAnalysisWrapperPass>();
  MachineFunctionPass::getAnalysisUsage(AU);
}

AMDGPUPreLegalizerCombiner::AMDGPUPreLegalizerCombiner(bool IsOptNone)
    : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
  initializeAMDGPUPreLegalizerCombinerPass(*PassRegistry::getPassRegistry());
}

bool AMDGPUPreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  // If instruction selection already failed for this function, don't touch
  // the (possibly inconsistent) MIR.
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;
  auto *TPC = &getAnalysis<TargetPassConfig>();
  const Function &F = MF.getFunction();
  // Optimizing combines run only when optimization is enabled and the
  // function is not excluded (e.g. by optnone / -opt-bisect).
  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);
  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
  MachineDominatorTree *MDT =
      IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
  AMDGPUPreLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
                                        F.hasMinSize(), KB, MDT);
  // Enable CSE.
  GISelCSEAnalysisWrapper &Wrapper =
      getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper();
  auto *CSEInfo = &Wrapper.get(TPC->getCSEConfig());

  Combiner C(PCInfo, TPC);
  return C.combineMachineInstrs(MF, CSEInfo);
}

char AMDGPUPreLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AMDGPUPreLegalizerCombiner, DEBUG_TYPE,
                      "Combine AMDGPU machine instrs before legalization",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AMDGPUPreLegalizerCombiner, DEBUG_TYPE,
                    "Combine AMDGPU machine instrs before legalization", false,
                    false)

namespace llvm {
// Factory used by the AMDGPU target pass pipeline.
FunctionPass *createAMDGPUPreLegalizeCombiner(bool IsOptNone) {
  return new AMDGPUPreLegalizerCombiner(IsOptNone);
}
} // end namespace llvm