//=== lib/CodeGen/GlobalISel/AMDGPUPreLegalizerCombiner.cpp ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does combining of machine instructions at the generic MI level,
// before the legalizer.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUCombinerHelper.h"
#include "AMDGPULegalizerInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Target/TargetMachine.h"
#include <algorithm>
#include <cstdlib>
#include <limits>

#define GET_GICOMBINER_DEPS
#include "AMDGPUGenPreLegalizeGICombiner.inc"
#undef GET_GICOMBINER_DEPS

#define DEBUG_TYPE "amdgpu-prelegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;
namespace {

#define GET_GICOMBINER_TYPES
#include "AMDGPUGenPreLegalizeGICombiner.inc"
#undef GET_GICOMBINER_TYPES
class AMDGPUPreLegalizerCombinerImpl : public Combiner {
protected:
  const AMDGPUPreLegalizerCombinerImplRuleConfig &RuleConfig;
  const GCNSubtarget &STI;
  // TODO: Make CombinerHelper methods const.
  mutable AMDGPUCombinerHelper Helper;

public:
  AMDGPUPreLegalizerCombinerImpl(
      MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
      GISelKnownBits &KB, GISelCSEInfo *CSEInfo,
      const AMDGPUPreLegalizerCombinerImplRuleConfig &RuleConfig,
      const GCNSubtarget &STI, MachineDominatorTree *MDT,
      const LegalizerInfo *LI);

  static const char *getName() { return "AMDGPUPreLegalizerCombinerImpl"; }

  bool tryCombineAllImpl(MachineInstr &MI) const;
  bool tryCombineAll(MachineInstr &I) const override;

  struct ClampI64ToI16MatchInfo {
    int64_t Cmp1 = 0;
    int64_t Cmp2 = 0;
    Register Origin;
  };

  bool matchClampI64ToI16(MachineInstr &MI, const MachineRegisterInfo &MRI,
                          const MachineFunction &MF,
                          ClampI64ToI16MatchInfo &MatchInfo) const;

  void applyClampI64ToI16(MachineInstr &MI,
                          const ClampI64ToI16MatchInfo &MatchInfo) const;

private:
#define GET_GICOMBINER_CLASS_MEMBERS
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenPreLegalizeGICombiner.inc"
#undef GET_GICOMBINER_CLASS_MEMBERS
#undef AMDGPUSubtarget
};

#define GET_GICOMBINER_IMPL
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenPreLegalizeGICombiner.inc"
#undef AMDGPUSubtarget
#undef GET_GICOMBINER_IMPL

AMDGPUPreLegalizerCombinerImpl::AMDGPUPreLegalizerCombinerImpl(
    MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
    GISelKnownBits &KB, GISelCSEInfo *CSEInfo,
    const AMDGPUPreLegalizerCombinerImplRuleConfig &RuleConfig,
    const GCNSubtarget &STI, MachineDominatorTree *MDT, const LegalizerInfo *LI)
    : Combiner(MF, CInfo, TPC, &KB, CSEInfo), RuleConfig(RuleConfig), STI(STI),
      Helper(Observer, B, /*IsPreLegalize*/ true, &KB, MDT, LI),
#define GET_GICOMBINER_CONSTRUCTOR_INITS
#include "AMDGPUGenPreLegalizeGICombiner.inc"
#undef GET_GICOMBINER_CONSTRUCTOR_INITS
{
}

bool AMDGPUPreLegalizerCombinerImpl::tryCombineAll(MachineInstr &MI) const {
  // Run the TableGen-generated combine rules first, then fall back to the
  // hand-written combines below.
  if (tryCombineAllImpl(MI))
    return true;

  switch (MI.getOpcode()) {
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return Helper.tryCombineShuffleVector(MI);
  }

  return false;
}

bool AMDGPUPreLegalizerCombinerImpl::matchClampI64ToI16(
    MachineInstr &MI, const MachineRegisterInfo &MRI, const MachineFunction &MF,
    ClampI64ToI16MatchInfo &MatchInfo) const {
  assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Invalid instruction!");

  // Try to find a pattern where an i64 value should get clamped to short.
  const LLT SrcType = MRI.getType(MI.getOperand(1).getReg());
  if (SrcType != LLT::scalar(64))
    return false;

  const LLT DstType = MRI.getType(MI.getOperand(0).getReg());
  if (DstType != LLT::scalar(16))
    return false;

  Register Base;

  auto IsApplicableForCombine = [&MatchInfo]() -> bool {
    const auto Cmp1 = MatchInfo.Cmp1;
    const auto Cmp2 = MatchInfo.Cmp2;
    const auto Diff = std::abs(Cmp2 - Cmp1);

    // If the difference between both comparison values is 0 or 1, there is no
    // need to clamp.
    if (Diff == 0 || Diff == 1)
      return false;

    const int64_t Min = std::numeric_limits<int16_t>::min();
    const int64_t Max = std::numeric_limits<int16_t>::max();

    // Check if the comparison values are between SHORT_MIN and SHORT_MAX.
    return ((Cmp2 >= Cmp1 && Cmp1 >= Min && Cmp2 <= Max) ||
            (Cmp1 >= Cmp2 && Cmp1 <= Max && Cmp2 >= Min));
  };

  // Try to match a combination of min / max MIR opcodes.
  if (mi_match(MI.getOperand(1).getReg(), MRI,
               m_GSMin(m_Reg(Base), m_ICst(MatchInfo.Cmp1)))) {
    if (mi_match(Base, MRI,
                 m_GSMax(m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2)))) {
      return IsApplicableForCombine();
    }
  }

  if (mi_match(MI.getOperand(1).getReg(), MRI,
               m_GSMax(m_Reg(Base), m_ICst(MatchInfo.Cmp1)))) {
    if (mi_match(Base, MRI,
                 m_GSMin(m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2)))) {
      return IsApplicableForCombine();
    }
  }

  return false;
}
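
// Worked example for the applicability check above. The values and register
// names are illustrative (not taken from an actual test), and the constants
// are shown inline for brevity even though real MIR defines them with
// separate G_CONSTANTs:
//
//   %lo:_(s64) = G_SMAX %x:_(s64), -32768
//   %v:_(s64)  = G_SMIN %lo:_(s64), 32767
//   %t:_(s16)  = G_TRUNC %v:_(s64)
//
// The match records Cmp1 = 32767, Cmp2 = -32768, and Origin = %x. The
// difference |Cmp2 - Cmp1| is 65535, so the clamp is not a no-op, and both
// constants lie within [SHORT_MIN, SHORT_MAX], so the combine applies. With
// Cmp1 = 5 and Cmp2 = 6 the difference is 1 and the match is rejected.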
// We want to find a combination of instructions that
// gets generated when an i64 gets clamped to i16.
// The corresponding pattern is:
//   G_SMIN / G_SMAX for i16 <= G_TRUNC i64.
// This can be efficiently lowered to:
//   v_cvt_pk_i16_i32 v0, v0, v1
//   v_med3_i32 v0, Clamp_Min, v0, Clamp_Max
void AMDGPUPreLegalizerCombinerImpl::applyClampI64ToI16(
    MachineInstr &MI, const ClampI64ToI16MatchInfo &MatchInfo) const {
  Register Src = MatchInfo.Origin;
  assert(MI.getParent()->getParent()->getRegInfo().getType(Src) ==
         LLT::scalar(64));
  const LLT S32 = LLT::scalar(32);

  auto Unmerge = B.buildUnmerge(S32, Src);

  assert(MI.getOpcode() != AMDGPU::G_AMDGPU_CVT_PK_I16_I32);

  const LLT V2S16 = LLT::fixed_vector(2, 16);
  auto CvtPk =
      B.buildInstr(AMDGPU::G_AMDGPU_CVT_PK_I16_I32, {V2S16},
                   {Unmerge.getReg(0), Unmerge.getReg(1)}, MI.getFlags());

  auto MinBoundary = std::min(MatchInfo.Cmp1, MatchInfo.Cmp2);
  auto MaxBoundary = std::max(MatchInfo.Cmp1, MatchInfo.Cmp2);
  auto MinBoundaryDst = B.buildConstant(S32, MinBoundary);
  auto MaxBoundaryDst = B.buildConstant(S32, MaxBoundary);

  auto Bitcast = B.buildBitcast({S32}, CvtPk);

  auto Med3 = B.buildInstr(
      AMDGPU::G_AMDGPU_SMED3, {S32},
      {MinBoundaryDst.getReg(0), Bitcast.getReg(0), MaxBoundaryDst.getReg(0)},
      MI.getFlags());

  B.buildTrunc(MI.getOperand(0).getReg(), Med3);

  MI.eraseFromParent();
}
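
// For reference, a schematic before/after of the rewrite performed above.
// Register names are made up and the boundary constants are shown inline for
// brevity; this is not copied from an actual test:
//
//   Before:
//     %max:_(s64) = G_SMAX %x:_(s64), SHORT_MIN
//     %min:_(s64) = G_SMIN %max:_(s64), SHORT_MAX
//     %dst:_(s16) = G_TRUNC %min:_(s64)
//
//   After:
//     %lo:_(s32), %hi:_(s32) = G_UNMERGE_VALUES %x:_(s64)
//     %pk:_(<2 x s16>) = G_AMDGPU_CVT_PK_I16_I32 %lo:_(s32), %hi:_(s32)
//     %cast:_(s32) = G_BITCAST %pk:_(<2 x s16>)
//     %med:_(s32) = G_AMDGPU_SMED3 MinBoundary, %cast:_(s32), MaxBoundary
//     %dst:_(s16) = G_TRUNC %med:_(s32)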

// Pass boilerplate
// ================

class AMDGPUPreLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUPreLegalizerCombiner(bool IsOptNone = false);

  StringRef getPassName() const override {
    return "AMDGPUPreLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  bool IsOptNone;
  AMDGPUPreLegalizerCombinerImplRuleConfig RuleConfig;
};
} // end anonymous namespace

void AMDGPUPreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  if (!IsOptNone) {
    AU.addRequired<MachineDominatorTreeWrapperPass>();
    AU.addPreserved<MachineDominatorTreeWrapperPass>();
  }

  AU.addRequired<GISelCSEAnalysisWrapperPass>();
  AU.addPreserved<GISelCSEAnalysisWrapperPass>();
  MachineFunctionPass::getAnalysisUsage(AU);
}

AMDGPUPreLegalizerCombiner::AMDGPUPreLegalizerCombiner(bool IsOptNone)
    : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
  initializeAMDGPUPreLegalizerCombinerPass(*PassRegistry::getPassRegistry());

  if (!RuleConfig.parseCommandLineOption())
    report_fatal_error("Invalid rule identifier");
}

bool AMDGPUPreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;
  auto *TPC = &getAnalysis<TargetPassConfig>();
  const Function &F = MF.getFunction();
  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F);
  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);

  // Enable CSE.
  GISelCSEAnalysisWrapper &Wrapper =
      getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper();
  auto *CSEInfo = &Wrapper.get(TPC->getCSEConfig());

  const GCNSubtarget &STI = MF.getSubtarget<GCNSubtarget>();
  MachineDominatorTree *MDT =
      IsOptNone ? nullptr
                : &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
  CombinerInfo CInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
                     nullptr, EnableOpt, F.hasOptSize(), F.hasMinSize());
  AMDGPUPreLegalizerCombinerImpl Impl(MF, CInfo, TPC, *KB, CSEInfo, RuleConfig,
                                      STI, MDT, STI.getLegalizerInfo());
  return Impl.combineMachineInstrs();
}

char AMDGPUPreLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AMDGPUPreLegalizerCombiner, DEBUG_TYPE,
                      "Combine AMDGPU machine instrs before legalization",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AMDGPUPreLegalizerCombiner, DEBUG_TYPE,
                    "Combine AMDGPU machine instrs before legalization", false,
                    false)

namespace llvm {
FunctionPass *createAMDGPUPreLegalizeCombiner(bool IsOptNone) {
  return new AMDGPUPreLegalizerCombiner(IsOptNone);
}
} // end namespace llvm
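
// Usage note (a sketch; the actual wiring lives in the AMDGPU target's pass
// configuration, e.g. AMDGPUTargetMachine.cpp, and may differ in detail):
// the legacy pass manager schedules this combiner immediately before the
// legalizer, roughly as
//
//   void AMDGPUPassConfig::addPreLegalizeMachineIR() {
//     bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
//     addPass(createAMDGPUPreLegalizeCombiner(IsOptNone));
//   }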