//=== lib/CodeGen/GlobalISel/AMDGPUPostLegalizerCombiner.cpp ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does combining of machine instructions at the generic MI level,
// after the legalizer.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "amdgpu-postlegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;

class AMDGPUPostLegalizerCombinerHelper {
protected:
  MachineIRBuilder &B;
  MachineFunction &MF;
  MachineRegisterInfo &MRI;
  CombinerHelper &Helper;

public:
  AMDGPUPostLegalizerCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper)
      : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper) {}

  struct FMinFMaxLegacyInfo {
    Register LHS;
    Register RHS;
    Register True;
    Register False;
    CmpInst::Predicate Pred;
  };

  // TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
  bool matchFMinFMaxLegacy(MachineInstr &MI, FMinFMaxLegacyInfo &Info);
  void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,
                                         const FMinFMaxLegacyInfo &Info);

  bool matchUCharToFloat(MachineInstr &MI);
  void applyUCharToFloat(MachineInstr &MI);

  // FIXME: Should be able to have 2 separate matchdatas rather than custom
  // struct boilerplate.
  struct CvtF32UByteMatchInfo {
    Register CvtVal;
    unsigned ShiftOffset;
  };

  bool matchCvtF32UByteN(MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo);
  void applyCvtF32UByteN(MachineInstr &MI,
                         const CvtF32UByteMatchInfo &MatchInfo);
};

bool AMDGPUPostLegalizerCombinerHelper::matchFMinFMaxLegacy(
    MachineInstr &MI, FMinFMaxLegacyInfo &Info) {
  // FIXME: Combines should have subtarget predicates, and we shouldn't need
  // this here.
  if (!MF.getSubtarget<GCNSubtarget>().hasFminFmaxLegacy())
    return false;

  // FIXME: Type predicate on pattern
  if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32))
    return false;

  Register Cond = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(Cond) ||
      !mi_match(Cond, MRI,
                m_GFCmp(m_Pred(Info.Pred), m_Reg(Info.LHS), m_Reg(Info.RHS))))
    return false;

  Info.True = MI.getOperand(2).getReg();
  Info.False = MI.getOperand(3).getReg();

  if (!(Info.LHS == Info.True && Info.RHS == Info.False) &&
      !(Info.LHS == Info.False && Info.RHS == Info.True))
    return false;

  switch (Info.Pred) {
  case CmpInst::FCMP_FALSE:
  case CmpInst::FCMP_OEQ:
  case CmpInst::FCMP_ONE:
  case CmpInst::FCMP_ORD:
  case CmpInst::FCMP_UNO:
  case CmpInst::FCMP_UEQ:
  case CmpInst::FCMP_UNE:
  case CmpInst::FCMP_TRUE:
    return false;
  default:
    return true;
  }
}

void AMDGPUPostLegalizerCombinerHelper::applySelectFCmpToFMinToFMaxLegacy(
    MachineInstr &MI, const FMinFMaxLegacyInfo &Info) {
  B.setInstrAndDebugLoc(MI);
  auto buildNewInst = [&MI, this](unsigned Opc, Register X, Register Y) {
    B.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());
  };

  switch (Info.Pred) {
  case CmpInst::FCMP_ULT:
  case CmpInst::FCMP_ULE:
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    break;
  case CmpInst::FCMP_OLE:
  case CmpInst::FCMP_OLT: {
    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with
    // NaN, so permute it based on the compare type the hardware uses.
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  case CmpInst::FCMP_UGE:
  case CmpInst::FCMP_UGT: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    break;
  }
  case CmpInst::FCMP_OGT:
  case CmpInst::FCMP_OGE: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  default:
    llvm_unreachable("predicate should not have matched");
  }

  MI.eraseFromParent();
}

bool AMDGPUPostLegalizerCombinerHelper::matchUCharToFloat(MachineInstr &MI) {
  Register DstReg = MI.getOperand(0).getReg();

  // TODO: We could try to match extracting the higher bytes, which would be
  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
  // about in practice.
  LLT Ty = MRI.getType(DstReg);
  if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) {
    Register SrcReg = MI.getOperand(1).getReg();
    unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
    assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64);
    const APInt Mask = APInt::getHighBitsSet(SrcSize, SrcSize - 8);
    return Helper.getKnownBits()->maskedValueIsZero(SrcReg, Mask);
  }

  return false;
}

void AMDGPUPostLegalizerCombinerHelper::applyUCharToFloat(MachineInstr &MI) {
  B.setInstrAndDebugLoc(MI);

  const LLT S32 = LLT::scalar(32);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy != S32)
    SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);

  if (Ty == S32) {
    B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg},
                 {SrcReg}, MI.getFlags());
  } else {
    auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32},
                             {SrcReg}, MI.getFlags());
    B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
  }

  MI.eraseFromParent();
}

bool AMDGPUPostLegalizerCombinerHelper::matchCvtF32UByteN(
    MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) {
  Register SrcReg = MI.getOperand(1).getReg();

  // Look through G_ZEXT.
  mi_match(SrcReg, MRI, m_GZExt(m_Reg(SrcReg)));

  Register Src0;
  int64_t ShiftAmt;
  bool IsShr = mi_match(SrcReg, MRI, m_GLShr(m_Reg(Src0), m_ICst(ShiftAmt)));
  if (IsShr || mi_match(SrcReg, MRI, m_GShl(m_Reg(Src0), m_ICst(ShiftAmt)))) {
    const unsigned Offset = MI.getOpcode() - AMDGPU::G_AMDGPU_CVT_F32_UBYTE0;

    unsigned ShiftOffset = 8 * Offset;
    if (IsShr)
      ShiftOffset += ShiftAmt;
    else
      ShiftOffset -= ShiftAmt;

    MatchInfo.CvtVal = Src0;
    MatchInfo.ShiftOffset = ShiftOffset;
    return ShiftOffset < 32 && ShiftOffset >= 8 && (ShiftOffset % 8) == 0;
  }

  // TODO: Simplify demanded bits.
  return false;
}

void AMDGPUPostLegalizerCombinerHelper::applyCvtF32UByteN(
    MachineInstr &MI, const CvtF32UByteMatchInfo &MatchInfo) {
  B.setInstrAndDebugLoc(MI);
  unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8;

  const LLT S32 = LLT::scalar(32);
  Register CvtSrc = MatchInfo.CvtVal;
  LLT SrcTy = MRI.getType(MatchInfo.CvtVal);
  if (SrcTy != S32) {
    assert(SrcTy.isScalar() && SrcTy.getSizeInBits() >= 8);
    CvtSrc = B.buildAnyExt(S32, CvtSrc).getReg(0);
  }

  assert(MI.getOpcode() != NewOpc);
  B.buildInstr(NewOpc, {MI.getOperand(0)}, {CvtSrc}, MI.getFlags());
  MI.eraseFromParent();
}

class AMDGPUPostLegalizerCombinerHelperState {
protected:
  CombinerHelper &Helper;
  AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper;

public:
  AMDGPUPostLegalizerCombinerHelperState(
      CombinerHelper &Helper,
      AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper)
      : Helper(Helper), PostLegalizerHelper(PostLegalizerHelper) {}
};

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS

namespace {
#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H

class AMDGPUPostLegalizerCombinerInfo final : public CombinerInfo {
  GISelKnownBits *KB;
  MachineDominatorTree *MDT;

public:
  AMDGPUGenPostLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;

  AMDGPUPostLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
                                  const AMDGPULegalizerInfo *LI,
                                  GISelKnownBits *KB, MachineDominatorTree *MDT)
      : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
                     /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
        KB(KB), MDT(MDT) {
    if (!GeneratedRuleCfg.parseCommandLineOption())
      report_fatal_error("Invalid rule identifier");
  }

  bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
               MachineIRBuilder &B) const override;
};

bool AMDGPUPostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
                                              MachineInstr &MI,
                                              MachineIRBuilder &B) const {
  CombinerHelper Helper(Observer, B, KB, MDT, LInfo);
  AMDGPUPostLegalizerCombinerHelper PostLegalizerHelper(B, Helper);
  AMDGPUGenPostLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper,
                                                 PostLegalizerHelper);

  if (Generated.tryCombineAll(Observer, MI, B))
    return true;

  switch (MI.getOpcode()) {
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_ASHR:
    // On some subtargets, 64-bit shift is a quarter rate instruction. In the
    // common case, splitting this into a move and a 32-bit shift is faster and
    // the same code size.
    return Helper.tryCombineShiftToUnmerge(MI, 32);
  }

  return false;
}

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP

// Pass boilerplate
// ================

class AMDGPUPostLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUPostLegalizerCombiner(bool IsOptNone = false);

  StringRef getPassName() const override {
    return "AMDGPUPostLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  bool IsOptNone;
};
} // end anonymous namespace

void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  if (!IsOptNone) {
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
  }
  MachineFunctionPass::getAnalysisUsage(AU);
}

AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
    : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
  initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry());
}

bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;
  auto *TPC = &getAnalysis<TargetPassConfig>();
  const Function &F = MF.getFunction();
  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const AMDGPULegalizerInfo *LI =
      static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());

  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
  MachineDominatorTree *MDT =
      IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
  AMDGPUPostLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
                                         F.hasMinSize(), LI, KB, MDT);
  Combiner C(PCInfo, TPC);
  return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
}

char AMDGPUPostLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                      "Combine AMDGPU machine instrs after legalization",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                    "Combine AMDGPU machine instrs after legalization", false,
                    false)

namespace llvm {
FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone) {
  return new AMDGPUPostLegalizerCombiner(IsOptNone);
}
} // end namespace llvm