//=== lib/CodeGen/GlobalISel/AMDGPUPostLegalizerCombiner.cpp ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does combining of machine instructions at the generic MI level,
// after the legalizer.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "amdgpu-postlegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;

class AMDGPUPostLegalizerCombinerHelper {
protected:
  MachineIRBuilder &B;
  MachineFunction &MF;
  MachineRegisterInfo &MRI;
  CombinerHelper &Helper;

public:
  AMDGPUPostLegalizerCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper)
      : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper) {}

  struct FMinFMaxLegacyInfo {
    Register LHS;
    Register RHS;
    Register True;
    Register False;
    CmpInst::Predicate Pred;
  };

  // TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
  bool matchFMinFMaxLegacy(MachineInstr &MI, FMinFMaxLegacyInfo &Info);
  void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,
                                         const FMinFMaxLegacyInfo &Info);

  bool matchUCharToFloat(MachineInstr &MI);
  void applyUCharToFloat(MachineInstr &MI);

  // FIXME: Should be able to have 2 separate matchdatas rather than custom
  // struct boilerplate.
  struct CvtF32UByteMatchInfo {
    Register CvtVal;
    unsigned ShiftOffset;
  };

  bool matchCvtF32UByteN(MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo);
  void applyCvtF32UByteN(MachineInstr &MI,
                         const CvtF32UByteMatchInfo &MatchInfo);

  bool matchRemoveFcanonicalize(MachineInstr &MI, Register &Reg);
};

bool AMDGPUPostLegalizerCombinerHelper::matchFMinFMaxLegacy(
    MachineInstr &MI, FMinFMaxLegacyInfo &Info) {
  // FIXME: Combines should have subtarget predicates, and we shouldn't need
  // this here.
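  //
  // Note (illustrative sketch, not from the original source): the MIR shape
  // this match is looking for is
  //   %cond = G_FCMP pred, %lhs, %rhs
  //   %dst  = G_SELECT %cond, %true, %false
  // where {%true, %false} is {%lhs, %rhs} in some order, which is what allows
  // the select to fold into G_AMDGPU_FMIN_LEGACY / G_AMDGPU_FMAX_LEGACY.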
  if (!MF.getSubtarget<GCNSubtarget>().hasFminFmaxLegacy())
    return false;

  // FIXME: Type predicate on pattern
  if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32))
    return false;

  Register Cond = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(Cond) ||
      !mi_match(Cond, MRI,
                m_GFCmp(m_Pred(Info.Pred), m_Reg(Info.LHS), m_Reg(Info.RHS))))
    return false;

  Info.True = MI.getOperand(2).getReg();
  Info.False = MI.getOperand(3).getReg();

  if (!(Info.LHS == Info.True && Info.RHS == Info.False) &&
      !(Info.LHS == Info.False && Info.RHS == Info.True))
    return false;

  switch (Info.Pred) {
  case CmpInst::FCMP_FALSE:
  case CmpInst::FCMP_OEQ:
  case CmpInst::FCMP_ONE:
  case CmpInst::FCMP_ORD:
  case CmpInst::FCMP_UNO:
  case CmpInst::FCMP_UEQ:
  case CmpInst::FCMP_UNE:
  case CmpInst::FCMP_TRUE:
    return false;
  default:
    return true;
  }
}

void AMDGPUPostLegalizerCombinerHelper::applySelectFCmpToFMinToFMaxLegacy(
    MachineInstr &MI, const FMinFMaxLegacyInfo &Info) {
  B.setInstrAndDebugLoc(MI);
  auto buildNewInst = [&MI, this](unsigned Opc, Register X, Register Y) {
    B.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());
  };

  switch (Info.Pred) {
  case CmpInst::FCMP_ULT:
  case CmpInst::FCMP_ULE:
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    break;
  case CmpInst::FCMP_OLE:
  case CmpInst::FCMP_OLT: {
    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with
    // NaN, so permute it based on the compare type the hardware uses.
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  case CmpInst::FCMP_UGE:
  case CmpInst::FCMP_UGT: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    break;
  }
  case CmpInst::FCMP_OGT:
  case CmpInst::FCMP_OGE: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  default:
    llvm_unreachable("predicate should not have matched");
  }

  MI.eraseFromParent();
}

bool AMDGPUPostLegalizerCombinerHelper::matchUCharToFloat(MachineInstr &MI) {
  Register DstReg = MI.getOperand(0).getReg();

  // TODO: We could try to match extracting the higher bytes, which would be
  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
  // about in practice.
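
  // The combine only fires when the destination is s16 or s32 and known-bits
  // analysis proves that everything above the low byte of the source is zero,
  // i.e. the conversion really only consumes a single unsigned byte.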
  LLT Ty = MRI.getType(DstReg);
  if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) {
    Register SrcReg = MI.getOperand(1).getReg();
    unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
    assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64);
    const APInt Mask = APInt::getHighBitsSet(SrcSize, SrcSize - 8);
    return Helper.getKnownBits()->maskedValueIsZero(SrcReg, Mask);
  }

  return false;
}

void AMDGPUPostLegalizerCombinerHelper::applyUCharToFloat(MachineInstr &MI) {
  B.setInstrAndDebugLoc(MI);

  const LLT S32 = LLT::scalar(32);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy != S32)
    SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);

  if (Ty == S32) {
    B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg},
                 {SrcReg}, MI.getFlags());
  } else {
    auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32},
                             {SrcReg}, MI.getFlags());
    B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
  }

  MI.eraseFromParent();
}

bool AMDGPUPostLegalizerCombinerHelper::matchCvtF32UByteN(
    MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) {
  Register SrcReg = MI.getOperand(1).getReg();

  // Look through G_ZEXT.
  mi_match(SrcReg, MRI, m_GZExt(m_Reg(SrcReg)));

  Register Src0;
  int64_t ShiftAmt;
  bool IsShr = mi_match(SrcReg, MRI, m_GLShr(m_Reg(Src0), m_ICst(ShiftAmt)));
  if (IsShr || mi_match(SrcReg, MRI, m_GShl(m_Reg(Src0), m_ICst(ShiftAmt)))) {
    const unsigned Offset = MI.getOpcode() - AMDGPU::G_AMDGPU_CVT_F32_UBYTE0;

    unsigned ShiftOffset = 8 * Offset;
    if (IsShr)
      ShiftOffset += ShiftAmt;
    else
      ShiftOffset -= ShiftAmt;

    MatchInfo.CvtVal = Src0;
    MatchInfo.ShiftOffset = ShiftOffset;
    return ShiftOffset < 32 && ShiftOffset >= 8 && (ShiftOffset % 8) == 0;
  }

  // TODO: Simplify demanded bits.
  return false;
}

void AMDGPUPostLegalizerCombinerHelper::applyCvtF32UByteN(
    MachineInstr &MI, const CvtF32UByteMatchInfo &MatchInfo) {
  B.setInstrAndDebugLoc(MI);
  unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8;

  const LLT S32 = LLT::scalar(32);
  Register CvtSrc = MatchInfo.CvtVal;
  LLT SrcTy = MRI.getType(MatchInfo.CvtVal);
  if (SrcTy != S32) {
    assert(SrcTy.isScalar() && SrcTy.getSizeInBits() >= 8);
    CvtSrc = B.buildAnyExt(S32, CvtSrc).getReg(0);
  }

  assert(MI.getOpcode() != NewOpc);
  B.buildInstr(NewOpc, {MI.getOperand(0)}, {CvtSrc}, MI.getFlags());
  MI.eraseFromParent();
}

bool AMDGPUPostLegalizerCombinerHelper::matchRemoveFcanonicalize(
    MachineInstr &MI, Register &Reg) {
  const SITargetLowering *TLI = static_cast<const SITargetLowering *>(
      MF.getSubtarget().getTargetLowering());
  Reg = MI.getOperand(1).getReg();
  return TLI->isCanonicalized(Reg, MF);
}

class AMDGPUPostLegalizerCombinerHelperState {
protected:
  CombinerHelper &Helper;
  AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper;

public:
  AMDGPUPostLegalizerCombinerHelperState(
      CombinerHelper &Helper,
      AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper)
      : Helper(Helper), PostLegalizerHelper(PostLegalizerHelper) {}
};

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS

namespace {
#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H

class AMDGPUPostLegalizerCombinerInfo final : public CombinerInfo {
  GISelKnownBits *KB;
  MachineDominatorTree *MDT;

public:
  AMDGPUGenPostLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;

  AMDGPUPostLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
                                  const AMDGPULegalizerInfo *LI,
                                  GISelKnownBits *KB, MachineDominatorTree *MDT)
      : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
                     /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
        KB(KB), MDT(MDT) {
    if (!GeneratedRuleCfg.parseCommandLineOption())
      report_fatal_error("Invalid rule identifier");
  }

  bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
               MachineIRBuilder &B) const override;
};

bool AMDGPUPostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
                                              MachineInstr &MI,
                                              MachineIRBuilder &B) const {
  CombinerHelper Helper(Observer, B, KB, MDT, LInfo);
  AMDGPUPostLegalizerCombinerHelper PostLegalizerHelper(B, Helper);
  AMDGPUGenPostLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper,
                                                 PostLegalizerHelper);

  if (Generated.tryCombineAll(Observer, MI, B))
    return true;

  switch (MI.getOpcode()) {
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_ASHR:
    // On some subtargets, 64-bit shift is a quarter rate instruction. In the
    // common case, splitting this into a move and a 32-bit shift is faster and
    // the same code size.
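    //
    // The 32 passed below is the target shift size handed to the generic
    // helper; see CombinerHelper::tryCombineShiftToUnmerge for the exact
    // legality checks it performs before rewriting the shift.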
    return Helper.tryCombineShiftToUnmerge(MI, 32);
  }

  return false;
}

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP

// Pass boilerplate
// ================

class AMDGPUPostLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUPostLegalizerCombiner(bool IsOptNone = false);

  StringRef getPassName() const override {
    return "AMDGPUPostLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  bool IsOptNone;
};
} // end anonymous namespace

void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  if (!IsOptNone) {
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
  }
  MachineFunctionPass::getAnalysisUsage(AU);
}

AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
    : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
  initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry());
}

bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;
  auto *TPC = &getAnalysis<TargetPassConfig>();
  const Function &F = MF.getFunction();
  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const AMDGPULegalizerInfo *LI =
      static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());

  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
  MachineDominatorTree *MDT =
      IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
  AMDGPUPostLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
                                         F.hasMinSize(), LI, KB, MDT);
  Combiner C(PCInfo, TPC);
  return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
}

char AMDGPUPostLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                      "Combine AMDGPU machine instrs after legalization",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                    "Combine AMDGPU machine instrs after legalization", false,
                    false)

namespace llvm {
FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone) {
  return new AMDGPUPostLegalizerCombiner(IsOptNone);
}
} // end namespace llvm