//=== lib/CodeGen/GlobalISel/AMDGPUPostLegalizerCombiner.cpp ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does combining of machine instructions at the generic MI level,
// after the legalizer.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUCombinerHelper.h"
#include "AMDGPULegalizerInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "amdgpu-postlegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;

class AMDGPUPostLegalizerCombinerHelper {
protected:
  MachineIRBuilder &B;
  MachineFunction &MF;
  MachineRegisterInfo &MRI;
  AMDGPUCombinerHelper &Helper;

public:
  AMDGPUPostLegalizerCombinerHelper(MachineIRBuilder &B,
                                    AMDGPUCombinerHelper &Helper)
      : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper) {}

  struct FMinFMaxLegacyInfo {
    Register LHS;
    Register RHS;
    Register True;
    Register False;
    CmpInst::Predicate Pred;
  };

  // TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
  bool matchFMinFMaxLegacy(MachineInstr &MI, FMinFMaxLegacyInfo &Info);
  void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,
                                         const FMinFMaxLegacyInfo &Info);

  bool matchUCharToFloat(MachineInstr &MI);
  void applyUCharToFloat(MachineInstr &MI);

  // FIXME: Should be able to have 2 separate matchdatas rather than custom
  // struct boilerplate.
  struct CvtF32UByteMatchInfo {
    Register CvtVal;
    unsigned ShiftOffset;
  };

  bool matchCvtF32UByteN(MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo);
  void applyCvtF32UByteN(MachineInstr &MI,
                         const CvtF32UByteMatchInfo &MatchInfo);

  bool matchRemoveFcanonicalize(MachineInstr &MI, Register &Reg);
};

bool AMDGPUPostLegalizerCombinerHelper::matchFMinFMaxLegacy(
    MachineInstr &MI, FMinFMaxLegacyInfo &Info) {
  // FIXME: Combines should have subtarget predicates, and we shouldn't need
  // this here.
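  //
  // As a rough, illustrative sketch (not taken from an actual test), the
  // pattern being matched is:
  //   %cc:_(s1) = G_FCMP floatpred(olt), %a:_(s32), %b:_(s32)
  //   %sel:_(s32) = G_SELECT %cc, %a, %b
  // which applySelectFCmpToFMinToFMaxLegacy rewrites to
  //   %sel:_(s32) = G_AMDGPU_FMIN_LEGACY %a, %b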
  if (!MF.getSubtarget<GCNSubtarget>().hasFminFmaxLegacy())
    return false;

  // FIXME: Type predicate on pattern
  if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32))
    return false;

  Register Cond = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(Cond) ||
      !mi_match(Cond, MRI,
                m_GFCmp(m_Pred(Info.Pred), m_Reg(Info.LHS), m_Reg(Info.RHS))))
    return false;

  Info.True = MI.getOperand(2).getReg();
  Info.False = MI.getOperand(3).getReg();

  if (!(Info.LHS == Info.True && Info.RHS == Info.False) &&
      !(Info.LHS == Info.False && Info.RHS == Info.True))
    return false;

  switch (Info.Pred) {
  case CmpInst::FCMP_FALSE:
  case CmpInst::FCMP_OEQ:
  case CmpInst::FCMP_ONE:
  case CmpInst::FCMP_ORD:
  case CmpInst::FCMP_UNO:
  case CmpInst::FCMP_UEQ:
  case CmpInst::FCMP_UNE:
  case CmpInst::FCMP_TRUE:
    return false;
  default:
    return true;
  }
}

void AMDGPUPostLegalizerCombinerHelper::applySelectFCmpToFMinToFMaxLegacy(
    MachineInstr &MI, const FMinFMaxLegacyInfo &Info) {
  B.setInstrAndDebugLoc(MI);
  auto buildNewInst = [&MI, this](unsigned Opc, Register X, Register Y) {
    B.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());
  };

  switch (Info.Pred) {
  case CmpInst::FCMP_ULT:
  case CmpInst::FCMP_ULE:
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    break;
  case CmpInst::FCMP_OLE:
  case CmpInst::FCMP_OLT: {
    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with
    // NaN, so permute it based on the compare type the hardware uses.
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  case CmpInst::FCMP_UGE:
  case CmpInst::FCMP_UGT: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    break;
  }
  case CmpInst::FCMP_OGT:
  case CmpInst::FCMP_OGE: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  default:
    llvm_unreachable("predicate should not have matched");
  }

  MI.eraseFromParent();
}

bool AMDGPUPostLegalizerCombinerHelper::matchUCharToFloat(MachineInstr &MI) {
  Register DstReg = MI.getOperand(0).getReg();

  // TODO: We could try to match extracting the higher bytes, which would be
  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
  // about in practice.
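  //
  // The basic fold, sketched on illustrative MIR (e.g. a G_UITOFP whose
  // source has its high 24 bits known to be zero):
  //   %f:_(s32) = G_UITOFP %val:_(s32)
  // is rewritten by applyUCharToFloat to
  //   %f:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 %val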
  LLT Ty = MRI.getType(DstReg);
  if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) {
    Register SrcReg = MI.getOperand(1).getReg();
    unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
    assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64);
    const APInt Mask = APInt::getHighBitsSet(SrcSize, SrcSize - 8);
    return Helper.getKnownBits()->maskedValueIsZero(SrcReg, Mask);
  }

  return false;
}

void AMDGPUPostLegalizerCombinerHelper::applyUCharToFloat(MachineInstr &MI) {
  B.setInstrAndDebugLoc(MI);

  const LLT S32 = LLT::scalar(32);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy != S32)
    SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);

  if (Ty == S32) {
    B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg}, {SrcReg},
                 MI.getFlags());
  } else {
    auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32}, {SrcReg},
                             MI.getFlags());
    B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
  }

  MI.eraseFromParent();
}

bool AMDGPUPostLegalizerCombinerHelper::matchCvtF32UByteN(
    MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) {
  Register SrcReg = MI.getOperand(1).getReg();

  // Look through G_ZEXT.
  mi_match(SrcReg, MRI, m_GZExt(m_Reg(SrcReg)));

  Register Src0;
  int64_t ShiftAmt;
  bool IsShr = mi_match(SrcReg, MRI, m_GLShr(m_Reg(Src0), m_ICst(ShiftAmt)));
  if (IsShr || mi_match(SrcReg, MRI, m_GShl(m_Reg(Src0), m_ICst(ShiftAmt)))) {
    const unsigned Offset = MI.getOpcode() - AMDGPU::G_AMDGPU_CVT_F32_UBYTE0;

    unsigned ShiftOffset = 8 * Offset;
    if (IsShr)
      ShiftOffset += ShiftAmt;
    else
      ShiftOffset -= ShiftAmt;

    MatchInfo.CvtVal = Src0;
    MatchInfo.ShiftOffset = ShiftOffset;
    return ShiftOffset < 32 && ShiftOffset >= 8 && (ShiftOffset % 8) == 0;
  }

  // TODO: Simplify demanded bits.
  return false;
}

void AMDGPUPostLegalizerCombinerHelper::applyCvtF32UByteN(
    MachineInstr &MI, const CvtF32UByteMatchInfo &MatchInfo) {
  B.setInstrAndDebugLoc(MI);
  unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8;

  const LLT S32 = LLT::scalar(32);
  Register CvtSrc = MatchInfo.CvtVal;
  LLT SrcTy = MRI.getType(MatchInfo.CvtVal);
  if (SrcTy != S32) {
    assert(SrcTy.isScalar() && SrcTy.getSizeInBits() >= 8);
    CvtSrc = B.buildAnyExt(S32, CvtSrc).getReg(0);
  }

  assert(MI.getOpcode() != NewOpc);
  B.buildInstr(NewOpc, {MI.getOperand(0)}, {CvtSrc}, MI.getFlags());
  MI.eraseFromParent();
}

bool AMDGPUPostLegalizerCombinerHelper::matchRemoveFcanonicalize(
    MachineInstr &MI, Register &Reg) {
  const SITargetLowering *TLI = static_cast<const SITargetLowering *>(
      MF.getSubtarget().getTargetLowering());
  Reg = MI.getOperand(1).getReg();
  return TLI->isCanonicalized(Reg, MF);
}

class AMDGPUPostLegalizerCombinerHelperState {
protected:
  AMDGPUCombinerHelper &Helper;
  AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper;

public:
  AMDGPUPostLegalizerCombinerHelperState(
      AMDGPUCombinerHelper &Helper,
      AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper)
      : Helper(Helper), PostLegalizerHelper(PostLegalizerHelper) {}
};

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS

namespace {
#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H

class AMDGPUPostLegalizerCombinerInfo final : public CombinerInfo {
  GISelKnownBits *KB;
  MachineDominatorTree *MDT;

public:
  AMDGPUGenPostLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;

  AMDGPUPostLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
                                  const AMDGPULegalizerInfo *LI,
                                  GISelKnownBits *KB, MachineDominatorTree *MDT)
      : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
                     /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
        KB(KB), MDT(MDT) {
    if (!GeneratedRuleCfg.parseCommandLineOption())
      report_fatal_error("Invalid rule identifier");
  }

  bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
               MachineIRBuilder &B) const override;
};

bool AMDGPUPostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
                                              MachineInstr &MI,
                                              MachineIRBuilder &B) const {
  AMDGPUCombinerHelper Helper(Observer, B, KB, MDT, LInfo);
  AMDGPUPostLegalizerCombinerHelper PostLegalizerHelper(B, Helper);
  AMDGPUGenPostLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper,
                                                 PostLegalizerHelper);

  if (Generated.tryCombineAll(Observer, MI, B))
    return true;

  switch (MI.getOpcode()) {
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_ASHR:
    // On some subtargets, 64-bit shift is a quarter rate instruction. In the
    // common case, splitting this into a move and a 32-bit shift is faster and
    // the same code size.
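    //
    // A rough sketch (illustrative MIR, not from a test) of the unmerge form
    // this can produce for a constant 64-bit shift amount >= 32:
    //   %c:_(s32) = G_CONSTANT i32 40
    //   %r:_(s64) = G_LSHR %x:_(s64), %c
    // can become
    //   %lo:_(s32), %hi:_(s32) = G_UNMERGE_VALUES %x
    //   %c8:_(s32) = G_CONSTANT i32 8
    //   %s:_(s32) = G_LSHR %hi, %c8
    //   %zero:_(s32) = G_CONSTANT i32 0
    //   %r:_(s64) = G_MERGE_VALUES %s, %zero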
    return Helper.tryCombineShiftToUnmerge(MI, 32);
  }

  return false;
}

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP

// Pass boilerplate
// ================

class AMDGPUPostLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUPostLegalizerCombiner(bool IsOptNone = false);

  StringRef getPassName() const override {
    return "AMDGPUPostLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  bool IsOptNone;
};
} // end anonymous namespace

void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  if (!IsOptNone) {
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
  }
  MachineFunctionPass::getAnalysisUsage(AU);
}

AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
    : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
  initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry());
}

bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;
  auto *TPC = &getAnalysis<TargetPassConfig>();
  const Function &F = MF.getFunction();
  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const AMDGPULegalizerInfo *LI =
      static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());

  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
  MachineDominatorTree *MDT =
      IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
  AMDGPUPostLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
                                         F.hasMinSize(), LI, KB, MDT);
  Combiner C(PCInfo, TPC);
  return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
}

char AMDGPUPostLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                      "Combine AMDGPU machine instrs after legalization", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                    "Combine AMDGPU machine instrs after legalization", false,
                    false)

namespace llvm {
FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone) {
  return new AMDGPUPostLegalizerCombiner(IsOptNone);
}
} // end namespace llvm
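// Note: MIR tests typically exercise this combiner in isolation with
// something along the lines of
//   llc -march=amdgcn -run-pass=amdgpu-postlegalizer-combiner
// (the pass argument is DEBUG_TYPE above); this is an illustrative
// invocation, not a prescribed command line.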