//=== lib/CodeGen/GlobalISel/AMDGPUPostLegalizerCombiner.cpp ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does combining of machine instructions at the generic MI level,
// after the legalizer.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUCombinerHelper.h"
#include "AMDGPULegalizerInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "amdgpu-postlegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;

class AMDGPUPostLegalizerCombinerHelper {
protected:
  MachineIRBuilder &B;
  MachineFunction &MF;
  MachineRegisterInfo &MRI;
  AMDGPUCombinerHelper &Helper;

public:
  AMDGPUPostLegalizerCombinerHelper(MachineIRBuilder &B,
                                    AMDGPUCombinerHelper &Helper)
      : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper) {}

  struct FMinFMaxLegacyInfo {
    Register LHS;
    Register RHS;
    Register True;
    Register False;
    CmpInst::Predicate Pred;
  };

  // TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
  bool matchFMinFMaxLegacy(MachineInstr &MI, FMinFMaxLegacyInfo &Info);
  void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,
                                         const FMinFMaxLegacyInfo &Info);

  bool matchUCharToFloat(MachineInstr &MI);
  void applyUCharToFloat(MachineInstr &MI);

  bool matchRcpSqrtToRsq(MachineInstr &MI,
                         std::function<void(MachineIRBuilder &)> &MatchInfo);

  // FIXME: Should be able to have 2 separate matchdatas rather than custom
  // struct boilerplate.
  struct CvtF32UByteMatchInfo {
    Register CvtVal;
    unsigned ShiftOffset;
  };

  bool matchCvtF32UByteN(MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo);
  void applyCvtF32UByteN(MachineInstr &MI,
                         const CvtF32UByteMatchInfo &MatchInfo);

  bool matchRemoveFcanonicalize(MachineInstr &MI, Register &Reg);
};

bool AMDGPUPostLegalizerCombinerHelper::matchFMinFMaxLegacy(
    MachineInstr &MI, FMinFMaxLegacyInfo &Info) {
  // FIXME: Combines should have subtarget predicates, and we shouldn't need
  // this here.
  if (!MF.getSubtarget<GCNSubtarget>().hasFminFmaxLegacy())
    return false;

  // FIXME: Type predicate on pattern
  if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32))
    return false;

  Register Cond = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(Cond) ||
      !mi_match(Cond, MRI,
                m_GFCmp(m_Pred(Info.Pred), m_Reg(Info.LHS), m_Reg(Info.RHS))))
    return false;

  Info.True = MI.getOperand(2).getReg();
  Info.False = MI.getOperand(3).getReg();

  if (!(Info.LHS == Info.True && Info.RHS == Info.False) &&
      !(Info.LHS == Info.False && Info.RHS == Info.True))
    return false;

  switch (Info.Pred) {
  case CmpInst::FCMP_FALSE:
  case CmpInst::FCMP_OEQ:
  case CmpInst::FCMP_ONE:
  case CmpInst::FCMP_ORD:
  case CmpInst::FCMP_UNO:
  case CmpInst::FCMP_UEQ:
  case CmpInst::FCMP_UNE:
  case CmpInst::FCMP_TRUE:
    return false;
  default:
    return true;
  }
}

void AMDGPUPostLegalizerCombinerHelper::applySelectFCmpToFMinToFMaxLegacy(
    MachineInstr &MI, const FMinFMaxLegacyInfo &Info) {
  B.setInstrAndDebugLoc(MI);
  auto buildNewInst = [&MI, this](unsigned Opc, Register X, Register Y) {
    B.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());
  };

  switch (Info.Pred) {
  case CmpInst::FCMP_ULT:
  case CmpInst::FCMP_ULE:
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    break;
  case CmpInst::FCMP_OLE:
  case CmpInst::FCMP_OLT: {
    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with
    // NaN, so permute it based on the compare type the hardware uses.
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  case CmpInst::FCMP_UGE:
  case CmpInst::FCMP_UGT: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    break;
  }
  case CmpInst::FCMP_OGT:
  case CmpInst::FCMP_OGE: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  default:
    llvm_unreachable("predicate should not have matched");
  }

  MI.eraseFromParent();
}

bool AMDGPUPostLegalizerCombinerHelper::matchUCharToFloat(MachineInstr &MI) {
  Register DstReg = MI.getOperand(0).getReg();

  // TODO: We could try to match extracting the higher bytes, which would be
  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
  // about in practice.
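  // The byte-wise convert only reads the low 8 bits of the source, so the
  // combine is only valid when the remaining source bits are known to be zero.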
  LLT Ty = MRI.getType(DstReg);
  if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) {
    Register SrcReg = MI.getOperand(1).getReg();
    unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
    assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64);
    const APInt Mask = APInt::getHighBitsSet(SrcSize, SrcSize - 8);
    return Helper.getKnownBits()->maskedValueIsZero(SrcReg, Mask);
  }

  return false;
}

void AMDGPUPostLegalizerCombinerHelper::applyUCharToFloat(MachineInstr &MI) {
  B.setInstrAndDebugLoc(MI);

  const LLT S32 = LLT::scalar(32);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy != S32)
    SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);

  if (Ty == S32) {
    B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg}, {SrcReg},
                 MI.getFlags());
  } else {
    auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32}, {SrcReg},
                             MI.getFlags());
    B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
  }

  MI.eraseFromParent();
}

bool AMDGPUPostLegalizerCombinerHelper::matchRcpSqrtToRsq(
    MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {

  auto getRcpSrc = [=](const MachineInstr &MI) {
    MachineInstr *ResMI = nullptr;
    if (MI.getOpcode() == TargetOpcode::G_INTRINSIC &&
        MI.getIntrinsicID() == Intrinsic::amdgcn_rcp)
      ResMI = MRI.getVRegDef(MI.getOperand(2).getReg());

    return ResMI;
  };

  auto getSqrtSrc = [=](const MachineInstr &MI) {
    MachineInstr *SqrtSrcMI = nullptr;
    mi_match(MI.getOperand(0).getReg(), MRI, m_GFSqrt(m_MInstr(SqrtSrcMI)));
    return SqrtSrcMI;
  };

  MachineInstr *RcpSrcMI = nullptr, *SqrtSrcMI = nullptr;
  // rcp(sqrt(x))
  if ((RcpSrcMI = getRcpSrc(MI)) && (SqrtSrcMI = getSqrtSrc(*RcpSrcMI))) {
    MatchInfo = [SqrtSrcMI, &MI](MachineIRBuilder &B) {
      B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)}, false)
          .addUse(SqrtSrcMI->getOperand(0).getReg())
          .setMIFlags(MI.getFlags());
    };
    return true;
  }

  // sqrt(rcp(x))
  if ((SqrtSrcMI = getSqrtSrc(MI)) && (RcpSrcMI = getRcpSrc(*SqrtSrcMI))) {
    MatchInfo = [RcpSrcMI, &MI](MachineIRBuilder &B) {
      B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)}, false)
          .addUse(RcpSrcMI->getOperand(0).getReg())
          .setMIFlags(MI.getFlags());
    };
    return true;
  }

  return false;
}

bool AMDGPUPostLegalizerCombinerHelper::matchCvtF32UByteN(
    MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) {
  Register SrcReg = MI.getOperand(1).getReg();

  // Look through G_ZEXT.
  mi_match(SrcReg, MRI, m_GZExt(m_Reg(SrcReg)));

  Register Src0;
  int64_t ShiftAmt;
  bool IsShr = mi_match(SrcReg, MRI, m_GLShr(m_Reg(Src0), m_ICst(ShiftAmt)));
  if (IsShr || mi_match(SrcReg, MRI, m_GShl(m_Reg(Src0), m_ICst(ShiftAmt)))) {
    const unsigned Offset = MI.getOpcode() - AMDGPU::G_AMDGPU_CVT_F32_UBYTE0;

    unsigned ShiftOffset = 8 * Offset;
    if (IsShr)
      ShiftOffset += ShiftAmt;
    else
      ShiftOffset -= ShiftAmt;

    MatchInfo.CvtVal = Src0;
    MatchInfo.ShiftOffset = ShiftOffset;
    return ShiftOffset < 32 && ShiftOffset >= 8 && (ShiftOffset % 8) == 0;
  }

  // TODO: Simplify demanded bits.
  return false;
}

void AMDGPUPostLegalizerCombinerHelper::applyCvtF32UByteN(
    MachineInstr &MI, const CvtF32UByteMatchInfo &MatchInfo) {
  B.setInstrAndDebugLoc(MI);
  unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8;

  const LLT S32 = LLT::scalar(32);
  Register CvtSrc = MatchInfo.CvtVal;
  LLT SrcTy = MRI.getType(MatchInfo.CvtVal);
  if (SrcTy != S32) {
    assert(SrcTy.isScalar() && SrcTy.getSizeInBits() >= 8);
    CvtSrc = B.buildAnyExt(S32, CvtSrc).getReg(0);
  }

  assert(MI.getOpcode() != NewOpc);
  B.buildInstr(NewOpc, {MI.getOperand(0)}, {CvtSrc}, MI.getFlags());
  MI.eraseFromParent();
}

bool AMDGPUPostLegalizerCombinerHelper::matchRemoveFcanonicalize(
    MachineInstr &MI, Register &Reg) {
  const SITargetLowering *TLI = static_cast<const SITargetLowering *>(
      MF.getSubtarget().getTargetLowering());
  Reg = MI.getOperand(1).getReg();
  return TLI->isCanonicalized(Reg, MF);
}

class AMDGPUPostLegalizerCombinerHelperState {
protected:
  AMDGPUCombinerHelper &Helper;
  AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper;

public:
  AMDGPUPostLegalizerCombinerHelperState(
      AMDGPUCombinerHelper &Helper,
      AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper)
      : Helper(Helper), PostLegalizerHelper(PostLegalizerHelper) {}
};

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS

namespace {
#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H

class AMDGPUPostLegalizerCombinerInfo final : public CombinerInfo {
  GISelKnownBits *KB;
  MachineDominatorTree *MDT;

public:
  AMDGPUGenPostLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;

  AMDGPUPostLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
                                  const AMDGPULegalizerInfo *LI,
                                  GISelKnownBits *KB, MachineDominatorTree *MDT)
      : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
                     /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
        KB(KB), MDT(MDT) {
    if (!GeneratedRuleCfg.parseCommandLineOption())
      report_fatal_error("Invalid rule identifier");
  }

  bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
               MachineIRBuilder &B) const override;
};

bool AMDGPUPostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
                                              MachineInstr &MI,
                                              MachineIRBuilder &B) const {
  AMDGPUCombinerHelper Helper(Observer, B, KB, MDT, LInfo);
  AMDGPUPostLegalizerCombinerHelper PostLegalizerHelper(B, Helper);
  AMDGPUGenPostLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper,
                                                 PostLegalizerHelper);

  if (Generated.tryCombineAll(Observer, MI, B))
    return true;

  switch (MI.getOpcode()) {
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_ASHR:
    // On some subtargets, 64-bit shift is a quarter rate instruction. In the
    // common case, splitting this into a move and a 32-bit shift is faster and
    // the same code size.
    return Helper.tryCombineShiftToUnmerge(MI, 32);
  }

  return false;
}

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP

// Pass boilerplate
// ================

class AMDGPUPostLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUPostLegalizerCombiner(bool IsOptNone = false);

  StringRef getPassName() const override {
    return "AMDGPUPostLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  bool IsOptNone;
};
} // end anonymous namespace

void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  if (!IsOptNone) {
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
  }
  MachineFunctionPass::getAnalysisUsage(AU);
}

AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
    : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
  initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry());
}

bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;
  auto *TPC = &getAnalysis<TargetPassConfig>();
  const Function &F = MF.getFunction();
  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const AMDGPULegalizerInfo *LI =
      static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());

  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
  MachineDominatorTree *MDT =
      IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
  AMDGPUPostLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
                                         F.hasMinSize(), LI, KB, MDT);
  Combiner C(PCInfo, TPC);
  return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
}

char AMDGPUPostLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                      "Combine AMDGPU machine instrs after legalization",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                    "Combine AMDGPU machine instrs after legalization", false,
                    false)

namespace llvm {
FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone) {
  return new AMDGPUPostLegalizerCombiner(IsOptNone);
}
} // end namespace llvm