//=== lib/CodeGen/GlobalISel/AMDGPUPostLegalizerCombiner.cpp ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does combining of machine instructions at the generic MI level,
// after the legalizer.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUCombinerHelper.h"
#include "AMDGPULegalizerInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "amdgpu-postlegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;

class AMDGPUPostLegalizerCombinerHelper {
protected:
  MachineIRBuilder &B;
  MachineFunction &MF;
  MachineRegisterInfo &MRI;
  AMDGPUCombinerHelper &Helper;

public:
  AMDGPUPostLegalizerCombinerHelper(MachineIRBuilder &B,
                                    AMDGPUCombinerHelper &Helper)
      : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper) {}

  struct FMinFMaxLegacyInfo {
    Register LHS;
    Register RHS;
    Register True;
    Register False;
    CmpInst::Predicate Pred;
  };

  // TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
  bool matchFMinFMaxLegacy(MachineInstr &MI, FMinFMaxLegacyInfo &Info);
  void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,
                                         const FMinFMaxLegacyInfo &Info);

  bool matchUCharToFloat(MachineInstr &MI);
  void applyUCharToFloat(MachineInstr &MI);

  bool matchRcpSqrtToRsq(MachineInstr &MI,
                         std::function<void(MachineIRBuilder &)> &MatchInfo);

  // FIXME: Should be able to have 2 separate matchdatas rather than custom
  // struct boilerplate.
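  // CvtVal is the source value with the shift peeled off; ShiftOffset is the
  // resulting bit offset (a multiple of 8 in [8, 32)) that selects the
  // replacement G_AMDGPU_CVT_F32_UBYTEn opcode.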
  struct CvtF32UByteMatchInfo {
    Register CvtVal;
    unsigned ShiftOffset;
  };

  bool matchCvtF32UByteN(MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo);
  void applyCvtF32UByteN(MachineInstr &MI,
                         const CvtF32UByteMatchInfo &MatchInfo);

  bool matchRemoveFcanonicalize(MachineInstr &MI, Register &Reg);
};

bool AMDGPUPostLegalizerCombinerHelper::matchFMinFMaxLegacy(
    MachineInstr &MI, FMinFMaxLegacyInfo &Info) {
  // FIXME: Type predicate on pattern
  if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32))
    return false;

  Register Cond = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(Cond) ||
      !mi_match(Cond, MRI,
                m_GFCmp(m_Pred(Info.Pred), m_Reg(Info.LHS), m_Reg(Info.RHS))))
    return false;

  Info.True = MI.getOperand(2).getReg();
  Info.False = MI.getOperand(3).getReg();

  if (!(Info.LHS == Info.True && Info.RHS == Info.False) &&
      !(Info.LHS == Info.False && Info.RHS == Info.True))
    return false;

  switch (Info.Pred) {
  case CmpInst::FCMP_FALSE:
  case CmpInst::FCMP_OEQ:
  case CmpInst::FCMP_ONE:
  case CmpInst::FCMP_ORD:
  case CmpInst::FCMP_UNO:
  case CmpInst::FCMP_UEQ:
  case CmpInst::FCMP_UNE:
  case CmpInst::FCMP_TRUE:
    return false;
  default:
    return true;
  }
}

void AMDGPUPostLegalizerCombinerHelper::applySelectFCmpToFMinToFMaxLegacy(
    MachineInstr &MI, const FMinFMaxLegacyInfo &Info) {
  B.setInstrAndDebugLoc(MI);
  auto buildNewInst = [&MI, this](unsigned Opc, Register X, Register Y) {
    B.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());
  };

  switch (Info.Pred) {
  case CmpInst::FCMP_ULT:
  case CmpInst::FCMP_ULE:
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    break;
  case CmpInst::FCMP_OLE:
  case CmpInst::FCMP_OLT: {
    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with
    // NaN, so permute it based on the compare type the hardware uses.
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  case CmpInst::FCMP_UGE:
  case CmpInst::FCMP_UGT: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    break;
  }
  case CmpInst::FCMP_OGT:
  case CmpInst::FCMP_OGE: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  default:
    llvm_unreachable("predicate should not have matched");
  }

  MI.eraseFromParent();
}

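// Fold an integer-to-FP conversion (the root opcode is supplied by the
// TableGen-generated rule) into G_AMDGPU_CVT_F32_UBYTE0 when all but the low
// 8 bits of the source are known to be zero. Illustrative sketch:
//   %f:_(s32) = G_UITOFP %x:_(s32)   ; upper 24 bits of %x known zero
// becomes
//   %f:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 %x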
bool AMDGPUPostLegalizerCombinerHelper::matchUCharToFloat(MachineInstr &MI) {
  Register DstReg = MI.getOperand(0).getReg();

  // TODO: We could try to match extracting the higher bytes, which would be
  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
  // about in practice.
  LLT Ty = MRI.getType(DstReg);
  if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) {
    Register SrcReg = MI.getOperand(1).getReg();
    unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
    assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64);
    const APInt Mask = APInt::getHighBitsSet(SrcSize, SrcSize - 8);
    return Helper.getKnownBits()->maskedValueIsZero(SrcReg, Mask);
  }

  return false;
}

void AMDGPUPostLegalizerCombinerHelper::applyUCharToFloat(MachineInstr &MI) {
  B.setInstrAndDebugLoc(MI);

  const LLT S32 = LLT::scalar(32);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy != S32)
    SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);

  if (Ty == S32) {
    B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg}, {SrcReg},
                 MI.getFlags());
  } else {
    auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32}, {SrcReg},
                             MI.getFlags());
    B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
  }

  MI.eraseFromParent();
}

bool AMDGPUPostLegalizerCombinerHelper::matchRcpSqrtToRsq(
    MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {

  auto getRcpSrc = [=](const MachineInstr &MI) {
    MachineInstr *ResMI = nullptr;
    if (MI.getOpcode() == TargetOpcode::G_INTRINSIC &&
        MI.getIntrinsicID() == Intrinsic::amdgcn_rcp)
      ResMI = MRI.getVRegDef(MI.getOperand(2).getReg());

    return ResMI;
  };

  auto getSqrtSrc = [=](const MachineInstr &MI) {
    MachineInstr *SqrtSrcMI = nullptr;
    auto Match = mi_match(MI.getOperand(0).getReg(), MRI,
                          m_GFSqrt(m_MInstr(SqrtSrcMI)));
    (void)Match;
    return SqrtSrcMI;
  };

  MachineInstr *RcpSrcMI = nullptr, *SqrtSrcMI = nullptr;
  // rcp(sqrt(x))
  if ((RcpSrcMI = getRcpSrc(MI)) && (SqrtSrcMI = getSqrtSrc(*RcpSrcMI))) {
    MatchInfo = [SqrtSrcMI, &MI](MachineIRBuilder &B) {
      B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)}, false)
          .addUse(SqrtSrcMI->getOperand(0).getReg())
          .setMIFlags(MI.getFlags());
    };
    return true;
  }

  // sqrt(rcp(x))
  if ((SqrtSrcMI = getSqrtSrc(MI)) && (RcpSrcMI = getRcpSrc(*SqrtSrcMI))) {
    MatchInfo = [RcpSrcMI, &MI](MachineIRBuilder &B) {
      B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)}, false)
          .addUse(RcpSrcMI->getOperand(0).getReg())
          .setMIFlags(MI.getFlags());
    };
    return true;
  }

  return false;
}

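// Fold a constant shift feeding a G_AMDGPU_CVT_F32_UBYTEn into the byte index
// of the conversion itself. Illustrative sketch:
//   %s:_(s32) = G_LSHR %x:_(s32), 8
//   %f:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 %s
// becomes
//   %f:_(s32) = G_AMDGPU_CVT_F32_UBYTE1 %x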
bool AMDGPUPostLegalizerCombinerHelper::matchCvtF32UByteN(
    MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) {
  Register SrcReg = MI.getOperand(1).getReg();

  // Look through G_ZEXT.
  bool IsShr = mi_match(SrcReg, MRI, m_GZExt(m_Reg(SrcReg)));

  Register Src0;
  int64_t ShiftAmt;
  IsShr = mi_match(SrcReg, MRI, m_GLShr(m_Reg(Src0), m_ICst(ShiftAmt)));
  if (IsShr || mi_match(SrcReg, MRI, m_GShl(m_Reg(Src0), m_ICst(ShiftAmt)))) {
    const unsigned Offset = MI.getOpcode() - AMDGPU::G_AMDGPU_CVT_F32_UBYTE0;

    unsigned ShiftOffset = 8 * Offset;
    if (IsShr)
      ShiftOffset += ShiftAmt;
    else
      ShiftOffset -= ShiftAmt;

    MatchInfo.CvtVal = Src0;
    MatchInfo.ShiftOffset = ShiftOffset;
    return ShiftOffset < 32 && ShiftOffset >= 8 && (ShiftOffset % 8) == 0;
  }

  // TODO: Simplify demanded bits.
  return false;
}

void AMDGPUPostLegalizerCombinerHelper::applyCvtF32UByteN(
    MachineInstr &MI, const CvtF32UByteMatchInfo &MatchInfo) {
  B.setInstrAndDebugLoc(MI);
  unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8;

  const LLT S32 = LLT::scalar(32);
  Register CvtSrc = MatchInfo.CvtVal;
  LLT SrcTy = MRI.getType(MatchInfo.CvtVal);
  if (SrcTy != S32) {
    assert(SrcTy.isScalar() && SrcTy.getSizeInBits() >= 8);
    CvtSrc = B.buildAnyExt(S32, CvtSrc).getReg(0);
  }

  assert(MI.getOpcode() != NewOpc);
  B.buildInstr(NewOpc, {MI.getOperand(0)}, {CvtSrc}, MI.getFlags());
  MI.eraseFromParent();
}

bool AMDGPUPostLegalizerCombinerHelper::matchRemoveFcanonicalize(
    MachineInstr &MI, Register &Reg) {
  const SITargetLowering *TLI = static_cast<const SITargetLowering *>(
      MF.getSubtarget().getTargetLowering());
  Reg = MI.getOperand(1).getReg();
  return TLI->isCanonicalized(Reg, MF);
}

class AMDGPUPostLegalizerCombinerHelperState {
protected:
  AMDGPUCombinerHelper &Helper;
  AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper;

  // Note: pointer is necessary because Target Predicates use
  // "Subtarget->"
  const GCNSubtarget *Subtarget;

public:
  AMDGPUPostLegalizerCombinerHelperState(
      AMDGPUCombinerHelper &Helper,
      AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper,
      const GCNSubtarget &Subtarget)
      : Helper(Helper), PostLegalizerHelper(PostLegalizerHelper),
        Subtarget(&Subtarget) {}
};

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS

namespace {
#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H

class AMDGPUPostLegalizerCombinerInfo final : public CombinerInfo {
  GISelKnownBits *KB;
  MachineDominatorTree *MDT;
  const GCNSubtarget &Subtarget;

public:
  AMDGPUGenPostLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;

  AMDGPUPostLegalizerCombinerInfo(const GCNSubtarget &Subtarget, bool EnableOpt,
                                  bool OptSize, bool MinSize,
                                  const AMDGPULegalizerInfo *LI,
                                  GISelKnownBits *KB, MachineDominatorTree *MDT)
      : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
                     /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
        KB(KB), MDT(MDT), Subtarget(Subtarget) {
    if (!GeneratedRuleCfg.parseCommandLineOption())
      report_fatal_error("Invalid rule identifier");
  }

  bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
               MachineIRBuilder &B) const override;
};

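// Try the TableGen-generated rules first, then fall back to the hand-written
// 64-bit shift narrowing, which is not expressed as a generated rule.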
bool AMDGPUPostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
                                              MachineInstr &MI,
                                              MachineIRBuilder &B) const {
  AMDGPUCombinerHelper Helper(Observer, B, /*IsPreLegalize*/ false, KB, MDT,
                              LInfo);
  AMDGPUPostLegalizerCombinerHelper PostLegalizerHelper(B, Helper);
  AMDGPUGenPostLegalizerCombinerHelper Generated(
      GeneratedRuleCfg, Helper, PostLegalizerHelper, Subtarget);

  if (Generated.tryCombineAll(Observer, MI, B))
    return true;

  switch (MI.getOpcode()) {
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_ASHR:
    // On some subtargets, 64-bit shift is a quarter rate instruction. In the
    // common case, splitting this into a move and a 32-bit shift is faster and
    // the same code size.
    return Helper.tryCombineShiftToUnmerge(MI, 32);
  }

  return false;
}

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP

// Pass boilerplate
// ================

class AMDGPUPostLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUPostLegalizerCombiner(bool IsOptNone = false);

  StringRef getPassName() const override {
    return "AMDGPUPostLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  bool IsOptNone;
};
} // end anonymous namespace

void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  if (!IsOptNone) {
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
  }
  MachineFunctionPass::getAnalysisUsage(AU);
}

AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
    : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
  initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry());
}

bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;
  auto *TPC = &getAnalysis<TargetPassConfig>();
  const Function &F = MF.getFunction();
  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const AMDGPULegalizerInfo *LI =
      static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());

  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
  MachineDominatorTree *MDT =
      IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
  AMDGPUPostLegalizerCombinerInfo PCInfo(ST, EnableOpt, F.hasOptSize(),
                                         F.hasMinSize(), LI, KB, MDT);
  Combiner C(PCInfo, TPC);
  return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
}

char AMDGPUPostLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                      "Combine AMDGPU machine instrs after legalization",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                    "Combine AMDGPU machine instrs after legalization", false,
                    false)

namespace llvm {
FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone) {
  return new AMDGPUPostLegalizerCombiner(IsOptNone);
}
} // end namespace llvm