//=== lib/CodeGen/GlobalISel/AMDGPUPostLegalizerCombiner.cpp ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does combining of machine instructions at the generic MI level,
// after the legalizer.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetMachine.h"
#include "AMDGPULegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Support/Debug.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"

#define DEBUG_TYPE "amdgpu-postlegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;

struct FMinFMaxLegacyInfo {
  Register LHS;
  Register RHS;
  Register True;
  Register False;
  CmpInst::Predicate Pred;
};

// TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
static bool matchFMinFMaxLegacy(MachineInstr &MI, MachineRegisterInfo &MRI,
                                MachineFunction &MF, FMinFMaxLegacyInfo &Info) {
  // FIXME: Combines should have subtarget predicates, and we shouldn't need
  // this here.
  if (!MF.getSubtarget<GCNSubtarget>().hasFminFmaxLegacy())
    return false;

  // FIXME: Type predicate on pattern
  if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32))
    return false;

  Register Cond = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(Cond) ||
      !mi_match(Cond, MRI,
                m_GFCmp(m_Pred(Info.Pred), m_Reg(Info.LHS), m_Reg(Info.RHS))))
    return false;

  Info.True = MI.getOperand(2).getReg();
  Info.False = MI.getOperand(3).getReg();

  if (!(Info.LHS == Info.True && Info.RHS == Info.False) &&
      !(Info.LHS == Info.False && Info.RHS == Info.True))
    return false;

  switch (Info.Pred) {
  case CmpInst::FCMP_FALSE:
  case CmpInst::FCMP_OEQ:
  case CmpInst::FCMP_ONE:
  case CmpInst::FCMP_ORD:
  case CmpInst::FCMP_UNO:
  case CmpInst::FCMP_UEQ:
  case CmpInst::FCMP_UNE:
  case CmpInst::FCMP_TRUE:
    return false;
  default:
    return true;
  }
}

static void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,
                                              const FMinFMaxLegacyInfo &Info) {

  auto buildNewInst = [&MI](unsigned Opc, Register X, Register Y) {
    MachineIRBuilder MIB(MI);
    MIB.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());
  };

  switch (Info.Pred) {
  case CmpInst::FCMP_ULT:
  case CmpInst::FCMP_ULE:
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    break;
  case CmpInst::FCMP_OLE:
  case CmpInst::FCMP_OLT: {
    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with NaN,
    // so permute it based on the compare type the hardware uses.
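    // For example, (select (fcmp olt x, y), x, y) yields 'y' when either input
    // is NaN; since the legacy min selects its second operand on the failing
    // compare, emitting fmin_legacy(x, y) preserves that result.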
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  case CmpInst::FCMP_UGE:
  case CmpInst::FCMP_UGT: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    break;
  }
  case CmpInst::FCMP_OGT:
  case CmpInst::FCMP_OGE: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  default:
    llvm_unreachable("predicate should not have matched");
  }

  MI.eraseFromParent();
}

static bool matchUCharToFloat(MachineInstr &MI, MachineRegisterInfo &MRI,
                              MachineFunction &MF, CombinerHelper &Helper) {
  Register DstReg = MI.getOperand(0).getReg();

  // TODO: We could try to match extracting the higher bytes, which would be
  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
  // about in practice.
  LLT Ty = MRI.getType(DstReg);
  if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) {
    Register SrcReg = MI.getOperand(1).getReg();
    unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
    assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64);
    const APInt Mask = APInt::getHighBitsSet(SrcSize, SrcSize - 8);
    return Helper.getKnownBits()->maskedValueIsZero(SrcReg, Mask);
  }

  return false;
}

static void applyUCharToFloat(MachineInstr &MI) {
  MachineIRBuilder B(MI);

  const LLT S32 = LLT::scalar(32);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = B.getMRI()->getType(DstReg);
  LLT SrcTy = B.getMRI()->getType(SrcReg);
  if (SrcTy != S32)
    SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);

  if (Ty == S32) {
    B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg},
                 {SrcReg}, MI.getFlags());
  } else {
    auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32},
                             {SrcReg}, MI.getFlags());
    B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
  }

  MI.eraseFromParent();
}

// FIXME: Should be able to have 2 separate matchdatas rather than custom struct
// boilerplate.
struct CvtF32UByteMatchInfo {
  Register CvtVal;
  unsigned ShiftOffset;
};

static bool matchCvtF32UByteN(MachineInstr &MI, MachineRegisterInfo &MRI,
                              MachineFunction &MF,
                              CvtF32UByteMatchInfo &MatchInfo) {
  Register SrcReg = MI.getOperand(1).getReg();

  // Look through G_ZEXT.
  mi_match(SrcReg, MRI, m_GZExt(m_Reg(SrcReg)));

  Register Src0;
  int64_t ShiftAmt;
  bool IsShr = mi_match(SrcReg, MRI, m_GLShr(m_Reg(Src0), m_ICst(ShiftAmt)));
  if (IsShr || mi_match(SrcReg, MRI, m_GShl(m_Reg(Src0), m_ICst(ShiftAmt)))) {
    const unsigned Offset = MI.getOpcode() - AMDGPU::G_AMDGPU_CVT_F32_UBYTE0;

    unsigned ShiftOffset = 8 * Offset;
    if (IsShr)
      ShiftOffset += ShiftAmt;
    else
      ShiftOffset -= ShiftAmt;

    MatchInfo.CvtVal = Src0;
    MatchInfo.ShiftOffset = ShiftOffset;
    return ShiftOffset < 32 && ShiftOffset >= 8 && (ShiftOffset % 8) == 0;
  }

  // TODO: Simplify demanded bits.
  return false;
}

static void applyCvtF32UByteN(MachineInstr &MI,
                              const CvtF32UByteMatchInfo &MatchInfo) {
  MachineIRBuilder B(MI);
  unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8;

  const LLT S32 = LLT::scalar(32);
  Register CvtSrc = MatchInfo.CvtVal;
  LLT SrcTy = B.getMRI()->getType(MatchInfo.CvtVal);
  if (SrcTy != S32) {
    assert(SrcTy.isScalar() && SrcTy.getSizeInBits() >= 8);
    CvtSrc = B.buildAnyExt(S32, CvtSrc).getReg(0);
  }

  assert(MI.getOpcode() != NewOpc);
  B.buildInstr(NewOpc, {MI.getOperand(0)}, {CvtSrc}, MI.getFlags());
  MI.eraseFromParent();
}

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS

namespace {
#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H

class AMDGPUPostLegalizerCombinerInfo : public CombinerInfo {
  GISelKnownBits *KB;
  MachineDominatorTree *MDT;

public:
  AMDGPUGenPostLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;

  AMDGPUPostLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
                                  const AMDGPULegalizerInfo *LI,
                                  GISelKnownBits *KB, MachineDominatorTree *MDT)
      : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
                     /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
        KB(KB), MDT(MDT) {
    if (!GeneratedRuleCfg.parseCommandLineOption())
      report_fatal_error("Invalid rule identifier");
  }

  bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
               MachineIRBuilder &B) const override;
};

bool AMDGPUPostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
                                              MachineInstr &MI,
                                              MachineIRBuilder &B) const {
  CombinerHelper Helper(Observer, B, KB, MDT);
  AMDGPUGenPostLegalizerCombinerHelper Generated(GeneratedRuleCfg);

  if (Generated.tryCombineAll(Observer, MI, B, Helper))
    return true;

  switch (MI.getOpcode()) {
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_ASHR:
    // On some subtargets, 64-bit shift is a quarter rate instruction. In the
    // common case, splitting this into a move and a 32-bit shift is faster and
    // the same code size.
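    // For example, a 64-bit (G_LSHR x, 48) can be rewritten to unmerge 'x' and
    // shift only its high 32-bit half right by 16, with the high half of the
    // result becoming a constant zero.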
    return Helper.tryCombineShiftToUnmerge(MI, 32);
  }

  return false;
}

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP

// Pass boilerplate
// ================

class AMDGPUPostLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUPostLegalizerCombiner(bool IsOptNone = false);

  StringRef getPassName() const override {
    return "AMDGPUPostLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;
private:
  bool IsOptNone;
};
} // end anonymous namespace

void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  if (!IsOptNone) {
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
  }
  MachineFunctionPass::getAnalysisUsage(AU);
}

AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
  : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
  initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry());
}

bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;
  auto *TPC = &getAnalysis<TargetPassConfig>();
  const Function &F = MF.getFunction();
  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const AMDGPULegalizerInfo *LI
    = static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());

  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
  MachineDominatorTree *MDT =
      IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
  AMDGPUPostLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
                                         F.hasMinSize(), LI, KB, MDT);
  Combiner C(PCInfo, TPC);
  return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
}

char AMDGPUPostLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                      "Combine AMDGPU machine instrs after legalization",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                    "Combine AMDGPU machine instrs after legalization", false,
                    false)

namespace llvm {
FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone) {
  return new AMDGPUPostLegalizerCombiner(IsOptNone);
}
} // end namespace llvm