//=== lib/CodeGen/GlobalISel/AMDGPURegBankCombiner.cpp ---------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This pass does combining of machine instructions at the generic MI level, // after register banks are known. // //===----------------------------------------------------------------------===// #include "AMDGPU.h" #include "AMDGPULegalizerInfo.h" #include "AMDGPURegisterBankInfo.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/GlobalISel/Combiner.h" #include "llvm/CodeGen/GlobalISel/CombinerHelper.h" #include "llvm/CodeGen/GlobalISel/CombinerInfo.h" #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/Target/TargetMachine.h" #define DEBUG_TYPE "amdgpu-regbank-combiner" using namespace llvm; using namespace MIPatternMatch; class AMDGPURegBankCombinerHelper { protected: MachineIRBuilder &B; MachineFunction &MF; MachineRegisterInfo &MRI; const GCNSubtarget &Subtarget; const RegisterBankInfo &RBI; const TargetRegisterInfo &TRI; const SIInstrInfo &TII; CombinerHelper &Helper; public: AMDGPURegBankCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper) : B(B), MF(B.getMF()), MRI(*B.getMRI()), Subtarget(MF.getSubtarget()), RBI(*Subtarget.getRegBankInfo()), TRI(*Subtarget.getRegisterInfo()), TII(*Subtarget.getInstrInfo()), Helper(Helper){}; bool isVgprRegBank(Register Reg); Register getAsVgpr(Register Reg); struct MinMaxMedOpc { unsigned Min, Max, Med; }; struct Med3MatchInfo { unsigned Opc; Register Val0, Val1, Val2; }; MinMaxMedOpc getMinMaxPair(unsigned Opc); template bool matchMed(MachineInstr &MI, MachineRegisterInfo &MRI, MinMaxMedOpc MMMOpc, Register &Val, CstTy &K0, CstTy &K1); bool matchIntMinMaxToMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo); bool matchFPMinMaxToMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo); bool matchFPMinMaxToClamp(MachineInstr &MI, Register &Reg); bool matchFPMed3ToClamp(MachineInstr &MI, Register &Reg); void applyMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo); void applyClamp(MachineInstr &MI, Register &Reg); private: AMDGPU::SIModeRegisterDefaults getMode(); bool getIEEE(); bool getDX10Clamp(); bool isFminnumIeee(const MachineInstr &MI); bool isFCst(MachineInstr *MI); bool isClampZeroToOne(MachineInstr *K0, MachineInstr *K1); }; bool AMDGPURegBankCombinerHelper::isVgprRegBank(Register Reg) { return RBI.getRegBank(Reg, MRI, TRI)->getID() == AMDGPU::VGPRRegBankID; } Register AMDGPURegBankCombinerHelper::getAsVgpr(Register Reg) { if (isVgprRegBank(Reg)) return Reg; // Search for existing copy of Reg to vgpr. for (MachineInstr &Use : MRI.use_instructions(Reg)) { Register Def = Use.getOperand(0).getReg(); if (Use.getOpcode() == AMDGPU::COPY && isVgprRegBank(Def)) return Def; } // Copy Reg to vgpr. Register VgprReg = B.buildCopy(MRI.getType(Reg), Reg).getReg(0); MRI.setRegBank(VgprReg, RBI.getRegBank(AMDGPU::VGPRRegBankID)); return VgprReg; } AMDGPURegBankCombinerHelper::MinMaxMedOpc AMDGPURegBankCombinerHelper::getMinMaxPair(unsigned Opc) { switch (Opc) { default: llvm_unreachable("Unsupported opcode"); case AMDGPU::G_SMAX: case AMDGPU::G_SMIN: return {AMDGPU::G_SMIN, AMDGPU::G_SMAX, AMDGPU::G_AMDGPU_SMED3}; case AMDGPU::G_UMAX: case AMDGPU::G_UMIN: return {AMDGPU::G_UMIN, AMDGPU::G_UMAX, AMDGPU::G_AMDGPU_UMED3}; case AMDGPU::G_FMAXNUM: case AMDGPU::G_FMINNUM: return {AMDGPU::G_FMINNUM, AMDGPU::G_FMAXNUM, AMDGPU::G_AMDGPU_FMED3}; case AMDGPU::G_FMAXNUM_IEEE: case AMDGPU::G_FMINNUM_IEEE: return {AMDGPU::G_FMINNUM_IEEE, AMDGPU::G_FMAXNUM_IEEE, AMDGPU::G_AMDGPU_FMED3}; } } template bool AMDGPURegBankCombinerHelper::matchMed(MachineInstr &MI, MachineRegisterInfo &MRI, MinMaxMedOpc MMMOpc, Register &Val, CstTy &K0, CstTy &K1) { // 4 operand commutes of: min(max(Val, K0), K1). // Find K1 from outer instr: min(max(...), K1) or min(K1, max(...)). // Find K0 and Val from inner instr: max(K0, Val) or max(Val, K0). // 4 operand commutes of: max(min(Val, K1), K0). // Find K0 from outer instr: max(min(...), K0) or max(K0, min(...)). // Find K1 and Val from inner instr: min(K1, Val) or min(Val, K1). return mi_match( MI, MRI, m_any_of( m_CommutativeBinOp( MMMOpc.Min, m_CommutativeBinOp(MMMOpc.Max, m_Reg(Val), m_Cst(K0)), m_Cst(K1)), m_CommutativeBinOp( MMMOpc.Max, m_CommutativeBinOp(MMMOpc.Min, m_Reg(Val), m_Cst(K1)), m_Cst(K0)))); } bool AMDGPURegBankCombinerHelper::matchIntMinMaxToMed3( MachineInstr &MI, Med3MatchInfo &MatchInfo) { Register Dst = MI.getOperand(0).getReg(); if (!isVgprRegBank(Dst)) return false; // med3 for i16 is only available on gfx9+, and not available for v2i16. LLT Ty = MRI.getType(Dst); if ((Ty != LLT::scalar(16) || !Subtarget.hasMed3_16()) && Ty != LLT::scalar(32)) return false; MinMaxMedOpc OpcodeTriple = getMinMaxPair(MI.getOpcode()); Register Val; std::optional K0, K1; // Match min(max(Val, K0), K1) or max(min(Val, K1), K0). Then see if K0 <= K1. if (!matchMed(MI, MRI, OpcodeTriple, Val, K0, K1)) return false; if (OpcodeTriple.Med == AMDGPU::G_AMDGPU_SMED3 && K0->Value.sgt(K1->Value)) return false; if (OpcodeTriple.Med == AMDGPU::G_AMDGPU_UMED3 && K0->Value.ugt(K1->Value)) return false; MatchInfo = {OpcodeTriple.Med, Val, K0->VReg, K1->VReg}; return true; } // fmed3(NaN, K0, K1) = min(min(NaN, K0), K1) // ieee = true : min/max(SNaN, K) = QNaN, min/max(QNaN, K) = K // ieee = false : min/max(NaN, K) = K // clamp(NaN) = dx10_clamp ? 0.0 : NaN // Consider values of min(max(Val, K0), K1) and max(min(Val, K1), K0) as input. // Other operand commutes (see matchMed) give same result since min and max are // commutative. // Try to replace fp min(max(Val, K0), K1) or max(min(Val, K1), K0), KO<=K1 // with fmed3(Val, K0, K1) or clamp(Val). Clamp requires K0 = 0.0 and K1 = 1.0. // Val = SNaN only for ieee = true // fmed3(SNaN, K0, K1) = min(min(SNaN, K0), K1) = min(QNaN, K1) = K1 // min(max(SNaN, K0), K1) = min(QNaN, K1) = K1 // max(min(SNaN, K1), K0) = max(K1, K0) = K1 // Val = NaN,ieee = false or Val = QNaN,ieee = true // fmed3(NaN, K0, K1) = min(min(NaN, K0), K1) = min(K0, K1) = K0 // min(max(NaN, K0), K1) = min(K0, K1) = K0 (can clamp when dx10_clamp = true) // max(min(NaN, K1), K0) = max(K1, K0) = K1 != K0 bool AMDGPURegBankCombinerHelper::matchFPMinMaxToMed3( MachineInstr &MI, Med3MatchInfo &MatchInfo) { Register Dst = MI.getOperand(0).getReg(); LLT Ty = MRI.getType(Dst); // med3 for f16 is only available on gfx9+, and not available for v2f16. if ((Ty != LLT::scalar(16) || !Subtarget.hasMed3_16()) && Ty != LLT::scalar(32)) return false; auto OpcodeTriple = getMinMaxPair(MI.getOpcode()); Register Val; std::optional K0, K1; // Match min(max(Val, K0), K1) or max(min(Val, K1), K0). Then see if K0 <= K1. if (!matchMed(MI, MRI, OpcodeTriple, Val, K0, K1)) return false; if (K0->Value > K1->Value) return false; // For IEEE=false perform combine only when it's safe to assume that there are // no NaN inputs. Most often MI is marked with nnan fast math flag. // For IEEE=true consider NaN inputs. fmed3(NaN, K0, K1) is equivalent to // min(min(NaN, K0), K1). Safe to fold for min(max(Val, K0), K1) since inner // nodes(max/min) have same behavior when one input is NaN and other isn't. // Don't consider max(min(SNaN, K1), K0) since there is no isKnownNeverQNaN, // also post-legalizer inputs to min/max are fcanonicalized (never SNaN). if ((getIEEE() && isFminnumIeee(MI)) || isKnownNeverNaN(Dst, MRI)) { // Don't fold single use constant that can't be inlined. if ((!MRI.hasOneNonDBGUse(K0->VReg) || TII.isInlineConstant(K0->Value)) && (!MRI.hasOneNonDBGUse(K1->VReg) || TII.isInlineConstant(K1->Value))) { MatchInfo = {OpcodeTriple.Med, Val, K0->VReg, K1->VReg}; return true; } } return false; } bool AMDGPURegBankCombinerHelper::matchFPMinMaxToClamp(MachineInstr &MI, Register &Reg) { // Clamp is available on all types after regbankselect (f16, f32, f64, v2f16). auto OpcodeTriple = getMinMaxPair(MI.getOpcode()); Register Val; std::optional K0, K1; // Match min(max(Val, K0), K1) or max(min(Val, K1), K0). if (!matchMed(MI, MRI, OpcodeTriple, Val, K0, K1)) return false; if (!K0->Value.isExactlyValue(0.0) || !K1->Value.isExactlyValue(1.0)) return false; // For IEEE=false perform combine only when it's safe to assume that there are // no NaN inputs. Most often MI is marked with nnan fast math flag. // For IEEE=true consider NaN inputs. Only min(max(QNaN, 0.0), 1.0) evaluates // to 0.0 requires dx10_clamp = true. if ((getIEEE() && getDX10Clamp() && isFminnumIeee(MI) && isKnownNeverSNaN(Val, MRI)) || isKnownNeverNaN(MI.getOperand(0).getReg(), MRI)) { Reg = Val; return true; } return false; } // Replacing fmed3(NaN, 0.0, 1.0) with clamp. Requires dx10_clamp = true. // Val = SNaN only for ieee = true. It is important which operand is NaN. // min(min(SNaN, 0.0), 1.0) = min(QNaN, 1.0) = 1.0 // min(min(SNaN, 1.0), 0.0) = min(QNaN, 0.0) = 0.0 // min(min(0.0, 1.0), SNaN) = min(0.0, SNaN) = QNaN // Val = NaN,ieee = false or Val = QNaN,ieee = true // min(min(NaN, 0.0), 1.0) = min(0.0, 1.0) = 0.0 // min(min(NaN, 1.0), 0.0) = min(1.0, 0.0) = 0.0 // min(min(0.0, 1.0), NaN) = min(0.0, NaN) = 0.0 bool AMDGPURegBankCombinerHelper::matchFPMed3ToClamp(MachineInstr &MI, Register &Reg) { if (MI.getIntrinsicID() != Intrinsic::amdgcn_fmed3) return false; // In llvm-ir, clamp is often represented as an intrinsic call to // @llvm.amdgcn.fmed3.f32(%Val, 0.0, 1.0). Check for other operand orders. MachineInstr *Src0 = getDefIgnoringCopies(MI.getOperand(2).getReg(), MRI); MachineInstr *Src1 = getDefIgnoringCopies(MI.getOperand(3).getReg(), MRI); MachineInstr *Src2 = getDefIgnoringCopies(MI.getOperand(4).getReg(), MRI); if (isFCst(Src0) && !isFCst(Src1)) std::swap(Src0, Src1); if (isFCst(Src1) && !isFCst(Src2)) std::swap(Src1, Src2); if (isFCst(Src0) && !isFCst(Src1)) std::swap(Src0, Src1); if (!isClampZeroToOne(Src1, Src2)) return false; Register Val = Src0->getOperand(0).getReg(); auto isOp3Zero = [&]() { MachineInstr *Op3 = getDefIgnoringCopies(MI.getOperand(4).getReg(), MRI); if (Op3->getOpcode() == TargetOpcode::G_FCONSTANT) return Op3->getOperand(1).getFPImm()->isExactlyValue(0.0); return false; }; // For IEEE=false perform combine only when it's safe to assume that there are // no NaN inputs. Most often MI is marked with nnan fast math flag. // For IEEE=true consider NaN inputs. Requires dx10_clamp = true. Safe to fold // when Val could be QNaN. If Val can also be SNaN third input should be 0.0. if (isKnownNeverNaN(MI.getOperand(0).getReg(), MRI) || (getIEEE() && getDX10Clamp() && (isKnownNeverSNaN(Val, MRI) || isOp3Zero()))) { Reg = Val; return true; } return false; } void AMDGPURegBankCombinerHelper::applyClamp(MachineInstr &MI, Register &Reg) { B.setInstrAndDebugLoc(MI); B.buildInstr(AMDGPU::G_AMDGPU_CLAMP, {MI.getOperand(0)}, {Reg}, MI.getFlags()); MI.eraseFromParent(); } void AMDGPURegBankCombinerHelper::applyMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo) { B.setInstrAndDebugLoc(MI); B.buildInstr(MatchInfo.Opc, {MI.getOperand(0)}, {getAsVgpr(MatchInfo.Val0), getAsVgpr(MatchInfo.Val1), getAsVgpr(MatchInfo.Val2)}, MI.getFlags()); MI.eraseFromParent(); } AMDGPU::SIModeRegisterDefaults AMDGPURegBankCombinerHelper::getMode() { return MF.getInfo()->getMode(); } bool AMDGPURegBankCombinerHelper::getIEEE() { return getMode().IEEE; } bool AMDGPURegBankCombinerHelper::getDX10Clamp() { return getMode().DX10Clamp; } bool AMDGPURegBankCombinerHelper::isFminnumIeee(const MachineInstr &MI) { return MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE; } bool AMDGPURegBankCombinerHelper::isFCst(MachineInstr *MI) { return MI->getOpcode() == AMDGPU::G_FCONSTANT; } bool AMDGPURegBankCombinerHelper::isClampZeroToOne(MachineInstr *K0, MachineInstr *K1) { if (isFCst(K0) && isFCst(K1)) { const ConstantFP *KO_FPImm = K0->getOperand(1).getFPImm(); const ConstantFP *K1_FPImm = K1->getOperand(1).getFPImm(); return (KO_FPImm->isExactlyValue(0.0) && K1_FPImm->isExactlyValue(1.0)) || (KO_FPImm->isExactlyValue(1.0) && K1_FPImm->isExactlyValue(0.0)); } return false; } class AMDGPURegBankCombinerHelperState { protected: CombinerHelper &Helper; AMDGPURegBankCombinerHelper &RegBankHelper; public: AMDGPURegBankCombinerHelperState(CombinerHelper &Helper, AMDGPURegBankCombinerHelper &RegBankHelper) : Helper(Helper), RegBankHelper(RegBankHelper) {} }; #define AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_DEPS #include "AMDGPUGenRegBankGICombiner.inc" #undef AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_DEPS namespace { #define AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_H #include "AMDGPUGenRegBankGICombiner.inc" #undef AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_H class AMDGPURegBankCombinerInfo final : public CombinerInfo { GISelKnownBits *KB; MachineDominatorTree *MDT; public: AMDGPUGenRegBankCombinerHelperRuleConfig GeneratedRuleCfg; AMDGPURegBankCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize, const AMDGPULegalizerInfo *LI, GISelKnownBits *KB, MachineDominatorTree *MDT) : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true, /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize), KB(KB), MDT(MDT) { if (!GeneratedRuleCfg.parseCommandLineOption()) report_fatal_error("Invalid rule identifier"); } bool combine(GISelChangeObserver &Observer, MachineInstr &MI, MachineIRBuilder &B) const override; }; bool AMDGPURegBankCombinerInfo::combine(GISelChangeObserver &Observer, MachineInstr &MI, MachineIRBuilder &B) const { CombinerHelper Helper(Observer, B, /* IsPreLegalize*/ false, KB, MDT); AMDGPURegBankCombinerHelper RegBankHelper(B, Helper); AMDGPUGenRegBankCombinerHelper Generated(GeneratedRuleCfg, Helper, RegBankHelper); if (Generated.tryCombineAll(Observer, MI, B)) return true; return false; } #define AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_CPP #include "AMDGPUGenRegBankGICombiner.inc" #undef AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_CPP // Pass boilerplate // ================ class AMDGPURegBankCombiner : public MachineFunctionPass { public: static char ID; AMDGPURegBankCombiner(bool IsOptNone = false); StringRef getPassName() const override { return "AMDGPURegBankCombiner"; } bool runOnMachineFunction(MachineFunction &MF) override; void getAnalysisUsage(AnalysisUsage &AU) const override; private: bool IsOptNone; }; } // end anonymous namespace void AMDGPURegBankCombiner::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); AU.setPreservesCFG(); getSelectionDAGFallbackAnalysisUsage(AU); AU.addRequired(); AU.addPreserved(); if (!IsOptNone) { AU.addRequired(); AU.addPreserved(); } MachineFunctionPass::getAnalysisUsage(AU); } AMDGPURegBankCombiner::AMDGPURegBankCombiner(bool IsOptNone) : MachineFunctionPass(ID), IsOptNone(IsOptNone) { initializeAMDGPURegBankCombinerPass(*PassRegistry::getPassRegistry()); } bool AMDGPURegBankCombiner::runOnMachineFunction(MachineFunction &MF) { if (MF.getProperties().hasProperty( MachineFunctionProperties::Property::FailedISel)) return false; auto *TPC = &getAnalysis(); const Function &F = MF.getFunction(); bool EnableOpt = MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F); const GCNSubtarget &ST = MF.getSubtarget(); const AMDGPULegalizerInfo *LI = static_cast(ST.getLegalizerInfo()); GISelKnownBits *KB = &getAnalysis().get(MF); MachineDominatorTree *MDT = IsOptNone ? nullptr : &getAnalysis(); AMDGPURegBankCombinerInfo PCInfo(EnableOpt, F.hasOptSize(), F.hasMinSize(), LI, KB, MDT); Combiner C(PCInfo, TPC); return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr); } char AMDGPURegBankCombiner::ID = 0; INITIALIZE_PASS_BEGIN(AMDGPURegBankCombiner, DEBUG_TYPE, "Combine AMDGPU machine instrs after regbankselect", false, false) INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis) INITIALIZE_PASS_END(AMDGPURegBankCombiner, DEBUG_TYPE, "Combine AMDGPU machine instrs after regbankselect", false, false) namespace llvm { FunctionPass *createAMDGPURegBankCombiner(bool IsOptNone) { return new AMDGPURegBankCombiner(IsOptNone); } } // end namespace llvm