15ffd83dbSDimitry Andric //=== lib/CodeGen/GlobalISel/AMDGPURegBankCombiner.cpp ---------------===//
25ffd83dbSDimitry Andric //
35ffd83dbSDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
45ffd83dbSDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
55ffd83dbSDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
65ffd83dbSDimitry Andric //
75ffd83dbSDimitry Andric //===----------------------------------------------------------------------===//
85ffd83dbSDimitry Andric //
95ffd83dbSDimitry Andric // This pass does combining of machine instructions at the generic MI level,
105ffd83dbSDimitry Andric // after register banks are known.
115ffd83dbSDimitry Andric //
125ffd83dbSDimitry Andric //===----------------------------------------------------------------------===//
135ffd83dbSDimitry Andric
14e8d8bef9SDimitry Andric #include "AMDGPU.h"
155ffd83dbSDimitry Andric #include "AMDGPULegalizerInfo.h"
16fe6060f1SDimitry Andric #include "AMDGPURegisterBankInfo.h"
17e8d8bef9SDimitry Andric #include "GCNSubtarget.h"
18fe6060f1SDimitry Andric #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
190eae32dcSDimitry Andric #include "SIMachineFunctionInfo.h"
205ffd83dbSDimitry Andric #include "llvm/CodeGen/GlobalISel/Combiner.h"
215ffd83dbSDimitry Andric #include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
225ffd83dbSDimitry Andric #include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
2306c3fb27SDimitry Andric #include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
245ffd83dbSDimitry Andric #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
255ffd83dbSDimitry Andric #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
265ffd83dbSDimitry Andric #include "llvm/CodeGen/MachineDominators.h"
275ffd83dbSDimitry Andric #include "llvm/CodeGen/TargetPassConfig.h"
280eae32dcSDimitry Andric #include "llvm/IR/IntrinsicsAMDGPU.h"
29e8d8bef9SDimitry Andric #include "llvm/Target/TargetMachine.h"
3006c3fb27SDimitry Andric
3106c3fb27SDimitry Andric #define GET_GICOMBINER_DEPS
3206c3fb27SDimitry Andric #include "AMDGPUGenPreLegalizeGICombiner.inc"
3306c3fb27SDimitry Andric #undef GET_GICOMBINER_DEPS
3406c3fb27SDimitry Andric
355ffd83dbSDimitry Andric #define DEBUG_TYPE "amdgpu-regbank-combiner"
365ffd83dbSDimitry Andric
375ffd83dbSDimitry Andric using namespace llvm;
385ffd83dbSDimitry Andric using namespace MIPatternMatch;
395ffd83dbSDimitry Andric
4006c3fb27SDimitry Andric namespace {
4106c3fb27SDimitry Andric #define GET_GICOMBINER_TYPES
4206c3fb27SDimitry Andric #include "AMDGPUGenRegBankGICombiner.inc"
4306c3fb27SDimitry Andric #undef GET_GICOMBINER_TYPES
4406c3fb27SDimitry Andric
455f757f3fSDimitry Andric class AMDGPURegBankCombinerImpl : public Combiner {
46fe6060f1SDimitry Andric protected:
4706c3fb27SDimitry Andric const AMDGPURegBankCombinerImplRuleConfig &RuleConfig;
4806c3fb27SDimitry Andric const GCNSubtarget &STI;
49fe6060f1SDimitry Andric const RegisterBankInfo &RBI;
50fe6060f1SDimitry Andric const TargetRegisterInfo &TRI;
510eae32dcSDimitry Andric const SIInstrInfo &TII;
525f757f3fSDimitry Andric // TODO: Make CombinerHelper methods const.
535f757f3fSDimitry Andric mutable CombinerHelper Helper;
54fe6060f1SDimitry Andric
55fe6060f1SDimitry Andric public:
5606c3fb27SDimitry Andric AMDGPURegBankCombinerImpl(
575f757f3fSDimitry Andric MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
585f757f3fSDimitry Andric GISelKnownBits &KB, GISelCSEInfo *CSEInfo,
5906c3fb27SDimitry Andric const AMDGPURegBankCombinerImplRuleConfig &RuleConfig,
605f757f3fSDimitry Andric const GCNSubtarget &STI, MachineDominatorTree *MDT,
615f757f3fSDimitry Andric const LegalizerInfo *LI);
62fe6060f1SDimitry Andric
getName()6306c3fb27SDimitry Andric static const char *getName() { return "AMDGPURegBankCombinerImpl"; }
6406c3fb27SDimitry Andric
655f757f3fSDimitry Andric bool tryCombineAll(MachineInstr &I) const override;
6606c3fb27SDimitry Andric
6706c3fb27SDimitry Andric bool isVgprRegBank(Register Reg) const;
6806c3fb27SDimitry Andric Register getAsVgpr(Register Reg) const;
69fe6060f1SDimitry Andric
70fe6060f1SDimitry Andric struct MinMaxMedOpc {
71fe6060f1SDimitry Andric unsigned Min, Max, Med;
72fe6060f1SDimitry Andric };
73fe6060f1SDimitry Andric
74fe6060f1SDimitry Andric struct Med3MatchInfo {
75fe6060f1SDimitry Andric unsigned Opc;
76fe6060f1SDimitry Andric Register Val0, Val1, Val2;
77fe6060f1SDimitry Andric };
78fe6060f1SDimitry Andric
7906c3fb27SDimitry Andric MinMaxMedOpc getMinMaxPair(unsigned Opc) const;
80fe6060f1SDimitry Andric
81349cc55cSDimitry Andric template <class m_Cst, typename CstTy>
82fe6060f1SDimitry Andric bool matchMed(MachineInstr &MI, MachineRegisterInfo &MRI, MinMaxMedOpc MMMOpc,
8306c3fb27SDimitry Andric Register &Val, CstTy &K0, CstTy &K1) const;
84fe6060f1SDimitry Andric
8506c3fb27SDimitry Andric bool matchIntMinMaxToMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo) const;
8606c3fb27SDimitry Andric bool matchFPMinMaxToMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo) const;
8706c3fb27SDimitry Andric bool matchFPMinMaxToClamp(MachineInstr &MI, Register &Reg) const;
8806c3fb27SDimitry Andric bool matchFPMed3ToClamp(MachineInstr &MI, Register &Reg) const;
8906c3fb27SDimitry Andric void applyMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo) const;
9006c3fb27SDimitry Andric void applyClamp(MachineInstr &MI, Register &Reg) const;
910eae32dcSDimitry Andric
920eae32dcSDimitry Andric private:
9306c3fb27SDimitry Andric SIModeRegisterDefaults getMode() const;
9406c3fb27SDimitry Andric bool getIEEE() const;
9506c3fb27SDimitry Andric bool getDX10Clamp() const;
9606c3fb27SDimitry Andric bool isFminnumIeee(const MachineInstr &MI) const;
9706c3fb27SDimitry Andric bool isFCst(MachineInstr *MI) const;
9806c3fb27SDimitry Andric bool isClampZeroToOne(MachineInstr *K0, MachineInstr *K1) const;
9906c3fb27SDimitry Andric
10006c3fb27SDimitry Andric #define GET_GICOMBINER_CLASS_MEMBERS
10106c3fb27SDimitry Andric #define AMDGPUSubtarget GCNSubtarget
10206c3fb27SDimitry Andric #include "AMDGPUGenRegBankGICombiner.inc"
10306c3fb27SDimitry Andric #undef GET_GICOMBINER_CLASS_MEMBERS
10406c3fb27SDimitry Andric #undef AMDGPUSubtarget
105fe6060f1SDimitry Andric };
106fe6060f1SDimitry Andric
10706c3fb27SDimitry Andric #define GET_GICOMBINER_IMPL
10806c3fb27SDimitry Andric #define AMDGPUSubtarget GCNSubtarget
10906c3fb27SDimitry Andric #include "AMDGPUGenRegBankGICombiner.inc"
11006c3fb27SDimitry Andric #undef AMDGPUSubtarget
11106c3fb27SDimitry Andric #undef GET_GICOMBINER_IMPL
11206c3fb27SDimitry Andric
AMDGPURegBankCombinerImpl(MachineFunction & MF,CombinerInfo & CInfo,const TargetPassConfig * TPC,GISelKnownBits & KB,GISelCSEInfo * CSEInfo,const AMDGPURegBankCombinerImplRuleConfig & RuleConfig,const GCNSubtarget & STI,MachineDominatorTree * MDT,const LegalizerInfo * LI)11306c3fb27SDimitry Andric AMDGPURegBankCombinerImpl::AMDGPURegBankCombinerImpl(
1145f757f3fSDimitry Andric MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
1155f757f3fSDimitry Andric GISelKnownBits &KB, GISelCSEInfo *CSEInfo,
1165f757f3fSDimitry Andric const AMDGPURegBankCombinerImplRuleConfig &RuleConfig,
1175f757f3fSDimitry Andric const GCNSubtarget &STI, MachineDominatorTree *MDT, const LegalizerInfo *LI)
1185f757f3fSDimitry Andric : Combiner(MF, CInfo, TPC, &KB, CSEInfo), RuleConfig(RuleConfig), STI(STI),
1195f757f3fSDimitry Andric RBI(*STI.getRegBankInfo()), TRI(*STI.getRegisterInfo()),
1205f757f3fSDimitry Andric TII(*STI.getInstrInfo()),
1215f757f3fSDimitry Andric Helper(Observer, B, /*IsPreLegalize*/ false, &KB, MDT, LI),
12206c3fb27SDimitry Andric #define GET_GICOMBINER_CONSTRUCTOR_INITS
12306c3fb27SDimitry Andric #include "AMDGPUGenRegBankGICombiner.inc"
12406c3fb27SDimitry Andric #undef GET_GICOMBINER_CONSTRUCTOR_INITS
12506c3fb27SDimitry Andric {
12606c3fb27SDimitry Andric }
12706c3fb27SDimitry Andric
isVgprRegBank(Register Reg) const12806c3fb27SDimitry Andric bool AMDGPURegBankCombinerImpl::isVgprRegBank(Register Reg) const {
129fe6060f1SDimitry Andric return RBI.getRegBank(Reg, MRI, TRI)->getID() == AMDGPU::VGPRRegBankID;
130fe6060f1SDimitry Andric }
131fe6060f1SDimitry Andric
getAsVgpr(Register Reg) const13206c3fb27SDimitry Andric Register AMDGPURegBankCombinerImpl::getAsVgpr(Register Reg) const {
1334824e7fdSDimitry Andric if (isVgprRegBank(Reg))
1344824e7fdSDimitry Andric return Reg;
1354824e7fdSDimitry Andric
1364824e7fdSDimitry Andric // Search for existing copy of Reg to vgpr.
1374824e7fdSDimitry Andric for (MachineInstr &Use : MRI.use_instructions(Reg)) {
1384824e7fdSDimitry Andric Register Def = Use.getOperand(0).getReg();
1394824e7fdSDimitry Andric if (Use.getOpcode() == AMDGPU::COPY && isVgprRegBank(Def))
1404824e7fdSDimitry Andric return Def;
1414824e7fdSDimitry Andric }
1424824e7fdSDimitry Andric
1434824e7fdSDimitry Andric // Copy Reg to vgpr.
1444824e7fdSDimitry Andric Register VgprReg = B.buildCopy(MRI.getType(Reg), Reg).getReg(0);
1454824e7fdSDimitry Andric MRI.setRegBank(VgprReg, RBI.getRegBank(AMDGPU::VGPRRegBankID));
1464824e7fdSDimitry Andric return VgprReg;
1474824e7fdSDimitry Andric }
1484824e7fdSDimitry Andric
14906c3fb27SDimitry Andric AMDGPURegBankCombinerImpl::MinMaxMedOpc
getMinMaxPair(unsigned Opc) const15006c3fb27SDimitry Andric AMDGPURegBankCombinerImpl::getMinMaxPair(unsigned Opc) const {
151fe6060f1SDimitry Andric switch (Opc) {
152fe6060f1SDimitry Andric default:
153fe6060f1SDimitry Andric llvm_unreachable("Unsupported opcode");
154fe6060f1SDimitry Andric case AMDGPU::G_SMAX:
155fe6060f1SDimitry Andric case AMDGPU::G_SMIN:
156fe6060f1SDimitry Andric return {AMDGPU::G_SMIN, AMDGPU::G_SMAX, AMDGPU::G_AMDGPU_SMED3};
157fe6060f1SDimitry Andric case AMDGPU::G_UMAX:
158fe6060f1SDimitry Andric case AMDGPU::G_UMIN:
159fe6060f1SDimitry Andric return {AMDGPU::G_UMIN, AMDGPU::G_UMAX, AMDGPU::G_AMDGPU_UMED3};
1600eae32dcSDimitry Andric case AMDGPU::G_FMAXNUM:
1610eae32dcSDimitry Andric case AMDGPU::G_FMINNUM:
1620eae32dcSDimitry Andric return {AMDGPU::G_FMINNUM, AMDGPU::G_FMAXNUM, AMDGPU::G_AMDGPU_FMED3};
1630eae32dcSDimitry Andric case AMDGPU::G_FMAXNUM_IEEE:
1640eae32dcSDimitry Andric case AMDGPU::G_FMINNUM_IEEE:
1650eae32dcSDimitry Andric return {AMDGPU::G_FMINNUM_IEEE, AMDGPU::G_FMAXNUM_IEEE,
1660eae32dcSDimitry Andric AMDGPU::G_AMDGPU_FMED3};
167fe6060f1SDimitry Andric }
168fe6060f1SDimitry Andric }
169fe6060f1SDimitry Andric
170349cc55cSDimitry Andric template <class m_Cst, typename CstTy>
matchMed(MachineInstr & MI,MachineRegisterInfo & MRI,MinMaxMedOpc MMMOpc,Register & Val,CstTy & K0,CstTy & K1) const17106c3fb27SDimitry Andric bool AMDGPURegBankCombinerImpl::matchMed(MachineInstr &MI,
172fe6060f1SDimitry Andric MachineRegisterInfo &MRI,
173fe6060f1SDimitry Andric MinMaxMedOpc MMMOpc, Register &Val,
17406c3fb27SDimitry Andric CstTy &K0, CstTy &K1) const {
175fe6060f1SDimitry Andric // 4 operand commutes of: min(max(Val, K0), K1).
176fe6060f1SDimitry Andric // Find K1 from outer instr: min(max(...), K1) or min(K1, max(...)).
177fe6060f1SDimitry Andric // Find K0 and Val from inner instr: max(K0, Val) or max(Val, K0).
178fe6060f1SDimitry Andric // 4 operand commutes of: max(min(Val, K1), K0).
179fe6060f1SDimitry Andric // Find K0 from outer instr: max(min(...), K0) or max(K0, min(...)).
180fe6060f1SDimitry Andric // Find K1 and Val from inner instr: min(K1, Val) or min(Val, K1).
181fe6060f1SDimitry Andric return mi_match(
182fe6060f1SDimitry Andric MI, MRI,
183fe6060f1SDimitry Andric m_any_of(
184fe6060f1SDimitry Andric m_CommutativeBinOp(
185fe6060f1SDimitry Andric MMMOpc.Min, m_CommutativeBinOp(MMMOpc.Max, m_Reg(Val), m_Cst(K0)),
186fe6060f1SDimitry Andric m_Cst(K1)),
187fe6060f1SDimitry Andric m_CommutativeBinOp(
188fe6060f1SDimitry Andric MMMOpc.Max, m_CommutativeBinOp(MMMOpc.Min, m_Reg(Val), m_Cst(K1)),
189fe6060f1SDimitry Andric m_Cst(K0))));
190fe6060f1SDimitry Andric }
191fe6060f1SDimitry Andric
matchIntMinMaxToMed3(MachineInstr & MI,Med3MatchInfo & MatchInfo) const19206c3fb27SDimitry Andric bool AMDGPURegBankCombinerImpl::matchIntMinMaxToMed3(
19306c3fb27SDimitry Andric MachineInstr &MI, Med3MatchInfo &MatchInfo) const {
194fe6060f1SDimitry Andric Register Dst = MI.getOperand(0).getReg();
195fe6060f1SDimitry Andric if (!isVgprRegBank(Dst))
196fe6060f1SDimitry Andric return false;
197fe6060f1SDimitry Andric
198fcaf7f86SDimitry Andric // med3 for i16 is only available on gfx9+, and not available for v2i16.
199fcaf7f86SDimitry Andric LLT Ty = MRI.getType(Dst);
20006c3fb27SDimitry Andric if ((Ty != LLT::scalar(16) || !STI.hasMed3_16()) && Ty != LLT::scalar(32))
201fe6060f1SDimitry Andric return false;
202fe6060f1SDimitry Andric
203fe6060f1SDimitry Andric MinMaxMedOpc OpcodeTriple = getMinMaxPair(MI.getOpcode());
204349cc55cSDimitry Andric Register Val;
205bdd1243dSDimitry Andric std::optional<ValueAndVReg> K0, K1;
206fe6060f1SDimitry Andric // Match min(max(Val, K0), K1) or max(min(Val, K1), K0). Then see if K0 <= K1.
207349cc55cSDimitry Andric if (!matchMed<GCstAndRegMatch>(MI, MRI, OpcodeTriple, Val, K0, K1))
208fe6060f1SDimitry Andric return false;
209fe6060f1SDimitry Andric
210349cc55cSDimitry Andric if (OpcodeTriple.Med == AMDGPU::G_AMDGPU_SMED3 && K0->Value.sgt(K1->Value))
211fe6060f1SDimitry Andric return false;
212349cc55cSDimitry Andric if (OpcodeTriple.Med == AMDGPU::G_AMDGPU_UMED3 && K0->Value.ugt(K1->Value))
213fe6060f1SDimitry Andric return false;
214fe6060f1SDimitry Andric
215349cc55cSDimitry Andric MatchInfo = {OpcodeTriple.Med, Val, K0->VReg, K1->VReg};
216fe6060f1SDimitry Andric return true;
217fe6060f1SDimitry Andric }
218fe6060f1SDimitry Andric
2190eae32dcSDimitry Andric // fmed3(NaN, K0, K1) = min(min(NaN, K0), K1)
2200eae32dcSDimitry Andric // ieee = true : min/max(SNaN, K) = QNaN, min/max(QNaN, K) = K
2210eae32dcSDimitry Andric // ieee = false : min/max(NaN, K) = K
2220eae32dcSDimitry Andric // clamp(NaN) = dx10_clamp ? 0.0 : NaN
2230eae32dcSDimitry Andric // Consider values of min(max(Val, K0), K1) and max(min(Val, K1), K0) as input.
2240eae32dcSDimitry Andric // Other operand commutes (see matchMed) give same result since min and max are
2250eae32dcSDimitry Andric // commutative.
2260eae32dcSDimitry Andric
// Try to replace fp min(max(Val, K0), K1) or max(min(Val, K1), K0), K0<=K1
2280eae32dcSDimitry Andric // with fmed3(Val, K0, K1) or clamp(Val). Clamp requires K0 = 0.0 and K1 = 1.0.
2290eae32dcSDimitry Andric // Val = SNaN only for ieee = true
2300eae32dcSDimitry Andric // fmed3(SNaN, K0, K1) = min(min(SNaN, K0), K1) = min(QNaN, K1) = K1
2310eae32dcSDimitry Andric // min(max(SNaN, K0), K1) = min(QNaN, K1) = K1
2320eae32dcSDimitry Andric // max(min(SNaN, K1), K0) = max(K1, K0) = K1
2330eae32dcSDimitry Andric // Val = NaN,ieee = false or Val = QNaN,ieee = true
2340eae32dcSDimitry Andric // fmed3(NaN, K0, K1) = min(min(NaN, K0), K1) = min(K0, K1) = K0
2350eae32dcSDimitry Andric // min(max(NaN, K0), K1) = min(K0, K1) = K0 (can clamp when dx10_clamp = true)
2360eae32dcSDimitry Andric // max(min(NaN, K1), K0) = max(K1, K0) = K1 != K0
matchFPMinMaxToMed3(MachineInstr & MI,Med3MatchInfo & MatchInfo) const23706c3fb27SDimitry Andric bool AMDGPURegBankCombinerImpl::matchFPMinMaxToMed3(
23806c3fb27SDimitry Andric MachineInstr &MI, Med3MatchInfo &MatchInfo) const {
2390eae32dcSDimitry Andric Register Dst = MI.getOperand(0).getReg();
2400eae32dcSDimitry Andric LLT Ty = MRI.getType(Dst);
24104eeddc0SDimitry Andric
24204eeddc0SDimitry Andric // med3 for f16 is only available on gfx9+, and not available for v2f16.
24306c3fb27SDimitry Andric if ((Ty != LLT::scalar(16) || !STI.hasMed3_16()) && Ty != LLT::scalar(32))
2440eae32dcSDimitry Andric return false;
2450eae32dcSDimitry Andric
2460eae32dcSDimitry Andric auto OpcodeTriple = getMinMaxPair(MI.getOpcode());
2470eae32dcSDimitry Andric
2480eae32dcSDimitry Andric Register Val;
249bdd1243dSDimitry Andric std::optional<FPValueAndVReg> K0, K1;
2500eae32dcSDimitry Andric // Match min(max(Val, K0), K1) or max(min(Val, K1), K0). Then see if K0 <= K1.
2510eae32dcSDimitry Andric if (!matchMed<GFCstAndRegMatch>(MI, MRI, OpcodeTriple, Val, K0, K1))
2520eae32dcSDimitry Andric return false;
2530eae32dcSDimitry Andric
2540eae32dcSDimitry Andric if (K0->Value > K1->Value)
2550eae32dcSDimitry Andric return false;
2560eae32dcSDimitry Andric
2570eae32dcSDimitry Andric // For IEEE=false perform combine only when it's safe to assume that there are
2580eae32dcSDimitry Andric // no NaN inputs. Most often MI is marked with nnan fast math flag.
2590eae32dcSDimitry Andric // For IEEE=true consider NaN inputs. fmed3(NaN, K0, K1) is equivalent to
2600eae32dcSDimitry Andric // min(min(NaN, K0), K1). Safe to fold for min(max(Val, K0), K1) since inner
2610eae32dcSDimitry Andric // nodes(max/min) have same behavior when one input is NaN and other isn't.
2620eae32dcSDimitry Andric // Don't consider max(min(SNaN, K1), K0) since there is no isKnownNeverQNaN,
2630eae32dcSDimitry Andric // also post-legalizer inputs to min/max are fcanonicalized (never SNaN).
2640eae32dcSDimitry Andric if ((getIEEE() && isFminnumIeee(MI)) || isKnownNeverNaN(Dst, MRI)) {
2650eae32dcSDimitry Andric // Don't fold single use constant that can't be inlined.
2660eae32dcSDimitry Andric if ((!MRI.hasOneNonDBGUse(K0->VReg) || TII.isInlineConstant(K0->Value)) &&
2670eae32dcSDimitry Andric (!MRI.hasOneNonDBGUse(K1->VReg) || TII.isInlineConstant(K1->Value))) {
2680eae32dcSDimitry Andric MatchInfo = {OpcodeTriple.Med, Val, K0->VReg, K1->VReg};
2690eae32dcSDimitry Andric return true;
2700eae32dcSDimitry Andric }
2710eae32dcSDimitry Andric }
2720eae32dcSDimitry Andric
2730eae32dcSDimitry Andric return false;
2740eae32dcSDimitry Andric }
2750eae32dcSDimitry Andric
matchFPMinMaxToClamp(MachineInstr & MI,Register & Reg) const27606c3fb27SDimitry Andric bool AMDGPURegBankCombinerImpl::matchFPMinMaxToClamp(MachineInstr &MI,
27706c3fb27SDimitry Andric Register &Reg) const {
2780eae32dcSDimitry Andric // Clamp is available on all types after regbankselect (f16, f32, f64, v2f16).
2790eae32dcSDimitry Andric auto OpcodeTriple = getMinMaxPair(MI.getOpcode());
2800eae32dcSDimitry Andric Register Val;
281bdd1243dSDimitry Andric std::optional<FPValueAndVReg> K0, K1;
2820eae32dcSDimitry Andric // Match min(max(Val, K0), K1) or max(min(Val, K1), K0).
2830eae32dcSDimitry Andric if (!matchMed<GFCstOrSplatGFCstMatch>(MI, MRI, OpcodeTriple, Val, K0, K1))
2840eae32dcSDimitry Andric return false;
2850eae32dcSDimitry Andric
2860eae32dcSDimitry Andric if (!K0->Value.isExactlyValue(0.0) || !K1->Value.isExactlyValue(1.0))
2870eae32dcSDimitry Andric return false;
2880eae32dcSDimitry Andric
2890eae32dcSDimitry Andric // For IEEE=false perform combine only when it's safe to assume that there are
2900eae32dcSDimitry Andric // no NaN inputs. Most often MI is marked with nnan fast math flag.
2910eae32dcSDimitry Andric // For IEEE=true consider NaN inputs. Only min(max(QNaN, 0.0), 1.0) evaluates
2920eae32dcSDimitry Andric // to 0.0 requires dx10_clamp = true.
2930eae32dcSDimitry Andric if ((getIEEE() && getDX10Clamp() && isFminnumIeee(MI) &&
2940eae32dcSDimitry Andric isKnownNeverSNaN(Val, MRI)) ||
2950eae32dcSDimitry Andric isKnownNeverNaN(MI.getOperand(0).getReg(), MRI)) {
2960eae32dcSDimitry Andric Reg = Val;
2970eae32dcSDimitry Andric return true;
2980eae32dcSDimitry Andric }
2990eae32dcSDimitry Andric
3000eae32dcSDimitry Andric return false;
3010eae32dcSDimitry Andric }
3020eae32dcSDimitry Andric
3030eae32dcSDimitry Andric // Replacing fmed3(NaN, 0.0, 1.0) with clamp. Requires dx10_clamp = true.
3040eae32dcSDimitry Andric // Val = SNaN only for ieee = true. It is important which operand is NaN.
3050eae32dcSDimitry Andric // min(min(SNaN, 0.0), 1.0) = min(QNaN, 1.0) = 1.0
3060eae32dcSDimitry Andric // min(min(SNaN, 1.0), 0.0) = min(QNaN, 0.0) = 0.0
3070eae32dcSDimitry Andric // min(min(0.0, 1.0), SNaN) = min(0.0, SNaN) = QNaN
3080eae32dcSDimitry Andric // Val = NaN,ieee = false or Val = QNaN,ieee = true
3090eae32dcSDimitry Andric // min(min(NaN, 0.0), 1.0) = min(0.0, 1.0) = 0.0
3100eae32dcSDimitry Andric // min(min(NaN, 1.0), 0.0) = min(1.0, 0.0) = 0.0
3110eae32dcSDimitry Andric // min(min(0.0, 1.0), NaN) = min(0.0, NaN) = 0.0
matchFPMed3ToClamp(MachineInstr & MI,Register & Reg) const31206c3fb27SDimitry Andric bool AMDGPURegBankCombinerImpl::matchFPMed3ToClamp(MachineInstr &MI,
31306c3fb27SDimitry Andric Register &Reg) const {
3140eae32dcSDimitry Andric // In llvm-ir, clamp is often represented as an intrinsic call to
3150eae32dcSDimitry Andric // @llvm.amdgcn.fmed3.f32(%Val, 0.0, 1.0). Check for other operand orders.
31606c3fb27SDimitry Andric MachineInstr *Src0 = getDefIgnoringCopies(MI.getOperand(1).getReg(), MRI);
31706c3fb27SDimitry Andric MachineInstr *Src1 = getDefIgnoringCopies(MI.getOperand(2).getReg(), MRI);
31806c3fb27SDimitry Andric MachineInstr *Src2 = getDefIgnoringCopies(MI.getOperand(3).getReg(), MRI);
3190eae32dcSDimitry Andric
3200eae32dcSDimitry Andric if (isFCst(Src0) && !isFCst(Src1))
3210eae32dcSDimitry Andric std::swap(Src0, Src1);
3220eae32dcSDimitry Andric if (isFCst(Src1) && !isFCst(Src2))
3230eae32dcSDimitry Andric std::swap(Src1, Src2);
3240eae32dcSDimitry Andric if (isFCst(Src0) && !isFCst(Src1))
3250eae32dcSDimitry Andric std::swap(Src0, Src1);
3260eae32dcSDimitry Andric if (!isClampZeroToOne(Src1, Src2))
3270eae32dcSDimitry Andric return false;
3280eae32dcSDimitry Andric
3290eae32dcSDimitry Andric Register Val = Src0->getOperand(0).getReg();
3300eae32dcSDimitry Andric
3310eae32dcSDimitry Andric auto isOp3Zero = [&]() {
3320eae32dcSDimitry Andric MachineInstr *Op3 = getDefIgnoringCopies(MI.getOperand(4).getReg(), MRI);
3330eae32dcSDimitry Andric if (Op3->getOpcode() == TargetOpcode::G_FCONSTANT)
3340eae32dcSDimitry Andric return Op3->getOperand(1).getFPImm()->isExactlyValue(0.0);
3350eae32dcSDimitry Andric return false;
3360eae32dcSDimitry Andric };
3370eae32dcSDimitry Andric // For IEEE=false perform combine only when it's safe to assume that there are
3380eae32dcSDimitry Andric // no NaN inputs. Most often MI is marked with nnan fast math flag.
3390eae32dcSDimitry Andric // For IEEE=true consider NaN inputs. Requires dx10_clamp = true. Safe to fold
3400eae32dcSDimitry Andric // when Val could be QNaN. If Val can also be SNaN third input should be 0.0.
3410eae32dcSDimitry Andric if (isKnownNeverNaN(MI.getOperand(0).getReg(), MRI) ||
3420eae32dcSDimitry Andric (getIEEE() && getDX10Clamp() &&
3430eae32dcSDimitry Andric (isKnownNeverSNaN(Val, MRI) || isOp3Zero()))) {
3440eae32dcSDimitry Andric Reg = Val;
3450eae32dcSDimitry Andric return true;
3460eae32dcSDimitry Andric }
3470eae32dcSDimitry Andric
3480eae32dcSDimitry Andric return false;
3490eae32dcSDimitry Andric }
3500eae32dcSDimitry Andric
applyClamp(MachineInstr & MI,Register & Reg) const35106c3fb27SDimitry Andric void AMDGPURegBankCombinerImpl::applyClamp(MachineInstr &MI,
35206c3fb27SDimitry Andric Register &Reg) const {
3530eae32dcSDimitry Andric B.buildInstr(AMDGPU::G_AMDGPU_CLAMP, {MI.getOperand(0)}, {Reg},
3540eae32dcSDimitry Andric MI.getFlags());
3550eae32dcSDimitry Andric MI.eraseFromParent();
3560eae32dcSDimitry Andric }
3570eae32dcSDimitry Andric
applyMed3(MachineInstr & MI,Med3MatchInfo & MatchInfo) const35806c3fb27SDimitry Andric void AMDGPURegBankCombinerImpl::applyMed3(MachineInstr &MI,
35906c3fb27SDimitry Andric Med3MatchInfo &MatchInfo) const {
360fe6060f1SDimitry Andric B.buildInstr(MatchInfo.Opc, {MI.getOperand(0)},
3614824e7fdSDimitry Andric {getAsVgpr(MatchInfo.Val0), getAsVgpr(MatchInfo.Val1),
3624824e7fdSDimitry Andric getAsVgpr(MatchInfo.Val2)},
3634824e7fdSDimitry Andric MI.getFlags());
364fe6060f1SDimitry Andric MI.eraseFromParent();
365fe6060f1SDimitry Andric }
366fe6060f1SDimitry Andric
getMode() const36706c3fb27SDimitry Andric SIModeRegisterDefaults AMDGPURegBankCombinerImpl::getMode() const {
3680eae32dcSDimitry Andric return MF.getInfo<SIMachineFunctionInfo>()->getMode();
3690eae32dcSDimitry Andric }
3700eae32dcSDimitry Andric
getIEEE() const37106c3fb27SDimitry Andric bool AMDGPURegBankCombinerImpl::getIEEE() const { return getMode().IEEE; }
3720eae32dcSDimitry Andric
getDX10Clamp() const37306c3fb27SDimitry Andric bool AMDGPURegBankCombinerImpl::getDX10Clamp() const {
37406c3fb27SDimitry Andric return getMode().DX10Clamp;
37506c3fb27SDimitry Andric }
3760eae32dcSDimitry Andric
isFminnumIeee(const MachineInstr & MI) const37706c3fb27SDimitry Andric bool AMDGPURegBankCombinerImpl::isFminnumIeee(const MachineInstr &MI) const {
3780eae32dcSDimitry Andric return MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE;
3790eae32dcSDimitry Andric }
3800eae32dcSDimitry Andric
isFCst(MachineInstr * MI) const38106c3fb27SDimitry Andric bool AMDGPURegBankCombinerImpl::isFCst(MachineInstr *MI) const {
3820eae32dcSDimitry Andric return MI->getOpcode() == AMDGPU::G_FCONSTANT;
3830eae32dcSDimitry Andric }
3840eae32dcSDimitry Andric
isClampZeroToOne(MachineInstr * K0,MachineInstr * K1) const38506c3fb27SDimitry Andric bool AMDGPURegBankCombinerImpl::isClampZeroToOne(MachineInstr *K0,
38606c3fb27SDimitry Andric MachineInstr *K1) const {
3870eae32dcSDimitry Andric if (isFCst(K0) && isFCst(K1)) {
3880eae32dcSDimitry Andric const ConstantFP *KO_FPImm = K0->getOperand(1).getFPImm();
3890eae32dcSDimitry Andric const ConstantFP *K1_FPImm = K1->getOperand(1).getFPImm();
3900eae32dcSDimitry Andric return (KO_FPImm->isExactlyValue(0.0) && K1_FPImm->isExactlyValue(1.0)) ||
3910eae32dcSDimitry Andric (KO_FPImm->isExactlyValue(1.0) && K1_FPImm->isExactlyValue(0.0));
3920eae32dcSDimitry Andric }
3930eae32dcSDimitry Andric return false;
3940eae32dcSDimitry Andric }
3950eae32dcSDimitry Andric
3965ffd83dbSDimitry Andric // Pass boilerplate
3975ffd83dbSDimitry Andric // ================
3985ffd83dbSDimitry Andric
3995ffd83dbSDimitry Andric class AMDGPURegBankCombiner : public MachineFunctionPass {
4005ffd83dbSDimitry Andric public:
4015ffd83dbSDimitry Andric static char ID;
4025ffd83dbSDimitry Andric
4035ffd83dbSDimitry Andric AMDGPURegBankCombiner(bool IsOptNone = false);
4045ffd83dbSDimitry Andric
getPassName() const40506c3fb27SDimitry Andric StringRef getPassName() const override { return "AMDGPURegBankCombiner"; }
4065ffd83dbSDimitry Andric
4075ffd83dbSDimitry Andric bool runOnMachineFunction(MachineFunction &MF) override;
4085ffd83dbSDimitry Andric
4095ffd83dbSDimitry Andric void getAnalysisUsage(AnalysisUsage &AU) const override;
4105f757f3fSDimitry Andric
4115ffd83dbSDimitry Andric private:
4125ffd83dbSDimitry Andric bool IsOptNone;
4135f757f3fSDimitry Andric AMDGPURegBankCombinerImplRuleConfig RuleConfig;
4145ffd83dbSDimitry Andric };
4155ffd83dbSDimitry Andric } // end anonymous namespace
4165ffd83dbSDimitry Andric
getAnalysisUsage(AnalysisUsage & AU) const4175ffd83dbSDimitry Andric void AMDGPURegBankCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
4185ffd83dbSDimitry Andric AU.addRequired<TargetPassConfig>();
4195ffd83dbSDimitry Andric AU.setPreservesCFG();
4205ffd83dbSDimitry Andric getSelectionDAGFallbackAnalysisUsage(AU);
4215ffd83dbSDimitry Andric AU.addRequired<GISelKnownBitsAnalysis>();
4225ffd83dbSDimitry Andric AU.addPreserved<GISelKnownBitsAnalysis>();
4235ffd83dbSDimitry Andric if (!IsOptNone) {
424*0fca6ea1SDimitry Andric AU.addRequired<MachineDominatorTreeWrapperPass>();
425*0fca6ea1SDimitry Andric AU.addPreserved<MachineDominatorTreeWrapperPass>();
4265ffd83dbSDimitry Andric }
4275ffd83dbSDimitry Andric MachineFunctionPass::getAnalysisUsage(AU);
4285ffd83dbSDimitry Andric }
4295ffd83dbSDimitry Andric
AMDGPURegBankCombiner(bool IsOptNone)4305ffd83dbSDimitry Andric AMDGPURegBankCombiner::AMDGPURegBankCombiner(bool IsOptNone)
4315ffd83dbSDimitry Andric : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
4325ffd83dbSDimitry Andric initializeAMDGPURegBankCombinerPass(*PassRegistry::getPassRegistry());
4335f757f3fSDimitry Andric
4345f757f3fSDimitry Andric if (!RuleConfig.parseCommandLineOption())
4355f757f3fSDimitry Andric report_fatal_error("Invalid rule identifier");
4365ffd83dbSDimitry Andric }
4375ffd83dbSDimitry Andric
runOnMachineFunction(MachineFunction & MF)4385ffd83dbSDimitry Andric bool AMDGPURegBankCombiner::runOnMachineFunction(MachineFunction &MF) {
4395ffd83dbSDimitry Andric if (MF.getProperties().hasProperty(
4405ffd83dbSDimitry Andric MachineFunctionProperties::Property::FailedISel))
4415ffd83dbSDimitry Andric return false;
4425ffd83dbSDimitry Andric auto *TPC = &getAnalysis<TargetPassConfig>();
4435ffd83dbSDimitry Andric const Function &F = MF.getFunction();
4445ffd83dbSDimitry Andric bool EnableOpt =
4455f757f3fSDimitry Andric MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F);
4465ffd83dbSDimitry Andric
4475ffd83dbSDimitry Andric const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
4485ffd83dbSDimitry Andric GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
4495f757f3fSDimitry Andric
4505f757f3fSDimitry Andric const auto *LI = ST.getLegalizerInfo();
4515ffd83dbSDimitry Andric MachineDominatorTree *MDT =
452*0fca6ea1SDimitry Andric IsOptNone ? nullptr
453*0fca6ea1SDimitry Andric : &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
4545f757f3fSDimitry Andric
4555f757f3fSDimitry Andric CombinerInfo CInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
4565f757f3fSDimitry Andric LI, EnableOpt, F.hasOptSize(), F.hasMinSize());
4575f757f3fSDimitry Andric AMDGPURegBankCombinerImpl Impl(MF, CInfo, TPC, *KB, /*CSEInfo*/ nullptr,
4585f757f3fSDimitry Andric RuleConfig, ST, MDT, LI);
4595f757f3fSDimitry Andric return Impl.combineMachineInstrs();
4605ffd83dbSDimitry Andric }
4615ffd83dbSDimitry Andric
4625ffd83dbSDimitry Andric char AMDGPURegBankCombiner::ID = 0;
4635ffd83dbSDimitry Andric INITIALIZE_PASS_BEGIN(AMDGPURegBankCombiner, DEBUG_TYPE,
4645ffd83dbSDimitry Andric "Combine AMDGPU machine instrs after regbankselect",
4655ffd83dbSDimitry Andric false, false)
4665ffd83dbSDimitry Andric INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
4675ffd83dbSDimitry Andric INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
4685ffd83dbSDimitry Andric INITIALIZE_PASS_END(AMDGPURegBankCombiner, DEBUG_TYPE,
4695ffd83dbSDimitry Andric "Combine AMDGPU machine instrs after regbankselect", false,
4705ffd83dbSDimitry Andric false)
4715ffd83dbSDimitry Andric
4725ffd83dbSDimitry Andric namespace llvm {
createAMDGPURegBankCombiner(bool IsOptNone)4735ffd83dbSDimitry Andric FunctionPass *createAMDGPURegBankCombiner(bool IsOptNone) {
4745ffd83dbSDimitry Andric return new AMDGPURegBankCombiner(IsOptNone);
4755ffd83dbSDimitry Andric }
4765ffd83dbSDimitry Andric } // end namespace llvm
477