xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp (revision 5e801ac66d24704442eba426ed13c3effb8a34e7)
//=== lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp --------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass does combining of machine instructions at the generic MI level,
10 // before the legalizer.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPU.h"
15 #include "AMDGPUCombinerHelper.h"
16 #include "AMDGPULegalizerInfo.h"
17 #include "GCNSubtarget.h"
18 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
19 #include "llvm/CodeGen/GlobalISel/Combiner.h"
20 #include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
21 #include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
22 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/MachineDominators.h"
25 #include "llvm/CodeGen/TargetPassConfig.h"
26 #include "llvm/Target/TargetMachine.h"
27 
28 #define DEBUG_TYPE "amdgpu-prelegalizer-combiner"
29 
30 using namespace llvm;
31 using namespace MIPatternMatch;
32 
33 class AMDGPUPreLegalizerCombinerHelper {
34 protected:
35   MachineIRBuilder &B;
36   MachineFunction &MF;
37   MachineRegisterInfo &MRI;
38   AMDGPUCombinerHelper &Helper;
39 
40 public:
41   AMDGPUPreLegalizerCombinerHelper(MachineIRBuilder &B,
42                                    AMDGPUCombinerHelper &Helper)
43       : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper){};
44 
45   struct ClampI64ToI16MatchInfo {
46     int64_t Cmp1 = 0;
47     int64_t Cmp2 = 0;
48     Register Origin;
49   };
50 
51   bool matchClampI64ToI16(MachineInstr &MI, MachineRegisterInfo &MRI,
52                           MachineFunction &MF,
53                           ClampI64ToI16MatchInfo &MatchInfo);
54 
55   void applyClampI64ToI16(MachineInstr &MI,
56                           const ClampI64ToI16MatchInfo &MatchInfo);
57 };
58 
59 bool AMDGPUPreLegalizerCombinerHelper::matchClampI64ToI16(
60     MachineInstr &MI, MachineRegisterInfo &MRI, MachineFunction &MF,
61     ClampI64ToI16MatchInfo &MatchInfo) {
62   assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Invalid instruction!");
63 
64   // Try to find a pattern where an i64 value should get clamped to short.
65   const LLT SrcType = MRI.getType(MI.getOperand(1).getReg());
66   if (SrcType != LLT::scalar(64))
67     return false;
68 
69   const LLT DstType = MRI.getType(MI.getOperand(0).getReg());
70   if (DstType != LLT::scalar(16))
71     return false;
72 
73   Register Base;
74 
75   auto IsApplicableForCombine = [&MatchInfo]() -> bool {
76     const auto Cmp1 = MatchInfo.Cmp1;
77     const auto Cmp2 = MatchInfo.Cmp2;
78     const auto Diff = std::abs(Cmp2 - Cmp1);
79 
80     // If the difference between both comparison values is 0 or 1, there is no
81     // need to clamp.
82     if (Diff == 0 || Diff == 1)
83       return false;
84 
85     const int64_t Min = std::numeric_limits<int16_t>::min();
86     const int64_t Max = std::numeric_limits<int16_t>::max();
87 
88     // Check if the comparison values are between SHORT_MIN and SHORT_MAX.
89     return ((Cmp2 >= Cmp1 && Cmp1 >= Min && Cmp2 <= Max) ||
90             (Cmp1 >= Cmp2 && Cmp1 <= Max && Cmp2 >= Min));
91   };
92 
93   // Try to match a combination of min / max MIR opcodes.
94   if (mi_match(MI.getOperand(1).getReg(), MRI,
95                m_GSMin(m_Reg(Base), m_ICst(MatchInfo.Cmp1)))) {
96     if (mi_match(Base, MRI,
97                  m_GSMax(m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2)))) {
98       return IsApplicableForCombine();
99     }
100   }
101 
102   if (mi_match(MI.getOperand(1).getReg(), MRI,
103                m_GSMax(m_Reg(Base), m_ICst(MatchInfo.Cmp1)))) {
104     if (mi_match(Base, MRI,
105                  m_GSMin(m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2)))) {
106       return IsApplicableForCombine();
107     }
108   }
109 
110   return false;
111 }
112 
113 // We want to find a combination of instructions that
114 // gets generated when an i64 gets clamped to i16.
115 // The corresponding pattern is:
116 // G_MAX / G_MAX for i16 <= G_TRUNC i64.
117 // This can be efficiently written as following:
118 // v_cvt_pk_i16_i32 v0, v0, v1
119 // v_med3_i32 v0, Clamp_Min, v0, Clamp_Max
120 void AMDGPUPreLegalizerCombinerHelper::applyClampI64ToI16(
121     MachineInstr &MI, const ClampI64ToI16MatchInfo &MatchInfo) {
122 
123   Register Src = MatchInfo.Origin;
124   assert(MI.getParent()->getParent()->getRegInfo().getType(Src) ==
125          LLT::scalar(64));
126   const LLT S32 = LLT::scalar(32);
127 
128   B.setMBB(*MI.getParent());
129   B.setInstrAndDebugLoc(MI);
130 
131   auto Unmerge = B.buildUnmerge(S32, Src);
132 
133   assert(MI.getOpcode() != AMDGPU::G_AMDGPU_CVT_PK_I16_I32);
134 
135   const LLT V2S16 = LLT::fixed_vector(2, 16);
136   auto CvtPk =
137       B.buildInstr(AMDGPU::G_AMDGPU_CVT_PK_I16_I32, {V2S16},
138                    {Unmerge.getReg(0), Unmerge.getReg(1)}, MI.getFlags());
139 
140   auto MinBoundary = std::min(MatchInfo.Cmp1, MatchInfo.Cmp2);
141   auto MaxBoundary = std::max(MatchInfo.Cmp1, MatchInfo.Cmp2);
142   auto MinBoundaryDst = B.buildConstant(S32, MinBoundary);
143   auto MaxBoundaryDst = B.buildConstant(S32, MaxBoundary);
144 
145   auto Bitcast = B.buildBitcast({S32}, CvtPk);
146 
147   auto Med3 = B.buildInstr(
148       AMDGPU::G_AMDGPU_SMED3, {S32},
149       {MinBoundaryDst.getReg(0), Bitcast.getReg(0), MaxBoundaryDst.getReg(0)},
150       MI.getFlags());
151 
152   B.buildTrunc(MI.getOperand(0).getReg(), Med3);
153 
154   MI.eraseFromParent();
155 }
156 
157 class AMDGPUPreLegalizerCombinerHelperState {
158 protected:
159   AMDGPUCombinerHelper &Helper;
160   AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper;
161 
162 public:
163   AMDGPUPreLegalizerCombinerHelperState(
164       AMDGPUCombinerHelper &Helper,
165       AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper)
166       : Helper(Helper), PreLegalizerHelper(PreLegalizerHelper) {}
167 };
168 
169 #define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
170 #include "AMDGPUGenPreLegalizeGICombiner.inc"
171 #undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
172 
173 namespace {
174 #define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
175 #include "AMDGPUGenPreLegalizeGICombiner.inc"
176 #undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
177 
178 class AMDGPUPreLegalizerCombinerInfo final : public CombinerInfo {
179   GISelKnownBits *KB;
180   MachineDominatorTree *MDT;
181 
182 public:
183   AMDGPUGenPreLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;
184 
185   AMDGPUPreLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
186                                   GISelKnownBits *KB, MachineDominatorTree *MDT)
187       : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
188                      /*LegalizerInfo*/ nullptr, EnableOpt, OptSize, MinSize),
189         KB(KB), MDT(MDT) {
190     if (!GeneratedRuleCfg.parseCommandLineOption())
191       report_fatal_error("Invalid rule identifier");
192   }
193 
194   virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
195                        MachineIRBuilder &B) const override;
196 };
197 
198 bool AMDGPUPreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
199                                               MachineInstr &MI,
200                                               MachineIRBuilder &B) const {
201   AMDGPUCombinerHelper Helper(Observer, B, KB, MDT);
202   AMDGPUPreLegalizerCombinerHelper PreLegalizerHelper(B, Helper);
203   AMDGPUGenPreLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper,
204                                                 PreLegalizerHelper);
205 
206   if (Generated.tryCombineAll(Observer, MI, B))
207     return true;
208 
209   switch (MI.getOpcode()) {
210   case TargetOpcode::G_CONCAT_VECTORS:
211     return Helper.tryCombineConcatVectors(MI);
212   case TargetOpcode::G_SHUFFLE_VECTOR:
213     return Helper.tryCombineShuffleVector(MI);
214   }
215 
216   return false;
217 }
218 
219 #define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
220 #include "AMDGPUGenPreLegalizeGICombiner.inc"
221 #undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
222 
223 // Pass boilerplate
224 // ================
225 
226 class AMDGPUPreLegalizerCombiner : public MachineFunctionPass {
227 public:
228   static char ID;
229 
230   AMDGPUPreLegalizerCombiner(bool IsOptNone = false);
231 
232   StringRef getPassName() const override {
233     return "AMDGPUPreLegalizerCombiner";
234   }
235 
236   bool runOnMachineFunction(MachineFunction &MF) override;
237 
238   void getAnalysisUsage(AnalysisUsage &AU) const override;
239 private:
240   bool IsOptNone;
241 };
242 } // end anonymous namespace
243 
244 void AMDGPUPreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
245   AU.addRequired<TargetPassConfig>();
246   AU.setPreservesCFG();
247   getSelectionDAGFallbackAnalysisUsage(AU);
248   AU.addRequired<GISelKnownBitsAnalysis>();
249   AU.addPreserved<GISelKnownBitsAnalysis>();
250   if (!IsOptNone) {
251     AU.addRequired<MachineDominatorTree>();
252     AU.addPreserved<MachineDominatorTree>();
253   }
254 
255   AU.addRequired<GISelCSEAnalysisWrapperPass>();
256   AU.addPreserved<GISelCSEAnalysisWrapperPass>();
257   MachineFunctionPass::getAnalysisUsage(AU);
258 }
259 
260 AMDGPUPreLegalizerCombiner::AMDGPUPreLegalizerCombiner(bool IsOptNone)
261   : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
262   initializeAMDGPUPreLegalizerCombinerPass(*PassRegistry::getPassRegistry());
263 }
264 
265 bool AMDGPUPreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
266   if (MF.getProperties().hasProperty(
267           MachineFunctionProperties::Property::FailedISel))
268     return false;
269   auto *TPC = &getAnalysis<TargetPassConfig>();
270   const Function &F = MF.getFunction();
271   bool EnableOpt =
272       MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);
273   GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
274   MachineDominatorTree *MDT =
275       IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
276   AMDGPUPreLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
277                                         F.hasMinSize(), KB, MDT);
278   // Enable CSE.
279   GISelCSEAnalysisWrapper &Wrapper =
280       getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper();
281   auto *CSEInfo = &Wrapper.get(TPC->getCSEConfig());
282 
283   Combiner C(PCInfo, TPC);
284   return C.combineMachineInstrs(MF, CSEInfo);
285 }
286 
287 char AMDGPUPreLegalizerCombiner::ID = 0;
288 INITIALIZE_PASS_BEGIN(AMDGPUPreLegalizerCombiner, DEBUG_TYPE,
289                       "Combine AMDGPU machine instrs before legalization",
290                       false, false)
291 INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
292 INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
293 INITIALIZE_PASS_END(AMDGPUPreLegalizerCombiner, DEBUG_TYPE,
294                     "Combine AMDGPU machine instrs before legalization", false,
295                     false)
296 
297 namespace llvm {
298 FunctionPass *createAMDGPUPreLegalizeCombiner(bool IsOptNone) {
299   return new AMDGPUPreLegalizerCombiner(IsOptNone);
300 }
301 } // end namespace llvm
302