//=== AMDGPUPostLegalizerCombiner.cpp --------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does combining of machine instructions at the generic MI level,
// after the legalizer.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUTargetMachine.h"
#include "AMDGPULegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Support/Debug.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"

#define DEBUG_TYPE "amdgpu-postlegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;

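// Combine a G_SELECT whose condition is a single-use G_FCMP of the same two
// operands into G_AMDGPU_FMIN_LEGACY / G_AMDGPU_FMAX_LEGACY. The operand
// order chosen in the apply step preserves the select's behavior for NaN
// inputs.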
struct FMinFMaxLegacyInfo {
  Register LHS;
  Register RHS;
  Register True;
  Register False;
  CmpInst::Predicate Pred;
};

// TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
static bool matchFMinFMaxLegacy(MachineInstr &MI, MachineRegisterInfo &MRI,
                                MachineFunction &MF, FMinFMaxLegacyInfo &Info) {
  // FIXME: Combines should have subtarget predicates, and we shouldn't need
  // this here.
  if (!MF.getSubtarget<GCNSubtarget>().hasFminFmaxLegacy())
    return false;

  // FIXME: Type predicate on pattern
  if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32))
    return false;

  Register Cond = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(Cond) ||
      !mi_match(Cond, MRI,
                m_GFCmp(m_Pred(Info.Pred), m_Reg(Info.LHS), m_Reg(Info.RHS))))
    return false;

  Info.True = MI.getOperand(2).getReg();
  Info.False = MI.getOperand(3).getReg();

  if (!(Info.LHS == Info.True && Info.RHS == Info.False) &&
      !(Info.LHS == Info.False && Info.RHS == Info.True))
    return false;

  switch (Info.Pred) {
  case CmpInst::FCMP_FALSE:
  case CmpInst::FCMP_OEQ:
  case CmpInst::FCMP_ONE:
  case CmpInst::FCMP_ORD:
  case CmpInst::FCMP_UNO:
  case CmpInst::FCMP_UEQ:
  case CmpInst::FCMP_UNE:
  case CmpInst::FCMP_TRUE:
    return false;
  default:
    return true;
  }
}

static void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,
                                              const FMinFMaxLegacyInfo &Info) {

  auto buildNewInst = [&MI](unsigned Opc, Register X, Register Y) {
    MachineIRBuilder MIB(MI);
    MIB.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());
  };

  switch (Info.Pred) {
  case CmpInst::FCMP_ULT:
  case CmpInst::FCMP_ULE:
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    break;
  case CmpInst::FCMP_OLE:
  case CmpInst::FCMP_OLT: {
    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with
    // NaN, so permute it based on the compare type the hardware uses.
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  case CmpInst::FCMP_UGE:
  case CmpInst::FCMP_UGT: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    break;
  }
  case CmpInst::FCMP_OGT:
  case CmpInst::FCMP_OGE: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  default:
    llvm_unreachable("predicate should not have matched");
  }

  MI.eraseFromParent();
}

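// Match an integer-to-float conversion producing f32 or f16 whose source is
// known (via known-bits) to be zero in all but its low 8 bits, so it can be
// lowered to the byte-wise G_AMDGPU_CVT_F32_UBYTE0 instruction.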
static bool matchUCharToFloat(MachineInstr &MI, MachineRegisterInfo &MRI,
                              MachineFunction &MF, CombinerHelper &Helper) {
  Register DstReg = MI.getOperand(0).getReg();

  // TODO: We could try to match extracting the higher bytes, which would be
  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
  // about in practice.
  LLT Ty = MRI.getType(DstReg);
  if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) {
    Register SrcReg = MI.getOperand(1).getReg();
    unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
    assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64);
    const APInt Mask = APInt::getHighBitsSet(SrcSize, SrcSize - 8);
    return Helper.getKnownBits()->maskedValueIsZero(SrcReg, Mask);
  }

  return false;
}

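// Rewrite the matched conversion to G_AMDGPU_CVT_F32_UBYTE0 on the low byte,
// inserting an fptrunc when the original destination type was f16.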
static void applyUCharToFloat(MachineInstr &MI) {
  MachineIRBuilder B(MI);

  const LLT S32 = LLT::scalar(32);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = B.getMRI()->getType(DstReg);
  LLT SrcTy = B.getMRI()->getType(SrcReg);
  if (SrcTy != S32)
    SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);

  if (Ty == S32) {
    B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg},
                 {SrcReg}, MI.getFlags());
  } else {
    auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32},
                             {SrcReg}, MI.getFlags());
    B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
  }

  MI.eraseFromParent();
}

// FIXME: Should be able to have 2 separate matchdatas rather than custom struct
// boilerplate.
struct CvtF32UByteMatchInfo {
  Register CvtVal;
  unsigned ShiftOffset;
};

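// Fold a constant shift (looked through an optional G_ZEXT) feeding a
// G_AMDGPU_CVT_F32_UBYTEn into the byte index of the conversion itself, so
// the shift can be dropped and a different UBYTE variant selected.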
static bool matchCvtF32UByteN(MachineInstr &MI, MachineRegisterInfo &MRI,
                              MachineFunction &MF,
                              CvtF32UByteMatchInfo &MatchInfo) {
  Register SrcReg = MI.getOperand(1).getReg();

  // Look through G_ZEXT.
  mi_match(SrcReg, MRI, m_GZExt(m_Reg(SrcReg)));

  Register Src0;
  int64_t ShiftAmt;
  bool IsShr = mi_match(SrcReg, MRI, m_GLShr(m_Reg(Src0), m_ICst(ShiftAmt)));
  if (IsShr || mi_match(SrcReg, MRI, m_GShl(m_Reg(Src0), m_ICst(ShiftAmt)))) {
    const unsigned Offset = MI.getOpcode() - AMDGPU::G_AMDGPU_CVT_F32_UBYTE0;

    unsigned ShiftOffset = 8 * Offset;
    if (IsShr)
      ShiftOffset += ShiftAmt;
    else
      ShiftOffset -= ShiftAmt;

    MatchInfo.CvtVal = Src0;
    MatchInfo.ShiftOffset = ShiftOffset;
    return ShiftOffset < 32 && ShiftOffset >= 8 && (ShiftOffset % 8) == 0;
  }

  // TODO: Simplify demanded bits.
  return false;
}

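// Rebuild the conversion as the G_AMDGPU_CVT_F32_UBYTE variant selected by
// the folded byte offset, reading directly from the pre-shift source value.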
static void applyCvtF32UByteN(MachineInstr &MI,
                              const CvtF32UByteMatchInfo &MatchInfo) {
  MachineIRBuilder B(MI);
  unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8;

  const LLT S32 = LLT::scalar(32);
  Register CvtSrc = MatchInfo.CvtVal;
  LLT SrcTy = B.getMRI()->getType(MatchInfo.CvtVal);
  if (SrcTy != S32) {
    assert(SrcTy.isScalar() && SrcTy.getSizeInBits() >= 8);
    CvtSrc = B.buildAnyExt(S32, CvtSrc).getReg(0);
  }

  assert(MI.getOpcode() != NewOpc);
  B.buildInstr(NewOpc, {MI.getOperand(0)}, {CvtSrc}, MI.getFlags());
  MI.eraseFromParent();
}

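// The table-generated portion of the combiner (rule configuration and the
// AMDGPUGenPostLegalizerCombinerHelper class) is pulled in from
// AMDGPUGenPostLegalizeGICombiner.inc below.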
#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS

namespace {
#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H

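// CombinerInfo implementation that runs the generated rule set first and then
// falls back to a small set of manually written combines.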
class AMDGPUPostLegalizerCombinerInfo : public CombinerInfo {
  GISelKnownBits *KB;
  MachineDominatorTree *MDT;

public:
  AMDGPUGenPostLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;

  AMDGPUPostLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
                                  const AMDGPULegalizerInfo *LI,
                                  GISelKnownBits *KB, MachineDominatorTree *MDT)
      : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
                     /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
        KB(KB), MDT(MDT) {
    if (!GeneratedRuleCfg.parseCommandLineOption())
      report_fatal_error("Invalid rule identifier");
  }

  bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
               MachineIRBuilder &B) const override;
};

bool AMDGPUPostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
                                              MachineInstr &MI,
                                              MachineIRBuilder &B) const {
  CombinerHelper Helper(Observer, B, KB, MDT);
  AMDGPUGenPostLegalizerCombinerHelper Generated(GeneratedRuleCfg);

  if (Generated.tryCombineAll(Observer, MI, B, Helper))
    return true;

  switch (MI.getOpcode()) {
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_ASHR:
    // On some subtargets, 64-bit shift is a quarter rate instruction. In the
    // common case, splitting this into a move and a 32-bit shift is faster and
    // the same code size.
    return Helper.tryCombineShiftToUnmerge(MI, 32);
  }

  return false;
}

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP

// Pass boilerplate
// ================

class AMDGPUPostLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUPostLegalizerCombiner(bool IsOptNone = false);

  StringRef getPassName() const override {
    return "AMDGPUPostLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;
private:
  bool IsOptNone;
};
} // end anonymous namespace

void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  if (!IsOptNone) {
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
  }
  MachineFunctionPass::getAnalysisUsage(AU);
}

AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
  : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
  initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry());
}

bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;
  auto *TPC = &getAnalysis<TargetPassConfig>();
  const Function &F = MF.getFunction();
  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const AMDGPULegalizerInfo *LI
    = static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());

  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
  MachineDominatorTree *MDT =
      IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
  AMDGPUPostLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
                                         F.hasMinSize(), LI, KB, MDT);
  Combiner C(PCInfo, TPC);
  return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
}

char AMDGPUPostLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                      "Combine AMDGPU machine instrs after legalization",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                    "Combine AMDGPU machine instrs after legalization", false,
                    false)

namespace llvm {
FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone) {
  return new AMDGPUPostLegalizerCombiner(IsOptNone);
}
} // end namespace llvm