//===- AMDGPUPostLegalizerCombiner.cpp ------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does combining of machine instructions at the generic MI level,
// after the legalizer.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUCombinerHelper.h"
#include "AMDGPULegalizerInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "amdgpu-postlegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;

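// Holds the target-specific match/apply routines that the tablegen-generated
// combiner (included from AMDGPUGenPostLegalizeGICombiner.inc below)
// dispatches to via AMDGPUPostLegalizerCombinerHelperState.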
class AMDGPUPostLegalizerCombinerHelper {
protected:
  MachineIRBuilder &B;
  MachineFunction &MF;
  MachineRegisterInfo &MRI;
  AMDGPUCombinerHelper &Helper;

public:
  AMDGPUPostLegalizerCombinerHelper(MachineIRBuilder &B,
                                    AMDGPUCombinerHelper &Helper)
      : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper) {}

  struct FMinFMaxLegacyInfo {
    Register LHS;
    Register RHS;
    Register True;
    Register False;
    CmpInst::Predicate Pred;
  };

  // TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
  bool matchFMinFMaxLegacy(MachineInstr &MI, FMinFMaxLegacyInfo &Info);
  void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,
                                         const FMinFMaxLegacyInfo &Info);

  bool matchUCharToFloat(MachineInstr &MI);
  void applyUCharToFloat(MachineInstr &MI);

  // FIXME: Should be able to have 2 separate matchdatas rather than custom
  // struct boilerplate.
  struct CvtF32UByteMatchInfo {
    Register CvtVal;
    unsigned ShiftOffset;
  };

  bool matchCvtF32UByteN(MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo);
  void applyCvtF32UByteN(MachineInstr &MI,
                         const CvtF32UByteMatchInfo &MatchInfo);

  bool matchRemoveFcanonicalize(MachineInstr &MI, Register &Reg);
};

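// Match a 32-bit G_SELECT whose condition is a single-use G_FCMP and whose
// true/false values are the compare operands themselves, roughly:
//   %c = G_FCMP ult %x, %y
//   %r = G_SELECT %c, %x, %y
// Predicates that fmin_legacy/fmax_legacy cannot express are rejected.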
bool AMDGPUPostLegalizerCombinerHelper::matchFMinFMaxLegacy(
    MachineInstr &MI, FMinFMaxLegacyInfo &Info) {
  // FIXME: Combines should have subtarget predicates, and we shouldn't need
  // this here.
  if (!MF.getSubtarget<GCNSubtarget>().hasFminFmaxLegacy())
    return false;

  // FIXME: Type predicate on pattern
  if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32))
    return false;

  Register Cond = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(Cond) ||
      !mi_match(Cond, MRI,
                m_GFCmp(m_Pred(Info.Pred), m_Reg(Info.LHS), m_Reg(Info.RHS))))
    return false;

  Info.True = MI.getOperand(2).getReg();
  Info.False = MI.getOperand(3).getReg();

  if (!(Info.LHS == Info.True && Info.RHS == Info.False) &&
      !(Info.LHS == Info.False && Info.RHS == Info.True))
    return false;

  switch (Info.Pred) {
  case CmpInst::FCMP_FALSE:
  case CmpInst::FCMP_OEQ:
  case CmpInst::FCMP_ONE:
  case CmpInst::FCMP_ORD:
  case CmpInst::FCMP_UNO:
  case CmpInst::FCMP_UEQ:
  case CmpInst::FCMP_UNE:
  case CmpInst::FCMP_TRUE:
    return false;
  default:
    return true;
  }
}

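// Rewrite the matched select as G_AMDGPU_FMIN_LEGACY/G_AMDGPU_FMAX_LEGACY,
// e.g. (select (fcmp ult x, y), x, y) -> (fmin_legacy y, x). The operand
// order is chosen per predicate so that a NaN input still produces the value
// the original select would have chosen.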
void AMDGPUPostLegalizerCombinerHelper::applySelectFCmpToFMinToFMaxLegacy(
    MachineInstr &MI, const FMinFMaxLegacyInfo &Info) {
  B.setInstrAndDebugLoc(MI);
  auto buildNewInst = [&MI, this](unsigned Opc, Register X, Register Y) {
    B.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());
  };

  switch (Info.Pred) {
  case CmpInst::FCMP_ULT:
  case CmpInst::FCMP_ULE:
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    break;
  case CmpInst::FCMP_OLE:
  case CmpInst::FCMP_OLT: {
    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with
    // NaN, so permute it based on the compare type the hardware uses.
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  case CmpInst::FCMP_UGE:
  case CmpInst::FCMP_UGT: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    break;
  }
  case CmpInst::FCMP_OGT:
  case CmpInst::FCMP_OGE: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  default:
    llvm_unreachable("predicate should not have matched");
  }

  MI.eraseFromParent();
}

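// Match an integer-to-float conversion (the generated rule applies this to
// G_UITOFP/G_SITOFP) whose source has all bits above the low byte known to be
// zero, i.e. the value already fits in an unsigned byte.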
bool AMDGPUPostLegalizerCombinerHelper::matchUCharToFloat(MachineInstr &MI) {
  Register DstReg = MI.getOperand(0).getReg();

  // TODO: We could try to match extracting the higher bytes, which would be
  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
  // about in practice.
  LLT Ty = MRI.getType(DstReg);
  if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) {
    Register SrcReg = MI.getOperand(1).getReg();
    unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
    assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64);
    const APInt Mask = APInt::getHighBitsSet(SrcSize, SrcSize - 8);
    return Helper.getKnownBits()->maskedValueIsZero(SrcReg, Mask);
  }

  return false;
}

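// Replace the conversion with G_AMDGPU_CVT_F32_UBYTE0, adjusting the source
// to 32 bits if needed and truncating the f32 result for f16 destinations.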
void AMDGPUPostLegalizerCombinerHelper::applyUCharToFloat(MachineInstr &MI) {
  B.setInstrAndDebugLoc(MI);

  const LLT S32 = LLT::scalar(32);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy != S32)
    SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);

  if (Ty == S32) {
    B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg},
                 {SrcReg}, MI.getFlags());
  } else {
    auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32},
                             {SrcReg}, MI.getFlags());
    B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
  }

  MI.eraseFromParent();
}

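// Fold a constant, byte-aligned shift (looking through G_ZEXT) into the byte
// index of a G_AMDGPU_CVT_F32_UBYTEn, e.g.
//   cvt_f32_ubyte0 (lshr x, 8) -> cvt_f32_ubyte1 x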
bool AMDGPUPostLegalizerCombinerHelper::matchCvtF32UByteN(
    MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) {
  Register SrcReg = MI.getOperand(1).getReg();

  // Look through G_ZEXT.
  mi_match(SrcReg, MRI, m_GZExt(m_Reg(SrcReg)));

  Register Src0;
  int64_t ShiftAmt;
  bool IsShr = mi_match(SrcReg, MRI, m_GLShr(m_Reg(Src0), m_ICst(ShiftAmt)));
  if (IsShr || mi_match(SrcReg, MRI, m_GShl(m_Reg(Src0), m_ICst(ShiftAmt)))) {
    const unsigned Offset = MI.getOpcode() - AMDGPU::G_AMDGPU_CVT_F32_UBYTE0;

    unsigned ShiftOffset = 8 * Offset;
    if (IsShr)
      ShiftOffset += ShiftAmt;
    else
      ShiftOffset -= ShiftAmt;

    MatchInfo.CvtVal = Src0;
    MatchInfo.ShiftOffset = ShiftOffset;
    return ShiftOffset < 32 && ShiftOffset >= 8 && (ShiftOffset % 8) == 0;
  }

  // TODO: Simplify demanded bits.
  return false;
}

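// Rebuild the conversion with the byte index implied by the folded shift,
// any-extending the shift source to 32 bits if necessary.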
void AMDGPUPostLegalizerCombinerHelper::applyCvtF32UByteN(
    MachineInstr &MI, const CvtF32UByteMatchInfo &MatchInfo) {
  B.setInstrAndDebugLoc(MI);
  unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8;

  const LLT S32 = LLT::scalar(32);
  Register CvtSrc = MatchInfo.CvtVal;
  LLT SrcTy = MRI.getType(MatchInfo.CvtVal);
  if (SrcTy != S32) {
    assert(SrcTy.isScalar() && SrcTy.getSizeInBits() >= 8);
    CvtSrc = B.buildAnyExt(S32, CvtSrc).getReg(0);
  }

  assert(MI.getOpcode() != NewOpc);
  B.buildInstr(NewOpc, {MI.getOperand(0)}, {CvtSrc}, MI.getFlags());
  MI.eraseFromParent();
}

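// A G_FCANONICALIZE whose source is already canonical (per SITargetLowering)
// is a no-op; report the source register so the combiner can use it in place
// of the result.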
bool AMDGPUPostLegalizerCombinerHelper::matchRemoveFcanonicalize(
    MachineInstr &MI, Register &Reg) {
  const SITargetLowering *TLI = static_cast<const SITargetLowering *>(
      MF.getSubtarget().getTargetLowering());
  Reg = MI.getOperand(1).getReg();
  return TLI->isCanonicalized(Reg, MF);
}

class AMDGPUPostLegalizerCombinerHelperState {
protected:
  AMDGPUCombinerHelper &Helper;
  AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper;

public:
  AMDGPUPostLegalizerCombinerHelperState(
      AMDGPUCombinerHelper &Helper,
      AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper)
      : Helper(Helper), PostLegalizerHelper(PostLegalizerHelper) {}
};

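// The tablegen-generated combiner is pulled in below in three phases
// (dependencies, helper class declaration, helper class definition), selected
// by the *_GENCOMBINERHELPER_{DEPS,H,CPP} guard macros.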
#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS

namespace {
#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H

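// CombinerInfo for the post-legalizer run: combines may not produce illegal
// operations, and any that do appear are re-legalized (ShouldLegalizeIllegal).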
class AMDGPUPostLegalizerCombinerInfo final : public CombinerInfo {
  GISelKnownBits *KB;
  MachineDominatorTree *MDT;

public:
  AMDGPUGenPostLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;

  AMDGPUPostLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
                                  const AMDGPULegalizerInfo *LI,
                                  GISelKnownBits *KB, MachineDominatorTree *MDT)
      : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
                     /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
        KB(KB), MDT(MDT) {
    if (!GeneratedRuleCfg.parseCommandLineOption())
      report_fatal_error("Invalid rule identifier");
  }

  bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
               MachineIRBuilder &B) const override;
};

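// Try the generated rules first, then fall back to the hand-rolled combine
// that narrows 64-bit shifts via tryCombineShiftToUnmerge.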
bool AMDGPUPostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
                                              MachineInstr &MI,
                                              MachineIRBuilder &B) const {
  AMDGPUCombinerHelper Helper(Observer, B, KB, MDT, LInfo);
  AMDGPUPostLegalizerCombinerHelper PostLegalizerHelper(B, Helper);
  AMDGPUGenPostLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper,
                                                 PostLegalizerHelper);

  if (Generated.tryCombineAll(Observer, MI, B))
    return true;

  switch (MI.getOpcode()) {
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_ASHR:
    // On some subtargets, 64-bit shift is a quarter rate instruction. In the
    // common case, splitting this into a move and a 32-bit shift is faster and
    // the same code size.
    return Helper.tryCombineShiftToUnmerge(MI, 32);
  }

  return false;
}

#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP

// Pass boilerplate
// ================

class AMDGPUPostLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUPostLegalizerCombiner(bool IsOptNone = false);

  StringRef getPassName() const override {
    return "AMDGPUPostLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  bool IsOptNone;
};
} // end anonymous namespace

void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  if (!IsOptNone) {
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
  }
  MachineFunctionPass::getAnalysisUsage(AU);
}

AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
    : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
  initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry());
}

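// Skip functions where instruction selection has already failed; otherwise
// build the combiner info (EnableOpt only outside -O0 and for functions not
// skipped) and run the combiner over the function.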
bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;
  auto *TPC = &getAnalysis<TargetPassConfig>();
  const Function &F = MF.getFunction();
  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const AMDGPULegalizerInfo *LI =
      static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());

  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
  MachineDominatorTree *MDT =
      IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
  AMDGPUPostLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
                                         F.hasMinSize(), LI, KB, MDT);
  Combiner C(PCInfo, TPC);
  return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
}

char AMDGPUPostLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                      "Combine AMDGPU machine instrs after legalization",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                    "Combine AMDGPU machine instrs after legalization", false,
                    false)

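// Factory for the pass, called from the AMDGPU target's pass setup.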
namespace llvm {
FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone) {
  return new AMDGPUPostLegalizerCombiner(IsOptNone);
}
} // end namespace llvm