xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp (revision eea7c61590ae8968b3f1f609cf0bc8633222a94f)
//=== lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp -------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass does combining of machine instructions at the generic MI level,
10 // after the legalizer.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPU.h"
15 #include "AMDGPULegalizerInfo.h"
16 #include "GCNSubtarget.h"
17 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
18 #include "llvm/CodeGen/GlobalISel/Combiner.h"
19 #include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
20 #include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
21 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
22 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
23 #include "llvm/CodeGen/MachineDominators.h"
24 #include "llvm/CodeGen/TargetPassConfig.h"
25 #include "llvm/Target/TargetMachine.h"
26 
27 #define DEBUG_TYPE "amdgpu-postlegalizer-combiner"
28 
29 using namespace llvm;
30 using namespace MIPatternMatch;
31 
32 class AMDGPUPostLegalizerCombinerHelper {
33 protected:
34   MachineIRBuilder &B;
35   MachineFunction &MF;
36   MachineRegisterInfo &MRI;
37   CombinerHelper &Helper;
38 
39 public:
40   AMDGPUPostLegalizerCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper)
41       : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper){};
42 
43   struct FMinFMaxLegacyInfo {
44     Register LHS;
45     Register RHS;
46     Register True;
47     Register False;
48     CmpInst::Predicate Pred;
49   };
50 
51   // TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
52   bool matchFMinFMaxLegacy(MachineInstr &MI, FMinFMaxLegacyInfo &Info);
53   void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,
54                                          const FMinFMaxLegacyInfo &Info);
55 
56   bool matchUCharToFloat(MachineInstr &MI);
57   void applyUCharToFloat(MachineInstr &MI);
58 
59   // FIXME: Should be able to have 2 separate matchdatas rather than custom
60   // struct boilerplate.
61   struct CvtF32UByteMatchInfo {
62     Register CvtVal;
63     unsigned ShiftOffset;
64   };
65 
66   bool matchCvtF32UByteN(MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo);
67   void applyCvtF32UByteN(MachineInstr &MI,
68                          const CvtF32UByteMatchInfo &MatchInfo);
69 
70   bool matchRemoveFcanonicalize(MachineInstr &MI, Register &Reg);
71 };
72 
73 bool AMDGPUPostLegalizerCombinerHelper::matchFMinFMaxLegacy(
74     MachineInstr &MI, FMinFMaxLegacyInfo &Info) {
75   // FIXME: Combines should have subtarget predicates, and we shouldn't need
76   // this here.
77   if (!MF.getSubtarget<GCNSubtarget>().hasFminFmaxLegacy())
78     return false;
79 
80   // FIXME: Type predicate on pattern
81   if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32))
82     return false;
83 
84   Register Cond = MI.getOperand(1).getReg();
85   if (!MRI.hasOneNonDBGUse(Cond) ||
86       !mi_match(Cond, MRI,
87                 m_GFCmp(m_Pred(Info.Pred), m_Reg(Info.LHS), m_Reg(Info.RHS))))
88     return false;
89 
90   Info.True = MI.getOperand(2).getReg();
91   Info.False = MI.getOperand(3).getReg();
92 
93   if (!(Info.LHS == Info.True && Info.RHS == Info.False) &&
94       !(Info.LHS == Info.False && Info.RHS == Info.True))
95     return false;
96 
97   switch (Info.Pred) {
98   case CmpInst::FCMP_FALSE:
99   case CmpInst::FCMP_OEQ:
100   case CmpInst::FCMP_ONE:
101   case CmpInst::FCMP_ORD:
102   case CmpInst::FCMP_UNO:
103   case CmpInst::FCMP_UEQ:
104   case CmpInst::FCMP_UNE:
105   case CmpInst::FCMP_TRUE:
106     return false;
107   default:
108     return true;
109   }
110 }
111 
112 void AMDGPUPostLegalizerCombinerHelper::applySelectFCmpToFMinToFMaxLegacy(
113     MachineInstr &MI, const FMinFMaxLegacyInfo &Info) {
114   B.setInstrAndDebugLoc(MI);
115   auto buildNewInst = [&MI, this](unsigned Opc, Register X, Register Y) {
116     B.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());
117   };
118 
119   switch (Info.Pred) {
120   case CmpInst::FCMP_ULT:
121   case CmpInst::FCMP_ULE:
122     if (Info.LHS == Info.True)
123       buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
124     else
125       buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
126     break;
127   case CmpInst::FCMP_OLE:
128   case CmpInst::FCMP_OLT: {
129     // We need to permute the operands to get the correct NaN behavior. The
130     // selected operand is the second one based on the failing compare with NaN,
131     // so permute it based on the compare type the hardware uses.
132     if (Info.LHS == Info.True)
133       buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
134     else
135       buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
136     break;
137   }
138   case CmpInst::FCMP_UGE:
139   case CmpInst::FCMP_UGT: {
140     if (Info.LHS == Info.True)
141       buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
142     else
143       buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
144     break;
145   }
146   case CmpInst::FCMP_OGT:
147   case CmpInst::FCMP_OGE: {
148     if (Info.LHS == Info.True)
149       buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
150     else
151       buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
152     break;
153   }
154   default:
155     llvm_unreachable("predicate should not have matched");
156   }
157 
158   MI.eraseFromParent();
159 }
160 
161 bool AMDGPUPostLegalizerCombinerHelper::matchUCharToFloat(MachineInstr &MI) {
162   Register DstReg = MI.getOperand(0).getReg();
163 
164   // TODO: We could try to match extracting the higher bytes, which would be
165   // easier if i8 vectors weren't promoted to i32 vectors, particularly after
166   // types are legalized. v4i8 -> v4f32 is probably the only case to worry
167   // about in practice.
168   LLT Ty = MRI.getType(DstReg);
169   if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) {
170     Register SrcReg = MI.getOperand(1).getReg();
171     unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
172     assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64);
173     const APInt Mask = APInt::getHighBitsSet(SrcSize, SrcSize - 8);
174     return Helper.getKnownBits()->maskedValueIsZero(SrcReg, Mask);
175   }
176 
177   return false;
178 }
179 
180 void AMDGPUPostLegalizerCombinerHelper::applyUCharToFloat(MachineInstr &MI) {
181   B.setInstrAndDebugLoc(MI);
182 
183   const LLT S32 = LLT::scalar(32);
184 
185   Register DstReg = MI.getOperand(0).getReg();
186   Register SrcReg = MI.getOperand(1).getReg();
187   LLT Ty = MRI.getType(DstReg);
188   LLT SrcTy = MRI.getType(SrcReg);
189   if (SrcTy != S32)
190     SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);
191 
192   if (Ty == S32) {
193     B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg},
194                    {SrcReg}, MI.getFlags());
195   } else {
196     auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32},
197                              {SrcReg}, MI.getFlags());
198     B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
199   }
200 
201   MI.eraseFromParent();
202 }
203 
204 bool AMDGPUPostLegalizerCombinerHelper::matchCvtF32UByteN(
205     MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) {
206   Register SrcReg = MI.getOperand(1).getReg();
207 
208   // Look through G_ZEXT.
209   mi_match(SrcReg, MRI, m_GZExt(m_Reg(SrcReg)));
210 
211   Register Src0;
212   int64_t ShiftAmt;
213   bool IsShr = mi_match(SrcReg, MRI, m_GLShr(m_Reg(Src0), m_ICst(ShiftAmt)));
214   if (IsShr || mi_match(SrcReg, MRI, m_GShl(m_Reg(Src0), m_ICst(ShiftAmt)))) {
215     const unsigned Offset = MI.getOpcode() - AMDGPU::G_AMDGPU_CVT_F32_UBYTE0;
216 
217     unsigned ShiftOffset = 8 * Offset;
218     if (IsShr)
219       ShiftOffset += ShiftAmt;
220     else
221       ShiftOffset -= ShiftAmt;
222 
223     MatchInfo.CvtVal = Src0;
224     MatchInfo.ShiftOffset = ShiftOffset;
225     return ShiftOffset < 32 && ShiftOffset >= 8 && (ShiftOffset % 8) == 0;
226   }
227 
228   // TODO: Simplify demanded bits.
229   return false;
230 }
231 
232 void AMDGPUPostLegalizerCombinerHelper::applyCvtF32UByteN(
233     MachineInstr &MI, const CvtF32UByteMatchInfo &MatchInfo) {
234   B.setInstrAndDebugLoc(MI);
235   unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8;
236 
237   const LLT S32 = LLT::scalar(32);
238   Register CvtSrc = MatchInfo.CvtVal;
239   LLT SrcTy = MRI.getType(MatchInfo.CvtVal);
240   if (SrcTy != S32) {
241     assert(SrcTy.isScalar() && SrcTy.getSizeInBits() >= 8);
242     CvtSrc = B.buildAnyExt(S32, CvtSrc).getReg(0);
243   }
244 
245   assert(MI.getOpcode() != NewOpc);
246   B.buildInstr(NewOpc, {MI.getOperand(0)}, {CvtSrc}, MI.getFlags());
247   MI.eraseFromParent();
248 }
249 
250 bool AMDGPUPostLegalizerCombinerHelper::matchRemoveFcanonicalize(
251     MachineInstr &MI, Register &Reg) {
252   const SITargetLowering *TLI = static_cast<const SITargetLowering *>(
253       MF.getSubtarget().getTargetLowering());
254   Reg = MI.getOperand(1).getReg();
255   return TLI->isCanonicalized(Reg, MF);
256 }
257 
258 class AMDGPUPostLegalizerCombinerHelperState {
259 protected:
260   CombinerHelper &Helper;
261   AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper;
262 
263 public:
264   AMDGPUPostLegalizerCombinerHelperState(
265       CombinerHelper &Helper,
266       AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper)
267       : Helper(Helper), PostLegalizerHelper(PostLegalizerHelper) {}
268 };
269 
270 #define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
271 #include "AMDGPUGenPostLegalizeGICombiner.inc"
272 #undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
273 
274 namespace {
275 #define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
276 #include "AMDGPUGenPostLegalizeGICombiner.inc"
277 #undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
278 
279 class AMDGPUPostLegalizerCombinerInfo final : public CombinerInfo {
280   GISelKnownBits *KB;
281   MachineDominatorTree *MDT;
282 
283 public:
284   AMDGPUGenPostLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;
285 
286   AMDGPUPostLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
287                                   const AMDGPULegalizerInfo *LI,
288                                   GISelKnownBits *KB, MachineDominatorTree *MDT)
289       : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
290                      /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
291         KB(KB), MDT(MDT) {
292     if (!GeneratedRuleCfg.parseCommandLineOption())
293       report_fatal_error("Invalid rule identifier");
294   }
295 
296   bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
297                MachineIRBuilder &B) const override;
298 };
299 
300 bool AMDGPUPostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
301                                               MachineInstr &MI,
302                                               MachineIRBuilder &B) const {
303   CombinerHelper Helper(Observer, B, KB, MDT, LInfo);
304   AMDGPUPostLegalizerCombinerHelper PostLegalizerHelper(B, Helper);
305   AMDGPUGenPostLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper,
306                                                  PostLegalizerHelper);
307 
308   if (Generated.tryCombineAll(Observer, MI, B))
309     return true;
310 
311   switch (MI.getOpcode()) {
312   case TargetOpcode::G_SHL:
313   case TargetOpcode::G_LSHR:
314   case TargetOpcode::G_ASHR:
315     // On some subtargets, 64-bit shift is a quarter rate instruction. In the
316     // common case, splitting this into a move and a 32-bit shift is faster and
317     // the same code size.
318     return Helper.tryCombineShiftToUnmerge(MI, 32);
319   }
320 
321   return false;
322 }
323 
324 #define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
325 #include "AMDGPUGenPostLegalizeGICombiner.inc"
326 #undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
327 
328 // Pass boilerplate
329 // ================
330 
331 class AMDGPUPostLegalizerCombiner : public MachineFunctionPass {
332 public:
333   static char ID;
334 
335   AMDGPUPostLegalizerCombiner(bool IsOptNone = false);
336 
337   StringRef getPassName() const override {
338     return "AMDGPUPostLegalizerCombiner";
339   }
340 
341   bool runOnMachineFunction(MachineFunction &MF) override;
342 
343   void getAnalysisUsage(AnalysisUsage &AU) const override;
344 private:
345   bool IsOptNone;
346 };
347 } // end anonymous namespace
348 
349 void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
350   AU.addRequired<TargetPassConfig>();
351   AU.setPreservesCFG();
352   getSelectionDAGFallbackAnalysisUsage(AU);
353   AU.addRequired<GISelKnownBitsAnalysis>();
354   AU.addPreserved<GISelKnownBitsAnalysis>();
355   if (!IsOptNone) {
356     AU.addRequired<MachineDominatorTree>();
357     AU.addPreserved<MachineDominatorTree>();
358   }
359   MachineFunctionPass::getAnalysisUsage(AU);
360 }
361 
362 AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
363   : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
364   initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry());
365 }
366 
367 bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
368   if (MF.getProperties().hasProperty(
369           MachineFunctionProperties::Property::FailedISel))
370     return false;
371   auto *TPC = &getAnalysis<TargetPassConfig>();
372   const Function &F = MF.getFunction();
373   bool EnableOpt =
374       MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);
375 
376   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
377   const AMDGPULegalizerInfo *LI
378     = static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());
379 
380   GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
381   MachineDominatorTree *MDT =
382       IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
383   AMDGPUPostLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
384                                          F.hasMinSize(), LI, KB, MDT);
385   Combiner C(PCInfo, TPC);
386   return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
387 }
388 
389 char AMDGPUPostLegalizerCombiner::ID = 0;
390 INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
391                       "Combine AMDGPU machine instrs after legalization",
392                       false, false)
393 INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
394 INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
395 INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
396                     "Combine AMDGPU machine instrs after legalization", false,
397                     false)
398 
399 namespace llvm {
400 FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone) {
401   return new AMDGPUPostLegalizerCombiner(IsOptNone);
402 }
403 } // end namespace llvm
404