xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp (revision f126d349810fdb512c0b01e101342d430b947488)
1 //=== lib/CodeGen/GlobalISel/AMDGPUPostLegalizerCombiner.cpp ---------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass does combining of machine instructions at the generic MI level,
10 // after the legalizer.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPU.h"
15 #include "AMDGPUCombinerHelper.h"
16 #include "AMDGPULegalizerInfo.h"
17 #include "GCNSubtarget.h"
18 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
19 #include "llvm/CodeGen/GlobalISel/Combiner.h"
20 #include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
21 #include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
22 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/MachineDominators.h"
25 #include "llvm/CodeGen/TargetPassConfig.h"
26 #include "llvm/IR/IntrinsicsAMDGPU.h"
27 #include "llvm/Target/TargetMachine.h"
28 
29 #define DEBUG_TYPE "amdgpu-postlegalizer-combiner"
30 
31 using namespace llvm;
32 using namespace MIPatternMatch;
33 
34 class AMDGPUPostLegalizerCombinerHelper {
35 protected:
36   MachineIRBuilder &B;
37   MachineFunction &MF;
38   MachineRegisterInfo &MRI;
39   AMDGPUCombinerHelper &Helper;
40 
41 public:
42   AMDGPUPostLegalizerCombinerHelper(MachineIRBuilder &B,
43                                     AMDGPUCombinerHelper &Helper)
44       : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper){};
45 
46   struct FMinFMaxLegacyInfo {
47     Register LHS;
48     Register RHS;
49     Register True;
50     Register False;
51     CmpInst::Predicate Pred;
52   };
53 
54   // TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
55   bool matchFMinFMaxLegacy(MachineInstr &MI, FMinFMaxLegacyInfo &Info);
56   void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,
57                                          const FMinFMaxLegacyInfo &Info);
58 
59   bool matchUCharToFloat(MachineInstr &MI);
60   void applyUCharToFloat(MachineInstr &MI);
61 
62   bool matchRcpSqrtToRsq(MachineInstr &MI,
63                          std::function<void(MachineIRBuilder &)> &MatchInfo);
64 
65   // FIXME: Should be able to have 2 separate matchdatas rather than custom
66   // struct boilerplate.
67   struct CvtF32UByteMatchInfo {
68     Register CvtVal;
69     unsigned ShiftOffset;
70   };
71 
72   bool matchCvtF32UByteN(MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo);
73   void applyCvtF32UByteN(MachineInstr &MI,
74                          const CvtF32UByteMatchInfo &MatchInfo);
75 
76   bool matchRemoveFcanonicalize(MachineInstr &MI, Register &Reg);
77 };
78 
79 bool AMDGPUPostLegalizerCombinerHelper::matchFMinFMaxLegacy(
80     MachineInstr &MI, FMinFMaxLegacyInfo &Info) {
81   // FIXME: Combines should have subtarget predicates, and we shouldn't need
82   // this here.
83   if (!MF.getSubtarget<GCNSubtarget>().hasFminFmaxLegacy())
84     return false;
85 
86   // FIXME: Type predicate on pattern
87   if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32))
88     return false;
89 
90   Register Cond = MI.getOperand(1).getReg();
91   if (!MRI.hasOneNonDBGUse(Cond) ||
92       !mi_match(Cond, MRI,
93                 m_GFCmp(m_Pred(Info.Pred), m_Reg(Info.LHS), m_Reg(Info.RHS))))
94     return false;
95 
96   Info.True = MI.getOperand(2).getReg();
97   Info.False = MI.getOperand(3).getReg();
98 
99   if (!(Info.LHS == Info.True && Info.RHS == Info.False) &&
100       !(Info.LHS == Info.False && Info.RHS == Info.True))
101     return false;
102 
103   switch (Info.Pred) {
104   case CmpInst::FCMP_FALSE:
105   case CmpInst::FCMP_OEQ:
106   case CmpInst::FCMP_ONE:
107   case CmpInst::FCMP_ORD:
108   case CmpInst::FCMP_UNO:
109   case CmpInst::FCMP_UEQ:
110   case CmpInst::FCMP_UNE:
111   case CmpInst::FCMP_TRUE:
112     return false;
113   default:
114     return true;
115   }
116 }
117 
118 void AMDGPUPostLegalizerCombinerHelper::applySelectFCmpToFMinToFMaxLegacy(
119     MachineInstr &MI, const FMinFMaxLegacyInfo &Info) {
120   B.setInstrAndDebugLoc(MI);
121   auto buildNewInst = [&MI, this](unsigned Opc, Register X, Register Y) {
122     B.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());
123   };
124 
125   switch (Info.Pred) {
126   case CmpInst::FCMP_ULT:
127   case CmpInst::FCMP_ULE:
128     if (Info.LHS == Info.True)
129       buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
130     else
131       buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
132     break;
133   case CmpInst::FCMP_OLE:
134   case CmpInst::FCMP_OLT: {
135     // We need to permute the operands to get the correct NaN behavior. The
136     // selected operand is the second one based on the failing compare with NaN,
137     // so permute it based on the compare type the hardware uses.
138     if (Info.LHS == Info.True)
139       buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
140     else
141       buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
142     break;
143   }
144   case CmpInst::FCMP_UGE:
145   case CmpInst::FCMP_UGT: {
146     if (Info.LHS == Info.True)
147       buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
148     else
149       buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
150     break;
151   }
152   case CmpInst::FCMP_OGT:
153   case CmpInst::FCMP_OGE: {
154     if (Info.LHS == Info.True)
155       buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
156     else
157       buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
158     break;
159   }
160   default:
161     llvm_unreachable("predicate should not have matched");
162   }
163 
164   MI.eraseFromParent();
165 }
166 
167 bool AMDGPUPostLegalizerCombinerHelper::matchUCharToFloat(MachineInstr &MI) {
168   Register DstReg = MI.getOperand(0).getReg();
169 
170   // TODO: We could try to match extracting the higher bytes, which would be
171   // easier if i8 vectors weren't promoted to i32 vectors, particularly after
172   // types are legalized. v4i8 -> v4f32 is probably the only case to worry
173   // about in practice.
174   LLT Ty = MRI.getType(DstReg);
175   if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) {
176     Register SrcReg = MI.getOperand(1).getReg();
177     unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
178     assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64);
179     const APInt Mask = APInt::getHighBitsSet(SrcSize, SrcSize - 8);
180     return Helper.getKnownBits()->maskedValueIsZero(SrcReg, Mask);
181   }
182 
183   return false;
184 }
185 
186 void AMDGPUPostLegalizerCombinerHelper::applyUCharToFloat(MachineInstr &MI) {
187   B.setInstrAndDebugLoc(MI);
188 
189   const LLT S32 = LLT::scalar(32);
190 
191   Register DstReg = MI.getOperand(0).getReg();
192   Register SrcReg = MI.getOperand(1).getReg();
193   LLT Ty = MRI.getType(DstReg);
194   LLT SrcTy = MRI.getType(SrcReg);
195   if (SrcTy != S32)
196     SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);
197 
198   if (Ty == S32) {
199     B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg},
200                    {SrcReg}, MI.getFlags());
201   } else {
202     auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32},
203                              {SrcReg}, MI.getFlags());
204     B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
205   }
206 
207   MI.eraseFromParent();
208 }
209 
210 bool AMDGPUPostLegalizerCombinerHelper::matchRcpSqrtToRsq(
211     MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {
212 
213   auto getRcpSrc = [=](const MachineInstr &MI) {
214     MachineInstr *ResMI = nullptr;
215     if (MI.getOpcode() == TargetOpcode::G_INTRINSIC &&
216         MI.getIntrinsicID() == Intrinsic::amdgcn_rcp)
217       ResMI = MRI.getVRegDef(MI.getOperand(2).getReg());
218 
219     return ResMI;
220   };
221 
222   auto getSqrtSrc = [=](const MachineInstr &MI) {
223     MachineInstr *SqrtSrcMI = nullptr;
224     mi_match(MI.getOperand(0).getReg(), MRI, m_GFSqrt(m_MInstr(SqrtSrcMI)));
225     return SqrtSrcMI;
226   };
227 
228   MachineInstr *RcpSrcMI = nullptr, *SqrtSrcMI = nullptr;
229   // rcp(sqrt(x))
230   if ((RcpSrcMI = getRcpSrc(MI)) && (SqrtSrcMI = getSqrtSrc(*RcpSrcMI))) {
231     MatchInfo = [SqrtSrcMI, &MI](MachineIRBuilder &B) {
232       B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)}, false)
233           .addUse(SqrtSrcMI->getOperand(0).getReg())
234           .setMIFlags(MI.getFlags());
235     };
236     return true;
237   }
238 
239   // sqrt(rcp(x))
240   if ((SqrtSrcMI = getSqrtSrc(MI)) && (RcpSrcMI = getRcpSrc(*SqrtSrcMI))) {
241     MatchInfo = [RcpSrcMI, &MI](MachineIRBuilder &B) {
242       B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)}, false)
243           .addUse(RcpSrcMI->getOperand(0).getReg())
244           .setMIFlags(MI.getFlags());
245     };
246     return true;
247   }
248 
249   return false;
250 }
251 
252 bool AMDGPUPostLegalizerCombinerHelper::matchCvtF32UByteN(
253     MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) {
254   Register SrcReg = MI.getOperand(1).getReg();
255 
256   // Look through G_ZEXT.
257   mi_match(SrcReg, MRI, m_GZExt(m_Reg(SrcReg)));
258 
259   Register Src0;
260   int64_t ShiftAmt;
261   bool IsShr = mi_match(SrcReg, MRI, m_GLShr(m_Reg(Src0), m_ICst(ShiftAmt)));
262   if (IsShr || mi_match(SrcReg, MRI, m_GShl(m_Reg(Src0), m_ICst(ShiftAmt)))) {
263     const unsigned Offset = MI.getOpcode() - AMDGPU::G_AMDGPU_CVT_F32_UBYTE0;
264 
265     unsigned ShiftOffset = 8 * Offset;
266     if (IsShr)
267       ShiftOffset += ShiftAmt;
268     else
269       ShiftOffset -= ShiftAmt;
270 
271     MatchInfo.CvtVal = Src0;
272     MatchInfo.ShiftOffset = ShiftOffset;
273     return ShiftOffset < 32 && ShiftOffset >= 8 && (ShiftOffset % 8) == 0;
274   }
275 
276   // TODO: Simplify demanded bits.
277   return false;
278 }
279 
280 void AMDGPUPostLegalizerCombinerHelper::applyCvtF32UByteN(
281     MachineInstr &MI, const CvtF32UByteMatchInfo &MatchInfo) {
282   B.setInstrAndDebugLoc(MI);
283   unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8;
284 
285   const LLT S32 = LLT::scalar(32);
286   Register CvtSrc = MatchInfo.CvtVal;
287   LLT SrcTy = MRI.getType(MatchInfo.CvtVal);
288   if (SrcTy != S32) {
289     assert(SrcTy.isScalar() && SrcTy.getSizeInBits() >= 8);
290     CvtSrc = B.buildAnyExt(S32, CvtSrc).getReg(0);
291   }
292 
293   assert(MI.getOpcode() != NewOpc);
294   B.buildInstr(NewOpc, {MI.getOperand(0)}, {CvtSrc}, MI.getFlags());
295   MI.eraseFromParent();
296 }
297 
298 bool AMDGPUPostLegalizerCombinerHelper::matchRemoveFcanonicalize(
299     MachineInstr &MI, Register &Reg) {
300   const SITargetLowering *TLI = static_cast<const SITargetLowering *>(
301       MF.getSubtarget().getTargetLowering());
302   Reg = MI.getOperand(1).getReg();
303   return TLI->isCanonicalized(Reg, MF);
304 }
305 
306 class AMDGPUPostLegalizerCombinerHelperState {
307 protected:
308   AMDGPUCombinerHelper &Helper;
309   AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper;
310 
311 public:
312   AMDGPUPostLegalizerCombinerHelperState(
313       AMDGPUCombinerHelper &Helper,
314       AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper)
315       : Helper(Helper), PostLegalizerHelper(PostLegalizerHelper) {}
316 };
317 
318 #define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
319 #include "AMDGPUGenPostLegalizeGICombiner.inc"
320 #undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
321 
322 namespace {
323 #define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
324 #include "AMDGPUGenPostLegalizeGICombiner.inc"
325 #undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
326 
327 class AMDGPUPostLegalizerCombinerInfo final : public CombinerInfo {
328   GISelKnownBits *KB;
329   MachineDominatorTree *MDT;
330 
331 public:
332   AMDGPUGenPostLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;
333 
334   AMDGPUPostLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
335                                   const AMDGPULegalizerInfo *LI,
336                                   GISelKnownBits *KB, MachineDominatorTree *MDT)
337       : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
338                      /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
339         KB(KB), MDT(MDT) {
340     if (!GeneratedRuleCfg.parseCommandLineOption())
341       report_fatal_error("Invalid rule identifier");
342   }
343 
344   bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
345                MachineIRBuilder &B) const override;
346 };
347 
348 bool AMDGPUPostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
349                                               MachineInstr &MI,
350                                               MachineIRBuilder &B) const {
351   AMDGPUCombinerHelper Helper(Observer, B, KB, MDT, LInfo);
352   AMDGPUPostLegalizerCombinerHelper PostLegalizerHelper(B, Helper);
353   AMDGPUGenPostLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper,
354                                                  PostLegalizerHelper);
355 
356   if (Generated.tryCombineAll(Observer, MI, B))
357     return true;
358 
359   switch (MI.getOpcode()) {
360   case TargetOpcode::G_SHL:
361   case TargetOpcode::G_LSHR:
362   case TargetOpcode::G_ASHR:
363     // On some subtargets, 64-bit shift is a quarter rate instruction. In the
364     // common case, splitting this into a move and a 32-bit shift is faster and
365     // the same code size.
366     return Helper.tryCombineShiftToUnmerge(MI, 32);
367   }
368 
369   return false;
370 }
371 
372 #define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
373 #include "AMDGPUGenPostLegalizeGICombiner.inc"
374 #undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
375 
376 // Pass boilerplate
377 // ================
378 
379 class AMDGPUPostLegalizerCombiner : public MachineFunctionPass {
380 public:
381   static char ID;
382 
383   AMDGPUPostLegalizerCombiner(bool IsOptNone = false);
384 
385   StringRef getPassName() const override {
386     return "AMDGPUPostLegalizerCombiner";
387   }
388 
389   bool runOnMachineFunction(MachineFunction &MF) override;
390 
391   void getAnalysisUsage(AnalysisUsage &AU) const override;
392 private:
393   bool IsOptNone;
394 };
395 } // end anonymous namespace
396 
397 void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
398   AU.addRequired<TargetPassConfig>();
399   AU.setPreservesCFG();
400   getSelectionDAGFallbackAnalysisUsage(AU);
401   AU.addRequired<GISelKnownBitsAnalysis>();
402   AU.addPreserved<GISelKnownBitsAnalysis>();
403   if (!IsOptNone) {
404     AU.addRequired<MachineDominatorTree>();
405     AU.addPreserved<MachineDominatorTree>();
406   }
407   MachineFunctionPass::getAnalysisUsage(AU);
408 }
409 
410 AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
411   : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
412   initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry());
413 }
414 
415 bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
416   if (MF.getProperties().hasProperty(
417           MachineFunctionProperties::Property::FailedISel))
418     return false;
419   auto *TPC = &getAnalysis<TargetPassConfig>();
420   const Function &F = MF.getFunction();
421   bool EnableOpt =
422       MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);
423 
424   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
425   const AMDGPULegalizerInfo *LI
426     = static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());
427 
428   GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
429   MachineDominatorTree *MDT =
430       IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
431   AMDGPUPostLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
432                                          F.hasMinSize(), LI, KB, MDT);
433   Combiner C(PCInfo, TPC);
434   return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
435 }
436 
437 char AMDGPUPostLegalizerCombiner::ID = 0;
438 INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
439                       "Combine AMDGPU machine instrs after legalization",
440                       false, false)
441 INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
442 INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
443 INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
444                     "Combine AMDGPU machine instrs after legalization", false,
445                     false)
446 
447 namespace llvm {
448 FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone) {
449   return new AMDGPUPostLegalizerCombiner(IsOptNone);
450 }
451 } // end namespace llvm
452