xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp (revision b2d2a78ad80ec68d4a17f5aef97d21686cb1e29b)
1 //=== lib/CodeGen/GlobalISel/AMDGPUCombinerHelper.cpp ---------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "AMDGPUCombinerHelper.h"
10 #include "GCNSubtarget.h"
11 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
12 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
13 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
14 #include "llvm/IR/IntrinsicsAMDGPU.h"
15 #include "llvm/Target/TargetMachine.h"
16 
17 using namespace llvm;
18 using namespace MIPatternMatch;
19 
20 LLVM_READNONE
21 static bool fnegFoldsIntoMI(const MachineInstr &MI) {
22   switch (MI.getOpcode()) {
23   case AMDGPU::G_FADD:
24   case AMDGPU::G_FSUB:
25   case AMDGPU::G_FMUL:
26   case AMDGPU::G_FMA:
27   case AMDGPU::G_FMAD:
28   case AMDGPU::G_FMINNUM:
29   case AMDGPU::G_FMAXNUM:
30   case AMDGPU::G_FMINNUM_IEEE:
31   case AMDGPU::G_FMAXNUM_IEEE:
32   case AMDGPU::G_FMINIMUM:
33   case AMDGPU::G_FMAXIMUM:
34   case AMDGPU::G_FSIN:
35   case AMDGPU::G_FPEXT:
36   case AMDGPU::G_INTRINSIC_TRUNC:
37   case AMDGPU::G_FPTRUNC:
38   case AMDGPU::G_FRINT:
39   case AMDGPU::G_FNEARBYINT:
40   case AMDGPU::G_INTRINSIC_ROUND:
41   case AMDGPU::G_INTRINSIC_ROUNDEVEN:
42   case AMDGPU::G_FCANONICALIZE:
43   case AMDGPU::G_AMDGPU_RCP_IFLAG:
44   case AMDGPU::G_AMDGPU_FMIN_LEGACY:
45   case AMDGPU::G_AMDGPU_FMAX_LEGACY:
46     return true;
47   case AMDGPU::G_INTRINSIC: {
48     Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
49     switch (IntrinsicID) {
50     case Intrinsic::amdgcn_rcp:
51     case Intrinsic::amdgcn_rcp_legacy:
52     case Intrinsic::amdgcn_sin:
53     case Intrinsic::amdgcn_fmul_legacy:
54     case Intrinsic::amdgcn_fmed3:
55     case Intrinsic::amdgcn_fma_legacy:
56       return true;
57     default:
58       return false;
59     }
60   }
61   default:
62     return false;
63   }
64 }
65 
66 /// \p returns true if the operation will definitely need to use a 64-bit
67 /// encoding, and thus will use a VOP3 encoding regardless of the source
68 /// modifiers.
69 LLVM_READONLY
70 static bool opMustUseVOP3Encoding(const MachineInstr &MI,
71                                   const MachineRegisterInfo &MRI) {
72   return MI.getNumOperands() > (isa<GIntrinsic>(MI) ? 4u : 3u) ||
73          MRI.getType(MI.getOperand(0).getReg()).getScalarSizeInBits() == 64;
74 }
75 
76 // Most FP instructions support source modifiers.
77 LLVM_READONLY
78 static bool hasSourceMods(const MachineInstr &MI) {
79   if (!MI.memoperands().empty())
80     return false;
81 
82   switch (MI.getOpcode()) {
83   case AMDGPU::COPY:
84   case AMDGPU::G_SELECT:
85   case AMDGPU::G_FDIV:
86   case AMDGPU::G_FREM:
87   case TargetOpcode::INLINEASM:
88   case TargetOpcode::INLINEASM_BR:
89   case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
90   case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
91   case AMDGPU::G_BITCAST:
92   case AMDGPU::G_ANYEXT:
93   case AMDGPU::G_BUILD_VECTOR:
94   case AMDGPU::G_BUILD_VECTOR_TRUNC:
95   case AMDGPU::G_PHI:
96     return false;
97   case AMDGPU::G_INTRINSIC:
98   case AMDGPU::G_INTRINSIC_CONVERGENT: {
99     Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
100     switch (IntrinsicID) {
101     case Intrinsic::amdgcn_interp_p1:
102     case Intrinsic::amdgcn_interp_p2:
103     case Intrinsic::amdgcn_interp_mov:
104     case Intrinsic::amdgcn_interp_p1_f16:
105     case Intrinsic::amdgcn_interp_p2_f16:
106     case Intrinsic::amdgcn_div_scale:
107       return false;
108     default:
109       return true;
110     }
111   }
112   default:
113     return true;
114   }
115 }
116 
117 static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI,
118                                   unsigned CostThreshold = 4) {
119   // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
120   // it is truly free to use a source modifier in all cases. If there are
121   // multiple users but for each one will necessitate using VOP3, there will be
122   // a code size increase. Try to avoid increasing code size unless we know it
123   // will save on the instruction count.
124   unsigned NumMayIncreaseSize = 0;
125   Register Dst = MI.getOperand(0).getReg();
126   for (const MachineInstr &Use : MRI.use_nodbg_instructions(Dst)) {
127     if (!hasSourceMods(Use))
128       return false;
129 
130     if (!opMustUseVOP3Encoding(Use, MRI)) {
131       if (++NumMayIncreaseSize > CostThreshold)
132         return false;
133     }
134   }
135   return true;
136 }
137 
138 static bool mayIgnoreSignedZero(MachineInstr &MI) {
139   const TargetOptions &Options = MI.getMF()->getTarget().Options;
140   return Options.NoSignedZerosFPMath || MI.getFlag(MachineInstr::MIFlag::FmNsz);
141 }
142 
143 static bool isInv2Pi(const APFloat &APF) {
144   static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
145   static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
146   static const APFloat KF64(APFloat::IEEEdouble(),
147                             APInt(64, 0x3fc45f306dc9c882));
148 
149   return APF.bitwiseIsEqual(KF16) || APF.bitwiseIsEqual(KF32) ||
150          APF.bitwiseIsEqual(KF64);
151 }
152 
153 // 0 and 1.0 / (0.5 * pi) do not have inline immmediates, so there is an
154 // additional cost to negate them.
155 static bool isConstantCostlierToNegate(MachineInstr &MI, Register Reg,
156                                        MachineRegisterInfo &MRI) {
157   std::optional<FPValueAndVReg> FPValReg;
158   if (mi_match(Reg, MRI, m_GFCstOrSplat(FPValReg))) {
159     if (FPValReg->Value.isZero() && !FPValReg->Value.isNegative())
160       return true;
161 
162     const GCNSubtarget &ST = MI.getMF()->getSubtarget<GCNSubtarget>();
163     if (ST.hasInv2PiInlineImm() && isInv2Pi(FPValReg->Value))
164       return true;
165   }
166   return false;
167 }
168 
169 static unsigned inverseMinMax(unsigned Opc) {
170   switch (Opc) {
171   case AMDGPU::G_FMAXNUM:
172     return AMDGPU::G_FMINNUM;
173   case AMDGPU::G_FMINNUM:
174     return AMDGPU::G_FMAXNUM;
175   case AMDGPU::G_FMAXNUM_IEEE:
176     return AMDGPU::G_FMINNUM_IEEE;
177   case AMDGPU::G_FMINNUM_IEEE:
178     return AMDGPU::G_FMAXNUM_IEEE;
179   case AMDGPU::G_FMAXIMUM:
180     return AMDGPU::G_FMINIMUM;
181   case AMDGPU::G_FMINIMUM:
182     return AMDGPU::G_FMAXIMUM;
183   case AMDGPU::G_AMDGPU_FMAX_LEGACY:
184     return AMDGPU::G_AMDGPU_FMIN_LEGACY;
185   case AMDGPU::G_AMDGPU_FMIN_LEGACY:
186     return AMDGPU::G_AMDGPU_FMAX_LEGACY;
187   default:
188     llvm_unreachable("invalid min/max opcode");
189   }
190 }
191 
192 bool AMDGPUCombinerHelper::matchFoldableFneg(MachineInstr &MI,
193                                              MachineInstr *&MatchInfo) {
194   Register Src = MI.getOperand(1).getReg();
195   MatchInfo = MRI.getVRegDef(Src);
196 
197   // If the input has multiple uses and we can either fold the negate down, or
198   // the other uses cannot, give up. This both prevents unprofitable
199   // transformations and infinite loops: we won't repeatedly try to fold around
200   // a negate that has no 'good' form.
201   if (MRI.hasOneNonDBGUse(Src)) {
202     if (allUsesHaveSourceMods(MI, MRI, 0))
203       return false;
204   } else {
205     if (fnegFoldsIntoMI(*MatchInfo) &&
206         (allUsesHaveSourceMods(MI, MRI) ||
207          !allUsesHaveSourceMods(*MatchInfo, MRI)))
208       return false;
209   }
210 
211   switch (MatchInfo->getOpcode()) {
212   case AMDGPU::G_FMINNUM:
213   case AMDGPU::G_FMAXNUM:
214   case AMDGPU::G_FMINNUM_IEEE:
215   case AMDGPU::G_FMAXNUM_IEEE:
216   case AMDGPU::G_FMINIMUM:
217   case AMDGPU::G_FMAXIMUM:
218   case AMDGPU::G_AMDGPU_FMIN_LEGACY:
219   case AMDGPU::G_AMDGPU_FMAX_LEGACY:
220     // 0 doesn't have a negated inline immediate.
221     return !isConstantCostlierToNegate(*MatchInfo,
222                                        MatchInfo->getOperand(2).getReg(), MRI);
223   case AMDGPU::G_FADD:
224   case AMDGPU::G_FSUB:
225   case AMDGPU::G_FMA:
226   case AMDGPU::G_FMAD:
227     return mayIgnoreSignedZero(*MatchInfo);
228   case AMDGPU::G_FMUL:
229   case AMDGPU::G_FPEXT:
230   case AMDGPU::G_INTRINSIC_TRUNC:
231   case AMDGPU::G_FPTRUNC:
232   case AMDGPU::G_FRINT:
233   case AMDGPU::G_FNEARBYINT:
234   case AMDGPU::G_INTRINSIC_ROUND:
235   case AMDGPU::G_INTRINSIC_ROUNDEVEN:
236   case AMDGPU::G_FSIN:
237   case AMDGPU::G_FCANONICALIZE:
238   case AMDGPU::G_AMDGPU_RCP_IFLAG:
239     return true;
240   case AMDGPU::G_INTRINSIC:
241   case AMDGPU::G_INTRINSIC_CONVERGENT: {
242     Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MatchInfo)->getIntrinsicID();
243     switch (IntrinsicID) {
244     case Intrinsic::amdgcn_rcp:
245     case Intrinsic::amdgcn_rcp_legacy:
246     case Intrinsic::amdgcn_sin:
247     case Intrinsic::amdgcn_fmul_legacy:
248     case Intrinsic::amdgcn_fmed3:
249       return true;
250     case Intrinsic::amdgcn_fma_legacy:
251       return mayIgnoreSignedZero(*MatchInfo);
252     default:
253       return false;
254     }
255   }
256   default:
257     return false;
258   }
259 }
260 
261 void AMDGPUCombinerHelper::applyFoldableFneg(MachineInstr &MI,
262                                              MachineInstr *&MatchInfo) {
263   // Transform:
264   // %A = inst %Op1, ...
265   // %B = fneg %A
266   //
267   // into:
268   //
269   // (if %A has one use, specifically fneg above)
270   // %B = inst (maybe fneg %Op1), ...
271   //
272   // (if %A has multiple uses)
273   // %B = inst (maybe fneg %Op1), ...
274   // %A = fneg %B
275 
276   // Replace register in operand with a register holding negated value.
277   auto NegateOperand = [&](MachineOperand &Op) {
278     Register Reg = Op.getReg();
279     if (!mi_match(Reg, MRI, m_GFNeg(m_Reg(Reg))))
280       Reg = Builder.buildFNeg(MRI.getType(Reg), Reg).getReg(0);
281     replaceRegOpWith(MRI, Op, Reg);
282   };
283 
284   // Replace either register in operands with a register holding negated value.
285   auto NegateEitherOperand = [&](MachineOperand &X, MachineOperand &Y) {
286     Register XReg = X.getReg();
287     Register YReg = Y.getReg();
288     if (mi_match(XReg, MRI, m_GFNeg(m_Reg(XReg))))
289       replaceRegOpWith(MRI, X, XReg);
290     else if (mi_match(YReg, MRI, m_GFNeg(m_Reg(YReg))))
291       replaceRegOpWith(MRI, Y, YReg);
292     else {
293       YReg = Builder.buildFNeg(MRI.getType(YReg), YReg).getReg(0);
294       replaceRegOpWith(MRI, Y, YReg);
295     }
296   };
297 
298   Builder.setInstrAndDebugLoc(*MatchInfo);
299 
300   // Negate appropriate operands so that resulting value of MatchInfo is
301   // negated.
302   switch (MatchInfo->getOpcode()) {
303   case AMDGPU::G_FADD:
304   case AMDGPU::G_FSUB:
305     NegateOperand(MatchInfo->getOperand(1));
306     NegateOperand(MatchInfo->getOperand(2));
307     break;
308   case AMDGPU::G_FMUL:
309     NegateEitherOperand(MatchInfo->getOperand(1), MatchInfo->getOperand(2));
310     break;
311   case AMDGPU::G_FMINNUM:
312   case AMDGPU::G_FMAXNUM:
313   case AMDGPU::G_FMINNUM_IEEE:
314   case AMDGPU::G_FMAXNUM_IEEE:
315   case AMDGPU::G_FMINIMUM:
316   case AMDGPU::G_FMAXIMUM:
317   case AMDGPU::G_AMDGPU_FMIN_LEGACY:
318   case AMDGPU::G_AMDGPU_FMAX_LEGACY: {
319     NegateOperand(MatchInfo->getOperand(1));
320     NegateOperand(MatchInfo->getOperand(2));
321     unsigned Opposite = inverseMinMax(MatchInfo->getOpcode());
322     replaceOpcodeWith(*MatchInfo, Opposite);
323     break;
324   }
325   case AMDGPU::G_FMA:
326   case AMDGPU::G_FMAD:
327     NegateEitherOperand(MatchInfo->getOperand(1), MatchInfo->getOperand(2));
328     NegateOperand(MatchInfo->getOperand(3));
329     break;
330   case AMDGPU::G_FPEXT:
331   case AMDGPU::G_INTRINSIC_TRUNC:
332   case AMDGPU::G_FRINT:
333   case AMDGPU::G_FNEARBYINT:
334   case AMDGPU::G_INTRINSIC_ROUND:
335   case AMDGPU::G_INTRINSIC_ROUNDEVEN:
336   case AMDGPU::G_FSIN:
337   case AMDGPU::G_FCANONICALIZE:
338   case AMDGPU::G_AMDGPU_RCP_IFLAG:
339   case AMDGPU::G_FPTRUNC:
340     NegateOperand(MatchInfo->getOperand(1));
341     break;
342   case AMDGPU::G_INTRINSIC:
343   case AMDGPU::G_INTRINSIC_CONVERGENT: {
344     Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MatchInfo)->getIntrinsicID();
345     switch (IntrinsicID) {
346     case Intrinsic::amdgcn_rcp:
347     case Intrinsic::amdgcn_rcp_legacy:
348     case Intrinsic::amdgcn_sin:
349       NegateOperand(MatchInfo->getOperand(2));
350       break;
351     case Intrinsic::amdgcn_fmul_legacy:
352       NegateEitherOperand(MatchInfo->getOperand(2), MatchInfo->getOperand(3));
353       break;
354     case Intrinsic::amdgcn_fmed3:
355       NegateOperand(MatchInfo->getOperand(2));
356       NegateOperand(MatchInfo->getOperand(3));
357       NegateOperand(MatchInfo->getOperand(4));
358       break;
359     case Intrinsic::amdgcn_fma_legacy:
360       NegateEitherOperand(MatchInfo->getOperand(2), MatchInfo->getOperand(3));
361       NegateOperand(MatchInfo->getOperand(4));
362       break;
363     default:
364       llvm_unreachable("folding fneg not supported for this intrinsic");
365     }
366     break;
367   }
368   default:
369     llvm_unreachable("folding fneg not supported for this instruction");
370   }
371 
372   Register Dst = MI.getOperand(0).getReg();
373   Register MatchInfoDst = MatchInfo->getOperand(0).getReg();
374 
375   if (MRI.hasOneNonDBGUse(MatchInfoDst)) {
376     // MatchInfo now has negated value so use that instead of old Dst.
377     replaceRegWith(MRI, Dst, MatchInfoDst);
378   } else {
379     // We want to swap all uses of Dst with uses of MatchInfoDst and vice versa
380     // but replaceRegWith will replace defs as well. It is easier to replace one
381     // def with a new register.
382     LLT Type = MRI.getType(Dst);
383     Register NegatedMatchInfo = MRI.createGenericVirtualRegister(Type);
384     replaceRegOpWith(MRI, MatchInfo->getOperand(0), NegatedMatchInfo);
385 
386     // MatchInfo now has negated value so use that instead of old Dst.
387     replaceRegWith(MRI, Dst, NegatedMatchInfo);
388 
389     // Recreate non negated value for other uses of old MatchInfoDst
390     auto NextInst = ++MatchInfo->getIterator();
391     Builder.setInstrAndDebugLoc(*NextInst);
392     Builder.buildFNeg(MatchInfoDst, NegatedMatchInfo, MI.getFlags());
393   }
394 
395   MI.eraseFromParent();
396 }
397 
398 // TODO: Should return converted value / extension source and avoid introducing
399 // intermediate fptruncs in the apply function.
400 static bool isFPExtFromF16OrConst(const MachineRegisterInfo &MRI,
401                                   Register Reg) {
402   const MachineInstr *Def = MRI.getVRegDef(Reg);
403   if (Def->getOpcode() == TargetOpcode::G_FPEXT) {
404     Register SrcReg = Def->getOperand(1).getReg();
405     return MRI.getType(SrcReg) == LLT::scalar(16);
406   }
407 
408   if (Def->getOpcode() == TargetOpcode::G_FCONSTANT) {
409     APFloat Val = Def->getOperand(1).getFPImm()->getValueAPF();
410     bool LosesInfo = true;
411     Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
412     return !LosesInfo;
413   }
414 
415   return false;
416 }
417 
418 bool AMDGPUCombinerHelper::matchExpandPromotedF16FMed3(MachineInstr &MI,
419                                                        Register Src0,
420                                                        Register Src1,
421                                                        Register Src2) {
422   assert(MI.getOpcode() == TargetOpcode::G_FPTRUNC);
423   Register SrcReg = MI.getOperand(1).getReg();
424   if (!MRI.hasOneNonDBGUse(SrcReg) || MRI.getType(SrcReg) != LLT::scalar(32))
425     return false;
426 
427   return isFPExtFromF16OrConst(MRI, Src0) && isFPExtFromF16OrConst(MRI, Src1) &&
428          isFPExtFromF16OrConst(MRI, Src2);
429 }
430 
431 void AMDGPUCombinerHelper::applyExpandPromotedF16FMed3(MachineInstr &MI,
432                                                        Register Src0,
433                                                        Register Src1,
434                                                        Register Src2) {
435   // We expect fptrunc (fpext x) to fold out, and to constant fold any constant
436   // sources.
437   Src0 = Builder.buildFPTrunc(LLT::scalar(16), Src0).getReg(0);
438   Src1 = Builder.buildFPTrunc(LLT::scalar(16), Src1).getReg(0);
439   Src2 = Builder.buildFPTrunc(LLT::scalar(16), Src2).getReg(0);
440 
441   LLT Ty = MRI.getType(Src0);
442   auto A1 = Builder.buildFMinNumIEEE(Ty, Src0, Src1);
443   auto B1 = Builder.buildFMaxNumIEEE(Ty, Src0, Src1);
444   auto C1 = Builder.buildFMaxNumIEEE(Ty, A1, Src2);
445   Builder.buildFMinNumIEEE(MI.getOperand(0), B1, C1);
446   MI.eraseFromParent();
447 }
448