//=== lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "AMDGPUCombinerHelper.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;
using namespace MIPatternMatch;

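// Returns true if an fneg of the value defined by \p MI can be folded into
// \p MI itself, by negating its sources or inverting a min/max opcode (see
// matchFoldableFneg below).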
LLVM_READNONE
static bool fnegFoldsIntoMI(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
  case AMDGPU::G_FMUL:
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_FSIN:
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_FPTRUNC:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_FNEARBYINT:
  case AMDGPU::G_INTRINSIC_ROUND:
  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
  case AMDGPU::G_FCANONICALIZE:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
  case AMDGPU::G_AMDGPU_FMAX_LEGACY:
    return true;
  case AMDGPU::G_INTRINSIC: {
    unsigned IntrinsicID = MI.getIntrinsicID();
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_sin:
    case Intrinsic::amdgcn_fmul_legacy:
    case Intrinsic::amdgcn_fmed3:
    case Intrinsic::amdgcn_fma_legacy:
      return true;
    default:
      return false;
    }
  }
  default:
    return false;
  }
}

/// \returns true if the operation will definitely need to use a 64-bit
/// encoding, and thus will use a VOP3 encoding regardless of the source
/// modifiers.
LLVM_READONLY
static bool opMustUseVOP3Encoding(const MachineInstr &MI,
                                  const MachineRegisterInfo &MRI) {
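  // A G_INTRINSIC carries an extra intrinsic-ID operand, hence the larger
  // operand-count threshold: an operation with more than two FP sources (such
  // as FMA) or a 64-bit result cannot use the compact VOP1/VOP2 encodings.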
  return MI.getNumOperands() >
             (MI.getOpcode() == AMDGPU::G_INTRINSIC ? 4u : 3u) ||
         MRI.getType(MI.getOperand(0).getReg()).getScalarSizeInBits() == 64;
}

// Most FP instructions support source modifiers.
LLVM_READONLY
static bool hasSourceMods(const MachineInstr &MI) {
  if (!MI.memoperands().empty())
    return false;

  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::G_SELECT:
  case AMDGPU::G_FDIV:
  case AMDGPU::G_FREM:
  case TargetOpcode::INLINEASM:
  case TargetOpcode::INLINEASM_BR:
  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
  case AMDGPU::G_BITCAST:
  case AMDGPU::G_ANYEXT:
  case AMDGPU::G_BUILD_VECTOR:
  case AMDGPU::G_BUILD_VECTOR_TRUNC:
  case AMDGPU::G_PHI:
    return false;
  case AMDGPU::G_INTRINSIC: {
    unsigned IntrinsicID = MI.getIntrinsicID();
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_interp_p1:
    case Intrinsic::amdgcn_interp_p2:
    case Intrinsic::amdgcn_interp_mov:
    case Intrinsic::amdgcn_interp_p1_f16:
    case Intrinsic::amdgcn_interp_p2_f16:
    case Intrinsic::amdgcn_div_scale:
      return false;
    default:
      return true;
    }
  }
  default:
    return true;
  }
}

static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI,
                                  unsigned CostThreshold = 4) {
  // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
  // it is truly free to use a source modifier in all cases. If there are
  // multiple users, but each one would need to switch to the VOP3 encoding to
  // take a source modifier, there will be a code size increase. Try to avoid
  // increasing code size unless we know it will save on the instruction count.
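  // For example, a V_ADD_F32 absorbs a negate for free once it is already in
  // the 64-bit VOP3 form, but promoting a use from the 32-bit VOP2 encoding to
  // VOP3 just to gain source modifiers doubles that instruction's size.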
  unsigned NumMayIncreaseSize = 0;
  Register Dst = MI.getOperand(0).getReg();
  for (const MachineInstr &Use : MRI.use_nodbg_instructions(Dst)) {
    if (!hasSourceMods(Use))
      return false;

    if (!opMustUseVOP3Encoding(Use, MRI)) {
      if (++NumMayIncreaseSize > CostThreshold)
        return false;
    }
  }
  return true;
}

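// Signed-zero semantics may be relaxed either globally via
// TargetOptions::NoSignedZerosFPMath or per instruction via the 'nsz'
// fast-math flag.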
static bool mayIgnoreSignedZero(MachineInstr &MI) {
  const TargetOptions &Options = MI.getMF()->getTarget().Options;
  return Options.NoSignedZerosFPMath || MI.getFlag(MachineInstr::MIFlag::FmNsz);
}

static bool isInv2Pi(const APFloat &APF) {
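  // Bit patterns of 1.0 / (2.0 * pi) in half, single, and double precision.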
  static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
  static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
  static const APFloat KF64(APFloat::IEEEdouble(),
                            APInt(64, 0x3fc45f306dc9c882));

  return APF.bitwiseIsEqual(KF16) || APF.bitwiseIsEqual(KF32) ||
         APF.bitwiseIsEqual(KF64);
}

// 0 and 1.0 / (2.0 * pi) do not have negated inline immediates, so there is an
// additional cost to negate them.
static bool isConstantCostlierToNegate(MachineInstr &MI, Register Reg,
                                       MachineRegisterInfo &MRI) {
  std::optional<FPValueAndVReg> FPValReg;
  if (mi_match(Reg, MRI, m_GFCstOrSplat(FPValReg))) {
    if (FPValReg->Value.isZero() && !FPValReg->Value.isNegative())
      return true;

    const GCNSubtarget &ST = MI.getMF()->getSubtarget<GCNSubtarget>();
    if (ST.hasInv2PiInlineImm() && isInv2Pi(FPValReg->Value))
      return true;
  }
  return false;
}

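// -min(x, y) = max(-x, -y) and vice versa, so pushing an fneg through a
// min/max requires swapping the opcode as well as negating the operands.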
static unsigned inverseMinMax(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::G_FMAXNUM:
    return AMDGPU::G_FMINNUM;
  case AMDGPU::G_FMINNUM:
    return AMDGPU::G_FMAXNUM;
  case AMDGPU::G_FMAXNUM_IEEE:
    return AMDGPU::G_FMINNUM_IEEE;
  case AMDGPU::G_FMINNUM_IEEE:
    return AMDGPU::G_FMAXNUM_IEEE;
  case AMDGPU::G_AMDGPU_FMAX_LEGACY:
    return AMDGPU::G_AMDGPU_FMIN_LEGACY;
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
    return AMDGPU::G_AMDGPU_FMAX_LEGACY;
  default:
    llvm_unreachable("invalid min/max opcode");
  }
}

bool AMDGPUCombinerHelper::matchFoldableFneg(MachineInstr &MI,
                                             MachineInstr *&MatchInfo) {
  Register Src = MI.getOperand(1).getReg();
  MatchInfo = MRI.getVRegDef(Src);

  // If the input has multiple uses and we can either fold the negate down, or
  // the other uses cannot, give up. This both prevents unprofitable
  // transformations and infinite loops: we won't repeatedly try to fold around
  // a negate that has no 'good' form.
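  // When Src has a single use (this fneg), give up if the fneg is already free
  // where it is, i.e. every user of the fneg result takes source modifiers at
  // no size cost (threshold 0).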
  if (MRI.hasOneNonDBGUse(Src)) {
    if (allUsesHaveSourceMods(MI, MRI, 0))
      return false;
  } else {
    if (fnegFoldsIntoMI(*MatchInfo) &&
        (allUsesHaveSourceMods(MI, MRI) ||
         !allUsesHaveSourceMods(*MatchInfo, MRI)))
      return false;
  }

  switch (MatchInfo->getOpcode()) {
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
  case AMDGPU::G_AMDGPU_FMAX_LEGACY:
    // 0 doesn't have a negated inline immediate.
    return !isConstantCostlierToNegate(*MatchInfo,
                                       MatchInfo->getOperand(2).getReg(), MRI);
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
    return mayIgnoreSignedZero(*MatchInfo);
  case AMDGPU::G_FMUL:
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_FPTRUNC:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_FNEARBYINT:
  case AMDGPU::G_INTRINSIC_ROUND:
  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
  case AMDGPU::G_FSIN:
  case AMDGPU::G_FCANONICALIZE:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
    return true;
  case AMDGPU::G_INTRINSIC: {
    unsigned IntrinsicID = MatchInfo->getIntrinsicID();
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_sin:
    case Intrinsic::amdgcn_fmul_legacy:
    case Intrinsic::amdgcn_fmed3:
      return true;
    case Intrinsic::amdgcn_fma_legacy:
      return mayIgnoreSignedZero(*MatchInfo);
    default:
      return false;
    }
  }
  default:
    return false;
  }
}

void AMDGPUCombinerHelper::applyFoldableFneg(MachineInstr &MI,
                                             MachineInstr *&MatchInfo) {
  // Transform:
  // %A = inst %Op1, ...
  // %B = fneg %A
  //
  // into:
  //
  // (if %A has one use, specifically fneg above)
  // %B = inst (maybe fneg %Op1), ...
  //
  // (if %A has multiple uses)
  // %B = inst (maybe fneg %Op1), ...
  // %A = fneg %B

  // Replace register in operand with a register holding negated value.
  auto NegateOperand = [&](MachineOperand &Op) {
    Register Reg = Op.getReg();
    if (!mi_match(Reg, MRI, m_GFNeg(m_Reg(Reg))))
      Reg = Builder.buildFNeg(MRI.getType(Reg), Reg).getReg(0);
    replaceRegOpWith(MRI, Op, Reg);
  };

  // Replace either register in operands with a register holding negated value.
  auto NegateEitherOperand = [&](MachineOperand &X, MachineOperand &Y) {
    Register XReg = X.getReg();
    Register YReg = Y.getReg();
    if (mi_match(XReg, MRI, m_GFNeg(m_Reg(XReg))))
      replaceRegOpWith(MRI, X, XReg);
    else if (mi_match(YReg, MRI, m_GFNeg(m_Reg(YReg))))
      replaceRegOpWith(MRI, Y, YReg);
    else {
      YReg = Builder.buildFNeg(MRI.getType(YReg), YReg).getReg(0);
      replaceRegOpWith(MRI, Y, YReg);
    }
  };

  Builder.setInstrAndDebugLoc(*MatchInfo);

  // Negate appropriate operands so that resulting value of MatchInfo is
  // negated.
  switch (MatchInfo->getOpcode()) {
  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
    NegateOperand(MatchInfo->getOperand(1));
    NegateOperand(MatchInfo->getOperand(2));
    break;
  case AMDGPU::G_FMUL:
    NegateEitherOperand(MatchInfo->getOperand(1), MatchInfo->getOperand(2));
    break;
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
  case AMDGPU::G_AMDGPU_FMAX_LEGACY: {
    NegateOperand(MatchInfo->getOperand(1));
    NegateOperand(MatchInfo->getOperand(2));
    unsigned Opposite = inverseMinMax(MatchInfo->getOpcode());
    replaceOpcodeWith(*MatchInfo, Opposite);
    break;
  }
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
    NegateEitherOperand(MatchInfo->getOperand(1), MatchInfo->getOperand(2));
    NegateOperand(MatchInfo->getOperand(3));
    break;
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_FNEARBYINT:
  case AMDGPU::G_INTRINSIC_ROUND:
  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
  case AMDGPU::G_FSIN:
  case AMDGPU::G_FCANONICALIZE:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
  case AMDGPU::G_FPTRUNC:
    NegateOperand(MatchInfo->getOperand(1));
    break;
  case AMDGPU::G_INTRINSIC: {
    unsigned IntrinsicID = MatchInfo->getIntrinsicID();
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_sin:
      NegateOperand(MatchInfo->getOperand(2));
      break;
    case Intrinsic::amdgcn_fmul_legacy:
      NegateEitherOperand(MatchInfo->getOperand(2), MatchInfo->getOperand(3));
      break;
    case Intrinsic::amdgcn_fmed3:
      NegateOperand(MatchInfo->getOperand(2));
      NegateOperand(MatchInfo->getOperand(3));
      NegateOperand(MatchInfo->getOperand(4));
      break;
    case Intrinsic::amdgcn_fma_legacy:
      NegateEitherOperand(MatchInfo->getOperand(2), MatchInfo->getOperand(3));
      NegateOperand(MatchInfo->getOperand(4));
      break;
    default:
      llvm_unreachable("folding fneg not supported for this intrinsic");
    }
    break;
  }
  default:
    llvm_unreachable("folding fneg not supported for this instruction");
  }

  Register Dst = MI.getOperand(0).getReg();
  Register MatchInfoDst = MatchInfo->getOperand(0).getReg();

  if (MRI.hasOneNonDBGUse(MatchInfoDst)) {
    // MatchInfo now has negated value so use that instead of old Dst.
    replaceRegWith(MRI, Dst, MatchInfoDst);
  } else {
    // We want to swap all uses of Dst with uses of MatchInfoDst and vice versa
    // but replaceRegWith will replace defs as well. It is easier to replace one
    // def with a new register.
    LLT Type = MRI.getType(Dst);
    Register NegatedMatchInfo = MRI.createGenericVirtualRegister(Type);
    replaceRegOpWith(MRI, MatchInfo->getOperand(0), NegatedMatchInfo);

    // MatchInfo now has negated value so use that instead of old Dst.
    replaceRegWith(MRI, Dst, NegatedMatchInfo);

    // Recreate the non-negated value for other uses of old MatchInfoDst.
    auto NextInst = ++MatchInfo->getIterator();
    Builder.setInstrAndDebugLoc(*NextInst);
    Builder.buildFNeg(MatchInfoDst, NegatedMatchInfo, MI.getFlags());
  }

  MI.eraseFromParent();
}

// TODO: Should return converted value / extension source and avoid introducing
// intermediate fptruncs in the apply function.
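// Returns true if \p Reg is either the result of a G_FPEXT from f16 or a
// constant that is exactly representable in f16.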
static bool isFPExtFromF16OrConst(const MachineRegisterInfo &MRI,
                                  Register Reg) {
  const MachineInstr *Def = MRI.getVRegDef(Reg);
  if (Def->getOpcode() == TargetOpcode::G_FPEXT) {
    Register SrcReg = Def->getOperand(1).getReg();
    return MRI.getType(SrcReg) == LLT::scalar(16);
  }

  if (Def->getOpcode() == TargetOpcode::G_FCONSTANT) {
    APFloat Val = Def->getOperand(1).getFPImm()->getValueAPF();
    bool LosesInfo = true;
    Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
    return !LosesInfo;
  }

  return false;
}

bool AMDGPUCombinerHelper::matchExpandPromotedF16FMed3(MachineInstr &MI,
                                                       Register Src0,
                                                       Register Src1,
                                                       Register Src2) {
  assert(MI.getOpcode() == TargetOpcode::G_FPTRUNC);
  Register SrcReg = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(SrcReg) || MRI.getType(SrcReg) != LLT::scalar(32))
    return false;

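  // Only fold when every source of the f32 fmed3 is known to originate from an
  // f16 value (or is an f16-exact constant), so the truncated result can be
  // computed directly in f16.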
  return isFPExtFromF16OrConst(MRI, Src0) && isFPExtFromF16OrConst(MRI, Src1) &&
         isFPExtFromF16OrConst(MRI, Src2);
}

void AMDGPUCombinerHelper::applyExpandPromotedF16FMed3(MachineInstr &MI,
                                                       Register Src0,
                                                       Register Src1,
                                                       Register Src2) {
  Builder.setInstrAndDebugLoc(MI);

  // We expect fptrunc (fpext x) to fold out, and to constant fold any constant
  // sources.
  Src0 = Builder.buildFPTrunc(LLT::scalar(16), Src0).getReg(0);
  Src1 = Builder.buildFPTrunc(LLT::scalar(16), Src1).getReg(0);
  Src2 = Builder.buildFPTrunc(LLT::scalar(16), Src2).getReg(0);

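  // Expand the median of three values as:
  //   med3(x, y, z) = min(max(x, y), max(min(x, y), z))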
  LLT Ty = MRI.getType(Src0);
  auto A1 = Builder.buildFMinNumIEEE(Ty, Src0, Src1);
  auto B1 = Builder.buildFMaxNumIEEE(Ty, Src0, Src1);
  auto C1 = Builder.buildFMaxNumIEEE(Ty, A1, Src2);
  Builder.buildFMinNumIEEE(MI.getOperand(0), B1, C1);
  MI.eraseFromParent();
}
436