//===-- SIFoldOperands.cpp - Fold operands --- ----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
/// \file
//===----------------------------------------------------------------------===//
//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineOperand.h"

#define DEBUG_TYPE "si-fold-operands"
using namespace llvm;

namespace {

struct FoldCandidate {
  MachineInstr *UseMI;
  union {
    MachineOperand *OpToFold;
    uint64_t ImmToFold;
    int FrameIndexToFold;
  };
  int ShrinkOpcode;
  unsigned UseOpNo;
  MachineOperand::MachineOperandType Kind;
  bool Commuted;

  FoldCandidate(MachineInstr *MI, unsigned OpNo, MachineOperand *FoldOp,
                bool Commuted_ = false,
                int ShrinkOp = -1) :
    UseMI(MI), OpToFold(nullptr), ShrinkOpcode(ShrinkOp), UseOpNo(OpNo),
    Kind(FoldOp->getType()),
    Commuted(Commuted_) {
    if (FoldOp->isImm()) {
      ImmToFold = FoldOp->getImm();
    } else if (FoldOp->isFI()) {
      FrameIndexToFold = FoldOp->getIndex();
    } else {
      assert(FoldOp->isReg() || FoldOp->isGlobal());
      OpToFold = FoldOp;
    }
  }

  bool isFI() const {
    return Kind == MachineOperand::MO_FrameIndex;
  }

  bool isImm() const {
    return Kind == MachineOperand::MO_Immediate;
  }

  bool isReg() const {
    return Kind == MachineOperand::MO_Register;
  }

  bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; }

  bool needsShrink() const { return ShrinkOpcode != -1; }
};

class SIFoldOperands : public MachineFunctionPass {
public:
  static char ID;
  MachineRegisterInfo *MRI;
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  const GCNSubtarget *ST;
  const SIMachineFunctionInfo *MFI;

  bool frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
                         const MachineOperand &OpToFold) const;

  bool updateOperand(FoldCandidate &Fold) const;

  bool canUseImmWithOpSel(FoldCandidate &Fold) const;

  bool tryFoldImmWithOpSel(FoldCandidate &Fold) const;

  bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
                        MachineInstr *MI, unsigned OpNo,
                        MachineOperand *OpToFold) const;
  bool isUseSafeToFold(const MachineInstr &MI,
                       const MachineOperand &UseMO) const;
  bool
  getRegSeqInit(SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
                Register UseReg, uint8_t OpTy) const;
  bool tryToFoldACImm(const MachineOperand &OpToFold, MachineInstr *UseMI,
                      unsigned UseOpIdx,
                      SmallVectorImpl<FoldCandidate> &FoldList) const;
  void foldOperand(MachineOperand &OpToFold,
                   MachineInstr *UseMI,
                   int UseOpIdx,
                   SmallVectorImpl<FoldCandidate> &FoldList,
                   SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;

  MachineOperand *getImmOrMaterializedImm(MachineOperand &Op) const;
  bool tryConstantFoldOp(MachineInstr *MI) const;
  bool tryFoldCndMask(MachineInstr &MI) const;
  bool tryFoldZeroHighBits(MachineInstr &MI) const;
  bool foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
  bool tryFoldFoldableCopy(MachineInstr &MI,
                           MachineOperand *&CurrentKnownM0Val) const;

  const MachineOperand *isClamp(const MachineInstr &MI) const;
  bool tryFoldClamp(MachineInstr &MI);

  std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
  bool tryFoldOMod(MachineInstr &MI);
  bool tryFoldRegSequence(MachineInstr &MI);
  bool tryFoldPhiAGPR(MachineInstr &MI);
  bool tryFoldLoad(MachineInstr &MI);

  bool tryOptimizeAGPRPhis(MachineBasicBlock &MBB);

public:
  SIFoldOperands() : MachineFunctionPass(ID) {
    initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Fold Operands"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE,
                "SI Fold Operands", false, false)

char SIFoldOperands::ID = 0;

char &llvm::SIFoldOperandsID = SIFoldOperands::ID;

static const TargetRegisterClass *getRegOpRC(const MachineRegisterInfo &MRI,
                                             const TargetRegisterInfo &TRI,
                                             const MachineOperand &MO) {
  const TargetRegisterClass *RC = MRI.getRegClass(MO.getReg());
  if (const TargetRegisterClass *SubRC =
          TRI.getSubRegisterClass(RC, MO.getSubReg()))
    RC = SubRC;
  return RC;
}

// Map multiply-accumulate opcode to corresponding multiply-add opcode if any.
static unsigned macToMad(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::V_MAC_F32_e64:
    return AMDGPU::V_MAD_F32_e64;
  case AMDGPU::V_MAC_F16_e64:
    return AMDGPU::V_MAD_F16_e64;
  case AMDGPU::V_FMAC_F32_e64:
    return AMDGPU::V_FMA_F32_e64;
  case AMDGPU::V_FMAC_F16_e64:
    return AMDGPU::V_FMA_F16_gfx9_e64;
  case AMDGPU::V_FMAC_F16_t16_e64:
    return AMDGPU::V_FMA_F16_gfx9_e64;
  case AMDGPU::V_FMAC_LEGACY_F32_e64:
    return AMDGPU::V_FMA_LEGACY_F32_e64;
  case AMDGPU::V_FMAC_F64_e64:
    return AMDGPU::V_FMA_F64_e64;
  }
  return AMDGPU::INSTRUCTION_LIST_END;
}
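
// Illustrative note: the MAC/FMAC forms keep src2 tied to the destination,
// which blocks folding an immediate into src2, so tryAddToFoldList() below
// uses this mapping to retry the fold on the untied MAD/FMA form of the
// instruction.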

// TODO: Add heuristic that the frame index might not fit in the addressing mode
// immediate offset to avoid materializing in loops.
bool SIFoldOperands::frameIndexMayFold(const MachineInstr &UseMI, int OpNo,
                                       const MachineOperand &OpToFold) const {
  if (!OpToFold.isFI())
    return false;

  const unsigned Opc = UseMI.getOpcode();
  if (TII->isMUBUF(UseMI))
    return OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
  if (!TII->isFLATScratch(UseMI))
    return false;

  int SIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
  if (OpNo == SIdx)
    return true;

  int VIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
  return OpNo == VIdx && SIdx == -1;
}

FunctionPass *llvm::createSIFoldOperandsPass() {
  return new SIFoldOperands();
}

bool SIFoldOperands::canUseImmWithOpSel(FoldCandidate &Fold) const {
  MachineInstr *MI = Fold.UseMI;
  MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
  const uint64_t TSFlags = MI->getDesc().TSFlags;

  assert(Old.isReg() && Fold.isImm());

  if (!(TSFlags & SIInstrFlags::IsPacked) || (TSFlags & SIInstrFlags::IsMAI) ||
      (TSFlags & SIInstrFlags::IsWMMA) || (TSFlags & SIInstrFlags::IsSWMMAC) ||
      (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT)))
    return false;

  unsigned Opcode = MI->getOpcode();
  int OpNo = MI->getOperandNo(&Old);
  uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;
  switch (OpType) {
  default:
    return false;
  case AMDGPU::OPERAND_REG_IMM_V2FP16:
  case AMDGPU::OPERAND_REG_IMM_V2BF16:
  case AMDGPU::OPERAND_REG_IMM_V2INT16:
  case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
  case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
  case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
    break;
  }

  return true;
}

bool SIFoldOperands::tryFoldImmWithOpSel(FoldCandidate &Fold) const {
  MachineInstr *MI = Fold.UseMI;
  MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
  unsigned Opcode = MI->getOpcode();
  int OpNo = MI->getOperandNo(&Old);
  uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;

  // If the literal can be inlined as-is, apply it and short-circuit the
  // tests below. The main motivation for this is to avoid unintuitive
  // uses of opsel.
  if (AMDGPU::isInlinableLiteralV216(Fold.ImmToFold, OpType)) {
    Old.ChangeToImmediate(Fold.ImmToFold);
    return true;
  }

  // Refer to op_sel/op_sel_hi and check if we can change the immediate and
  // op_sel in a way that allows an inline constant.
  int ModIdx = -1;
  unsigned SrcIdx = ~0;
  if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0)) {
    ModIdx = AMDGPU::OpName::src0_modifiers;
    SrcIdx = 0;
  } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1)) {
    ModIdx = AMDGPU::OpName::src1_modifiers;
    SrcIdx = 1;
  } else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2)) {
    ModIdx = AMDGPU::OpName::src2_modifiers;
    SrcIdx = 2;
  }
  assert(ModIdx != -1);
  ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx);
  MachineOperand &Mod = MI->getOperand(ModIdx);
  unsigned ModVal = Mod.getImm();

  uint16_t ImmLo = static_cast<uint16_t>(
      Fold.ImmToFold >> (ModVal & SISrcMods::OP_SEL_0 ? 16 : 0));
  uint16_t ImmHi = static_cast<uint16_t>(
      Fold.ImmToFold >> (ModVal & SISrcMods::OP_SEL_1 ? 16 : 0));
  uint32_t Imm = (static_cast<uint32_t>(ImmHi) << 16) | ImmLo;
  unsigned NewModVal = ModVal & ~(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1);

  // Helper function that attempts to inline the given value with a newly
  // chosen opsel pattern.
  auto tryFoldToInline = [&](uint32_t Imm) -> bool {
    if (AMDGPU::isInlinableLiteralV216(Imm, OpType)) {
      Mod.setImm(NewModVal | SISrcMods::OP_SEL_1);
      Old.ChangeToImmediate(Imm);
      return true;
    }

    // Try to shuffle the halves around and leverage opsel to get an inline
    // constant.
    uint16_t Lo = static_cast<uint16_t>(Imm);
    uint16_t Hi = static_cast<uint16_t>(Imm >> 16);
    if (Lo == Hi) {
      if (AMDGPU::isInlinableLiteralV216(Lo, OpType)) {
        Mod.setImm(NewModVal);
        Old.ChangeToImmediate(Lo);
        return true;
      }

      if (static_cast<int16_t>(Lo) < 0) {
        int32_t SExt = static_cast<int16_t>(Lo);
        if (AMDGPU::isInlinableLiteralV216(SExt, OpType)) {
          Mod.setImm(NewModVal);
          Old.ChangeToImmediate(SExt);
          return true;
        }
      }

      // This check is only useful for integer instructions
      if (OpType == AMDGPU::OPERAND_REG_IMM_V2INT16 ||
          OpType == AMDGPU::OPERAND_REG_INLINE_AC_V2INT16) {
        if (AMDGPU::isInlinableLiteralV216(Lo << 16, OpType)) {
          Mod.setImm(NewModVal | SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1);
          Old.ChangeToImmediate(static_cast<uint32_t>(Lo) << 16);
          return true;
        }
      }
    } else {
      uint32_t Swapped = (static_cast<uint32_t>(Lo) << 16) | Hi;
      if (AMDGPU::isInlinableLiteralV216(Swapped, OpType)) {
        Mod.setImm(NewModVal | SISrcMods::OP_SEL_0);
        Old.ChangeToImmediate(Swapped);
        return true;
      }
    }

    return false;
  };

  if (tryFoldToInline(Imm))
    return true;

  // Replace integer addition by subtraction and vice versa if it allows
  // folding the immediate to an inline constant.
  //
  // We should only ever get here for SrcIdx == 1 due to canonicalization
  // earlier in the pipeline, but we double-check here to be safe / fully
  // general.
  bool IsUAdd = Opcode == AMDGPU::V_PK_ADD_U16;
  bool IsUSub = Opcode == AMDGPU::V_PK_SUB_U16;
  if (SrcIdx == 1 && (IsUAdd || IsUSub)) {
    unsigned ClampIdx =
        AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::clamp);
    bool Clamp = MI->getOperand(ClampIdx).getImm() != 0;

    if (!Clamp) {
      uint16_t NegLo = -static_cast<uint16_t>(Imm);
      uint16_t NegHi = -static_cast<uint16_t>(Imm >> 16);
      uint32_t NegImm = (static_cast<uint32_t>(NegHi) << 16) | NegLo;

      if (tryFoldToInline(NegImm)) {
        unsigned NegOpcode =
            IsUAdd ? AMDGPU::V_PK_SUB_U16 : AMDGPU::V_PK_ADD_U16;
        MI->setDesc(TII->get(NegOpcode));
        return true;
      }
    }
  }

  return false;
}
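
// Illustrative note on the fold above: when the full 32-bit literal is not
// inlinable but both 16-bit halves are equal and inlinable, the fold clears
// op_sel/op_sel_hi so both halves read the low half and uses the 16-bit value.
// For V_PK_ADD_U16/V_PK_SUB_U16 without clamp, the opcode may also be swapped
// so that the negated immediate becomes an inline constant.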

bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const {
  MachineInstr *MI = Fold.UseMI;
  MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
  assert(Old.isReg());

  if (Fold.isImm() && canUseImmWithOpSel(Fold)) {
    if (tryFoldImmWithOpSel(Fold))
      return true;

    // We can't represent the candidate as an inline constant. Try as a literal
    // with the original opsel, checking constant bus limitations.
    MachineOperand New = MachineOperand::CreateImm(Fold.ImmToFold);
    int OpNo = MI->getOperandNo(&Old);
    if (!TII->isOperandLegal(*MI, OpNo, &New))
      return false;
    Old.ChangeToImmediate(Fold.ImmToFold);
    return true;
  }

  if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
    MachineBasicBlock *MBB = MI->getParent();
    auto Liveness = MBB->computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 16);
    if (Liveness != MachineBasicBlock::LQR_Dead) {
      LLVM_DEBUG(dbgs() << "Not shrinking " << MI << " due to vcc liveness\n");
      return false;
    }

    int Op32 = Fold.ShrinkOpcode;
    MachineOperand &Dst0 = MI->getOperand(0);
    MachineOperand &Dst1 = MI->getOperand(1);
    assert(Dst0.isDef() && Dst1.isDef());

    bool HaveNonDbgCarryUse = !MRI->use_nodbg_empty(Dst1.getReg());

    const TargetRegisterClass *Dst0RC = MRI->getRegClass(Dst0.getReg());
    Register NewReg0 = MRI->createVirtualRegister(Dst0RC);

    MachineInstr *Inst32 = TII->buildShrunkInst(*MI, Op32);

    if (HaveNonDbgCarryUse) {
      BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::COPY),
              Dst1.getReg())
        .addReg(AMDGPU::VCC, RegState::Kill);
    }

    // Keep the old instruction around to avoid breaking iterators, but
    // replace it with a dummy instruction to remove uses.
    //
    // FIXME: We should not invert how this pass looks at operands to avoid
    // this. Should track set of foldable movs instead of looking for uses
    // when looking at a use.
    Dst0.setReg(NewReg0);
    for (unsigned I = MI->getNumOperands() - 1; I > 0; --I)
      MI->removeOperand(I);
    MI->setDesc(TII->get(AMDGPU::IMPLICIT_DEF));

    if (Fold.Commuted)
      TII->commuteInstruction(*Inst32, false);
    return true;
  }

  assert(!Fold.needsShrink() && "not handled");

  if (Fold.isImm()) {
    if (Old.isTied()) {
      int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(MI->getOpcode());
      if (NewMFMAOpc == -1)
        return false;
      MI->setDesc(TII->get(NewMFMAOpc));
      MI->untieRegOperand(0);
    }
    Old.ChangeToImmediate(Fold.ImmToFold);
    return true;
  }

  if (Fold.isGlobal()) {
    Old.ChangeToGA(Fold.OpToFold->getGlobal(), Fold.OpToFold->getOffset(),
                   Fold.OpToFold->getTargetFlags());
    return true;
  }

  if (Fold.isFI()) {
    Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
    return true;
  }

  MachineOperand *New = Fold.OpToFold;
  Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI);
  Old.setIsUndef(New->isUndef());
  return true;
}
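
// Note on the shrink path above: the 32-bit encodings of V_ADD_CO_U32 and
// friends write the carry to VCC implicitly, so the shrink is only performed
// when VCC is dead at the instruction, and a COPY from VCC is emitted if the
// original carry output still has non-debug uses.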

static bool isUseMIInFoldList(ArrayRef<FoldCandidate> FoldList,
                              const MachineInstr *MI) {
  return any_of(FoldList, [&](const auto &C) { return C.UseMI == MI; });
}

static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
                                MachineInstr *MI, unsigned OpNo,
                                MachineOperand *FoldOp, bool Commuted = false,
                                int ShrinkOp = -1) {
  // Skip additional folding on the same operand.
  for (FoldCandidate &Fold : FoldList)
    if (Fold.UseMI == MI && Fold.UseOpNo == OpNo)
      return;
  LLVM_DEBUG(dbgs() << "Append " << (Commuted ? "commuted" : "normal")
                    << " operand " << OpNo << "\n  " << *MI);
  FoldList.emplace_back(MI, OpNo, FoldOp, Commuted, ShrinkOp);
}

bool SIFoldOperands::tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
                                      MachineInstr *MI, unsigned OpNo,
                                      MachineOperand *OpToFold) const {
  const unsigned Opc = MI->getOpcode();

  auto tryToFoldAsFMAAKorMK = [&]() {
    if (!OpToFold->isImm())
      return false;

    const bool TryAK = OpNo == 3;
    const unsigned NewOpc = TryAK ? AMDGPU::S_FMAAK_F32 : AMDGPU::S_FMAMK_F32;
    MI->setDesc(TII->get(NewOpc));

    // We have to fold into the operand that will hold the immediate, not into OpNo.
    bool FoldAsFMAAKorMK =
        tryAddToFoldList(FoldList, MI, TryAK ? 3 : 2, OpToFold);
    if (FoldAsFMAAKorMK) {
      // Untie Src2 of fmac.
      MI->untieRegOperand(3);
      // For fmamk swap operands 1 and 2 if OpToFold was meant for operand 1.
      if (OpNo == 1) {
        MachineOperand &Op1 = MI->getOperand(1);
        MachineOperand &Op2 = MI->getOperand(2);
        Register OldReg = Op1.getReg();
        // Operand 2 might be an inlinable constant
        if (Op2.isImm()) {
          Op1.ChangeToImmediate(Op2.getImm());
          Op2.ChangeToRegister(OldReg, false);
        } else {
          Op1.setReg(Op2.getReg());
          Op2.setReg(OldReg);
        }
      }
      return true;
    }
    MI->setDesc(TII->get(Opc));
    return false;
  };

  bool IsLegal = TII->isOperandLegal(*MI, OpNo, OpToFold);
  if (!IsLegal && OpToFold->isImm()) {
    FoldCandidate Fold(MI, OpNo, OpToFold);
    IsLegal = canUseImmWithOpSel(Fold);
  }

  if (!IsLegal) {
    // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
    unsigned NewOpc = macToMad(Opc);
    if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) {
      // Check if changing this to a v_mad_{f16, f32} instruction will allow us
      // to fold the operand.
      MI->setDesc(TII->get(NewOpc));
      bool AddOpSel = !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel) &&
                      AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel);
      if (AddOpSel)
        MI->addOperand(MachineOperand::CreateImm(0));
      bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold);
      if (FoldAsMAD) {
        MI->untieRegOperand(OpNo);
        return true;
      }
      if (AddOpSel)
        MI->removeOperand(MI->getNumExplicitOperands() - 1);
      MI->setDesc(TII->get(Opc));
    }

    // Special case for s_fmac_f32 if we are trying to fold into Src2.
    // By transforming into fmaak we can untie Src2 and make folding legal.
    if (Opc == AMDGPU::S_FMAC_F32 && OpNo == 3) {
      if (tryToFoldAsFMAAKorMK())
        return true;
    }

    // Special case for s_setreg_b32
    if (OpToFold->isImm()) {
      unsigned ImmOpc = 0;
      if (Opc == AMDGPU::S_SETREG_B32)
        ImmOpc = AMDGPU::S_SETREG_IMM32_B32;
      else if (Opc == AMDGPU::S_SETREG_B32_mode)
        ImmOpc = AMDGPU::S_SETREG_IMM32_B32_mode;
      if (ImmOpc) {
        MI->setDesc(TII->get(ImmOpc));
        appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
        return true;
      }
    }

    // If we are already folding into another operand of MI, then
    // we can't commute the instruction, otherwise we risk making the
    // other fold illegal.
    if (isUseMIInFoldList(FoldList, MI))
      return false;

    // Operand is not legal, so try to commute the instruction to
    // see if this makes it possible to fold.
    unsigned CommuteOpNo = TargetInstrInfo::CommuteAnyOperandIndex;
    bool CanCommute = TII->findCommutedOpIndices(*MI, OpNo, CommuteOpNo);
    if (!CanCommute)
      return false;

    // One of the operands might be an Imm operand, and OpNo may refer to it after
    // the call of commuteInstruction() below. Such situations are avoided
    // here explicitly as OpNo must be a register operand to be a candidate
    // for memory folding.
    if (!MI->getOperand(OpNo).isReg() || !MI->getOperand(CommuteOpNo).isReg())
      return false;

    if (!TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo))
      return false;

    int Op32 = -1;
    if (!TII->isOperandLegal(*MI, CommuteOpNo, OpToFold)) {
      if ((Opc != AMDGPU::V_ADD_CO_U32_e64 && Opc != AMDGPU::V_SUB_CO_U32_e64 &&
           Opc != AMDGPU::V_SUBREV_CO_U32_e64) || // FIXME
          (!OpToFold->isImm() && !OpToFold->isFI() && !OpToFold->isGlobal())) {
        TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo);
        return false;
      }

      // Verify the other operand is a VGPR, otherwise we would violate the
      // constant bus restriction.
      MachineOperand &OtherOp = MI->getOperand(OpNo);
      if (!OtherOp.isReg() ||
          !TII->getRegisterInfo().isVGPR(*MRI, OtherOp.getReg()))
        return false;

      assert(MI->getOperand(1).isDef());

      // Make sure to get the 32-bit version of the commuted opcode.
      unsigned MaybeCommutedOpc = MI->getOpcode();
      Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc);
    }

    appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, true, Op32);
    return true;
  }

  // Inlineable constant might have been folded into Imm operand of fmaak or
  // fmamk and we are trying to fold a non-inlinable constant.
  if ((Opc == AMDGPU::S_FMAAK_F32 || Opc == AMDGPU::S_FMAMK_F32) &&
      !OpToFold->isReg() && !TII->isInlineConstant(*OpToFold)) {
    unsigned ImmIdx = Opc == AMDGPU::S_FMAAK_F32 ? 3 : 2;
    MachineOperand &OpImm = MI->getOperand(ImmIdx);
    if (!OpImm.isReg() &&
        TII->isInlineConstant(*MI, MI->getOperand(OpNo), OpImm))
      return tryToFoldAsFMAAKorMK();
  }

  // Special case for s_fmac_f32 if we are trying to fold into Src0 or Src1.
  // By changing into fmamk we can untie Src2.
  // If folding for Src0 happens first and it is identical operand to Src1 we
  // should avoid transforming into fmamk which requires commuting as it would
  // cause folding into Src1 to fail later on due to wrong OpNo used.
  if (Opc == AMDGPU::S_FMAC_F32 &&
      (OpNo != 1 || !MI->getOperand(1).isIdenticalTo(MI->getOperand(2)))) {
    if (tryToFoldAsFMAAKorMK())
      return true;
  }

  // Check the case where we might introduce a second constant operand to a
  // scalar instruction
  if (TII->isSALU(MI->getOpcode())) {
    const MCInstrDesc &InstDesc = MI->getDesc();
    const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];

    // Fine if the operand can be encoded as an inline constant
    if (!OpToFold->isReg() && !TII->isInlineConstant(*OpToFold, OpInfo)) {
      // Otherwise check for another constant
      for (unsigned i = 0, e = InstDesc.getNumOperands(); i != e; ++i) {
        auto &Op = MI->getOperand(i);
        if (OpNo != i && !Op.isReg() &&
            !TII->isInlineConstant(Op, InstDesc.operands()[i]))
          return false;
      }
    }
  }

  appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
  return true;
}
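
// Illustrative note: when the candidate is not legal in its original position,
// tryAddToFoldList() may commute the instruction and retry; for an
// immediate-like candidate on V_ADD_CO_U32_e64/V_SUB_CO_U32_e64/
// V_SUBREV_CO_U32_e64 the fold can still be recorded together with the e32
// opcode so updateOperand() shrinks the instruction later.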

bool SIFoldOperands::isUseSafeToFold(const MachineInstr &MI,
                                     const MachineOperand &UseMO) const {
  // Operands of SDWA instructions must be registers.
  return !TII->isSDWA(MI);
}

// Find a def of the UseReg, check if it is a reg_sequence and find initializers
// for each subreg, tracking it to foldable inline immediate if possible.
// Returns true on success.
bool SIFoldOperands::getRegSeqInit(
    SmallVectorImpl<std::pair<MachineOperand *, unsigned>> &Defs,
    Register UseReg, uint8_t OpTy) const {
  MachineInstr *Def = MRI->getVRegDef(UseReg);
  if (!Def || !Def->isRegSequence())
    return false;

  for (unsigned I = 1, E = Def->getNumExplicitOperands(); I < E; I += 2) {
    MachineOperand *Sub = &Def->getOperand(I);
    assert(Sub->isReg());

    for (MachineInstr *SubDef = MRI->getVRegDef(Sub->getReg());
         SubDef && Sub->isReg() && Sub->getReg().isVirtual() &&
         !Sub->getSubReg() && TII->isFoldableCopy(*SubDef);
         SubDef = MRI->getVRegDef(Sub->getReg())) {
      MachineOperand *Op = &SubDef->getOperand(1);
      if (Op->isImm()) {
        if (TII->isInlineConstant(*Op, OpTy))
          Sub = Op;
        break;
      }
      if (!Op->isReg() || Op->getReg().isPhysical())
        break;
      Sub = Op;
    }

    Defs.emplace_back(Sub, Def->getOperand(I + 1).getImm());
  }

  return true;
}
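
// Illustrative note: for a REG_SEQUENCE with sub0/sub1/... initializers, the
// helper above walks each initializer through foldable copies, so a sub-value
// that is ultimately a materialized inline immediate is reported as that
// immediate operand.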

bool SIFoldOperands::tryToFoldACImm(
    const MachineOperand &OpToFold, MachineInstr *UseMI, unsigned UseOpIdx,
    SmallVectorImpl<FoldCandidate> &FoldList) const {
  const MCInstrDesc &Desc = UseMI->getDesc();
  if (UseOpIdx >= Desc.getNumOperands())
    return false;

  if (!AMDGPU::isSISrcInlinableOperand(Desc, UseOpIdx))
    return false;

  uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
  if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy) &&
      TII->isOperandLegal(*UseMI, UseOpIdx, &OpToFold)) {
    UseMI->getOperand(UseOpIdx).ChangeToImmediate(OpToFold.getImm());
    return true;
  }

  if (!OpToFold.isReg())
    return false;

  Register UseReg = OpToFold.getReg();
  if (!UseReg.isVirtual())
    return false;

  if (isUseMIInFoldList(FoldList, UseMI))
    return false;

  // Maybe it is just a COPY of an immediate itself.
  MachineInstr *Def = MRI->getVRegDef(UseReg);
  MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
  if (!UseOp.getSubReg() && Def && TII->isFoldableCopy(*Def)) {
    MachineOperand &DefOp = Def->getOperand(1);
    if (DefOp.isImm() && TII->isInlineConstant(DefOp, OpTy) &&
        TII->isOperandLegal(*UseMI, UseOpIdx, &DefOp)) {
      UseMI->getOperand(UseOpIdx).ChangeToImmediate(DefOp.getImm());
      return true;
    }
  }

  SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
  if (!getRegSeqInit(Defs, UseReg, OpTy))
    return false;

  int32_t Imm;
  for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
    const MachineOperand *Op = Defs[I].first;
    if (!Op->isImm())
      return false;

    auto SubImm = Op->getImm();
    if (!I) {
      Imm = SubImm;
      if (!TII->isInlineConstant(*Op, OpTy) ||
          !TII->isOperandLegal(*UseMI, UseOpIdx, Op))
        return false;

      continue;
    }
    if (Imm != SubImm)
      return false; // Can only fold splat constants
  }

  appendFoldCandidate(FoldList, UseMI, UseOpIdx, Defs[0].first);
  return true;
}
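
// Note: only splat initializers can be folded above because the single inline
// constant stands in for the whole wide register operand, i.e. every 32-bit
// piece of the REG_SEQUENCE must equal the folded value.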

void SIFoldOperands::foldOperand(
  MachineOperand &OpToFold,
  MachineInstr *UseMI,
  int UseOpIdx,
  SmallVectorImpl<FoldCandidate> &FoldList,
  SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
  const MachineOperand *UseOp = &UseMI->getOperand(UseOpIdx);

  if (!isUseSafeToFold(*UseMI, *UseOp))
    return;

  // FIXME: Fold operands with subregs.
  if (UseOp->isReg() && OpToFold.isReg() &&
      (UseOp->isImplicit() || UseOp->getSubReg() != AMDGPU::NoSubRegister))
    return;

  // Special case for REG_SEQUENCE: We can't fold literals into
  // REG_SEQUENCE instructions, so we have to fold them into the
  // uses of REG_SEQUENCE.
  if (UseMI->isRegSequence()) {
    Register RegSeqDstReg = UseMI->getOperand(0).getReg();
    unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();

    // Grab the use operands first
    SmallVector<MachineOperand *, 4> UsesToProcess;
    for (auto &Use : MRI->use_nodbg_operands(RegSeqDstReg))
      UsesToProcess.push_back(&Use);
    for (auto *RSUse : UsesToProcess) {
      MachineInstr *RSUseMI = RSUse->getParent();

      if (tryToFoldACImm(UseMI->getOperand(0), RSUseMI,
                         RSUseMI->getOperandNo(RSUse), FoldList))
        continue;

      if (RSUse->getSubReg() != RegSeqDstSubReg)
        continue;

      foldOperand(OpToFold, RSUseMI, RSUseMI->getOperandNo(RSUse), FoldList,
                  CopiesToReplace);
    }
    return;
  }

  if (tryToFoldACImm(OpToFold, UseMI, UseOpIdx, FoldList))
    return;

  if (frameIndexMayFold(*UseMI, UseOpIdx, OpToFold)) {
    // Verify that this is a stack access.
    // FIXME: Should probably use stack pseudos before frame lowering.

    if (TII->isMUBUF(*UseMI)) {
      if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
          MFI->getScratchRSrcReg())
        return;

      // Ensure this is either relative to the current frame or the current
      // wave.
      MachineOperand &SOff =
          *TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
      if (!SOff.isImm() || SOff.getImm() != 0)
        return;
    }

    // A frame index will resolve to a positive constant, so it should always be
    // safe to fold the addressing mode, even pre-GFX9.
    UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getIndex());

    const unsigned Opc = UseMI->getOpcode();
    if (TII->isFLATScratch(*UseMI) &&
        AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
        !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::saddr)) {
      unsigned NewOpc = AMDGPU::getFlatScratchInstSSfromSV(Opc);
      UseMI->setDesc(TII->get(NewOpc));
    }

    return;
  }

  bool FoldingImmLike =
      OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();

  if (FoldingImmLike && UseMI->isCopy()) {
    Register DestReg = UseMI->getOperand(0).getReg();
    Register SrcReg = UseMI->getOperand(1).getReg();
    assert(SrcReg.isVirtual());

    const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg);

    // Don't fold into a copy to a physical register with the same class. Doing
    // so would interfere with the register coalescer's logic which would avoid
    // redundant initializations.
    if (DestReg.isPhysical() && SrcRC->contains(DestReg))
      return;

    const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);
    if (!DestReg.isPhysical()) {
      if (DestRC == &AMDGPU::AGPR_32RegClass &&
          TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
        UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
        CopiesToReplace.push_back(UseMI);
        return;
      }
    }

    // In order to fold immediates into copies, we need to change the
    // copy to a MOV.

    unsigned MovOp = TII->getMovOpcode(DestRC);
    if (MovOp == AMDGPU::COPY)
      return;

    MachineInstr::mop_iterator ImpOpI = UseMI->implicit_operands().begin();
    MachineInstr::mop_iterator ImpOpE = UseMI->implicit_operands().end();
    while (ImpOpI != ImpOpE) {
      MachineInstr::mop_iterator Tmp = ImpOpI;
      ImpOpI++;
      UseMI->removeOperand(UseMI->getOperandNo(Tmp));
    }
    UseMI->setDesc(TII->get(MovOp));

    if (MovOp == AMDGPU::V_MOV_B16_t16_e64) {
      const auto &SrcOp = UseMI->getOperand(UseOpIdx);
      MachineOperand NewSrcOp(SrcOp);
      MachineFunction *MF = UseMI->getParent()->getParent();
      UseMI->removeOperand(1);
      UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // src0_modifiers
      UseMI->addOperand(NewSrcOp);                          // src0
      UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // op_sel
      UseOpIdx = 2;
      UseOp = &UseMI->getOperand(UseOpIdx);
    }
    CopiesToReplace.push_back(UseMI);
  } else {
    if (UseMI->isCopy() && OpToFold.isReg() &&
        UseMI->getOperand(0).getReg().isVirtual() &&
        !UseMI->getOperand(1).getSubReg()) {
      LLVM_DEBUG(dbgs() << "Folding " << OpToFold << "\n into " << *UseMI);
      unsigned Size = TII->getOpSize(*UseMI, 1);
      Register UseReg = OpToFold.getReg();
      UseMI->getOperand(1).setReg(UseReg);
      UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
      UseMI->getOperand(1).setIsKill(false);
      CopiesToReplace.push_back(UseMI);
      OpToFold.setIsKill(false);

      // Remove kill flags as kills may now be out of order with uses.
      MRI->clearKillFlags(OpToFold.getReg());

      // It is very tricky to store a value into an AGPR: v_accvgpr_write_b32
      // can only accept a VGPR or an inline immediate. Recreate a reg_sequence
      // with its initializers right here, so we will rematerialize immediates
      // and avoid copies via different reg classes.
      SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
      if (Size > 4 && TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
          getRegSeqInit(Defs, UseReg, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
        const DebugLoc &DL = UseMI->getDebugLoc();
        MachineBasicBlock &MBB = *UseMI->getParent();

        UseMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
        for (unsigned I = UseMI->getNumOperands() - 1; I > 0; --I)
          UseMI->removeOperand(I);

        MachineInstrBuilder B(*MBB.getParent(), UseMI);
        DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
        SmallSetVector<TargetInstrInfo::RegSubRegPair, 32> SeenAGPRs;
        for (unsigned I = 0; I < Size / 4; ++I) {
          MachineOperand *Def = Defs[I].first;
          TargetInstrInfo::RegSubRegPair CopyToVGPR;
          if (Def->isImm() &&
              TII->isInlineConstant(*Def, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
            int64_t Imm = Def->getImm();

            auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
            BuildMI(MBB, UseMI, DL,
                    TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addImm(Imm);
            B.addReg(Tmp);
          } else if (Def->isReg() && TRI->isAGPR(*MRI, Def->getReg())) {
            auto Src = getRegSubRegPair(*Def);
            Def->setIsKill(false);
            if (!SeenAGPRs.insert(Src)) {
              // We cannot build a reg_sequence out of the same registers, they
              // must be copied. Better do it here before copyPhysReg() created
              // several reads to do the AGPR->VGPR->AGPR copy.
              CopyToVGPR = Src;
            } else {
              B.addReg(Src.Reg, Def->isUndef() ? RegState::Undef : 0,
                       Src.SubReg);
            }
          } else {
            assert(Def->isReg());
            Def->setIsKill(false);
            auto Src = getRegSubRegPair(*Def);

            // Direct copy from SGPR to AGPR is not possible. To avoid creation
            // of exploded copies SGPR->VGPR->AGPR in the copyPhysReg() later,
            // create a copy here and track if we already have such a copy.
            if (TRI->isSGPRReg(*MRI, Src.Reg)) {
              CopyToVGPR = Src;
            } else {
              auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
              BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Tmp).add(*Def);
              B.addReg(Tmp);
            }
          }

          if (CopyToVGPR.Reg) {
            Register Vgpr;
            if (VGPRCopies.count(CopyToVGPR)) {
              Vgpr = VGPRCopies[CopyToVGPR];
            } else {
              Vgpr = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
              BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Vgpr).add(*Def);
              VGPRCopies[CopyToVGPR] = Vgpr;
            }
            auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
            BuildMI(MBB, UseMI, DL,
                    TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), Tmp).addReg(Vgpr);
            B.addReg(Tmp);
          }

          B.addImm(Defs[I].second);
        }
        LLVM_DEBUG(dbgs() << "Folded " << *UseMI);
        return;
      }

      if (Size != 4)
        return;

      Register Reg0 = UseMI->getOperand(0).getReg();
      Register Reg1 = UseMI->getOperand(1).getReg();
      if (TRI->isAGPR(*MRI, Reg0) && TRI->isVGPR(*MRI, Reg1))
        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
      else if (TRI->isVGPR(*MRI, Reg0) && TRI->isAGPR(*MRI, Reg1))
        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64));
      else if (ST->hasGFX90AInsts() && TRI->isAGPR(*MRI, Reg0) &&
               TRI->isAGPR(*MRI, Reg1))
        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_MOV_B32));
      return;
    }

    unsigned UseOpc = UseMI->getOpcode();
    if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||
        (UseOpc == AMDGPU::V_READLANE_B32 &&
         (int)UseOpIdx ==
         AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {
      // %vgpr = V_MOV_B32 imm
      // %sgpr = V_READFIRSTLANE_B32 %vgpr
      // =>
      // %sgpr = S_MOV_B32 imm
      if (FoldingImmLike) {
        if (execMayBeModifiedBeforeUse(*MRI,
                                       UseMI->getOperand(UseOpIdx).getReg(),
                                       *OpToFold.getParent(),
                                       *UseMI))
          return;

        UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));

        if (OpToFold.isImm())
          UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
        else
          UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getIndex());
        UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
        return;
      }

      if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) {
        if (execMayBeModifiedBeforeUse(*MRI,
                                       UseMI->getOperand(UseOpIdx).getReg(),
                                       *OpToFold.getParent(),
                                       *UseMI))
          return;

        // %vgpr = COPY %sgpr0
        // %sgpr1 = V_READFIRSTLANE_B32 %vgpr
        // =>
        // %sgpr1 = COPY %sgpr0
        UseMI->setDesc(TII->get(AMDGPU::COPY));
        UseMI->getOperand(1).setReg(OpToFold.getReg());
        UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
        UseMI->getOperand(1).setIsKill(false);
        UseMI->removeOperand(2); // Remove exec read (or src1 for readlane)
        return;
      }
    }

    const MCInstrDesc &UseDesc = UseMI->getDesc();

    // Don't fold into target independent nodes.  Target independent opcodes
    // don't have defined register classes.
    if (UseDesc.isVariadic() || UseOp->isImplicit() ||
        UseDesc.operands()[UseOpIdx].RegClass == -1)
      return;
  }

  if (!FoldingImmLike) {
    if (OpToFold.isReg() && ST->needsAlignedVGPRs()) {
      // Don't fold if OpToFold doesn't hold an aligned register.
      const TargetRegisterClass *RC =
          TRI->getRegClassForReg(*MRI, OpToFold.getReg());
      assert(RC);
      if (TRI->hasVectorRegisters(RC) && OpToFold.getSubReg()) {
        unsigned SubReg = OpToFold.getSubReg();
        if (const TargetRegisterClass *SubRC =
                TRI->getSubRegisterClass(RC, SubReg))
          RC = SubRC;
      }

      if (!RC || !TRI->isProperlyAlignedRC(*RC))
        return;
    }

    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold);

    // FIXME: We could try to change the instruction from 64-bit to 32-bit
    // to enable more folding opportunities.  The shrink operands pass
    // already does this.
    return;
  }


  const MCInstrDesc &FoldDesc = OpToFold.getParent()->getDesc();
  const TargetRegisterClass *FoldRC =
      TRI->getRegClass(FoldDesc.operands()[0].RegClass);

  // Split 64-bit constants into 32-bits for folding.
  if (UseOp->getSubReg() && AMDGPU::getRegBitWidth(*FoldRC) == 64) {
    Register UseReg = UseOp->getReg();
    const TargetRegisterClass *UseRC = MRI->getRegClass(UseReg);
    if (AMDGPU::getRegBitWidth(*UseRC) != 64)
      return;

    APInt Imm(64, OpToFold.getImm());
    if (UseOp->getSubReg() == AMDGPU::sub0) {
      Imm = Imm.getLoBits(32);
    } else {
      assert(UseOp->getSubReg() == AMDGPU::sub1);
      Imm = Imm.getHiBits(32);
    }

    MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp);
    return;
  }

  tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold);
}
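
// Note on the 64-bit split above: a use that reads the value through sub0 or
// sub1 only consumes half of the constant, so the matching 32-bit half of the
// immediate is folded as an independent operand.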

static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
                                  uint32_t LHS, uint32_t RHS) {
  switch (Opcode) {
  case AMDGPU::V_AND_B32_e64:
  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::S_AND_B32:
    Result = LHS & RHS;
    return true;
  case AMDGPU::V_OR_B32_e64:
  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::S_OR_B32:
    Result = LHS | RHS;
    return true;
  case AMDGPU::V_XOR_B32_e64:
  case AMDGPU::V_XOR_B32_e32:
  case AMDGPU::S_XOR_B32:
    Result = LHS ^ RHS;
    return true;
  case AMDGPU::S_XNOR_B32:
    Result = ~(LHS ^ RHS);
    return true;
  case AMDGPU::S_NAND_B32:
    Result = ~(LHS & RHS);
    return true;
  case AMDGPU::S_NOR_B32:
    Result = ~(LHS | RHS);
    return true;
  case AMDGPU::S_ANDN2_B32:
    Result = LHS & ~RHS;
    return true;
  case AMDGPU::S_ORN2_B32:
    Result = LHS | ~RHS;
    return true;
  case AMDGPU::V_LSHL_B32_e64:
  case AMDGPU::V_LSHL_B32_e32:
  case AMDGPU::S_LSHL_B32:
    // The instruction ignores the high bits for out of bounds shifts.
    Result = LHS << (RHS & 31);
    return true;
  case AMDGPU::V_LSHLREV_B32_e64:
  case AMDGPU::V_LSHLREV_B32_e32:
    Result = RHS << (LHS & 31);
    return true;
  case AMDGPU::V_LSHR_B32_e64:
  case AMDGPU::V_LSHR_B32_e32:
  case AMDGPU::S_LSHR_B32:
    Result = LHS >> (RHS & 31);
    return true;
  case AMDGPU::V_LSHRREV_B32_e64:
  case AMDGPU::V_LSHRREV_B32_e32:
    Result = RHS >> (LHS & 31);
    return true;
  case AMDGPU::V_ASHR_I32_e64:
  case AMDGPU::V_ASHR_I32_e32:
  case AMDGPU::S_ASHR_I32:
    Result = static_cast<int32_t>(LHS) >> (RHS & 31);
    return true;
  case AMDGPU::V_ASHRREV_I32_e64:
  case AMDGPU::V_ASHRREV_I32_e32:
    Result = static_cast<int32_t>(RHS) >> (LHS & 31);
    return true;
  default:
    return false;
  }
}

static unsigned getMovOpc(bool IsScalar) {
  return IsScalar ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
}

static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) {
  MI.setDesc(NewDesc);

  // Remove any leftover implicit operands from mutating the instruction. e.g.
  // if we replace an s_and_b32 with a copy, we don't need the implicit scc def
  // anymore.
  const MCInstrDesc &Desc = MI.getDesc();
  unsigned NumOps = Desc.getNumOperands() + Desc.implicit_uses().size() +
                    Desc.implicit_defs().size();

  for (unsigned I = MI.getNumOperands() - 1; I >= NumOps; --I)
    MI.removeOperand(I);
}

MachineOperand *
SIFoldOperands::getImmOrMaterializedImm(MachineOperand &Op) const {
  // If this has a subregister, it obviously is a register source.
  if (!Op.isReg() || Op.getSubReg() != AMDGPU::NoSubRegister ||
      !Op.getReg().isVirtual())
    return &Op;

  MachineInstr *Def = MRI->getVRegDef(Op.getReg());
  if (Def && Def->isMoveImmediate()) {
    MachineOperand &ImmSrc = Def->getOperand(1);
    if (ImmSrc.isImm())
      return &ImmSrc;
  }

  return &Op;
}

// Try to simplify operations with a constant that may appear after instruction
// selection.
// TODO: See if a frame index with a fixed offset can fold.
bool SIFoldOperands::tryConstantFoldOp(MachineInstr *MI) const {
  if (!MI->allImplicitDefsAreDead())
    return false;

  unsigned Opc = MI->getOpcode();

  int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
  if (Src0Idx == -1)
    return false;
  MachineOperand *Src0 = getImmOrMaterializedImm(MI->getOperand(Src0Idx));

  if ((Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
       Opc == AMDGPU::S_NOT_B32) &&
      Src0->isImm()) {
    MI->getOperand(1).ChangeToImmediate(~Src0->getImm());
    mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
    return true;
  }

  int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
  if (Src1Idx == -1)
    return false;
  MachineOperand *Src1 = getImmOrMaterializedImm(MI->getOperand(Src1Idx));

  if (!Src0->isImm() && !Src1->isImm())
    return false;

  // and k0, k1 -> v_mov_b32 (k0 & k1)
  // or k0, k1 -> v_mov_b32 (k0 | k1)
  // xor k0, k1 -> v_mov_b32 (k0 ^ k1)
  if (Src0->isImm() && Src1->isImm()) {
    int32_t NewImm;
    if (!evalBinaryInstruction(Opc, NewImm, Src0->getImm(), Src1->getImm()))
      return false;

    bool IsSGPR = TRI->isSGPRReg(*MRI, MI->getOperand(0).getReg());

    // Be careful to change the right operand, src0 may belong to a different
    // instruction.
    MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
    MI->removeOperand(Src1Idx);
    mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR)));
    return true;
  }

  if (!MI->isCommutable())
    return false;

  if (Src0->isImm() && !Src1->isImm()) {
    std::swap(Src0, Src1);
    std::swap(Src0Idx, Src1Idx);
  }

  int32_t Src1Val = static_cast<int32_t>(Src1->getImm());
  if (Opc == AMDGPU::V_OR_B32_e64 ||
      Opc == AMDGPU::V_OR_B32_e32 ||
      Opc == AMDGPU::S_OR_B32) {
    if (Src1Val == 0) {
      // y = or x, 0 => y = copy x
      MI->removeOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
    } else if (Src1Val == -1) {
      // y = or x, -1 => y = v_mov_b32 -1
      MI->removeOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_OR_B32)));
    } else
      return false;

    return true;
  }

  if (Opc == AMDGPU::V_AND_B32_e64 || Opc == AMDGPU::V_AND_B32_e32 ||
      Opc == AMDGPU::S_AND_B32) {
    if (Src1Val == 0) {
      // y = and x, 0 => y = v_mov_b32 0
      MI->removeOperand(Src0Idx);
      mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_AND_B32)));
    } else if (Src1Val == -1) {
      // y = and x, -1 => y = copy x
      MI->removeOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
    } else
      return false;

    return true;
  }

  if (Opc == AMDGPU::V_XOR_B32_e64 || Opc == AMDGPU::V_XOR_B32_e32 ||
      Opc == AMDGPU::S_XOR_B32) {
    if (Src1Val == 0) {
      // y = xor x, 0 => y = copy x
      MI->removeOperand(Src1Idx);
      mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
      return true;
    }
  }

  return false;
}

// Try to fold an instruction into a simpler one
bool SIFoldOperands::tryFoldCndMask(MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();
  if (Opc != AMDGPU::V_CNDMASK_B32_e32 && Opc != AMDGPU::V_CNDMASK_B32_e64 &&
      Opc != AMDGPU::V_CNDMASK_B64_PSEUDO)
    return false;

  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  if (!Src1->isIdenticalTo(*Src0)) {
    auto *Src0Imm = getImmOrMaterializedImm(*Src0);
    auto *Src1Imm = getImmOrMaterializedImm(*Src1);
    if (!Src1Imm->isIdenticalTo(*Src0Imm))
      return false;
  }

  int Src1ModIdx =
      AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
  int Src0ModIdx =
      AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
  if ((Src1ModIdx != -1 && MI.getOperand(Src1ModIdx).getImm() != 0) ||
      (Src0ModIdx != -1 && MI.getOperand(Src0ModIdx).getImm() != 0))
    return false;

  LLVM_DEBUG(dbgs() << "Folded " << MI << " into ");
  auto &NewDesc =
      TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY : getMovOpc(false));
  int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
  if (Src2Idx != -1)
    MI.removeOperand(Src2Idx);
  MI.removeOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
  if (Src1ModIdx != -1)
    MI.removeOperand(Src1ModIdx);
  if (Src0ModIdx != -1)
    MI.removeOperand(Src0ModIdx);
  mutateCopyOp(MI, NewDesc);
  LLVM_DEBUG(dbgs() << MI);
  return true;
}
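
// Illustrative example: a V_CNDMASK_B32 whose src0 and src1 are identical (or
// are registers materialized from the same immediate) selects the same value
// regardless of the mask, so it is rewritten into a COPY or V_MOV_B32 of that
// value.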

bool SIFoldOperands::tryFoldZeroHighBits(MachineInstr &MI) const {
  if (MI.getOpcode() != AMDGPU::V_AND_B32_e64 &&
      MI.getOpcode() != AMDGPU::V_AND_B32_e32)
    return false;

  MachineOperand *Src0 = getImmOrMaterializedImm(MI.getOperand(1));
  if (!Src0->isImm() || Src0->getImm() != 0xffff)
    return false;

  Register Src1 = MI.getOperand(2).getReg();
  MachineInstr *SrcDef = MRI->getVRegDef(Src1);
  if (!ST->zeroesHigh16BitsOfDest(SrcDef->getOpcode()))
    return false;

  Register Dst = MI.getOperand(0).getReg();
  MRI->replaceRegWith(Dst, Src1);
  if (!MI.getOperand(2).isKill())
    MRI->clearKillFlags(Src1);
  MI.eraseFromParent();
  return true;
}
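
// Illustrative example: "%d = V_AND_B32 0xffff, %x" is redundant when the
// instruction defining %x already zeroes the high 16 bits of its result, so
// uses of %d are rewritten to %x and the AND is erased.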

bool SIFoldOperands::foldInstOperand(MachineInstr &MI,
                                     MachineOperand &OpToFold) const {
  // We need to mutate the operands of new mov instructions to add implicit
1374   // uses of EXEC, but adding them invalidates the use_iterator, so defer
1375   // this.
1376   SmallVector<MachineInstr *, 4> CopiesToReplace;
1377   SmallVector<FoldCandidate, 4> FoldList;
1378   MachineOperand &Dst = MI.getOperand(0);
1379   bool Changed = false;
1380 
1381   if (OpToFold.isImm()) {
1382     for (auto &UseMI :
1383          make_early_inc_range(MRI->use_nodbg_instructions(Dst.getReg()))) {
1384       // Folding the immediate may reveal operations that can be constant
1385       // folded or replaced with a copy. This can happen for example after
1386       // frame indices are lowered to constants or from splitting 64-bit
1387       // constants.
1388       //
1389       // We may also encounter cases where one or both operands are
1390       // immediates materialized into a register, which would ordinarily not
1391       // be folded due to multiple uses or operand constraints.
1392       if (tryConstantFoldOp(&UseMI)) {
1393         LLVM_DEBUG(dbgs() << "Constant folded " << UseMI);
1394         Changed = true;
1395       }
1396     }
1397   }
1398 
1399   SmallVector<MachineOperand *, 4> UsesToProcess;
1400   for (auto &Use : MRI->use_nodbg_operands(Dst.getReg()))
1401     UsesToProcess.push_back(&Use);
1402   for (auto *U : UsesToProcess) {
1403     MachineInstr *UseMI = U->getParent();
1404     foldOperand(OpToFold, UseMI, UseMI->getOperandNo(U), FoldList,
1405                 CopiesToReplace);
1406   }
1407 
1408   if (CopiesToReplace.empty() && FoldList.empty())
1409     return Changed;
1410 
1411   MachineFunction *MF = MI.getParent()->getParent();
1412   // Make sure we add EXEC uses to any new v_mov instructions created.
1413   for (MachineInstr *Copy : CopiesToReplace)
1414     Copy->addImplicitDefUseOperands(*MF);
1415 
1416   for (FoldCandidate &Fold : FoldList) {
1417     assert(!Fold.isReg() || Fold.OpToFold);
1418     if (Fold.isReg() && Fold.OpToFold->getReg().isVirtual()) {
1419       Register Reg = Fold.OpToFold->getReg();
1420       MachineInstr *DefMI = Fold.OpToFold->getParent();
1421       if (DefMI->readsRegister(AMDGPU::EXEC, TRI) &&
1422           execMayBeModifiedBeforeUse(*MRI, Reg, *DefMI, *Fold.UseMI))
1423         continue;
1424     }
1425     if (updateOperand(Fold)) {
1426       // Clear kill flags.
1427       if (Fold.isReg()) {
1428         assert(Fold.OpToFold && Fold.OpToFold->isReg());
1429         // FIXME: Probably shouldn't bother trying to fold if not an
1430         // SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
1431         // copies.
1432         MRI->clearKillFlags(Fold.OpToFold->getReg());
1433       }
1434       LLVM_DEBUG(dbgs() << "Folded source from " << MI << " into OpNo "
1435                         << static_cast<int>(Fold.UseOpNo) << " of "
1436                         << *Fold.UseMI);
1437     } else if (Fold.Commuted) {
1438       // Restore the instruction's original operand order if the fold failed.
1439       TII->commuteInstruction(*Fold.UseMI, false);
1440     }
1441   }
1442   return true;
1443 }
1444 
1445 bool SIFoldOperands::tryFoldFoldableCopy(
1446     MachineInstr &MI, MachineOperand *&CurrentKnownM0Val) const {
1447   // Specially track simple redefs of m0 to the same value in a block, so we
1448   // can erase the later ones.
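  // For example (illustrative): two S_MOV_B32 writes of the same value to m0
  // in one block, with no intervening clobber of m0, allow the second write to
  // be erased here.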
1449   if (MI.getOperand(0).getReg() == AMDGPU::M0) {
1450     MachineOperand &NewM0Val = MI.getOperand(1);
1451     if (CurrentKnownM0Val && CurrentKnownM0Val->isIdenticalTo(NewM0Val)) {
1452       MI.eraseFromParent();
1453       return true;
1454     }
1455 
1456     // We aren't tracking other physical registers
1457     CurrentKnownM0Val = (NewM0Val.isReg() && NewM0Val.getReg().isPhysical())
1458                             ? nullptr
1459                             : &NewM0Val;
1460     return false;
1461   }
1462 
1463   MachineOperand &OpToFold = MI.getOperand(1);
1464   bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
1465 
1466   // FIXME: We could also be folding things like TargetIndexes.
1467   if (!FoldingImm && !OpToFold.isReg())
1468     return false;
1469 
1470   if (OpToFold.isReg() && !OpToFold.getReg().isVirtual())
1471     return false;
1472 
1473   // Prevent folding operands backwards in the function. For example,
1474   // the COPY opcode must not be replaced by 1 in this example:
1475   //
1476   //    %3 = COPY %vgpr0; VGPR_32:%3
1477   //    ...
1478   //    %vgpr0 = V_MOV_B32_e32 1, implicit %exec
1479   if (!MI.getOperand(0).getReg().isVirtual())
1480     return false;
1481 
1482   bool Changed = foldInstOperand(MI, OpToFold);
1483 
1484   // If we managed to fold all uses of this copy then we might as well
1485   // delete it now.
1486   // The only reason we need to follow chains of copies here is that
1487   // tryFoldRegSequence looks forward through copies before folding a
1488   // REG_SEQUENCE into its eventual users.
1489   auto *InstToErase = &MI;
1490   while (MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
1491     auto &SrcOp = InstToErase->getOperand(1);
1492     auto SrcReg = SrcOp.isReg() ? SrcOp.getReg() : Register();
1493     InstToErase->eraseFromParent();
1494     Changed = true;
1495     InstToErase = nullptr;
1496     if (!SrcReg || SrcReg.isPhysical())
1497       break;
1498     InstToErase = MRI->getVRegDef(SrcReg);
1499     if (!InstToErase || !TII->isFoldableCopy(*InstToErase))
1500       break;
1501   }
1502 
1503   if (InstToErase && InstToErase->isRegSequence() &&
1504       MRI->use_nodbg_empty(InstToErase->getOperand(0).getReg())) {
1505     InstToErase->eraseFromParent();
1506     Changed = true;
1507   }
1508 
1509   return Changed;
1510 }
1511 
1512 // Clamp patterns are canonically selected to v_max_* instructions, so only
1513 // handle them.
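// A sketch of the expected pattern (illustrative operand layout, not exact
// MIR syntax):
//   %1:vgpr_32 = V_MAX_F32_e64 0, %0, 0, %0, /*clamp*/ 1, /*omod*/ 0
// i.e. a max of a value with itself, with the clamp bit set and no source
// modifiers, represents clamp(%0).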
1514 const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
1515   unsigned Op = MI.getOpcode();
1516   switch (Op) {
1517   case AMDGPU::V_MAX_F32_e64:
1518   case AMDGPU::V_MAX_F16_e64:
1519   case AMDGPU::V_MAX_F16_t16_e64:
1520   case AMDGPU::V_MAX_F16_fake16_e64:
1521   case AMDGPU::V_MAX_F64_e64:
1522   case AMDGPU::V_MAX_NUM_F64_e64:
1523   case AMDGPU::V_PK_MAX_F16: {
1524     if (MI.mayRaiseFPException())
1525       return nullptr;
1526 
1527     if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
1528       return nullptr;
1529 
1530     // Make sure sources are identical.
1531     const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1532     const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1533     if (!Src0->isReg() || !Src1->isReg() ||
1534         Src0->getReg() != Src1->getReg() ||
1535         Src0->getSubReg() != Src1->getSubReg() ||
1536         Src0->getSubReg() != AMDGPU::NoSubRegister)
1537       return nullptr;
1538 
1539     // Can't fold up if we have modifiers.
1540     if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
1541       return nullptr;
1542 
1543     unsigned Src0Mods
1544       = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
1545     unsigned Src1Mods
1546       = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();
1547 
1548     // Having a 0 op_sel_hi would require swizzling the output in the source
1549     // instruction, which we can't do.
1550     unsigned UnsetMods = (Op == AMDGPU::V_PK_MAX_F16) ? SISrcMods::OP_SEL_1
1551                                                       : 0u;
1552     if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
1553       return nullptr;
1554     return Src0;
1555   }
1556   default:
1557     return nullptr;
1558   }
1559 }
1560 
1561 // FIXME: Clamp for v_mad_mixhi_f16 handled during isel.
1562 bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
1563   const MachineOperand *ClampSrc = isClamp(MI);
1564   if (!ClampSrc || !MRI->hasOneNonDBGUser(ClampSrc->getReg()))
1565     return false;
1566 
1567   MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());
1568 
1569   // The type of clamp must be compatible.
1570   if (TII->getClampMask(*Def) != TII->getClampMask(MI))
1571     return false;
1572 
1573   if (Def->mayRaiseFPException())
1574     return false;
1575 
1576   MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
1577   if (!DefClamp)
1578     return false;
1579 
1580   LLVM_DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def);
1581 
1582   // Clamp is applied after omod, so it is OK if omod is set.
1583   DefClamp->setImm(1);
1584 
1585   Register DefReg = Def->getOperand(0).getReg();
1586   Register MIDstReg = MI.getOperand(0).getReg();
1587   if (TRI->isSGPRReg(*MRI, DefReg)) {
1588     // Pseudo scalar instructions have an SGPR dst, while the clamp is a
1589     // v_max* instruction with a VGPR dst.
1590     BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY),
1591             MIDstReg)
1592         .addReg(DefReg);
1593   } else {
1594     MRI->replaceRegWith(MIDstReg, DefReg);
1595   }
1596   MI.eraseFromParent();
1597 
1598   // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
1599   // instruction, so we might as well convert it to the more flexible VOP3-only
1600   // mad/fma form.
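  // For example (illustrative), a V_FMAC_F32 def can be rewritten by
  // convertToThreeAddress to the V_FMA_F32 form, which no longer ties its
  // destination to src2.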
1601   if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
1602     Def->eraseFromParent();
1603 
1604   return true;
1605 }
1606 
1607 static int getOModValue(unsigned Opc, int64_t Val) {
1608   switch (Opc) {
1609   case AMDGPU::V_MUL_F64_e64:
1610   case AMDGPU::V_MUL_F64_pseudo_e64: {
1611     switch (Val) {
1612     case 0x3fe0000000000000: // 0.5
1613       return SIOutMods::DIV2;
1614     case 0x4000000000000000: // 2.0
1615       return SIOutMods::MUL2;
1616     case 0x4010000000000000: // 4.0
1617       return SIOutMods::MUL4;
1618     default:
1619       return SIOutMods::NONE;
1620     }
1621   }
1622   case AMDGPU::V_MUL_F32_e64: {
1623     switch (static_cast<uint32_t>(Val)) {
1624     case 0x3f000000: // 0.5
1625       return SIOutMods::DIV2;
1626     case 0x40000000: // 2.0
1627       return SIOutMods::MUL2;
1628     case 0x40800000: // 4.0
1629       return SIOutMods::MUL4;
1630     default:
1631       return SIOutMods::NONE;
1632     }
1633   }
1634   case AMDGPU::V_MUL_F16_e64:
1635   case AMDGPU::V_MUL_F16_t16_e64:
1636   case AMDGPU::V_MUL_F16_fake16_e64: {
1637     switch (static_cast<uint16_t>(Val)) {
1638     case 0x3800: // 0.5
1639       return SIOutMods::DIV2;
1640     case 0x4000: // 2.0
1641       return SIOutMods::MUL2;
1642     case 0x4400: // 4.0
1643       return SIOutMods::MUL4;
1644     default:
1645       return SIOutMods::NONE;
1646     }
1647   }
1648   default:
1649     llvm_unreachable("invalid mul opcode");
1650   }
1651 }
1652 
1653 // FIXME: Does this really not support denormals with f16?
1654 // FIXME: Does this need to check IEEE mode bit? SNaNs are generally not
1655 // handled, so will anything other than that break?
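// Illustrative example of what this matcher recognizes (operand layout
// sketched, not exact MIR syntax): a single-use multiply such as
//   %1 = V_MUL_F32_e64 0, 2.0, 0, %0, 0, 0
// yields (src %0, SIOutMods::MUL2), so the caller can fold the multiply away
// by setting omod on the instruction defining %0.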
1656 std::pair<const MachineOperand *, int>
1657 SIFoldOperands::isOMod(const MachineInstr &MI) const {
1658   unsigned Op = MI.getOpcode();
1659   switch (Op) {
1660   case AMDGPU::V_MUL_F64_e64:
1661   case AMDGPU::V_MUL_F64_pseudo_e64:
1662   case AMDGPU::V_MUL_F32_e64:
1663   case AMDGPU::V_MUL_F16_t16_e64:
1664   case AMDGPU::V_MUL_F16_fake16_e64:
1665   case AMDGPU::V_MUL_F16_e64: {
1666     // If output denormals are enabled, omod is ignored.
1667     if ((Op == AMDGPU::V_MUL_F32_e64 &&
1668          MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
1669         ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F64_pseudo_e64 ||
1670           Op == AMDGPU::V_MUL_F16_e64 || Op == AMDGPU::V_MUL_F16_t16_e64 ||
1671           Op == AMDGPU::V_MUL_F16_fake16_e64) &&
1672          MFI->getMode().FP64FP16Denormals.Output !=
1673              DenormalMode::PreserveSign) ||
1674         MI.mayRaiseFPException())
1675       return std::pair(nullptr, SIOutMods::NONE);
1676 
1677     const MachineOperand *RegOp = nullptr;
1678     const MachineOperand *ImmOp = nullptr;
1679     const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1680     const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1681     if (Src0->isImm()) {
1682       ImmOp = Src0;
1683       RegOp = Src1;
1684     } else if (Src1->isImm()) {
1685       ImmOp = Src1;
1686       RegOp = Src0;
1687     } else
1688       return std::pair(nullptr, SIOutMods::NONE);
1689 
1690     int OMod = getOModValue(Op, ImmOp->getImm());
1691     if (OMod == SIOutMods::NONE ||
1692         TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
1693         TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
1694         TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
1695         TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
1696       return std::pair(nullptr, SIOutMods::NONE);
1697 
1698     return std::pair(RegOp, OMod);
1699   }
1700   case AMDGPU::V_ADD_F64_e64:
1701   case AMDGPU::V_ADD_F64_pseudo_e64:
1702   case AMDGPU::V_ADD_F32_e64:
1703   case AMDGPU::V_ADD_F16_e64:
1704   case AMDGPU::V_ADD_F16_t16_e64:
1705   case AMDGPU::V_ADD_F16_fake16_e64: {
1706     // If output denormals are enabled, omod is ignored.
1707     if ((Op == AMDGPU::V_ADD_F32_e64 &&
1708          MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
1709         ((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F64_pseudo_e64 ||
1710           Op == AMDGPU::V_ADD_F16_e64 || Op == AMDGPU::V_ADD_F16_t16_e64 ||
1711           Op == AMDGPU::V_ADD_F16_fake16_e64) &&
1712          MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign))
1713       return std::pair(nullptr, SIOutMods::NONE);
1714 
1715     // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
1716     const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1717     const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1718 
1719     if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&
1720         Src0->getSubReg() == Src1->getSubReg() &&
1721         !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
1722         !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
1723         !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
1724         !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
1725       return std::pair(Src0, SIOutMods::MUL2);
1726 
1727     return std::pair(nullptr, SIOutMods::NONE);
1728   }
1729   default:
1730     return std::pair(nullptr, SIOutMods::NONE);
1731   }
1732 }
1733 
1734 // FIXME: Does this need to check IEEE bit on function?
1735 bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) {
1736   const MachineOperand *RegOp;
1737   int OMod;
1738   std::tie(RegOp, OMod) = isOMod(MI);
1739   if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
1740       RegOp->getSubReg() != AMDGPU::NoSubRegister ||
1741       !MRI->hasOneNonDBGUser(RegOp->getReg()))
1742     return false;
1743 
1744   MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
1745   MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
1746   if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
1747     return false;
1748 
1749   if (Def->mayRaiseFPException())
1750     return false;
1751 
1752   // Clamp is applied after omod. If the source already has clamp set, don't
1753   // fold it.
1754   if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
1755     return false;
1756 
1757   LLVM_DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def);
1758 
1759   DefOMod->setImm(OMod);
1760   MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
1761   MI.eraseFromParent();
1762 
1763   // Use of output modifiers forces VOP3 encoding for a VOP2 mac/fmac
1764   // instruction, so we might as well convert it to the more flexible VOP3-only
1765   // mad/fma form.
1766   if (TII->convertToThreeAddress(*Def, nullptr, nullptr))
1767     Def->eraseFromParent();
1768 
1769   return true;
1770 }
1771 
1772 // Try to fold a reg_sequence with vgpr output and agpr inputs into an
1773 // instruction which can take an agpr. So far that means a store.
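// Illustrative example (register classes and opcode are hypothetical but
// representative):
//   %0:vreg_128 = REG_SEQUENCE %1:agpr_32, %subreg.sub0, %2:agpr_32, ...
//   GLOBAL_STORE_DWORDX4 %ptr, %0, ...
// If the store operand accepts an AV register class, a new REG_SEQUENCE with
// an areg_128 result is built and used by the store directly, avoiding
// AGPR->VGPR copies.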
1774 bool SIFoldOperands::tryFoldRegSequence(MachineInstr &MI) {
1775   assert(MI.isRegSequence());
1776   auto Reg = MI.getOperand(0).getReg();
1777 
1778   if (!ST->hasGFX90AInsts() || !TRI->isVGPR(*MRI, Reg) ||
1779       !MRI->hasOneNonDBGUse(Reg))
1780     return false;
1781 
1782   SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
1783   if (!getRegSeqInit(Defs, Reg, MCOI::OPERAND_REGISTER))
1784     return false;
1785 
1786   for (auto &[Op, SubIdx] : Defs) {
1787     if (!Op->isReg())
1788       return false;
1789     if (TRI->isAGPR(*MRI, Op->getReg()))
1790       continue;
1791     // Maybe this is a COPY from AREG
1792     const MachineInstr *SubDef = MRI->getVRegDef(Op->getReg());
1793     if (!SubDef || !SubDef->isCopy() || SubDef->getOperand(1).getSubReg())
1794       return false;
1795     if (!TRI->isAGPR(*MRI, SubDef->getOperand(1).getReg()))
1796       return false;
1797   }
1798 
1799   MachineOperand *Op = &*MRI->use_nodbg_begin(Reg);
1800   MachineInstr *UseMI = Op->getParent();
1801   while (UseMI->isCopy() && !Op->getSubReg()) {
1802     Reg = UseMI->getOperand(0).getReg();
1803     if (!TRI->isVGPR(*MRI, Reg) || !MRI->hasOneNonDBGUse(Reg))
1804       return false;
1805     Op = &*MRI->use_nodbg_begin(Reg);
1806     UseMI = Op->getParent();
1807   }
1808 
1809   if (Op->getSubReg())
1810     return false;
1811 
1812   unsigned OpIdx = Op - &UseMI->getOperand(0);
1813   const MCInstrDesc &InstDesc = UseMI->getDesc();
1814   const TargetRegisterClass *OpRC =
1815       TII->getRegClass(InstDesc, OpIdx, TRI, *MI.getMF());
1816   if (!OpRC || !TRI->isVectorSuperClass(OpRC))
1817     return false;
1818 
1819   const auto *NewDstRC = TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg));
1820   auto Dst = MRI->createVirtualRegister(NewDstRC);
1821   auto RS = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
1822                     TII->get(AMDGPU::REG_SEQUENCE), Dst);
1823 
1824   for (auto &[Def, SubIdx] : Defs) {
1825     Def->setIsKill(false);
1826     if (TRI->isAGPR(*MRI, Def->getReg())) {
1827       RS.add(*Def);
1828     } else { // This is a copy
1829       MachineInstr *SubDef = MRI->getVRegDef(Def->getReg());
1830       SubDef->getOperand(1).setIsKill(false);
1831       RS.addReg(SubDef->getOperand(1).getReg(), 0, Def->getSubReg());
1832     }
1833     RS.addImm(SubIdx);
1834   }
1835 
1836   Op->setReg(Dst);
1837   if (!TII->isOperandLegal(*UseMI, OpIdx, Op)) {
1838     Op->setReg(Reg);
1839     RS->eraseFromParent();
1840     return false;
1841   }
1842 
1843   LLVM_DEBUG(dbgs() << "Folded " << *RS << " into " << *UseMI);
1844 
1845   // Erase the REG_SEQUENCE eagerly, unless we followed a chain of COPY users,
1846   // in which case we can erase them all later in runOnMachineFunction.
1847   if (MRI->use_nodbg_empty(MI.getOperand(0).getReg()))
1848     MI.eraseFromParent();
1849   return true;
1850 }
1851 
1852 /// Checks whether \p Copy is an AGPR -> VGPR copy. Returns `true` on success
1853 /// and stores the AGPR register in \p OutReg and the subreg in \p OutSubReg.
1854 static bool isAGPRCopy(const SIRegisterInfo &TRI,
1855                        const MachineRegisterInfo &MRI, const MachineInstr &Copy,
1856                        Register &OutReg, unsigned &OutSubReg) {
1857   assert(Copy.isCopy());
1858 
1859   const MachineOperand &CopySrc = Copy.getOperand(1);
1860   Register CopySrcReg = CopySrc.getReg();
1861   if (!CopySrcReg.isVirtual())
1862     return false;
1863 
1864   // Common case: copy from AGPR directly, e.g.
1865   //  %1:vgpr_32 = COPY %0:agpr_32
1866   if (TRI.isAGPR(MRI, CopySrcReg)) {
1867     OutReg = CopySrcReg;
1868     OutSubReg = CopySrc.getSubReg();
1869     return true;
1870   }
1871 
1872   // Sometimes it can also involve two copies, e.g.
1873   //  %1:vgpr_256 = COPY %0:agpr_256
1874   //  %2:vgpr_32 = COPY %1:vgpr_256.sub0
1875   const MachineInstr *CopySrcDef = MRI.getVRegDef(CopySrcReg);
1876   if (!CopySrcDef || !CopySrcDef->isCopy())
1877     return false;
1878 
1879   const MachineOperand &OtherCopySrc = CopySrcDef->getOperand(1);
1880   Register OtherCopySrcReg = OtherCopySrc.getReg();
1881   if (!OtherCopySrcReg.isVirtual() ||
1882       CopySrcDef->getOperand(0).getSubReg() != AMDGPU::NoSubRegister ||
1883       OtherCopySrc.getSubReg() != AMDGPU::NoSubRegister ||
1884       !TRI.isAGPR(MRI, OtherCopySrcReg))
1885     return false;
1886 
1887   OutReg = OtherCopySrcReg;
1888   OutSubReg = CopySrc.getSubReg();
1889   return true;
1890 }
1891 
1892 // Try to hoist an AGPR to VGPR copy across a PHI.
1893 // This should allow folding of an AGPR into a consumer which may support it.
1894 //
1895 // Example 1: LCSSA PHI
1896 //      loop:
1897 //        %1:vreg = COPY %0:areg
1898 //      exit:
1899 //        %2:vreg = PHI %1:vreg, %loop
1900 //  =>
1901 //      loop:
1902 //      exit:
1903 //        %1:areg = PHI %0:areg, %loop
1904 //        %2:vreg = COPY %1:areg
1905 //
1906 // Example 2: PHI with multiple incoming values:
1907 //      entry:
1908 //        %1:vreg = GLOBAL_LOAD(..)
1909 //      loop:
1910 //        %2:vreg = PHI %1:vreg, %entry, %5:vreg, %loop
1911 //        %3:areg = COPY %2:vreg
1912 //        %4:areg = (instr using %3:areg)
1913 //        %5:vreg = COPY %4:areg
1914 //  =>
1915 //      entry:
1916 //        %1:vreg = GLOBAL_LOAD(..)
1917 //        %2:areg = COPY %1:vreg
1918 //      loop:
1919 //        %3:areg = PHI %2:areg, %entry, %X:areg,
1920 //        %4:areg = (instr using %3:areg)
1921 bool SIFoldOperands::tryFoldPhiAGPR(MachineInstr &PHI) {
1922   assert(PHI.isPHI());
1923 
1924   Register PhiOut = PHI.getOperand(0).getReg();
1925   if (!TRI->isVGPR(*MRI, PhiOut))
1926     return false;
1927 
1928   // Iterate once over all incoming values of the PHI to check if this PHI is
1929   // eligible, and determine the exact AGPR RC we'll target.
1930   const TargetRegisterClass *ARC = nullptr;
1931   for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
1932     MachineOperand &MO = PHI.getOperand(K);
1933     MachineInstr *Copy = MRI->getVRegDef(MO.getReg());
1934     if (!Copy || !Copy->isCopy())
1935       continue;
1936 
1937     Register AGPRSrc;
1938     unsigned AGPRRegMask = AMDGPU::NoSubRegister;
1939     if (!isAGPRCopy(*TRI, *MRI, *Copy, AGPRSrc, AGPRRegMask))
1940       continue;
1941 
1942     const TargetRegisterClass *CopyInRC = MRI->getRegClass(AGPRSrc);
1943     if (const auto *SubRC = TRI->getSubRegisterClass(CopyInRC, AGPRRegMask))
1944       CopyInRC = SubRC;
1945 
1946     if (ARC && !ARC->hasSubClassEq(CopyInRC))
1947       return false;
1948     ARC = CopyInRC;
1949   }
1950 
1951   if (!ARC)
1952     return false;
1953 
1954   bool IsAGPR32 = (ARC == &AMDGPU::AGPR_32RegClass);
1955 
1956   // Rewrite the PHI's incoming values to ARC.
1957   LLVM_DEBUG(dbgs() << "Folding AGPR copies into: " << PHI);
1958   for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
1959     MachineOperand &MO = PHI.getOperand(K);
1960     Register Reg = MO.getReg();
1961 
1962     MachineBasicBlock::iterator InsertPt;
1963     MachineBasicBlock *InsertMBB = nullptr;
1964 
1965     // Look at the def of Reg, ignoring all copies.
1966     unsigned CopyOpc = AMDGPU::COPY;
1967     if (MachineInstr *Def = MRI->getVRegDef(Reg)) {
1968 
1969       // Look at pre-existing COPY instructions from ARC: Steal the operand. If
1970       // the copy was single-use, it will be removed by DCE later.
1971       if (Def->isCopy()) {
1972         Register AGPRSrc;
1973         unsigned AGPRSubReg = AMDGPU::NoSubRegister;
1974         if (isAGPRCopy(*TRI, *MRI, *Def, AGPRSrc, AGPRSubReg)) {
1975           MO.setReg(AGPRSrc);
1976           MO.setSubReg(AGPRSubReg);
1977           continue;
1978         }
1979 
1980         // If this is a multi-use SGPR -> VGPR copy, use V_ACCVGPR_WRITE on
1981         // GFX908 directly instead of a COPY. Otherwise, SIFoldOperands may try
1982         // to fold the sgpr -> vgpr -> agpr copy into an sgpr -> agpr copy, which
1983         // is unlikely to be profitable.
1984         //
1985         // Note that V_ACCVGPR_WRITE is only used for AGPR_32.
1986         MachineOperand &CopyIn = Def->getOperand(1);
1987         if (IsAGPR32 && !ST->hasGFX90AInsts() && !MRI->hasOneNonDBGUse(Reg) &&
1988             TRI->isSGPRReg(*MRI, CopyIn.getReg()))
1989           CopyOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1990       }
1991 
1992       InsertMBB = Def->getParent();
1993       InsertPt = InsertMBB->SkipPHIsLabelsAndDebug(++Def->getIterator());
1994     } else {
1995       InsertMBB = PHI.getOperand(MO.getOperandNo() + 1).getMBB();
1996       InsertPt = InsertMBB->getFirstTerminator();
1997     }
1998 
1999     Register NewReg = MRI->createVirtualRegister(ARC);
2000     MachineInstr *MI = BuildMI(*InsertMBB, InsertPt, PHI.getDebugLoc(),
2001                                TII->get(CopyOpc), NewReg)
2002                            .addReg(Reg);
2003     MO.setReg(NewReg);
2004 
2005     (void)MI;
2006     LLVM_DEBUG(dbgs() << "  Created COPY: " << *MI);
2007   }
2008 
2009   // Replace the PHI's result with a new register.
2010   Register NewReg = MRI->createVirtualRegister(ARC);
2011   PHI.getOperand(0).setReg(NewReg);
2012 
2013   // COPY that new register back to the original PhiOut register. This COPY will
2014   // usually be folded out later.
2015   MachineBasicBlock *MBB = PHI.getParent();
2016   BuildMI(*MBB, MBB->getFirstNonPHI(), PHI.getDebugLoc(),
2017           TII->get(AMDGPU::COPY), PhiOut)
2018       .addReg(NewReg);
2019 
2020   LLVM_DEBUG(dbgs() << "  Done: Folded " << PHI);
2021   return true;
2022 }
2023 
2024 // Attempt to convert a VGPR load to an AGPR load.
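// Illustrative example (hypothetical registers):
//   %0:vreg_128 = GLOBAL_LOAD_DWORDX4 %ptr, ...
//   %1:areg_128 = COPY %0
// If every transitive use of %0 ends up in an AGPR (via copies or
// reg_sequences), the load's result class is switched to the equivalent AGPR
// class so the result is produced directly in AGPRs.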
2025 bool SIFoldOperands::tryFoldLoad(MachineInstr &MI) {
2026   assert(MI.mayLoad());
2027   if (!ST->hasGFX90AInsts() || MI.getNumExplicitDefs() != 1)
2028     return false;
2029 
2030   MachineOperand &Def = MI.getOperand(0);
2031   if (!Def.isDef())
2032     return false;
2033 
2034   Register DefReg = Def.getReg();
2035 
2036   if (DefReg.isPhysical() || !TRI->isVGPR(*MRI, DefReg))
2037     return false;
2038 
2039   SmallVector<const MachineInstr*, 8> Users;
2040   SmallVector<Register, 8> MoveRegs;
2041   for (const MachineInstr &I : MRI->use_nodbg_instructions(DefReg))
2042     Users.push_back(&I);
2043 
2044   if (Users.empty())
2045     return false;
2046 
2047   // Check that all uses are a copy to an agpr or a reg_sequence producing an agpr.
2048   while (!Users.empty()) {
2049     const MachineInstr *I = Users.pop_back_val();
2050     if (!I->isCopy() && !I->isRegSequence())
2051       return false;
2052     Register DstReg = I->getOperand(0).getReg();
2053     // Physical registers may have more than one defining instruction.
2054     if (DstReg.isPhysical())
2055       return false;
2056     if (TRI->isAGPR(*MRI, DstReg))
2057       continue;
2058     MoveRegs.push_back(DstReg);
2059     for (const MachineInstr &U : MRI->use_nodbg_instructions(DstReg))
2060       Users.push_back(&U);
2061   }
2062 
2063   const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
2064   MRI->setRegClass(DefReg, TRI->getEquivalentAGPRClass(RC));
2065   if (!TII->isOperandLegal(MI, 0, &Def)) {
2066     MRI->setRegClass(DefReg, RC);
2067     return false;
2068   }
2069 
2070   while (!MoveRegs.empty()) {
2071     Register Reg = MoveRegs.pop_back_val();
2072     MRI->setRegClass(Reg, TRI->getEquivalentAGPRClass(MRI->getRegClass(Reg)));
2073   }
2074 
2075   LLVM_DEBUG(dbgs() << "Folded " << MI);
2076 
2077   return true;
2078 }
2079 
2080 // tryFoldPhiAGPR will aggressively try to create AGPR PHIs.
2081 // For GFX90A and later, this is pretty much always a good thing, but for GFX908
2082 // there are cases where it can create a lot more AGPR-AGPR copies, which are
2083 // expensive on this architecture due to the lack of V_ACCVGPR_MOV.
2084 //
2085 // This function looks at all AGPR PHIs in a basic block and collects their
2086 // operands. Then, it checks for registers that are used more than once across
2087 // all PHIs and caches them in a VGPR. This prevents ExpandPostRAPseudo from
2088 // having to create one VGPR temporary per use, which can get very messy if
2089 // these PHIs come from a broken-up large PHI (e.g. 32 AGPR phis, one per vector
2090 // element).
2091 //
2092 // Example
2093 //      a:
2094 //        %in:agpr_256 = COPY %foo:vgpr_256
2095 //      c:
2096 //        %x:agpr_32 = ..
2097 //      b:
2098 //        %0:areg = PHI %in.sub0:agpr_32, %a, %x, %c
2099 //        %1:areg = PHI %in.sub0:agpr_32, %a, %y, %c
2100 //        %2:areg = PHI %in.sub0:agpr_32, %a, %z, %c
2101 //  =>
2102 //      a:
2103 //        %in:agpr_256 = COPY %foo:vgpr_256
2104 //        %tmp:vgpr_32 = V_ACCVGPR_READ_B32_e64 %in.sub0:agpr_32
2105 //        %tmp_agpr:agpr_32 = COPY %tmp
2106 //      c:
2107 //        %x:agpr_32 = ..
2108 //      b:
2109 //        %0:areg = PHI %tmp_agpr, %a, %x, %c
2110 //        %1:areg = PHI %tmp_agpr, %a, %y, %c
2111 //        %2:areg = PHI %tmp_agpr, %a, %z, %c
2112 bool SIFoldOperands::tryOptimizeAGPRPhis(MachineBasicBlock &MBB) {
2113   // This is only really needed on GFX908 where AGPR-AGPR copies are
2114   // unreasonably difficult.
2115   if (ST->hasGFX90AInsts())
2116     return false;
2117 
2118   // Look at all AGPR Phis and collect the register + subregister used.
2119   DenseMap<std::pair<Register, unsigned>, std::vector<MachineOperand *>>
2120       RegToMO;
2121 
2122   for (auto &MI : MBB) {
2123     if (!MI.isPHI())
2124       break;
2125 
2126     if (!TRI->isAGPR(*MRI, MI.getOperand(0).getReg()))
2127       continue;
2128 
2129     for (unsigned K = 1; K < MI.getNumOperands(); K += 2) {
2130       MachineOperand &PhiMO = MI.getOperand(K);
2131       if (!PhiMO.getSubReg())
2132         continue;
2133       RegToMO[{PhiMO.getReg(), PhiMO.getSubReg()}].push_back(&PhiMO);
2134     }
2135   }
2136 
2137   // For all (Reg, SubReg) pairs that are used more than once, cache the value in
2138   // a VGPR.
2139   bool Changed = false;
2140   for (const auto &[Entry, MOs] : RegToMO) {
2141     if (MOs.size() == 1)
2142       continue;
2143 
2144     const auto [Reg, SubReg] = Entry;
2145     MachineInstr *Def = MRI->getVRegDef(Reg);
2146     MachineBasicBlock *DefMBB = Def->getParent();
2147 
2148     // Create a copy in a VGPR using V_ACCVGPR_READ_B32_e64 so it's not folded
2149     // out.
2150     const TargetRegisterClass *ARC = getRegOpRC(*MRI, *TRI, *MOs.front());
2151     Register TempVGPR =
2152         MRI->createVirtualRegister(TRI->getEquivalentVGPRClass(ARC));
2153     MachineInstr *VGPRCopy =
2154         BuildMI(*DefMBB, ++Def->getIterator(), Def->getDebugLoc(),
2155                 TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TempVGPR)
2156             .addReg(Reg, /* flags */ 0, SubReg);
2157 
2158     // Copy back to an AGPR and use that instead of the AGPR subreg in all MOs.
2159     Register TempAGPR = MRI->createVirtualRegister(ARC);
2160     BuildMI(*DefMBB, ++VGPRCopy->getIterator(), Def->getDebugLoc(),
2161             TII->get(AMDGPU::COPY), TempAGPR)
2162         .addReg(TempVGPR);
2163 
2164     LLVM_DEBUG(dbgs() << "Caching AGPR into VGPR: " << *VGPRCopy);
2165     for (MachineOperand *MO : MOs) {
2166       MO->setReg(TempAGPR);
2167       MO->setSubReg(AMDGPU::NoSubRegister);
2168       LLVM_DEBUG(dbgs() << "  Changed PHI Operand: " << *MO << "\n");
2169     }
2170 
2171     Changed = true;
2172   }
2173 
2174   return Changed;
2175 }
2176 
2177 bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
2178   if (skipFunction(MF.getFunction()))
2179     return false;
2180 
2181   MRI = &MF.getRegInfo();
2182   ST = &MF.getSubtarget<GCNSubtarget>();
2183   TII = ST->getInstrInfo();
2184   TRI = &TII->getRegisterInfo();
2185   MFI = MF.getInfo<SIMachineFunctionInfo>();
2186 
2187   // omod is ignored by hardware if IEEE bit is enabled. omod also does not
2188   // correctly handle signed zeros.
2189   //
2190   // FIXME: Also need to check strictfp
2191   bool IsIEEEMode = MFI->getMode().IEEE;
2192   bool HasNSZ = MFI->hasNoSignedZerosFPMath();
2193 
2194   bool Changed = false;
2195   for (MachineBasicBlock *MBB : depth_first(&MF)) {
2196     MachineOperand *CurrentKnownM0Val = nullptr;
2197     for (auto &MI : make_early_inc_range(*MBB)) {
2198       Changed |= tryFoldCndMask(MI);
2199 
2200       if (tryFoldZeroHighBits(MI)) {
2201         Changed = true;
2202         continue;
2203       }
2204 
2205       if (MI.isRegSequence() && tryFoldRegSequence(MI)) {
2206         Changed = true;
2207         continue;
2208       }
2209 
2210       if (MI.isPHI() && tryFoldPhiAGPR(MI)) {
2211         Changed = true;
2212         continue;
2213       }
2214 
2215       if (MI.mayLoad() && tryFoldLoad(MI)) {
2216         Changed = true;
2217         continue;
2218       }
2219 
2220       if (TII->isFoldableCopy(MI)) {
2221         Changed |= tryFoldFoldableCopy(MI, CurrentKnownM0Val);
2222         continue;
2223       }
2224 
2225       // Saw an unknown clobber of m0, so we no longer know what it is.
2226       if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))
2227         CurrentKnownM0Val = nullptr;
2228 
2229       // TODO: Omod might be OK if there is NSZ only on the source
2230       // instruction, and not the omod multiply.
2231       if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) ||
2232           !tryFoldOMod(MI))
2233         Changed |= tryFoldClamp(MI);
2234     }
2235 
2236     Changed |= tryOptimizeAGPRPhis(*MBB);
2237   }
2238 
2239   return Changed;
2240 }
2241