xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp (revision bc5304a006238115291e7568583632889dffbab9)
1 //===-- SIPreEmitPeephole.cpp ------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This pass performs the peephole optimizations before code emission.
11 ///
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPU.h"
15 #include "GCNSubtarget.h"
16 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
17 #include "SIMachineFunctionInfo.h"
18 #include "llvm/CodeGen/MachineFunctionPass.h"
19 
20 using namespace llvm;
21 
22 #define DEBUG_TYPE "si-pre-emit-peephole"
23 
24 namespace {
25 
26 class SIPreEmitPeephole : public MachineFunctionPass {
27 private:
28   const SIInstrInfo *TII = nullptr;
29   const SIRegisterInfo *TRI = nullptr;
30 
31   bool optimizeVccBranch(MachineInstr &MI) const;
32   bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const;
33 
34 public:
35   static char ID;
36 
37   SIPreEmitPeephole() : MachineFunctionPass(ID) {
38     initializeSIPreEmitPeepholePass(*PassRegistry::getPassRegistry());
39   }
40 
41   bool runOnMachineFunction(MachineFunction &MF) override;
42 };
43 
44 } // End anonymous namespace.
45 
46 INITIALIZE_PASS(SIPreEmitPeephole, DEBUG_TYPE,
47                 "SI peephole optimizations", false, false)
48 
49 char SIPreEmitPeephole::ID = 0;
50 
51 char &llvm::SIPreEmitPeepholeID = SIPreEmitPeephole::ID;
52 
53 bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
54   // Match:
55   // sreg = -1 or 0
56   // vcc = S_AND_B64 exec, sreg or S_ANDN2_B64 exec, sreg
57   // S_CBRANCH_VCC[N]Z
58   // =>
59   // S_CBRANCH_EXEC[N]Z
60   // We end up with this pattern sometimes after basic block placement.
61   // It happens while combining a block which assigns -1 or 0 to a saved mask
62   // and another block which consumes that saved mask and then a branch.
63   bool Changed = false;
64   MachineBasicBlock &MBB = *MI.getParent();
65   const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
66   const bool IsWave32 = ST.isWave32();
67   const unsigned CondReg = TRI->getVCC();
68   const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
69   const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
70   const unsigned AndN2 = IsWave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
71   const unsigned Mov = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
72 
73   MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
74                                       E = MBB.rend();
75   bool ReadsCond = false;
76   unsigned Threshold = 5;
77   for (++A; A != E; ++A) {
78     if (!--Threshold)
79       return false;
80     if (A->modifiesRegister(ExecReg, TRI))
81       return false;
82     if (A->modifiesRegister(CondReg, TRI)) {
83       if (!A->definesRegister(CondReg, TRI) ||
84           (A->getOpcode() != And && A->getOpcode() != AndN2))
85         return false;
86       break;
87     }
88     ReadsCond |= A->readsRegister(CondReg, TRI);
89   }
90   if (A == E)
91     return false;
92 
93   MachineOperand &Op1 = A->getOperand(1);
94   MachineOperand &Op2 = A->getOperand(2);
95   if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) {
96     TII->commuteInstruction(*A);
97     Changed = true;
98   }
99   if (Op1.getReg() != ExecReg)
100     return Changed;
101   if (Op2.isImm() && !(Op2.getImm() == -1 || Op2.getImm() == 0))
102     return Changed;
103 
104   int64_t MaskValue = 0;
105   Register SReg;
106   if (Op2.isReg()) {
107     SReg = Op2.getReg();
108     auto M = std::next(A);
109     bool ReadsSreg = false;
110     for (; M != E; ++M) {
111       if (M->definesRegister(SReg, TRI))
112         break;
113       if (M->modifiesRegister(SReg, TRI))
114         return Changed;
115       ReadsSreg |= M->readsRegister(SReg, TRI);
116     }
117     if (M == E || !M->isMoveImmediate() || !M->getOperand(1).isImm() ||
118         (M->getOperand(1).getImm() != -1 && M->getOperand(1).getImm() != 0))
119       return Changed;
120     MaskValue = M->getOperand(1).getImm();
121     // First if sreg is only used in the AND instruction fold the immediate
122     // into into the AND.
123     if (!ReadsSreg && Op2.isKill()) {
124       A->getOperand(2).ChangeToImmediate(MaskValue);
125       M->eraseFromParent();
126     }
127   } else if (Op2.isImm()) {
128     MaskValue = Op2.getImm();
129   } else {
130     llvm_unreachable("Op2 must be register or immediate");
131   }
132 
133   // Invert mask for s_andn2
134   assert(MaskValue == 0 || MaskValue == -1);
135   if (A->getOpcode() == AndN2)
136     MaskValue = ~MaskValue;
137 
138   if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC)) {
139     if (!MI.killsRegister(CondReg, TRI)) {
140       // Replace AND with MOV
141       if (MaskValue == 0) {
142         BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
143             .addImm(0);
144       } else {
145         BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
146             .addReg(ExecReg);
147       }
148     }
149     // Remove AND instruction
150     A->eraseFromParent();
151   }
152 
153   bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
154   if (SReg == ExecReg) {
155     // EXEC is updated directly
156     if (IsVCCZ) {
157       MI.eraseFromParent();
158       return true;
159     }
160     MI.setDesc(TII->get(AMDGPU::S_BRANCH));
161   } else if (IsVCCZ && MaskValue == 0) {
162     // Will always branch
163     // Remove all succesors shadowed by new unconditional branch
164     MachineBasicBlock *Parent = MI.getParent();
165     SmallVector<MachineInstr *, 4> ToRemove;
166     bool Found = false;
167     for (MachineInstr &Term : Parent->terminators()) {
168       if (Found) {
169         if (Term.isBranch())
170           ToRemove.push_back(&Term);
171       } else {
172         Found = Term.isIdenticalTo(MI);
173       }
174     }
175     assert(Found && "conditional branch is not terminator");
176     for (auto BranchMI : ToRemove) {
177       MachineOperand &Dst = BranchMI->getOperand(0);
178       assert(Dst.isMBB() && "destination is not basic block");
179       Parent->removeSuccessor(Dst.getMBB());
180       BranchMI->eraseFromParent();
181     }
182 
183     if (MachineBasicBlock *Succ = Parent->getFallThrough()) {
184       Parent->removeSuccessor(Succ);
185     }
186 
187     // Rewrite to unconditional branch
188     MI.setDesc(TII->get(AMDGPU::S_BRANCH));
189   } else if (!IsVCCZ && MaskValue == 0) {
190     // Will never branch
191     MachineOperand &Dst = MI.getOperand(0);
192     assert(Dst.isMBB() && "destination is not basic block");
193     MI.getParent()->removeSuccessor(Dst.getMBB());
194     MI.eraseFromParent();
195     return true;
196   } else if (MaskValue == -1) {
197     // Depends only on EXEC
198     MI.setDesc(
199         TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ));
200   }
201 
202   MI.RemoveOperand(MI.findRegisterUseOperandIdx(CondReg, false /*Kill*/, TRI));
203   MI.addImplicitDefUseOperands(*MBB.getParent());
204 
205   return true;
206 }
207 
208 bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
209                                        MachineInstr &MI) const {
210   MachineBasicBlock &MBB = *MI.getParent();
211   const MachineFunction &MF = *MBB.getParent();
212   const MachineRegisterInfo &MRI = MF.getRegInfo();
213   MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
214   Register IdxReg = Idx->isReg() ? Idx->getReg() : Register();
215   SmallVector<MachineInstr *, 4> ToRemove;
216   bool IdxOn = true;
217 
218   if (!MI.isIdenticalTo(First))
219     return false;
220 
221   // Scan back to find an identical S_SET_GPR_IDX_ON
222   for (MachineBasicBlock::iterator I = std::next(First.getIterator()),
223        E = MI.getIterator(); I != E; ++I) {
224     switch (I->getOpcode()) {
225     case AMDGPU::S_SET_GPR_IDX_MODE:
226       return false;
227     case AMDGPU::S_SET_GPR_IDX_OFF:
228       IdxOn = false;
229       ToRemove.push_back(&*I);
230       break;
231     default:
232       if (I->modifiesRegister(AMDGPU::M0, TRI))
233         return false;
234       if (IdxReg && I->modifiesRegister(IdxReg, TRI))
235         return false;
236       if (llvm::any_of(I->operands(),
237                        [&MRI, this](const MachineOperand &MO) {
238                          return MO.isReg() &&
239                                 TRI->isVectorRegister(MRI, MO.getReg());
240                        })) {
241         // The only exception allowed here is another indirect vector move
242         // with the same mode.
243         if (!IdxOn ||
244             !((I->getOpcode() == AMDGPU::V_MOV_B32_e32 &&
245                I->hasRegisterImplicitUseOperand(AMDGPU::M0)) ||
246               I->getOpcode() == AMDGPU::V_MOV_B32_indirect))
247           return false;
248       }
249     }
250   }
251 
252   MI.eraseFromParent();
253   for (MachineInstr *RI : ToRemove)
254     RI->eraseFromParent();
255   return true;
256 }
257 
258 bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) {
259   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
260   TII = ST.getInstrInfo();
261   TRI = &TII->getRegisterInfo();
262   MachineBasicBlock *EmptyMBBAtEnd = nullptr;
263   bool Changed = false;
264 
265   for (MachineBasicBlock &MBB : MF) {
266     MachineBasicBlock::iterator MBBE = MBB.getFirstTerminator();
267     MachineBasicBlock::iterator TermI = MBBE;
268     // Check first terminator for VCC branches to optimize
269     if (TermI != MBB.end()) {
270       MachineInstr &MI = *TermI;
271       switch (MI.getOpcode()) {
272       case AMDGPU::S_CBRANCH_VCCZ:
273       case AMDGPU::S_CBRANCH_VCCNZ:
274         Changed |= optimizeVccBranch(MI);
275         continue;
276       default:
277         break;
278       }
279     }
280     // Check all terminators for SI_RETURN_TO_EPILOG
281     // FIXME: This is not an optimization and should be moved somewhere else.
282     while (TermI != MBB.end()) {
283       MachineInstr &MI = *TermI;
284       if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) {
285         assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());
286 
287         // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
288         // because external bytecode will be appended at the end.
289         if (&MBB != &MF.back() || &MI != &MBB.back()) {
290           // SI_RETURN_TO_EPILOG is not the last instruction. Add an empty block
291           // at the end and jump there.
292           if (!EmptyMBBAtEnd) {
293             EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
294             MF.insert(MF.end(), EmptyMBBAtEnd);
295           }
296 
297           MBB.addSuccessor(EmptyMBBAtEnd);
298           BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
299               .addMBB(EmptyMBBAtEnd);
300           MI.eraseFromParent();
301           MBBE = MBB.getFirstTerminator();
302           TermI = MBBE;
303           continue;
304         }
305       }
306       TermI++;
307     }
308 
309     if (!ST.hasVGPRIndexMode())
310       continue;
311 
312     MachineInstr *SetGPRMI = nullptr;
313     const unsigned Threshold = 20;
314     unsigned Count = 0;
315     // Scan the block for two S_SET_GPR_IDX_ON instructions to see if a
316     // second is not needed. Do expensive checks in the optimizeSetGPR()
317     // and limit the distance to 20 instructions for compile time purposes.
318     for (MachineBasicBlock::iterator MBBI = MBB.begin(); MBBI != MBBE; ) {
319       MachineInstr &MI = *MBBI;
320       ++MBBI;
321 
322       if (Count == Threshold)
323         SetGPRMI = nullptr;
324       else
325         ++Count;
326 
327       if (MI.getOpcode() != AMDGPU::S_SET_GPR_IDX_ON)
328         continue;
329 
330       Count = 0;
331       if (!SetGPRMI) {
332         SetGPRMI = &MI;
333         continue;
334       }
335 
336       if (optimizeSetGPR(*SetGPRMI, MI))
337         Changed = true;
338       else
339         SetGPRMI = &MI;
340     }
341   }
342 
343   return Changed;
344 }
345