xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp (revision 13ec1e3155c7e9bf037b12af186351b7fa9b9450)
1 //===-- SIPreEmitPeephole.cpp ------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This pass performs the peephole optimizations before code emission.
11 ///
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPU.h"
15 #include "GCNSubtarget.h"
16 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
17 #include "llvm/CodeGen/MachineFunctionPass.h"
18 
19 using namespace llvm;
20 
21 #define DEBUG_TYPE "si-pre-emit-peephole"
22 
23 static unsigned SkipThreshold;
24 
25 static cl::opt<unsigned, true> SkipThresholdFlag(
26     "amdgpu-skip-threshold", cl::Hidden,
27     cl::desc(
28         "Number of instructions before jumping over divergent control flow"),
29     cl::location(SkipThreshold), cl::init(12));
30 
31 namespace {
32 
33 class SIPreEmitPeephole : public MachineFunctionPass {
34 private:
35   const SIInstrInfo *TII = nullptr;
36   const SIRegisterInfo *TRI = nullptr;
37 
38   bool optimizeVccBranch(MachineInstr &MI) const;
39   bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const;
40   bool getBlockDestinations(MachineBasicBlock &SrcMBB,
41                             MachineBasicBlock *&TrueMBB,
42                             MachineBasicBlock *&FalseMBB,
43                             SmallVectorImpl<MachineOperand> &Cond);
44   bool mustRetainExeczBranch(const MachineBasicBlock &From,
45                              const MachineBasicBlock &To) const;
46   bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);
47 
48 public:
49   static char ID;
50 
51   SIPreEmitPeephole() : MachineFunctionPass(ID) {
52     initializeSIPreEmitPeepholePass(*PassRegistry::getPassRegistry());
53   }
54 
55   bool runOnMachineFunction(MachineFunction &MF) override;
56 };
57 
58 } // End anonymous namespace.
59 
60 INITIALIZE_PASS(SIPreEmitPeephole, DEBUG_TYPE,
61                 "SI peephole optimizations", false, false)
62 
63 char SIPreEmitPeephole::ID = 0;
64 
65 char &llvm::SIPreEmitPeepholeID = SIPreEmitPeephole::ID;
66 
67 bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
68   // Match:
69   // sreg = -1 or 0
70   // vcc = S_AND_B64 exec, sreg or S_ANDN2_B64 exec, sreg
71   // S_CBRANCH_VCC[N]Z
72   // =>
73   // S_CBRANCH_EXEC[N]Z
74   // We end up with this pattern sometimes after basic block placement.
75   // It happens while combining a block which assigns -1 or 0 to a saved mask
76   // and another block which consumes that saved mask and then a branch.
77   bool Changed = false;
78   MachineBasicBlock &MBB = *MI.getParent();
79   const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
80   const bool IsWave32 = ST.isWave32();
81   const unsigned CondReg = TRI->getVCC();
82   const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
83   const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
84   const unsigned AndN2 = IsWave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
85   const unsigned Mov = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
86 
87   MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
88                                       E = MBB.rend();
89   bool ReadsCond = false;
90   unsigned Threshold = 5;
91   for (++A; A != E; ++A) {
92     if (!--Threshold)
93       return false;
94     if (A->modifiesRegister(ExecReg, TRI))
95       return false;
96     if (A->modifiesRegister(CondReg, TRI)) {
97       if (!A->definesRegister(CondReg, TRI) ||
98           (A->getOpcode() != And && A->getOpcode() != AndN2))
99         return false;
100       break;
101     }
102     ReadsCond |= A->readsRegister(CondReg, TRI);
103   }
104   if (A == E)
105     return false;
106 
107   MachineOperand &Op1 = A->getOperand(1);
108   MachineOperand &Op2 = A->getOperand(2);
109   if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) {
110     TII->commuteInstruction(*A);
111     Changed = true;
112   }
113   if (Op1.getReg() != ExecReg)
114     return Changed;
115   if (Op2.isImm() && !(Op2.getImm() == -1 || Op2.getImm() == 0))
116     return Changed;
117 
118   int64_t MaskValue = 0;
119   Register SReg;
120   if (Op2.isReg()) {
121     SReg = Op2.getReg();
122     auto M = std::next(A);
123     bool ReadsSreg = false;
124     for (; M != E; ++M) {
125       if (M->definesRegister(SReg, TRI))
126         break;
127       if (M->modifiesRegister(SReg, TRI))
128         return Changed;
129       ReadsSreg |= M->readsRegister(SReg, TRI);
130     }
131     if (M == E || !M->isMoveImmediate() || !M->getOperand(1).isImm() ||
132         (M->getOperand(1).getImm() != -1 && M->getOperand(1).getImm() != 0))
133       return Changed;
134     MaskValue = M->getOperand(1).getImm();
135     // First if sreg is only used in the AND instruction fold the immediate
136     // into into the AND.
137     if (!ReadsSreg && Op2.isKill()) {
138       A->getOperand(2).ChangeToImmediate(MaskValue);
139       M->eraseFromParent();
140     }
141   } else if (Op2.isImm()) {
142     MaskValue = Op2.getImm();
143   } else {
144     llvm_unreachable("Op2 must be register or immediate");
145   }
146 
147   // Invert mask for s_andn2
148   assert(MaskValue == 0 || MaskValue == -1);
149   if (A->getOpcode() == AndN2)
150     MaskValue = ~MaskValue;
151 
152   if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC)) {
153     if (!MI.killsRegister(CondReg, TRI)) {
154       // Replace AND with MOV
155       if (MaskValue == 0) {
156         BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
157             .addImm(0);
158       } else {
159         BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
160             .addReg(ExecReg);
161       }
162     }
163     // Remove AND instruction
164     A->eraseFromParent();
165   }
166 
167   bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
168   if (SReg == ExecReg) {
169     // EXEC is updated directly
170     if (IsVCCZ) {
171       MI.eraseFromParent();
172       return true;
173     }
174     MI.setDesc(TII->get(AMDGPU::S_BRANCH));
175   } else if (IsVCCZ && MaskValue == 0) {
176     // Will always branch
177     // Remove all succesors shadowed by new unconditional branch
178     MachineBasicBlock *Parent = MI.getParent();
179     SmallVector<MachineInstr *, 4> ToRemove;
180     bool Found = false;
181     for (MachineInstr &Term : Parent->terminators()) {
182       if (Found) {
183         if (Term.isBranch())
184           ToRemove.push_back(&Term);
185       } else {
186         Found = Term.isIdenticalTo(MI);
187       }
188     }
189     assert(Found && "conditional branch is not terminator");
190     for (auto BranchMI : ToRemove) {
191       MachineOperand &Dst = BranchMI->getOperand(0);
192       assert(Dst.isMBB() && "destination is not basic block");
193       Parent->removeSuccessor(Dst.getMBB());
194       BranchMI->eraseFromParent();
195     }
196 
197     if (MachineBasicBlock *Succ = Parent->getFallThrough()) {
198       Parent->removeSuccessor(Succ);
199     }
200 
201     // Rewrite to unconditional branch
202     MI.setDesc(TII->get(AMDGPU::S_BRANCH));
203   } else if (!IsVCCZ && MaskValue == 0) {
204     // Will never branch
205     MachineOperand &Dst = MI.getOperand(0);
206     assert(Dst.isMBB() && "destination is not basic block");
207     MI.getParent()->removeSuccessor(Dst.getMBB());
208     MI.eraseFromParent();
209     return true;
210   } else if (MaskValue == -1) {
211     // Depends only on EXEC
212     MI.setDesc(
213         TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ));
214   }
215 
216   MI.RemoveOperand(MI.findRegisterUseOperandIdx(CondReg, false /*Kill*/, TRI));
217   MI.addImplicitDefUseOperands(*MBB.getParent());
218 
219   return true;
220 }
221 
222 bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
223                                        MachineInstr &MI) const {
224   MachineBasicBlock &MBB = *MI.getParent();
225   const MachineFunction &MF = *MBB.getParent();
226   const MachineRegisterInfo &MRI = MF.getRegInfo();
227   MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
228   Register IdxReg = Idx->isReg() ? Idx->getReg() : Register();
229   SmallVector<MachineInstr *, 4> ToRemove;
230   bool IdxOn = true;
231 
232   if (!MI.isIdenticalTo(First))
233     return false;
234 
235   // Scan back to find an identical S_SET_GPR_IDX_ON
236   for (MachineBasicBlock::instr_iterator I = std::next(First.getIterator()),
237                                          E = MI.getIterator();
238        I != E; ++I) {
239     if (I->isBundle())
240       continue;
241     switch (I->getOpcode()) {
242     case AMDGPU::S_SET_GPR_IDX_MODE:
243       return false;
244     case AMDGPU::S_SET_GPR_IDX_OFF:
245       IdxOn = false;
246       ToRemove.push_back(&*I);
247       break;
248     default:
249       if (I->modifiesRegister(AMDGPU::M0, TRI))
250         return false;
251       if (IdxReg && I->modifiesRegister(IdxReg, TRI))
252         return false;
253       if (llvm::any_of(I->operands(),
254                        [&MRI, this](const MachineOperand &MO) {
255                          return MO.isReg() &&
256                                 TRI->isVectorRegister(MRI, MO.getReg());
257                        })) {
258         // The only exception allowed here is another indirect vector move
259         // with the same mode.
260         if (!IdxOn ||
261             !((I->getOpcode() == AMDGPU::V_MOV_B32_e32 &&
262                I->hasRegisterImplicitUseOperand(AMDGPU::M0)) ||
263               I->getOpcode() == AMDGPU::V_MOV_B32_indirect))
264           return false;
265       }
266     }
267   }
268 
269   MI.eraseFromBundle();
270   for (MachineInstr *RI : ToRemove)
271     RI->eraseFromBundle();
272   return true;
273 }
274 
275 bool SIPreEmitPeephole::getBlockDestinations(
276     MachineBasicBlock &SrcMBB, MachineBasicBlock *&TrueMBB,
277     MachineBasicBlock *&FalseMBB, SmallVectorImpl<MachineOperand> &Cond) {
278   if (TII->analyzeBranch(SrcMBB, TrueMBB, FalseMBB, Cond))
279     return false;
280 
281   if (!FalseMBB)
282     FalseMBB = SrcMBB.getNextNode();
283 
284   return true;
285 }
286 
287 bool SIPreEmitPeephole::mustRetainExeczBranch(
288     const MachineBasicBlock &From, const MachineBasicBlock &To) const {
289   unsigned NumInstr = 0;
290   const MachineFunction *MF = From.getParent();
291 
292   for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
293        MBBI != End && MBBI != ToI; ++MBBI) {
294     const MachineBasicBlock &MBB = *MBBI;
295 
296     for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
297          I != E; ++I) {
298       // When a uniform loop is inside non-uniform control flow, the branch
299       // leaving the loop might never be taken when EXEC = 0.
300       // Hence we should retain cbranch out of the loop lest it become infinite.
301       if (I->isConditionalBranch())
302         return true;
303 
304       if (TII->hasUnwantedEffectsWhenEXECEmpty(*I))
305         return true;
306 
307       // These instructions are potentially expensive even if EXEC = 0.
308       if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) ||
309           TII->isDS(*I) || I->getOpcode() == AMDGPU::S_WAITCNT)
310         return true;
311 
312       ++NumInstr;
313       if (NumInstr >= SkipThreshold)
314         return true;
315     }
316   }
317 
318   return false;
319 }
320 
321 // Returns true if the skip branch instruction is removed.
322 bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
323                                           MachineBasicBlock &SrcMBB) {
324   MachineBasicBlock *TrueMBB = nullptr;
325   MachineBasicBlock *FalseMBB = nullptr;
326   SmallVector<MachineOperand, 1> Cond;
327 
328   if (!getBlockDestinations(SrcMBB, TrueMBB, FalseMBB, Cond))
329     return false;
330 
331   // Consider only the forward branches.
332   if ((SrcMBB.getNumber() >= TrueMBB->getNumber()) ||
333       mustRetainExeczBranch(*FalseMBB, *TrueMBB))
334     return false;
335 
336   LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI);
337   MI.eraseFromParent();
338   SrcMBB.removeSuccessor(TrueMBB);
339 
340   return true;
341 }
342 
343 bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) {
344   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
345   TII = ST.getInstrInfo();
346   TRI = &TII->getRegisterInfo();
347   bool Changed = false;
348 
349   MF.RenumberBlocks();
350 
351   for (MachineBasicBlock &MBB : MF) {
352     MachineBasicBlock::iterator TermI = MBB.getFirstTerminator();
353     // Check first terminator for branches to optimize
354     if (TermI != MBB.end()) {
355       MachineInstr &MI = *TermI;
356       switch (MI.getOpcode()) {
357       case AMDGPU::S_CBRANCH_VCCZ:
358       case AMDGPU::S_CBRANCH_VCCNZ:
359         Changed |= optimizeVccBranch(MI);
360         break;
361       case AMDGPU::S_CBRANCH_EXECZ:
362         Changed |= removeExeczBranch(MI, MBB);
363         break;
364       }
365     }
366 
367     if (!ST.hasVGPRIndexMode())
368       continue;
369 
370     MachineInstr *SetGPRMI = nullptr;
371     const unsigned Threshold = 20;
372     unsigned Count = 0;
373     // Scan the block for two S_SET_GPR_IDX_ON instructions to see if a
374     // second is not needed. Do expensive checks in the optimizeSetGPR()
375     // and limit the distance to 20 instructions for compile time purposes.
376     // Note: this needs to work on bundles as S_SET_GPR_IDX* instructions
377     // may be bundled with the instructions they modify.
378     for (auto &MI :
379          make_early_inc_range(make_range(MBB.instr_begin(), MBB.instr_end()))) {
380       if (Count == Threshold)
381         SetGPRMI = nullptr;
382       else
383         ++Count;
384 
385       if (MI.getOpcode() != AMDGPU::S_SET_GPR_IDX_ON)
386         continue;
387 
388       Count = 0;
389       if (!SetGPRMI) {
390         SetGPRMI = &MI;
391         continue;
392       }
393 
394       if (optimizeSetGPR(*SetGPRMI, MI))
395         Changed = true;
396       else
397         SetGPRMI = &MI;
398     }
399   }
400 
401   return Changed;
402 }
403