xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp (revision 357378bbdedf24ce2b90e9bd831af4a9db3ec70a)
1 //===-- SIPreEmitPeephole.cpp ------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This pass performs the peephole optimizations before code emission.
11 ///
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPU.h"
15 #include "GCNSubtarget.h"
16 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
17 #include "llvm/CodeGen/MachineFunctionPass.h"
18 
19 using namespace llvm;
20 
21 #define DEBUG_TYPE "si-pre-emit-peephole"
22 
23 static unsigned SkipThreshold;
24 
25 static cl::opt<unsigned, true> SkipThresholdFlag(
26     "amdgpu-skip-threshold", cl::Hidden,
27     cl::desc(
28         "Number of instructions before jumping over divergent control flow"),
29     cl::location(SkipThreshold), cl::init(12));
30 
31 namespace {
32 
33 class SIPreEmitPeephole : public MachineFunctionPass {
34 private:
35   const SIInstrInfo *TII = nullptr;
36   const SIRegisterInfo *TRI = nullptr;
37 
38   bool optimizeVccBranch(MachineInstr &MI) const;
39   bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const;
40   bool getBlockDestinations(MachineBasicBlock &SrcMBB,
41                             MachineBasicBlock *&TrueMBB,
42                             MachineBasicBlock *&FalseMBB,
43                             SmallVectorImpl<MachineOperand> &Cond);
44   bool mustRetainExeczBranch(const MachineBasicBlock &From,
45                              const MachineBasicBlock &To) const;
46   bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);
47 
48 public:
49   static char ID;
50 
51   SIPreEmitPeephole() : MachineFunctionPass(ID) {
52     initializeSIPreEmitPeepholePass(*PassRegistry::getPassRegistry());
53   }
54 
55   bool runOnMachineFunction(MachineFunction &MF) override;
56 };
57 
58 } // End anonymous namespace.
59 
60 INITIALIZE_PASS(SIPreEmitPeephole, DEBUG_TYPE,
61                 "SI peephole optimizations", false, false)
62 
63 char SIPreEmitPeephole::ID = 0;
64 
65 char &llvm::SIPreEmitPeepholeID = SIPreEmitPeephole::ID;
66 
67 bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
68   // Match:
69   // sreg = -1 or 0
70   // vcc = S_AND_B64 exec, sreg or S_ANDN2_B64 exec, sreg
71   // S_CBRANCH_VCC[N]Z
72   // =>
73   // S_CBRANCH_EXEC[N]Z
74   // We end up with this pattern sometimes after basic block placement.
75   // It happens while combining a block which assigns -1 or 0 to a saved mask
76   // and another block which consumes that saved mask and then a branch.
77   //
78   // While searching this also performs the following substitution:
79   // vcc = V_CMP
80   // vcc = S_AND exec, vcc
81   // S_CBRANCH_VCC[N]Z
82   // =>
83   // vcc = V_CMP
84   // S_CBRANCH_VCC[N]Z
85 
86   bool Changed = false;
87   MachineBasicBlock &MBB = *MI.getParent();
88   const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
89   const bool IsWave32 = ST.isWave32();
90   const unsigned CondReg = TRI->getVCC();
91   const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
92   const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
93   const unsigned AndN2 = IsWave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
94   const unsigned Mov = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
95 
96   MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
97                                       E = MBB.rend();
98   bool ReadsCond = false;
99   unsigned Threshold = 5;
100   for (++A; A != E; ++A) {
101     if (!--Threshold)
102       return false;
103     if (A->modifiesRegister(ExecReg, TRI))
104       return false;
105     if (A->modifiesRegister(CondReg, TRI)) {
106       if (!A->definesRegister(CondReg, TRI) ||
107           (A->getOpcode() != And && A->getOpcode() != AndN2))
108         return false;
109       break;
110     }
111     ReadsCond |= A->readsRegister(CondReg, TRI);
112   }
113   if (A == E)
114     return false;
115 
116   MachineOperand &Op1 = A->getOperand(1);
117   MachineOperand &Op2 = A->getOperand(2);
118   if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) {
119     TII->commuteInstruction(*A);
120     Changed = true;
121   }
122   if (Op1.getReg() != ExecReg)
123     return Changed;
124   if (Op2.isImm() && !(Op2.getImm() == -1 || Op2.getImm() == 0))
125     return Changed;
126 
127   int64_t MaskValue = 0;
128   Register SReg;
129   if (Op2.isReg()) {
130     SReg = Op2.getReg();
131     auto M = std::next(A);
132     bool ReadsSreg = false;
133     bool ModifiesExec = false;
134     for (; M != E; ++M) {
135       if (M->definesRegister(SReg, TRI))
136         break;
137       if (M->modifiesRegister(SReg, TRI))
138         return Changed;
139       ReadsSreg |= M->readsRegister(SReg, TRI);
140       ModifiesExec |= M->modifiesRegister(ExecReg, TRI);
141     }
142     if (M == E)
143       return Changed;
144     // If SReg is VCC and SReg definition is a VALU comparison.
145     // This means S_AND with EXEC is not required.
146     // Erase the S_AND and return.
147     // Note: isVOPC is used instead of isCompare to catch V_CMP_CLASS
148     if (A->getOpcode() == And && SReg == CondReg && !ModifiesExec &&
149         TII->isVOPC(*M)) {
150       A->eraseFromParent();
151       return true;
152     }
153     if (!M->isMoveImmediate() || !M->getOperand(1).isImm() ||
154         (M->getOperand(1).getImm() != -1 && M->getOperand(1).getImm() != 0))
155       return Changed;
156     MaskValue = M->getOperand(1).getImm();
157     // First if sreg is only used in the AND instruction fold the immediate
158     // into the AND.
159     if (!ReadsSreg && Op2.isKill()) {
160       A->getOperand(2).ChangeToImmediate(MaskValue);
161       M->eraseFromParent();
162     }
163   } else if (Op2.isImm()) {
164     MaskValue = Op2.getImm();
165   } else {
166     llvm_unreachable("Op2 must be register or immediate");
167   }
168 
169   // Invert mask for s_andn2
170   assert(MaskValue == 0 || MaskValue == -1);
171   if (A->getOpcode() == AndN2)
172     MaskValue = ~MaskValue;
173 
174   if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC)) {
175     if (!MI.killsRegister(CondReg, TRI)) {
176       // Replace AND with MOV
177       if (MaskValue == 0) {
178         BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
179             .addImm(0);
180       } else {
181         BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
182             .addReg(ExecReg);
183       }
184     }
185     // Remove AND instruction
186     A->eraseFromParent();
187   }
188 
189   bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
190   if (SReg == ExecReg) {
191     // EXEC is updated directly
192     if (IsVCCZ) {
193       MI.eraseFromParent();
194       return true;
195     }
196     MI.setDesc(TII->get(AMDGPU::S_BRANCH));
197   } else if (IsVCCZ && MaskValue == 0) {
198     // Will always branch
199     // Remove all successors shadowed by new unconditional branch
200     MachineBasicBlock *Parent = MI.getParent();
201     SmallVector<MachineInstr *, 4> ToRemove;
202     bool Found = false;
203     for (MachineInstr &Term : Parent->terminators()) {
204       if (Found) {
205         if (Term.isBranch())
206           ToRemove.push_back(&Term);
207       } else {
208         Found = Term.isIdenticalTo(MI);
209       }
210     }
211     assert(Found && "conditional branch is not terminator");
212     for (auto *BranchMI : ToRemove) {
213       MachineOperand &Dst = BranchMI->getOperand(0);
214       assert(Dst.isMBB() && "destination is not basic block");
215       Parent->removeSuccessor(Dst.getMBB());
216       BranchMI->eraseFromParent();
217     }
218 
219     if (MachineBasicBlock *Succ = Parent->getFallThrough()) {
220       Parent->removeSuccessor(Succ);
221     }
222 
223     // Rewrite to unconditional branch
224     MI.setDesc(TII->get(AMDGPU::S_BRANCH));
225   } else if (!IsVCCZ && MaskValue == 0) {
226     // Will never branch
227     MachineOperand &Dst = MI.getOperand(0);
228     assert(Dst.isMBB() && "destination is not basic block");
229     MI.getParent()->removeSuccessor(Dst.getMBB());
230     MI.eraseFromParent();
231     return true;
232   } else if (MaskValue == -1) {
233     // Depends only on EXEC
234     MI.setDesc(
235         TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ));
236   }
237 
238   MI.removeOperand(MI.findRegisterUseOperandIdx(CondReg, false /*Kill*/, TRI));
239   MI.addImplicitDefUseOperands(*MBB.getParent());
240 
241   return true;
242 }
243 
244 bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
245                                        MachineInstr &MI) const {
246   MachineBasicBlock &MBB = *MI.getParent();
247   const MachineFunction &MF = *MBB.getParent();
248   const MachineRegisterInfo &MRI = MF.getRegInfo();
249   MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
250   Register IdxReg = Idx->isReg() ? Idx->getReg() : Register();
251   SmallVector<MachineInstr *, 4> ToRemove;
252   bool IdxOn = true;
253 
254   if (!MI.isIdenticalTo(First))
255     return false;
256 
257   // Scan back to find an identical S_SET_GPR_IDX_ON
258   for (MachineBasicBlock::instr_iterator I = std::next(First.getIterator()),
259                                          E = MI.getIterator();
260        I != E; ++I) {
261     if (I->isBundle())
262       continue;
263     switch (I->getOpcode()) {
264     case AMDGPU::S_SET_GPR_IDX_MODE:
265       return false;
266     case AMDGPU::S_SET_GPR_IDX_OFF:
267       IdxOn = false;
268       ToRemove.push_back(&*I);
269       break;
270     default:
271       if (I->modifiesRegister(AMDGPU::M0, TRI))
272         return false;
273       if (IdxReg && I->modifiesRegister(IdxReg, TRI))
274         return false;
275       if (llvm::any_of(I->operands(),
276                        [&MRI, this](const MachineOperand &MO) {
277                          return MO.isReg() &&
278                                 TRI->isVectorRegister(MRI, MO.getReg());
279                        })) {
280         // The only exception allowed here is another indirect vector move
281         // with the same mode.
282         if (!IdxOn || !(I->getOpcode() == AMDGPU::V_MOV_B32_indirect_write ||
283                         I->getOpcode() == AMDGPU::V_MOV_B32_indirect_read))
284           return false;
285       }
286     }
287   }
288 
289   MI.eraseFromBundle();
290   for (MachineInstr *RI : ToRemove)
291     RI->eraseFromBundle();
292   return true;
293 }
294 
295 bool SIPreEmitPeephole::getBlockDestinations(
296     MachineBasicBlock &SrcMBB, MachineBasicBlock *&TrueMBB,
297     MachineBasicBlock *&FalseMBB, SmallVectorImpl<MachineOperand> &Cond) {
298   if (TII->analyzeBranch(SrcMBB, TrueMBB, FalseMBB, Cond))
299     return false;
300 
301   if (!FalseMBB)
302     FalseMBB = SrcMBB.getNextNode();
303 
304   return true;
305 }
306 
307 bool SIPreEmitPeephole::mustRetainExeczBranch(
308     const MachineBasicBlock &From, const MachineBasicBlock &To) const {
309   unsigned NumInstr = 0;
310   const MachineFunction *MF = From.getParent();
311 
312   for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
313        MBBI != End && MBBI != ToI; ++MBBI) {
314     const MachineBasicBlock &MBB = *MBBI;
315 
316     for (const MachineInstr &MI : MBB) {
317       // When a uniform loop is inside non-uniform control flow, the branch
318       // leaving the loop might never be taken when EXEC = 0.
319       // Hence we should retain cbranch out of the loop lest it become infinite.
320       if (MI.isConditionalBranch())
321         return true;
322 
323       if (MI.isMetaInstruction())
324         continue;
325 
326       if (TII->hasUnwantedEffectsWhenEXECEmpty(MI))
327         return true;
328 
329       // These instructions are potentially expensive even if EXEC = 0.
330       if (TII->isSMRD(MI) || TII->isVMEM(MI) || TII->isFLAT(MI) ||
331           TII->isDS(MI) || MI.getOpcode() == AMDGPU::S_WAITCNT)
332         return true;
333 
334       ++NumInstr;
335       if (NumInstr >= SkipThreshold)
336         return true;
337     }
338   }
339 
340   return false;
341 }
342 
343 // Returns true if the skip branch instruction is removed.
344 bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
345                                           MachineBasicBlock &SrcMBB) {
346   MachineBasicBlock *TrueMBB = nullptr;
347   MachineBasicBlock *FalseMBB = nullptr;
348   SmallVector<MachineOperand, 1> Cond;
349 
350   if (!getBlockDestinations(SrcMBB, TrueMBB, FalseMBB, Cond))
351     return false;
352 
353   // Consider only the forward branches.
354   if ((SrcMBB.getNumber() >= TrueMBB->getNumber()) ||
355       mustRetainExeczBranch(*FalseMBB, *TrueMBB))
356     return false;
357 
358   LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI);
359   MI.eraseFromParent();
360   SrcMBB.removeSuccessor(TrueMBB);
361 
362   return true;
363 }
364 
365 bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) {
366   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
367   TII = ST.getInstrInfo();
368   TRI = &TII->getRegisterInfo();
369   bool Changed = false;
370 
371   MF.RenumberBlocks();
372 
373   for (MachineBasicBlock &MBB : MF) {
374     MachineBasicBlock::iterator TermI = MBB.getFirstTerminator();
375     // Check first terminator for branches to optimize
376     if (TermI != MBB.end()) {
377       MachineInstr &MI = *TermI;
378       switch (MI.getOpcode()) {
379       case AMDGPU::S_CBRANCH_VCCZ:
380       case AMDGPU::S_CBRANCH_VCCNZ:
381         Changed |= optimizeVccBranch(MI);
382         break;
383       case AMDGPU::S_CBRANCH_EXECZ:
384         Changed |= removeExeczBranch(MI, MBB);
385         break;
386       }
387     }
388 
389     if (!ST.hasVGPRIndexMode())
390       continue;
391 
392     MachineInstr *SetGPRMI = nullptr;
393     const unsigned Threshold = 20;
394     unsigned Count = 0;
395     // Scan the block for two S_SET_GPR_IDX_ON instructions to see if a
396     // second is not needed. Do expensive checks in the optimizeSetGPR()
397     // and limit the distance to 20 instructions for compile time purposes.
398     // Note: this needs to work on bundles as S_SET_GPR_IDX* instructions
399     // may be bundled with the instructions they modify.
400     for (auto &MI :
401          make_early_inc_range(make_range(MBB.instr_begin(), MBB.instr_end()))) {
402       if (Count == Threshold)
403         SetGPRMI = nullptr;
404       else
405         ++Count;
406 
407       if (MI.getOpcode() != AMDGPU::S_SET_GPR_IDX_ON)
408         continue;
409 
410       Count = 0;
411       if (!SetGPRMI) {
412         SetGPRMI = &MI;
413         continue;
414       }
415 
416       if (optimizeSetGPR(*SetGPRMI, MI))
417         Changed = true;
418       else
419         SetGPRMI = &MI;
420     }
421   }
422 
423   return Changed;
424 }
425