xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp (revision 9c77fb6aaa366cbabc80ee1b834bcfe4df135491)
1 //===-- SIPreEmitPeephole.cpp ------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This pass performs the peephole optimizations before code emission.
11 ///
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPU.h"
15 #include "GCNSubtarget.h"
16 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
17 #include "llvm/CodeGen/MachineFunctionPass.h"
18 #include "llvm/CodeGen/TargetSchedule.h"
19 #include "llvm/Support/BranchProbability.h"
20 
21 using namespace llvm;
22 
23 #define DEBUG_TYPE "si-pre-emit-peephole"
24 
25 namespace {
26 
/// Pre-emission peephole optimizer for SI/GCN machine code.
///
/// Three independent rewrites are performed (see run()):
///   * S_CBRANCH_VCC[N]Z branches whose condition is an AND with EXEC are
///     simplified (optimizeVccBranch),
///   * redundant S_SET_GPR_IDX_ON instructions are removed (optimizeSetGPR),
///   * unprofitable S_CBRANCH_EXECZ skip branches are deleted
///     (removeExeczBranch).
class SIPreEmitPeephole {
private:
  const SIInstrInfo *TII = nullptr;    // Subtarget instruction info; set in run().
  const SIRegisterInfo *TRI = nullptr; // Subtarget register info; set in run().

  /// Simplify a S_CBRANCH_VCC[N]Z whose VCC value comes from an
  /// S_AND/S_ANDN2 with EXEC. Returns true if \p MI or nearby
  /// instructions were changed.
  bool optimizeVccBranch(MachineInstr &MI) const;
  /// Remove \p MI when it is an S_SET_GPR_IDX_ON identical to the earlier
  /// \p First and the index state is not disturbed in between.
  bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const;
  /// Analyze \p SrcMBB's terminator; on success fills in the taken
  /// (\p TrueMBB) and fall-through (\p FalseMBB) destinations.
  bool getBlockDestinations(MachineBasicBlock &SrcMBB,
                            MachineBasicBlock *&TrueMBB,
                            MachineBasicBlock *&FalseMBB,
                            SmallVectorImpl<MachineOperand> &Cond);
  /// Returns true when the execz skip branch \p Branch over the blocks
  /// [\p From, \p To) must be kept for correctness or profitability.
  bool mustRetainExeczBranch(const MachineInstr &Branch,
                             const MachineBasicBlock &From,
                             const MachineBasicBlock &To) const;
  /// Delete the S_CBRANCH_EXECZ \p MI when legal and profitable.
  bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);

public:
  /// Entry point shared by the legacy and new pass managers.
  /// Returns true if \p MF was modified.
  bool run(MachineFunction &MF);
};
46 
/// Legacy pass-manager wrapper; all work is delegated to SIPreEmitPeephole.
class SIPreEmitPeepholeLegacy : public MachineFunctionPass {
public:
  static char ID; // Pass identification, replacement for typeid.

  SIPreEmitPeepholeLegacy() : MachineFunctionPass(ID) {
    initializeSIPreEmitPeepholeLegacyPass(*PassRegistry::getPassRegistry());
  }

  /// Run the pass-manager-agnostic implementation on \p MF.
  bool runOnMachineFunction(MachineFunction &MF) override {
    return SIPreEmitPeephole().run(MF);
  }
};
59 
60 } // End anonymous namespace.
61 
// Register the legacy pass under the DEBUG_TYPE name so it can be created
// and scheduled by the pass infrastructure.
INITIALIZE_PASS(SIPreEmitPeepholeLegacy, DEBUG_TYPE,
                "SI peephole optimizations", false, false)

char SIPreEmitPeepholeLegacy::ID = 0;

// Externally visible handle used by the AMDGPU pass pipeline to refer to
// this pass by ID.
char &llvm::SIPreEmitPeepholeID = SIPreEmitPeepholeLegacy::ID;
68 
bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
  // Match:
  // sreg = -1 or 0
  // vcc = S_AND_B64 exec, sreg or S_ANDN2_B64 exec, sreg
  // S_CBRANCH_VCC[N]Z
  // =>
  // S_CBRANCH_EXEC[N]Z
  // We end up with this pattern sometimes after basic block placement.
  // It happens while combining a block which assigns -1 or 0 to a saved mask
  // and another block which consumes that saved mask and then a branch.
  //
  // While searching this also performs the following substitution:
  // vcc = V_CMP
  // vcc = S_AND exec, vcc
  // S_CBRANCH_VCC[N]Z
  // =>
  // vcc = V_CMP
  // S_CBRANCH_VCC[N]Z

  bool Changed = false;
  MachineBasicBlock &MBB = *MI.getParent();
  const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
  const bool IsWave32 = ST.isWave32();
  // Wave-size dependent registers and opcodes used for matching/rewriting.
  const unsigned CondReg = TRI->getVCC();
  const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
  const unsigned AndN2 = IsWave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
  const unsigned Mov = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;

  // Walk backwards from the branch looking for the S_AND/S_ANDN2 that
  // defines VCC. Give up after a few instructions, or if EXEC is modified
  // before the definition is found.
  MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
                                      E = MBB.rend();
  bool ReadsCond = false;
  unsigned Threshold = 5;
  for (++A; A != E; ++A) {
    if (!--Threshold)
      return false;
    if (A->modifiesRegister(ExecReg, TRI))
      return false;
    if (A->modifiesRegister(CondReg, TRI)) {
      // Only a full definition by the expected AND/ANDN2 opcode is usable;
      // a partial modification or any other opcode defeats the match.
      if (!A->definesRegister(CondReg, TRI) ||
          (A->getOpcode() != And && A->getOpcode() != AndN2))
        return false;
      break;
    }
    // Remember whether VCC is read between its definition and the branch;
    // if so the AND cannot simply be deleted later.
    ReadsCond |= A->readsRegister(CondReg, TRI);
  }
  if (A == E)
    return false;

  // Canonicalize the AND so that EXEC is the first source operand.
  MachineOperand &Op1 = A->getOperand(1);
  MachineOperand &Op2 = A->getOperand(2);
  if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) {
    TII->commuteInstruction(*A);
    Changed = true;
  }
  if (Op1.getReg() != ExecReg)
    return Changed;
  // An immediate second operand is only interesting if it is an all-ones or
  // all-zeros mask.
  if (Op2.isImm() && !(Op2.getImm() == -1 || Op2.getImm() == 0))
    return Changed;

  // Determine the constant mask value ANDed with EXEC: either the immediate
  // operand itself, or the immediate of the S_MOV that defines SReg.
  int64_t MaskValue = 0;
  Register SReg;
  if (Op2.isReg()) {
    SReg = Op2.getReg();
    // Continue scanning backwards (from the AND) for the definition of SReg.
    auto M = std::next(A);
    bool ReadsSreg = false;
    bool ModifiesExec = false;
    for (; M != E; ++M) {
      if (M->definesRegister(SReg, TRI))
        break;
      if (M->modifiesRegister(SReg, TRI))
        return Changed;
      ReadsSreg |= M->readsRegister(SReg, TRI);
      ModifiesExec |= M->modifiesRegister(ExecReg, TRI);
    }
    if (M == E)
      return Changed;
    // If SReg is VCC and SReg definition is a VALU comparison.
    // This means S_AND with EXEC is not required.
    // Erase the S_AND and return.
    // Note: isVOPC is used instead of isCompare to catch V_CMP_CLASS
    if (A->getOpcode() == And && SReg == CondReg && !ModifiesExec &&
        TII->isVOPC(*M)) {
      A->eraseFromParent();
      return true;
    }
    // Otherwise the definition must be a move of the constant -1 or 0.
    if (!M->isMoveImmediate() || !M->getOperand(1).isImm() ||
        (M->getOperand(1).getImm() != -1 && M->getOperand(1).getImm() != 0))
      return Changed;
    MaskValue = M->getOperand(1).getImm();
    // First if sreg is only used in the AND instruction fold the immediate
    // into the AND.
    if (!ReadsSreg && Op2.isKill()) {
      A->getOperand(2).ChangeToImmediate(MaskValue);
      M->eraseFromParent();
    }
  } else if (Op2.isImm()) {
    MaskValue = Op2.getImm();
  } else {
    llvm_unreachable("Op2 must be register or immediate");
  }

  // Invert mask for s_andn2
  assert(MaskValue == 0 || MaskValue == -1);
  if (A->getOpcode() == AndN2)
    MaskValue = ~MaskValue;

  // If VCC is not read in between and the AND's SCC result is dead, the AND
  // can be removed, replaced by a MOV only if the branch still reads VCC.
  if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC, /*TRI=*/nullptr)) {
    if (!MI.killsRegister(CondReg, TRI)) {
      // Replace AND with MOV
      if (MaskValue == 0) {
        BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
            .addImm(0);
      } else {
        BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
            .addReg(ExecReg);
      }
    }
    // Remove AND instruction
    A->eraseFromParent();
  }

  // Rewrite the branch itself based on the now-known mask value.
  bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
  if (SReg == ExecReg) {
    // EXEC is updated directly
    if (IsVCCZ) {
      MI.eraseFromParent();
      return true;
    }
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else if (IsVCCZ && MaskValue == 0) {
    // Will always branch
    // Remove all successors shadowed by new unconditional branch
    MachineBasicBlock *Parent = MI.getParent();
    SmallVector<MachineInstr *, 4> ToRemove;
    bool Found = false;
    for (MachineInstr &Term : Parent->terminators()) {
      if (Found) {
        if (Term.isBranch())
          ToRemove.push_back(&Term);
      } else {
        Found = Term.isIdenticalTo(MI);
      }
    }
    assert(Found && "conditional branch is not terminator");
    for (auto *BranchMI : ToRemove) {
      MachineOperand &Dst = BranchMI->getOperand(0);
      assert(Dst.isMBB() && "destination is not basic block");
      Parent->removeSuccessor(Dst.getMBB());
      BranchMI->eraseFromParent();
    }

    // The fall-through successor is also unreachable past an unconditional
    // branch.
    if (MachineBasicBlock *Succ = Parent->getFallThrough()) {
      Parent->removeSuccessor(Succ);
    }

    // Rewrite to unconditional branch
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else if (!IsVCCZ && MaskValue == 0) {
    // Will never branch
    MachineOperand &Dst = MI.getOperand(0);
    assert(Dst.isMBB() && "destination is not basic block");
    MI.getParent()->removeSuccessor(Dst.getMBB());
    MI.eraseFromParent();
    return true;
  } else if (MaskValue == -1) {
    // Depends only on EXEC
    MI.setDesc(
        TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ));
  }

  // The rewritten branch no longer reads VCC: drop the stale use operand and
  // rebuild the implicit operands for the (possibly new) opcode.
  MI.removeOperand(MI.findRegisterUseOperandIdx(CondReg, TRI, false /*Kill*/));
  MI.addImplicitDefUseOperands(*MBB.getParent());

  return true;
}
245 
/// Remove a redundant S_SET_GPR_IDX_ON: if \p MI is identical to the earlier
/// \p First and nothing between them disturbs M0, the index register, or the
/// VGPR indexing mode, then \p MI (plus any intervening S_SET_GPR_IDX_OFF)
/// can be erased. Returns true if instructions were removed.
bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
                                       MachineInstr &MI) const {
  MachineBasicBlock &MBB = *MI.getParent();
  const MachineFunction &MF = *MBB.getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  // IdxReg stays invalid when the index operand is not a register.
  Register IdxReg = Idx->isReg() ? Idx->getReg() : Register();
  SmallVector<MachineInstr *, 4> ToRemove;
  bool IdxOn = true;

  // The second set is only redundant when it establishes exactly the same
  // index and mode as the first.
  if (!MI.isIdenticalTo(First))
    return false;

  // Scan the instructions between the two identical S_SET_GPR_IDX_ON to
  // check that nothing invalidates the state set up by the first one.
  for (MachineBasicBlock::instr_iterator I = std::next(First.getIterator()),
                                         E = MI.getIterator();
       I != E; ++I) {
    if (I->isBundle())
      continue;
    switch (I->getOpcode()) {
    case AMDGPU::S_SET_GPR_IDX_MODE:
      return false;
    case AMDGPU::S_SET_GPR_IDX_OFF:
      // An OFF in between becomes unnecessary once the second ON is gone.
      IdxOn = false;
      ToRemove.push_back(&*I);
      break;
    default:
      if (I->modifiesRegister(AMDGPU::M0, TRI))
        return false;
      if (IdxReg && I->modifiesRegister(IdxReg, TRI))
        return false;
      if (llvm::any_of(I->operands(),
                       [&MRI, this](const MachineOperand &MO) {
                         return MO.isReg() &&
                                TRI->isVectorRegister(MRI, MO.getReg());
                       })) {
        // The only exception allowed here is another indirect vector move
        // with the same mode.
        if (!IdxOn || !(I->getOpcode() == AMDGPU::V_MOV_B32_indirect_write ||
                        I->getOpcode() == AMDGPU::V_MOV_B32_indirect_read))
          return false;
      }
    }
  }

  // Erase the redundant ON and every intervening OFF collected above.
  // eraseFromBundle is used since these may live inside bundles.
  MI.eraseFromBundle();
  for (MachineInstr *RI : ToRemove)
    RI->eraseFromBundle();
  return true;
}
296 
297 bool SIPreEmitPeephole::getBlockDestinations(
298     MachineBasicBlock &SrcMBB, MachineBasicBlock *&TrueMBB,
299     MachineBasicBlock *&FalseMBB, SmallVectorImpl<MachineOperand> &Cond) {
300   if (TII->analyzeBranch(SrcMBB, TrueMBB, FalseMBB, Cond))
301     return false;
302 
303   if (!FalseMBB)
304     FalseMBB = SrcMBB.getNextNode();
305 
306   return true;
307 }
308 
309 namespace {
/// Latency-based model deciding whether removing an s_cbranch_execz "skip"
/// branch is profitable. It accumulates the latency of the instructions
/// that would no longer be skipped and weighs that against the cost of
/// keeping the branch, scaled by the branch probability.
class BranchWeightCostModel {
  const SIInstrInfo &TII;
  const TargetSchedModel &SchedModel;
  // Probability of the successor the branch would skip to (i.e. of execz
  // being false, per the derivation in isProfitable()).
  BranchProbability BranchProb;
  // Fixed cost charged when the branch is present but not taken.
  static constexpr uint64_t BranchNotTakenCost = 1;
  // Latency of the branch instruction itself when it is taken.
  uint64_t BranchTakenCost;
  // Running latency total of the 'then' instructions considered so far.
  uint64_t ThenCyclesCost = 0;

public:
  BranchWeightCostModel(const SIInstrInfo &TII, const MachineInstr &Branch,
                        const MachineBasicBlock &Succ)
      : TII(TII), SchedModel(TII.getSchedModel()) {
    const MachineBasicBlock &Head = *Branch.getParent();
    const auto *FromIt = find(Head.successors(), &Succ);
    assert(FromIt != Head.succ_end());

    BranchProb = Head.getSuccProbability(FromIt);
    // Treat an unknown probability as zero rather than propagating it into
    // the arithmetic below.
    if (BranchProb.isUnknown())
      BranchProb = BranchProbability::getZero();
    BranchTakenCost = SchedModel.computeInstrLatency(&Branch);
  }

  /// Account for \p MI and return whether removing the branch remains
  /// profitable. Intended to be called once per would-be-skipped
  /// instruction; the accumulated cost is monotonically non-decreasing.
  bool isProfitable(const MachineInstr &MI) {
    // Never trade away a wait count; bail out immediately.
    if (TII.isWaitcnt(MI.getOpcode()))
      return false;

    ThenCyclesCost += SchedModel.computeInstrLatency(&MI);

    // Consider `P = N/D` to be the probability of execz being false (skipping
    // the then-block) The transformation is profitable if always executing the
    // 'then' block is cheaper than executing sometimes 'then' and always
    // executing s_cbranch_execz:
    // * ThenCost <= P*ThenCost + (1-P)*BranchTakenCost + P*BranchNotTakenCost
    // * (1-P) * ThenCost <= (1-P)*BranchTakenCost + P*BranchNotTakenCost
    // * (D-N)/D * ThenCost <= (D-N)/D * BranchTakenCost + N/D *
    // BranchNotTakenCost
    uint64_t Numerator = BranchProb.getNumerator();
    uint64_t Denominator = BranchProb.getDenominator();
    return (Denominator - Numerator) * ThenCyclesCost <=
           ((Denominator - Numerator) * BranchTakenCost +
            Numerator * BranchNotTakenCost);
  }
};
353 
/// Returns true when the s_cbranch_execz \p Branch skipping the block range
/// [\p From, \p To) must be kept, either for correctness (control flow or
/// EXEC=0-sensitive instructions inside the range) or because removal is
/// not profitable under the latency cost model.
bool SIPreEmitPeephole::mustRetainExeczBranch(
    const MachineInstr &Branch, const MachineBasicBlock &From,
    const MachineBasicBlock &To) const {
  assert(is_contained(Branch.getParent()->successors(), &From));
  BranchWeightCostModel CostModel{*TII, Branch, From};

  // Walk the skipped blocks in layout order, stopping at \p To.
  const MachineFunction *MF = From.getParent();
  for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
       MBBI != End && MBBI != ToI; ++MBBI) {
    const MachineBasicBlock &MBB = *MBBI;

    for (const MachineInstr &MI : MBB) {
      // When a uniform loop is inside non-uniform control flow, the branch
      // leaving the loop might never be taken when EXEC = 0.
      // Hence we should retain cbranch out of the loop lest it become infinite.
      if (MI.isConditionalBranch())
        return true;

      // An unconditional branch to somewhere other than the layout successor
      // would leave the skipped region early; keep the skip branch.
      if (MI.isUnconditionalBranch() &&
          TII->getBranchDestBlock(MI) != MBB.getNextNode())
        return true;

      // Meta instructions are free and irrelevant to the cost model.
      if (MI.isMetaInstruction())
        continue;

      // Instructions with unwanted effects when EXEC is empty must never
      // execute unconditionally.
      if (TII->hasUnwantedEffectsWhenEXECEmpty(MI))
        return true;

      // Give up as soon as the accumulated latency makes removal a loss.
      if (!CostModel.isProfitable(MI))
        return true;
    }
  }

  return false;
}
389 } // namespace
390 
391 // Returns true if the skip branch instruction is removed.
392 bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
393                                           MachineBasicBlock &SrcMBB) {
394 
395   if (!TII->getSchedModel().hasInstrSchedModel())
396     return false;
397 
398   MachineBasicBlock *TrueMBB = nullptr;
399   MachineBasicBlock *FalseMBB = nullptr;
400   SmallVector<MachineOperand, 1> Cond;
401 
402   if (!getBlockDestinations(SrcMBB, TrueMBB, FalseMBB, Cond))
403     return false;
404 
405   // Consider only the forward branches.
406   if (SrcMBB.getNumber() >= TrueMBB->getNumber())
407     return false;
408 
409   // Consider only when it is legal and profitable
410   if (mustRetainExeczBranch(MI, *FalseMBB, *TrueMBB))
411     return false;
412 
413   LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI);
414   MI.eraseFromParent();
415   SrcMBB.removeSuccessor(TrueMBB);
416 
417   return true;
418 }
419 
420 PreservedAnalyses
421 llvm::SIPreEmitPeepholePass::run(MachineFunction &MF,
422                                  MachineFunctionAnalysisManager &MFAM) {
423   if (!SIPreEmitPeephole().run(MF))
424     return PreservedAnalyses::all();
425 
426   return getMachineFunctionPassPreservedAnalyses();
427 }
428 
/// Shared driver for both pass managers. Applies the branch peepholes to the
/// first terminator of each block and scans for redundant S_SET_GPR_IDX_ON
/// pairs. Returns true if \p MF was modified.
bool SIPreEmitPeephole::run(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  bool Changed = false;

  // Refresh block numbers; removeExeczBranch compares them to detect
  // forward branches.
  MF.RenumberBlocks();

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator TermI = MBB.getFirstTerminator();
    // Check first terminator for branches to optimize
    if (TermI != MBB.end()) {
      MachineInstr &MI = *TermI;
      switch (MI.getOpcode()) {
      case AMDGPU::S_CBRANCH_VCCZ:
      case AMDGPU::S_CBRANCH_VCCNZ:
        Changed |= optimizeVccBranch(MI);
        break;
      case AMDGPU::S_CBRANCH_EXECZ:
        Changed |= removeExeczBranch(MI, MBB);
        break;
      }
    }

    // The remaining peephole only applies to subtargets with VGPR indexing
    // mode.
    if (!ST.hasVGPRIndexMode())
      continue;

    MachineInstr *SetGPRMI = nullptr;
    const unsigned Threshold = 20;
    unsigned Count = 0;
    // Scan the block for two S_SET_GPR_IDX_ON instructions to see if a
    // second is not needed. Do expensive checks in the optimizeSetGPR()
    // and limit the distance to 20 instructions for compile time purposes.
    // Note: this needs to work on bundles as S_SET_GPR_IDX* instructions
    // may be bundled with the instructions they modify.
    for (auto &MI : make_early_inc_range(MBB.instrs())) {
      // Forget the current candidate once it is Threshold instructions away.
      if (Count == Threshold)
        SetGPRMI = nullptr;
      else
        ++Count;

      if (MI.getOpcode() != AMDGPU::S_SET_GPR_IDX_ON)
        continue;

      // Restart the distance counter at every S_SET_GPR_IDX_ON.
      Count = 0;
      if (!SetGPRMI) {
        SetGPRMI = &MI;
        continue;
      }

      // If the pair could not be simplified, the later instruction becomes
      // the new candidate.
      if (optimizeSetGPR(*SetGPRMI, MI))
        Changed = true;
      else
        SetGPRMI = &MI;
    }
  }

  return Changed;
}
488