xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp (revision 1342eb5a832fa10e689a29faab3acb6054e4778c)
1 //===-- SILateBranchLowering.cpp - Final preparation of branches ----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This pass mainly lowers early terminate pseudo instructions.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPU.h"
15 #include "GCNSubtarget.h"
16 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
17 #include "SIMachineFunctionInfo.h"
18 #include "llvm/CodeGen/MachineDominators.h"
19 #include "llvm/CodeGen/MachinePassManager.h"
20 
21 using namespace llvm;
22 
23 #define DEBUG_TYPE "si-late-branch-lowering"
24 
25 namespace {
26 
// Shared implementation of the late branch lowering: lowers early-terminate
// and chain-call pseudos, and cleans up redundant branches / epilog returns.
// Wrapped by both the legacy and new pass-manager entry points below.
class SILateBranchLowering {
private:
  // NOTE(review): TRI is assigned in run() but never read afterwards in this
  // file — possibly dead; confirm before removing.
  const SIRegisterInfo *TRI = nullptr;
  const SIInstrInfo *TII = nullptr;
  MachineDominatorTree *MDT = nullptr;

  // Expand a SI_CS_CHAIN_TC_* pseudo into EXEC setup + SI_TCRETURN.
  void expandChainCall(MachineInstr &MI, const GCNSubtarget &ST,
                       bool DynamicVGPR);
  // Lower a SI_EARLY_TERMINATE_SCC0 into a conditional branch to
  // EarlyExitBlock, splitting the block if needed.
  void earlyTerm(MachineInstr &MI, MachineBasicBlock *EarlyExitBlock);

public:
  SILateBranchLowering(MachineDominatorTree *MDT) : MDT(MDT) {}

  bool run(MachineFunction &MF);

  // Wave-size dependent values, initialized in run():
  // S_MOV_B32/EXEC_LO for wave32, S_MOV_B64/EXEC for wave64.
  unsigned MovOpc;
  Register ExecReg;
};
45 
// Legacy pass-manager wrapper: fetches the dominator tree analysis and
// delegates all work to SILateBranchLowering.
class SILateBranchLoweringLegacy : public MachineFunctionPass {
public:
  static char ID;
  SILateBranchLoweringLegacy() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override {
    auto *MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
    return SILateBranchLowering(MDT).run(MF);
  }

  StringRef getPassName() const override {
    return "SI Final Branch Preparation";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // The dominator tree is required as input and kept up to date by the
    // lowering (see splitBlock / insertEdge calls), so it is preserved.
    AU.addRequired<MachineDominatorTreeWrapperPass>();
    AU.addPreserved<MachineDominatorTreeWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};
66 
67 } // end anonymous namespace
68 
char SILateBranchLoweringLegacy::ID = 0;

// NOTE(review): the registered description string ("SI insert
// s_cbranch_execz instructions") does not match getPassName() — it looks
// like a leftover from a previous incarnation of this pass. Confirm nothing
// keys off the string before updating it.
INITIALIZE_PASS_BEGIN(SILateBranchLoweringLegacy, DEBUG_TYPE,
                      "SI insert s_cbranch_execz instructions", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_END(SILateBranchLoweringLegacy, DEBUG_TYPE,
                    "SI insert s_cbranch_execz instructions", false, false)

// Exported so other code can identify / schedule this pass by ID.
char &llvm::SILateBranchLoweringPassID = SILateBranchLoweringLegacy::ID;
78 
79 static void generateEndPgm(MachineBasicBlock &MBB,
80                            MachineBasicBlock::iterator I, DebugLoc DL,
81                            const SIInstrInfo *TII, MachineFunction &MF) {
82   const Function &F = MF.getFunction();
83   bool IsPS = F.getCallingConv() == CallingConv::AMDGPU_PS;
84 
85   // Check if hardware has been configured to expect color or depth exports.
86   bool HasColorExports = AMDGPU::getHasColorExport(F);
87   bool HasDepthExports = AMDGPU::getHasDepthExport(F);
88   bool HasExports = HasColorExports || HasDepthExports;
89 
90   // Prior to GFX10, hardware always expects at least one export for PS.
91   bool MustExport = !AMDGPU::isGFX10Plus(TII->getSubtarget());
92 
93   if (IsPS && (HasExports || MustExport)) {
94     // Generate "null export" if hardware is expecting PS to export.
95     const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
96     int Target =
97         ST.hasNullExportTarget()
98             ? AMDGPU::Exp::ET_NULL
99             : (HasColorExports ? AMDGPU::Exp::ET_MRT0 : AMDGPU::Exp::ET_MRTZ);
100     BuildMI(MBB, I, DL, TII->get(AMDGPU::EXP_DONE))
101         .addImm(Target)
102         .addReg(AMDGPU::VGPR0, RegState::Undef)
103         .addReg(AMDGPU::VGPR0, RegState::Undef)
104         .addReg(AMDGPU::VGPR0, RegState::Undef)
105         .addReg(AMDGPU::VGPR0, RegState::Undef)
106         .addImm(1)  // vm
107         .addImm(0)  // compr
108         .addImm(0); // en
109   }
110 
111   // s_endpgm
112   BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ENDPGM)).addImm(0);
113 }
114 
115 static void splitBlock(MachineBasicBlock &MBB, MachineInstr &MI,
116                        MachineDominatorTree *MDT) {
117   MachineBasicBlock *SplitBB = MBB.splitAt(MI, /*UpdateLiveIns*/ true);
118 
119   // Update dominator tree
120   using DomTreeT = DomTreeBase<MachineBasicBlock>;
121   SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
122   for (MachineBasicBlock *Succ : SplitBB->successors()) {
123     DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
124     DTUpdates.push_back({DomTreeT::Delete, &MBB, Succ});
125   }
126   DTUpdates.push_back({DomTreeT::Insert, &MBB, SplitBB});
127   MDT->applyUpdates(DTUpdates);
128 }
129 
130 static void copyOpWithoutRegFlags(MachineInstrBuilder &MIB,
131                                   MachineOperand &Op) {
132   if (Op.isReg())
133     MIB.addReg(Op.getReg());
134   else
135     MIB.add(Op);
136 }
137 
// Expand a SI_CS_CHAIN_TC_* pseudo: emit the instructions that set up EXEC
// (and, in dynamic-VGPR mode, the VGPR reallocation plus callee/EXEC
// selection), then strip the consumed operands and rewrite the pseudo in
// place as a SI_TCRETURN tail call.
void SILateBranchLowering::expandChainCall(MachineInstr &MI,
                                           const GCNSubtarget &ST,
                                           bool DynamicVGPR) {
  // This is a tail call that needs to be expanded into at least
  // 2 instructions, one for setting EXEC and one for the actual tail call.
  int ExecIdx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::exec);
  assert(ExecIdx != -1 && "Missing EXEC operand");
  const DebugLoc &DL = MI.getDebugLoc();
  if (DynamicVGPR) {
    // We have 3 extra operands and we need to:
    // * Try to change the VGPR allocation
    // * Select the callee based on the result of the reallocation attempt
    // * Select the EXEC mask based on the result of the reallocation attempt
    // If any of the register operands of the chain pseudo is used in more than
    // one of these instructions, we need to make sure that the kill flags
    // aren't copied along.
    auto AllocMI =
        BuildMI(*MI.getParent(), MI, DL, TII->get(AMDGPU::S_ALLOC_VGPR));
    copyOpWithoutRegFlags(AllocMI,
                          *TII->getNamedOperand(MI, AMDGPU::OpName::numvgprs));

    // Callee = src0 if the reallocation succeeded, fbcallee (fallback)
    // otherwise; written back into src0's register.
    auto SelectCallee =
        BuildMI(*MI.getParent(), MI, DL, TII->get(AMDGPU::S_CSELECT_B64))
            .addDef(TII->getNamedOperand(MI, AMDGPU::OpName::src0)->getReg());
    copyOpWithoutRegFlags(SelectCallee,
                          *TII->getNamedOperand(MI, AMDGPU::OpName::src0));
    copyOpWithoutRegFlags(SelectCallee,
                          *TII->getNamedOperand(MI, AMDGPU::OpName::fbcallee));

    // EXEC = exec operand on success, fbexec (fallback) otherwise.
    auto SelectExec = BuildMI(*MI.getParent(), MI, DL,
                              TII->get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
                                                     : AMDGPU::S_CSELECT_B64))
                          .addDef(ExecReg);

    copyOpWithoutRegFlags(SelectExec,
                          *TII->getNamedOperand(MI, AMDGPU::OpName::exec));
    copyOpWithoutRegFlags(SelectExec,
                          *TII->getNamedOperand(MI, AMDGPU::OpName::fbexec));
  } else {
    // Static VGPR case: just move the requested mask into EXEC.
    auto SetExec = BuildMI(*MI.getParent(), MI, DL, TII->get(MovOpc), ExecReg);
    copyOpWithoutRegFlags(SetExec,
                          *TII->getNamedOperand(MI, AMDGPU::OpName::exec));
  }

  // Remove the operands from exec onwards (consumed above), back to front so
  // indices stay valid, then turn the remainder into the actual tail call.
  for (int OpIdx = MI.getNumExplicitOperands() - 1; OpIdx >= ExecIdx; --OpIdx)
    MI.removeOperand(OpIdx);

  MI.setDesc(TII->get(AMDGPU::SI_TCRETURN));
}
188 
189 void SILateBranchLowering::earlyTerm(MachineInstr &MI,
190                                      MachineBasicBlock *EarlyExitBlock) {
191   MachineBasicBlock &MBB = *MI.getParent();
192   const DebugLoc DL = MI.getDebugLoc();
193 
194   auto BranchMI = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC0))
195                       .addMBB(EarlyExitBlock);
196   auto Next = std::next(MI.getIterator());
197 
198   if (Next != MBB.end() && !Next->isTerminator())
199     splitBlock(MBB, *BranchMI, MDT);
200 
201   MBB.addSuccessor(EarlyExitBlock);
202   MDT->insertEdge(&MBB, EarlyExitBlock);
203 }
204 
205 PreservedAnalyses
206 llvm::SILateBranchLoweringPass::run(MachineFunction &MF,
207                                     MachineFunctionAnalysisManager &MFAM) {
208   auto *MDT = &MFAM.getResult<MachineDominatorTreeAnalysis>(MF);
209   if (!SILateBranchLowering(MDT).run(MF))
210     return PreservedAnalyses::all();
211 
212   return getMachineFunctionPassPreservedAnalyses()
213       .preserve<MachineDominatorTreeAnalysis>();
214 }
215 
// Main driver: scan every instruction once, collecting/expanding the pseudos
// this pass handles, then lower the collected early-terminate and
// return-to-epilog instructions with the required CFG surgery.
// Returns true if the function was modified.
bool SILateBranchLowering::run(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();

  // Wave-size dependent opcode/register used for all EXEC manipulation.
  MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
  ExecReg = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;

  SmallVector<MachineInstr *, 4> EarlyTermInstrs;
  SmallVector<MachineInstr *, 1> EpilogInstrs;
  bool MadeChange = false;

  for (MachineBasicBlock &MBB : MF) {
    // early_inc_range: safe against erasure of the current instruction.
    for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
      switch (MI.getOpcode()) {
      case AMDGPU::S_BRANCH:
        // Optimize out branches to the next block.
        // This only occurs in -O0 when BranchFolding is not executed.
        if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) {
          assert(&MI == &MBB.back());
          MI.eraseFromParent();
          MadeChange = true;
        }
        break;

      case AMDGPU::SI_CS_CHAIN_TC_W32:
      case AMDGPU::SI_CS_CHAIN_TC_W64:
        expandChainCall(MI, ST, /*DynamicVGPR=*/false);
        MadeChange = true;
        break;
      case AMDGPU::SI_CS_CHAIN_TC_W32_DVGPR:
      case AMDGPU::SI_CS_CHAIN_TC_W64_DVGPR:
        expandChainCall(MI, ST, /*DynamicVGPR=*/true);
        MadeChange = true;
        break;

      // These two are only collected here; lowering them splits blocks,
      // which must not happen while iterating over the function.
      case AMDGPU::SI_EARLY_TERMINATE_SCC0:
        EarlyTermInstrs.push_back(&MI);
        break;

      case AMDGPU::SI_RETURN_TO_EPILOG:
        EpilogInstrs.push_back(&MI);
        break;

      default:
        break;
      }
    }
  }

  // Lower any early exit branches first
  if (!EarlyTermInstrs.empty()) {
    // One shared exit block: EXEC = 0, then the end-program sequence.
    MachineBasicBlock *EarlyExitBlock = MF.CreateMachineBasicBlock();
    DebugLoc DL;

    MF.insert(MF.end(), EarlyExitBlock);
    BuildMI(*EarlyExitBlock, EarlyExitBlock->end(), DL, TII->get(MovOpc),
            ExecReg)
        .addImm(0);
    generateEndPgm(*EarlyExitBlock, EarlyExitBlock->end(), DL, TII, MF);

    for (MachineInstr *Instr : EarlyTermInstrs) {
      // Early termination in GS does nothing
      if (MF.getFunction().getCallingConv() != CallingConv::AMDGPU_GS)
        earlyTerm(*Instr, EarlyExitBlock);
      Instr->eraseFromParent();
    }

    EarlyTermInstrs.clear();
    MadeChange = true;
  }

  // Now check return to epilog instructions occur at function end
  if (!EpilogInstrs.empty()) {
    MachineBasicBlock *EmptyMBBAtEnd = nullptr;
    assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());

    // If there are multiple returns to epilog then all will
    // become jumps to new empty end block.
    if (EpilogInstrs.size() > 1) {
      EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
      MF.insert(MF.end(), EmptyMBBAtEnd);
    }

    for (auto *MI : EpilogInstrs) {
      auto *MBB = MI->getParent();
      // Already the last instruction of the last block — nothing to do.
      if (MBB == &MF.back() && MI == &MBB->back())
        continue;

      // SI_RETURN_TO_EPILOG is not the last instruction.
      // Jump to empty block at function end.
      if (!EmptyMBBAtEnd) {
        EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
        MF.insert(MF.end(), EmptyMBBAtEnd);
      }

      MBB->addSuccessor(EmptyMBBAtEnd);
      MDT->insertEdge(MBB, EmptyMBBAtEnd);
      BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
          .addMBB(EmptyMBBAtEnd);
      MI->eraseFromParent();
      MadeChange = true;
    }

    EpilogInstrs.clear();
  }

  return MadeChange;
}
325