1 //===-- SILateBranchLowering.cpp - Final preparation of branches ----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This pass mainly lowers early terminate pseudo instructions.
11 //
12 //===----------------------------------------------------------------------===//
13
14 #include "AMDGPU.h"
15 #include "GCNSubtarget.h"
16 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
17 #include "SIMachineFunctionInfo.h"
18 #include "llvm/CodeGen/MachineDominators.h"
19 #include "llvm/CodeGen/MachinePassManager.h"
20
21 using namespace llvm;
22
23 #define DEBUG_TYPE "si-late-branch-lowering"
24
25 namespace {
26
class SILateBranchLowering {
private:
  const SIRegisterInfo *TRI = nullptr;
  const SIInstrInfo *TII = nullptr;
  MachineDominatorTree *MDT = nullptr;

  // Expand a SI_CS_CHAIN_TC_* pseudo: emit the EXEC update (and, for the
  // dynamic-VGPR variants, the S_ALLOC_VGPR plus callee/EXEC selects), then
  // rewrite the pseudo itself into SI_TCRETURN.
  void expandChainCall(MachineInstr &MI, const GCNSubtarget &ST,
                       bool DynamicVGPR);
  // Lower one SI_EARLY_TERMINATE_SCC0 into a conditional branch to the
  // shared early-exit block, splitting the parent block if needed.
  void earlyTerm(MachineInstr &MI, MachineBasicBlock *EarlyExitBlock);

public:
  SILateBranchLowering(MachineDominatorTree *MDT) : MDT(MDT) {}

  // Runs the lowering over MF; returns true if anything was changed.
  bool run(MachineFunction &MF);

  // Wave-size dependent helpers, initialized at the top of run():
  unsigned MovOpc;  // S_MOV_B32 (wave32) or S_MOV_B64 (wave64).
  Register ExecReg; // EXEC_LO (wave32) or EXEC (wave64).
};
45
46 class SILateBranchLoweringLegacy : public MachineFunctionPass {
47 public:
48 static char ID;
SILateBranchLoweringLegacy()49 SILateBranchLoweringLegacy() : MachineFunctionPass(ID) {}
50
runOnMachineFunction(MachineFunction & MF)51 bool runOnMachineFunction(MachineFunction &MF) override {
52 auto *MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
53 return SILateBranchLowering(MDT).run(MF);
54 }
55
getPassName() const56 StringRef getPassName() const override {
57 return "SI Final Branch Preparation";
58 }
59
getAnalysisUsage(AnalysisUsage & AU) const60 void getAnalysisUsage(AnalysisUsage &AU) const override {
61 AU.addRequired<MachineDominatorTreeWrapperPass>();
62 AU.addPreserved<MachineDominatorTreeWrapperPass>();
63 MachineFunctionPass::getAnalysisUsage(AU);
64 }
65 };
66
67 } // end anonymous namespace
68
char SILateBranchLoweringLegacy::ID = 0;

// NOTE(review): the description string below still says "s_cbranch_execz" —
// presumably left over from an earlier incarnation of this pass; confirm
// before changing, since the string appears in -debug-pass output.
INITIALIZE_PASS_BEGIN(SILateBranchLoweringLegacy, DEBUG_TYPE,
                      "SI insert s_cbranch_execz instructions", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_END(SILateBranchLoweringLegacy, DEBUG_TYPE,
                    "SI insert s_cbranch_execz instructions", false, false)

// Exported ID so other code can refer to this pass.
char &llvm::SILateBranchLoweringPassID = SILateBranchLoweringLegacy::ID;
78
generateEndPgm(MachineBasicBlock & MBB,MachineBasicBlock::iterator I,DebugLoc DL,const SIInstrInfo * TII,MachineFunction & MF)79 static void generateEndPgm(MachineBasicBlock &MBB,
80 MachineBasicBlock::iterator I, DebugLoc DL,
81 const SIInstrInfo *TII, MachineFunction &MF) {
82 const Function &F = MF.getFunction();
83 bool IsPS = F.getCallingConv() == CallingConv::AMDGPU_PS;
84
85 // Check if hardware has been configured to expect color or depth exports.
86 bool HasColorExports = AMDGPU::getHasColorExport(F);
87 bool HasDepthExports = AMDGPU::getHasDepthExport(F);
88 bool HasExports = HasColorExports || HasDepthExports;
89
90 // Prior to GFX10, hardware always expects at least one export for PS.
91 bool MustExport = !AMDGPU::isGFX10Plus(TII->getSubtarget());
92
93 if (IsPS && (HasExports || MustExport)) {
94 // Generate "null export" if hardware is expecting PS to export.
95 const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
96 int Target =
97 ST.hasNullExportTarget()
98 ? AMDGPU::Exp::ET_NULL
99 : (HasColorExports ? AMDGPU::Exp::ET_MRT0 : AMDGPU::Exp::ET_MRTZ);
100 BuildMI(MBB, I, DL, TII->get(AMDGPU::EXP_DONE))
101 .addImm(Target)
102 .addReg(AMDGPU::VGPR0, RegState::Undef)
103 .addReg(AMDGPU::VGPR0, RegState::Undef)
104 .addReg(AMDGPU::VGPR0, RegState::Undef)
105 .addReg(AMDGPU::VGPR0, RegState::Undef)
106 .addImm(1) // vm
107 .addImm(0) // compr
108 .addImm(0); // en
109 }
110
111 // s_endpgm
112 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ENDPGM)).addImm(0);
113 }
114
splitBlock(MachineBasicBlock & MBB,MachineInstr & MI,MachineDominatorTree * MDT)115 static void splitBlock(MachineBasicBlock &MBB, MachineInstr &MI,
116 MachineDominatorTree *MDT) {
117 MachineBasicBlock *SplitBB = MBB.splitAt(MI, /*UpdateLiveIns*/ true);
118
119 // Update dominator tree
120 using DomTreeT = DomTreeBase<MachineBasicBlock>;
121 SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
122 for (MachineBasicBlock *Succ : SplitBB->successors()) {
123 DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
124 DTUpdates.push_back({DomTreeT::Delete, &MBB, Succ});
125 }
126 DTUpdates.push_back({DomTreeT::Insert, &MBB, SplitBB});
127 MDT->applyUpdates(DTUpdates);
128 }
129
copyOpWithoutRegFlags(MachineInstrBuilder & MIB,MachineOperand & Op)130 static void copyOpWithoutRegFlags(MachineInstrBuilder &MIB,
131 MachineOperand &Op) {
132 if (Op.isReg())
133 MIB.addReg(Op.getReg());
134 else
135 MIB.add(Op);
136 }
137
// Expand a SI_CS_CHAIN_TC_* pseudo into explicit EXEC setup followed by a
// plain SI_TCRETURN. For the dynamic-VGPR variants, additionally emit the
// VGPR reallocation attempt and SCC-based selects of callee and EXEC mask.
void SILateBranchLowering::expandChainCall(MachineInstr &MI,
                                           const GCNSubtarget &ST,
                                           bool DynamicVGPR) {
  // This is a tail call that needs to be expanded into at least
  // 2 instructions, one for setting EXEC and one for the actual tail call.
  int ExecIdx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::exec);
  assert(ExecIdx != -1 && "Missing EXEC operand");
  const DebugLoc &DL = MI.getDebugLoc();
  if (DynamicVGPR) {
    // We have 3 extra operands and we need to:
    // * Try to change the VGPR allocation
    // * Select the callee based on the result of the reallocation attempt
    // * Select the EXEC mask based on the result of the reallocation attempt
    // If any of the register operands of the chain pseudo is used in more than
    // one of these instructions, we need to make sure that the kill flags
    // aren't copied along.
    auto AllocMI =
        BuildMI(*MI.getParent(), MI, DL, TII->get(AMDGPU::S_ALLOC_VGPR));
    copyOpWithoutRegFlags(AllocMI,
                          *TII->getNamedOperand(MI, AMDGPU::OpName::numvgprs));

    // Callee select: writes back into src0 so the SI_TCRETURN below (which
    // keeps the operands up to, but not including, ExecIdx) uses the result.
    auto SelectCallee =
        BuildMI(*MI.getParent(), MI, DL, TII->get(AMDGPU::S_CSELECT_B64))
            .addDef(TII->getNamedOperand(MI, AMDGPU::OpName::src0)->getReg());
    copyOpWithoutRegFlags(SelectCallee,
                          *TII->getNamedOperand(MI, AMDGPU::OpName::src0));
    copyOpWithoutRegFlags(SelectCallee,
                          *TII->getNamedOperand(MI, AMDGPU::OpName::fbcallee));

    // EXEC select: wave-size dependent opcode, writing the exec register
    // directly.
    auto SelectExec = BuildMI(*MI.getParent(), MI, DL,
                              TII->get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
                                                     : AMDGPU::S_CSELECT_B64))
                          .addDef(ExecReg);

    copyOpWithoutRegFlags(SelectExec,
                          *TII->getNamedOperand(MI, AMDGPU::OpName::exec));
    copyOpWithoutRegFlags(SelectExec,
                          *TII->getNamedOperand(MI, AMDGPU::OpName::fbexec));
  } else {
    // Static case: a single move into EXEC.
    auto SetExec = BuildMI(*MI.getParent(), MI, DL, TII->get(MovOpc), ExecReg);
    copyOpWithoutRegFlags(SetExec,
                          *TII->getNamedOperand(MI, AMDGPU::OpName::exec));
  }

  // Strip the exec operand and everything after it (iterating backwards so
  // indices stay valid), leaving the operand list SI_TCRETURN expects.
  for (int OpIdx = MI.getNumExplicitOperands() - 1; OpIdx >= ExecIdx; --OpIdx)
    MI.removeOperand(OpIdx);

  MI.setDesc(TII->get(AMDGPU::SI_TCRETURN));
}
188
earlyTerm(MachineInstr & MI,MachineBasicBlock * EarlyExitBlock)189 void SILateBranchLowering::earlyTerm(MachineInstr &MI,
190 MachineBasicBlock *EarlyExitBlock) {
191 MachineBasicBlock &MBB = *MI.getParent();
192 const DebugLoc DL = MI.getDebugLoc();
193
194 auto BranchMI = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC0))
195 .addMBB(EarlyExitBlock);
196 auto Next = std::next(MI.getIterator());
197
198 if (Next != MBB.end() && !Next->isTerminator())
199 splitBlock(MBB, *BranchMI, MDT);
200
201 MBB.addSuccessor(EarlyExitBlock);
202 MDT->insertEdge(&MBB, EarlyExitBlock);
203 }
204
205 PreservedAnalyses
run(MachineFunction & MF,MachineFunctionAnalysisManager & MFAM)206 llvm::SILateBranchLoweringPass::run(MachineFunction &MF,
207 MachineFunctionAnalysisManager &MFAM) {
208 auto *MDT = &MFAM.getResult<MachineDominatorTreeAnalysis>(MF);
209 if (!SILateBranchLowering(MDT).run(MF))
210 return PreservedAnalyses::all();
211
212 return getMachineFunctionPassPreservedAnalyses()
213 .preserve<MachineDominatorTreeAnalysis>();
214 }
215
// Main driver: scan the function for branches/pseudos needing late lowering,
// then lower early-terminate pseudos (sharing one exit block) and normalize
// SI_RETURN_TO_EPILOG so it only ever occurs at the end of the function.
// Returns true if the function was modified.
bool SILateBranchLowering::run(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();

  // Wave-size dependent opcode/register used by the lowerings below.
  MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
  ExecReg = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;

  SmallVector<MachineInstr *, 4> EarlyTermInstrs;
  SmallVector<MachineInstr *, 1> EpilogInstrs;
  bool MadeChange = false;

  // Phase 1: walk all instructions. Chain-call pseudos and redundant
  // branches are handled in place; early-terminate and return-to-epilog
  // pseudos are collected and processed afterwards, since their lowering
  // creates new blocks and splits existing ones.
  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
      switch (MI.getOpcode()) {
      case AMDGPU::S_BRANCH:
        // Optimize out branches to the next block.
        // This only occurs in -O0 when BranchFolding is not executed.
        if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) {
          assert(&MI == &MBB.back());
          MI.eraseFromParent();
          MadeChange = true;
        }
        break;

      case AMDGPU::SI_CS_CHAIN_TC_W32:
      case AMDGPU::SI_CS_CHAIN_TC_W64:
        expandChainCall(MI, ST, /*DynamicVGPR=*/false);
        MadeChange = true;
        break;
      case AMDGPU::SI_CS_CHAIN_TC_W32_DVGPR:
      case AMDGPU::SI_CS_CHAIN_TC_W64_DVGPR:
        expandChainCall(MI, ST, /*DynamicVGPR=*/true);
        MadeChange = true;
        break;

      case AMDGPU::SI_EARLY_TERMINATE_SCC0:
        EarlyTermInstrs.push_back(&MI);
        break;

      case AMDGPU::SI_RETURN_TO_EPILOG:
        EpilogInstrs.push_back(&MI);
        break;

      default:
        break;
      }
    }
  }

  // Lower any early exit branches first
  if (!EarlyTermInstrs.empty()) {
    // One shared exit block: clear EXEC, then end the program.
    MachineBasicBlock *EarlyExitBlock = MF.CreateMachineBasicBlock();
    DebugLoc DL;

    MF.insert(MF.end(), EarlyExitBlock);
    BuildMI(*EarlyExitBlock, EarlyExitBlock->end(), DL, TII->get(MovOpc),
            ExecReg)
        .addImm(0);
    generateEndPgm(*EarlyExitBlock, EarlyExitBlock->end(), DL, TII, MF);

    for (MachineInstr *Instr : EarlyTermInstrs) {
      // Early termination in GS does nothing
      if (MF.getFunction().getCallingConv() != CallingConv::AMDGPU_GS)
        earlyTerm(*Instr, EarlyExitBlock);
      // The pseudo is always erased, even in the GS no-op case.
      Instr->eraseFromParent();
    }

    EarlyTermInstrs.clear();
    MadeChange = true;
  }

  // Now check return to epilog instructions occur at function end
  if (!EpilogInstrs.empty()) {
    MachineBasicBlock *EmptyMBBAtEnd = nullptr;
    assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());

    // If there are multiple returns to epilog then all will
    // become jumps to new empty end block.
    if (EpilogInstrs.size() > 1) {
      EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
      MF.insert(MF.end(), EmptyMBBAtEnd);
    }

    for (auto *MI : EpilogInstrs) {
      auto *MBB = MI->getParent();
      // Already the last instruction of the last block: nothing to do.
      if (MBB == &MF.back() && MI == &MBB->back())
        continue;

      // SI_RETURN_TO_EPILOG is not the last instruction.
      // Jump to empty block at function end.
      if (!EmptyMBBAtEnd) {
        EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
        MF.insert(MF.end(), EmptyMBBAtEnd);
      }

      MBB->addSuccessor(EmptyMBBAtEnd);
      MDT->insertEdge(MBB, EmptyMBBAtEnd);
      BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
          .addMBB(EmptyMBBAtEnd);
      MI->eraseFromParent();
      MadeChange = true;
    }

    EpilogInstrs.clear();
  }

  return MadeChange;
}
325