//===-- SILateBranchLowering.cpp - Final preparation of branches ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass mainly lowers early terminate pseudo instructions.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachinePassManager.h"

using namespace llvm;

#define DEBUG_TYPE "si-late-branch-lowering"

namespace {

/// Shared implementation used by both the legacy and new pass manager
/// wrappers. Lowers SI_EARLY_TERMINATE_SCC0, the SI_CS_CHAIN_TC_* chain-call
/// pseudos and SI_RETURN_TO_EPILOG, and deletes S_BRANCH instructions that
/// target their layout successor.
class SILateBranchLowering {
private:
  const SIRegisterInfo *TRI = nullptr;
  const SIInstrInfo *TII = nullptr;
  // Kept up to date across block splits and new edges inserted by this pass.
  MachineDominatorTree *MDT = nullptr;

  void expandChainCall(MachineInstr &MI, const GCNSubtarget &ST,
                       bool DynamicVGPR);
  void earlyTerm(MachineInstr &MI, MachineBasicBlock *EarlyExitBlock);

public:
  SILateBranchLowering(MachineDominatorTree *MDT) : MDT(MDT) {}

  bool run(MachineFunction &MF);

  // Wave-size dependent values, initialized in run():
  // S_MOV_B32/EXEC_LO for wave32, S_MOV_B64/EXEC for wave64.
  unsigned MovOpc;
  Register ExecReg;
};

/// Legacy pass manager wrapper.
class SILateBranchLoweringLegacy : public MachineFunctionPass {
public:
  static char ID;
  SILateBranchLoweringLegacy() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override {
    auto *MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
    return SILateBranchLowering(MDT).run(MF);
  }

  StringRef getPassName() const override {
    return "SI Final Branch Preparation";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<MachineDominatorTreeWrapperPass>();
    // The dominator tree is updated in place (see splitBlock/earlyTerm), so
    // it survives this pass.
    AU.addPreserved<MachineDominatorTreeWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace

char SILateBranchLoweringLegacy::ID = 0;

// NOTE(review): this description string looks stale — the pass no longer
// inserts s_cbranch_execz (see getPassName above); confirm against upstream
// before changing, since the string is user-visible in -print-* output.
INITIALIZE_PASS_BEGIN(SILateBranchLoweringLegacy, DEBUG_TYPE,
                      "SI insert s_cbranch_execz instructions", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_END(SILateBranchLoweringLegacy, DEBUG_TYPE,
                    "SI insert s_cbranch_execz instructions", false, false)

char &llvm::SILateBranchLoweringPassID = SILateBranchLoweringLegacy::ID;

/// Emit the terminating sequence for an early-exit block at \p I: an optional
/// "null" EXP_DONE export (required when a pixel shader is configured to
/// export but exits without doing so), followed by S_ENDPGM.
static void generateEndPgm(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator I, DebugLoc DL,
                           const SIInstrInfo *TII, MachineFunction &MF) {
  const Function &F = MF.getFunction();
  bool IsPS = F.getCallingConv() == CallingConv::AMDGPU_PS;

  // Check if hardware has been configured to expect color or depth exports.
  bool HasColorExports = AMDGPU::getHasColorExport(F);
  bool HasDepthExports = AMDGPU::getHasDepthExport(F);
  bool HasExports = HasColorExports || HasDepthExports;

  // Prior to GFX10, hardware always expects at least one export for PS.
  bool MustExport = !AMDGPU::isGFX10Plus(TII->getSubtarget());

  if (IsPS && (HasExports || MustExport)) {
    // Generate "null export" if hardware is expecting PS to export.
    const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
    // Without a dedicated null export target, fall back to the export target
    // the hardware was configured for (MRT0 for color, MRTZ for depth).
    int Target =
        ST.hasNullExportTarget()
            ? AMDGPU::Exp::ET_NULL
            : (HasColorExports ? AMDGPU::Exp::ET_MRT0 : AMDGPU::Exp::ET_MRTZ);
    // All four data operands are undef: nothing meaningful is exported, the
    // export only satisfies the hardware's expectation (en mask is 0).
    BuildMI(MBB, I, DL, TII->get(AMDGPU::EXP_DONE))
        .addImm(Target)
        .addReg(AMDGPU::VGPR0, RegState::Undef)
        .addReg(AMDGPU::VGPR0, RegState::Undef)
        .addReg(AMDGPU::VGPR0, RegState::Undef)
        .addReg(AMDGPU::VGPR0, RegState::Undef)
        .addImm(1)  // vm
        .addImm(0)  // compr
        .addImm(0); // en
  }

  // s_endpgm
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ENDPGM)).addImm(0);
}

/// Split \p MBB after \p MI and update the dominator tree: the split block
/// takes over MBB's old successor edges, and MBB gains an edge to the split
/// block.
static void splitBlock(MachineBasicBlock &MBB, MachineInstr &MI,
                       MachineDominatorTree *MDT) {
  MachineBasicBlock *SplitBB = MBB.splitAt(MI, /*UpdateLiveIns*/ true);

  // Update dominator tree
  using DomTreeT = DomTreeBase<MachineBasicBlock>;
  SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
  for (MachineBasicBlock *Succ : SplitBB->successors()) {
    DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
    DTUpdates.push_back({DomTreeT::Delete, &MBB, Succ});
  }
  DTUpdates.push_back({DomTreeT::Insert, &MBB, SplitBB});
  MDT->applyUpdates(DTUpdates);
}

/// Append \p Op to \p MIB; for register operands only the register itself is
/// copied, dropping flags such as kill/undef. Used when one source operand of
/// the chain pseudo feeds several expanded instructions (see expandChainCall).
static void copyOpWithoutRegFlags(MachineInstrBuilder &MIB,
                                  MachineOperand &Op) {
  if (Op.isReg())
    MIB.addReg(Op.getReg());
  else
    MIB.add(Op);
}

/// Expand an SI_CS_CHAIN_TC_* pseudo into an EXEC setup sequence followed by
/// SI_TCRETURN (the pseudo itself is mutated into the tail call and its
/// EXEC-related trailing operands are removed).
void SILateBranchLowering::expandChainCall(MachineInstr &MI,
                                           const GCNSubtarget &ST,
                                           bool DynamicVGPR) {
  // This is a tail call that needs to be expanded into at least
  // 2 instructions, one for setting EXEC and one for the actual tail call.
  int ExecIdx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::exec);
  assert(ExecIdx != -1 && "Missing EXEC operand");
  const DebugLoc &DL = MI.getDebugLoc();
  if (DynamicVGPR) {
    // We have 3 extra operands and we need to:
    // * Try to change the VGPR allocation
    // * Select the callee based on the result of the reallocation attempt
    // * Select the EXEC mask based on the result of the reallocation attempt
    // If any of the register operands of the chain pseudo is used in more than
    // one of these instructions, we need to make sure that the kill flags
    // aren't copied along.
    auto AllocMI =
        BuildMI(*MI.getParent(), MI, DL, TII->get(AMDGPU::S_ALLOC_VGPR));
    copyOpWithoutRegFlags(AllocMI,
                          *TII->getNamedOperand(MI, AMDGPU::OpName::numvgprs));

    // S_CSELECT reads SCC, produced by S_ALLOC_VGPR above: pick the primary
    // callee/EXEC on success, the fallback (fbcallee/fbexec) on failure.
    auto SelectCallee =
        BuildMI(*MI.getParent(), MI, DL, TII->get(AMDGPU::S_CSELECT_B64))
            .addDef(TII->getNamedOperand(MI, AMDGPU::OpName::src0)->getReg());
    copyOpWithoutRegFlags(SelectCallee,
                          *TII->getNamedOperand(MI, AMDGPU::OpName::src0));
    copyOpWithoutRegFlags(SelectCallee,
                          *TII->getNamedOperand(MI, AMDGPU::OpName::fbcallee));

    auto SelectExec = BuildMI(*MI.getParent(), MI, DL,
                              TII->get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
                                                     : AMDGPU::S_CSELECT_B64))
                          .addDef(ExecReg);

    copyOpWithoutRegFlags(SelectExec,
                          *TII->getNamedOperand(MI, AMDGPU::OpName::exec));
    copyOpWithoutRegFlags(SelectExec,
                          *TII->getNamedOperand(MI, AMDGPU::OpName::fbexec));
  } else {
    // Static VGPR count: just install the requested EXEC mask.
    auto SetExec = BuildMI(*MI.getParent(), MI, DL, TII->get(MovOpc), ExecReg);
    copyOpWithoutRegFlags(SetExec,
                          *TII->getNamedOperand(MI, AMDGPU::OpName::exec));
  }

  // Strip the exec operand and everything after it (removed back-to-front so
  // the indices stay valid), then turn the pseudo into the real tail call.
  for (int OpIdx = MI.getNumExplicitOperands() - 1; OpIdx >= ExecIdx; --OpIdx)
    MI.removeOperand(OpIdx);

  MI.setDesc(TII->get(AMDGPU::SI_TCRETURN));
}

/// Lower one SI_EARLY_TERMINATE_SCC0: branch to \p EarlyExitBlock when SCC is
/// 0, splitting the block if the pseudo is not already at a block boundary,
/// and record the new CFG edge in the dominator tree.
void SILateBranchLowering::earlyTerm(MachineInstr &MI,
                                     MachineBasicBlock *EarlyExitBlock) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc DL = MI.getDebugLoc();

  auto BranchMI = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC0))
                      .addMBB(EarlyExitBlock);
  auto Next = std::next(MI.getIterator());

  // The branch must end its block; split if real (non-terminator)
  // instructions follow. (MI itself is erased by the caller.)
  if (Next != MBB.end() && !Next->isTerminator())
    splitBlock(MBB, *BranchMI, MDT);

  MBB.addSuccessor(EarlyExitBlock);
  MDT->insertEdge(&MBB, EarlyExitBlock);
}

/// New pass manager entry point.
PreservedAnalyses
llvm::SILateBranchLoweringPass::run(MachineFunction &MF,
                                    MachineFunctionAnalysisManager &MFAM) {
  auto *MDT = &MFAM.getResult<MachineDominatorTreeAnalysis>(MF);
  if (!SILateBranchLowering(MDT).run(MF))
    return PreservedAnalyses::all();

  // The dominator tree was maintained incrementally; everything else derived
  // from the machine function may be stale.
  return getMachineFunctionPassPreservedAnalyses()
      .preserve<MachineDominatorTreeAnalysis>();
}

bool SILateBranchLowering::run(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
  ExecReg = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;

  SmallVector<MachineInstr *, 4> EarlyTermInstrs;
  SmallVector<MachineInstr *, 1> EpilogInstrs;
  bool MadeChange = false;

  // Pass 1: expand chain calls in place, delete redundant branches, and
  // collect the pseudos whose lowering needs function-level context
  // (a shared early-exit block / the function's last block).
  for (MachineBasicBlock &MBB : MF) {
    // early_inc_range: MI may be erased (S_BRANCH case) during iteration.
    for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
      switch (MI.getOpcode()) {
      case AMDGPU::S_BRANCH:
        // Optimize out branches to the next block.
        // This only occurs in -O0 when BranchFolding is not executed.
        if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) {
          assert(&MI == &MBB.back());
          MI.eraseFromParent();
          MadeChange = true;
        }
        break;

      case AMDGPU::SI_CS_CHAIN_TC_W32:
      case AMDGPU::SI_CS_CHAIN_TC_W64:
        expandChainCall(MI, ST, /*DynamicVGPR=*/false);
        MadeChange = true;
        break;
      case AMDGPU::SI_CS_CHAIN_TC_W32_DVGPR:
      case AMDGPU::SI_CS_CHAIN_TC_W64_DVGPR:
        expandChainCall(MI, ST, /*DynamicVGPR=*/true);
        MadeChange = true;
        break;

      case AMDGPU::SI_EARLY_TERMINATE_SCC0:
        EarlyTermInstrs.push_back(&MI);
        break;

      case AMDGPU::SI_RETURN_TO_EPILOG:
        EpilogInstrs.push_back(&MI);
        break;

      default:
        break;
      }
    }
  }

  // Lower any early exit branches first
  if (!EarlyTermInstrs.empty()) {
    // One shared exit block for the whole function: clear EXEC, then end the
    // program (with a null export if the hardware expects one).
    MachineBasicBlock *EarlyExitBlock = MF.CreateMachineBasicBlock();
    DebugLoc DL;

    MF.insert(MF.end(), EarlyExitBlock);
    BuildMI(*EarlyExitBlock, EarlyExitBlock->end(), DL, TII->get(MovOpc),
            ExecReg)
        .addImm(0);
    generateEndPgm(*EarlyExitBlock, EarlyExitBlock->end(), DL, TII, MF);

    for (MachineInstr *Instr : EarlyTermInstrs) {
      // Early termination in GS does nothing
      if (MF.getFunction().getCallingConv() != CallingConv::AMDGPU_GS)
        earlyTerm(*Instr, EarlyExitBlock);
      Instr->eraseFromParent();
    }

    EarlyTermInstrs.clear();
    MadeChange = true;
  }

  // Now check return to epilog instructions occur at function end
  if (!EpilogInstrs.empty()) {
    MachineBasicBlock *EmptyMBBAtEnd = nullptr;
    assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());

    // If there are multiple returns to epilog then all will
    // become jumps to new empty end block.
    if (EpilogInstrs.size() > 1) {
      EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
      MF.insert(MF.end(), EmptyMBBAtEnd);
    }

    for (auto *MI : EpilogInstrs) {
      auto *MBB = MI->getParent();
      // Already the last instruction of the last block: nothing to do.
      if (MBB == &MF.back() && MI == &MBB->back())
        continue;

      // SI_RETURN_TO_EPILOG is not the last instruction.
      // Jump to empty block at function end.
      if (!EmptyMBBAtEnd) {
        EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
        MF.insert(MF.end(), EmptyMBBAtEnd);
      }

      MBB->addSuccessor(EmptyMBBAtEnd);
      MDT->insertEdge(MBB, EmptyMBBAtEnd);
      BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
          .addMBB(EmptyMBBAtEnd);
      MI->eraseFromParent();
      MadeChange = true;
    }

    EpilogInstrs.clear();
  }

  return MadeChange;
}