//===-- SIPreEmitPeephole.cpp ------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass performs peephole optimizations before code emission.
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"

using namespace llvm;

#define DEBUG_TYPE "si-pre-emit-peephole"

namespace {

class SIPreEmitPeephole : public MachineFunctionPass {
private:
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;

  bool optimizeVccBranch(MachineInstr &MI) const;
  bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const;

public:
  static char ID;

  SIPreEmitPeephole() : MachineFunctionPass(ID) {
    initializeSIPreEmitPeepholePass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

} // End anonymous namespace.

INITIALIZE_PASS(SIPreEmitPeephole, DEBUG_TYPE,
                "SI peephole optimizations", false, false)

char SIPreEmitPeephole::ID = 0;

char &llvm::SIPreEmitPeepholeID = SIPreEmitPeephole::ID;

bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
  // Match:
  // sreg = -1 or 0
  // vcc = S_AND_B64 exec, sreg or S_ANDN2_B64 exec, sreg
  // S_CBRANCH_VCC[N]Z
  // =>
  // S_CBRANCH_EXEC[N]Z
  // We end up with this pattern sometimes after basic block placement.
  // It happens when a block that assigns -1 or 0 to a saved mask is
  // combined with a block that consumes that saved mask and then branches.
  bool Changed = false;
  MachineBasicBlock &MBB = *MI.getParent();
  const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
  const bool IsWave32 = ST.isWave32();
  const unsigned CondReg = TRI->getVCC();
  const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
  const unsigned AndN2 = IsWave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
  const unsigned Mov = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
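
  // Walk backwards from the branch to the instruction that defines vcc; give
  // up if exec is clobbered first, if the defining instruction is not an
  // S_AND/S_ANDN2, or if the scan exceeds a small threshold. A concrete
  // wave64 instance of the pattern (illustrative registers and labels):
  //   s_mov_b64 s[0:1], -1
  //   s_and_b64 vcc, exec, s[0:1]
  //   s_cbranch_vccnz .LBB0_1
  // becomes:
  //   s_cbranch_execnz .LBB0_1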
  MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
                                      E = MBB.rend();
  bool ReadsCond = false;
  unsigned Threshold = 5;
  for (++A; A != E; ++A) {
    if (!--Threshold)
      return false;
    if (A->modifiesRegister(ExecReg, TRI))
      return false;
    if (A->modifiesRegister(CondReg, TRI)) {
      if (!A->definesRegister(CondReg, TRI) ||
          (A->getOpcode() != And && A->getOpcode() != AndN2))
        return false;
      break;
    }
    ReadsCond |= A->readsRegister(CondReg, TRI);
  }
  if (A == E)
    return false;

  // Canonicalize exec into the first operand of the AND, commuting if needed.
  MachineOperand &Op1 = A->getOperand(1);
  MachineOperand &Op2 = A->getOperand(2);
  if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) {
    TII->commuteInstruction(*A);
    Changed = true;
  }
  if (Op1.getReg() != ExecReg)
    return Changed;
  if (Op2.isImm() && !(Op2.getImm() == -1 || Op2.getImm() == 0))
    return Changed;

  // If the mask is a register, find the immediate that defines it.
  int64_t MaskValue = 0;
  Register SReg;
  if (Op2.isReg()) {
    SReg = Op2.getReg();
    auto M = std::next(A);
    bool ReadsSreg = false;
    for (; M != E; ++M) {
      if (M->definesRegister(SReg, TRI))
        break;
      if (M->modifiesRegister(SReg, TRI))
        return Changed;
      ReadsSreg |= M->readsRegister(SReg, TRI);
    }
    if (M == E || !M->isMoveImmediate() || !M->getOperand(1).isImm() ||
        (M->getOperand(1).getImm() != -1 && M->getOperand(1).getImm() != 0))
      return Changed;
    MaskValue = M->getOperand(1).getImm();
    // If sreg is only used by the AND instruction, fold the immediate
    // into the AND.
    if (!ReadsSreg && Op2.isKill()) {
      A->getOperand(2).ChangeToImmediate(MaskValue);
      M->eraseFromParent();
    }
  } else if (Op2.isImm()) {
    MaskValue = Op2.getImm();
  } else {
    llvm_unreachable("Op2 must be register or immediate");
  }

  // Invert the mask for S_ANDN2 (vcc = exec & ~sreg).
  assert(MaskValue == 0 || MaskValue == -1);
  if (A->getOpcode() == AndN2)
    MaskValue = ~MaskValue;

  if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC)) {
    if (!MI.killsRegister(CondReg, TRI)) {
      // VCC may still be read past the branch, so replace the AND with a MOV
      // of its known result.
      if (MaskValue == 0) {
        BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
            .addImm(0);
      } else {
        BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
            .addReg(ExecReg);
      }
    }
    // Remove the AND instruction.
    A->eraseFromParent();
  }

  bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
  if (SReg == ExecReg) {
    // EXEC is updated directly.
    if (IsVCCZ) {
      MI.eraseFromParent();
      return true;
    }
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else if (IsVCCZ && MaskValue == 0) {
    // Will always branch.
    // Remove all successors shadowed by the new unconditional branch.
    MachineBasicBlock *Parent = MI.getParent();
    SmallVector<MachineInstr *, 4> ToRemove;
    bool Found = false;
    for (MachineInstr &Term : Parent->terminators()) {
      if (Found) {
        if (Term.isBranch())
          ToRemove.push_back(&Term);
      } else {
        Found = Term.isIdenticalTo(MI);
      }
    }
    assert(Found && "conditional branch is not terminator");
    for (auto BranchMI : ToRemove) {
      MachineOperand &Dst = BranchMI->getOperand(0);
      assert(Dst.isMBB() && "destination is not basic block");
      Parent->removeSuccessor(Dst.getMBB());
      BranchMI->eraseFromParent();
    }
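
    // The rewritten branch is unconditional and never falls through, so the
    // block's former fallthrough successor (if any) must be unlinked too.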
    if (MachineBasicBlock *Succ = Parent->getFallThrough()) {
      Parent->removeSuccessor(Succ);
    }

    // Rewrite to an unconditional branch.
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else if (!IsVCCZ && MaskValue == 0) {
    // Will never branch.
    MachineOperand &Dst = MI.getOperand(0);
    assert(Dst.isMBB() && "destination is not basic block");
    MI.getParent()->removeSuccessor(Dst.getMBB());
    MI.eraseFromParent();
    return true;
  } else if (MaskValue == -1) {
    // Depends only on EXEC.
    MI.setDesc(
        TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ));
  }

  MI.RemoveOperand(MI.findRegisterUseOperandIdx(CondReg, false /*Kill*/, TRI));
  MI.addImplicitDefUseOperands(*MBB.getParent());

  return true;
}
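
// Remove the second of two identical S_SET_GPR_IDX_ON instructions (MI),
// given the first (First), when the code between them allows it. A sketch of
// the intended transform (illustrative assembly, not from a real test):
//   s_set_gpr_idx_on s2, gpr_idx(SRC0)
//   v_mov_b32_e32 v0, v4                ; indirect move reading m0
//   s_set_gpr_idx_off
//   s_set_gpr_idx_on s2, gpr_idx(SRC0)  ; identical to the first ON
//   v_mov_b32_e32 v1, v5
//   s_set_gpr_idx_off
// becomes:
//   s_set_gpr_idx_on s2, gpr_idx(SRC0)
//   v_mov_b32_e32 v0, v4
//   v_mov_b32_e32 v1, v5
//   s_set_gpr_idx_off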
bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
                                       MachineInstr &MI) const {
  MachineBasicBlock &MBB = *MI.getParent();
  const MachineFunction &MF = *MBB.getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  Register IdxReg = Idx->isReg() ? Idx->getReg() : Register();
  SmallVector<MachineInstr *, 4> ToRemove;
  bool IdxOn = true;

  if (!MI.isIdenticalTo(First))
    return false;

  // Scan the instructions between the two identical S_SET_GPR_IDX_ON for
  // anything that would make removing the second one unsafe.
  for (MachineBasicBlock::iterator I = std::next(First.getIterator()),
                                   E = MI.getIterator(); I != E; ++I) {
    switch (I->getOpcode()) {
    case AMDGPU::S_SET_GPR_IDX_MODE:
      return false;
    case AMDGPU::S_SET_GPR_IDX_OFF:
      IdxOn = false;
      ToRemove.push_back(&*I);
      break;
    default:
      if (I->modifiesRegister(AMDGPU::M0, TRI))
        return false;
      if (IdxReg && I->modifiesRegister(IdxReg, TRI))
        return false;
      if (llvm::any_of(I->operands(),
                       [&MRI, this](const MachineOperand &MO) {
                         return MO.isReg() &&
                                TRI->isVectorRegister(MRI, MO.getReg());
                       })) {
        // The only exception allowed here is another indirect vector move
        // with the same mode.
        if (!IdxOn ||
            !((I->getOpcode() == AMDGPU::V_MOV_B32_e32 &&
               I->hasRegisterImplicitUseOperand(AMDGPU::M0)) ||
              I->getOpcode() == AMDGPU::V_MOV_B32_indirect))
          return false;
      }
    }
  }

  MI.eraseFromParent();
  for (MachineInstr *RI : ToRemove)
    RI->eraseFromParent();
  return true;
}

bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MachineBasicBlock *EmptyMBBAtEnd = nullptr;
  bool Changed = false;

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator MBBE = MBB.getFirstTerminator();
    MachineBasicBlock::iterator TermI = MBBE;
    // Check the first terminator for a VCC branch to optimize.
    if (TermI != MBB.end()) {
      MachineInstr &MI = *TermI;
      switch (MI.getOpcode()) {
      case AMDGPU::S_CBRANCH_VCCZ:
      case AMDGPU::S_CBRANCH_VCCNZ:
        Changed |= optimizeVccBranch(MI);
        continue;
      default:
        break;
      }
    }
    // Check all terminators for SI_RETURN_TO_EPILOG.
    // FIXME: This is not an optimization and should be moved somewhere else.
    while (TermI != MBB.end()) {
      MachineInstr &MI = *TermI;
      if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) {
        assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());

        // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
        // because external bytecode will be appended at the end.
        if (&MBB != &MF.back() || &MI != &MBB.back()) {
          // SI_RETURN_TO_EPILOG is not the last instruction. Add an empty
          // block at the end and jump there.
          if (!EmptyMBBAtEnd) {
            EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
            MF.insert(MF.end(), EmptyMBBAtEnd);
          }

          MBB.addSuccessor(EmptyMBBAtEnd);
          BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
              .addMBB(EmptyMBBAtEnd);
          MI.eraseFromParent();
          MBBE = MBB.getFirstTerminator();
          TermI = MBBE;
          continue;
        }
      }
      TermI++;
    }

    if (!ST.hasVGPRIndexMode())
      continue;

    MachineInstr *SetGPRMI = nullptr;
    const unsigned Threshold = 20;
    unsigned Count = 0;
    // Scan the block for a pair of S_SET_GPR_IDX_ON instructions to see if
    // the second one is redundant. The expensive checks are done in
    // optimizeSetGPR(), and the distance between the two instructions is
    // limited to 20 for compile-time purposes.
    for (MachineBasicBlock::iterator MBBI = MBB.begin(); MBBI != MBBE;) {
      MachineInstr &MI = *MBBI;
      ++MBBI;

      if (Count == Threshold)
        SetGPRMI = nullptr;
      else
        ++Count;

      if (MI.getOpcode() != AMDGPU::S_SET_GPR_IDX_ON)
        continue;

      Count = 0;
      if (!SetGPRMI) {
        SetGPRMI = &MI;
        continue;
      }

      if (optimizeSetGPR(*SetGPRMI, MI))
        Changed = true;
      else
        SetGPRMI = &MI;
    }
  }

  return Changed;
}