//===-- SIPreEmitPeephole.cpp ------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass performs peephole optimizations before code emission.
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/Support/CommandLine.h"

using namespace llvm;

#define DEBUG_TYPE "si-pre-emit-peephole"

namespace {

class SIPreEmitPeephole : public MachineFunctionPass {
private:
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;

  bool optimizeVccBranch(MachineInstr &MI) const;
  bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const;

public:
  static char ID;

  SIPreEmitPeephole() : MachineFunctionPass(ID) {
    initializeSIPreEmitPeepholePass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

} // End anonymous namespace.

INITIALIZE_PASS(SIPreEmitPeephole, DEBUG_TYPE,
                "SI peephole optimizations", false, false)

char SIPreEmitPeephole::ID = 0;

char &llvm::SIPreEmitPeepholeID = SIPreEmitPeephole::ID;

bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
  // Match:
  // sreg = -1 or 0
  // vcc = S_AND_B64 exec, sreg or S_ANDN2_B64 exec, sreg
  // S_CBRANCH_VCC[N]Z
  // =>
  // S_CBRANCH_EXEC[N]Z
  // We end up with this pattern sometimes after basic block placement.
  // It happens while combining a block which assigns -1 or 0 to a saved mask
  // and another block which consumes that saved mask and then a branch.
  bool Changed = false;
  MachineBasicBlock &MBB = *MI.getParent();
  const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
  const bool IsWave32 = ST.isWave32();
  const unsigned CondReg = TRI->getVCC();
  const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
  const unsigned AndN2 =
      IsWave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;

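  // Walk backwards from the branch over at most Threshold instructions,
  // looking for the S_AND/S_ANDN2 that defines vcc. Bail out if exec is
  // clobbered first, or if vcc is clobbered by anything other than such an
  // AND.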
  MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
                                      E = MBB.rend();
  bool ReadsCond = false;
  unsigned Threshold = 5;
  for (++A; A != E; ++A) {
    if (!--Threshold)
      return false;
    if (A->modifiesRegister(ExecReg, TRI))
      return false;
    if (A->modifiesRegister(CondReg, TRI)) {
      if (!A->definesRegister(CondReg, TRI) ||
          (A->getOpcode() != And && A->getOpcode() != AndN2))
        return false;
      break;
    }
    ReadsCond |= A->readsRegister(CondReg, TRI);
  }
  if (A == E)
    return false;

  MachineOperand &Op1 = A->getOperand(1);
  MachineOperand &Op2 = A->getOperand(2);
  if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) {
    TII->commuteInstruction(*A);
    Changed = true;
  }
  if (Op1.getReg() != ExecReg)
    return Changed;
  if (Op2.isImm() && !(Op2.getImm() == -1 || Op2.getImm() == 0))
    return Changed;

  int64_t MaskValue = 0;
  Register SReg;
  if (Op2.isReg()) {
    SReg = Op2.getReg();
    auto M = std::next(A);
    bool ReadsSreg = false;
    for (; M != E; ++M) {
      if (M->definesRegister(SReg, TRI))
        break;
      if (M->modifiesRegister(SReg, TRI))
        return Changed;
      ReadsSreg |= M->readsRegister(SReg, TRI);
    }
    if (M == E || !M->isMoveImmediate() || !M->getOperand(1).isImm() ||
        (M->getOperand(1).getImm() != -1 && M->getOperand(1).getImm() != 0))
      return Changed;
    MaskValue = M->getOperand(1).getImm();
    // If sreg is only used in the AND instruction, fold the immediate into
    // the AND and erase the move.
    if (!ReadsSreg && Op2.isKill()) {
      A->getOperand(2).ChangeToImmediate(MaskValue);
      M->eraseFromParent();
    }
  } else if (Op2.isImm()) {
    MaskValue = Op2.getImm();
  } else {
    llvm_unreachable("Op2 must be register or immediate");
  }

  // Invert the mask for S_ANDN2.
  assert(MaskValue == 0 || MaskValue == -1);
  if (A->getOpcode() == AndN2)
    MaskValue = ~MaskValue;

  if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC) &&
      MI.killsRegister(CondReg, TRI))
    A->eraseFromParent();

  bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
  if (SReg == ExecReg) {
    // EXEC is updated directly.
    if (IsVCCZ) {
      MI.eraseFromParent();
      return true;
    }
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else if (IsVCCZ && MaskValue == 0) {
    // Will always branch.
    // Remove all successors shadowed by the new unconditional branch.
    MachineBasicBlock *Parent = MI.getParent();
    SmallVector<MachineInstr *, 4> ToRemove;
    bool Found = false;
    for (MachineInstr &Term : Parent->terminators()) {
      if (Found) {
        if (Term.isBranch())
          ToRemove.push_back(&Term);
      } else {
        Found = Term.isIdenticalTo(MI);
      }
    }
    assert(Found && "conditional branch is not terminator");
    for (MachineInstr *BranchMI : ToRemove) {
      MachineOperand &Dst = BranchMI->getOperand(0);
      assert(Dst.isMBB() && "destination is not basic block");
      Parent->removeSuccessor(Dst.getMBB());
      BranchMI->eraseFromParent();
    }

    if (MachineBasicBlock *Succ = Parent->getFallThrough())
      Parent->removeSuccessor(Succ);

    // Rewrite to an unconditional branch.
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else if (!IsVCCZ && MaskValue == 0) {
    // Will never branch.
    MachineOperand &Dst = MI.getOperand(0);
    assert(Dst.isMBB() && "destination is not basic block");
    MI.getParent()->removeSuccessor(Dst.getMBB());
    MI.eraseFromParent();
    return true;
  } else if (MaskValue == -1) {
    // Depends only on EXEC.
    MI.setDesc(
        TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ));
  }

  MI.RemoveOperand(MI.findRegisterUseOperandIdx(CondReg, false /*Kill*/, TRI));
  MI.addImplicitDefUseOperands(*MBB.getParent());

  return true;
}

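// Remove the second of a pair of identical S_SET_GPR_IDX_ON instructions.
// An illustrative sequence (hand-written; register numbers are hypothetical):
//   s_set_gpr_idx_on s2, gpr_idx(SRC0)   <-- First
//   v_mov_b32_e32 v0, v4
//   s_set_gpr_idx_off
//   s_set_gpr_idx_on s2, gpr_idx(SRC0)   <-- MI, identical to First
//   v_mov_b32_e32 v1, v5
// Here the S_SET_GPR_IDX_OFF and the second S_SET_GPR_IDX_ON can both be
// erased, because nothing in between writes M0 or the index register, and
// the intervening vector accesses are indirect moves in the same mode.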
bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
                                       MachineInstr &MI) const {
  MachineBasicBlock &MBB = *MI.getParent();
  const MachineFunction &MF = *MBB.getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  Register IdxReg = Idx->isReg() ? Idx->getReg() : Register();
  SmallVector<MachineInstr *, 4> ToRemove;
  bool IdxOn = true;

  if (!MI.isIdenticalTo(First))
    return false;

  // Scan the range between the two identical S_SET_GPR_IDX_ON instructions
  // and make sure the second one is indeed redundant.
  for (MachineBasicBlock::iterator I = std::next(First.getIterator()),
                                   E = MI.getIterator();
       I != E; ++I) {
    switch (I->getOpcode()) {
    case AMDGPU::S_SET_GPR_IDX_MODE:
      return false;
    case AMDGPU::S_SET_GPR_IDX_OFF:
      IdxOn = false;
      ToRemove.push_back(&*I);
      break;
    default:
      if (I->modifiesRegister(AMDGPU::M0, TRI))
        return false;
      if (IdxReg && I->modifiesRegister(IdxReg, TRI))
        return false;
      if (llvm::any_of(I->operands(), [&MRI, this](const MachineOperand &MO) {
            return MO.isReg() && TRI->isVectorRegister(MRI, MO.getReg());
          })) {
        // The only exception allowed here is another indirect vector move
        // with the same mode.
        if (!IdxOn ||
            !((I->getOpcode() == AMDGPU::V_MOV_B32_e32 &&
               I->hasRegisterImplicitUseOperand(AMDGPU::M0)) ||
              I->getOpcode() == AMDGPU::V_MOV_B32_indirect))
          return false;
      }
    }
  }

  MI.eraseFromParent();
  for (MachineInstr *RI : ToRemove)
    RI->eraseFromParent();
  return true;
}

bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MachineBasicBlock *EmptyMBBAtEnd = nullptr;
  bool Changed = false;

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator MBBE = MBB.getFirstTerminator();
    if (MBBE != MBB.end()) {
      MachineInstr &MI = *MBBE;
      switch (MI.getOpcode()) {
      case AMDGPU::S_CBRANCH_VCCZ:
      case AMDGPU::S_CBRANCH_VCCNZ:
        Changed |= optimizeVccBranch(MI);
        continue;
      case AMDGPU::SI_RETURN_TO_EPILOG:
        // FIXME: This is not an optimization and should be moved somewhere
        // else.
        assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());

        // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
        // because external bytecode will be appended at the end.
        if (&MBB != &MF.back() || &MI != &MBB.back()) {
          // SI_RETURN_TO_EPILOG is not the last instruction. Add an empty
          // block at the end and jump there.
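          // Lazily create a single shared empty block at the end of the
          // function; all such non-final returns branch to it.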
          if (!EmptyMBBAtEnd) {
            EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
            MF.insert(MF.end(), EmptyMBBAtEnd);
          }

          MBB.addSuccessor(EmptyMBBAtEnd);
          BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
              .addMBB(EmptyMBBAtEnd);
          MI.eraseFromParent();
          MBBE = MBB.getFirstTerminator();
        }
        break;
      default:
        break;
      }
    }

    if (!ST.hasVGPRIndexMode())
      continue;

    MachineInstr *SetGPRMI = nullptr;
    const unsigned Threshold = 20;
    unsigned Count = 0;
    // Scan the block for pairs of S_SET_GPR_IDX_ON instructions to see if the
    // second one is redundant. The expensive checks happen in
    // optimizeSetGPR(); the distance between the pair is capped at Threshold
    // instructions for compile-time reasons.
    for (MachineBasicBlock::iterator MBBI = MBB.begin(); MBBI != MBBE; ) {
      MachineInstr &MI = *MBBI;
      ++MBBI;

      if (Count == Threshold)
        SetGPRMI = nullptr;
      else
        ++Count;

      if (MI.getOpcode() != AMDGPU::S_SET_GPR_IDX_ON)
        continue;

      Count = 0;
      if (!SetGPRMI) {
        SetGPRMI = &MI;
        continue;
      }

      if (optimizeSetGPR(*SetGPRMI, MI))
        Changed = true;
      else
        SetGPRMI = &MI;
    }
  }

  return Changed;
}