//===-- SIPreEmitPeephole.cpp ------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass performs the peephole optimizations before code emission.
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/Support/BranchProbability.h"

using namespace llvm;

#define DEBUG_TYPE "si-pre-emit-peephole"

namespace {

/// Implementation shared by the legacy and new-PM pass wrappers. Holds the
/// subtarget's instruction/register info for the duration of one run().
class SIPreEmitPeephole {
private:
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;

  /// Rewrite S_CBRANCH_VCC[N]Z fed by "vcc = S_AND/S_ANDN2 exec, mask" into a
  /// simpler branch (EXEC-based, unconditional, or removed) when possible.
  bool optimizeVccBranch(MachineInstr &MI) const;
  /// Erase a redundant second S_SET_GPR_IDX_ON (\p MI) that is identical to
  /// \p First, along with any intervening S_SET_GPR_IDX_OFF.
  bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const;
  /// Analyze the terminators of \p SrcMBB; on success fill in the taken and
  /// not-taken destination blocks and return true.
  bool getBlockDestinations(MachineBasicBlock &SrcMBB,
                            MachineBasicBlock *&TrueMBB,
                            MachineBasicBlock *&FalseMBB,
                            SmallVectorImpl<MachineOperand> &Cond);
  /// Return true if the execz skip \p Branch over blocks [From, To) must be
  /// kept for correctness or profitability.
  bool mustRetainExeczBranch(const MachineInstr &Branch,
                             const MachineBasicBlock &From,
                             const MachineBasicBlock &To) const;
  /// Try to delete the S_CBRANCH_EXECZ \p MI; returns true if removed.
  bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);

public:
  bool run(MachineFunction &MF);
};

/// Legacy pass-manager wrapper; delegates to SIPreEmitPeephole::run().
class SIPreEmitPeepholeLegacy : public MachineFunctionPass {
public:
  static char ID;

  SIPreEmitPeepholeLegacy() : MachineFunctionPass(ID) {
    initializeSIPreEmitPeepholeLegacyPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override {
    return SIPreEmitPeephole().run(MF);
  }
};

} // End anonymous namespace.
INITIALIZE_PASS(SIPreEmitPeepholeLegacy, DEBUG_TYPE,
                "SI peephole optimizations", false, false)

char SIPreEmitPeepholeLegacy::ID = 0;

char &llvm::SIPreEmitPeepholeID = SIPreEmitPeepholeLegacy::ID;

bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
  // Match:
  // sreg = -1 or 0
  // vcc = S_AND_B64 exec, sreg or S_ANDN2_B64 exec, sreg
  // S_CBRANCH_VCC[N]Z
  // =>
  // S_CBRANCH_EXEC[N]Z
  // We end up with this pattern sometimes after basic block placement.
  // It happens while combining a block which assigns -1 or 0 to a saved mask
  // and another block which consumes that saved mask and then a branch.
  //
  // While searching this also performs the following substitution:
  // vcc = V_CMP
  // vcc = S_AND exec, vcc
  // S_CBRANCH_VCC[N]Z
  // =>
  // vcc = V_CMP
  // S_CBRANCH_VCC[N]Z

  bool Changed = false;
  MachineBasicBlock &MBB = *MI.getParent();
  const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
  const bool IsWave32 = ST.isWave32();
  // Pick the 32- or 64-bit forms of the registers/opcodes to match on.
  const unsigned CondReg = TRI->getVCC();
  const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
  const unsigned AndN2 = IsWave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
  const unsigned Mov = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;

  // Walk backwards from the branch looking for the S_AND/S_ANDN2 that
  // defines VCC. Give up if EXEC is modified in between, or after a few
  // instructions (compile-time limit).
  MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
                                      E = MBB.rend();
  bool ReadsCond = false;
  unsigned Threshold = 5;
  for (++A; A != E; ++A) {
    if (!--Threshold)
      return false;
    if (A->modifiesRegister(ExecReg, TRI))
      return false;
    if (A->modifiesRegister(CondReg, TRI)) {
      // Only a full definition of VCC by S_AND/S_ANDN2 is a candidate; any
      // other clobber of VCC blocks the transformation.
      if (!A->definesRegister(CondReg, TRI) ||
          (A->getOpcode() != And && A->getOpcode() != AndN2))
        return false;
      break;
    }
    ReadsCond |= A->readsRegister(CondReg, TRI);
  }
  if (A == E)
    return false;

  // Canonicalize so that EXEC is operand 1 of the AND, commuting if needed.
  MachineOperand &Op1 = A->getOperand(1);
  MachineOperand &Op2 = A->getOperand(2);
  if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) {
    TII->commuteInstruction(*A);
    Changed = true;
  }
  if (Op1.getReg() != ExecReg)
    return Changed;
  // Only all-ones or all-zeros masks are foldable.
  if (Op2.isImm() && !(Op2.getImm() == -1 || Op2.getImm() == 0))
    return Changed;

  int64_t MaskValue = 0;
  Register SReg;
  if (Op2.isReg()) {
    // The mask is a register: scan further back for its definition.
    SReg = Op2.getReg();
    auto M = std::next(A);
    bool ReadsSreg = false;
    bool ModifiesExec = false;
    for (; M != E; ++M) {
      if (M->definesRegister(SReg, TRI))
        break;
      if (M->modifiesRegister(SReg, TRI))
        return Changed;
      ReadsSreg |= M->readsRegister(SReg, TRI);
      ModifiesExec |= M->modifiesRegister(ExecReg, TRI);
    }
    if (M == E)
      return Changed;
    // If SReg is VCC and SReg definition is a VALU comparison.
    // This means S_AND with EXEC is not required.
    // Erase the S_AND and return.
    // Note: isVOPC is used instead of isCompare to catch V_CMP_CLASS
    if (A->getOpcode() == And && SReg == CondReg && !ModifiesExec &&
        TII->isVOPC(*M)) {
      A->eraseFromParent();
      return true;
    }
    // Otherwise the definition must be a move of -1 or 0.
    if (!M->isMoveImmediate() || !M->getOperand(1).isImm() ||
        (M->getOperand(1).getImm() != -1 && M->getOperand(1).getImm() != 0))
      return Changed;
    MaskValue = M->getOperand(1).getImm();
    // First if sreg is only used in the AND instruction fold the immediate
    // into the AND.
    if (!ReadsSreg && Op2.isKill()) {
      A->getOperand(2).ChangeToImmediate(MaskValue);
      M->eraseFromParent();
    }
  } else if (Op2.isImm()) {
    MaskValue = Op2.getImm();
  } else {
    llvm_unreachable("Op2 must be register or immediate");
  }

  // Invert mask for s_andn2
  assert(MaskValue == 0 || MaskValue == -1);
  if (A->getOpcode() == AndN2)
    MaskValue = ~MaskValue;

  // The AND itself can be removed if nothing between it and the branch reads
  // VCC and its SCC def is dead; if the branch still needs VCC live past it,
  // materialize the equivalent value with a MOV first.
  if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC, /*TRI=*/nullptr)) {
    if (!MI.killsRegister(CondReg, TRI)) {
      // Replace AND with MOV
      if (MaskValue == 0) {
        BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
            .addImm(0);
      } else {
        BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
            .addReg(ExecReg);
      }
    }
    // Remove AND instruction
    A->eraseFromParent();
  }

  // Now rewrite the branch itself based on what the mask tells us.
  bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
  if (SReg == ExecReg) {
    // EXEC is updated directly
    if (IsVCCZ) {
      MI.eraseFromParent();
      return true;
    }
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else if (IsVCCZ && MaskValue == 0) {
    // Will always branch
    // Remove all successors shadowed by new unconditional branch
    MachineBasicBlock *Parent = MI.getParent();
    SmallVector<MachineInstr *, 4> ToRemove;
    bool Found = false;
    for (MachineInstr &Term : Parent->terminators()) {
      if (Found) {
        if (Term.isBranch())
          ToRemove.push_back(&Term);
      } else {
        Found = Term.isIdenticalTo(MI);
      }
    }
    assert(Found && "conditional branch is not terminator");
    for (auto *BranchMI : ToRemove) {
      MachineOperand &Dst = BranchMI->getOperand(0);
      assert(Dst.isMBB() && "destination is not basic block");
      Parent->removeSuccessor(Dst.getMBB());
      BranchMI->eraseFromParent();
    }

    // The fall-through successor is also unreachable once the branch is
    // unconditional.
    if (MachineBasicBlock *Succ = Parent->getFallThrough()) {
      Parent->removeSuccessor(Succ);
    }

    // Rewrite to unconditional branch
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else if (!IsVCCZ && MaskValue == 0) {
    // Will never branch
    MachineOperand &Dst = MI.getOperand(0);
    assert(Dst.isMBB() && "destination is not basic block");
    MI.getParent()->removeSuccessor(Dst.getMBB());
    MI.eraseFromParent();
    return true;
  } else if (MaskValue == -1) {
    // Depends only on EXEC
    MI.setDesc(
        TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ));
  }

  // Drop the (now stale) explicit VCC use and rebuild implicit operands for
  // the rewritten opcode.
  MI.removeOperand(MI.findRegisterUseOperandIdx(CondReg, TRI, false /*Kill*/));
  MI.addImplicitDefUseOperands(*MBB.getParent());

  return true;
}

bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
                                       MachineInstr &MI) const {
  MachineBasicBlock &MBB = *MI.getParent();
  const MachineFunction &MF = *MBB.getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  Register IdxReg = Idx->isReg() ? Idx->getReg() : Register();
  SmallVector<MachineInstr *, 4> ToRemove;
  bool IdxOn = true;

  // Only a byte-for-byte identical S_SET_GPR_IDX_ON is redundant.
  if (!MI.isIdenticalTo(First))
    return false;

  // Scan back to find an identical S_SET_GPR_IDX_ON
  for (MachineBasicBlock::instr_iterator I = std::next(First.getIterator()),
                                         E = MI.getIterator();
       I != E; ++I) {
    if (I->isBundle())
      continue;
    switch (I->getOpcode()) {
    case AMDGPU::S_SET_GPR_IDX_MODE:
      return false;
    case AMDGPU::S_SET_GPR_IDX_OFF:
      // The OFF becomes redundant too if we can drop the second ON.
      IdxOn = false;
      ToRemove.push_back(&*I);
      break;
    default:
      // Anything that clobbers M0 or the index register invalidates the
      // first ON's state.
      if (I->modifiesRegister(AMDGPU::M0, TRI))
        return false;
      if (IdxReg && I->modifiesRegister(IdxReg, TRI))
        return false;
      if (llvm::any_of(I->operands(),
                       [&MRI, this](const MachineOperand &MO) {
                         return MO.isReg() &&
                                TRI->isVectorRegister(MRI, MO.getReg());
                       })) {
        // The only exception allowed here is another indirect vector move
        // with the same mode.
        if (!IdxOn || !(I->getOpcode() == AMDGPU::V_MOV_B32_indirect_write ||
                        I->getOpcode() == AMDGPU::V_MOV_B32_indirect_read))
          return false;
      }
    }
  }

  // Use bundle-aware erase: these instructions may live inside bundles.
  MI.eraseFromBundle();
  for (MachineInstr *RI : ToRemove)
    RI->eraseFromBundle();
  return true;
}

bool SIPreEmitPeephole::getBlockDestinations(
    MachineBasicBlock &SrcMBB, MachineBasicBlock *&TrueMBB,
    MachineBasicBlock *&FalseMBB, SmallVectorImpl<MachineOperand> &Cond) {
  // analyzeBranch returns true when it cannot analyze the terminators.
  if (TII->analyzeBranch(SrcMBB, TrueMBB, FalseMBB, Cond))
    return false;

  // A null FalseMBB means fall-through: use the layout successor.
  if (!FalseMBB)
    FalseMBB = SrcMBB.getNextNode();

  return true;
}

namespace {
/// Cost model deciding whether always executing the 'then' block is cheaper
/// than keeping the s_cbranch_execz skip, based on the branch probability and
/// scheduling-model instruction latencies.
class BranchWeightCostModel {
  const SIInstrInfo &TII;
  const TargetSchedModel &SchedModel;
  BranchProbability BranchProb;
  static constexpr uint64_t BranchNotTakenCost = 1;
  uint64_t BranchTakenCost;
  uint64_t ThenCyclesCost = 0;

public:
  BranchWeightCostModel(const SIInstrInfo &TII, const MachineInstr &Branch,
                        const MachineBasicBlock &Succ)
      : TII(TII), SchedModel(TII.getSchedModel()) {
    const MachineBasicBlock &Head = *Branch.getParent();
    const auto *FromIt = find(Head.successors(), &Succ);
    assert(FromIt != Head.succ_end());

    BranchProb = Head.getSuccProbability(FromIt);
    // Treat an unknown probability as never-skipping (conservative).
    if (BranchProb.isUnknown())
      BranchProb = BranchProbability::getZero();
    BranchTakenCost = SchedModel.computeInstrLatency(&Branch);
  }

  /// Accumulate \p MI's latency into the running then-block cost and return
  /// whether removing the skip is still profitable.
  bool isProfitable(const MachineInstr &MI) {
    // Waitcnt latencies are not modeled; bail out rather than guess.
    if (TII.isWaitcnt(MI.getOpcode()))
      return false;

    ThenCyclesCost += SchedModel.computeInstrLatency(&MI);

    // Consider `P = N/D` to be the probability of execz being false (skipping
    // the then-block) The transformation is profitable if always executing the
    // 'then' block is cheaper than executing sometimes 'then' and always
    // executing s_cbranch_execz:
    // * ThenCost <= P*ThenCost + (1-P)*BranchTakenCost + P*BranchNotTakenCost
    // * (1-P) * ThenCost <= (1-P)*BranchTakenCost + P*BranchNotTakenCost
    // * (D-N)/D * ThenCost <= (D-N)/D * BranchTakenCost + N/D *
    // BranchNotTakenCost
    uint64_t Numerator = BranchProb.getNumerator();
    uint64_t Denominator = BranchProb.getDenominator();
    return (Denominator - Numerator) * ThenCyclesCost <=
           ((Denominator - Numerator) * BranchTakenCost +
            Numerator * BranchNotTakenCost);
  }
};

bool SIPreEmitPeephole::mustRetainExeczBranch(
    const MachineInstr &Branch, const MachineBasicBlock &From,
    const MachineBasicBlock &To) const {
  assert(is_contained(Branch.getParent()->successors(), &From));
  BranchWeightCostModel CostModel{*TII, Branch, From};

  // Walk the blocks [From, To) in layout order.
  const MachineFunction *MF = From.getParent();
  for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
       MBBI != End && MBBI != ToI; ++MBBI) {
    const MachineBasicBlock &MBB = *MBBI;

    for (const MachineInstr &MI : MBB) {
      // When a uniform loop is inside non-uniform control flow, the branch
      // leaving the loop might never be taken when EXEC = 0.
      // Hence we should retain cbranch out of the loop lest it become infinite.
      if (MI.isConditionalBranch())
        return true;

      // Unconditional branches that are not simple fall-throughs leave the
      // region being skipped; keep the skip.
      if (MI.isUnconditionalBranch() &&
          TII->getBranchDestBlock(MI) != MBB.getNextNode())
        return true;

      if (MI.isMetaInstruction())
        continue;

      // Instructions with side effects when EXEC = 0 (e.g. memory ops) make
      // removing the skip unsound.
      if (TII->hasUnwantedEffectsWhenEXECEmpty(MI))
        return true;

      if (!CostModel.isProfitable(MI))
        return true;
    }
  }

  return false;
}
} // namespace

// Returns true if the skip branch instruction is removed.
bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
                                          MachineBasicBlock &SrcMBB) {

  // The cost model needs per-instruction latencies from the sched model.
  if (!TII->getSchedModel().hasInstrSchedModel())
    return false;

  MachineBasicBlock *TrueMBB = nullptr;
  MachineBasicBlock *FalseMBB = nullptr;
  SmallVector<MachineOperand, 1> Cond;

  if (!getBlockDestinations(SrcMBB, TrueMBB, FalseMBB, Cond))
    return false;

  // Consider only the forward branches.
  if (SrcMBB.getNumber() >= TrueMBB->getNumber())
    return false;

  // Consider only when it is legal and profitable
  if (mustRetainExeczBranch(MI, *FalseMBB, *TrueMBB))
    return false;

  LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI);
  MI.eraseFromParent();
  SrcMBB.removeSuccessor(TrueMBB);

  return true;
}

PreservedAnalyses
llvm::SIPreEmitPeepholePass::run(MachineFunction &MF,
                                 MachineFunctionAnalysisManager &MFAM) {
  if (!SIPreEmitPeephole().run(MF))
    return PreservedAnalyses::all();

  return getMachineFunctionPassPreservedAnalyses();
}

bool SIPreEmitPeephole::run(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  bool Changed = false;

  // Renumber so block numbers reflect layout order (used by the forward-
  // branch check in removeExeczBranch).
  MF.RenumberBlocks();

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator TermI = MBB.getFirstTerminator();
    // Check first terminator for branches to optimize
    if (TermI != MBB.end()) {
      MachineInstr &MI = *TermI;
      switch (MI.getOpcode()) {
      case AMDGPU::S_CBRANCH_VCCZ:
      case AMDGPU::S_CBRANCH_VCCNZ:
        Changed |= optimizeVccBranch(MI);
        break;
      case AMDGPU::S_CBRANCH_EXECZ:
        Changed |= removeExeczBranch(MI, MBB);
        break;
      }
    }

    if (!ST.hasVGPRIndexMode())
      continue;

    MachineInstr *SetGPRMI = nullptr;
    const unsigned Threshold = 20;
    unsigned Count = 0;
    // Scan the block for two S_SET_GPR_IDX_ON instructions to see if a
    // second is not needed. Do expensive checks in the optimizeSetGPR()
    // and limit the distance to 20 instructions for compile time purposes.
    // Note: this needs to work on bundles as S_SET_GPR_IDX* instructions
    // may be bundled with the instructions they modify.
    for (auto &MI : make_early_inc_range(MBB.instrs())) {
      if (Count == Threshold)
        SetGPRMI = nullptr;
      else
        ++Count;

      if (MI.getOpcode() != AMDGPU::S_SET_GPR_IDX_ON)
        continue;

      Count = 0;
      if (!SetGPRMI) {
        SetGPRMI = &MI;
        continue;
      }

      if (optimizeSetGPR(*SetGPRMI, MI))
        Changed = true;
      else
        SetGPRMI = &MI;
    }
  }

  return Changed;
}