//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass lowers the pseudo control flow instructions to real
/// machine instructions.
///
/// All control flow is handled using predicated instructions and
/// a predicate stack. Each Scalar ALU controls the operations of 64 Vector
/// ALUs. The Scalar ALU can update the predicate for any of the Vector ALUs
/// by writing to the 64-bit EXEC register (each bit corresponds to a
/// single vector ALU). Typically, for predicates, a vector ALU will write
/// to its bit of the VCC register (like EXEC, VCC is 64-bits, one for each
/// Vector ALU) and then the Scalar ALU will AND the VCC register with the
/// EXEC to update the predicates.
///
/// For example:
/// %vcc = V_CMP_GT_F32 %vgpr1, %vgpr2
/// %sgpr0 = SI_IF %vcc
/// %vgpr0 = V_ADD_F32 %vgpr0, %vgpr0
/// %sgpr0 = SI_ELSE %sgpr0
/// %vgpr0 = V_SUB_F32 %vgpr0, %vgpr0
/// SI_END_CF %sgpr0
///
/// becomes:
///
/// %sgpr0 = S_AND_SAVEEXEC_B64 %vcc  // Save and update the exec mask
/// %sgpr0 = S_XOR_B64 %sgpr0, %exec  // Clear live bits from saved exec mask
/// S_CBRANCH_EXECZ label0            // This instruction is an optional
///                                   // optimization which allows us to
///                                   // branch if all the bits of
///                                   // EXEC are zero.
/// %vgpr0 = V_ADD_F32 %vgpr0, %vgpr0 // Do the IF block of the branch
///
/// label0:
/// %sgpr0 = S_OR_SAVEEXEC_B64 %sgpr0 // Restore the exec mask for the Then
///                                   // block
/// %exec = S_XOR_B64 %sgpr0, %exec   // Update the exec mask
/// S_CBRANCH_EXECZ label1            // Use our branch optimization
///                                   // instruction again.
/// %vgpr0 = V_SUB_F32 %vgpr0, %vgpr0 // Do the THEN block
/// label1:
/// %exec = S_OR_B64 %exec, %sgpr0    // Re-enable saved exec mask bits
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"

using namespace llvm;

#define DEBUG_TYPE "si-lower-control-flow"

static cl::opt<bool>
RemoveRedundantEndcf("amdgpu-remove-redundant-endcf",
    cl::init(true), cl::ReallyHidden);

namespace {

class SILowerControlFlow : public MachineFunctionPass {
private:
  const SIRegisterInfo *TRI = nullptr;
  const SIInstrInfo *TII = nullptr;
  LiveIntervals *LIS = nullptr;
  LiveVariables *LV = nullptr;
  MachineDominatorTree *MDT = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  SetVector<MachineInstr*> LoweredEndCf;
  DenseSet<Register> LoweredIf;
  SmallSet<MachineBasicBlock *, 4> KillBlocks;

  const TargetRegisterClass *BoolRC = nullptr;
  unsigned AndOpc;
  unsigned OrOpc;
  unsigned XorOpc;
  unsigned MovTermOpc;
  unsigned Andn2TermOpc;
  unsigned XorTermrOpc;
  unsigned OrTermrOpc;
  unsigned OrSaveExecOpc;
  unsigned Exec;

  bool hasKill(const MachineBasicBlock *Begin, const MachineBasicBlock *End);

  void emitIf(MachineInstr &MI);
  void emitElse(MachineInstr &MI);
  void emitIfBreak(MachineInstr &MI);
  void emitLoop(MachineInstr &MI);

  MachineBasicBlock *emitEndCf(MachineInstr &MI);

  void lowerInitExec(MachineBasicBlock *MBB, MachineInstr &MI);

  void findMaskOperands(MachineInstr &MI, unsigned OpNo,
                        SmallVectorImpl<MachineOperand> &Src) const;

  void combineMasks(MachineInstr &MI);

  bool removeMBBifRedundant(MachineBasicBlock &MBB);

  MachineBasicBlock *process(MachineInstr &MI);

  // Skip to the next instruction, ignoring debug instructions, and trivial
  // block boundaries (blocks that have one (typically fallthrough) successor,
  // and the successor has one predecessor).
  MachineBasicBlock::iterator
  skipIgnoreExecInstsTrivialSucc(MachineBasicBlock &MBB,
                                 MachineBasicBlock::iterator It) const;

  /// Find the insertion point for a new conditional branch.
  MachineBasicBlock::iterator
  skipToUncondBrOrEnd(MachineBasicBlock &MBB,
                      MachineBasicBlock::iterator I) const {
    assert(I->isTerminator());

    // FIXME: What if we had multiple pre-existing conditional branches?
    MachineBasicBlock::iterator End = MBB.end();
    while (I != End && !I->isUnconditionalBranch())
      ++I;
    return I;
  }

  // Remove redundant SI_END_CF instructions.
  void optimizeEndCf();

public:
  static char ID;

  SILowerControlFlow() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI Lower control flow pseudo instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // Should preserve the same set that TwoAddressInstructions does.
    AU.addPreserved<MachineDominatorTree>();
    AU.addPreserved<SlotIndexes>();
    AU.addPreserved<LiveIntervals>();
    AU.addPreservedID(LiveVariablesID);
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace

char SILowerControlFlow::ID = 0;

INITIALIZE_PASS(SILowerControlFlow, DEBUG_TYPE,
                "SI lower control flow", false, false)

static void setImpSCCDefDead(MachineInstr &MI, bool IsDead) {
  MachineOperand &ImpDefSCC = MI.getOperand(3);
  assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef());

  ImpDefSCC.setIsDead(IsDead);
}

char &llvm::SILowerControlFlowID = SILowerControlFlow::ID;

bool SILowerControlFlow::hasKill(const MachineBasicBlock *Begin,
                                 const MachineBasicBlock *End) {
  DenseSet<const MachineBasicBlock*> Visited;
  SmallVector<MachineBasicBlock *, 4> Worklist(Begin->successors());

  while (!Worklist.empty()) {
    MachineBasicBlock *MBB = Worklist.pop_back_val();

    if (MBB == End || !Visited.insert(MBB).second)
      continue;
    if (KillBlocks.contains(MBB))
      return true;

    Worklist.append(MBB->succ_begin(), MBB->succ_end());
  }

  return false;
}

static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI) {
  Register SaveExecReg = MI.getOperand(0).getReg();
  auto U = MRI->use_instr_nodbg_begin(SaveExecReg);

  if (U == MRI->use_instr_nodbg_end() ||
      std::next(U) != MRI->use_instr_nodbg_end() ||
      U->getOpcode() != AMDGPU::SI_END_CF)
    return false;

  return true;
}

void SILowerControlFlow::emitIf(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock::iterator I(&MI);
  Register SaveExecReg = MI.getOperand(0).getReg();
  MachineOperand &Cond = MI.getOperand(1);
  assert(Cond.getSubReg() == AMDGPU::NoSubRegister);

  MachineOperand &ImpDefSCC = MI.getOperand(4);
  assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef());

  // If there is only one use of the save exec register and that use is
  // SI_END_CF, we can optimize SI_IF by returning the full saved exec mask
  // instead of just the cleared bits.
  bool SimpleIf = isSimpleIf(MI, MRI);

  if (SimpleIf) {
    // Check for SI_KILL_*_TERMINATOR on the path from if to endif.
    // If there is any such terminator, the simplification is not safe.
    auto UseMI = MRI->use_instr_nodbg_begin(SaveExecReg);
    SimpleIf = !hasKill(MI.getParent(), UseMI->getParent());
  }

  // Add an implicit def of exec to discourage scheduling VALU after this which
  // will interfere with trying to form s_and_saveexec_b64 later.
  Register CopyReg = SimpleIf ?
      SaveExecReg : MRI->createVirtualRegister(BoolRC);
  MachineInstr *CopyExec =
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), CopyReg)
          .addReg(Exec)
          .addReg(Exec, RegState::ImplicitDefine);
  LoweredIf.insert(CopyReg);

  Register Tmp = MRI->createVirtualRegister(BoolRC);

  MachineInstr *And =
      BuildMI(MBB, I, DL, TII->get(AndOpc), Tmp)
          .addReg(CopyReg)
          .add(Cond);
  if (LV)
    LV->replaceKillInstruction(Cond.getReg(), MI, *And);

  setImpSCCDefDead(*And, true);

  MachineInstr *Xor = nullptr;
  if (!SimpleIf) {
    Xor =
        BuildMI(MBB, I, DL, TII->get(XorOpc), SaveExecReg)
            .addReg(Tmp)
            .addReg(CopyReg);
    setImpSCCDefDead(*Xor, ImpDefSCC.isDead());
  }

  // Use a copy that is a terminator to get correct spill code placement with
  // fast regalloc.
  MachineInstr *SetExec =
      BuildMI(MBB, I, DL, TII->get(MovTermOpc), Exec)
          .addReg(Tmp, RegState::Kill);
  if (LV)
    LV->getVarInfo(Tmp).Kills.push_back(SetExec);

  // Skip ahead to the unconditional branch in case there are other terminators
  // present.
  I = skipToUncondBrOrEnd(MBB, I);

  // Insert the S_CBRANCH_EXECZ instruction which will be optimized later
  // during SIRemoveShortExecBranches.
  MachineInstr *NewBr = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
                            .add(MI.getOperand(2));

  if (!LIS) {
    MI.eraseFromParent();
    return;
  }

  LIS->InsertMachineInstrInMaps(*CopyExec);

  // Replace with the AND so we don't need to fix the live interval for the
  // condition register.
  LIS->ReplaceMachineInstrInMaps(MI, *And);

  if (!SimpleIf)
    LIS->InsertMachineInstrInMaps(*Xor);
  LIS->InsertMachineInstrInMaps(*SetExec);
  LIS->InsertMachineInstrInMaps(*NewBr);

  LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
  MI.eraseFromParent();

  // FIXME: Is there a better way of adjusting the liveness? It shouldn't be
  // hard to add another def here but I'm not sure how to correctly update the
  // valno.
  LIS->removeInterval(SaveExecReg);
  LIS->createAndComputeVirtRegInterval(SaveExecReg);
  LIS->createAndComputeVirtRegInterval(Tmp);
  if (!SimpleIf)
    LIS->createAndComputeVirtRegInterval(CopyReg);
}

void SILowerControlFlow::emitElse(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  Register DstReg = MI.getOperand(0).getReg();

  MachineBasicBlock::iterator Start = MBB.begin();

  // This must be inserted before phis and any spill code inserted before the
  // else.
  Register SaveReg = MRI->createVirtualRegister(BoolRC);
  MachineInstr *OrSaveExec =
      BuildMI(MBB, Start, DL, TII->get(OrSaveExecOpc), SaveReg)
          .add(MI.getOperand(1)); // Saved EXEC
  if (LV)
    LV->replaceKillInstruction(MI.getOperand(1).getReg(), MI, *OrSaveExec);

  MachineBasicBlock *DestBB = MI.getOperand(2).getMBB();

  MachineBasicBlock::iterator ElsePt(MI);

  // This accounts for any modification of the EXEC mask within the block and
  // can be optimized out pre-RA when not required.
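  // Put differently (assuming no intervening EXEC writes): DstReg ends up
  // holding the lanes that ran the 'then' side (SaveReg restricted to the
  // current EXEC), and the XOR terminator below flips EXEC to the remaining
  // lanes so that only the 'else' side executes.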
  MachineInstr *And = BuildMI(MBB, ElsePt, DL, TII->get(AndOpc), DstReg)
                          .addReg(Exec)
                          .addReg(SaveReg);

  if (LIS)
    LIS->InsertMachineInstrInMaps(*And);

  MachineInstr *Xor =
      BuildMI(MBB, ElsePt, DL, TII->get(XorTermrOpc), Exec)
          .addReg(Exec)
          .addReg(DstReg);

  // Skip ahead to the unconditional branch in case there are other terminators
  // present.
  ElsePt = skipToUncondBrOrEnd(MBB, ElsePt);

  MachineInstr *Branch =
      BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
          .addMBB(DestBB);

  if (!LIS) {
    MI.eraseFromParent();
    return;
  }

  LIS->RemoveMachineInstrFromMaps(MI);
  MI.eraseFromParent();

  LIS->InsertMachineInstrInMaps(*OrSaveExec);

  LIS->InsertMachineInstrInMaps(*Xor);
  LIS->InsertMachineInstrInMaps(*Branch);

  LIS->removeInterval(DstReg);
  LIS->createAndComputeVirtRegInterval(DstReg);
  LIS->createAndComputeVirtRegInterval(SaveReg);

  // Let this be recomputed.
  LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
}

void SILowerControlFlow::emitIfBreak(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  auto Dst = MI.getOperand(0).getReg();

  // Skip ANDing with exec if the break condition is already masked by exec
  // because it is a V_CMP in the same basic block. (We know the break
  // condition operand was an i1 in IR, so if it is a VALU instruction it must
  // be one with a carry-out.)
  bool SkipAnding = false;
  if (MI.getOperand(1).isReg()) {
    if (MachineInstr *Def = MRI->getUniqueVRegDef(MI.getOperand(1).getReg())) {
      SkipAnding = Def->getParent() == MI.getParent()
          && SIInstrInfo::isVALU(*Def);
    }
  }

  // AND the break condition operand with exec, then OR that into the "loop
  // exit" mask.
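  // I.e. Dst = (break condition & EXEC) | existing exit mask, accumulating the
  // lanes that want to leave the loop into the mask consumed by the matching
  // SI_LOOP.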
  MachineInstr *And = nullptr, *Or = nullptr;
  if (!SkipAnding) {
    Register AndReg = MRI->createVirtualRegister(BoolRC);
    And = BuildMI(MBB, &MI, DL, TII->get(AndOpc), AndReg)
              .addReg(Exec)
              .add(MI.getOperand(1));
    if (LV)
      LV->replaceKillInstruction(MI.getOperand(1).getReg(), MI, *And);
    Or = BuildMI(MBB, &MI, DL, TII->get(OrOpc), Dst)
             .addReg(AndReg)
             .add(MI.getOperand(2));
    if (LIS)
      LIS->createAndComputeVirtRegInterval(AndReg);
  } else {
    Or = BuildMI(MBB, &MI, DL, TII->get(OrOpc), Dst)
             .add(MI.getOperand(1))
             .add(MI.getOperand(2));
    if (LV)
      LV->replaceKillInstruction(MI.getOperand(1).getReg(), MI, *Or);
  }
  if (LV)
    LV->replaceKillInstruction(MI.getOperand(2).getReg(), MI, *Or);

  if (LIS) {
    if (And)
      LIS->InsertMachineInstrInMaps(*And);
    LIS->ReplaceMachineInstrInMaps(MI, *Or);
  }

  MI.eraseFromParent();
}

void SILowerControlFlow::emitLoop(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  MachineInstr *AndN2 =
      BuildMI(MBB, &MI, DL, TII->get(Andn2TermOpc), Exec)
          .addReg(Exec)
          .add(MI.getOperand(0));

  auto BranchPt = skipToUncondBrOrEnd(MBB, MI.getIterator());
  MachineInstr *Branch =
      BuildMI(MBB, BranchPt, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
          .add(MI.getOperand(1));

  if (LIS) {
    LIS->ReplaceMachineInstrInMaps(MI, *AndN2);
    LIS->InsertMachineInstrInMaps(*Branch);
  }

  MI.eraseFromParent();
}

MachineBasicBlock::iterator
SILowerControlFlow::skipIgnoreExecInstsTrivialSucc(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator It) const {

  SmallSet<const MachineBasicBlock *, 4> Visited;
  MachineBasicBlock *B = &MBB;
  do {
    if (!Visited.insert(B).second)
      return MBB.end();

    auto E = B->end();
    for ( ; It != E; ++It) {
      if (TII->mayReadEXEC(*MRI, *It))
        break;
    }

    if (It != E)
      return It;

    if (B->succ_size() != 1)
      return MBB.end();

    // If there is one trivial successor, advance to the next block.
    MachineBasicBlock *Succ = *B->succ_begin();

    It = Succ->begin();
    B = Succ;
  } while (true);
}

MachineBasicBlock *SILowerControlFlow::emitEndCf(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  MachineBasicBlock::iterator InsPt = MBB.begin();

  // If we have instructions that aren't prolog instructions, split the block
  // and emit a terminator instruction. This ensures correct spill placement.
  // FIXME: We should unconditionally split the block here.
  bool NeedBlockSplit = false;
  Register DataReg = MI.getOperand(0).getReg();
  for (MachineBasicBlock::iterator I = InsPt, E = MI.getIterator();
       I != E; ++I) {
    if (I->modifiesRegister(DataReg, TRI)) {
      NeedBlockSplit = true;
      break;
    }
  }

  unsigned Opcode = OrOpc;
  MachineBasicBlock *SplitBB = &MBB;
  if (NeedBlockSplit) {
    SplitBB = MBB.splitAt(MI, /*UpdateLiveIns*/true, LIS);
    if (MDT && SplitBB != &MBB) {
      MachineDomTreeNode *MBBNode = (*MDT)[&MBB];
      SmallVector<MachineDomTreeNode *> Children(MBBNode->begin(),
                                                 MBBNode->end());
      MachineDomTreeNode *SplitBBNode = MDT->addNewBlock(SplitBB, &MBB);
      for (MachineDomTreeNode *Child : Children)
        MDT->changeImmediateDominator(Child, SplitBBNode);
    }
    Opcode = OrTermrOpc;
    InsPt = MI;
  }

  MachineInstr *NewMI =
      BuildMI(MBB, InsPt, DL, TII->get(Opcode), Exec)
          .addReg(Exec)
          .add(MI.getOperand(0));
  if (LV)
    LV->replaceKillInstruction(MI.getOperand(0).getReg(), MI, *NewMI);

  LoweredEndCf.insert(NewMI);

  if (LIS)
    LIS->ReplaceMachineInstrInMaps(MI, *NewMI);

  MI.eraseFromParent();

  if (LIS)
    LIS->handleMove(*NewMI);
  return SplitBB;
}

// Returns replacement operands for a logical operation: either a single result
// for exec, or two operands if the source was another equivalent operation.
void SILowerControlFlow::findMaskOperands(MachineInstr &MI, unsigned OpNo,
                                          SmallVectorImpl<MachineOperand> &Src) const {
  MachineOperand &Op = MI.getOperand(OpNo);
  if (!Op.isReg() || !Op.getReg().isVirtual()) {
    Src.push_back(Op);
    return;
  }

  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
  if (!Def || Def->getParent() != MI.getParent() ||
      !(Def->isFullCopy() || (Def->getOpcode() == MI.getOpcode())))
    return;

  // Make sure we do not modify exec between def and use.
  // A copy with an implicitly defined exec inserted earlier is an exclusion;
  // it does not really modify exec.
  for (auto I = Def->getIterator(); I != MI.getIterator(); ++I)
    if (I->modifiesRegister(AMDGPU::EXEC, TRI) &&
        !(I->isCopy() && I->getOperand(0).getReg() != Exec))
      return;

  for (const auto &SrcOp : Def->explicit_operands())
    if (SrcOp.isReg() && SrcOp.isUse() &&
        (SrcOp.getReg().isVirtual() || SrcOp.getReg() == Exec))
      Src.push_back(SrcOp);
}

// Search and combine pairs of equivalent instructions, like
// S_AND_B64 x, (S_AND_B64 x, y) => S_AND_B64 x, y
// S_OR_B64 x, (S_OR_B64 x, y) => S_OR_B64 x, y
// One of the operands is the exec mask.
void SILowerControlFlow::combineMasks(MachineInstr &MI) {
  assert(MI.getNumExplicitOperands() == 3);
  SmallVector<MachineOperand, 4> Ops;
  unsigned OpToReplace = 1;
  findMaskOperands(MI, 1, Ops);
  if (Ops.size() == 1) OpToReplace = 2; // First operand can be exec or its copy
  findMaskOperands(MI, 2, Ops);
  if (Ops.size() != 3) return;

  unsigned UniqueOpndIdx;
  if (Ops[0].isIdenticalTo(Ops[1])) UniqueOpndIdx = 2;
  else if (Ops[0].isIdenticalTo(Ops[2])) UniqueOpndIdx = 1;
  else if (Ops[1].isIdenticalTo(Ops[2])) UniqueOpndIdx = 1;
  else return;

  Register Reg = MI.getOperand(OpToReplace).getReg();
  MI.RemoveOperand(OpToReplace);
  MI.addOperand(Ops[UniqueOpndIdx]);
  if (MRI->use_empty(Reg))
    MRI->getUniqueVRegDef(Reg)->eraseFromParent();
}

void SILowerControlFlow::optimizeEndCf() {
  // If the only instruction immediately following this END_CF is another
  // END_CF in the only successor, we can avoid emitting the exec mask restore
  // here.
  if (!RemoveRedundantEndcf)
    return;

  for (MachineInstr *MI : LoweredEndCf) {
    MachineBasicBlock &MBB = *MI->getParent();
    auto Next =
        skipIgnoreExecInstsTrivialSucc(MBB, std::next(MI->getIterator()));
    if (Next == MBB.end() || !LoweredEndCf.count(&*Next))
      continue;
    // Only skip the inner END_CF if the outer ENDCF belongs to an SI_IF.
    // If it belongs to an SI_ELSE then the saved mask has an inverted value.
    Register SavedExec
        = TII->getNamedOperand(*Next, AMDGPU::OpName::src1)->getReg();
    assert(SavedExec.isVirtual() && "Expected saved exec to be src1!");

    const MachineInstr *Def = MRI->getUniqueVRegDef(SavedExec);
    if (Def && LoweredIf.count(SavedExec)) {
      LLVM_DEBUG(dbgs() << "Skip redundant "; MI->dump());
      if (LIS)
        LIS->RemoveMachineInstrFromMaps(*MI);
      Register Reg;
      if (LV)
        Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
      MI->eraseFromParent();
      if (LV)
        LV->recomputeForSingleDefVirtReg(Reg);
      removeMBBifRedundant(MBB);
    }
  }
}

MachineBasicBlock *SILowerControlFlow::process(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  MachineBasicBlock::iterator I(MI);
  MachineInstr *Prev = (I != MBB.begin()) ? &*(std::prev(I)) : nullptr;

  MachineBasicBlock *SplitBB = &MBB;

  switch (MI.getOpcode()) {
  case AMDGPU::SI_IF:
    emitIf(MI);
    break;

  case AMDGPU::SI_ELSE:
    emitElse(MI);
    break;

  case AMDGPU::SI_IF_BREAK:
    emitIfBreak(MI);
    break;

  case AMDGPU::SI_LOOP:
    emitLoop(MI);
    break;

  case AMDGPU::SI_WATERFALL_LOOP:
    MI.setDesc(TII->get(AMDGPU::S_CBRANCH_EXECNZ));
    break;

  case AMDGPU::SI_END_CF:
    SplitBB = emitEndCf(MI);
    break;

  default:
    assert(false && "Attempt to process unsupported instruction");
    break;
  }

  MachineBasicBlock::iterator Next;
  for (I = Prev ?
           Prev->getIterator() : MBB.begin();
       I != MBB.end(); I = Next) {
    Next = std::next(I);
    MachineInstr &MaskMI = *I;
    switch (MaskMI.getOpcode()) {
    case AMDGPU::S_AND_B64:
    case AMDGPU::S_OR_B64:
    case AMDGPU::S_AND_B32:
    case AMDGPU::S_OR_B32:
      // Cleanup bit manipulations on the exec mask.
      combineMasks(MaskMI);
      break;
    default:
      I = MBB.end();
      break;
    }
  }

  return SplitBB;
}

void SILowerControlFlow::lowerInitExec(MachineBasicBlock *MBB,
                                       MachineInstr &MI) {
  MachineFunction &MF = *MBB->getParent();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  bool IsWave32 = ST.isWave32();

  if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
    // This should be before all vector instructions.
    BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
            TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), Exec)
        .addImm(MI.getOperand(0).getImm());
    if (LIS)
      LIS->RemoveMachineInstrFromMaps(MI);
    MI.eraseFromParent();
    return;
  }

  // Extract the thread count from an SGPR input and set EXEC accordingly.
  // Since BFM can't shift by 64, handle that case with CMP + CMOV.
  //
  // S_BFE_U32 count, input, {shift, 7}
  // S_BFM_B64 exec, count, 0
  // S_CMP_EQ_U32 count, 64
  // S_CMOV_B64 exec, -1
  Register InputReg = MI.getOperand(0).getReg();
  MachineInstr *FirstMI = &*MBB->begin();
  if (InputReg.isVirtual()) {
    MachineInstr *DefInstr = MRI->getVRegDef(InputReg);
    assert(DefInstr && DefInstr->isCopy());
    if (DefInstr->getParent() == MBB) {
      if (DefInstr != FirstMI) {
        // If `InputReg` is defined in the current block, we also need to
        // move that instruction to the beginning of the block.
        DefInstr->removeFromParent();
        MBB->insert(FirstMI, DefInstr);
        if (LIS)
          LIS->handleMove(*DefInstr);
      } else {
        // If the first instruction is the definition, move the pointer
        // past it.
        FirstMI = &*std::next(FirstMI->getIterator());
      }
    }
  }

  // Insert instruction sequence at block beginning (before vector operations).
  const DebugLoc DL = MI.getDebugLoc();
  const unsigned WavefrontSize = ST.getWavefrontSize();
  const unsigned Mask = (WavefrontSize << 1) - 1;
  Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg)
                   .addReg(InputReg)
                   .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
  if (LV)
    LV->recomputeForSingleDefVirtReg(InputReg);
  auto BfmMI =
      BuildMI(*MBB, FirstMI, DL,
              TII->get(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec)
          .addReg(CountReg)
          .addImm(0);
  auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
                   .addReg(CountReg, RegState::Kill)
                   .addImm(WavefrontSize);
  if (LV)
    LV->getVarInfo(CountReg).Kills.push_back(CmpMI);
  auto CmovMI =
      BuildMI(*MBB, FirstMI, DL,
              TII->get(IsWave32 ?
                           AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
              Exec)
          .addImm(-1);

  if (!LIS) {
    MI.eraseFromParent();
    return;
  }

  LIS->RemoveMachineInstrFromMaps(MI);
  MI.eraseFromParent();

  LIS->InsertMachineInstrInMaps(*BfeMI);
  LIS->InsertMachineInstrInMaps(*BfmMI);
  LIS->InsertMachineInstrInMaps(*CmpMI);
  LIS->InsertMachineInstrInMaps(*CmovMI);

  LIS->removeInterval(InputReg);
  LIS->createAndComputeVirtRegInterval(InputReg);
  LIS->createAndComputeVirtRegInterval(CountReg);
}

bool SILowerControlFlow::removeMBBifRedundant(MachineBasicBlock &MBB) {
  for (auto &I : MBB.instrs()) {
    if (!I.isDebugInstr() && !I.isUnconditionalBranch())
      return false;
  }

  assert(MBB.succ_size() == 1 && "MBB has more than one successor");

  MachineBasicBlock *Succ = *MBB.succ_begin();
  MachineBasicBlock *FallThrough = nullptr;

  while (!MBB.predecessors().empty()) {
    MachineBasicBlock *P = *MBB.pred_begin();
    if (P->getFallThrough() == &MBB)
      FallThrough = P;
    P->ReplaceUsesOfBlockWith(&MBB, Succ);
  }
  MBB.removeSuccessor(Succ);
  if (LIS) {
    for (auto &I : MBB.instrs())
      LIS->RemoveMachineInstrFromMaps(I);
  }
  if (MDT) {
    // If Succ, the single successor of MBB, is dominated by MBB, MDT needs
    // updating by changing Succ's idom to the one of MBB; otherwise, MBB must
    // be a leaf node in MDT and could be erased directly.
    if (MDT->dominates(&MBB, Succ))
      MDT->changeImmediateDominator(MDT->getNode(Succ),
                                    MDT->getNode(&MBB)->getIDom());
    MDT->eraseNode(&MBB);
  }
  MBB.clear();
  MBB.eraseFromParent();
  if (FallThrough && !FallThrough->isLayoutSuccessor(Succ)) {
    if (!Succ->canFallThrough()) {
      MachineFunction *MF = FallThrough->getParent();
      MachineFunction::iterator FallThroughPos(FallThrough);
      MF->splice(std::next(FallThroughPos), Succ);
    } else
      BuildMI(*FallThrough, FallThrough->end(),
              FallThrough->findBranchDebugLoc(), TII->get(AMDGPU::S_BRANCH))
          .addMBB(Succ);
  }

  return true;
}

bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();

  // This doesn't actually need LiveIntervals, but we can preserve them.
  LIS = getAnalysisIfAvailable<LiveIntervals>();
  // This doesn't actually need LiveVariables, but we can preserve them.
  LV = getAnalysisIfAvailable<LiveVariables>();
  MDT = getAnalysisIfAvailable<MachineDominatorTree>();
  MRI = &MF.getRegInfo();
  BoolRC = TRI->getBoolRC();

  if (ST.isWave32()) {
    AndOpc = AMDGPU::S_AND_B32;
    OrOpc = AMDGPU::S_OR_B32;
    XorOpc = AMDGPU::S_XOR_B32;
    MovTermOpc = AMDGPU::S_MOV_B32_term;
    Andn2TermOpc = AMDGPU::S_ANDN2_B32_term;
    XorTermrOpc = AMDGPU::S_XOR_B32_term;
    OrTermrOpc = AMDGPU::S_OR_B32_term;
    OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32;
    Exec = AMDGPU::EXEC_LO;
  } else {
    AndOpc = AMDGPU::S_AND_B64;
    OrOpc = AMDGPU::S_OR_B64;
    XorOpc = AMDGPU::S_XOR_B64;
    MovTermOpc = AMDGPU::S_MOV_B64_term;
    Andn2TermOpc = AMDGPU::S_ANDN2_B64_term;
    XorTermrOpc = AMDGPU::S_XOR_B64_term;
    OrTermrOpc = AMDGPU::S_OR_B64_term;
    OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B64;
    Exec = AMDGPU::EXEC;
  }

  // Compute the set of blocks with kills.
  const bool CanDemote =
      MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
  for (auto &MBB : MF) {
    bool IsKillBlock = false;
    for (auto &Term : MBB.terminators()) {
      if (TII->isKillTerminator(Term.getOpcode())) {
        KillBlocks.insert(&MBB);
        IsKillBlock = true;
        break;
      }
    }
    if (CanDemote && !IsKillBlock) {
      for (auto &MI : MBB) {
        if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) {
          KillBlocks.insert(&MBB);
          break;
        }
      }
    }
  }

  MachineFunction::iterator NextBB;
  for (MachineFunction::iterator BI = MF.begin();
       BI != MF.end(); BI = NextBB) {
    NextBB = std::next(BI);
    MachineBasicBlock *MBB = &*BI;

    MachineBasicBlock::iterator I, E, Next;
    E = MBB->end();
    for (I = MBB->begin(); I != E; I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;
      MachineBasicBlock *SplitMBB = MBB;

      switch (MI.getOpcode()) {
      case AMDGPU::SI_IF:
      case AMDGPU::SI_ELSE:
      case AMDGPU::SI_IF_BREAK:
      case AMDGPU::SI_WATERFALL_LOOP:
      case AMDGPU::SI_LOOP:
      case AMDGPU::SI_END_CF:
        SplitMBB = process(MI);
        break;

      // FIXME: find a better place for this
      case AMDGPU::SI_INIT_EXEC:
      case AMDGPU::SI_INIT_EXEC_FROM_INPUT:
        lowerInitExec(MBB, MI);
        if (LIS)
          LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
        break;

      default:
        break;
      }

      if (SplitMBB != MBB) {
        MBB = Next->getParent();
        E = MBB->end();
      }
    }
  }

  optimizeEndCf();

  LoweredEndCf.clear();
  LoweredIf.clear();
  KillBlocks.clear();

  return true;
}