//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass lowers the pseudo control flow instructions to real
/// machine instructions.
///
/// All control flow is handled using predicated instructions and
/// a predicate stack.  Each Scalar ALU controls the operations of 64 Vector
/// ALUs.  The Scalar ALU can update the predicate for any of the Vector ALUs
/// by writing to the 64-bit EXEC register (each bit corresponds to a
/// single vector ALU).  Typically, for predicates, a vector ALU will write
/// to its bit of the VCC register (like EXEC, VCC is 64 bits wide, one bit
/// for each Vector ALU) and then the Scalar ALU will AND the VCC register
/// with the EXEC to update the predicates.
///
/// For example:
/// %vcc = V_CMP_GT_F32 %vgpr1, %vgpr2
/// %sgpr0 = SI_IF %vcc
///   %vgpr0 = V_ADD_F32 %vgpr0, %vgpr0
/// %sgpr0 = SI_ELSE %sgpr0
///   %vgpr0 = V_SUB_F32 %vgpr0, %vgpr0
/// SI_END_CF %sgpr0
///
/// becomes:
///
/// %sgpr0 = S_AND_SAVEEXEC_B64 %vcc  // Save and update the exec mask
/// %sgpr0 = S_XOR_B64 %sgpr0, %exec  // Clear live bits from saved exec mask
/// S_CBRANCH_EXECZ label0            // This instruction is an optional
///                                   // optimization which allows us to
///                                   // branch if all the bits of
///                                   // EXEC are zero.
/// %vgpr0 = V_ADD_F32 %vgpr0, %vgpr0 // Do the IF block of the branch
///
/// label0:
/// %sgpr0 = S_OR_SAVEEXEC_B64 %exec   // Restore the exec mask for the Then block
/// %exec = S_XOR_B64 %sgpr0, %exec    // Clear live bits from saved exec mask
/// S_CBRANCH_EXECZ label1             // Use our branch optimization
///                                    // instruction again.
450b57cec5SDimitry Andric /// %vgpr0 = V_SUB_F32 %vgpr0, %vgpr // Do the THEN block 460b57cec5SDimitry Andric /// label1: 470b57cec5SDimitry Andric /// %exec = S_OR_B64 %exec, %sgpr0 // Re-enable saved exec mask bits 480b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 490b57cec5SDimitry Andric 500b57cec5SDimitry Andric #include "AMDGPU.h" 510b57cec5SDimitry Andric #include "AMDGPUSubtarget.h" 520b57cec5SDimitry Andric #include "SIInstrInfo.h" 530b57cec5SDimitry Andric #include "MCTargetDesc/AMDGPUMCTargetDesc.h" 540b57cec5SDimitry Andric #include "llvm/ADT/SmallVector.h" 550b57cec5SDimitry Andric #include "llvm/ADT/StringRef.h" 560b57cec5SDimitry Andric #include "llvm/CodeGen/LiveIntervals.h" 570b57cec5SDimitry Andric #include "llvm/CodeGen/MachineBasicBlock.h" 580b57cec5SDimitry Andric #include "llvm/CodeGen/MachineFunction.h" 590b57cec5SDimitry Andric #include "llvm/CodeGen/MachineFunctionPass.h" 600b57cec5SDimitry Andric #include "llvm/CodeGen/MachineInstr.h" 610b57cec5SDimitry Andric #include "llvm/CodeGen/MachineInstrBuilder.h" 620b57cec5SDimitry Andric #include "llvm/CodeGen/MachineOperand.h" 630b57cec5SDimitry Andric #include "llvm/CodeGen/MachineRegisterInfo.h" 640b57cec5SDimitry Andric #include "llvm/CodeGen/Passes.h" 650b57cec5SDimitry Andric #include "llvm/CodeGen/SlotIndexes.h" 660b57cec5SDimitry Andric #include "llvm/CodeGen/TargetRegisterInfo.h" 670b57cec5SDimitry Andric #include "llvm/MC/MCRegisterInfo.h" 680b57cec5SDimitry Andric #include "llvm/Pass.h" 690b57cec5SDimitry Andric #include <cassert> 700b57cec5SDimitry Andric #include <iterator> 710b57cec5SDimitry Andric 720b57cec5SDimitry Andric using namespace llvm; 730b57cec5SDimitry Andric 740b57cec5SDimitry Andric #define DEBUG_TYPE "si-lower-control-flow" 750b57cec5SDimitry Andric 760b57cec5SDimitry Andric namespace { 770b57cec5SDimitry Andric 780b57cec5SDimitry Andric class SILowerControlFlow : public MachineFunctionPass { 790b57cec5SDimitry 
Andric private: 800b57cec5SDimitry Andric const SIRegisterInfo *TRI = nullptr; 810b57cec5SDimitry Andric const SIInstrInfo *TII = nullptr; 820b57cec5SDimitry Andric LiveIntervals *LIS = nullptr; 830b57cec5SDimitry Andric MachineRegisterInfo *MRI = nullptr; 840b57cec5SDimitry Andric 850b57cec5SDimitry Andric const TargetRegisterClass *BoolRC = nullptr; 860b57cec5SDimitry Andric unsigned AndOpc; 870b57cec5SDimitry Andric unsigned OrOpc; 880b57cec5SDimitry Andric unsigned XorOpc; 890b57cec5SDimitry Andric unsigned MovTermOpc; 900b57cec5SDimitry Andric unsigned Andn2TermOpc; 910b57cec5SDimitry Andric unsigned XorTermrOpc; 920b57cec5SDimitry Andric unsigned OrSaveExecOpc; 930b57cec5SDimitry Andric unsigned Exec; 940b57cec5SDimitry Andric 950b57cec5SDimitry Andric void emitIf(MachineInstr &MI); 960b57cec5SDimitry Andric void emitElse(MachineInstr &MI); 970b57cec5SDimitry Andric void emitIfBreak(MachineInstr &MI); 980b57cec5SDimitry Andric void emitLoop(MachineInstr &MI); 990b57cec5SDimitry Andric void emitEndCf(MachineInstr &MI); 1000b57cec5SDimitry Andric 1018bcb0991SDimitry Andric Register getSaveExec(MachineInstr* MI); 1028bcb0991SDimitry Andric 1030b57cec5SDimitry Andric void findMaskOperands(MachineInstr &MI, unsigned OpNo, 1040b57cec5SDimitry Andric SmallVectorImpl<MachineOperand> &Src) const; 1050b57cec5SDimitry Andric 1060b57cec5SDimitry Andric void combineMasks(MachineInstr &MI); 1070b57cec5SDimitry Andric 1080b57cec5SDimitry Andric public: 1090b57cec5SDimitry Andric static char ID; 1100b57cec5SDimitry Andric 1110b57cec5SDimitry Andric SILowerControlFlow() : MachineFunctionPass(ID) {} 1120b57cec5SDimitry Andric 1130b57cec5SDimitry Andric bool runOnMachineFunction(MachineFunction &MF) override; 1140b57cec5SDimitry Andric 1150b57cec5SDimitry Andric StringRef getPassName() const override { 1160b57cec5SDimitry Andric return "SI Lower control flow pseudo instructions"; 1170b57cec5SDimitry Andric } 1180b57cec5SDimitry Andric 1190b57cec5SDimitry Andric void 
getAnalysisUsage(AnalysisUsage &AU) const override { 1200b57cec5SDimitry Andric // Should preserve the same set that TwoAddressInstructions does. 1210b57cec5SDimitry Andric AU.addPreserved<SlotIndexes>(); 1220b57cec5SDimitry Andric AU.addPreserved<LiveIntervals>(); 1230b57cec5SDimitry Andric AU.addPreservedID(LiveVariablesID); 1240b57cec5SDimitry Andric AU.addPreservedID(MachineLoopInfoID); 1250b57cec5SDimitry Andric AU.addPreservedID(MachineDominatorsID); 1260b57cec5SDimitry Andric AU.setPreservesCFG(); 1270b57cec5SDimitry Andric MachineFunctionPass::getAnalysisUsage(AU); 1280b57cec5SDimitry Andric } 1290b57cec5SDimitry Andric }; 1300b57cec5SDimitry Andric 1310b57cec5SDimitry Andric } // end anonymous namespace 1320b57cec5SDimitry Andric 1330b57cec5SDimitry Andric char SILowerControlFlow::ID = 0; 1340b57cec5SDimitry Andric 1350b57cec5SDimitry Andric INITIALIZE_PASS(SILowerControlFlow, DEBUG_TYPE, 1360b57cec5SDimitry Andric "SI lower control flow", false, false) 1370b57cec5SDimitry Andric 1380b57cec5SDimitry Andric static void setImpSCCDefDead(MachineInstr &MI, bool IsDead) { 1390b57cec5SDimitry Andric MachineOperand &ImpDefSCC = MI.getOperand(3); 1400b57cec5SDimitry Andric assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef()); 1410b57cec5SDimitry Andric 1420b57cec5SDimitry Andric ImpDefSCC.setIsDead(IsDead); 1430b57cec5SDimitry Andric } 1440b57cec5SDimitry Andric 1450b57cec5SDimitry Andric char &llvm::SILowerControlFlowID = SILowerControlFlow::ID; 1460b57cec5SDimitry Andric 1470b57cec5SDimitry Andric static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI, 1480b57cec5SDimitry Andric const SIInstrInfo *TII) { 1498bcb0991SDimitry Andric Register SaveExecReg = MI.getOperand(0).getReg(); 1500b57cec5SDimitry Andric auto U = MRI->use_instr_nodbg_begin(SaveExecReg); 1510b57cec5SDimitry Andric 1520b57cec5SDimitry Andric if (U == MRI->use_instr_nodbg_end() || 1530b57cec5SDimitry Andric std::next(U) != MRI->use_instr_nodbg_end() || 
1540b57cec5SDimitry Andric U->getOpcode() != AMDGPU::SI_END_CF) 1550b57cec5SDimitry Andric return false; 1560b57cec5SDimitry Andric 1570b57cec5SDimitry Andric // Check for SI_KILL_*_TERMINATOR on path from if to endif. 1580b57cec5SDimitry Andric // if there is any such terminator simplififcations are not safe. 1590b57cec5SDimitry Andric auto SMBB = MI.getParent(); 1600b57cec5SDimitry Andric auto EMBB = U->getParent(); 1610b57cec5SDimitry Andric DenseSet<const MachineBasicBlock*> Visited; 1620b57cec5SDimitry Andric SmallVector<MachineBasicBlock*, 4> Worklist(SMBB->succ_begin(), 1630b57cec5SDimitry Andric SMBB->succ_end()); 1640b57cec5SDimitry Andric 1650b57cec5SDimitry Andric while (!Worklist.empty()) { 1660b57cec5SDimitry Andric MachineBasicBlock *MBB = Worklist.pop_back_val(); 1670b57cec5SDimitry Andric 1680b57cec5SDimitry Andric if (MBB == EMBB || !Visited.insert(MBB).second) 1690b57cec5SDimitry Andric continue; 1700b57cec5SDimitry Andric for(auto &Term : MBB->terminators()) 1710b57cec5SDimitry Andric if (TII->isKillTerminator(Term.getOpcode())) 1720b57cec5SDimitry Andric return false; 1730b57cec5SDimitry Andric 1740b57cec5SDimitry Andric Worklist.append(MBB->succ_begin(), MBB->succ_end()); 1750b57cec5SDimitry Andric } 1760b57cec5SDimitry Andric 1770b57cec5SDimitry Andric return true; 1780b57cec5SDimitry Andric } 1790b57cec5SDimitry Andric 1808bcb0991SDimitry Andric Register SILowerControlFlow::getSaveExec(MachineInstr *MI) { 1818bcb0991SDimitry Andric MachineBasicBlock *MBB = MI->getParent(); 1828bcb0991SDimitry Andric MachineOperand &SaveExec = MI->getOperand(0); 1838bcb0991SDimitry Andric assert(SaveExec.getSubReg() == AMDGPU::NoSubRegister); 1848bcb0991SDimitry Andric 1858bcb0991SDimitry Andric Register SaveExecReg = SaveExec.getReg(); 1868bcb0991SDimitry Andric unsigned FalseTermOpc = 1878bcb0991SDimitry Andric TII->isWave32() ? 
AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term; 1888bcb0991SDimitry Andric MachineBasicBlock::iterator I = (MI); 1898bcb0991SDimitry Andric MachineBasicBlock::iterator J = std::next(I); 1908bcb0991SDimitry Andric if (J != MBB->end() && J->getOpcode() == FalseTermOpc && 1918bcb0991SDimitry Andric J->getOperand(1).isReg() && J->getOperand(1).getReg() == SaveExecReg) { 1928bcb0991SDimitry Andric SaveExecReg = J->getOperand(0).getReg(); 1938bcb0991SDimitry Andric J->eraseFromParent(); 1948bcb0991SDimitry Andric } 1958bcb0991SDimitry Andric return SaveExecReg; 1968bcb0991SDimitry Andric } 1978bcb0991SDimitry Andric 1980b57cec5SDimitry Andric void SILowerControlFlow::emitIf(MachineInstr &MI) { 1990b57cec5SDimitry Andric MachineBasicBlock &MBB = *MI.getParent(); 2000b57cec5SDimitry Andric const DebugLoc &DL = MI.getDebugLoc(); 2010b57cec5SDimitry Andric MachineBasicBlock::iterator I(&MI); 2028bcb0991SDimitry Andric Register SaveExecReg = getSaveExec(&MI); 2030b57cec5SDimitry Andric MachineOperand& Cond = MI.getOperand(1); 2048bcb0991SDimitry Andric assert(Cond.getSubReg() == AMDGPU::NoSubRegister); 2050b57cec5SDimitry Andric 2060b57cec5SDimitry Andric MachineOperand &ImpDefSCC = MI.getOperand(4); 2070b57cec5SDimitry Andric assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef()); 2080b57cec5SDimitry Andric 2090b57cec5SDimitry Andric // If there is only one use of save exec register and that use is SI_END_CF, 2100b57cec5SDimitry Andric // we can optimize SI_IF by returning the full saved exec mask instead of 2110b57cec5SDimitry Andric // just cleared bits. 2120b57cec5SDimitry Andric bool SimpleIf = isSimpleIf(MI, MRI, TII); 2130b57cec5SDimitry Andric 2140b57cec5SDimitry Andric // Add an implicit def of exec to discourage scheduling VALU after this which 2150b57cec5SDimitry Andric // will interfere with trying to form s_and_saveexec_b64 later. 2160b57cec5SDimitry Andric Register CopyReg = SimpleIf ? 
SaveExecReg 2170b57cec5SDimitry Andric : MRI->createVirtualRegister(BoolRC); 2180b57cec5SDimitry Andric MachineInstr *CopyExec = 2190b57cec5SDimitry Andric BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), CopyReg) 2200b57cec5SDimitry Andric .addReg(Exec) 2210b57cec5SDimitry Andric .addReg(Exec, RegState::ImplicitDefine); 2220b57cec5SDimitry Andric 2238bcb0991SDimitry Andric Register Tmp = MRI->createVirtualRegister(BoolRC); 2240b57cec5SDimitry Andric 2250b57cec5SDimitry Andric MachineInstr *And = 2260b57cec5SDimitry Andric BuildMI(MBB, I, DL, TII->get(AndOpc), Tmp) 2270b57cec5SDimitry Andric .addReg(CopyReg) 2280b57cec5SDimitry Andric .add(Cond); 2290b57cec5SDimitry Andric 2300b57cec5SDimitry Andric setImpSCCDefDead(*And, true); 2310b57cec5SDimitry Andric 2320b57cec5SDimitry Andric MachineInstr *Xor = nullptr; 2330b57cec5SDimitry Andric if (!SimpleIf) { 2340b57cec5SDimitry Andric Xor = 2350b57cec5SDimitry Andric BuildMI(MBB, I, DL, TII->get(XorOpc), SaveExecReg) 2360b57cec5SDimitry Andric .addReg(Tmp) 2370b57cec5SDimitry Andric .addReg(CopyReg); 2380b57cec5SDimitry Andric setImpSCCDefDead(*Xor, ImpDefSCC.isDead()); 2390b57cec5SDimitry Andric } 2400b57cec5SDimitry Andric 2410b57cec5SDimitry Andric // Use a copy that is a terminator to get correct spill code placement it with 2420b57cec5SDimitry Andric // fast regalloc. 2430b57cec5SDimitry Andric MachineInstr *SetExec = 2440b57cec5SDimitry Andric BuildMI(MBB, I, DL, TII->get(MovTermOpc), Exec) 2450b57cec5SDimitry Andric .addReg(Tmp, RegState::Kill); 2460b57cec5SDimitry Andric 247*480093f4SDimitry Andric // Insert the S_CBRANCH_EXECZ instruction which will be optimized later 248*480093f4SDimitry Andric // during SIRemoveShortExecBranches. 
249*480093f4SDimitry Andric MachineInstr *NewBr = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ)) 2500b57cec5SDimitry Andric .add(MI.getOperand(2)); 2510b57cec5SDimitry Andric 2520b57cec5SDimitry Andric if (!LIS) { 2530b57cec5SDimitry Andric MI.eraseFromParent(); 2540b57cec5SDimitry Andric return; 2550b57cec5SDimitry Andric } 2560b57cec5SDimitry Andric 2570b57cec5SDimitry Andric LIS->InsertMachineInstrInMaps(*CopyExec); 2580b57cec5SDimitry Andric 2590b57cec5SDimitry Andric // Replace with and so we don't need to fix the live interval for condition 2600b57cec5SDimitry Andric // register. 2610b57cec5SDimitry Andric LIS->ReplaceMachineInstrInMaps(MI, *And); 2620b57cec5SDimitry Andric 2630b57cec5SDimitry Andric if (!SimpleIf) 2640b57cec5SDimitry Andric LIS->InsertMachineInstrInMaps(*Xor); 2650b57cec5SDimitry Andric LIS->InsertMachineInstrInMaps(*SetExec); 2660b57cec5SDimitry Andric LIS->InsertMachineInstrInMaps(*NewBr); 2670b57cec5SDimitry Andric 2680b57cec5SDimitry Andric LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC); 2690b57cec5SDimitry Andric MI.eraseFromParent(); 2700b57cec5SDimitry Andric 2710b57cec5SDimitry Andric // FIXME: Is there a better way of adjusting the liveness? It shouldn't be 2720b57cec5SDimitry Andric // hard to add another def here but I'm not sure how to correctly update the 2730b57cec5SDimitry Andric // valno. 
2740b57cec5SDimitry Andric LIS->removeInterval(SaveExecReg); 2750b57cec5SDimitry Andric LIS->createAndComputeVirtRegInterval(SaveExecReg); 2760b57cec5SDimitry Andric LIS->createAndComputeVirtRegInterval(Tmp); 2770b57cec5SDimitry Andric if (!SimpleIf) 2780b57cec5SDimitry Andric LIS->createAndComputeVirtRegInterval(CopyReg); 2790b57cec5SDimitry Andric } 2800b57cec5SDimitry Andric 2810b57cec5SDimitry Andric void SILowerControlFlow::emitElse(MachineInstr &MI) { 2820b57cec5SDimitry Andric MachineBasicBlock &MBB = *MI.getParent(); 2830b57cec5SDimitry Andric const DebugLoc &DL = MI.getDebugLoc(); 2840b57cec5SDimitry Andric 2858bcb0991SDimitry Andric Register DstReg = getSaveExec(&MI); 2860b57cec5SDimitry Andric 2870b57cec5SDimitry Andric bool ExecModified = MI.getOperand(3).getImm() != 0; 2880b57cec5SDimitry Andric MachineBasicBlock::iterator Start = MBB.begin(); 2890b57cec5SDimitry Andric 2900b57cec5SDimitry Andric // We are running before TwoAddressInstructions, and si_else's operands are 2910b57cec5SDimitry Andric // tied. In order to correctly tie the registers, split this into a copy of 2920b57cec5SDimitry Andric // the src like it does. 2930b57cec5SDimitry Andric Register CopyReg = MRI->createVirtualRegister(BoolRC); 2940b57cec5SDimitry Andric MachineInstr *CopyExec = 2950b57cec5SDimitry Andric BuildMI(MBB, Start, DL, TII->get(AMDGPU::COPY), CopyReg) 2960b57cec5SDimitry Andric .add(MI.getOperand(1)); // Saved EXEC 2970b57cec5SDimitry Andric 2980b57cec5SDimitry Andric // This must be inserted before phis and any spill code inserted before the 2990b57cec5SDimitry Andric // else. 3000b57cec5SDimitry Andric Register SaveReg = ExecModified ? 
3010b57cec5SDimitry Andric MRI->createVirtualRegister(BoolRC) : DstReg; 3020b57cec5SDimitry Andric MachineInstr *OrSaveExec = 3030b57cec5SDimitry Andric BuildMI(MBB, Start, DL, TII->get(OrSaveExecOpc), SaveReg) 3040b57cec5SDimitry Andric .addReg(CopyReg); 3050b57cec5SDimitry Andric 3060b57cec5SDimitry Andric MachineBasicBlock *DestBB = MI.getOperand(2).getMBB(); 3070b57cec5SDimitry Andric 3080b57cec5SDimitry Andric MachineBasicBlock::iterator ElsePt(MI); 3090b57cec5SDimitry Andric 3100b57cec5SDimitry Andric if (ExecModified) { 3110b57cec5SDimitry Andric MachineInstr *And = 3120b57cec5SDimitry Andric BuildMI(MBB, ElsePt, DL, TII->get(AndOpc), DstReg) 3130b57cec5SDimitry Andric .addReg(Exec) 3140b57cec5SDimitry Andric .addReg(SaveReg); 3150b57cec5SDimitry Andric 3160b57cec5SDimitry Andric if (LIS) 3170b57cec5SDimitry Andric LIS->InsertMachineInstrInMaps(*And); 3180b57cec5SDimitry Andric } 3190b57cec5SDimitry Andric 3200b57cec5SDimitry Andric MachineInstr *Xor = 3210b57cec5SDimitry Andric BuildMI(MBB, ElsePt, DL, TII->get(XorTermrOpc), Exec) 3220b57cec5SDimitry Andric .addReg(Exec) 3230b57cec5SDimitry Andric .addReg(DstReg); 3240b57cec5SDimitry Andric 3250b57cec5SDimitry Andric MachineInstr *Branch = 326*480093f4SDimitry Andric BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ)) 3270b57cec5SDimitry Andric .addMBB(DestBB); 3280b57cec5SDimitry Andric 3290b57cec5SDimitry Andric if (!LIS) { 3300b57cec5SDimitry Andric MI.eraseFromParent(); 3310b57cec5SDimitry Andric return; 3320b57cec5SDimitry Andric } 3330b57cec5SDimitry Andric 3340b57cec5SDimitry Andric LIS->RemoveMachineInstrFromMaps(MI); 3350b57cec5SDimitry Andric MI.eraseFromParent(); 3360b57cec5SDimitry Andric 3370b57cec5SDimitry Andric LIS->InsertMachineInstrInMaps(*CopyExec); 3380b57cec5SDimitry Andric LIS->InsertMachineInstrInMaps(*OrSaveExec); 3390b57cec5SDimitry Andric 3400b57cec5SDimitry Andric LIS->InsertMachineInstrInMaps(*Xor); 3410b57cec5SDimitry Andric LIS->InsertMachineInstrInMaps(*Branch); 
3420b57cec5SDimitry Andric 3430b57cec5SDimitry Andric // src reg is tied to dst reg. 3440b57cec5SDimitry Andric LIS->removeInterval(DstReg); 3450b57cec5SDimitry Andric LIS->createAndComputeVirtRegInterval(DstReg); 3460b57cec5SDimitry Andric LIS->createAndComputeVirtRegInterval(CopyReg); 3470b57cec5SDimitry Andric if (ExecModified) 3480b57cec5SDimitry Andric LIS->createAndComputeVirtRegInterval(SaveReg); 3490b57cec5SDimitry Andric 3500b57cec5SDimitry Andric // Let this be recomputed. 3510b57cec5SDimitry Andric LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC); 3520b57cec5SDimitry Andric } 3530b57cec5SDimitry Andric 3540b57cec5SDimitry Andric void SILowerControlFlow::emitIfBreak(MachineInstr &MI) { 3550b57cec5SDimitry Andric MachineBasicBlock &MBB = *MI.getParent(); 3560b57cec5SDimitry Andric const DebugLoc &DL = MI.getDebugLoc(); 3578bcb0991SDimitry Andric auto Dst = getSaveExec(&MI); 3580b57cec5SDimitry Andric 3590b57cec5SDimitry Andric // Skip ANDing with exec if the break condition is already masked by exec 3600b57cec5SDimitry Andric // because it is a V_CMP in the same basic block. (We know the break 3610b57cec5SDimitry Andric // condition operand was an i1 in IR, so if it is a VALU instruction it must 3620b57cec5SDimitry Andric // be one with a carry-out.) 3630b57cec5SDimitry Andric bool SkipAnding = false; 3640b57cec5SDimitry Andric if (MI.getOperand(1).isReg()) { 3650b57cec5SDimitry Andric if (MachineInstr *Def = MRI->getUniqueVRegDef(MI.getOperand(1).getReg())) { 3660b57cec5SDimitry Andric SkipAnding = Def->getParent() == MI.getParent() 3670b57cec5SDimitry Andric && SIInstrInfo::isVALU(*Def); 3680b57cec5SDimitry Andric } 3690b57cec5SDimitry Andric } 3700b57cec5SDimitry Andric 3710b57cec5SDimitry Andric // AND the break condition operand with exec, then OR that into the "loop 3720b57cec5SDimitry Andric // exit" mask. 
3730b57cec5SDimitry Andric MachineInstr *And = nullptr, *Or = nullptr; 3740b57cec5SDimitry Andric if (!SkipAnding) { 375*480093f4SDimitry Andric Register AndReg = MRI->createVirtualRegister(BoolRC); 376*480093f4SDimitry Andric And = BuildMI(MBB, &MI, DL, TII->get(AndOpc), AndReg) 3770b57cec5SDimitry Andric .addReg(Exec) 3780b57cec5SDimitry Andric .add(MI.getOperand(1)); 3790b57cec5SDimitry Andric Or = BuildMI(MBB, &MI, DL, TII->get(OrOpc), Dst) 380*480093f4SDimitry Andric .addReg(AndReg) 3810b57cec5SDimitry Andric .add(MI.getOperand(2)); 382*480093f4SDimitry Andric if (LIS) 383*480093f4SDimitry Andric LIS->createAndComputeVirtRegInterval(AndReg); 3840b57cec5SDimitry Andric } else 3850b57cec5SDimitry Andric Or = BuildMI(MBB, &MI, DL, TII->get(OrOpc), Dst) 3860b57cec5SDimitry Andric .add(MI.getOperand(1)) 3870b57cec5SDimitry Andric .add(MI.getOperand(2)); 3880b57cec5SDimitry Andric 3890b57cec5SDimitry Andric if (LIS) { 3900b57cec5SDimitry Andric if (And) 3910b57cec5SDimitry Andric LIS->InsertMachineInstrInMaps(*And); 3920b57cec5SDimitry Andric LIS->ReplaceMachineInstrInMaps(MI, *Or); 3930b57cec5SDimitry Andric } 3940b57cec5SDimitry Andric 3950b57cec5SDimitry Andric MI.eraseFromParent(); 3960b57cec5SDimitry Andric } 3970b57cec5SDimitry Andric 3980b57cec5SDimitry Andric void SILowerControlFlow::emitLoop(MachineInstr &MI) { 3990b57cec5SDimitry Andric MachineBasicBlock &MBB = *MI.getParent(); 4000b57cec5SDimitry Andric const DebugLoc &DL = MI.getDebugLoc(); 4010b57cec5SDimitry Andric 4020b57cec5SDimitry Andric MachineInstr *AndN2 = 4030b57cec5SDimitry Andric BuildMI(MBB, &MI, DL, TII->get(Andn2TermOpc), Exec) 4040b57cec5SDimitry Andric .addReg(Exec) 4050b57cec5SDimitry Andric .add(MI.getOperand(0)); 4060b57cec5SDimitry Andric 4070b57cec5SDimitry Andric MachineInstr *Branch = 4080b57cec5SDimitry Andric BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) 4090b57cec5SDimitry Andric .add(MI.getOperand(1)); 4100b57cec5SDimitry Andric 4110b57cec5SDimitry Andric if (LIS) 
{ 4120b57cec5SDimitry Andric LIS->ReplaceMachineInstrInMaps(MI, *AndN2); 4130b57cec5SDimitry Andric LIS->InsertMachineInstrInMaps(*Branch); 4140b57cec5SDimitry Andric } 4150b57cec5SDimitry Andric 4160b57cec5SDimitry Andric MI.eraseFromParent(); 4170b57cec5SDimitry Andric } 4180b57cec5SDimitry Andric 4190b57cec5SDimitry Andric void SILowerControlFlow::emitEndCf(MachineInstr &MI) { 4200b57cec5SDimitry Andric MachineBasicBlock &MBB = *MI.getParent(); 4218bcb0991SDimitry Andric MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); 4228bcb0991SDimitry Andric unsigned CFMask = MI.getOperand(0).getReg(); 4238bcb0991SDimitry Andric MachineInstr *Def = MRI.getUniqueVRegDef(CFMask); 4240b57cec5SDimitry Andric const DebugLoc &DL = MI.getDebugLoc(); 4250b57cec5SDimitry Andric 4268bcb0991SDimitry Andric MachineBasicBlock::iterator InsPt = 4278bcb0991SDimitry Andric Def && Def->getParent() == &MBB ? std::next(MachineBasicBlock::iterator(Def)) 4288bcb0991SDimitry Andric : MBB.begin(); 4298bcb0991SDimitry Andric MachineInstr *NewMI = BuildMI(MBB, InsPt, DL, TII->get(OrOpc), Exec) 4300b57cec5SDimitry Andric .addReg(Exec) 4310b57cec5SDimitry Andric .add(MI.getOperand(0)); 4320b57cec5SDimitry Andric 4330b57cec5SDimitry Andric if (LIS) 4340b57cec5SDimitry Andric LIS->ReplaceMachineInstrInMaps(MI, *NewMI); 4350b57cec5SDimitry Andric 4360b57cec5SDimitry Andric MI.eraseFromParent(); 4370b57cec5SDimitry Andric 4380b57cec5SDimitry Andric if (LIS) 4390b57cec5SDimitry Andric LIS->handleMove(*NewMI); 4400b57cec5SDimitry Andric } 4410b57cec5SDimitry Andric 4420b57cec5SDimitry Andric // Returns replace operands for a logical operation, either single result 4430b57cec5SDimitry Andric // for exec or two operands if source was another equivalent operation. 
4440b57cec5SDimitry Andric void SILowerControlFlow::findMaskOperands(MachineInstr &MI, unsigned OpNo, 4450b57cec5SDimitry Andric SmallVectorImpl<MachineOperand> &Src) const { 4460b57cec5SDimitry Andric MachineOperand &Op = MI.getOperand(OpNo); 4478bcb0991SDimitry Andric if (!Op.isReg() || !Register::isVirtualRegister(Op.getReg())) { 4480b57cec5SDimitry Andric Src.push_back(Op); 4490b57cec5SDimitry Andric return; 4500b57cec5SDimitry Andric } 4510b57cec5SDimitry Andric 4520b57cec5SDimitry Andric MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg()); 4530b57cec5SDimitry Andric if (!Def || Def->getParent() != MI.getParent() || 4540b57cec5SDimitry Andric !(Def->isFullCopy() || (Def->getOpcode() == MI.getOpcode()))) 4550b57cec5SDimitry Andric return; 4560b57cec5SDimitry Andric 4570b57cec5SDimitry Andric // Make sure we do not modify exec between def and use. 4580b57cec5SDimitry Andric // A copy with implcitly defined exec inserted earlier is an exclusion, it 4590b57cec5SDimitry Andric // does not really modify exec. 4600b57cec5SDimitry Andric for (auto I = Def->getIterator(); I != MI.getIterator(); ++I) 4610b57cec5SDimitry Andric if (I->modifiesRegister(AMDGPU::EXEC, TRI) && 4620b57cec5SDimitry Andric !(I->isCopy() && I->getOperand(0).getReg() != Exec)) 4630b57cec5SDimitry Andric return; 4640b57cec5SDimitry Andric 4650b57cec5SDimitry Andric for (const auto &SrcOp : Def->explicit_operands()) 4660b57cec5SDimitry Andric if (SrcOp.isReg() && SrcOp.isUse() && 4678bcb0991SDimitry Andric (Register::isVirtualRegister(SrcOp.getReg()) || SrcOp.getReg() == Exec)) 4680b57cec5SDimitry Andric Src.push_back(SrcOp); 4690b57cec5SDimitry Andric } 4700b57cec5SDimitry Andric 4710b57cec5SDimitry Andric // Search and combine pairs of equivalent instructions, like 4720b57cec5SDimitry Andric // S_AND_B64 x, (S_AND_B64 x, y) => S_AND_B64 x, y 4730b57cec5SDimitry Andric // S_OR_B64 x, (S_OR_B64 x, y) => S_OR_B64 x, y 4740b57cec5SDimitry Andric // One of the operands is exec mask. 
4750b57cec5SDimitry Andric void SILowerControlFlow::combineMasks(MachineInstr &MI) { 4760b57cec5SDimitry Andric assert(MI.getNumExplicitOperands() == 3); 4770b57cec5SDimitry Andric SmallVector<MachineOperand, 4> Ops; 4780b57cec5SDimitry Andric unsigned OpToReplace = 1; 4790b57cec5SDimitry Andric findMaskOperands(MI, 1, Ops); 4800b57cec5SDimitry Andric if (Ops.size() == 1) OpToReplace = 2; // First operand can be exec or its copy 4810b57cec5SDimitry Andric findMaskOperands(MI, 2, Ops); 4820b57cec5SDimitry Andric if (Ops.size() != 3) return; 4830b57cec5SDimitry Andric 4840b57cec5SDimitry Andric unsigned UniqueOpndIdx; 4850b57cec5SDimitry Andric if (Ops[0].isIdenticalTo(Ops[1])) UniqueOpndIdx = 2; 4860b57cec5SDimitry Andric else if (Ops[0].isIdenticalTo(Ops[2])) UniqueOpndIdx = 1; 4870b57cec5SDimitry Andric else if (Ops[1].isIdenticalTo(Ops[2])) UniqueOpndIdx = 1; 4880b57cec5SDimitry Andric else return; 4890b57cec5SDimitry Andric 4908bcb0991SDimitry Andric Register Reg = MI.getOperand(OpToReplace).getReg(); 4910b57cec5SDimitry Andric MI.RemoveOperand(OpToReplace); 4920b57cec5SDimitry Andric MI.addOperand(Ops[UniqueOpndIdx]); 4930b57cec5SDimitry Andric if (MRI->use_empty(Reg)) 4940b57cec5SDimitry Andric MRI->getUniqueVRegDef(Reg)->eraseFromParent(); 4950b57cec5SDimitry Andric } 4960b57cec5SDimitry Andric 4970b57cec5SDimitry Andric bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { 4980b57cec5SDimitry Andric const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); 4990b57cec5SDimitry Andric TII = ST.getInstrInfo(); 5000b57cec5SDimitry Andric TRI = &TII->getRegisterInfo(); 5010b57cec5SDimitry Andric 5020b57cec5SDimitry Andric // This doesn't actually need LiveIntervals, but we can preserve them. 
5030b57cec5SDimitry Andric LIS = getAnalysisIfAvailable<LiveIntervals>(); 5040b57cec5SDimitry Andric MRI = &MF.getRegInfo(); 5050b57cec5SDimitry Andric BoolRC = TRI->getBoolRC(); 5060b57cec5SDimitry Andric 5070b57cec5SDimitry Andric if (ST.isWave32()) { 5080b57cec5SDimitry Andric AndOpc = AMDGPU::S_AND_B32; 5090b57cec5SDimitry Andric OrOpc = AMDGPU::S_OR_B32; 5100b57cec5SDimitry Andric XorOpc = AMDGPU::S_XOR_B32; 5110b57cec5SDimitry Andric MovTermOpc = AMDGPU::S_MOV_B32_term; 5120b57cec5SDimitry Andric Andn2TermOpc = AMDGPU::S_ANDN2_B32_term; 5130b57cec5SDimitry Andric XorTermrOpc = AMDGPU::S_XOR_B32_term; 5140b57cec5SDimitry Andric OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32; 5150b57cec5SDimitry Andric Exec = AMDGPU::EXEC_LO; 5160b57cec5SDimitry Andric } else { 5170b57cec5SDimitry Andric AndOpc = AMDGPU::S_AND_B64; 5180b57cec5SDimitry Andric OrOpc = AMDGPU::S_OR_B64; 5190b57cec5SDimitry Andric XorOpc = AMDGPU::S_XOR_B64; 5200b57cec5SDimitry Andric MovTermOpc = AMDGPU::S_MOV_B64_term; 5210b57cec5SDimitry Andric Andn2TermOpc = AMDGPU::S_ANDN2_B64_term; 5220b57cec5SDimitry Andric XorTermrOpc = AMDGPU::S_XOR_B64_term; 5230b57cec5SDimitry Andric OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B64; 5240b57cec5SDimitry Andric Exec = AMDGPU::EXEC; 5250b57cec5SDimitry Andric } 5260b57cec5SDimitry Andric 5270b57cec5SDimitry Andric MachineFunction::iterator NextBB; 5280b57cec5SDimitry Andric for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); 5290b57cec5SDimitry Andric BI != BE; BI = NextBB) { 5300b57cec5SDimitry Andric NextBB = std::next(BI); 5310b57cec5SDimitry Andric MachineBasicBlock &MBB = *BI; 5320b57cec5SDimitry Andric 5330b57cec5SDimitry Andric MachineBasicBlock::iterator I, Next, Last; 5340b57cec5SDimitry Andric 5350b57cec5SDimitry Andric for (I = MBB.begin(), Last = MBB.end(); I != MBB.end(); I = Next) { 5360b57cec5SDimitry Andric Next = std::next(I); 5370b57cec5SDimitry Andric MachineInstr &MI = *I; 5380b57cec5SDimitry Andric 5390b57cec5SDimitry Andric switch 
(MI.getOpcode()) { 5400b57cec5SDimitry Andric case AMDGPU::SI_IF: 5410b57cec5SDimitry Andric emitIf(MI); 5420b57cec5SDimitry Andric break; 5430b57cec5SDimitry Andric 5440b57cec5SDimitry Andric case AMDGPU::SI_ELSE: 5450b57cec5SDimitry Andric emitElse(MI); 5460b57cec5SDimitry Andric break; 5470b57cec5SDimitry Andric 5480b57cec5SDimitry Andric case AMDGPU::SI_IF_BREAK: 5490b57cec5SDimitry Andric emitIfBreak(MI); 5500b57cec5SDimitry Andric break; 5510b57cec5SDimitry Andric 5520b57cec5SDimitry Andric case AMDGPU::SI_LOOP: 5530b57cec5SDimitry Andric emitLoop(MI); 5540b57cec5SDimitry Andric break; 5550b57cec5SDimitry Andric 5560b57cec5SDimitry Andric case AMDGPU::SI_END_CF: 5570b57cec5SDimitry Andric emitEndCf(MI); 5580b57cec5SDimitry Andric break; 5590b57cec5SDimitry Andric 5600b57cec5SDimitry Andric case AMDGPU::S_AND_B64: 5610b57cec5SDimitry Andric case AMDGPU::S_OR_B64: 5620b57cec5SDimitry Andric case AMDGPU::S_AND_B32: 5630b57cec5SDimitry Andric case AMDGPU::S_OR_B32: 5640b57cec5SDimitry Andric // Cleanup bit manipulations on exec mask 5650b57cec5SDimitry Andric combineMasks(MI); 5660b57cec5SDimitry Andric Last = I; 5670b57cec5SDimitry Andric continue; 5680b57cec5SDimitry Andric 5690b57cec5SDimitry Andric default: 5700b57cec5SDimitry Andric Last = I; 5710b57cec5SDimitry Andric continue; 5720b57cec5SDimitry Andric } 5730b57cec5SDimitry Andric 5740b57cec5SDimitry Andric // Replay newly inserted code to combine masks 5750b57cec5SDimitry Andric Next = (Last == MBB.end()) ? MBB.begin() : Last; 5760b57cec5SDimitry Andric } 5770b57cec5SDimitry Andric } 5780b57cec5SDimitry Andric 5790b57cec5SDimitry Andric return true; 5800b57cec5SDimitry Andric } 581