//===-- AMDGPUGlobalISelDivergenceLowering.cpp ----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// GlobalISel pass that selects divergent i1 phis as lane mask phis.
/// Lane mask merging uses the same algorithm as the SelectionDAG path in
/// SILowerI1Copies. Handles all cases of temporal divergence.
/// For divergent non-phi i1 values and for uniform i1 values used outside of
/// the cycle, this pass currently depends on LCSSA to insert phis with one
/// incoming value.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "SILowerI1Copies.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
#include "llvm/InitializePasses.h"

#define DEBUG_TYPE "amdgpu-global-isel-divergence-lowering"

using namespace llvm;

namespace {

class AMDGPUGlobalISelDivergenceLowering : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUGlobalISelDivergenceLowering() : MachineFunctionPass(ID) {
    initializeAMDGPUGlobalISelDivergenceLoweringPass(
        *PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "AMDGPU GlobalISel divergence lowering";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineDominatorTreeWrapperPass>();
    AU.addRequired<MachinePostDominatorTreeWrapperPass>();
    AU.addRequired<MachineUniformityAnalysisPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

class DivergenceLoweringHelper : public PhiLoweringHelper {
public:
  DivergenceLoweringHelper(MachineFunction *MF, MachineDominatorTree *DT,
                           MachinePostDominatorTree *PDT,
                           MachineUniformityInfo *MUI);

private:
  MachineUniformityInfo *MUI = nullptr;
  MachineIRBuilder B;
  Register buildRegCopyToLaneMask(Register Reg);

public:
  void markAsLaneMask(Register DstReg) const override;
  void getCandidatesForLowering(
      SmallVectorImpl<MachineInstr *> &Vreg1Phis) const override;
  void collectIncomingValuesFromPhi(
      const MachineInstr *MI,
      SmallVectorImpl<Incoming> &Incomings) const override;
  void replaceDstReg(Register NewReg, Register OldReg,
                     MachineBasicBlock *MBB) override;
  void buildMergeLaneMasks(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator I, const DebugLoc &DL,
                           Register DstReg, Register PrevReg,
                           Register CurReg) override;
  void constrainAsLaneMask(Incoming &In) override;
};

DivergenceLoweringHelper::DivergenceLoweringHelper(
    MachineFunction *MF, MachineDominatorTree *DT,
    MachinePostDominatorTree *PDT, MachineUniformityInfo *MUI)
    : PhiLoweringHelper(MF, DT, PDT), MUI(MUI), B(*MF) {}

// Mark a generic _(s1) register as a lane mask: SReg_32(s1) in wave32,
// SReg_64(s1) in wave64.
void DivergenceLoweringHelper::markAsLaneMask(Register DstReg) const {
  assert(MRI->getType(DstReg) == LLT::scalar(1));

  if (MRI->getRegClassOrNull(DstReg)) {
    if (MRI->constrainRegClass(DstReg, ST->getBoolRC()))
      return;
    llvm_unreachable("Failed to constrain register class");
  }

  MRI->setRegClass(DstReg, ST->getBoolRC());
}
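
// Illustrative example (a sketch, not taken from a specific test): a divergent
// i1 phi in generic MIR such as
//   %phi:_(s1) = G_PHI %val0(s1), %bb.0, %val1(s1), %bb.1
// is collected by getCandidatesForLowering() below and lowered as a lane mask
// phi: its destination is constrained to the wave-wide boolean register class
// (see markAsLaneMask above) and its incoming values are merged with EXEC in
// the predecessor blocks (see buildMergeLaneMasks below).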
void DivergenceLoweringHelper::getCandidatesForLowering(
    SmallVectorImpl<MachineInstr *> &Vreg1Phis) const {
  LLT S1 = LLT::scalar(1);

  // Add divergent i1 phis to the list.
  for (MachineBasicBlock &MBB : *MF) {
    for (MachineInstr &MI : MBB.phis()) {
      Register Dst = MI.getOperand(0).getReg();
      if (MRI->getType(Dst) == S1 && MUI->isDivergent(Dst))
        Vreg1Phis.push_back(&MI);
    }
  }
}

// Phi operands come in (register, predecessor block) pairs starting at
// operand index 1.
void DivergenceLoweringHelper::collectIncomingValuesFromPhi(
    const MachineInstr *MI, SmallVectorImpl<Incoming> &Incomings) const {
  for (unsigned i = 1; i < MI->getNumOperands(); i += 2) {
    Incomings.emplace_back(MI->getOperand(i).getReg(),
                           MI->getOperand(i + 1).getMBB(), Register());
  }
}

// Redefine OldReg as a copy of NewReg, inserted after the phis of MBB.
void DivergenceLoweringHelper::replaceDstReg(Register NewReg, Register OldReg,
                                             MachineBasicBlock *MBB) {
  BuildMI(*MBB, MBB->getFirstNonPHI(), {}, TII->get(AMDGPU::COPY), OldReg)
      .addReg(NewReg);
}

// Copy Reg to a new lane mask register: insert the copy right after the
// instruction that defines Reg, skipping phis and labels if needed.
Register DivergenceLoweringHelper::buildRegCopyToLaneMask(Register Reg) {
  Register LaneMask = createLaneMaskReg(MRI, LaneMaskRegAttrs);
  MachineInstr *Instr = MRI->getVRegDef(Reg);
  MachineBasicBlock *MBB = Instr->getParent();
  B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(Instr->getIterator())));
  B.buildCopy(LaneMask, Reg);
  return LaneMask;
}

// bb.previous
//   %PrevReg = ...
//
// bb.current
//   %CurReg = ...
//
//   %DstReg - not defined
//
// -> (wave32 example, new registers have the sreg_32 reg class and S1 LLT)
//
// bb.previous
//   %PrevReg = ...
//   %PrevRegCopy:sreg_32(s1) = COPY %PrevReg
//
// bb.current
//   %CurReg = ...
//   %CurRegCopy:sreg_32(s1) = COPY %CurReg
//   ...
//   %PrevMaskedReg:sreg_32(s1) = ANDN2 %PrevRegCopy, %ExecReg - active lanes set to 0
//   %CurMaskedReg:sreg_32(s1) = AND %ExecReg, %CurRegCopy - inactive lanes set to 0
//   %DstReg:sreg_32(s1) = OR %PrevMaskedReg, %CurMaskedReg
//
// DstReg = PrevReg with, for each active lane, its bit rewritten with the bit
// from CurReg.
void DivergenceLoweringHelper::buildMergeLaneMasks(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL,
    Register DstReg, Register PrevReg, Register CurReg) {
  // DstReg = (PrevReg & !EXEC) | (CurReg & EXEC)
  // TODO: check if inputs are constants or results of a compare.

  Register PrevRegCopy = buildRegCopyToLaneMask(PrevReg);
  Register CurRegCopy = buildRegCopyToLaneMask(CurReg);
  Register PrevMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
  Register CurMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);

  B.setInsertPt(MBB, I);
  B.buildInstr(AndN2Op, {PrevMaskedReg}, {PrevRegCopy, ExecReg});
  B.buildInstr(AndOp, {CurMaskedReg}, {ExecReg, CurRegCopy});
  B.buildInstr(OrOp, {DstReg}, {PrevMaskedReg, CurMaskedReg});
}

// GlobalISel has to constrain an S1 incoming value that is taken as-is to the
// lane mask register class. Insert a copy of Incoming.Reg to a new lane mask
// register inside Incoming.Block; Incoming.Reg becomes that new lane mask.
void DivergenceLoweringHelper::constrainAsLaneMask(Incoming &In) {
  B.setInsertPt(*In.Block, In.Block->getFirstTerminator());

  auto Copy = B.buildCopy(LLT::scalar(1), In.Reg);
  MRI->setRegClass(Copy.getReg(0), ST->getBoolRC());
  In.Reg = Copy.getReg(0);
}

} // End anonymous namespace.
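
// Pass registration. The pass argument is DEBUG_TYPE
// ("amdgpu-global-isel-divergence-lowering"); these macros also define
// initializeAMDGPUGlobalISelDivergenceLoweringPass(), which the pass
// constructor above calls, and record the required analyses.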
INITIALIZE_PASS_BEGIN(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
                      "AMDGPU GlobalISel divergence lowering", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineUniformityAnalysisPass)
INITIALIZE_PASS_END(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
                    "AMDGPU GlobalISel divergence lowering", false, false)

char AMDGPUGlobalISelDivergenceLowering::ID = 0;

char &llvm::AMDGPUGlobalISelDivergenceLoweringID =
    AMDGPUGlobalISelDivergenceLowering::ID;

FunctionPass *llvm::createAMDGPUGlobalISelDivergenceLoweringPass() {
  return new AMDGPUGlobalISelDivergenceLowering();
}

bool AMDGPUGlobalISelDivergenceLowering::runOnMachineFunction(
    MachineFunction &MF) {
  MachineDominatorTree &DT =
      getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
  MachinePostDominatorTree &PDT =
      getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
  MachineUniformityInfo &MUI =
      getAnalysis<MachineUniformityAnalysisPass>().getUniformityInfo();

  DivergenceLoweringHelper Helper(&MF, &DT, &PDT, &MUI);

  return Helper.lowerPhis();
}
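
// Note: an illustrative way to exercise this lowering in isolation on MIR
// input (file name and extra flags are placeholders, not taken from this
// file):
//
//   llc -mtriple=amdgcn -run-pass=amdgpu-global-isel-divergence-lowering \
//       -verify-machineinstrs -o - input.mir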