//===-- AMDGPUGlobalISelDivergenceLowering.cpp ----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// GlobalISel pass that selects divergent i1 phis as lane mask phis.
/// Lane mask merging uses the same algorithm as SDAG in SILowerI1Copies.
/// Handles all cases of temporal divergence.
/// For divergent non-phi i1 and uniform i1 uses outside of the cycle this pass
/// currently depends on LCSSA to insert phis with a single incoming value.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "SILowerI1Copies.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineSSAUpdater.h"
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
#include "llvm/InitializePasses.h"

#define DEBUG_TYPE "amdgpu-global-isel-divergence-lowering"

using namespace llvm;

namespace {

class AMDGPUGlobalISelDivergenceLowering : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUGlobalISelDivergenceLowering() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "AMDGPU GlobalISel divergence lowering";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineDominatorTreeWrapperPass>();
    AU.addRequired<MachinePostDominatorTreeWrapperPass>();
    AU.addRequired<MachineUniformityAnalysisPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

class DivergenceLoweringHelper : public PhiLoweringHelper {
public:
  DivergenceLoweringHelper(MachineFunction *MF, MachineDominatorTree *DT,
                           MachinePostDominatorTree *PDT,
                           MachineUniformityInfo *MUI);

private:
  MachineUniformityInfo *MUI = nullptr;
  MachineIRBuilder B;
  Register buildRegCopyToLaneMask(Register Reg);

public:
  void markAsLaneMask(Register DstReg) const override;
  void getCandidatesForLowering(
      SmallVectorImpl<MachineInstr *> &Vreg1Phis) const override;
  void collectIncomingValuesFromPhi(
      const MachineInstr *MI,
      SmallVectorImpl<Incoming> &Incomings) const override;
  void replaceDstReg(Register NewReg, Register OldReg,
                     MachineBasicBlock *MBB) override;
  void buildMergeLaneMasks(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator I, const DebugLoc &DL,
                           Register DstReg, Register PrevReg,
                           Register CurReg) override;
  void constrainAsLaneMask(Incoming &In) override;

  bool lowerTemporalDivergence();
  bool lowerTemporalDivergenceI1();
};

DivergenceLoweringHelper::DivergenceLoweringHelper(
    MachineFunction *MF, MachineDominatorTree *DT,
    MachinePostDominatorTree *PDT, MachineUniformityInfo *MUI)
    : PhiLoweringHelper(MF, DT, PDT), MUI(MUI), B(*MF) {}

// Give the S1 virtual register the subtarget's lane mask register class:
// _(s1) -> SReg_32/64(s1).
void DivergenceLoweringHelper::markAsLaneMask(Register DstReg) const {
  assert(MRI->getType(DstReg) == LLT::scalar(1));

  if (MRI->getRegClassOrNull(DstReg)) {
    if (MRI->constrainRegClass(DstReg, ST->getBoolRC()))
      return;
    llvm_unreachable("Failed to constrain register class");
  }

  MRI->setRegClass(DstReg, ST->getBoolRC());
}
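// Candidates are divergent S1 phis. Illustrative MIR (hypothetical register
// numbers), assuming %5 is divergent:
//
// %5:_(s1) = G_PHI %3(s1), %bb.0, %4(s1), %bb.2
//
// Uniform S1 phis and phis of any other type are not candidates.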
void DivergenceLoweringHelper::getCandidatesForLowering(
    SmallVectorImpl<MachineInstr *> &Vreg1Phis) const {
  LLT S1 = LLT::scalar(1);

  // Add divergent i1 phis to the list.
  for (MachineBasicBlock &MBB : *MF) {
    for (MachineInstr &MI : MBB.phis()) {
      Register Dst = MI.getOperand(0).getReg();
      if (MRI->getType(Dst) == S1 && MUI->isDivergent(Dst))
        Vreg1Phis.push_back(&MI);
    }
  }
}

void DivergenceLoweringHelper::collectIncomingValuesFromPhi(
    const MachineInstr *MI, SmallVectorImpl<Incoming> &Incomings) const {
  for (unsigned i = 1; i < MI->getNumOperands(); i += 2) {
    Incomings.emplace_back(MI->getOperand(i).getReg(),
                           MI->getOperand(i + 1).getMBB(), Register());
  }
}

void DivergenceLoweringHelper::replaceDstReg(Register NewReg, Register OldReg,
                                             MachineBasicBlock *MBB) {
  BuildMI(*MBB, MBB->getFirstNonPHI(), {}, TII->get(AMDGPU::COPY), OldReg)
      .addReg(NewReg);
}

// Copy Reg to a new lane mask register; insert the copy after the instruction
// that defines Reg, skipping phis and labels if needed.
Register DivergenceLoweringHelper::buildRegCopyToLaneMask(Register Reg) {
  Register LaneMask = createLaneMaskReg(MRI, LaneMaskRegAttrs);
  MachineInstr *Instr = MRI->getVRegDef(Reg);
  MachineBasicBlock *MBB = Instr->getParent();
  B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(Instr->getIterator())));
  B.buildCopy(LaneMask, Reg);
  return LaneMask;
}

// bb.previous
// %PrevReg = ...
//
// bb.current
// %CurReg = ...
//
// %DstReg - not defined
//
// -> (wave32 example, new registers have sreg_32 reg class and S1 LLT)
//
// bb.previous
// %PrevReg = ...
// %PrevRegCopy:sreg_32(s1) = COPY %PrevReg
//
// bb.current
// %CurReg = ...
// %CurRegCopy:sreg_32(s1) = COPY %CurReg
// ...
// %PrevMaskedReg:sreg_32(s1) = ANDN2 %PrevRegCopy, %ExecReg - active lanes to 0
// %CurMaskedReg:sreg_32(s1) = AND %ExecReg, %CurRegCopy - inactive lanes to 0
// %DstReg:sreg_32(s1) = OR %PrevMaskedReg, %CurMaskedReg
//
// DstReg: for active lanes, replace the bit from PrevReg with the bit from
// CurReg.
void DivergenceLoweringHelper::buildMergeLaneMasks(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL,
    Register DstReg, Register PrevReg, Register CurReg) {
  // DstReg = (PrevReg & !EXEC) | (CurReg & EXEC)
  // TODO: check if inputs are constants or results of a compare.

  Register PrevRegCopy = buildRegCopyToLaneMask(PrevReg);
  Register CurRegCopy = buildRegCopyToLaneMask(CurReg);
  Register PrevMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
  Register CurMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);

  B.setInsertPt(MBB, I);
  B.buildInstr(AndN2Op, {PrevMaskedReg}, {PrevRegCopy, ExecReg});
  B.buildInstr(AndOp, {CurMaskedReg}, {ExecReg, CurRegCopy});
  B.buildInstr(OrOp, {DstReg}, {PrevMaskedReg, CurMaskedReg});
}

// GlobalISel has to constrain an S1 incoming value that is used as-is to the
// lane mask register class. Insert a copy of Incoming.Reg to a new lane mask
// register inside Incoming.Block; Incoming.Reg becomes that new lane mask.
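// Illustrative sketch (hypothetical MIR, wave64): for an incoming %3(s1) from
// %bb.1, a copy is built before the first terminator of %bb.1
//
// %7:sreg_64(s1) = COPY %3(s1)
//
// and %7 replaces %3 as the incoming value.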
void DivergenceLoweringHelper::constrainAsLaneMask(Incoming &In) {
  B.setInsertPt(*In.Block, In.Block->getFirstTerminator());

  auto Copy = B.buildCopy(LLT::scalar(1), In.Reg);
  MRI->setRegClass(Copy.getReg(0), ST->getBoolRC());
  In.Reg = Copy.getReg(0);
}

void replaceUsesOfRegInInstWith(Register Reg, MachineInstr *Inst,
                                Register NewReg) {
  for (MachineOperand &Op : Inst->operands()) {
    if (Op.isReg() && Op.getReg() == Reg)
      Op.setReg(NewReg);
  }
}
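// A uniform non-i1 value defined inside a cycle with divergent exits is
// divergent when used outside of the cycle: lanes can leave the cycle in
// different iterations, so each lane has to keep the value from its own last
// active iteration. Lower this by copying the value to a vgpr (with an
// implicit use of exec) right after its definition and rewriting the use
// outside of the cycle. Illustrative sketch (hypothetical MIR, register
// numbers made up):
//
// bb.1 (inside the cycle):
// %4:_(s32) = G_ADD ...
// %9:_(s32) = COPY %4(s32), implicit $exec
//
// bb.3 (outside the cycle):
// ... use of %9 instead of %4 ...
//
// Repeated uses of the same register reuse the cached copy from TDCache.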
bool DivergenceLoweringHelper::lowerTemporalDivergence() {
  AMDGPU::IntrinsicLaneMaskAnalyzer ILMA(*MF);
  DenseMap<Register, Register> TDCache;

  for (auto [Reg, UseInst, _] : MUI->getTemporalDivergenceList()) {
    if (MRI->getType(Reg) == LLT::scalar(1) || MUI->isDivergent(Reg) ||
        ILMA.isS32S64LaneMask(Reg))
      continue;

    Register CachedTDCopy = TDCache.lookup(Reg);
    if (CachedTDCopy) {
      replaceUsesOfRegInInstWith(Reg, UseInst, CachedTDCopy);
      continue;
    }

    MachineInstr *Inst = MRI->getVRegDef(Reg);
    MachineBasicBlock *MBB = Inst->getParent();
    B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(Inst->getIterator())));

    Register VgprReg = MRI->createGenericVirtualRegister(MRI->getType(Reg));
    B.buildInstr(AMDGPU::COPY, {VgprReg}, {Reg})
        .addUse(ExecReg, RegState::Implicit);

    replaceUsesOfRegInInstWith(Reg, UseInst, VgprReg);
    TDCache[Reg] = VgprReg;
  }
  return false;
}
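// For an S1 value (a lane mask) the plain copy above is not enough: each
// iteration has to merge the bits of the lanes that are active in that
// iteration into an accumulated mask, so a lane that left the cycle earlier
// keeps the bit from its last active iteration. The merged mask is built with
// buildMergeLaneMasks (see the wave32 example above) at the end of the
// defining block and carried around the cycle with MachineSSAUpdater; the use
// outside of the cycle then reads the merged mask instead of the original
// register.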
bool DivergenceLoweringHelper::lowerTemporalDivergenceI1() {
  MachineRegisterInfo::VRegAttrs BoolS1 = {ST->getBoolRC(), LLT::scalar(1)};
  initializeLaneMaskRegisterAttributes(BoolS1);
  MachineSSAUpdater SSAUpdater(*MF);

  // In case of a use outside multiple nested cycles, or multiple uses, we only
  // need to merge the lane mask across the largest relevant cycle.
  SmallDenseMap<Register, std::pair<const MachineCycle *, Register>> LRCCache;
  for (auto [Reg, UseInst, LRC] : MUI->getTemporalDivergenceList()) {
    if (MRI->getType(Reg) != LLT::scalar(1))
      continue;

    auto [LRCCacheIter, RegNotCached] = LRCCache.try_emplace(Reg);
    auto &CycleMergedMask = LRCCacheIter->getSecond();
    const MachineCycle *&CachedLRC = CycleMergedMask.first;
    if (RegNotCached || LRC->contains(CachedLRC))
      CachedLRC = LRC;
  }

  for (auto &LRCCacheEntry : LRCCache) {
    Register Reg = LRCCacheEntry.first;
    auto &CycleMergedMask = LRCCacheEntry.getSecond();
    const MachineCycle *Cycle = CycleMergedMask.first;

    Register MergedMask = MRI->createVirtualRegister(BoolS1);
    SSAUpdater.Initialize(MergedMask);

    MachineBasicBlock *MBB = MRI->getVRegDef(Reg)->getParent();
    SSAUpdater.AddAvailableValue(MBB, MergedMask);

    for (auto *Entry : Cycle->getEntries()) {
      for (MachineBasicBlock *Pred : Entry->predecessors()) {
        if (!Cycle->contains(Pred)) {
          B.setInsertPt(*Pred, Pred->getFirstTerminator());
          auto ImplDef = B.buildInstr(AMDGPU::IMPLICIT_DEF, {BoolS1}, {});
          SSAUpdater.AddAvailableValue(Pred, ImplDef.getReg(0));
        }
      }
    }

    buildMergeLaneMasks(*MBB, MBB->getFirstTerminator(), {}, MergedMask,
                        SSAUpdater.GetValueInMiddleOfBlock(MBB), Reg);

    CycleMergedMask.second = MergedMask;
  }

  for (auto [Reg, UseInst, Cycle] : MUI->getTemporalDivergenceList()) {
    if (MRI->getType(Reg) != LLT::scalar(1))
      continue;

    replaceUsesOfRegInInstWith(Reg, UseInst, LRCCache.lookup(Reg).second);
  }

  return false;
}

} // End anonymous namespace.

INITIALIZE_PASS_BEGIN(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
                      "AMDGPU GlobalISel divergence lowering", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineUniformityAnalysisPass)
INITIALIZE_PASS_END(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
                    "AMDGPU GlobalISel divergence lowering", false, false)

char AMDGPUGlobalISelDivergenceLowering::ID = 0;

char &llvm::AMDGPUGlobalISelDivergenceLoweringID =
    AMDGPUGlobalISelDivergenceLowering::ID;

FunctionPass *llvm::createAMDGPUGlobalISelDivergenceLoweringPass() {
  return new AMDGPUGlobalISelDivergenceLowering();
}

bool AMDGPUGlobalISelDivergenceLowering::runOnMachineFunction(
    MachineFunction &MF) {
  MachineDominatorTree &DT =
      getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
  MachinePostDominatorTree &PDT =
      getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
  MachineUniformityInfo &MUI =
      getAnalysis<MachineUniformityAnalysisPass>().getUniformityInfo();

  DivergenceLoweringHelper Helper(&MF, &DT, &PDT, &MUI);

  bool Changed = false;
  // Temporal divergence lowering needs to inspect the list, provided by the
  // uniformity analysis, of instructions used outside of a cycle with
  // divergent exits. Uniform instructions from the list require lowering, and
  // no instruction is deleted. It therefore has to run before lowerPhis, which
  // deletes phis that require lowering and replaces them with new
  // instructions.

  // Non-i1 temporal divergence lowering.
  Changed |= Helper.lowerTemporalDivergence();
  // This covers both uniform and divergent i1s. Lane masks are in sgprs and
  // need to be updated in each iteration.
  Changed |= Helper.lowerTemporalDivergenceI1();
  // Temporal divergence lowering of a divergent i1 phi used outside of the
  // cycle could also be handled by lowerPhis, but we do it in
  // lowerTemporalDivergenceI1 since in some cases lowerPhis does unnecessary
  // lane mask merging.
  Changed |= Helper.lowerPhis();
  return Changed;
}