xref: /freebsd/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp (revision 700637cbb5e582861067a11aaca4d053546871d2)
1 //===-- AMDGPUGlobalISelDivergenceLowering.cpp ----------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// GlobalISel pass that selects divergent i1 phis as lane mask phis.
11 /// Lane mask merging uses the same algorithm as SDAG in SILowerI1Copies.
12 /// Handles all cases of temporal divergence.
13 /// For divergent non-phi i1 and uniform i1 uses outside of the cycle, this
14 /// pass currently depends on LCSSA to insert phis with a single incoming value.
15 //
16 //===----------------------------------------------------------------------===//
17 
18 #include "AMDGPU.h"
19 #include "AMDGPUGlobalISelUtils.h"
20 #include "SILowerI1Copies.h"
21 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
22 #include "llvm/CodeGen/MachineFunctionPass.h"
23 #include "llvm/CodeGen/MachineUniformityAnalysis.h"
24 #include "llvm/InitializePasses.h"
25 
26 #define DEBUG_TYPE "amdgpu-global-isel-divergence-lowering"
27 
28 using namespace llvm;
29 
30 namespace {
31 
32 class AMDGPUGlobalISelDivergenceLowering : public MachineFunctionPass {
33 public:
34   static char ID;
35 
36 public:
AMDGPUGlobalISelDivergenceLowering()37   AMDGPUGlobalISelDivergenceLowering() : MachineFunctionPass(ID) {}
38 
39   bool runOnMachineFunction(MachineFunction &MF) override;
40 
getPassName() const41   StringRef getPassName() const override {
42     return "AMDGPU GlobalISel divergence lowering";
43   }
44 
getAnalysisUsage(AnalysisUsage & AU) const45   void getAnalysisUsage(AnalysisUsage &AU) const override {
46     AU.setPreservesCFG();
47     AU.addRequired<MachineDominatorTreeWrapperPass>();
48     AU.addRequired<MachinePostDominatorTreeWrapperPass>();
49     AU.addRequired<MachineUniformityAnalysisPass>();
50     MachineFunctionPass::getAnalysisUsage(AU);
51   }
52 };
53 
54 class DivergenceLoweringHelper : public PhiLoweringHelper {
55 public:
56   DivergenceLoweringHelper(MachineFunction *MF, MachineDominatorTree *DT,
57                            MachinePostDominatorTree *PDT,
58                            MachineUniformityInfo *MUI);
59 
60 private:
61   MachineUniformityInfo *MUI = nullptr;
62   MachineIRBuilder B;
63   Register buildRegCopyToLaneMask(Register Reg);
64 
65 public:
66   void markAsLaneMask(Register DstReg) const override;
67   void getCandidatesForLowering(
68       SmallVectorImpl<MachineInstr *> &Vreg1Phis) const override;
69   void collectIncomingValuesFromPhi(
70       const MachineInstr *MI,
71       SmallVectorImpl<Incoming> &Incomings) const override;
72   void replaceDstReg(Register NewReg, Register OldReg,
73                      MachineBasicBlock *MBB) override;
74   void buildMergeLaneMasks(MachineBasicBlock &MBB,
75                            MachineBasicBlock::iterator I, const DebugLoc &DL,
76                            Register DstReg, Register PrevReg,
77                            Register CurReg) override;
78   void constrainAsLaneMask(Incoming &In) override;
79 
80   bool lowerTemporalDivergence();
81   bool lowerTemporalDivergenceI1();
82 };
83 
DivergenceLoweringHelper(MachineFunction * MF,MachineDominatorTree * DT,MachinePostDominatorTree * PDT,MachineUniformityInfo * MUI)84 DivergenceLoweringHelper::DivergenceLoweringHelper(
85     MachineFunction *MF, MachineDominatorTree *DT,
86     MachinePostDominatorTree *PDT, MachineUniformityInfo *MUI)
87     : PhiLoweringHelper(MF, DT, PDT), MUI(MUI), B(*MF) {}
88 
89 // _(s1) -> SReg_32/64(s1)
markAsLaneMask(Register DstReg) const90 void DivergenceLoweringHelper::markAsLaneMask(Register DstReg) const {
91   assert(MRI->getType(DstReg) == LLT::scalar(1));
92 
93   if (MRI->getRegClassOrNull(DstReg)) {
94     if (MRI->constrainRegClass(DstReg, ST->getBoolRC()))
95       return;
96     llvm_unreachable("Failed to constrain register class");
97   }
98 
99   MRI->setRegClass(DstReg, ST->getBoolRC());
100 }
101 
getCandidatesForLowering(SmallVectorImpl<MachineInstr * > & Vreg1Phis) const102 void DivergenceLoweringHelper::getCandidatesForLowering(
103     SmallVectorImpl<MachineInstr *> &Vreg1Phis) const {
104   LLT S1 = LLT::scalar(1);
105 
106   // Add divergent i1 phis to the list
107   for (MachineBasicBlock &MBB : *MF) {
108     for (MachineInstr &MI : MBB.phis()) {
109       Register Dst = MI.getOperand(0).getReg();
110       if (MRI->getType(Dst) == S1 && MUI->isDivergent(Dst))
111         Vreg1Phis.push_back(&MI);
112     }
113   }
114 }
115 
collectIncomingValuesFromPhi(const MachineInstr * MI,SmallVectorImpl<Incoming> & Incomings) const116 void DivergenceLoweringHelper::collectIncomingValuesFromPhi(
117     const MachineInstr *MI, SmallVectorImpl<Incoming> &Incomings) const {
118   for (unsigned i = 1; i < MI->getNumOperands(); i += 2) {
119     Incomings.emplace_back(MI->getOperand(i).getReg(),
120                            MI->getOperand(i + 1).getMBB(), Register());
121   }
122 }
123 
replaceDstReg(Register NewReg,Register OldReg,MachineBasicBlock * MBB)124 void DivergenceLoweringHelper::replaceDstReg(Register NewReg, Register OldReg,
125                                              MachineBasicBlock *MBB) {
126   BuildMI(*MBB, MBB->getFirstNonPHI(), {}, TII->get(AMDGPU::COPY), OldReg)
127       .addReg(NewReg);
128 }
129 
130 // Copy Reg to new lane mask register, insert a copy after instruction that
131 // defines Reg while skipping phis if needed.
buildRegCopyToLaneMask(Register Reg)132 Register DivergenceLoweringHelper::buildRegCopyToLaneMask(Register Reg) {
133   Register LaneMask = createLaneMaskReg(MRI, LaneMaskRegAttrs);
134   MachineInstr *Instr = MRI->getVRegDef(Reg);
135   MachineBasicBlock *MBB = Instr->getParent();
136   B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(Instr->getIterator())));
137   B.buildCopy(LaneMask, Reg);
138   return LaneMask;
139 }
140 
141 // bb.previous
142 //   %PrevReg = ...
143 //
144 // bb.current
145 //   %CurReg = ...
146 //
147 //   %DstReg - not defined
148 //
149 // -> (wave32 example, new registers have sreg_32 reg class and S1 LLT)
150 //
151 // bb.previous
152 //   %PrevReg = ...
153 //   %PrevRegCopy:sreg_32(s1) = COPY %PrevReg
154 //
155 // bb.current
156 //   %CurReg = ...
157 //   %CurRegCopy:sreg_32(s1) = COPY %CurReg
158 //   ...
159 //   %PrevMaskedReg:sreg_32(s1) = ANDN2 %PrevRegCopy, ExecReg - active lanes 0
160 //   %CurMaskedReg:sreg_32(s1)  = AND %ExecReg, CurRegCopy - inactive lanes to 0
161 //   %DstReg:sreg_32(s1)        = OR %PrevMaskedReg, CurMaskedReg
162 //
163 // DstReg = for active lanes rewrite bit in PrevReg with bit from CurReg
buildMergeLaneMasks(MachineBasicBlock & MBB,MachineBasicBlock::iterator I,const DebugLoc & DL,Register DstReg,Register PrevReg,Register CurReg)164 void DivergenceLoweringHelper::buildMergeLaneMasks(
165     MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL,
166     Register DstReg, Register PrevReg, Register CurReg) {
167   // DstReg = (PrevReg & !EXEC) | (CurReg & EXEC)
168   // TODO: check if inputs are constants or results of a compare.
169 
170   Register PrevRegCopy = buildRegCopyToLaneMask(PrevReg);
171   Register CurRegCopy = buildRegCopyToLaneMask(CurReg);
172   Register PrevMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
173   Register CurMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
174 
175   B.setInsertPt(MBB, I);
176   B.buildInstr(AndN2Op, {PrevMaskedReg}, {PrevRegCopy, ExecReg});
177   B.buildInstr(AndOp, {CurMaskedReg}, {ExecReg, CurRegCopy});
178   B.buildInstr(OrOp, {DstReg}, {PrevMaskedReg, CurMaskedReg});
179 }
180 
181 // GlobalISel has to constrain S1 incoming taken as-is with lane mask register
182 // class. Insert a copy of Incoming.Reg to new lane mask inside Incoming.Block,
183 // Incoming.Reg becomes that new lane mask.
constrainAsLaneMask(Incoming & In)184 void DivergenceLoweringHelper::constrainAsLaneMask(Incoming &In) {
185   B.setInsertPt(*In.Block, In.Block->getFirstTerminator());
186 
187   auto Copy = B.buildCopy(LLT::scalar(1), In.Reg);
188   MRI->setRegClass(Copy.getReg(0), ST->getBoolRC());
189   In.Reg = Copy.getReg(0);
190 }
191 
replaceUsesOfRegInInstWith(Register Reg,MachineInstr * Inst,Register NewReg)192 void replaceUsesOfRegInInstWith(Register Reg, MachineInstr *Inst,
193                                 Register NewReg) {
194   for (MachineOperand &Op : Inst->operands()) {
195     if (Op.isReg() && Op.getReg() == Reg)
196       Op.setReg(NewReg);
197   }
198 }
199 
lowerTemporalDivergence()200 bool DivergenceLoweringHelper::lowerTemporalDivergence() {
201   AMDGPU::IntrinsicLaneMaskAnalyzer ILMA(*MF);
202   DenseMap<Register, Register> TDCache;
203 
204   for (auto [Reg, UseInst, _] : MUI->getTemporalDivergenceList()) {
205     if (MRI->getType(Reg) == LLT::scalar(1) || MUI->isDivergent(Reg) ||
206         ILMA.isS32S64LaneMask(Reg))
207       continue;
208 
209     Register CachedTDCopy = TDCache.lookup(Reg);
210     if (CachedTDCopy) {
211       replaceUsesOfRegInInstWith(Reg, UseInst, CachedTDCopy);
212       continue;
213     }
214 
215     MachineInstr *Inst = MRI->getVRegDef(Reg);
216     MachineBasicBlock *MBB = Inst->getParent();
217     B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(Inst->getIterator())));
218 
219     Register VgprReg = MRI->createGenericVirtualRegister(MRI->getType(Reg));
220     B.buildInstr(AMDGPU::COPY, {VgprReg}, {Reg})
221         .addUse(ExecReg, RegState::Implicit);
222 
223     replaceUsesOfRegInInstWith(Reg, UseInst, VgprReg);
224     TDCache[Reg] = VgprReg;
225   }
226   return false;
227 }
228 
lowerTemporalDivergenceI1()229 bool DivergenceLoweringHelper::lowerTemporalDivergenceI1() {
230   MachineRegisterInfo::VRegAttrs BoolS1 = {ST->getBoolRC(), LLT::scalar(1)};
231   initializeLaneMaskRegisterAttributes(BoolS1);
232   MachineSSAUpdater SSAUpdater(*MF);
233 
234   // In case of use outside muliple nested cycles or muliple uses we only need
235   // to merge lane mask across largest relevant cycle.
236   SmallDenseMap<Register, std::pair<const MachineCycle *, Register>> LRCCache;
237   for (auto [Reg, UseInst, LRC] : MUI->getTemporalDivergenceList()) {
238     if (MRI->getType(Reg) != LLT::scalar(1))
239       continue;
240 
241     auto [LRCCacheIter, RegNotCached] = LRCCache.try_emplace(Reg);
242     auto &CycleMergedMask = LRCCacheIter->getSecond();
243     const MachineCycle *&CachedLRC = CycleMergedMask.first;
244     if (RegNotCached || LRC->contains(CachedLRC)) {
245       CachedLRC = LRC;
246     }
247   }
248 
249   for (auto &LRCCacheEntry : LRCCache) {
250     Register Reg = LRCCacheEntry.first;
251     auto &CycleMergedMask = LRCCacheEntry.getSecond();
252     const MachineCycle *Cycle = CycleMergedMask.first;
253 
254     Register MergedMask = MRI->createVirtualRegister(BoolS1);
255     SSAUpdater.Initialize(MergedMask);
256 
257     MachineBasicBlock *MBB = MRI->getVRegDef(Reg)->getParent();
258     SSAUpdater.AddAvailableValue(MBB, MergedMask);
259 
260     for (auto Entry : Cycle->getEntries()) {
261       for (MachineBasicBlock *Pred : Entry->predecessors()) {
262         if (!Cycle->contains(Pred)) {
263           B.setInsertPt(*Pred, Pred->getFirstTerminator());
264           auto ImplDef = B.buildInstr(AMDGPU::IMPLICIT_DEF, {BoolS1}, {});
265           SSAUpdater.AddAvailableValue(Pred, ImplDef.getReg(0));
266         }
267       }
268     }
269 
270     buildMergeLaneMasks(*MBB, MBB->getFirstTerminator(), {}, MergedMask,
271                         SSAUpdater.GetValueInMiddleOfBlock(MBB), Reg);
272 
273     CycleMergedMask.second = MergedMask;
274   }
275 
276   for (auto [Reg, UseInst, Cycle] : MUI->getTemporalDivergenceList()) {
277     if (MRI->getType(Reg) != LLT::scalar(1))
278       continue;
279 
280     replaceUsesOfRegInInstWith(Reg, UseInst, LRCCache.lookup(Reg).second);
281   }
282 
283   return false;
284 }
285 
286 } // End anonymous namespace.
287 
288 INITIALIZE_PASS_BEGIN(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
289                       "AMDGPU GlobalISel divergence lowering", false, false)
290 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
291 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
292 INITIALIZE_PASS_DEPENDENCY(MachineUniformityAnalysisPass)
293 INITIALIZE_PASS_END(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
294                     "AMDGPU GlobalISel divergence lowering", false, false)
295 
296 char AMDGPUGlobalISelDivergenceLowering::ID = 0;
297 
298 char &llvm::AMDGPUGlobalISelDivergenceLoweringID =
299     AMDGPUGlobalISelDivergenceLowering::ID;
300 
createAMDGPUGlobalISelDivergenceLoweringPass()301 FunctionPass *llvm::createAMDGPUGlobalISelDivergenceLoweringPass() {
302   return new AMDGPUGlobalISelDivergenceLowering();
303 }
304 
runOnMachineFunction(MachineFunction & MF)305 bool AMDGPUGlobalISelDivergenceLowering::runOnMachineFunction(
306     MachineFunction &MF) {
307   MachineDominatorTree &DT =
308       getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
309   MachinePostDominatorTree &PDT =
310       getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
311   MachineUniformityInfo &MUI =
312       getAnalysis<MachineUniformityAnalysisPass>().getUniformityInfo();
313 
314   DivergenceLoweringHelper Helper(&MF, &DT, &PDT, &MUI);
315 
316   bool Changed = false;
317   // Temporal divergence lowering needs to inspect list of instructions used
318   // outside cycle with divergent exit provided by uniformity analysis. Uniform
319   // instructions from the list require lowering, no instruction is deleted.
320   // Thus it needs to be run before lowerPhis that deletes phis that require
321   // lowering and replaces them with new instructions.
322 
323   // Non-i1 temporal divergence lowering.
324   Changed |= Helper.lowerTemporalDivergence();
325   // This covers both uniform and divergent i1s. Lane masks are in sgpr and need
326   // to be updated in each iteration.
327   Changed |= Helper.lowerTemporalDivergenceI1();
328   // Temporal divergence lowering of divergent i1 phi used outside of the cycle
329   // could also be handled by lowerPhis but we do it in lowerTempDivergenceI1
330   // since in some case lowerPhis does unnecessary lane mask merging.
331   Changed |= Helper.lowerPhis();
332   return Changed;
333 }
334