//===-- AMDGPUGlobalISelDivergenceLowering.cpp ----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// GlobalISel pass that selects divergent i1 phis as lane mask phis.
/// Lane mask merging uses the same algorithm as SDAG in SILowerI1Copies.
/// Handles all cases of temporal divergence.
/// For divergent non-phi i1 values and for uniform i1 values used outside of
/// the cycle, this pass currently depends on LCSSA to insert phis with a
/// single incoming value.
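///
/// A rough sketch of the rewrite (wave32 shown; register names are
/// illustrative only): a divergent i1 phi such as
///
///   %phi:_(s1) = G_PHI %val0(s1), %bb.0, %val1(s1), %bb.1
///
/// is selected as a lane mask phi whose incoming values are lane masks merged
/// under EXEC in the predecessor blocks (see buildMergeLaneMasks below):
///
///   %phi:sreg_32(s1) = G_PHI %mask0(s1), %bb.0, %mask1(s1), %bb.1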
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "SILowerI1Copies.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
#include "llvm/InitializePasses.h"

#define DEBUG_TYPE "amdgpu-global-isel-divergence-lowering"

using namespace llvm;

namespace {

class AMDGPUGlobalISelDivergenceLowering : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUGlobalISelDivergenceLowering() : MachineFunctionPass(ID) {
    initializeAMDGPUGlobalISelDivergenceLoweringPass(
        *PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "AMDGPU GlobalISel divergence lowering";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineDominatorTreeWrapperPass>();
    AU.addRequired<MachinePostDominatorTreeWrapperPass>();
    AU.addRequired<MachineUniformityAnalysisPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

class DivergenceLoweringHelper : public PhiLoweringHelper {
public:
  DivergenceLoweringHelper(MachineFunction *MF, MachineDominatorTree *DT,
                           MachinePostDominatorTree *PDT,
                           MachineUniformityInfo *MUI);

private:
  MachineUniformityInfo *MUI = nullptr;
  MachineIRBuilder B;
  Register buildRegCopyToLaneMask(Register Reg);

public:
  void markAsLaneMask(Register DstReg) const override;
  void getCandidatesForLowering(
      SmallVectorImpl<MachineInstr *> &Vreg1Phis) const override;
  void collectIncomingValuesFromPhi(
      const MachineInstr *MI,
      SmallVectorImpl<Incoming> &Incomings) const override;
  void replaceDstReg(Register NewReg, Register OldReg,
                     MachineBasicBlock *MBB) override;
  void buildMergeLaneMasks(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator I, const DebugLoc &DL,
                           Register DstReg, Register PrevReg,
                           Register CurReg) override;
  void constrainAsLaneMask(Incoming &In) override;
};

DivergenceLoweringHelper::DivergenceLoweringHelper(
    MachineFunction *MF, MachineDominatorTree *DT,
    MachinePostDominatorTree *PDT, MachineUniformityInfo *MUI)
    : PhiLoweringHelper(MF, DT, PDT), MUI(MUI), B(*MF) {}

// Mark as lane mask: _(s1) -> SReg_32/64(s1).
void DivergenceLoweringHelper::markAsLaneMask(Register DstReg) const {
  assert(MRI->getType(DstReg) == LLT::scalar(1));

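  // If DstReg already has a register class, constrain it to the bool (lane
  // mask) class instead of overwriting it; otherwise simply set the class.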
  if (MRI->getRegClassOrNull(DstReg)) {
    if (MRI->constrainRegClass(DstReg, ST->getBoolRC()))
      return;
    llvm_unreachable("Failed to constrain register class");
  }

  MRI->setRegClass(DstReg, ST->getBoolRC());
}

void DivergenceLoweringHelper::getCandidatesForLowering(
    SmallVectorImpl<MachineInstr *> &Vreg1Phis) const {
  LLT S1 = LLT::scalar(1);

  // Add divergent i1 phis to the list.
  for (MachineBasicBlock &MBB : *MF) {
    for (MachineInstr &MI : MBB.phis()) {
      Register Dst = MI.getOperand(0).getReg();
      if (MRI->getType(Dst) == S1 && MUI->isDivergent(Dst))
        Vreg1Phis.push_back(&MI);
    }
  }
}

void DivergenceLoweringHelper::collectIncomingValuesFromPhi(
    const MachineInstr *MI, SmallVectorImpl<Incoming> &Incomings) const {
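  // Phi operands come as the def followed by (incoming value, predecessor
  // block) pairs, so incoming values start at operand 1 and step by two.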
  for (unsigned i = 1; i < MI->getNumOperands(); i += 2) {
    Incomings.emplace_back(MI->getOperand(i).getReg(),
                           MI->getOperand(i + 1).getMBB(), Register());
  }
}

void DivergenceLoweringHelper::replaceDstReg(Register NewReg, Register OldReg,
                                             MachineBasicBlock *MBB) {
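  // Keep OldReg as the visible result: copy the merged lane mask from NewReg
  // into it right after the phis in MBB.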
  BuildMI(*MBB, MBB->getFirstNonPHI(), {}, TII->get(AMDGPU::COPY), OldReg)
      .addReg(NewReg);
}

// Copy Reg to a new lane mask register; insert the copy after the instruction
// that defines Reg, skipping phis and labels if needed.
Register DivergenceLoweringHelper::buildRegCopyToLaneMask(Register Reg) {
  Register LaneMask = createLaneMaskReg(MRI, LaneMaskRegAttrs);
  MachineInstr *Instr = MRI->getVRegDef(Reg);
  MachineBasicBlock *MBB = Instr->getParent();
  B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(Instr->getIterator())));
  B.buildCopy(LaneMask, Reg);
  return LaneMask;
}

// bb.previous
// %PrevReg = ...
//
// bb.current
// %CurReg = ...
//
// %DstReg - not defined
//
// -> (wave32 example, new registers have sreg_32 reg class and S1 LLT)
//
// bb.previous
// %PrevReg = ...
// %PrevRegCopy:sreg_32(s1) = COPY %PrevReg
//
// bb.current
// %CurReg = ...
// %CurRegCopy:sreg_32(s1) = COPY %CurReg
// ...
// %PrevMaskedReg:sreg_32(s1) = ANDN2 %PrevRegCopy, %ExecReg - active lanes 0
// %CurMaskedReg:sreg_32(s1) = AND %ExecReg, %CurRegCopy - inactive lanes 0
// %DstReg:sreg_32(s1) = OR %PrevMaskedReg, %CurMaskedReg
//
// DstReg: for active lanes, replace the bit from PrevReg with the bit from
// CurReg.
void DivergenceLoweringHelper::buildMergeLaneMasks(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL,
    Register DstReg, Register PrevReg, Register CurReg) {
  // DstReg = (PrevReg & !EXEC) | (CurReg & EXEC)
  // TODO: check if inputs are constants or results of a compare.

  Register PrevRegCopy = buildRegCopyToLaneMask(PrevReg);
  Register CurRegCopy = buildRegCopyToLaneMask(CurReg);
  Register PrevMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
  Register CurMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);

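  // Merge: clear the active lanes in the previous mask, keep only the active
  // lanes of the current value, and OR the two halves together.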
  B.setInsertPt(MBB, I);
  B.buildInstr(AndN2Op, {PrevMaskedReg}, {PrevRegCopy, ExecReg});
  B.buildInstr(AndOp, {CurMaskedReg}, {ExecReg, CurRegCopy});
  B.buildInstr(OrOp, {DstReg}, {PrevMaskedReg, CurMaskedReg});
}

// GlobalISel has to constrain S1 incoming values, taken as-is, with the lane
// mask register class. Insert a copy of Incoming.Reg into a new lane mask
// register inside Incoming.Block; Incoming.Reg then becomes that new lane
// mask.
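//
// For example (wave32, register names illustrative only): for an incoming
// value %in:_(s1) this emits, before the terminators of Incoming.Block,
//
//   %copy:sreg_32(s1) = COPY %in(s1)
//
// and rewrites the incoming value to %copy.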
void DivergenceLoweringHelper::constrainAsLaneMask(Incoming &In) {
  B.setInsertPt(*In.Block, In.Block->getFirstTerminator());

  auto Copy = B.buildCopy(LLT::scalar(1), In.Reg);
  MRI->setRegClass(Copy.getReg(0), ST->getBoolRC());
  In.Reg = Copy.getReg(0);
}

} // End anonymous namespace.

INITIALIZE_PASS_BEGIN(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
                      "AMDGPU GlobalISel divergence lowering", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineUniformityAnalysisPass)
INITIALIZE_PASS_END(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
                    "AMDGPU GlobalISel divergence lowering", false, false)

char AMDGPUGlobalISelDivergenceLowering::ID = 0;

char &llvm::AMDGPUGlobalISelDivergenceLoweringID =
    AMDGPUGlobalISelDivergenceLowering::ID;

FunctionPass *llvm::createAMDGPUGlobalISelDivergenceLoweringPass() {
  return new AMDGPUGlobalISelDivergenceLowering();
}

bool AMDGPUGlobalISelDivergenceLowering::runOnMachineFunction(
    MachineFunction &MF) {
  MachineDominatorTree &DT =
      getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
  MachinePostDominatorTree &PDT =
      getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
  MachineUniformityInfo &MUI =
      getAnalysis<MachineUniformityAnalysisPass>().getUniformityInfo();

  DivergenceLoweringHelper Helper(&MF, &DT, &PDT, &MUI);

  return Helper.lowerPhis();
}