//===-- SIPreEmitPeephole.cpp ------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass performs the peephole optimizations before code emission.
///
//===----------------------------------------------------------------------===//

14 #include "AMDGPU.h"
15 #include "GCNSubtarget.h"
16 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
17 #include "llvm/CodeGen/MachineFunctionPass.h"
18
19 using namespace llvm;
20
21 #define DEBUG_TYPE "si-pre-emit-peephole"
22
23 static unsigned SkipThreshold;
24
25 static cl::opt<unsigned, true> SkipThresholdFlag(
26 "amdgpu-skip-threshold", cl::Hidden,
27 cl::desc(
28 "Number of instructions before jumping over divergent control flow"),
29 cl::location(SkipThreshold), cl::init(12));
30
31 namespace {
32
33 class SIPreEmitPeephole : public MachineFunctionPass {
34 private:
35 const SIInstrInfo *TII = nullptr;
36 const SIRegisterInfo *TRI = nullptr;
37
38 bool optimizeVccBranch(MachineInstr &MI) const;
39 bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const;
40 bool getBlockDestinations(MachineBasicBlock &SrcMBB,
41 MachineBasicBlock *&TrueMBB,
42 MachineBasicBlock *&FalseMBB,
43 SmallVectorImpl<MachineOperand> &Cond);
44 bool mustRetainExeczBranch(const MachineBasicBlock &From,
45 const MachineBasicBlock &To) const;
46 bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);
47
48 public:
49 static char ID;
50
SIPreEmitPeephole()51 SIPreEmitPeephole() : MachineFunctionPass(ID) {
52 initializeSIPreEmitPeepholePass(*PassRegistry::getPassRegistry());
53 }
54
55 bool runOnMachineFunction(MachineFunction &MF) override;
56 };
57
58 } // End anonymous namespace.
59
60 INITIALIZE_PASS(SIPreEmitPeephole, DEBUG_TYPE,
61 "SI peephole optimizations", false, false)
62
63 char SIPreEmitPeephole::ID = 0;
64
65 char &llvm::SIPreEmitPeepholeID = SIPreEmitPeephole::ID;
66
optimizeVccBranch(MachineInstr & MI) const67 bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
68 // Match:
69 // sreg = -1 or 0
70 // vcc = S_AND_B64 exec, sreg or S_ANDN2_B64 exec, sreg
71 // S_CBRANCH_VCC[N]Z
72 // =>
73 // S_CBRANCH_EXEC[N]Z
74 // We end up with this pattern sometimes after basic block placement.
75 // It happens while combining a block which assigns -1 or 0 to a saved mask
76 // and another block which consumes that saved mask and then a branch.
77 //
78 // While searching this also performs the following substitution:
79 // vcc = V_CMP
80 // vcc = S_AND exec, vcc
81 // S_CBRANCH_VCC[N]Z
82 // =>
83 // vcc = V_CMP
84 // S_CBRANCH_VCC[N]Z
85
86 bool Changed = false;
87 MachineBasicBlock &MBB = *MI.getParent();
88 const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
89 const bool IsWave32 = ST.isWave32();
90 const unsigned CondReg = TRI->getVCC();
91 const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
92 const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
93 const unsigned AndN2 = IsWave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
94 const unsigned Mov = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
95
96 MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
97 E = MBB.rend();
98 bool ReadsCond = false;
99 unsigned Threshold = 5;
100 for (++A; A != E; ++A) {
101 if (!--Threshold)
102 return false;
103 if (A->modifiesRegister(ExecReg, TRI))
104 return false;
105 if (A->modifiesRegister(CondReg, TRI)) {
106 if (!A->definesRegister(CondReg, TRI) ||
107 (A->getOpcode() != And && A->getOpcode() != AndN2))
108 return false;
109 break;
110 }
111 ReadsCond |= A->readsRegister(CondReg, TRI);
112 }
113 if (A == E)
114 return false;
115
116 MachineOperand &Op1 = A->getOperand(1);
117 MachineOperand &Op2 = A->getOperand(2);
118 if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) {
119 TII->commuteInstruction(*A);
120 Changed = true;
121 }
122 if (Op1.getReg() != ExecReg)
123 return Changed;
124 if (Op2.isImm() && !(Op2.getImm() == -1 || Op2.getImm() == 0))
125 return Changed;
126
127 int64_t MaskValue = 0;
128 Register SReg;
129 if (Op2.isReg()) {
130 SReg = Op2.getReg();
131 auto M = std::next(A);
132 bool ReadsSreg = false;
133 bool ModifiesExec = false;
134 for (; M != E; ++M) {
135 if (M->definesRegister(SReg, TRI))
136 break;
137 if (M->modifiesRegister(SReg, TRI))
138 return Changed;
139 ReadsSreg |= M->readsRegister(SReg, TRI);
140 ModifiesExec |= M->modifiesRegister(ExecReg, TRI);
141 }
142 if (M == E)
143 return Changed;
144 // If SReg is VCC and SReg definition is a VALU comparison.
145 // This means S_AND with EXEC is not required.
146 // Erase the S_AND and return.
147 // Note: isVOPC is used instead of isCompare to catch V_CMP_CLASS
148 if (A->getOpcode() == And && SReg == CondReg && !ModifiesExec &&
149 TII->isVOPC(*M)) {
150 A->eraseFromParent();
151 return true;
152 }
153 if (!M->isMoveImmediate() || !M->getOperand(1).isImm() ||
154 (M->getOperand(1).getImm() != -1 && M->getOperand(1).getImm() != 0))
155 return Changed;
156 MaskValue = M->getOperand(1).getImm();
157 // First if sreg is only used in the AND instruction fold the immediate
158 // into the AND.
159 if (!ReadsSreg && Op2.isKill()) {
160 A->getOperand(2).ChangeToImmediate(MaskValue);
161 M->eraseFromParent();
162 }
163 } else if (Op2.isImm()) {
164 MaskValue = Op2.getImm();
165 } else {
166 llvm_unreachable("Op2 must be register or immediate");
167 }
168
169 // Invert mask for s_andn2
170 assert(MaskValue == 0 || MaskValue == -1);
171 if (A->getOpcode() == AndN2)
172 MaskValue = ~MaskValue;
173
174 if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC, /*TRI=*/nullptr)) {
175 if (!MI.killsRegister(CondReg, TRI)) {
176 // Replace AND with MOV
177 if (MaskValue == 0) {
178 BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
179 .addImm(0);
180 } else {
181 BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
182 .addReg(ExecReg);
183 }
184 }
185 // Remove AND instruction
186 A->eraseFromParent();
187 }
188
189 bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
190 if (SReg == ExecReg) {
191 // EXEC is updated directly
192 if (IsVCCZ) {
193 MI.eraseFromParent();
194 return true;
195 }
196 MI.setDesc(TII->get(AMDGPU::S_BRANCH));
197 } else if (IsVCCZ && MaskValue == 0) {
198 // Will always branch
199 // Remove all successors shadowed by new unconditional branch
200 MachineBasicBlock *Parent = MI.getParent();
201 SmallVector<MachineInstr *, 4> ToRemove;
202 bool Found = false;
203 for (MachineInstr &Term : Parent->terminators()) {
204 if (Found) {
205 if (Term.isBranch())
206 ToRemove.push_back(&Term);
207 } else {
208 Found = Term.isIdenticalTo(MI);
209 }
210 }
211 assert(Found && "conditional branch is not terminator");
212 for (auto *BranchMI : ToRemove) {
213 MachineOperand &Dst = BranchMI->getOperand(0);
214 assert(Dst.isMBB() && "destination is not basic block");
215 Parent->removeSuccessor(Dst.getMBB());
216 BranchMI->eraseFromParent();
217 }
218
219 if (MachineBasicBlock *Succ = Parent->getFallThrough()) {
220 Parent->removeSuccessor(Succ);
221 }
222
223 // Rewrite to unconditional branch
224 MI.setDesc(TII->get(AMDGPU::S_BRANCH));
225 } else if (!IsVCCZ && MaskValue == 0) {
226 // Will never branch
227 MachineOperand &Dst = MI.getOperand(0);
228 assert(Dst.isMBB() && "destination is not basic block");
229 MI.getParent()->removeSuccessor(Dst.getMBB());
230 MI.eraseFromParent();
231 return true;
232 } else if (MaskValue == -1) {
233 // Depends only on EXEC
234 MI.setDesc(
235 TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ));
236 }
237
238 MI.removeOperand(MI.findRegisterUseOperandIdx(CondReg, TRI, false /*Kill*/));
239 MI.addImplicitDefUseOperands(*MBB.getParent());
240
241 return true;
242 }
243
optimizeSetGPR(MachineInstr & First,MachineInstr & MI) const244 bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
245 MachineInstr &MI) const {
246 MachineBasicBlock &MBB = *MI.getParent();
247 const MachineFunction &MF = *MBB.getParent();
248 const MachineRegisterInfo &MRI = MF.getRegInfo();
249 MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
250 Register IdxReg = Idx->isReg() ? Idx->getReg() : Register();
251 SmallVector<MachineInstr *, 4> ToRemove;
252 bool IdxOn = true;
253
254 if (!MI.isIdenticalTo(First))
255 return false;
256
257 // Scan back to find an identical S_SET_GPR_IDX_ON
258 for (MachineBasicBlock::instr_iterator I = std::next(First.getIterator()),
259 E = MI.getIterator();
260 I != E; ++I) {
261 if (I->isBundle())
262 continue;
263 switch (I->getOpcode()) {
264 case AMDGPU::S_SET_GPR_IDX_MODE:
265 return false;
266 case AMDGPU::S_SET_GPR_IDX_OFF:
267 IdxOn = false;
268 ToRemove.push_back(&*I);
269 break;
270 default:
271 if (I->modifiesRegister(AMDGPU::M0, TRI))
272 return false;
273 if (IdxReg && I->modifiesRegister(IdxReg, TRI))
274 return false;
275 if (llvm::any_of(I->operands(),
276 [&MRI, this](const MachineOperand &MO) {
277 return MO.isReg() &&
278 TRI->isVectorRegister(MRI, MO.getReg());
279 })) {
280 // The only exception allowed here is another indirect vector move
281 // with the same mode.
282 if (!IdxOn || !(I->getOpcode() == AMDGPU::V_MOV_B32_indirect_write ||
283 I->getOpcode() == AMDGPU::V_MOV_B32_indirect_read))
284 return false;
285 }
286 }
287 }
288
289 MI.eraseFromBundle();
290 for (MachineInstr *RI : ToRemove)
291 RI->eraseFromBundle();
292 return true;
293 }
294
getBlockDestinations(MachineBasicBlock & SrcMBB,MachineBasicBlock * & TrueMBB,MachineBasicBlock * & FalseMBB,SmallVectorImpl<MachineOperand> & Cond)295 bool SIPreEmitPeephole::getBlockDestinations(
296 MachineBasicBlock &SrcMBB, MachineBasicBlock *&TrueMBB,
297 MachineBasicBlock *&FalseMBB, SmallVectorImpl<MachineOperand> &Cond) {
298 if (TII->analyzeBranch(SrcMBB, TrueMBB, FalseMBB, Cond))
299 return false;
300
301 if (!FalseMBB)
302 FalseMBB = SrcMBB.getNextNode();
303
304 return true;
305 }
306
mustRetainExeczBranch(const MachineBasicBlock & From,const MachineBasicBlock & To) const307 bool SIPreEmitPeephole::mustRetainExeczBranch(
308 const MachineBasicBlock &From, const MachineBasicBlock &To) const {
309 unsigned NumInstr = 0;
310 const MachineFunction *MF = From.getParent();
311
312 for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
313 MBBI != End && MBBI != ToI; ++MBBI) {
314 const MachineBasicBlock &MBB = *MBBI;
315
316 for (const MachineInstr &MI : MBB) {
317 // When a uniform loop is inside non-uniform control flow, the branch
318 // leaving the loop might never be taken when EXEC = 0.
319 // Hence we should retain cbranch out of the loop lest it become infinite.
320 if (MI.isConditionalBranch())
321 return true;
322
323 if (MI.isMetaInstruction())
324 continue;
325
326 if (TII->hasUnwantedEffectsWhenEXECEmpty(MI))
327 return true;
328
329 // These instructions are potentially expensive even if EXEC = 0.
330 if (TII->isSMRD(MI) || TII->isVMEM(MI) || TII->isFLAT(MI) ||
331 TII->isDS(MI) || TII->isWaitcnt(MI.getOpcode()))
332 return true;
333
334 ++NumInstr;
335 if (NumInstr >= SkipThreshold)
336 return true;
337 }
338 }
339
340 return false;
341 }
342
343 // Returns true if the skip branch instruction is removed.
removeExeczBranch(MachineInstr & MI,MachineBasicBlock & SrcMBB)344 bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
345 MachineBasicBlock &SrcMBB) {
346 MachineBasicBlock *TrueMBB = nullptr;
347 MachineBasicBlock *FalseMBB = nullptr;
348 SmallVector<MachineOperand, 1> Cond;
349
350 if (!getBlockDestinations(SrcMBB, TrueMBB, FalseMBB, Cond))
351 return false;
352
353 // Consider only the forward branches.
354 if ((SrcMBB.getNumber() >= TrueMBB->getNumber()) ||
355 mustRetainExeczBranch(*FalseMBB, *TrueMBB))
356 return false;
357
358 LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI);
359 MI.eraseFromParent();
360 SrcMBB.removeSuccessor(TrueMBB);
361
362 return true;
363 }
364
runOnMachineFunction(MachineFunction & MF)365 bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) {
366 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
367 TII = ST.getInstrInfo();
368 TRI = &TII->getRegisterInfo();
369 bool Changed = false;
370
371 MF.RenumberBlocks();
372
373 for (MachineBasicBlock &MBB : MF) {
374 MachineBasicBlock::iterator TermI = MBB.getFirstTerminator();
375 // Check first terminator for branches to optimize
376 if (TermI != MBB.end()) {
377 MachineInstr &MI = *TermI;
378 switch (MI.getOpcode()) {
379 case AMDGPU::S_CBRANCH_VCCZ:
380 case AMDGPU::S_CBRANCH_VCCNZ:
381 Changed |= optimizeVccBranch(MI);
382 break;
383 case AMDGPU::S_CBRANCH_EXECZ:
384 Changed |= removeExeczBranch(MI, MBB);
385 break;
386 }
387 }
388
389 if (!ST.hasVGPRIndexMode())
390 continue;
391
392 MachineInstr *SetGPRMI = nullptr;
393 const unsigned Threshold = 20;
394 unsigned Count = 0;
395 // Scan the block for two S_SET_GPR_IDX_ON instructions to see if a
396 // second is not needed. Do expensive checks in the optimizeSetGPR()
397 // and limit the distance to 20 instructions for compile time purposes.
398 // Note: this needs to work on bundles as S_SET_GPR_IDX* instructions
399 // may be bundled with the instructions they modify.
400 for (auto &MI : make_early_inc_range(MBB.instrs())) {
401 if (Count == Threshold)
402 SetGPRMI = nullptr;
403 else
404 ++Count;
405
406 if (MI.getOpcode() != AMDGPU::S_SET_GPR_IDX_ON)
407 continue;
408
409 Count = 0;
410 if (!SetGPRMI) {
411 SetGPRMI = &MI;
412 continue;
413 }
414
415 if (optimizeSetGPR(*SetGPRMI, MI))
416 Changed = true;
417 else
418 SetGPRMI = &MI;
419 }
420 }
421
422 return Changed;
423 }
424